Flushing out the working directory so I can start anew
parent
5480b1feee
commit
20efd8efe7
|
@ -0,0 +1,17 @@
|
||||||
|
-- | Entry-point module: early sketch of the robots.txt data model.
module Main where

-- | Crawl delay in whole seconds.
type Seconds = Int

-- | A bare domain name, e.g. "example.com".
type DomainName = String

-- | A URL split into its domain and path parts.
type Url = (DomainName, String)

-- | A crawler's User-agent string.
type UserAgent = String

-- | Configuration for a robots.txt file.
data RobotsTxt = RobotsTxt
  { crawlDelay      :: Seconds
  , canonicalDomain :: DomainName
  , sitemapLocation :: Url
  , globalDisallows :: [Url]
  , botDisallows    :: [(UserAgent, [Url])]
  }

-- | Placeholder entry point: prints a single empty line.
main :: IO ()
main = putStrLn ""
|
|
@ -0,0 +1,84 @@
|
||||||
|
-- | Build and render robots.txt files (see https://www.robotstxt.org).
module RobotsTxt
  ( defaultRobotsTxt
  , generateRobotsTxt
  , forbidUrl
  , forbidBotUrls
  , withCrawlDelay
  , withCanonicalDomain
  , withSitemap
  , usingDefaultSitemap
  ) where

-- | Crawl delay in whole seconds.
type Seconds = Int

-- | A domain name, e.g. "https://example.com".
type DomainName = String

-- | A URL path, e.g. "/private".
type Url = String

-- | A crawler's User-agent string.
type UserAgent = String
|
||||||
|
|
||||||
|
-- | Full configuration for a robots.txt file. Build one by applying the
-- @with*@ / @forbid*@ combinators to 'defaultRobotsTxt'.
--
-- NOTE(review): 'sitemapLocation' is stored here but never rendered by
-- 'generateRobotsTxt' -- confirm whether a Sitemap field was intended.
data RobotsTxt = RobotsTxt
  { crawlDelay      :: Seconds                -- ^ Crawl-delay directive value
  , canonicalDomain :: Maybe DomainName       -- ^ optional Host directive
  , sitemapLocation :: Maybe Url              -- ^ optional sitemap location
  , globalDisallows :: [Url]                  -- ^ Disallow rules for all bots
  , botDisallows    :: [(UserAgent, [Url])]   -- ^ per-bot Disallow rules
  }
|
||||||
|
|
||||||
|
-- | A permissive baseline: 10-second crawl delay, no host, no sitemap,
-- and no disallow rules.
defaultRobotsTxt :: RobotsTxt
defaultRobotsTxt =
  RobotsTxt
    { crawlDelay      = 10
    , canonicalDomain = Nothing
    , sitemapLocation = Nothing
    , globalDisallows = []
    , botDisallows    = []
    }
|
||||||
|
|
||||||
|
-- | Add a URL path to the global (all-bots) disallow list.
forbidUrl :: Url -> RobotsTxt -> RobotsTxt
forbidUrl path config =
  config { globalDisallows = path : globalDisallows config }
|
||||||
|
|
||||||
|
-- | Add a per-bot disallow entry (user agent paired with its forbidden paths).
forbidBotUrls :: (UserAgent, [Url]) -> RobotsTxt -> RobotsTxt
forbidBotUrls entry config =
  config { botDisallows = entry : botDisallows config }
|
||||||
|
|
||||||
|
-- | Set the Crawl-delay value, in seconds.
withCrawlDelay :: Seconds -> RobotsTxt -> RobotsTxt
withCrawlDelay seconds config = config { crawlDelay = seconds }
|
||||||
|
|
||||||
|
-- | Set the canonical domain (emitted as the Host directive).
withCanonicalDomain :: String -> RobotsTxt -> RobotsTxt
withCanonicalDomain domain config =
  config { canonicalDomain = Just domain }
|
||||||
|
|
||||||
|
-- | Set an explicit sitemap location.
withSitemap :: String -> RobotsTxt -> RobotsTxt
withSitemap location config =
  config { sitemapLocation = Just location }
|
||||||
|
|
||||||
|
-- | Point the sitemap at "<canonical domain>/sitemap.xml". If no canonical
-- domain is configured, the sitemap is left unset.
usingDefaultSitemap :: RobotsTxt -> RobotsTxt
usingDefaultSitemap config =
  config { sitemapLocation = (++ "/sitemap.xml") <$> canonicalDomain config }
|
||||||
|
|
||||||
|
-- | Render one "Name: value" line, newline-terminated.
robotsTxtField :: String -> String -> String
robotsTxtField name value = concat [name, ": ", value, "\n"]
|
||||||
|
|
||||||
|
-- | Render a field with an empty value (e.g. "Disallow:", meaning "allow
-- everything"). Newline-terminated, matching 'robotsTxtField' -- the
-- original omitted the newline, which glued the next directive onto the
-- same output line.
robotsTxtEmptyField :: String -> String
robotsTxtEmptyField name = name ++ ":\n"
|
||||||
|
|
||||||
|
-- | The wildcard user agent that matches every crawler.
globalUserAgent :: String
globalUserAgent = "*"
|
||||||
|
|
||||||
|
-- | Render a "User-agent:" line for the given agent.
userAgentField :: UserAgent -> String
userAgentField agent = robotsTxtField "User-agent" agent
|
||||||
|
|
||||||
|
-- | Render a "Crawl-delay:" line from the delay in seconds.
crawlDelayField :: Seconds -> String
crawlDelayField seconds = robotsTxtField "Crawl-delay" (show seconds)
|
||||||
|
|
||||||
|
-- | Render a "Host:" line for the canonical domain.
canonicalDomainField :: DomainName -> String
canonicalDomainField domain = robotsTxtField "Host" domain
|
||||||
|
|
||||||
|
-- | Render one "Disallow:" line for a URL path.
disallowField :: Url -> String
disallowField path = robotsTxtField "Disallow" path
|
||||||
|
|
||||||
|
-- | Render Disallow lines for a list of paths. An empty list yields a
-- single bare "Disallow:" field, which means "allow everything".
generateDisallowList :: [Url] -> String
generateDisallowList [] = robotsTxtEmptyField "Disallow"
-- concatMap is the idiomatic, total replacement for the original
-- partial foldl1 (++) . map.
generateDisallowList xs = concatMap disallowField xs
|
||||||
|
|
||||||
|
-- | Render a User-agent line followed by that bot's Disallow rules.
generateBotDisallowList :: (UserAgent, [Url]) -> String
generateBotDisallowList (agent, urls) =
  userAgentField agent ++ generateDisallowList urls
|
||||||
|
|
||||||
|
-- | Render the complete robots.txt contents from a configuration.
--
-- NOTE(review): 'sitemapLocation' is never emitted here -- presumably a
-- "Sitemap:" field was intended; confirm with the author.
generateRobotsTxt :: RobotsTxt -> String
generateRobotsTxt config =
  userAgentField globalUserAgent
    ++ crawlDelayField (crawlDelay config)
    ++ maybe "" canonicalDomainField (canonicalDomain config)
    ++ generateDisallowList (globalDisallows config)
    -- concatMap is total: the original foldl1 (++) crashed whenever
    -- botDisallows was empty, which is the default configuration.
    ++ concatMap generateBotDisallowList (botDisallows config)
|
|
@ -0,0 +1,105 @@
|
||||||
|
-- | Generate sitemap.xml documents (https://www.sitemaps.org).
--
-- The original export list named 'ChangeFreq' without its constructors
-- and omitted 'Priority', 'UrlData', and 'Sitemap' entirely, even though
-- the exported functions mention all of them -- callers could neither
-- build a 'ChangeFreq' nor write type signatures. Exporting them is a
-- backward-compatible widening of the interface.
module SitemapXml
  ( Sitemap
  , UrlData
  , Priority
  , ChangeFreq (..)
  , mkPriority
  , defaultPriority
  , defaultUrlData
  , withPriority
  , withLastMod
  , withChangeFreq
  , generateSitemapXml
  ) where
|
||||||
|
|
||||||
|
import Text.Printf (printf)
|
||||||
|
|
||||||
|
-- | A sitemap is an ordered list of URL entries.
type Sitemap = [UrlData]
|
||||||
|
|
||||||
|
-- | How frequently a page is expected to change: the <changefreq>
-- vocabulary from the sitemap protocol.
data ChangeFreq
  = CFNever
  | CFYearly
  | CFMonthly
  | CFWeekly
  | CFDaily
  | CFHourly
  | CFAlways
|
||||||
|
|
||||||
|
-- | Renders exactly the lowercase tokens the sitemap protocol expects.
instance Show ChangeFreq where
  show freq = case freq of
    CFNever   -> "never"
    CFYearly  -> "yearly"
    CFMonthly -> "monthly"
    CFWeekly  -> "weekly"
    CFDaily   -> "daily"
    CFHourly  -> "hourly"
    CFAlways  -> "always"
|
||||||
|
|
||||||
|
-- | Page priority stored in tenths (0..10) and rendered as 0.0..1.0.
-- Construct via 'mkPriority', which enforces the range.
data Priority = Priority Int
|
||||||
|
|
||||||
|
instance Show Priority where
  -- Render tenths as a one-decimal float, e.g. Priority 5 -> "0.5".
  show (Priority tenths) = printf "%.1f" (fromIntegral tenths / 10 :: Float)
|
||||||
|
|
||||||
|
-- | Smart constructor: accepts a priority in tenths, i.e. in [0, 10].
-- The original had a single guard with no fallback, so out-of-range
-- input died with an opaque non-exhaustive-guards error; fail loudly
-- with context instead.
mkPriority :: Int -> Priority
mkPriority x
  | x >= 0 && x <= 10 = Priority x
  | otherwise =
      error ("mkPriority: priority must be in [0,10], got " ++ show x)
|
||||||
|
|
||||||
|
-- | The protocol's default priority, 0.5.
defaultPriority :: Priority
defaultPriority = mkPriority 5
|
||||||
|
|
||||||
|
-- | One sitemap URL entry; optional fields are omitted from the output
-- when unset.
data UrlData = UrlData
  { url        :: String             -- ^ page location
  , lastMod    :: Maybe String       -- ^ optional last-modified date
  , changeFreq :: Maybe ChangeFreq   -- ^ optional change frequency
  , priority   :: Priority           -- ^ crawl priority
  }
|
||||||
|
|
||||||
|
-- | An entry with no last-modified date, no change frequency, and the
-- default priority.
defaultUrlData :: String -> UrlData
defaultUrlData location =
  UrlData
    { url        = location
    , lastMod    = Nothing
    , changeFreq = Nothing
    , priority   = defaultPriority
    }
|
||||||
|
|
||||||
|
-- | Set the entry's priority, given in tenths (validated by 'mkPriority').
withPriority :: Int -> UrlData -> UrlData
withPriority tenths entry = entry { priority = mkPriority tenths }
|
||||||
|
|
||||||
|
-- | Set the entry's last-modified date.
withLastMod :: String -> UrlData -> UrlData
withLastMod date entry = entry { lastMod = Just date }
|
||||||
|
|
||||||
|
-- | Set the entry's change frequency.
withChangeFreq :: ChangeFreq -> UrlData -> UrlData
withChangeFreq freq entry = entry { changeFreq = Just freq }
|
||||||
|
|
||||||
|
-- Hand-rolled XML tree: pulling in a real XML library isn't worth it
-- for output this small.
data Xml
  = Tag String [(String, String)] [Xml]  -- element: name, attributes, children
  | Data String                          -- raw text content
|
||||||
|
|
||||||
|
-- | Render one key="value" attribute pair.
renderAttribute :: (String, String) -> String
renderAttribute (key, value) = concat [key, "=\"", value, "\""]
|
||||||
|
|
||||||
|
-- | The standard XML declaration that opens the document.
xmlComment :: String
xmlComment = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
|
||||||
|
|
||||||
|
-- | Render an XML tree to a string (no pretty-printing).
--
-- NOTE(review): attribute values and text content are emitted verbatim,
-- with no escaping -- callers must not pass strings containing '<', '&'
-- or '"'.
renderXml :: Xml -> String
renderXml (Data text) = text
renderXml (Tag name attrs body) = openTag ++ renderedBody ++ closeTag
  where
    -- Omit the attribute section entirely when there are no attributes,
    -- producing "<name>" rather than the original's "<name >".
    openTag
      | null attrs = "<" ++ name ++ ">"
      | otherwise  = "<" ++ name ++ " " ++ attributes ++ ">"
    attributes = unwords (map renderAttribute attrs)
    -- concatMap is total; the original foldl1 (++) crashed on childless
    -- tags (e.g. a <urlset> built from an empty sitemap).
    renderedBody = concatMap renderXml body
    closeTag = "</" ++ name ++ ">"
|
||||||
|
|
||||||
|
-- | Build an element with no attributes.
tagNoAttrs :: String -> [Xml] -> Xml
tagNoAttrs name = Tag name []
|
||||||
|
|
||||||
|
-- | Build an attribute-less element wrapping a single child.
tagOneBody :: String -> Xml -> Xml
tagOneBody name child = tagNoAttrs name [child]
|
||||||
|
|
||||||
|
-- | Data.Maybe.maybeToList, inlined to avoid another import: an empty
-- list for Nothing, a singleton for Just.
maybeList :: Maybe a -> [a]
maybeList = maybe [] (: [])
|
||||||
|
|
||||||
|
-- | Convert one URL entry to its <url> element. The optional lastmod
-- and changefreq children are omitted when unset.
urlDataToXml :: UrlData -> Xml
urlDataToXml entry =
  tagNoAttrs "url" (locTag ++ priorityTag ++ lastModTag ++ changeFreqTag)
  where
    locTag        = [tagOneBody "loc" (Data (url entry))]
    priorityTag   = [tagOneBody "priority" (Data (show (priority entry)))]
    lastModTag    = maybeList ((tagOneBody "lastmod" . Data) <$> lastMod entry)
    changeFreqTag = maybeList ((tagOneBody "changefreq" . Data . show) <$> changeFreq entry)
|
||||||
|
|
||||||
|
-- | Wrap all entries in the required <urlset> root element.
sitemapToXml :: Sitemap -> Xml
sitemapToXml entries =
  Tag "urlset"
      [("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")]
      (map urlDataToXml entries)
|
||||||
|
|
||||||
|
-- | Render a complete sitemap.xml document, XML declaration included.
generateSitemapXml :: Sitemap -> String
generateSitemapXml entries = xmlComment ++ renderXml (sitemapToXml entries)
|
|
@ -0,0 +1,27 @@
|
||||||
|
-- | Sitemap generation module (early revision, no export list).
module SitemapXml where

-- | A sitemap is an ordered list of URL entries.
type Sitemap = [UrlData]
|
||||||
|
|
||||||
|
-- | How frequently a page is expected to change: the <changefreq>
-- vocabulary from the sitemap protocol.
data ChangeFreq
  = CFNever
  | CFYearly
  | CFMonthly
  | CFWeekly
  | CFDaily
  | CFHourly
  | CFAlways
|
||||||
|
|
||||||
|
-- | Renders exactly the lowercase tokens the sitemap protocol expects.
instance Show ChangeFreq where
  show freq = case freq of
    CFNever   -> "never"
    CFYearly  -> "yearly"
    CFMonthly -> "monthly"
    CFWeekly  -> "weekly"
    CFDaily   -> "daily"
    CFHourly  -> "hourly"
    CFAlways  -> "always"
|
||||||
|
|
||||||
|
-- | One sitemap URL entry.
--
-- NOTE(review): 'Priority' is not defined anywhere in this revision of
-- the module, so this will not compile -- presumably it was added in a
-- later revision; confirm against the module history.
data UrlData = UrlData
  { url        :: String        -- ^ page location
  , lastMod    :: Maybe String  -- ^ optional last-modified date
  , changeFreq :: ChangeFreq    -- ^ change frequency (not optional here)
  , priority   :: Priority      -- ^ crawl priority
  }
|
|
@ -0,0 +1,59 @@
|
||||||
|
#lang racket
|
||||||
|
;; /robots.txt specifies how web crawlers should access your website
|
||||||
|
;; see www.robotstxt.org
|
||||||
|
|
||||||
|
; Generate a robots.txt field
|
||||||
|
;; Render one robots.txt field: "name:" when no body is supplied,
;; otherwise "name: body".
(define (robots-field name (body '()))
  (let ((base (string-append name ":")))
    (if (null? body)
        base
        (string-append base " " body))))
|
||||||
|
|
||||||
|
; Generate a user agent line
|
||||||
|
;; Render a User-agent line; defaults to the wildcard agent "*".
(define (robots-ua (name "*"))
  (robots-field "User-agent" name))
|
||||||
|
|
||||||
|
; Generate a bunch of Disallow lines from a list of urls
|
||||||
|
;; Render Disallow lines for a list of url paths. An empty list yields a
;; single bare "Disallow:" (meaning "allow everything").
(define (robots-disallow urls)
  (if (empty? urls)
      (robots-field "Disallow")
      ;; map takes the procedure first, then the list; the original had
      ;; the arguments swapped ((map list proc)), which raised a contract
      ;; error at runtime. Renaming the parameter also stops it shadowing
      ;; the built-in `list`.
      (string-append*
       (map (lambda (url) (robots-field "Disallow" url)) urls))))
|
||||||
|
|
||||||
|
; Map into and unwrap an optional value: if x present, f x, else d
|
||||||
|
;; Map into and unwrap an optional value: if x is present apply f to it,
;; otherwise return the default d. Absence is represented by '().
(define (when-present d f x)
  (if (null? x)
      d
      ;; The original used (apply f x), which requires x to be a list of
      ;; f's arguments; present values here are plain strings or pairs,
      ;; so the intended call is simply (f x).
      (f x)))
|
||||||
|
|
||||||
|
; Forbid specific urls to a specific bot by user agent
|
||||||
|
;; Forbid the given urls to one specific bot, identified by user agent:
;; a User-agent line followed by its Disallow rules.
(define (robots-forbidbot bot disallows)
  (string-append (robots-ua bot)
                 (robots-disallow disallows)))
|
||||||
|
|
||||||
|
; Blocks format: (cons (list of global blocks) (list of (cons bot-ua (list of urls))))
|
||||||
|
;; Render a complete robots.txt.
;; blocks format: (cons (list of global disallows)
;;                      (list of (cons bot-ua (list of urls))))
(define (robots-config (blocks '())
                       #:crawl-delay (crawl-delay 10) ; how often a bot may visit -- loosely specified
                       #:host (host '())              ; the canonical domain for the site
                       #:sitemap (sitemap '()))       ; location of sitemap.xml
  ;; Emit "Name: value" only when the optional value was supplied.
  (define (optional-field name value)
    (when-present "" ((curry robots-field) name) value))
  ;; The global disallow rules followed by each bot's individual rules.
  (define disallow-rules
    (when-present ""
                  (match-lambda
                    ([cons global per-bot]
                     (string-append
                      (robots-disallow global)
                      (string-append*
                       (map (match-lambda
                              ([cons bot urls] (robots-forbidbot bot urls)))
                            per-bot)))))
                  blocks))
  (string-append
   (robots-ua)
   disallow-rules
   (robots-field "Crawl-delay" (number->string crawl-delay))
   (optional-field "Sitemap" sitemap)
   (optional-field "Host" host)))
|
||||||
|
|
|
@ -0,0 +1,15 @@
|
||||||
|
#lang racket
|
||||||
|
(require "sitemap.rkt")
|
||||||
|
(require "robots.rkt")
|
||||||
|
|
||||||
|
; Site root; all page urls are resolved relative to this.
(define base-url "https://lijero.co")
|
||||||
|
|
||||||
|
;; Pair a page's sitemap entry with its url and rendered body:
;; (cons sitemap-entry (cons url rendered-body)).
(define (site-page url body #:priority priority #:lastmod lastmod #:changefreq changefreq)
  (define entry
    (sitemap-url (string-append base-url url)
                 #:priority priority
                 #:lastmod lastmod
                 #:changefreq changefreq))
  (cons entry (cons url (xexprs body))))
|
||||||
|
|
||||||
|
;; Assemble the whole site: the sitemap document consed onto the page bodies.
(define (gen-site pages)
  (define sitemap-urls (append* (map (match-lambda ([cons sitemap _] sitemap)) pages)))
  (define page-bodies (append* (map (match-lambda ([cons _ page] page)) pages)))
  ;; page-bodies is a list, not a procedure: the original applied it as
  ;; (page-bodies) and was also missing its closing parentheses.
  (cons (sitemap sitemap-urls)
        page-bodies))
|
|
@ -0,0 +1,26 @@
|
||||||
|
#lang racket
|
||||||
|
(require "xexprs/xexprs.rkt")
|
||||||
|
; A quick sitemap-xml generator (https://www.sitemaps.org/)
|
||||||
|
; Sitemaps help web crawlers index your website
|
||||||
|
|
||||||
|
; Ugly hack because I'm bad at Racket
|
||||||
|
;; Prepend a (name . value) pair to rest only when value is present;
;; '() marks absence. (Author's note: ugly hack.)
(define (when-present name value rest)
  (if (null? value)
      rest
      (cons (cons name value) rest)))
|
||||||
|
|
||||||
|
; A sitemap URL entry
|
||||||
|
; https://www.sitemaps.org/protocol.html
|
||||||
|
;; A sitemap <url> entry (https://www.sitemaps.org/protocol.html);
;; lastmod and changefreq are omitted when not supplied.
;; NOTE(review): the loc argument is accepted but never emitted, and the
;; optional entries are (name . value) pairs rather than (tag body) xexpr
;; lists -- verify both against what the xexprs renderer expects.
(define (sitemap-url loc #:lastmod (lastmod '()) #:changefreq (changefreq '()) #:priority (priority 0.5))
  `(url (priority ,priority)
        ,@(when-present "lastmod" lastmod
                        (when-present "changefreq" changefreq
                                      '()))))
|
||||||
|
|
||||||
|
; Generates a sitemap xml
|
||||||
|
;; Generate a complete sitemap xml document from a list of url entries.
(define (sitemap urls)
  (string-append
   ;; The quotes inside the XML declaration must be escaped: the original
   ;; literal "<?xml version="1.0" encoding="UTF-8"?>" was a syntax error.
   "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
   (xexprs
    `(urlset #:xmlns "http://www.sitemaps.org/schemas/sitemap/0.9"
             ,@urls))))
|
|
@ -0,0 +1,18 @@
|
||||||
|
#lang racket
|
||||||
|
;; Render one robots.txt field: "name:" when no body is supplied,
;; otherwise "name: body".
(define (robots-field name (body '()))
  (define robots-field-base (string-append name ":"))
  ;; Racket has no nil?; the empty-list predicate is null?.
  (if (null? body)
      robots-field-base
      (string-append robots-field-base " " body)))
|
||||||
|
|
||||||
|
;; Render a User-agent line; defaults to the wildcard agent "*".
(define (robots-ua (name "*"))
  (string-append "User-agent: " name))
|
||||||
|
|
||||||
|
(define (robots-disallow list)
|
||||||
|
(if (empty? list)
|
||||||
|
"Disallow:"
|
||||||
|
|
||||||
|
(define (robots-config (#:crawl-delay 10))
|
||||||
|
(string-append
|
||||||
|
(robots-ua)
|
||||||
|
(
|
||||||
|
|
|
@ -0,0 +1,15 @@
|
||||||
|
#lang racket
|
||||||
|
(require "sitemap.rkt")
|
||||||
|
(require "robots.rkt")
|
||||||
|
|
||||||
|
; Site root; all page urls are resolved relative to this.
(define base-url "https://lijero.co")
|
||||||
|
|
||||||
|
;; Pair a page's sitemap entry with its url and rendered body:
;; (cons sitemap-entry (cons url rendered-body)).
(define (site-page url body #:priority priority #:lastmod lastmod #:changefreq changefreq)
  (define entry
    (sitemap-url (string-append base-url url)
                 #:priority priority
                 #:lastmod lastmod
                 #:changefreq changefreq))
  (cons entry (cons url (xexprs body))))
|
||||||
|
|
||||||
|
;; Assemble the whole site: the sitemap document consed onto the page bodies.
(define (gen-site pages)
  (define sitemap-urls (append* (map (match-lambda ([cons sitemap _] sitemap)) pages)))
  (define page-bodies (append* (map (match-lambda ([cons _ page] page)) pages)))
  ;; page-bodies is a list, not a procedure: the original applied it as
  ;; (page-bodies) and was also missing its closing parentheses.
  (cons (sitemap sitemap-urls)
        page-bodies))
|
|
@ -0,0 +1,8 @@
|
||||||
|
#lang racket
|
||||||
|
(require "xexprs/xexprs.rkt")
|
||||||
|
|
||||||
|
;; Generate a complete sitemap xml document from a list of url entries.
(define (sitemap urls)
  (string-append
   ;; The quotes inside the XML declaration must be escaped: the original
   ;; literal "<?xml version="1.0" encoding="UTF-8"?>" was a syntax error.
   "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
   (xexprs
    `(urlset #:xmlns "http://www.sitemaps.org/schemas/sitemap/0.9" ,@urls))))
|
Loading…
Reference in New Issue