flushing out the W.D. so I can start anew
parent 5480b1feee
commit 20efd8efe7
@ -0,0 +1,17 @@
module Main where

type Seconds = Int
type DomainName = String
type Url = (DomainName, String)
type UserAgent = String

data RobotsTxt = RobotsTxt
    { crawlDelay :: Seconds
    , canonicalDomain :: DomainName
    , sitemapLocation :: Url
    , globalDisallows :: [Url]
    , botDisallows :: [(UserAgent, [Url])]
    }

main :: IO ()
main = putStrLn ""
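This Main is still a stub. A minimal sketch of how it could wire up the two generators added later in this commit, assuming the RobotsTxt and SitemapXml modules below compile as written; the output file names, the crawl delay, and the "/private" path are illustrative assumptions, not part of the commit:

module Main where

import RobotsTxt (defaultRobotsTxt, forbidUrl, withCrawlDelay, generateRobotsTxt)
import SitemapXml (defaultUrlData, generateSitemapXml)

-- Hypothetical wiring: render both crawler-facing files to disk.
main :: IO ()
main = do
    writeFile "robots.txt" $ generateRobotsTxt
                           $ withCrawlDelay 5
                           $ forbidUrl "/private"
                           $ defaultRobotsTxt
    writeFile "sitemap.xml" $ generateSitemapXml [defaultUrlData "https://lijero.co/"]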
@ -0,0 +1,84 @@
module RobotsTxt ( defaultRobotsTxt
                 , generateRobotsTxt
                 , forbidUrl
                 , forbidBotUrls
                 , withCrawlDelay
                 , withCanonicalDomain
                 , withSitemap
                 , usingDefaultSitemap
                 ) where

type Seconds = Int
type DomainName = String
type Url = String
type UserAgent = String

data RobotsTxt = RobotsTxt
    { crawlDelay :: Seconds
    , canonicalDomain :: Maybe DomainName
    , sitemapLocation :: Maybe Url
    , globalDisallows :: [Url]
    , botDisallows :: [(UserAgent, [Url])]
    }

defaultRobotsTxt :: RobotsTxt
defaultRobotsTxt = RobotsTxt
    { crawlDelay = 10
    , canonicalDomain = Nothing
    , sitemapLocation = Nothing
    , globalDisallows = []
    , botDisallows = []
    }

forbidUrl :: Url -> RobotsTxt -> RobotsTxt
forbidUrl url cnf = cnf { globalDisallows = url : globalDisallows cnf }

forbidBotUrls :: (UserAgent, [Url]) -> RobotsTxt -> RobotsTxt
forbidBotUrls forbid cnf = cnf { botDisallows = forbid : botDisallows cnf }

withCrawlDelay :: Seconds -> RobotsTxt -> RobotsTxt
withCrawlDelay time cnf = cnf { crawlDelay = time }

withCanonicalDomain :: DomainName -> RobotsTxt -> RobotsTxt
withCanonicalDomain domain cnf = cnf { canonicalDomain = Just domain }

withSitemap :: Url -> RobotsTxt -> RobotsTxt
withSitemap url cnf = cnf { sitemapLocation = Just url }

usingDefaultSitemap :: RobotsTxt -> RobotsTxt
usingDefaultSitemap cnf = cnf { sitemapLocation = fmap (++ "/sitemap.xml") $ canonicalDomain cnf }

robotsTxtField :: String -> String -> String
robotsTxtField name value = name ++ ": " ++ value ++ "\n"

-- An empty field still needs its trailing newline, or the next field
-- would be glued onto the same line.
robotsTxtEmptyField :: String -> String
robotsTxtEmptyField name = name ++ ":\n"

globalUserAgent :: String
globalUserAgent = "*"

userAgentField :: UserAgent -> String
userAgentField = robotsTxtField "User-agent"

crawlDelayField :: Seconds -> String
crawlDelayField = robotsTxtField "Crawl-delay" . show

canonicalDomainField :: DomainName -> String
canonicalDomainField = robotsTxtField "Host"

sitemapField :: Url -> String
sitemapField = robotsTxtField "Sitemap"

disallowField :: Url -> String
disallowField = robotsTxtField "Disallow"

generateDisallowList :: [Url] -> String
generateDisallowList [] = robotsTxtEmptyField "Disallow"
generateDisallowList xs = concatMap disallowField xs

generateBotDisallowList :: (UserAgent, [Url]) -> String
generateBotDisallowList (bot, urls) = userAgentField bot ++ generateDisallowList urls

-- concatMap handles the empty-list cases safely; a config with no
-- per-bot rules simply contributes nothing.
generateRobotsTxt :: RobotsTxt -> String
generateRobotsTxt config = userAgentField globalUserAgent
                        ++ crawlDelayField (crawlDelay config)
                        ++ maybe "" canonicalDomainField (canonicalDomain config)
                        ++ maybe "" sitemapField (sitemapLocation config)
                        ++ generateDisallowList (globalDisallows config)
                        ++ concatMap generateBotDisallowList (botDisallows config)
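As an illustrative sketch of how the builder functions above compose (not part of the commit; "BadBot" and "/private" are made up, and the domain reuses the base-url from site.rkt later in this diff), with the expected rendering in the trailing comment:

example :: String
example = generateRobotsTxt
        $ usingDefaultSitemap
        $ withCanonicalDomain "lijero.co"
        $ forbidBotUrls ("BadBot", ["/"])
        $ forbidUrl "/private"
        $ defaultRobotsTxt
-- Expected output:
--   User-agent: *
--   Crawl-delay: 10
--   Host: lijero.co
--   Sitemap: lijero.co/sitemap.xml
--   Disallow: /private
--   User-agent: BadBot
--   Disallow: /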
@ -0,0 +1,105 @@
module SitemapXml
    ( ChangeFreq(..)
    , mkPriority
    , defaultPriority
    , defaultUrlData
    , withPriority
    , withLastMod
    , withChangeFreq
    , generateSitemapXml
    ) where

import Text.Printf (printf)

type Sitemap = [UrlData]

data ChangeFreq = CFNever
                | CFYearly
                | CFMonthly
                | CFWeekly
                | CFDaily
                | CFHourly
                | CFAlways

instance Show ChangeFreq where
    show CFNever   = "never"
    show CFYearly  = "yearly"
    show CFMonthly = "monthly"
    show CFWeekly  = "weekly"
    show CFDaily   = "daily"
    show CFHourly  = "hourly"
    show CFAlways  = "always"

data Priority = Priority Int

instance Show Priority where
    -- There is probably a better way to do this
    show (Priority x) = printf "%.1f" (fromIntegral x / 10 :: Float)

mkPriority :: Int -> Priority
mkPriority x | x >= 0 && x <= 10 = Priority x
             | otherwise         = error "mkPriority: priority must be between 0 and 10"

defaultPriority :: Priority
defaultPriority = mkPriority 5

data UrlData = UrlData
    { url        :: String
    , lastMod    :: Maybe String
    , changeFreq :: Maybe ChangeFreq
    , priority   :: Priority
    }

defaultUrlData :: String -> UrlData
defaultUrlData url = UrlData { url = url, lastMod = Nothing, changeFreq = Nothing, priority = defaultPriority }

withPriority :: Int -> UrlData -> UrlData
withPriority x dat = dat { priority = mkPriority x }

withLastMod :: String -> UrlData -> UrlData
withLastMod x dat = dat { lastMod = Just x }

withChangeFreq :: ChangeFreq -> UrlData -> UrlData
withChangeFreq x dat = dat { changeFreq = Just x }

-- I know there are Xml generation libraries, but it's not worth their inclusion
-- over such a trivial application at this point
data Xml = Tag String [(String, String)] [Xml] | Data String

renderAttribute :: (String, String) -> String
renderAttribute (name, value) = name ++ "=\"" ++ value ++ "\""

xmlDeclaration :: String
xmlDeclaration = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"

renderXml :: Xml -> String
renderXml (Data x) = x
renderXml (Tag name attrs body) = openTag
                               ++ renderedBody
                               ++ closeTag
    where openTag      = "<" ++ name ++ attributes ++ ">"
          -- Each attribute carries its own leading space, so an
          -- attribute-free tag renders as "<name>", not "<name >".
          attributes   = concatMap ((' ' :) . renderAttribute) attrs
          renderedBody = concatMap renderXml body
          closeTag     = "</" ++ name ++ ">"

tagNoAttrs :: String -> [Xml] -> Xml
tagNoAttrs name body = Tag name [] body

tagOneBody :: String -> Xml -> Xml
tagOneBody name body = tagNoAttrs name [body]

maybeList :: Maybe a -> [a]
maybeList Nothing  = []
maybeList (Just x) = [x]

urlDataToXml :: UrlData -> Xml
urlDataToXml dat = tagNoAttrs "url" $ locTag ++ priorityTag ++ lastModTag ++ changeFreqTag
    where locTag        = [tagOneBody "loc" $ Data $ url dat]
          priorityTag   = [tagOneBody "priority" $ Data $ show $ priority dat]
          lastModTag    = maybeList $ fmap (tagOneBody "lastmod" . Data) $ lastMod dat
          changeFreqTag = maybeList $ fmap (tagOneBody "changefreq" . Data . show) $ changeFreq dat

sitemapToXml :: Sitemap -> Xml
sitemapToXml = Tag "urlset" [("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")] . map urlDataToXml

generateSitemapXml :: Sitemap -> String
generateSitemapXml sitemap = xmlDeclaration ++ renderXml (sitemapToXml sitemap)
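A hedged usage sketch for the module above (illustrative, not part of the commit; the URL reuses the base-url that appears in site.rkt below, and the date is made up):

exampleSitemap :: String
exampleSitemap = generateSitemapXml
    [ withChangeFreq CFWeekly
    $ withLastMod "2018-01-01"
    $ withPriority 8
    $ defaultUrlData "https://lijero.co/" ]
-- Expected output (all on one line; wrapped here for readability):
--   <?xml version="1.0" encoding="UTF-8"?>
--   <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
--   <url><loc>https://lijero.co/</loc><priority>0.8</priority>
--   <lastmod>2018-01-01</lastmod><changefreq>weekly</changefreq></url>
--   </urlset>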
@ -0,0 +1,27 @@
module SitemapXml where

type Sitemap = [UrlData]

data ChangeFreq = CFNever
                | CFYearly
                | CFMonthly
                | CFWeekly
                | CFDaily
                | CFHourly
                | CFAlways

instance Show ChangeFreq where
    show CFNever   = "never"
    show CFYearly  = "yearly"
    show CFMonthly = "monthly"
    show CFWeekly  = "weekly"
    show CFDaily   = "daily"
    show CFHourly  = "hourly"
    show CFAlways  = "always"

data Priority = Priority Int

data UrlData = UrlData
    { url        :: String
    , lastMod    :: Maybe String
    , changeFreq :: ChangeFreq
    , priority   :: Priority
    }
@ -0,0 +1,59 @@
#lang racket
;; /robots.txt specifies how web crawlers should access your website
;; see www.robotstxt.org

; Generate a robots.txt field, newline-terminated so that fields can be
; appended directly to one another
(define (robots-field name (body '()))
  (define robots-field-base (string-append name ":"))
  (if (null? body)
      (string-append robots-field-base "\n")
      (string-append robots-field-base " " body "\n")))

; Generate a user agent line
(define (robots-ua (name "*"))
  (robots-field "User-agent" name))

; Generate a bunch of Disallow lines from a list of urls
(define (robots-disallow urls)
  (if (empty? urls) (robots-field "Disallow")
      (string-append*
       (map (lambda (url)
              (robots-field "Disallow" url))
            urls))))

; Map into and unwrap an optional value: if x present, (f x), else d
(define (when-present d f x)
  (if (null? x) d
      (f x)))

; Forbid specific urls to a specific bot by user agent
(define (robots-forbidbot bot disallows)
  (string-append
   (robots-ua bot)
   (robots-disallow disallows)))

(provide robots-config)

; Blocks format: (cons (list of global blocks) (list of (cons bot-ua (list of urls))))
(define (robots-config (blocks '())
                       #:crawl-delay (crawl-delay 10) ; How frequently a bot should access your site (poorly specified)
                       #:host (host '()) ; The canonical domain for your website
                       #:sitemap (sitemap '())) ; Your sitemap.xml
  (define (field-when-present name value)
    (when-present "" (lambda (v) (robots-field name v)) value))
  (define block-lists
    (when-present ""
                  (match-lambda
                    [(cons global rest)
                     (string-append
                      ; First we have the global disallow rules
                      (robots-disallow global)
                      (string-append*
                       ; then a list of the disallow rules for individual bots
                       (map (match-lambda
                              [(cons bot urls)
                               (robots-forbidbot bot urls)])
                            rest)))])
                  blocks))
  (string-append
   (robots-ua)
   block-lists
   (robots-field "Crawl-delay" (number->string crawl-delay))
   (field-when-present "Sitemap" sitemap)
   (field-when-present "Host" host)))
@ -0,0 +1,15 @@
#lang racket
(require "sitemap.rkt")
(require "robots.rkt")
(require "xexprs/xexprs.rkt")

(define base-url "https://lijero.co")

(define (site-page url body #:priority priority #:lastmod lastmod #:changefreq changefreq)
  (cons (sitemap-url (string-append base-url url) #:priority priority #:lastmod lastmod #:changefreq changefreq)
        (cons url (xexprs body))))

(define (gen-site pages)
  (define sitemap-urls (map (match-lambda [(cons entry _) entry]) pages))
  (define page-bodies (map (match-lambda [(cons _ page) page]) pages))
  (cons (sitemap sitemap-urls)
        page-bodies))
@ -0,0 +1,26 @@
#lang racket
(require "xexprs/xexprs.rkt")
(provide sitemap-url sitemap)
; A quick sitemap-xml generator (https://www.sitemaps.org/)
; Sitemaps help web crawlers index your website

; Ugly hack because I'm bad at Racket
(define (when-present name value rest)
  (if (null? value)
      rest
      (cons (list name value) rest)))

; A sitemap URL entry
; https://www.sitemaps.org/protocol.html
(define (sitemap-url loc #:lastmod (lastmod '()) #:changefreq (changefreq '()) #:priority (priority 0.5))
  `(url (loc ,loc)
        (priority ,priority)
        ,@(when-present 'lastmod lastmod
            (when-present 'changefreq changefreq
              '()))))

; Generates a sitemap xml
(define (sitemap urls)
  (string-append
   "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
   (xexprs
    `(urlset #:xmlns "http://www.sitemaps.org/schemas/sitemap/0.9"
             ,@urls))))
@ -0,0 +1,18 @@
#lang racket
(define (robots-field name (body '()))
  (define robots-field-base (string-append name ":"))
  (if (null? body) robots-field-base
      (string-append robots-field-base " " body)))

(define (robots-ua (name "*"))
  (string-append "User-agent: " name))

(define (robots-disallow urls)
  (if (empty? urls)
      "Disallow:"
      (string-append*
       (map (lambda (url) (robots-field "Disallow" url)) urls))))

(define (robots-config #:crawl-delay (crawl-delay 10))
  (string-append
   (robots-ua)
   (robots-field "Crawl-delay" (number->string crawl-delay))))
@ -0,0 +1,15 @@
#lang racket
(require "sitemap.rkt")
(require "robots.rkt")
(require "xexprs/xexprs.rkt")

(define base-url "https://lijero.co")

(define (site-page url body #:priority priority #:lastmod lastmod #:changefreq changefreq)
  (cons (sitemap-url (string-append base-url url) #:priority priority #:lastmod lastmod #:changefreq changefreq)
        (cons url (xexprs body))))

(define (gen-site pages)
  (define sitemap-urls (map (match-lambda [(cons entry _) entry]) pages))
  (define page-bodies (map (match-lambda [(cons _ page) page]) pages))
  (cons (sitemap sitemap-urls)
        page-bodies))
@ -0,0 +1,8 @@
#lang racket
(require "xexprs/xexprs.rkt")

(define (sitemap urls)
  (string-append
   "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
   (xexprs
    `(urlset #:xmlns "http://www.sitemaps.org/schemas/sitemap/0.9" ,@urls))))