flushing out the W.D. so I can start anew

master
Lijero 2018-01-22 15:58:12 -08:00
parent 5480b1feee
commit 20efd8efe7
11 changed files with 374 additions and 0 deletions

Main.hs~ (new file, 17 lines)
@@ -0,0 +1,17 @@
module Main where
type Seconds = Int
type DomainName = String
type Url = (DomainName, String)
type UserAgent = String
data RobotsTxt = RobotsTxt
{ crawlDelay :: Seconds
, canonicalDomain :: DomainName
, sitemapLocation :: Url
, globalDisallows :: [Url]
, botDisallows :: [(UserAgent, [Url])]
}
main :: IO ()
main = putStrLn ""

RobotsTxt.hs (new file, 84 lines)
@@ -0,0 +1,84 @@
module RobotsTxt ( defaultRobotsTxt
, generateRobotsTxt
, forbidUrl
, forbidBotUrls
, withCrawlDelay
, withCanonicalDomain
, withSitemap
, usingDefaultSitemap
) where
type Seconds = Int
type DomainName = String
type Url = String
type UserAgent = String
data RobotsTxt = RobotsTxt
{ crawlDelay :: Seconds
, canonicalDomain :: Maybe DomainName
, sitemapLocation :: Maybe Url
, globalDisallows :: [Url]
, botDisallows :: [(UserAgent, [Url])]
}
defaultRobotsTxt :: RobotsTxt
defaultRobotsTxt = RobotsTxt
{ crawlDelay = 10
, canonicalDomain = Nothing
, sitemapLocation = Nothing
, globalDisallows = []
, botDisallows = []
}
forbidUrl :: Url -> RobotsTxt -> RobotsTxt
forbidUrl url cnf = cnf { globalDisallows = url : globalDisallows cnf }
forbidBotUrls :: (UserAgent, [Url]) -> RobotsTxt -> RobotsTxt
forbidBotUrls forbid cnf = cnf { botDisallows = forbid : botDisallows cnf }
withCrawlDelay :: Seconds -> RobotsTxt -> RobotsTxt
withCrawlDelay time cnf = cnf { crawlDelay = time }
withCanonicalDomain :: String -> RobotsTxt -> RobotsTxt
withCanonicalDomain domain cnf = cnf { canonicalDomain = Just domain }
withSitemap :: String -> RobotsTxt -> RobotsTxt
withSitemap url cnf = cnf { sitemapLocation = Just url }
usingDefaultSitemap :: RobotsTxt -> RobotsTxt
usingDefaultSitemap cnf = cnf { sitemapLocation = fmap (++ "/sitemap.xml") $ canonicalDomain cnf }
robotsTxtField :: String -> String -> String
robotsTxtField name value = name ++ ": " ++ value ++ "\n"
robotsTxtEmptyField :: String -> String
robotsTxtEmptyField name = name ++ ":\n"
globalUserAgent :: String
globalUserAgent = "*"
userAgentField :: UserAgent -> String
userAgentField = robotsTxtField "User-agent"
crawlDelayField :: Seconds -> String
crawlDelayField = robotsTxtField "Crawl-delay" . show
canonicalDomainField :: DomainName -> String
canonicalDomainField = robotsTxtField "Host"
disallowField :: Url -> String
disallowField = robotsTxtField "Disallow"
generateDisallowList :: [Url] -> String
generateDisallowList [] = robotsTxtEmptyField "Disallow"
generateDisallowList xs = concatMap disallowField xs
generateBotDisallowList :: (UserAgent, [Url]) -> String
generateBotDisallowList (bot, urls) = userAgentField bot ++ generateDisallowList urls
generateRobotsTxt :: RobotsTxt -> String
generateRobotsTxt config = (userAgentField globalUserAgent)
++ (crawlDelayField $ crawlDelay config)
                        ++ (maybe "" canonicalDomainField $ canonicalDomain config)
                        ++ (maybe "" (robotsTxtField "Sitemap") $ sitemapLocation config)
++ (generateDisallowList $ globalDisallows config)
                        ++ (concatMap generateBotDisallowList $ botDisallows config)
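
For reference (an editor's sketch, not part of the commit): one plausible way to drive the builders above, piping defaultRobotsTxt through the exported functions and rendering the result. The domain, bot name, and paths are made-up examples.

module RobotsTxtExample where

import RobotsTxt

-- Hypothetical config: "example.com", "BadBot", and the paths are invented for illustration.
exampleRobotsTxt :: String
exampleRobotsTxt = generateRobotsTxt
                 $ usingDefaultSitemap
                 $ withCanonicalDomain "example.com"
                 $ withCrawlDelay 5
                 $ forbidUrl "/private/"
                 $ forbidBotUrls ("BadBot", ["/"])
                 $ defaultRobotsTxt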

SitemapXml.hs (new file, 105 lines)
@@ -0,0 +1,105 @@
module SitemapXml
( ChangeFreq(..)
, mkPriority
, defaultPriority
, defaultUrlData
, withPriority
, withLastMod
, withChangeFreq
, generateSitemapXml
) where
import Text.Printf (printf)
type Sitemap = [UrlData]
data ChangeFreq = CFNever
| CFYearly
| CFMonthly
| CFWeekly
| CFDaily
| CFHourly
| CFAlways
instance Show ChangeFreq where
show CFNever = "never"
show CFYearly = "yearly"
show CFMonthly = "monthly"
show CFWeekly = "weekly"
show CFDaily = "daily"
show CFHourly = "hourly"
show CFAlways = "always"
data Priority = Priority Int
instance Show Priority where
-- There is probably a better way to do this
show (Priority x) = printf "%.1f" (fromIntegral x / 10 :: Float)
mkPriority :: Int -> Priority
mkPriority x | x >= 0 && x <= 10 = Priority x
             | otherwise         = error "mkPriority: priority must be between 0 and 10"
defaultPriority :: Priority
defaultPriority = mkPriority 5
data UrlData = UrlData
{ url :: String
, lastMod :: Maybe String
, changeFreq :: Maybe ChangeFreq
, priority :: Priority
}
defaultUrlData :: String -> UrlData
defaultUrlData url = UrlData { url = url, lastMod = Nothing, changeFreq = Nothing, priority = defaultPriority }
withPriority :: Int -> UrlData -> UrlData
withPriority x dat = dat { priority = mkPriority x }
withLastMod :: String -> UrlData -> UrlData
withLastMod x dat = dat { lastMod = Just x }
withChangeFreq :: ChangeFreq -> UrlData -> UrlData
withChangeFreq x dat = dat { changeFreq = Just x }
-- I know there are Xml generation libraries, but it's not worth their inclusion
-- over such a trivial application at this point
data Xml = Tag String [(String, String)] [Xml] | Data String
renderAttribute :: (String, String) -> String
renderAttribute (name, value) = name ++ "=\"" ++ value ++ "\""
xmlComment :: String
xmlComment = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
renderXml :: Xml -> String
renderXml (Data x) = x
renderXml (Tag name attrs body) = openTag
++ renderedBody
++ closeTag
  where openTag = "<" ++ name ++ attributes ++ ">"
        -- prefix each attribute with a space, so attribute-less tags render as "<name>"
        attributes = concatMap ((" " ++) . renderAttribute) attrs
        renderedBody = concatMap renderXml body
closeTag = "</" ++ name ++ ">"
tagNoAttrs :: String -> [Xml] -> Xml
tagNoAttrs name body = Tag name [] body
tagOneBody :: String -> Xml -> Xml
tagOneBody name body = tagNoAttrs name [body]
maybeList :: Maybe a -> [a]
maybeList Nothing = []
maybeList (Just x) = [x]
urlDataToXml :: UrlData -> Xml
urlDataToXml dat = tagNoAttrs "url" $ locTag ++ priorityTag ++ lastModTag ++ changeFreqTag
where locTag = [tagOneBody "loc" $ Data $ url dat]
priorityTag = [tagOneBody "priority" $ Data $ show $ priority dat]
lastModTag = maybeList $ fmap (tagOneBody "lastmod" . Data) $ lastMod dat
changeFreqTag = maybeList $ fmap (tagOneBody "changefreq" . Data . show) $ changeFreq dat
sitemapToXml :: Sitemap -> Xml
sitemapToXml = Tag "urlset" [("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")] . map urlDataToXml
generateSitemapXml :: Sitemap -> String
generateSitemapXml sitemap = xmlComment ++ (renderXml $ sitemapToXml sitemap)
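
For reference (an editor's sketch, not part of the commit): a minimal way to use the module above, building UrlData entries with the exported helpers and rendering them with generateSitemapXml. It assumes the ChangeFreq constructors are exported as in the export list above; the URLs and date are made-up examples.

module SitemapXmlExample where

import SitemapXml

-- Hypothetical two-entry sitemap; example.com and the date are invented for illustration.
exampleSitemap :: String
exampleSitemap = generateSitemapXml
  [ withChangeFreq CFWeekly $ withPriority 8 $ defaultUrlData "https://example.com/"
  , withLastMod "2018-01-22" $ defaultUrlData "https://example.com/about"
  ]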

SitemapXml.hs~ (new file, 27 lines)
@@ -0,0 +1,27 @@
module SitemapXml where
type Sitemap = [UrlData]
data ChangeFreq = CFNever
| CFYearly
| CFMonthly
| CFWeekly
| CFDaily
| CFHourly
| CFAlways
instance Show ChangeFreq where
show CFNever = "never"
show CFYearly = "yearly"
show CFMonthly = "monthly"
show CFWeekly = "weekly"
show CFDaily = "daily"
show CFHourly = "hourly"
show CFAlways = "always"
data UrlData = UrlData
{ url :: String
, lastMod :: Maybe String
, changeFreq :: ChangeFreq
, priority :: Priority
}

rkt/robots.rkt (new file, 59 lines)
@@ -0,0 +1,59 @@
#lang racket
;; /robots.txt specifies how web crawlers should access your website
;; see www.robotstxt.org
; Generate a robots.txt field
(define (robots-field name (body '()))
  (define robots-field-base (string-append name ":"))
  ; every field gets its own line in the generated robots.txt
  (string-append
   (if (null? body) robots-field-base
       (string-append robots-field-base " " body))
   "\n"))
; Generate a user agent line
(define (robots-ua (name "*"))
(robots-field "User-agent" name))
; Generate a bunch of Disallow lines from a list of urls
(define (robots-disallow urls)
  (if (empty? urls) (robots-field "Disallow")
      (string-append*
       (map (lambda (url) (robots-field "Disallow" url))
            urls))))
; Map into and unwrap an optional value: if x present, f x, else d
(define (when-present d f x)
(if (null? x) d
      (f x)))
; Forbid specific urls to a specific bot by user agent
(define (robots-forbidbot bot disallows)
(string-append
(robots-ua bot)
(robots-disallow disallows)))
; Blocks format: (cons (list of global blocks) (list of (cons bot-ua (list of urls))))
(define (robots-config (blocks '())
#:crawl-delay (crawl-delay 10) ; How frequently a bot should access your site-- poorly specified
#:host (host '()) ; The canonical domain for your website
#:sitemap (sitemap '())) ; Your sitemap.xml
(define (field-when-present name value)
(when-present "" ((curry robots-field) name) value))
(define block-lists
(when-present ""
(match-lambda
([cons global rest]
(string-append
; First we have the global disallow rules
(robots-disallow global)
(string-append*
; then a list of the disallow rules for individual bots
(map (match-lambda
([cons bot urls]
(robots-forbidbot bot urls))) rest)))))
blocks))
(string-append
(robots-ua)
block-lists
(robots-field "Crawl-delay" (number->string crawl-delay))
(field-when-present "Sitemap" sitemap)
(field-when-present "Host" host)))

rkt/site.rkt (new file, 15 lines)
@@ -0,0 +1,15 @@
#lang racket
(require "sitemap.rkt")
(require "robots.rkt")
(define base-url "https://lijero.co")
(define (site-page url body #:priority priority #:lastmod lastmod #:changefreq changefreq)
(cons (sitemap-url (string-append base-url url) #:priority priority #:lastmod lastmod #:changefreq changefreq)
(cons url (xexprs body))))
(define (gen-site pages)
  (define sitemap-urls (map (match-lambda [(cons sitemap _) sitemap]) pages))
  (define page-bodies (map (match-lambda [(cons _ page) page]) pages))
  (cons (sitemap sitemap-urls)
        page-bodies))

rkt/sitemap.rkt (new file, 26 lines)
@@ -0,0 +1,26 @@
#lang racket
(require "xexprs/xexprs.rkt")
; A quick sitemap-xml generator (https://www.sitemaps.org/)
; Sitemaps help web crawlers index your website
; Ugly hack because I'm bad at Racket
(define (when-present name value rest)
(if (null? value)
rest
(cons (cons name value) rest)))
; A sitemap URL entry
; https://www.sitemaps.org/protocol.html
(define (sitemap-url loc #:lastmod (lastmod '()) #:changefreq (changefreq '()) #:priority (priority 0.5))
  `(url (loc ,loc)
        (priority ,priority)
,@(when-present "lastmod" lastmod
(when-present "changefreq" changefreq
'()))))
; Generates a sitemap xml
(define (sitemap urls)
(string-append
"<?xml version="1.0" encoding="UTF-8"?>"
(xexprs
`(urlset #:xmlns "http://www.sitemaps.org/schemas/sitemap/0.9"
,@urls))))

robots.rkt~ (new file, 18 lines)
@@ -0,0 +1,18 @@
#lang racket
(define (robots-field name (body '()))
(define robots-field-base (string-append name ":"))
(if (null? body) robots-field-base
(string-append robots-field-base " " body)))
(define (robots-ua (name "*"))
(string-append "User-agent: " name))
(define (robots-disallow list)
(if (empty? list)
"Disallow:"
(define (robots-config (#:crawl-delay 10))
(string-append
(robots-ua)
(

site.rkt~ (new file, 15 lines)
@@ -0,0 +1,15 @@
#lang racket
(require "sitemap.rkt")
(require "robots.rkt")
(define base-url "https://lijero.co")
(define (site-page url body #:priority priority #:lastmod lastmod #:changefreq changefreq)
(cons (sitemap-url (string-append base-url url) #:priority priority #:lastmod lastmod #:changefreq changefreq)
(cons url (xexprs body))))
(define (gen-site pages)
(define sitemap-urls (append* (map (match-lambda ([cons sitemap _] sitemap)) pages)))
(define page-bodies (append* (map (match-lambda ([cons _ page] page)) pages)))
(cons (sitemap sitemap-urls)
(page-bodies)

sitemap.rkt~ (new file, 8 lines)
@@ -0,0 +1,8 @@
#lang racket
(require "xexprs/xexprs.rkt")
(define (sitemap urls)
(string-append
"<?xml version="1.0" encoding="UTF-8"?>"
(xexprs
`(urlset #:xmlns "http://www.sitemaps.org/schemas/sitemap/0.9" ,@urls))))