From 20efd8efe7c2181c69fe22152ecaeec43eb36e9c Mon Sep 17 00:00:00 2001
From: Lijero
Date: Mon, 22 Jan 2018 15:58:12 -0800
Subject: [PATCH] flushing out the W.D. so I can start anew

---
 Main.hs~               |  17 +++++++
 RobotsTxt.hs           |  84 +++++++++++++++++++++++++++++++++
 SitemapXml.hs          | 105 +++++++++++++++++++++++++++++++++++++++++
 SitemapXml.hs~         |  27 +++++++++++
 rkt/robots.rkt         |  59 +++++++++++++++++++++++
 rkt/site.rkt           |  15 ++++++
 rkt/sitemap.rkt        |  26 ++++++++++
 web.rkt => rkt/web.rkt |   0
 robots.rkt~            |  18 +++++++
 site.rkt~              |  15 ++++++
 sitemap.rkt~           |   8 ++++
 11 files changed, 374 insertions(+)
 create mode 100644 Main.hs~
 create mode 100644 RobotsTxt.hs
 create mode 100644 SitemapXml.hs
 create mode 100644 SitemapXml.hs~
 create mode 100644 rkt/robots.rkt
 create mode 100644 rkt/site.rkt
 create mode 100644 rkt/sitemap.rkt
 rename web.rkt => rkt/web.rkt (100%)
 create mode 100644 robots.rkt~
 create mode 100644 site.rkt~
 create mode 100644 sitemap.rkt~

diff --git a/Main.hs~ b/Main.hs~
new file mode 100644
index 0000000..3b01ca1
--- /dev/null
+++ b/Main.hs~
@@ -0,0 +1,17 @@
+module Main where
+
+type Seconds = Int
+type DomainName = String
+type Url = (DomainName, String)
+type UserAgent = String
+
+data RobotsTxt = RobotsTxt
+    { crawlDelay :: Seconds
+    , canonicalDomain :: DomainName
+    , sitemapLocation :: Url
+    , globalDisallows :: [Url]
+    , botDisallows :: [(UserAgent, [Url])]
+    }
+
+main :: IO ()
+main = putStrLn ""
diff --git a/RobotsTxt.hs b/RobotsTxt.hs
new file mode 100644
index 0000000..5c1da2c
--- /dev/null
+++ b/RobotsTxt.hs
@@ -0,0 +1,84 @@
+module RobotsTxt ( defaultRobotsTxt
+                 , generateRobotsTxt
+                 , forbidUrl
+                 , forbidBotUrls
+                 , withCrawlDelay
+                 , withCanonicalDomain
+                 , withSitemap
+                 , usingDefaultSitemap
+                 ) where
+
+type Seconds = Int
+type DomainName = String
+type Url = String
+type UserAgent = String
+
+data RobotsTxt = RobotsTxt
+    { crawlDelay :: Seconds
+    , canonicalDomain :: Maybe DomainName
+    , sitemapLocation :: Maybe Url
+    , globalDisallows :: [Url]
+    , botDisallows :: [(UserAgent, [Url])]
+    }
+
+defaultRobotsTxt :: RobotsTxt
+defaultRobotsTxt = RobotsTxt
+    { crawlDelay = 10
+    , canonicalDomain = Nothing
+    , sitemapLocation = Nothing
+    , globalDisallows = []
+    , botDisallows = []
+    }
+
+forbidUrl :: Url -> RobotsTxt -> RobotsTxt
+forbidUrl url cnf = cnf { globalDisallows = url : globalDisallows cnf }
+
+forbidBotUrls :: (UserAgent, [Url]) -> RobotsTxt -> RobotsTxt
+forbidBotUrls forbid cnf = cnf { botDisallows = forbid : botDisallows cnf }
+
+withCrawlDelay :: Seconds -> RobotsTxt -> RobotsTxt
+withCrawlDelay time cnf = cnf { crawlDelay = time }
+
+withCanonicalDomain :: String -> RobotsTxt -> RobotsTxt
+withCanonicalDomain domain cnf = cnf { canonicalDomain = Just domain }
+
+withSitemap :: String -> RobotsTxt -> RobotsTxt
+withSitemap url cnf = cnf { sitemapLocation = Just url }
+
+usingDefaultSitemap :: RobotsTxt -> RobotsTxt
+usingDefaultSitemap cnf = cnf { sitemapLocation = fmap (++ "/sitemap.xml") $ canonicalDomain cnf }
+
+robotsTxtField :: String -> String -> String
+robotsTxtField name value = name ++ ": " ++ value ++ "\n"
+
+robotsTxtEmptyField :: String -> String
+robotsTxtEmptyField name = name ++ ":\n"
+
+globalUserAgent :: String
+globalUserAgent = "*"
+
+userAgentField :: UserAgent -> String
+userAgentField = robotsTxtField "User-agent"
+
+crawlDelayField :: Seconds -> String
+crawlDelayField = robotsTxtField "Crawl-delay" . show
+
+canonicalDomainField :: DomainName -> String
+canonicalDomainField = robotsTxtField "Host"
+
+disallowField :: Url -> String
+disallowField = robotsTxtField "Disallow"
+
+generateDisallowList :: [Url] -> String
+generateDisallowList [] = robotsTxtEmptyField "Disallow"
+generateDisallowList xs = foldl1 (++) $ map disallowField xs
+
+generateBotDisallowList :: (UserAgent, [Url]) -> String
+generateBotDisallowList (bot, urls) = userAgentField bot ++ generateDisallowList urls
+
+generateRobotsTxt :: RobotsTxt -> String
+generateRobotsTxt config = (userAgentField globalUserAgent)
+                        ++ (crawlDelayField $ crawlDelay config)
+                        ++ (maybe "" canonicalDomainField $ canonicalDomain config)
+                        ++ (generateDisallowList $ globalDisallows config)
+                        ++ (concatMap generateBotDisallowList $ botDisallows config)
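
A minimal usage sketch for RobotsTxt.hs; the domain and paths below are placeholders, not part of the patch:

    import RobotsTxt

    -- Hypothetical config built with the record-update helpers, then rendered.
    exampleRobots :: String
    exampleRobots = generateRobotsTxt
                  $ withCrawlDelay 30
                  $ withCanonicalDomain "lijero.co"
                  $ forbidUrl "/drafts/"
                  $ forbidBotUrls ("BadBot", ["/"])
                  $ defaultRobotsTxt
    -- Renders roughly as:
    --   User-agent: *
    --   Crawl-delay: 30
    --   Host: lijero.co
    --   Disallow: /drafts/
    --   User-agent: BadBot
    --   Disallow: /
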
diff --git a/SitemapXml.hs b/SitemapXml.hs
new file mode 100644
index 0000000..c7abcff
--- /dev/null
+++ b/SitemapXml.hs
@@ -0,0 +1,105 @@
+module SitemapXml
+    ( ChangeFreq(..)
+    , mkPriority
+    , defaultPriority
+    , defaultUrlData
+    , withPriority
+    , withLastMod
+    , withChangeFreq
+    , generateSitemapXml
+    ) where
+
+import Text.Printf (printf)
+
+type Sitemap = [UrlData]
+
+data ChangeFreq = CFNever
+                | CFYearly
+                | CFMonthly
+                | CFWeekly
+                | CFDaily
+                | CFHourly
+                | CFAlways
+
+instance Show ChangeFreq where
+    show CFNever = "never"
+    show CFYearly = "yearly"
+    show CFMonthly = "monthly"
+    show CFWeekly = "weekly"
+    show CFDaily = "daily"
+    show CFHourly = "hourly"
+    show CFAlways = "always"
+
+data Priority = Priority Int
+
+instance Show Priority where
+    -- There is probably a better way to do this
+    show (Priority x) = printf "%.1f" (fromIntegral x / 10 :: Float)
+
+mkPriority :: Int -> Priority
+mkPriority x | x >= 0 && x <= 10 = Priority x
+
+defaultPriority :: Priority
+defaultPriority = mkPriority 5
+
+data UrlData = UrlData
+    { url :: String
+    , lastMod :: Maybe String
+    , changeFreq :: Maybe ChangeFreq
+    , priority :: Priority
+    }
+
+defaultUrlData :: String -> UrlData
+defaultUrlData url = UrlData { url = url, lastMod = Nothing, changeFreq = Nothing, priority = defaultPriority }
+
+withPriority :: Int -> UrlData -> UrlData
+withPriority x dat = dat { priority = mkPriority x }
+
+withLastMod :: String -> UrlData -> UrlData
+withLastMod x dat = dat { lastMod = Just x }
+
+withChangeFreq :: ChangeFreq -> UrlData -> UrlData
+withChangeFreq x dat = dat { changeFreq = Just x }
+
+-- I know there are Xml generation libraries, but it's not worth their inclusion
+-- over such a trivial application at this point
+data Xml = Tag String [(String, String)] [Xml] | Data String
+
+renderAttribute :: (String, String) -> String
+renderAttribute (name, value) = name ++ "=\"" ++ value ++ "\""
+
+xmlDeclaration :: String
+xmlDeclaration = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
+
+renderXml :: Xml -> String
+renderXml (Data x) = x
+renderXml (Tag name attrs body) = openTag
+                               ++ renderedBody
+                               ++ closeTag
+    where openTag = "<" ++ name ++ " " ++ attributes ++ ">"
+          attributes = unwords $ map renderAttribute attrs
+          renderedBody = concatMap renderXml body
+          closeTag = "</" ++ name ++ ">"
+
+tagNoAttrs :: String -> [Xml] -> Xml
+tagNoAttrs name body = Tag name [] body
+
+tagOneBody :: String -> Xml -> Xml
+tagOneBody name body = tagNoAttrs name [body]
+
+maybeList :: Maybe a -> [a]
+maybeList Nothing = []
+maybeList (Just x) = [x]
+
+urlDataToXml :: UrlData -> Xml
+urlDataToXml dat = tagNoAttrs "url" $ locTag ++ priorityTag ++ lastModTag ++ changeFreqTag
+    where locTag = [tagOneBody "loc" $ Data $ url dat]
+          priorityTag = [tagOneBody "priority" $ Data $ show $ priority dat]
+          lastModTag = maybeList $ fmap (tagOneBody "lastmod" . Data) $ lastMod dat
+          changeFreqTag = maybeList $ fmap (tagOneBody "changefreq" . Data . show) $ changeFreq dat
+
+sitemapToXml :: Sitemap -> Xml
+sitemapToXml = Tag "urlset" [("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")] . map urlDataToXml
+
+generateSitemapXml :: Sitemap -> String
+generateSitemapXml sitemap = xmlDeclaration ++ (renderXml $ sitemapToXml sitemap)
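
A matching sketch for SitemapXml.hs; the URLs and date are placeholders:

    import SitemapXml

    -- Hypothetical entries built with the record-update helpers, rendered to
    -- the <urlset> document described by sitemaps.org.
    exampleSitemap :: String
    exampleSitemap = generateSitemapXml
        [ withChangeFreq CFWeekly $ withPriority 8 $ defaultUrlData "https://lijero.co/"
        , withLastMod "2018-01-22" $ defaultUrlData "https://lijero.co/about"
        ]
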
diff --git a/SitemapXml.hs~ b/SitemapXml.hs~
new file mode 100644
index 0000000..54b4a07
--- /dev/null
+++ b/SitemapXml.hs~
@@ -0,0 +1,27 @@
+module SitemapXml where
+
+type Sitemap = [UrlData]
+
+data ChangeFreq = CFNever
+                | CFYearly
+                | CFMonthly
+                | CFWeekly
+                | CFDaily
+                | CFHourly
+                | CFAlways
+
+instance Show ChangeFreq where
+    show CFNever = "never"
+    show CFYearly = "yearly"
+    show CFMonthly = "monthly"
+    show CFWeekly = "weekly"
+    show CFDaily = "daily"
+    show CFHourly = "hourly"
+    show CFAlways = "always"
+
+data UrlData = UrlData
+    { url :: String
+    , lastMod :: Maybe String
+    , changeFreq :: ChangeFreq
+    , priority :: Priority
+    }
diff --git a/rkt/robots.rkt b/rkt/robots.rkt
new file mode 100644
index 0000000..0c2b908
--- /dev/null
+++ b/rkt/robots.rkt
@@ -0,0 +1,59 @@
+#lang racket
+;; /robots.txt specifies how web crawlers should access your website
+;; see www.robotstxt.org
+
+; Generate a robots.txt field, terminated with a newline
+(define (robots-field name (body '()))
+  (define robots-field-base (string-append name ":"))
+  (if (null? body) (string-append robots-field-base "\n")
+      (string-append robots-field-base " " body "\n")))
+
+; Generate a user agent line
+(define (robots-ua (name "*"))
+  (robots-field "User-agent" name))
+
+; Generate a bunch of Disallow lines from a list of urls
+(define (robots-disallow urls)
+  (if (empty? urls) (robots-field "Disallow")
+      (string-append*
+       (map (lambda (url) (robots-field "Disallow" url))
+            urls))))
+
+; Map into and unwrap an optional value: if x present, f x, else d
+(define (when-present d f x)
+  (if (null? x) d
+      (f x)))
+
+; Forbid specific urls to a specific bot by user agent
+(define (robots-forbidbot bot disallows)
+  (string-append
+   (robots-ua bot)
+   (robots-disallow disallows)))
+
+; Blocks format: (cons (list of global blocks) (list of (cons bot-ua (list of urls))))
+(define (robots-config (blocks '())
+                       #:crawl-delay (crawl-delay 10) ; How frequently a bot should access your site-- poorly specified
+                       #:host (host '()) ; The canonical domain for your website
+                       #:sitemap (sitemap '())) ; Your sitemap.xml
+  (define (field-when-present name value)
+    (when-present "" ((curry robots-field) name) value))
+  (define block-lists
+    (when-present ""
+                  (match-lambda
+                    ([cons global rest]
+                     (string-append
+                      ; First we have the global disallow rules
+                      (robots-disallow global)
+                      (string-append*
+                       ; then a list of the disallow rules for individual bots
+                       (map (match-lambda
+                              ([cons bot urls]
+                               (robots-forbidbot bot urls))) rest)))))
+                  blocks))
+  (string-append
+   (robots-ua)
+   block-lists
+   (robots-field "Crawl-delay" (number->string crawl-delay))
+   (field-when-present "Sitemap" sitemap)
+   (field-when-present "Host" host)))
+
\ No newline at end of file
diff --git a/rkt/site.rkt b/rkt/site.rkt
new file mode 100644
index 0000000..82acf87
--- /dev/null
+++ b/rkt/site.rkt
@@ -0,0 +1,15 @@
+#lang racket
+(require "sitemap.rkt")
+(require "robots.rkt")
+
+(define base-url "https://lijero.co")
+
+(define (site-page url body #:priority priority #:lastmod lastmod #:changefreq changefreq)
+  (cons (sitemap-url (string-append base-url url) #:priority priority #:lastmod lastmod #:changefreq changefreq)
+        (cons url (xexprs body))))
+
+(define (gen-site pages)
+  (define sitemap-urls (append* (map (match-lambda ([cons sitemap _] sitemap)) pages)))
+  (define page-bodies (append* (map (match-lambda ([cons _ page] page)) pages)))
+  (cons (sitemap sitemap-urls)
+        page-bodies))
diff --git a/rkt/sitemap.rkt b/rkt/sitemap.rkt
new file mode 100644
index 0000000..5dbdf4c
--- /dev/null
+++ b/rkt/sitemap.rkt
@@ -0,0 +1,26 @@
+#lang racket
+(require "xexprs/xexprs.rkt")
+; A quick sitemap-xml generator (https://www.sitemaps.org/)
+; Sitemaps help web crawlers index your website
+
+; Ugly hack because I'm bad at Racket
+(define (when-present name value rest)
+  (if (null? value)
+      rest
+      (cons (cons name value) rest)))
+
+; A sitemap URL entry
+; https://www.sitemaps.org/protocol.html
+(define (sitemap-url loc #:lastmod (lastmod '()) #:changefreq (changefreq '()) #:priority (priority 0.5))
+  `(url (loc ,loc) (priority ,priority)
+        ,@(when-present "lastmod" lastmod
+                        (when-present "changefreq" changefreq
+                                      '()))))
+
+; Generates a sitemap xml
+(define (sitemap urls)
+  (string-append
+   "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
+   (xexprs
+    `(urlset #:xmlns "http://www.sitemaps.org/schemas/sitemap/0.9"
+             ,@urls))))
\ No newline at end of file
diff --git a/web.rkt b/rkt/web.rkt
similarity index 100%
rename from web.rkt
rename to rkt/web.rkt
diff --git a/robots.rkt~ b/robots.rkt~
new file mode 100644
index 0000000..4a19d98
--- /dev/null
+++ b/robots.rkt~
@@ -0,0 +1,18 @@
+#lang racket
+(define (robots-field name (body '()))
+  (define robots-field-base (string-append name ":"))
+  (if (nil? body) robots-field-base
+      (string-append robots-field-base " " body)))
+
+(define (robots-ua (name "*"))
+  (string-append "User-agent: " name))
+
+(define (robots-disallow list)
+  (if (empty? list)
+      "Disallow:"
+
+(define (robots-config (#:crawl-delay 10))
+  (string-append
+   (robots-ua)
+   (
+
\ No newline at end of file
diff --git a/site.rkt~ b/site.rkt~
new file mode 100644
index 0000000..420f6e3
--- /dev/null
+++ b/site.rkt~
@@ -0,0 +1,15 @@
+#lang racket
+(require "sitemap.rkt")
+(require "robots.rkt")
+
+(define base-url "https://lijero.co")
+
+(define (site-page url body #:priority priority #:lastmod lastmod #:changefreq changefreq)
+  (cons (sitemap-url (string-append base-url url) #:priority priority #:lastmod lastmod #:changefreq changefreq)
+        (cons url (xexprs body))))
+
+(define (gen-site pages)
+  (define sitemap-urls (append* (map (match-lambda ([cons sitemap _] sitemap)) pages)))
+  (define page-bodies (append* (map (match-lambda ([cons _ page] page)) pages)))
+  (cons (sitemap sitemap-urls)
+        (page-bodies)
\ No newline at end of file
diff --git a/sitemap.rkt~ b/sitemap.rkt~
new file mode 100644
index 0000000..27342aa
--- /dev/null
+++ b/sitemap.rkt~
@@ -0,0 +1,8 @@
+#lang racket
+(require "xexprs/xexprs.rkt")
+
+(define (sitemap urls)
+  (string-append
+   ""
+   (xexprs
+    `(urlset #:xmlns "http://www.sitemaps.org/schemas/sitemap/0.9" ,@urls))))
\ No newline at end of file
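
rkt/site.rkt above sketches the per-site wiring on the Racket side: each page is paired with its sitemap entry and the whole set is rendered in one go. A rough Haskell equivalent of that wiring, using the two modules from this patch; the page list and output file names are assumptions:

    import RobotsTxt
    import SitemapXml

    baseUrl :: String
    baseUrl = "https://lijero.co"

    -- Hypothetical page list; rkt/site.rkt derives the same information per page.
    pagePaths :: [String]
    pagePaths = ["/", "/about"]

    main :: IO ()
    main = do
        writeFile "robots.txt" $ generateRobotsTxt $ withCanonicalDomain baseUrl defaultRobotsTxt
        writeFile "sitemap.xml" $ generateSitemapXml $ map (defaultUrlData . (baseUrl ++)) pagePaths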