#lang racket
;; /robots.txt specifies how web crawlers should access your website
;; see www.robotstxt.org
; Generate a robots.txt field, terminated by a newline
(define (robots-field name (body '()))
  (define robots-field-base (string-append name ":"))
  (if (null? body)
      (string-append robots-field-base "\n")
      (string-append robots-field-base " " body "\n")))
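;; e.g. (robots-field "User-agent" "*") ;=> "User-agent: *\n"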
; Generate a User-agent line
(define (robots-ua (name "*"))
  (robots-field "User-agent" name))
; Generate a Disallow line for each url in a list; an empty list
; yields a bare "Disallow:" line, which allows everything
(define (robots-disallow urls)
  (if (empty? urls)
      (robots-field "Disallow")
      (string-append*
       (map (lambda (url) (robots-field "Disallow" url)) urls))))
; Unwrap an optional value: if x is present, apply f to it, else return d
(define (when-present d f x)
  (if (null? x) d (f x)))
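;; e.g. (when-present "" string-upcase '())  ;=> ""
;;      (when-present "" string-upcase "ok") ;=> "OK"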
; Forbid specific urls to a specific bot by user agent
(define (robots-forbidbot bot disallows)
  (string-append
   (robots-ua bot)
   (robots-disallow disallows)))
; Blocks format: (cons (list of globally disallowed urls)
;                      (list of (cons bot-ua (list of urls))))
; (see the usage example at the end of this file)
(define (robots-config (blocks '())
                       #:crawl-delay (crawl-delay 10) ; Seconds a bot should wait between requests (non-standard; loosely specified)
                       #:host (host '())              ; The canonical domain for your website
                       #:sitemap (sitemap '()))       ; The url of your sitemap.xml
  (define (field-when-present name value)
    (when-present "" (lambda (body) (robots-field name body)) value))
  (define block-lists
    (when-present ""
      (match-lambda
        [(cons global rest)
         (string-append
          ; First we have the global disallow rules,
          (robots-disallow global)
          (string-append*
           ; then a list of the disallow rules for individual bots
           (map (match-lambda
                  [(cons bot urls) (robots-forbidbot bot urls)])
                rest)))])
      blocks))
  (string-append
   (robots-ua)
   block-lists
   (robots-field "Crawl-delay" (number->string crawl-delay))
   (field-when-present "Sitemap" sitemap)
   (field-when-present "Host" host)))