59 lines
2.0 KiB
Racket
59 lines
2.0 KiB
Racket
#lang racket
|
|
;; /robots.txt specifies how web crawlers should access your website
|
|
;; see www.robotstxt.org
|
|
|
|
; Generate a single robots.txt field as one newline-terminated line.
;   name : string -- the field name, e.g. "Disallow"
;   body : string -- optional field value; '() (the default) means "no value"
; Returns "name:\n" when body is absent and "name: body\n" otherwise.
; The trailing newline matters: callers assemble the file by concatenating
; fields with string-append, so without it every field would run together
; on a single line.
(define (robots-field name (body '()))
  (define robots-field-base (string-append name ":"))
  (string-append
   (if (null? body)
       robots-field-base
       (string-append robots-field-base " " body))
   "\n"))
|
|
|
|
; Build the "User-agent" field for a crawler.
;   name : string -- the user-agent value to emit; defaults to "*"
; Simply delegates to robots-field with the field name "User-agent".
(define robots-ua
  (lambda ([name "*"])
    (robots-field "User-agent" name)))
|
|
|
|
; Generate the Disallow fields for a list of urls.
;   urls : list of strings -- the paths to disallow
; An empty list yields a single bare "Disallow" field; otherwise the result
; is one "Disallow" field per url, concatenated in list order.
; The argument is bound as `urls` (not `list`) so the built-in `list` is not
; shadowed, and map is given the procedure first and the list second.
(define (robots-disallow urls)
  (if (null? urls)
      (robots-field "Disallow")
      (string-append*
       (map (lambda (url) (robots-field "Disallow" url))
            urls))))
|
|
|
|
; Map into and unwrap an optional value.
;   d : the default returned when x is absent
;   f : a one-argument procedure applied to x when it is present
;   x : the optional value; '() stands for "absent"
; Returns d if x is '(), otherwise (f x).
; f is called directly on x rather than via apply: apply would treat x as an
; argument list, which fails for non-list values such as strings and spreads
; a pair/list across several arguments instead of passing it whole.
(define (when-present d f x)
  (if (null? x)
      d
      (f x)))
|
|
|
|
; Build the text that forbids a set of urls to one specific bot.
;   bot       : passed to robots-ua as the user-agent name
;   disallows : passed to robots-disallow as the list of urls
; Returns the user-agent text immediately followed by the disallow text.
(define (robots-forbidbot bot disallows)
  (let* ([agent-text (robots-ua bot)]
         [rules-text (robots-disallow disallows)])
    (string-append agent-text rules-text)))
|
|
|
|
; Assemble the full robots.txt text.
;
; Blocks format: (cons (list of global blocks) (list of (cons bot-ua (list of urls))))
;   blocks        : '() (the default) for no disallow rules, otherwise a pair
;                   in the format above
;   #:crawl-delay : number, rendered with number->string (default 10).
;                   How frequently a bot should access your site -- poorly specified
;   #:host        : the canonical domain for your website; '() omits the field
;   #:sitemap     : your sitemap.xml; '() omits the field
;
; Output order:
;   1. the default user-agent field from (robots-ua)
;   2. the global Disallow rules
;   3. Crawl-delay
;   4. one user-agent + Disallow record per listed bot
;   5. Sitemap, then Host, each only when supplied
; Crawl-delay is emitted before the per-bot records so that it stays inside
; the group opened by (robots-ua) instead of trailing the last bot's record.
(define (robots-config (blocks '())
                       #:crawl-delay (crawl-delay 10)
                       #:host (host '())
                       #:sitemap (sitemap '()))
  ; Emit the field `name` only when `value` is present, otherwise "".
  (define (field-when-present name value)
    (when-present "" (lambda (v) (robots-field name v)) value))
  ; Disallow rules from the first half of `blocks`.
  (define global-rules
    (when-present ""
                  (match-lambda
                    [(cons global _) (robots-disallow global)])
                  blocks))
  ; One record per (cons bot urls) entry in the second half of `blocks`.
  (define bot-records
    (when-present ""
                  (match-lambda
                    [(cons _ bots)
                     (string-append*
                      (map (match-lambda
                             [(cons bot urls) (robots-forbidbot bot urls)])
                           bots))])
                  blocks))
  (string-append
   (robots-ua)
   global-rules
   (robots-field "Crawl-delay" (number->string crawl-delay))
   bot-records
   (field-when-present "Sitemap" sitemap)
   (field-when-present "Host" host)))
|
|
|