# Example cmdline (forward requests from upstream to port :8080)
# $ go-away --bind :8080 --backend site.example.com=http://site:3000 --policy examples/generic.yml --challenge-template anubis

# Define networks to be referenced by the rules below
networks:
  googlebot:
    - url: https://developers.google.com/static/search/apis/ipranges/googlebot.json
      jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)'
  bingbot:
    - url: https://www.bing.com/toolbox/bingbot.json
      jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)'
  qwantbot:
    - url: https://help.qwant.com/wp-content/uploads/sites/2/2025/01/qwantbot.json
      jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)'
  duckduckbot:
    - url: https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
      regex: "\n  • (?P<prefix>[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)\n  • "
  yandexbot:
    # todo: detected as bot
    # - url: https://yandex.com/ips
    #   regex: "(?P<prefix>(([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)|([0-9a-f:]+::))/[0-9]+)[ \\t]*\n"
    - prefixes:
        - "5.45.192.0/18"
        - "5.255.192.0/18"
        - "37.9.64.0/18"
        - "37.140.128.0/18"
        - "77.88.0.0/18"
        - "84.252.160.0/19"
        - "87.250.224.0/19"
        - "90.156.176.0/22"
        - "93.158.128.0/18"
        - "95.108.128.0/17"
        - "141.8.128.0/18"
        - "178.154.128.0/18"
        - "185.32.187.0/24"
        - "2a02:6b8::/29"
  kagibot:
    - url: https://kagi.com/bot
      regex: "\\n(?P<prefix>[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+) "
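# The network entries above use three source styles: a JSON document filtered with
# jq-path, a plain page scraped with a regex (the named capture group marks each
# address), and a static prefixes list. All of them resolve to CIDR ranges that
# rules can test with inNetwork("<name>", remoteAddress). A minimal sketch for a
# hypothetical crawler (the name, URL and range below are illustrative only, not
# part of this policy):
#
#  examplebot:
#    - url: https://bots.example.com/ip-ranges.txt
#      regex: "(?P<prefix>[0-9a-f.:]+/[0-9]+)"
#    - prefixes:
#        - "203.0.113.0/24"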
challenges:
  js-pow-sha256:
    # Asset must be under challenges/{name}/static/{asset}
    # Other files here will be available under that path
    mode: js
    asset: load.mjs
    parameters:
      difficulty: 15
    runtime:
      mode: wasm
      # Verify must be under challenges/{name}/runtime/{asset}
      asset: runtime.wasm
      probability: 0.02

  # Challenges with a cookie, self redirect (non-JS, requires HTTP parsing)
  self-cookie:
    mode: "cookie"

  # Challenges with a redirect via Link header with rel=preload and early hints (non-JS, requires HTTP parsing, fetching and logic)
  # Works on HTTP/2 and above!
  self-preload-link:
    condition: '"Sec-Fetch-Mode" in headers && headers["Sec-Fetch-Mode"] == "navigate"'
    mode: "preload-link"
    runtime:
      # verifies that result = key
      mode: "key"
      probability: 0.1
    parameters:
      preload-early-hint-deadline: 3s
      key-code: 200
      key-mime: text/css
      key-content: ""

  # Challenges with a redirect via Refresh header (non-JS, requires HTTP parsing and logic)
  self-header-refresh:
    mode: "header-refresh"
    runtime:
      # verifies that result = key
      mode: "key"
      probability: 0.1

  # Challenges with a redirect via Refresh meta (non-JS, requires HTML parsing and logic)
  self-meta-refresh:
    mode: "meta-refresh"
    runtime:
      # verifies that result = key
      mode: "key"
      probability: 0.1

  # Challenges with loading a random CSS or image document (non-JS, requires HTML parsing and logic)
  self-resource-load:
    mode: "resource-load"
    runtime:
      # verifies that result = key
      mode: "key"
      probability: 0.1
    parameters:
      key-code: 200
      key-mime: text/css
      key-content: ""
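# For js-pow-sha256 above, the asset comments presumably correspond to an on-disk
# layout like the following (a sketch of the expected tree, not extra settings):
#
#   challenges/js-pow-sha256/static/load.mjs        <- "asset", exposed under the challenge path
#   challenges/js-pow-sha256/runtime/runtime.wasm   <- "runtime.asset", used to verify results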
# Conditions get substituted into the rules' AST where ($condition-name) appears
conditions:
  # Checks to detect a headless Chromium via headers only
  is-headless-chromium:
    - 'userAgent.contains("HeadlessChrome") || userAgent.contains("HeadlessChromium")'
    - '"Sec-Ch-Ua" in headers && (headers["Sec-Ch-Ua"].contains("HeadlessChrome") || headers["Sec-Ch-Ua"].contains("HeadlessChromium"))'
    #- '(userAgent.contains("Chrome/") || userAgent.contains("Chromium/")) && (!("Accept-Language" in headers) || !("Accept-Encoding" in headers))'

  is-generic-browser:
    - 'userAgent.startsWith("Mozilla/") || userAgent.startsWith("Opera/")'

  is-well-known-asset:
    - 'path == "/robots.txt"'
    - 'path.startsWith("/.well-known")'

  is-static-asset:
    - 'path == "/favicon.ico"'
    - 'path == "/apple-touch-icon.png"'
    - 'path == "/apple-touch-icon-precomposed.png"'
    - 'path.matches("\\.(manifest|ttf|woff|woff2|jpg|jpeg|gif|png|webp|avif|svg|mp4|webm|css|js|mjs|wasm)$")'

  is-generic-robot-ua:
    - 'userAgent.contains("compatible;") && !userAgent.contains("Trident/")'
    - 'userAgent.matches("\\+https?://")'
    - 'userAgent.contains("@")'
    - 'userAgent.matches("[bB]ot/[0-9]")'

  is-tool-ua:
    - 'userAgent.startsWith("python-requests/")'
    - 'userAgent.startsWith("Python-urllib/")'
    - 'userAgent.startsWith("python-httpx/")'
    - 'userAgent.contains("aiohttp/")'
    - 'userAgent.startsWith("http.rb/")'
    - 'userAgent.startsWith("curl/")'
    - 'userAgent.startsWith("Wget/")'
    - 'userAgent.startsWith("libcurl/")'
    - 'userAgent.startsWith("okhttp/")'
    - 'userAgent.startsWith("Java/")'
    - 'userAgent.startsWith("Apache-HttpClient/")'
    - 'userAgent.startsWith("Go-http-client/")'
    - 'userAgent.startsWith("node-fetch/")'
    - 'userAgent.startsWith("reqwest/")'
  is-suspicious-crawler:
    - 'userAgent.contains("Presto/") || userAgent.contains("Trident/")'
    # Old IE browsers
    - 'userAgent.matches("MSIE ([2-9]|10|11)\\.")'
    # Old Linux browsers
    - 'userAgent.matches("Linux i[63]86") || userAgent.matches("FreeBSD i[63]86")'
    # Old Windows browsers
    - 'userAgent.matches("Windows (3|95|98|CE)") || userAgent.matches("Windows NT [1-5]\\.")'
    # Old mobile browsers
    - 'userAgent.matches("Android [1-5]\\.") || userAgent.matches("(iPad|iPhone) OS [1-9]_")'
    # Old generic browsers
    - 'userAgent.startsWith("Opera/")'
    #- 'userAgent.matches("Gecko/(201[0-9]|200[0-9])")'
    - 'userAgent.matches("^Mozilla/[1-4]")'
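# A named condition is substituted wherever a rule condition reads ($condition-name).
# A sketch of the expansion (assuming a condition's listed entries combine as a
# logical OR, which is how the multi-entry rules below read): a rule condition
#   - '($is-well-known-asset)'
# behaves roughly like the inline expression
#   - 'path == "/robots.txt" || path.startsWith("/.well-known")'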
rules:
  - name: allow-well-known-resources
    conditions:
      - '($is-well-known-asset)'
    action: pass

  - name: allow-static-resources
    conditions:
      - '($is-static-asset)'
    action: pass

  - name: undesired-crawlers
    conditions:
      - '($is-headless-chromium)'
      - 'userAgent.startsWith("Lightpanda/")'
      - 'userAgent.startsWith("masscan/")'
      # Typo'd Opera botnet
      - 'userAgent.matches("^Opera/[0-9.]+\\.\\(")'
      # AI bullshit stuff, they do not respect robots.txt even while they read it
      # TikTok Bytedance AI training
      - 'userAgent.contains("Bytedance") || userAgent.contains("Bytespider")'
      # Meta AI training; the Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly.
      - 'userAgent.contains("meta-externalagent/") || userAgent.contains("meta-externalfetcher/") || userAgent.contains("FacebookBot")'
      # Anthropic AI training and usage
      - 'userAgent.contains("ClaudeBot") || userAgent.contains("Claude-User") || userAgent.contains("Claude-SearchBot")'
      # Common Crawl AI crawlers
      - 'userAgent.contains("CCBot")'
      # ChatGPT AI crawlers https://platform.openai.com/docs/bots
      - 'userAgent.contains("GPTBot") || userAgent.contains("OAI-SearchBot") || userAgent.contains("ChatGPT-User")'
      # Other AI crawlers
      - 'userAgent.contains("Amazonbot") || userAgent.contains("Google-Extended") || userAgent.contains("PanguBot") || userAgent.contains("AI2Bot") || userAgent.contains("Diffbot") || userAgent.contains("cohere-training-data-crawler") || userAgent.contains("Applebot-Extended")'
      # SEO / Ads and marketing
      - 'userAgent.contains("BLEXBot")'
    action: deny

  - name: unknown-crawlers
    conditions:
      # No user agent set
      - 'userAgent == ""'
    action: deny

  # check a sequence of challenges
  - name: suspicious-crawlers/0
    conditions: ['($is-suspicious-crawler)']
    action: check
    challenges: [js-pow-sha256]
  - name: suspicious-crawlers/1
    conditions: ['($is-suspicious-crawler)']
    action: check
    challenges: [self-preload-link]
  - name: suspicious-crawlers/2
    conditions: ['($is-suspicious-crawler)']
    action: check
    challenges: [self-header-refresh]
  - name: suspicious-crawlers/3
    conditions: ['($is-suspicious-crawler)']
    action: check
    challenges: [self-resource-load]

  - name: desired-crawlers
    conditions:
      - 'userAgent.contains("+https://kagi.com/bot") && inNetwork("kagibot", remoteAddress)'
      - '(userAgent.contains("+http://www.google.com/bot.html") || userAgent.contains("Google-InspectionTool") || userAgent.contains("Googlebot")) && inNetwork("googlebot", remoteAddress)'
      - 'userAgent.contains("+http://www.bing.com/bingbot.htm") && inNetwork("bingbot", remoteAddress)'
      - 'userAgent.contains("+http://duckduckgo.com/duckduckbot.html") && inNetwork("duckduckbot", remoteAddress)'
      - 'userAgent.contains("+https://help.qwant.com/bot/") && inNetwork("qwantbot", remoteAddress)'
      - 'userAgent.contains("+http://yandex.com/bots") && inNetwork("yandexbot", remoteAddress)'
    action: pass

  - name: homesite
    conditions:
      - 'path == "/"'
    action: pass

  # check DNSBL and serve harder challenges
  - name: undesired-dnsbl
    conditions:
      - 'inDNSBL(remoteAddress)'
    action: check
    challenges: [js-pow-sha256]

  - name: suspicious-fetchers
    action: check
    challenges: [js-pow-sha256]
    conditions:
      - 'userAgent.contains("facebookexternalhit/") || userAgent.contains("facebookcatalog/")'

  # Allow PUT/DELETE/PATCH/POST requests in general
  - name: non-get-request
    action: pass
    conditions:
      - '!(method == "HEAD" || method == "GET")'

  - name: plaintext-browser
    action: challenge
    challenges: [self-meta-refresh, self-cookie]
    conditions:
      - 'userAgent.startsWith("Lynx/")'

  - name: standard-tools
    action: challenge
    challenges: [self-cookie]
    conditions:
      - '($is-generic-robot-ua)'
      - '($is-tool-ua)'
      - '!($is-generic-browser)'

  - name: standard-browser
    action: challenge
    challenges: [self-preload-link, self-meta-refresh, self-resource-load, js-pow-sha256]
    conditions:
      - '($is-generic-browser)'
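# Quick manual checks against the example cmdline at the top of this file
# (a sketch; assumes rules are evaluated in the order listed and that your
# address is not on a DNSBL):
#
#   curl -i http://localhost:8080/robots.txt   # ($is-well-known-asset) -> allow-well-known-resources -> passed to the backend
#   curl -i http://localhost:8080/some/page    # default "curl/" UA matches ($is-tool-ua) -> standard-tools -> self-cookie challenge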