Add generic template, update README

WeebDataHoarder
2025-04-13 11:50:03 +02:00
parent cb0a3732bd
commit 06e8556d68
3 changed files with 353 additions and 15 deletions

examples/generic.yml (new file, 285 lines)

@@ -0,0 +1,285 @@
# Example cmdline (forward requests from upstream to port :8080)
# $ go-away --bind :8080 --backend site.example.com=http://site:3000 --policy examples/generic.yml --challenge-template anubis
# Define networks to be referenced later in the rules below
networks:
googlebot:
- url: https://developers.google.com/static/search/apis/ipranges/googlebot.json
jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)'
bingbot:
- url: https://www.bing.com/toolbox/bingbot.json
jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)'
qwantbot:
- url: https://help.qwant.com/wp-content/uploads/sites/2/2025/01/qwantbot.json
jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)'
duckduckbot:
- url: https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot/
regex: "<li>(?P<prefix>[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)</li>"
yandexbot:
# todo: detected as bot
# - url: https://yandex.com/ips
# regex: "<span>(?P<prefix>(([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)|([0-9a-f:]+::))/[0-9]+)[ \\\\t]*</span><br/>"
- prefixes:
- "5.45.192.0/18"
- "5.255.192.0/18"
- "37.9.64.0/18"
- "37.140.128.0/18"
- "77.88.0.0/18"
- "84.252.160.0/19"
- "87.250.224.0/19"
- "90.156.176.0/22"
- "93.158.128.0/18"
- "95.108.128.0/17"
- "141.8.128.0/18"
- "178.154.128.0/18"
- "185.32.187.0/24"
- "2a02:6b8::/29"
kagibot:
- url: https://kagi.com/bot
regex: "\\n(?P<prefix>[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+) "
challenges:
js-pow-sha256:
# Asset must be under challenges/{name}/static/{asset}
# Other files here will be available under that path
mode: js
asset: load.mjs
parameters:
difficulty: 15
runtime:
mode: wasm
# The verification asset must be under challenges/{name}/runtime/{asset}
asset: runtime.wasm
probability: 0.02
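# A rough sketch of the proof-of-work implied here (assumed semantics; the
# shipped runtime.wasm is the authoritative verifier): the client brute-forces
# a nonce until the SHA-256 of the challenge data plus nonce starts with
# `difficulty` zero bits, roughly:
#   for nonce := uint64(0); ; nonce++ {
#       if leadingZeroBits(sha256(data, nonce)) >= difficulty { break }  // hypothetical helpers
#   }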
# Challenge via a cookie set on a self-redirect (non-JS, requires HTTP parsing)
self-cookie:
mode: "cookie"
# Challenge via a redirect through a Link header with rel=preload and Early Hints (non-JS, requires HTTP parsing, fetching and logic)
# Works on HTTP/2 and above!
self-preload-link:
condition: '"Sec-Fetch-Mode" in headers && headers["Sec-Fetch-Mode"] == "navigate"'
mode: "preload-link"
runtime:
# verifies that result = key
mode: "key"
probability: 0.1
parameters:
preload-early-hint-deadline: 3s
key-code: 200
key-mime: text/css
key-content: ""
# Challenge via a redirect through the Refresh header (non-JS, requires HTTP parsing and logic)
self-header-refresh:
mode: "header-refresh"
runtime:
# verifies that result = key
mode: "key"
probability: 0.1
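# e.g. a response carrying a header along the lines of
#   Refresh: 0; url=https://site.example.com/original/path?token=...
# (the token shown is illustrative); only clients that honor the header
# re-request the page and complete the key check.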
# Challenge via a redirect through a Refresh meta tag (non-JS, requires HTML parsing and logic)
self-meta-refresh:
mode: "meta-refresh"
runtime:
# verifies that result = key
mode: "key"
probability: 0.1
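# e.g. an interstitial HTML page containing something like
#   <meta http-equiv="refresh" content="0; url=...">
# which only clients that parse the HTML and follow the refresh will pass.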
# Challenge via loading a randomly generated CSS or image document (non-JS, requires HTML parsing and logic)
self-resource-load:
mode: "resource-load"
runtime:
# verifies that result = key
mode: "key"
probability: 0.1
parameters:
key-code: 200
key-mime: text/css
key-content: ""
conditions:
# Conditions are substituted into the rules' AST wherever referenced as ($condition-name); see the expansion example after this section
# Checks to detect a headless chromium via headers only
is-headless-chromium:
- 'userAgent.contains("HeadlessChrome") || userAgent.contains("HeadlessChromium")'
- '"Sec-Ch-Ua" in headers && (headers["Sec-Ch-Ua"].contains("HeadlessChrome") || headers["Sec-Ch-Ua"].contains("HeadlessChromium"))'
#- '(userAgent.contains("Chrome/") || userAgent.contains("Chromium/")) && (!("Accept-Language" in headers) || !("Accept-Encoding" in headers))'
is-generic-browser:
- 'userAgent.startsWith("Mozilla/") || userAgent.startsWith("Opera/")'
is-well-known-asset:
- 'path == "/robots.txt"'
- 'path.startsWith("/.well-known")'
is-static-asset:
- 'path == "/favicon.ico"'
- 'path == "/apple-touch-icon.png"'
- 'path == "/apple-touch-icon-precomposed.png"'
- 'path.matches("\\.(manifest|ttf|woff|woff2|jpg|jpeg|gif|png|webp|avif|svg|mp4|webm|css|js|mjs|wasm)$")'
is-generic-robot-ua:
- 'userAgent.contains("compatible;") && !userAgent.contains("Trident/")'
- 'userAgent.matches("\\+https?://")'
- 'userAgent.contains("@")'
- 'userAgent.matches("[bB]ot/[0-9]")'
is-tool-ua:
- 'userAgent.startsWith("python-requests/")'
- 'userAgent.startsWith("Python-urllib/")'
- 'userAgent.startsWith("python-httpx/")'
- 'userAgent.contains("aoihttp/")'
- 'userAgent.startsWith("http.rb/")'
- 'userAgent.startsWith("curl/")'
- 'userAgent.startsWith("Wget/")'
- 'userAgent.startsWith("libcurl/")'
- 'userAgent.startsWith("okhttp/")'
- 'userAgent.startsWith("Java/")'
- 'userAgent.startsWith("Apache-HttpClient//")'
- 'userAgent.startsWith("Go-http-client/")'
- 'userAgent.startsWith("node-fetch/")'
- 'userAgent.startsWith("reqwest/")'
is-suspicious-crawler:
- 'userAgent.contains("Presto/") || userAgent.contains("Trident/")'
# Old IE browsers
- 'userAgent.matches("MSIE ([2-9]|10|11)\\.")'
# Old Linux browsers
- 'userAgent.contains("Linux i[63]86") || userAgent.contains("FreeBSD i[63]86")'
# Old Windows browsers
- 'userAgent.matches("Windows (3|95|98|CE)") || userAgent.matches("Windows NT [1-5]\\.")'
# Old mobile browsers
- 'userAgent.matches("Android [1-5]\\.") || userAgent.matches("(iPad|iPhone) OS [1-9]_")'
# Old generic browsers
- 'userAgent.startsWith("Opera/")'
#- 'userAgent.matches("Gecko/(201[0-9]|200[0-9])")'
- 'userAgent.matches("^Mozilla/[1-4]")'
# Rules and conditions are evaluated with this environment available
# remoteAddress (net.IP) - Connecting client remote address from headers or properties
# host (string) - HTTP Host
# method (string) - HTTP Method/Verb
# userAgent (string) - HTTP User-Agent header
# path (string) - HTTP request Path
# query (map[string]string) - HTTP request Query arguments
# headers (map[string]string) - HTTP request headers
#
# Additionally these functions are available
# inNetwork(networkName string, address net.IP) bool
# inNetwork(networkCIDR string, address net.IP) bool
# inDNSBL(address net.IP) bool
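# Example of an expression written against this environment:
#   'method == "GET" && "Accept" in headers && inNetwork("192.0.2.0/24", remoteAddress)'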
rules:
- name: allow-well-known-resources
conditions:
- '($is-well-known-asset)'
action: pass
- name: allow-static-resources
conditions:
- '($is-static-asset)'
action: pass
- name: undesired-crawlers
conditions:
- '($is-headless-chromium)'
- 'userAgent.startsWith("Lightpanda/")'
- 'userAgent.startsWith("masscan/")'
# Typo'd opera botnet
- 'userAgent.matches("^Opera/[0-9.]+\\.\\(")'
# AI bullshit stuff, they do not respect robots.txt even while they read it
# TikTok Bytedance AI training
- 'userAgent.contains("Bytedance") || userAgent.contains("Bytespider")'
# Meta AI training; the Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly
- 'userAgent.contains("meta-externalagent/") || userAgent.contains("meta-externalfetcher/") || userAgent.contains("FacebookBot")'
# Anthropic AI training and usage
- 'userAgent.contains("ClaudeBot") || userAgent.contains("Claude-User")|| userAgent.contains("Claude-SearchBot")'
# Common Crawl AI crawlers
- 'userAgent.contains("CCBot")'
# ChatGPT AI crawlers https://platform.openai.com/docs/bots
- 'userAgent.contains("GPTBot") || userAgent.contains("OAI-SearchBot") || userAgent.contains("ChatGPT-User")'
# Other AI crawlers
- 'userAgent.contains("Amazonbot") || userAgent.contains("Google-Extended") || userAgent.contains("PanguBot") || userAgent.contains("AI2Bot") || userAgent.contains("Diffbot") || userAgent.contains("cohere-training-data-crawler") || userAgent.contains("Applebot-Extended")'
# SEO / Ads and marketing
- 'userAgent.contains("BLEXBot")'
action: deny
- name: unknown-crawlers
conditions:
# No user agent set
- 'userAgent == ""'
action: deny
# check a sequence of challenges; see the note after the sequence
- name: suspicious-crawlers/0
conditions: ['($is-suspicious-crawler)']
action: check
challenges: [js-pow-sha256]
- name: suspicious-crawlers/1
conditions: ['($is-suspicious-crawler)']
action: check
challenges: [self-preload-link]
- name: suspicious-crawlers/2
conditions: ['($is-suspicious-crawler)']
action: check
challenges: [self-header-refresh]
- name: suspicious-crawlers/3
conditions: ['($is-suspicious-crawler)']
action: check
challenges: [self-resource-load]
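# (Note on the sequence above: the "check" action appears to enforce its
# challenge and still let evaluation continue to later rules, so a client
# matching ($is-suspicious-crawler) has to clear every challenge in the
# sequence rather than any single one.)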
- name: desired-crawlers
conditions:
- 'userAgent.contains("+https://kagi.com/bot") && inNetwork("kagibot", remoteAddress)'
- '(userAgent.contains("+http://www.google.com/bot.html") || userAgent.contains("Google-InspectionTool") || userAgent.contains("Googlebot")) && inNetwork("googlebot", remoteAddress)'
- 'userAgent.contains("+http://www.bing.com/bingbot.htm") && inNetwork("bingbot", remoteAddress)'
- 'userAgent.contains("+http://duckduckgo.com/duckduckbot.html") && inNetwork("duckduckbot", remoteAddress)'
- 'userAgent.contains("+https://help.qwant.com/bot/") && inNetwork("qwantbot", remoteAddress)'
- 'userAgent.contains("+http://yandex.com/bots") && inNetwork("yandexbot", remoteAddress)'
action: pass
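# (User-Agent strings are trivially forged, so each crawler above is
# double-checked against its published network ranges defined at the top.)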
- name: homesite
conditions:
- 'path == "/"'
action: pass
# check DNSBL and serve harder challenges
- name: undesired-dnsbl
conditions:
- 'inDNSBL(remoteAddress)'
action: check
challenges: [js-pow-sha256]
- name: suspicious-fetchers
action: check
challenges: [js-pow-sha256]
conditions:
- 'userAgent.contains("facebookexternalhit/") || userAgent.contains("facebookcatalog/")'
# Allow PUT/DELETE/PATCH/POST requests in general
- name: non-get-request
action: pass
conditions:
- '!(method == "HEAD" || method == "GET")'
- name: standard-tools
action: challenge
challenges: [self-cookie]
conditions:
- '($is-generic-robot-ua)'
- '($is-tool-ua)'
- '!($is-generic-browser)'
- name: standard-browser
action: challenge
challenges: [self-preload-link, self-meta-refresh, self-resource-load, js-pow-sha256]
conditions:
- '($is-generic-browser)'
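# End-to-end example of how the rules compose: a stock curl request
# (User-Agent "curl/8.x") matches ($is-tool-ua) under standard-tools and is
# served the non-JS self-cookie challenge, while a regular browser UA falls
# through to standard-browser and gets the challenge list above, presumably
# tried in order with js-pow-sha256 as the heaviest fallback.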