diff --git a/README.md b/README.md index 51b9953..6da6d06 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,44 @@ Self-hosted abuse detection and rule enforcement against low-effort mass AI scra This documentation is a work in progress. For now, see policy examples under [examples/](examples/). +This Go package can be used as a command on `git.gammaspectra.live/git/go-away/cmd/go-away` or a library under `git.gammaspectra.live/git/go-away/lib` + + +## Support + +If you have some suggestion or issue, feel free to open a [New Issue](https://git.gammaspectra.live/git/go-away/issues/new) on the repository. + +[Pull Requests](https://git.gammaspectra.live/git/go-away/pulls) are encouraged and desired. + +For real-time chat and other support join IRC on [##go-away](ircs://irc.libera.chat/##go-away) on Libera.Chat. The channel may not be monitored at all times, feel free to ping the operators there. + + +## Example policies + +### Forgejo + +The policy file at [examples/forgejo.yml](examples/forgejo.yml) provides a ready template to be used on your own Forgejo instance. + +Important notes: +* Edit the `homesite` rule, as it's targeted to common users or orgs on the instance. A better regex might be possible in the future. +* Edit the `http-cookie-check` challenge, as this will fetch the listed backend with the given session cookie to check for user login. +* Adjust the desired blocked networks or others. A template list of network ranges is provided, feel free to remove these if not needed. +* Check the conditions and base rules to change your challenges offered and other ordering. +* By default Googlebot / Bingbot / DuckDuckBot / Kagibot / Qwantbot / Yandexbot are allowed by useragent and network ranges. + +### Generic + +The policy file at [examples/generic.yml](examples/forgejo.yml) provides a baseline to place on any site, that can be modified to fit your needs. + +Important notes: +* Edit the `homesite` rule, as it's targeted to pages you always want to have available, like landing pages. +* Edit the `is-static-asset` condition or the `allow-static-resources` rule to allow static file access as necessary. +* If you have an API, add a PASS rule targeting it. +* Check the conditions and base rules to change your challenges offered and other ordering. +* Add or modify rules to target specific pages on your site as desired. +* By default Googlebot / Bingbot / DuckDuckBot / Kagibot / Qwantbot / Yandexbot are allowed by useragent and network ranges. + + ## Setup It is recommended to have another reverse proxy above (for example [Caddy](https://caddyserver.com/), nginx, HAProxy) to handle HTTPs or similar. @@ -15,7 +53,7 @@ go-away for now only accepts plaintext connections, although it can take _HTTP/2 ### Binary / Go -Requires Go 1.24+. Builds statically without CGo. +Requires Go 1.24+. Builds statically without CGo usage. ```shell git clone https://git.gammaspectra.live/git/go-away.git && cd go-away @@ -36,7 +74,16 @@ Available under [Dockerfile](Dockerfile). See the _docker compose_ below for the ### docker compose +Example follows a hypothetical Forgejo server running on `http://forgejo:3000` serving `git.example.com` + ```yaml +networks: + forgejo: + external: false + +volumes: + goaway_cache: + services: go-away: image: git.gammaspectra.live/git/go-away:latest @@ -48,12 +95,24 @@ services: depends_on: - forgejo volumes: + - "goaway_cache:/cache" - "./examples/forgejo.yml:/policy.yml:ro" environment: #GOAWAY_BIND: ":8080" + # Supported tcp, unix, and proxy (for enabling PROXY module for request unwrapping) #GOAWAY_BIND_NETWORK: "tcp" #GOAWAY_SOCKET_MODE: "0770" + # set to letsencrypt or other directory URL to enable HTTPS. Above ports will be TLS only. + # enables request JA3N / JA4 client TLS fingerprinting + # TLS fingerprints are served on X-TLS-Fingerprint-JA3N and X-TLS-Fingerprint-JA4 headers + # TLS fingerprints can be matched against on CEL conditions + #GOAWAY_ACME_AUTOCERT: "" + + # Cache path for several services like certificates and caching network ranges + # Can be semi-ephemeral, recommended to be mapped to a permanent volume + #GOAWAY_CACHE="/cache" + # default is WARN, set to INFO to also see challenge successes and others #GOAWAY_SLOG_LEVEL: "INFO" @@ -64,8 +123,13 @@ services: # HTTP header that the client ip will be fetched from # Defaults to the connection ip itself, if set here make sure your upstream proxy sets this properly # Usually X-Forwarded-For is a good pick + # Not necessary with GOAWAY_BIND_NETWORK: proxy GOAWAY_CLIENT_IP_HEADER: "X-Real-Ip" + # HTTP header that go-away will set the obtained ip will be set to + # If left empty, the header on GOAWAY_CLIENT_IP_HEADER will be left as-is + #GOAWAY_BACKEND_IP_HEADER: "" + GOAWAY_POLICY: "/policy.yml" # Template, and theme for the template to pick. defaults to an anubis-like one @@ -81,22 +145,12 @@ services: # additional backends can be specified via more command arguments # command: ["--backend", "ci.example.com=http://ci:3000"] + forgejo: + # etc. + ``` -## Example policies - -### Forgejo - -The policy file at [examples/forgejo.yml](examples/forgejo.yml) provides a ready template to be used on your own Forgejo instance. - -Important notes: -* Edit the `homesite` rule, as it's targeted to common users or orgs on the instance. A better regex might be possible in the future. -* Edit the `http-cookie-check` challenge, as this will fetch the listed backend with the given session cookie to check for user login. -* Adjust the desired blocked networks or others. A template list of network ranges is provided, feel free to remove these if not needed. -* Check the conditions and base rules to change your challenges offered and other ordering. - - ## Development ### Compiling WASM runtime challenge modules diff --git a/examples/forgejo.yml b/examples/forgejo.yml index a7d02f7..4e3e142 100644 --- a/examples/forgejo.yml +++ b/examples/forgejo.yml @@ -93,7 +93,6 @@ networks: regex: "\\n(?P[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+) " -# todo: define interface challenges: js-pow-sha256: # Asset must be under challenges/{name}/static/{asset} diff --git a/examples/generic.yml b/examples/generic.yml new file mode 100644 index 0000000..c38afd0 --- /dev/null +++ b/examples/generic.yml @@ -0,0 +1,285 @@ +# Example cmdline (forward requests from upstream to port :8080) +# $ go-away --bind :8080 --backend site.example.com=http://site:3000 --policy examples/generic.yml --challenge-template anubis + + + +# Define networks to be used later below +networks: + + googlebot: + - url: https://developers.google.com/static/search/apis/ipranges/googlebot.json + jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)' + bingbot: + - url: https://www.bing.com/toolbox/bingbot.json + jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)' + qwantbot: + - url: https://help.qwant.com/wp-content/uploads/sites/2/2025/01/qwantbot.json + jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)' + duckduckbot: + - url: https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot/ + regex: "
  • (?P[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)
  • " + yandexbot: + # todo: detected as bot + # - url: https://yandex.com/ips + # regex: "(?P(([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)|([0-9a-f:]+::))/[0-9]+)[ \\\\t]*
    " + - prefixes: + - "5.45.192.0/18" + - "5.255.192.0/18" + - "37.9.64.0/18" + - "37.140.128.0/18" + - "77.88.0.0/18" + - "84.252.160.0/19" + - "87.250.224.0/19" + - "90.156.176.0/22" + - "93.158.128.0/18" + - "95.108.128.0/17" + - "141.8.128.0/18" + - "178.154.128.0/18" + - "185.32.187.0/24" + - "2a02:6b8::/29" + kagibot: + - url: https://kagi.com/bot + regex: "\\n(?P[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+) " + + +challenges: + js-pow-sha256: + # Asset must be under challenges/{name}/static/{asset} + # Other files here will be available under that path + mode: js + asset: load.mjs + parameters: + difficulty: 15 + runtime: + mode: wasm + # Verify must be under challenges/{name}/runtime/{asset} + asset: runtime.wasm + probability: 0.02 + + # Challenges with a cookie, self redirect (non-JS, requires HTTP parsing) + self-cookie: + mode: "cookie" + + + # Challenges with a redirect via Link header with rel=preload and early hints (non-JS, requires HTTP parsing, fetching and logic) + # Works on HTTP/2 and above! + self-preload-link: + condition: '"Sec-Fetch-Mode" in headers && headers["Sec-Fetch-Mode"] == "navigate"' + mode: "preload-link" + runtime: + # verifies that result = key + mode: "key" + probability: 0.1 + parameters: + preload-early-hint-deadline: 3s + key-code: 200 + key-mime: text/css + key-content: "" + + # Challenges with a redirect via Refresh header (non-JS, requires HTTP parsing and logic) + self-header-refresh: + mode: "header-refresh" + runtime: + # verifies that result = key + mode: "key" + probability: 0.1 + + # Challenges with a redirect via Refresh meta (non-JS, requires HTML parsing and logic) + self-meta-refresh: + mode: "meta-refresh" + runtime: + # verifies that result = key + mode: "key" + probability: 0.1 + + # Challenges with loading a random CSS or image document (non-JS, requires HTML parsing and logic) + self-resource-load: + mode: "resource-load" + runtime: + # verifies that result = key + mode: "key" + probability: 0.1 + parameters: + key-code: 200 + key-mime: text/css + key-content: "" + +conditions: + # Conditions will get replaced on rules AST when found as ($condition-name) + # Checks to detect a headless chromium via headers only + is-headless-chromium: + - 'userAgent.contains("HeadlessChrome") || userAgent.contains("HeadlessChromium")' + - '"Sec-Ch-Ua" in headers && (headers["Sec-Ch-Ua"].contains("HeadlessChrome") || headers["Sec-Ch-Ua"].contains("HeadlessChromium"))' + #- '(userAgent.contains("Chrome/") || userAgent.contains("Chromium/")) && (!("Accept-Language" in headers) || !("Accept-Encoding" in headers))' + + is-generic-browser: + - 'userAgent.startsWith("Mozilla/") || userAgent.startsWith("Opera/")' + + is-well-known-asset: + - 'path == "/robots.txt"' + - 'path.startsWith("/.well-known")' + + is-static-asset: + - 'path == "/favicon.ico"' + - 'path == "/apple-touch-icon.png"' + - 'path == "/apple-touch-icon-precomposed.png"' + - 'path.matches("\\.(manifest|ttf|woff|woff2|jpg|jpeg|gif|png|webp|avif|svg|mp4|webm|css|js|mjs|wasm)$")' + + + is-generic-robot-ua: + - 'userAgent.contains("compatible;") && !userAgent.contains("Trident/")' + - 'userAgent.matches("\\+https?://")' + - 'userAgent.contains("@")' + - 'userAgent.matches("[bB]ot/[0-9]")' + + is-tool-ua: + - 'userAgent.startsWith("python-requests/")' + - 'userAgent.startsWith("Python-urllib/")' + - 'userAgent.startsWith("python-httpx/")' + - 'userAgent.contains("aoihttp/")' + - 'userAgent.startsWith("http.rb/")' + - 'userAgent.startsWith("curl/")' + - 'userAgent.startsWith("Wget/")' + - 'userAgent.startsWith("libcurl/")' + - 'userAgent.startsWith("okhttp/")' + - 'userAgent.startsWith("Java/")' + - 'userAgent.startsWith("Apache-HttpClient//")' + - 'userAgent.startsWith("Go-http-client/")' + - 'userAgent.startsWith("node-fetch/")' + - 'userAgent.startsWith("reqwest/")' + + is-suspicious-crawler: + - 'userAgent.contains("Presto/") || userAgent.contains("Trident/")' + # Old IE browsers + - 'userAgent.matches("MSIE ([2-9]|10|11)\\.")' + # Old Linux browsers + - 'userAgent.contains("Linux i[63]86") || userAgent.contains("FreeBSD i[63]86")' + # Old Windows browsers + - 'userAgent.matches("Windows (3|95|98|CE)") || userAgent.matches("Windows NT [1-5]\\.")' + # Old mobile browsers + - 'userAgent.matches("Android [1-5]\\.") || userAgent.matches("(iPad|iPhone) OS [1-9]_")' + # Old generic browsers + - 'userAgent.startsWith("Opera/")' + #- 'userAgent.matches("Gecko/(201[0-9]|200[0-9])")' + - 'userAgent.matches("^Mozilla/[1-4]")' + + +# Rules and conditions are served this environment +# remoteAddress (net.IP) - Connecting client remote address from headers or properties +# host (string) - HTTP Host +# method (string) - HTTP Method/Verb +# userAgent (string) - HTTP User-Agent header +# path (string) - HTTP request Path +# query (map[string]string) - HTTP request Query arguments +# headers (map[string]string) - HTTP request headers +# +# Additionally these functions are available +# inNetwork(networkName string, address net.IP) bool +# inNetwork(networkCIDR string, address net.IP) bool +rules: + - name: allow-well-known-resources + conditions: + - '($is-well-known-asset)' + action: pass + + - name: allow-static-resources + conditions: + - '($is-static-asset)' + action: pass + + - name: undesired-crawlers + conditions: + - '($is-headless-chromium)' + - 'userAgent.startsWith("Lightpanda/")' + - 'userAgent.startsWith("masscan/")' + # Typo'd opera botnet + - 'userAgent.matches("^Opera/[0-9.]+\\.\\(")' + # AI bullshit stuff, they do not respect robots.txt even while they read it + # TikTok Bytedance AI training + - 'userAgent.contains("Bytedance") || userAgent.contains("Bytespider")' + # Meta AI training; The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly. + - 'userAgent.contains("meta-externalagent/") || userAgent.contains("meta-externalfetcher/") || userAgent.contains("FacebookBot")' + # Anthropic AI training and usage + - 'userAgent.contains("ClaudeBot") || userAgent.contains("Claude-User")|| userAgent.contains("Claude-SearchBot")' + # Common Crawl AI crawlers + - 'userAgent.contains("CCBot")' + # ChatGPT AI crawlers https://platform.openai.com/docs/bots + - 'userAgent.contains("GPTBot") || userAgent.contains("OAI-SearchBot") || userAgent.contains("ChatGPT-User")' + # Other AI crawlers + - 'userAgent.contains("Amazonbot") || userAgent.contains("Google-Extended") || userAgent.contains("PanguBot") || userAgent.contains("AI2Bot") || userAgent.contains("Diffbot") || userAgent.contains("cohere-training-data-crawler") || userAgent.contains("Applebot-Extended")' + # SEO / Ads and marketing + - 'userAgent.contains("BLEXBot")' + action: deny + + - name: unknown-crawlers + conditions: + # No user agent set + - 'userAgent == ""' + action: deny + + # check a sequence of challenges + - name: suspicious-crawlers/0 + conditions: ['($is-suspicious-crawler)'] + action: check + challenges: [js-pow-sha256] + - name: suspicious-crawlers/1 + conditions: ['($is-suspicious-crawler)'] + action: check + challenges: [self-preload-link] + - name: suspicious-crawlers/2 + conditions: ['($is-suspicious-crawler)'] + action: check + challenges: [self-header-refresh] + - name: suspicious-crawlers/3 + conditions: ['($is-suspicious-crawler)'] + action: check + challenges: [self-resource-load] + + - name: desired-crawlers + conditions: + - 'userAgent.contains("+https://kagi.com/bot") && inNetwork("kagibot", remoteAddress)' + - '(userAgent.contains("+http://www.google.com/bot.html") || userAgent.contains("Google-InspectionTool") || userAgent.contains("Googlebot")) && inNetwork("googlebot", remoteAddress)' + - 'userAgent.contains("+http://www.bing.com/bingbot.htm") && inNetwork("bingbot", remoteAddress)' + - 'userAgent.contains("+http://duckduckgo.com/duckduckbot.html") && inNetwork("duckduckbot", remoteAddress)' + - 'userAgent.contains("+https://help.qwant.com/bot/") && inNetwork("qwantbot", remoteAddress)' + - 'userAgent.contains("+http://yandex.com/bots") && inNetwork("yandexbot", remoteAddress)' + action: pass + + - name: homesite + conditions: + - 'path == "/"' + action: pass + + # check DNSBL and serve harder challenges + - name: undesired-dnsbl + conditions: + - 'inDNSBL(remoteAddress)' + action: check + challenges: [js-pow-sha256] + + - name: suspicious-fetchers + action: check + challenges: [js-pow-sha256] + conditions: + - 'userAgent.contains("facebookexternalhit/") || userAgent.contains("facebookcatalog/")' + + # Allow PUT/DELETE/PATCH/POST requests in general + - name: non-get-request + action: pass + conditions: + - '!(method == "HEAD" || method == "GET")' + + + - name: standard-tools + action: challenge + challenges: [self-cookie] + conditions: + - '($is-generic-robot-ua)' + - '($is-tool-ua)' + - '!($is-generic-browser)' + + - name: standard-browser + action: challenge + challenges: [self-preload-link, self-meta-refresh, self-resource-load, js-pow-sha256] + conditions: + - '($is-generic-browser)'