diff --git a/Dockerfile b/Dockerfile index dccfcab..10b82e4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -39,6 +39,7 @@ ENV GOAWAY_BIND=":8080" ENV GOAWAY_BIND_NETWORK="tcp" ENV GOAWAY_SOCKET_MODE="0770" ENV GOAWAY_POLICY="/policy.yml" +ENV GOAWAY_POLICY_SNIPPETS="/policy/snippets" ENV GOAWAY_CHALLENGE_TEMPLATE="anubis" ENV GOAWAY_CHALLENGE_TEMPLATE_THEME="" ENV GOAWAY_SLOG_LEVEL="WARN" @@ -56,7 +57,8 @@ EXPOSE 8080/udp ENV JWT_PRIVATE_KEY_SEED="${GOAWAY_JWT_PRIVATE_KEY_SEED}" ENTRYPOINT /bin/go-away --bind "${GOAWAY_BIND}" --bind-network "${GOAWAY_BIND_NETWORK}" --socket-mode "${GOAWAY_SOCKET_MODE}" \ - --policy ${GOAWAY_POLICY} --client-ip-header "${GOAWAY_CLIENT_IP_HEADER}" --backend-ip-header "${GOAWAY_BACKEND_IP_HEADER}" \ + --policy "${GOAWAY_POLICY}" --policy-snippets "${GOAWAY_POLICY_SNIPPETS}" \ + --client-ip-header "${GOAWAY_CLIENT_IP_HEADER}" --backend-ip-header "${GOAWAY_BACKEND_IP_HEADER}" \ --cache "${GOAWAY_CACHE}" \ --dnsbl "${GOAWAY_DNSBL}" \ --challenge-template "${GOAWAY_CHALLENGE_TEMPLATE}" --challenge-template-theme "${GOAWAY_CHALLENGE_TEMPLATE_THEME}" \ diff --git a/README.md b/README.md index 37b9da6..8a4ceed 100644 --- a/README.md +++ b/README.md @@ -357,6 +357,7 @@ services: volumes: - "goaway_cache:/cache" - "./examples/forgejo.yml:/policy.yml:ro" + - "./examples/snippets/:/policy/snippets/:ro" environment: #GOAWAY_BIND: ":8080" # Supported tcp, unix, and proxy (for enabling PROXY module for request unwrapping) @@ -391,6 +392,8 @@ services: #GOAWAY_BACKEND_IP_HEADER: "" GOAWAY_POLICY: "/policy.yml" + + GOAWAY_POLICY_SNIPPETS: "/policy/snippets" # Template, and theme for the template to pick. defaults to an anubis-like one # An file path can be specified. 
See embed/templates for a few examples diff --git a/cmd/go-away/main.go b/cmd/go-away/main.go index 051fbc4..3bb3b02 100644 --- a/cmd/go-away/main.go +++ b/cmd/go-away/main.go @@ -1,6 +1,7 @@ package main import ( + "bytes" "context" "crypto/ed25519" "crypto/rand" @@ -12,13 +13,11 @@ import ( "git.gammaspectra.live/git/go-away/lib" "git.gammaspectra.live/git/go-away/lib/policy" "git.gammaspectra.live/git/go-away/utils" - "github.com/goccy/go-yaml" "github.com/pires/go-proxyproto" "golang.org/x/crypto/acme" "golang.org/x/crypto/acme/autocert" "log" "log/slog" - "maps" "net" "net/http" "os" @@ -137,6 +136,7 @@ func main() { cachePath := flag.String("cache", path.Join(os.TempDir(), "go_away_cache"), "path to temporary cache directory") policyFile := flag.String("policy", "", "path to policy YAML file") + policySnippets := flag.String("policy-snippets", "", "path to YAML snippets folder") challengeTemplate := flag.String("challenge-template", "anubis", "name or path of the challenge template to use (anubis, forgejo)") challengeTemplateTheme := flag.String("challenge-template-theme", "", "name of the challenge template theme to use (forgejo => [forgejo-auto, forgejo-dark, forgejo-light, gitea...])") @@ -203,17 +203,14 @@ func main() { log.Fatal(fmt.Errorf("failed to read policy file: %w", err)) } - var p policy.Policy - - if err = yaml.Unmarshal(policyData, &p); err != nil { + p, err := policy.NewPolicy(bytes.NewReader(policyData), *policySnippets) + if err != nil { log.Fatal(fmt.Errorf("failed to parse policy file: %w", err)) } createdBackends := make(map[string]http.Handler) parsedBackends := make(map[string]string) - //TODO: deprecate - maps.Copy(parsedBackends, p.Backends) for _, backend := range backends { parts := strings.Split(backend, "=") if len(parts) != 2 { @@ -232,6 +229,10 @@ func main() { createdBackends[k] = backend } + if len(createdBackends) == 0 { + log.Fatal(fmt.Errorf("no backends defined in policy file")) + } + if *cachePath != "" { err = 
os.MkdirAll(*cachePath, 0755) if err != nil { @@ -328,7 +329,7 @@ func main() { ChallengeResponseCode: http.StatusTeapot, } - state, err := lib.NewState(p, settings) + state, err := lib.NewState(*p, settings) if err != nil { log.Fatal(fmt.Errorf("failed to create state: %w", err)) diff --git a/examples/forgejo.yml b/examples/forgejo.yml index eb2d46b..a200d1e 100644 --- a/examples/forgejo.yml +++ b/examples/forgejo.yml @@ -1,12 +1,12 @@ # Example cmdline (forward requests from upstream to port :8080) -# $ go-away --bind :8080 --backend git.example.com=http://forgejo:3000 --policy examples/forgejo.yml --challenge-template forgejo --challenge-template-theme forgejo-auto +# $ go-away --bind :8080 --backend git.example.com=http://forgejo:3000 --policy examples/forgejo.yml --policy-snippets examples/snippets/ --challenge-template forgejo --challenge-template-theme forgejo-auto # Define networks to be used later below networks: - # todo: support direct ASN lookups - # todo: cache these values + # Networks will get included from snippets + huawei-cloud: # AS136907 - url: https://raw.githubusercontent.com/ipverse/asn-ip/refs/heads/master/as/136907/aggregated.json @@ -19,123 +19,10 @@ networks: # AS21859 - url: https://raw.githubusercontent.com/ipverse/asn-ip/refs/heads/master/as/21859/aggregated.json jq-path: '.subnets.ipv4[], .subnets.ipv6[]' -# aws-cloud: -# - url: https://ip-ranges.amazonaws.com/ip-ranges.json -# jq-path: '(.prefixes[] | select(has("ip_prefix")) | .ip_prefix), (.prefixes[] | select(has("ipv6_prefix")) | .ipv6_prefix)' -# google-cloud: -# - url: https://www.gstatic.com/ipranges/cloud.json -# jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)' -# oracle-cloud: -# - url: https://docs.oracle.com/en-us/iaas/tools/public_ip_ranges.json -# jq-path: '.regions[] | .cidrs[] | .cidr' -# azure-cloud: -# # todo: https://www.microsoft.com/en-us/download/details.aspx?id=56519 does not provide 
direct JSON -# - url: https://raw.githubusercontent.com/femueller/cloud-ip-ranges/refs/heads/master/microsoft-azure-ip-ranges.json -# jq-path: '.values[] | .properties.addressPrefixes[]' -# -# digitalocean: -# - url: https://www.digitalocean.com/geo/google.csv -# regex: "(?P(([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)|([0-9a-f:]+::))/[0-9]+)," -# linode: -# - url: https://geoip.linode.com/ -# regex: "(?P(([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)|([0-9a-f:]+::))/[0-9]+)," -# vultr: -# - url: "https://geofeed.constant.com/?json" -# jq-path: '.subnets[] | .ip_prefix' -# cloudflare: -# - url: https://www.cloudflare.com/ips-v4 -# regex: "(?P[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+/[0-9]+)" -# - url: https://www.cloudflare.com/ips-v6 -# regex: "(?P[0-9a-f:]+::/[0-9]+)" -# -# icloud-private-relay: -# - url: https://mask-api.icloud.com/egress-ip-ranges.csv -# regex: "(?P(([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)|([0-9a-f:]+::))/[0-9]+)," -# tunnelbroker-relay: -# # HE Tunnelbroker -# - url: https://tunnelbroker.net/export/google -# regex: "(?P([0-9a-f:]+::)/[0-9]+)," - - - googlebot: - - url: https://developers.google.com/static/search/apis/ipranges/googlebot.json - jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)' - bingbot: - - url: https://www.bing.com/toolbox/bingbot.json - jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)' - qwantbot: - - url: https://help.qwant.com/wp-content/uploads/sites/2/2025/01/qwantbot.json - jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)' - duckduckbot: - - url: https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot - regex: "
  • (?P[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)
  • " - yandexbot: - # todo: detected as bot - # - url: https://yandex.com/ips - # regex: "(?P(([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)|([0-9a-f:]+::))/[0-9]+)[ \\\\t]*
    " - - prefixes: - - "5.45.192.0/18" - - "5.255.192.0/18" - - "37.9.64.0/18" - - "37.140.128.0/18" - - "77.88.0.0/18" - - "84.252.160.0/19" - - "87.250.224.0/19" - - "90.156.176.0/22" - - "93.158.128.0/18" - - "95.108.128.0/17" - - "141.8.128.0/18" - - "178.154.128.0/18" - - "185.32.187.0/24" - - "2a02:6b8::/29" - kagibot: - - url: https://kagi.com/bot - regex: "\\n(?P[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+) " challenges: - js-pow-sha256: - runtime: js - parameters: - # specifies the folder path that assets are under - # can be either embedded or external path - # defaults to name of challenge - path: "js-pow-sha256" - # needs to be under static folder - js-loader: load.mjs - # needs to be under runtime folder - wasm-runtime: runtime.wasm - wasm-runtime-settings: - difficulty: 20 - verify-probability: 0.02 - - # Challenges with a cookie, self redirect (non-JS, requires HTTP parsing) - self-cookie: - runtime: "cookie" - - # Challenges with a redirect via Link header with rel=preload and early hints (non-JS, requires HTTP parsing, fetching and logic) - # Works on HTTP/2 and above! 
- self-preload-link: - condition: '"Sec-Fetch-Mode" in headers && headers["Sec-Fetch-Mode"] == "navigate"' - runtime: "preload-link" - parameters: - preload-early-hint-deadline: 3s - - # Challenges with a redirect via Refresh header (non-JS, requires HTTP parsing and logic) - self-header-refresh: - runtime: "refresh" - parameters: - refresh-via: "header" - - # Challenges with a redirect via Refresh meta (non-JS, requires HTML parsing and logic) - self-meta-refresh: - runtime: "refresh" - parameters: - refresh-via: "meta" - - # Challenges with loading a random CSS or image document (non-JS, requires HTML parsing and logic) - self-resource-load: - runtime: "resource-load" + # Challenges will get included from snippets # Verifies the existence of a cookie and confirms it against some backend request, passing the entire client cookie contents http-cookie-check: @@ -149,29 +36,12 @@ challenges: http-code: 200 verify-probability: 0.1 - dnsbl: - runtime: dnsbl - parameters: - dnsbl-decay: 1h - dnsbl-timeout: 1s - conditions: # Conditions will get replaced on rules AST when found as ($condition-name) - # Checks to detect a headless chromium via headers only - is-headless-chromium: - - 'userAgent.contains("HeadlessChrome") || userAgent.contains("HeadlessChromium")' - - '"Sec-Ch-Ua" in headers && (headers["Sec-Ch-Ua"].contains("HeadlessChrome") || headers["Sec-Ch-Ua"].contains("HeadlessChromium"))' - #- '(userAgent.contains("Chrome/") || userAgent.contains("Chromium/")) && (!("Accept-Language" in headers) || !("Accept-Encoding" in headers))' - is-generic-browser: - - 'userAgent.startsWith("Mozilla/") || userAgent.startsWith("Opera/")' - - is-well-known-asset: - - 'path == "/robots.txt"' - - 'path.startsWith("/.well-known")' + # Conditions will get included from snippets is-static-asset: - - 'path == "/favicon.ico"' - 'path == "/apple-touch-icon.png"' - 'path == "/apple-touch-icon-precomposed.png"' - 'path.startsWith("/assets/")' @@ -181,39 +51,9 @@ conditions: - 
'path.startsWith("/user/avatar/")' - 'path.startsWith("/attachments/")' - is-git-ua: - - 'userAgent.startsWith("git/") || userAgent.contains("libgit")' - - 'userAgent.startsWith("go-git")' - - 'userAgent.startsWith("JGit/") || userAgent.startsWith("JGit-")' - # Golang proxy and initial fetch - - 'userAgent.startsWith("GoModuleMirror/")' - - 'userAgent.startsWith("Go-http-client/") && "go-get" in query && query["go-get"] == "1"' - - '"Git-Protocol" in headers && headers["Git-Protocol"] == "version=2"' is-git-path: - 'path.matches("^/[^/]+/[^/]+/(git-upload-pack|git-receive-pack|HEAD|info/refs|info/lfs|objects)")' - is-generic-robot-ua: - - 'userAgent.matches("compatible[;)]") && !userAgent.contains("Trident/")' - - 'userAgent.matches("\\+https?://")' - - 'userAgent.contains("@")' - - 'userAgent.matches("[bB]ot/[0-9]")' - - is-tool-ua: - - 'userAgent.startsWith("python-requests/")' - - 'userAgent.startsWith("Python-urllib/")' - - 'userAgent.startsWith("python-httpx/")' - - 'userAgent.contains("aoihttp/")' - - 'userAgent.startsWith("http.rb/")' - - 'userAgent.startsWith("curl/")' - - 'userAgent.startsWith("Wget/")' - - 'userAgent.startsWith("libcurl/")' - - 'userAgent.startsWith("okhttp/")' - - 'userAgent.startsWith("Java/")' - - 'userAgent.startsWith("Apache-HttpClient//")' - - 'userAgent.startsWith("Go-http-client/")' - - 'userAgent.startsWith("node-fetch/")' - - 'userAgent.startsWith("reqwest/")' - is-suspicious-crawler: # TLS Fingerprint for specific agent without ALPN - '(userAgent.startsWith("Mozilla/") || userAgent.startsWith("Opera/")) && ("ja4" in fp && fp.ja4.matches("^t[0-9a-z]+00_"))' @@ -305,11 +145,11 @@ rules: - name: 1 action: check settings: - challenges: [self-preload-link, self-resource-load] + challenges: [preload-link, resource-load] - name: 2 action: check settings: - challenges: [self-header-refresh] + challenges: [header-refresh] - name: always-pow-challenge conditions: @@ -388,12 +228,12 @@ rules: - name: desired-crawlers conditions: - - 
'userAgent.contains("+https://kagi.com/bot") && remoteAddress.network("kagibot")' - - '(userAgent.contains("+http://www.google.com/bot.html") || userAgent.contains("Google-PageRenderer") || userAgent.contains("Google-InspectionTool") || userAgent.contains("Googlebot")) && remoteAddress.network("googlebot")' - - 'userAgent.contains("+http://www.bing.com/bingbot.htm") && remoteAddress.network("bingbot")' - - 'userAgent.contains("+http://duckduckgo.com/duckduckbot.html") && remoteAddress.network("duckduckbot")' - - 'userAgent.contains("+https://help.qwant.com/bot/") && remoteAddress.network("qwantbot")' - - 'userAgent.contains("+http://yandex.com/bots") && remoteAddress.network("yandexbot")' + - *is-bot-googlebot + - *is-bot-bingbot + - *is-bot-duckduckbot + - *is-bot-kagibot + - *is-bot-qwantbot + - *is-bot-yandexbot action: pass # check a sequence of challenges @@ -404,16 +244,16 @@ rules: - name: 0 action: check settings: - challenges: [self-preload-link, self-header-refresh, js-pow-sha256, http-cookie-check] + challenges: [preload-link, header-refresh, js-pow-sha256, http-cookie-check] - name: 1 action: check settings: - challenges: [ self-resource-load, js-pow-sha256, http-cookie-check ] + challenges: [ resource-load, js-pow-sha256, http-cookie-check ] - name: standard-bots action: check settings: - challenges: [self-meta-refresh, self-resource-load] + challenges: [meta-refresh, resource-load] conditions: - '($is-generic-robot-ua)' @@ -454,14 +294,14 @@ rules: - name: plaintext-browser action: challenge settings: - challenges: [http-cookie-check, self-meta-refresh, self-cookie] + challenges: [http-cookie-check, meta-refresh, cookie] conditions: - 'userAgent.startsWith("Lynx/")' - name: standard-tools action: challenge settings: - challenges: [self-cookie] + challenges: [cookie] conditions: - '($is-tool-ua)' - '!($is-generic-browser)' @@ -469,6 +309,6 @@ rules: - name: standard-browser action: challenge settings: - challenges: [http-cookie-check, 
self-preload-link, self-meta-refresh, self-resource-load, js-pow-sha256] + challenges: [http-cookie-check, preload-link, meta-refresh, resource-load, js-pow-sha256] conditions: - '($is-generic-browser)' diff --git a/examples/generic.yml b/examples/generic.yml index cf3a5c0..973b3db 100644 --- a/examples/generic.yml +++ b/examples/generic.yml @@ -1,138 +1,27 @@ # Example cmdline (forward requests from upstream to port :8080) -# $ go-away --bind :8080 --backend site.example.com=http://site:3000 --policy examples/generic.yml --challenge-template anubis +# $ go-away --bind :8080 --backend site.example.com=http://site:3000 --policy examples/generic.yml --policy-snippets examples/snippets/ --challenge-template anubis # Define networks to be used later below networks: - - googlebot: - - url: https://developers.google.com/static/search/apis/ipranges/googlebot.json - jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)' - bingbot: - - url: https://www.bing.com/toolbox/bingbot.json - jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)' - qwantbot: - - url: https://help.qwant.com/wp-content/uploads/sites/2/2025/01/qwantbot.json - jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)' - duckduckbot: - - url: https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot - regex: "
  • (?P[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)
  • " - yandexbot: - # todo: detected as bot - # - url: https://yandex.com/ips - # regex: "(?P(([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)|([0-9a-f:]+::))/[0-9]+)[ \\\\t]*
    " - - prefixes: - - "5.45.192.0/18" - - "5.255.192.0/18" - - "37.9.64.0/18" - - "37.140.128.0/18" - - "77.88.0.0/18" - - "84.252.160.0/19" - - "87.250.224.0/19" - - "90.156.176.0/22" - - "93.158.128.0/18" - - "95.108.128.0/17" - - "141.8.128.0/18" - - "178.154.128.0/18" - - "185.32.187.0/24" - - "2a02:6b8::/29" - kagibot: - - url: https://kagi.com/bot - regex: "\\n(?P[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+) " + # Networks will get included from snippets challenges: - js-pow-sha256: - runtime: js - parameters: - # needs to be under static folder - js-loader: load.mjs - # needs to be under runtime folder - wasm-runtime: runtime.wasm - wasm-runtime-settings: - difficulty: 15 - verify-probability: 0.02 - - # Challenges with a cookie, self redirect (non-JS, requires HTTP parsing) - self-cookie: - runtime: "cookie" - - - # Challenges with a redirect via Link header with rel=preload and early hints (non-JS, requires HTTP parsing, fetching and logic) - # Works on HTTP/2 and above! - self-preload-link: - condition: '"Sec-Fetch-Mode" in headers && headers["Sec-Fetch-Mode"] == "navigate"' - runtime: "preload-link" - parameters: - preload-early-hint-deadline: 3s - - # Challenges with a redirect via Refresh header (non-JS, requires HTTP parsing and logic) - self-header-refresh: - runtime: "refresh" - parameters: - refresh-via: "header" - - # Challenges with a redirect via Refresh meta (non-JS, requires HTML parsing and logic) - self-meta-refresh: - runtime: "refresh" - parameters: - refresh-via: "meta" - - # Challenges with loading a random CSS or image document (non-JS, requires HTML parsing and logic) - self-resource-load: - runtime: "resource-load" - - dnsbl: - runtime: dnsbl - parameters: - dnsbl-decay: 1h - dnsbl-timeout: 1s - + # Challenges will get included from snippets + conditions: # Conditions will get replaced on rules AST when found as ($condition-name) - # Checks to detect a headless chromium via headers only - is-headless-chromium: - - 
'userAgent.contains("HeadlessChrome") || userAgent.contains("HeadlessChromium")' - - '"Sec-Ch-Ua" in headers && (headers["Sec-Ch-Ua"].contains("HeadlessChrome") || headers["Sec-Ch-Ua"].contains("HeadlessChromium"))' - #- '(userAgent.contains("Chrome/") || userAgent.contains("Chromium/")) && (!("Accept-Language" in headers) || !("Accept-Encoding" in headers))' - is-generic-browser: - - 'userAgent.startsWith("Mozilla/") || userAgent.startsWith("Opera/")' + # Conditions will get included from snippets - is-well-known-asset: - - 'path == "/robots.txt"' - - 'path.startsWith("/.well-known")' is-static-asset: - - 'path == "/favicon.ico"' - 'path == "/apple-touch-icon.png"' - 'path == "/apple-touch-icon-precomposed.png"' - 'path.matches("\\.(manifest|ttf|woff|woff2|jpg|jpeg|gif|png|webp|avif|svg|mp4|webm|css|js|mjs|wasm)$")' - - is-generic-robot-ua: - - 'userAgent.matches("compatible[;)]") && !userAgent.contains("Trident/")' - - 'userAgent.matches("\\+https?://")' - - 'userAgent.contains("@")' - - 'userAgent.matches("[bB]ot/[0-9]")' - - is-tool-ua: - - 'userAgent.startsWith("python-requests/")' - - 'userAgent.startsWith("Python-urllib/")' - - 'userAgent.startsWith("python-httpx/")' - - 'userAgent.contains("aoihttp/")' - - 'userAgent.startsWith("http.rb/")' - - 'userAgent.startsWith("curl/")' - - 'userAgent.startsWith("Wget/")' - - 'userAgent.startsWith("libcurl/")' - - 'userAgent.startsWith("okhttp/")' - - 'userAgent.startsWith("Java/")' - - 'userAgent.startsWith("Apache-HttpClient//")' - - 'userAgent.startsWith("Go-http-client/")' - - 'userAgent.startsWith("node-fetch/")' - - 'userAgent.startsWith("reqwest/")' - is-suspicious-crawler: - 'userAgent.contains("Presto/") || userAgent.contains("Trident/")' # Old IE browsers @@ -203,20 +92,20 @@ rules: - name: 1 action: check settings: - challenges: [self-preload-link, self-resource-load] + challenges: [preload-link, resource-load] - name: 2 action: check settings: - challenges: [self-header-refresh] + challenges: 
[header-refresh] - name: desired-crawlers conditions: - - 'userAgent.contains("+https://kagi.com/bot") && remoteAddress.network("kagibot")' - - '(userAgent.contains("+http://www.google.com/bot.html") || userAgent.contains("Google-PageRenderer") || userAgent.contains("Google-InspectionTool") || userAgent.contains("Googlebot")) && remoteAddress.network("googlebot")' - - 'userAgent.contains("+http://www.bing.com/bingbot.htm") && remoteAddress.network("bingbot")' - - 'userAgent.contains("+http://duckduckgo.com/duckduckbot.html") && remoteAddress.network("duckduckbot")' - - 'userAgent.contains("+https://help.qwant.com/bot/") && remoteAddress.network("qwantbot")' - - 'userAgent.contains("+http://yandex.com/bots") && remoteAddress.network("yandexbot")' + - *is-bot-googlebot + - *is-bot-bingbot + - *is-bot-duckduckbot + - *is-bot-kagibot + - *is-bot-qwantbot + - *is-bot-yandexbot action: pass - name: homesite @@ -251,14 +140,14 @@ rules: - name: plaintext-browser action: challenge settings: - challenges: [self-meta-refresh, self-cookie] + challenges: [meta-refresh, cookie] conditions: - 'userAgent.startsWith("Lynx/")' - name: standard-tools action: challenge settings: - challenges: [self-cookie] + challenges: [cookie] conditions: - '($is-generic-robot-ua)' - '($is-tool-ua)' @@ -267,6 +156,6 @@ rules: - name: standard-browser action: challenge settings: - challenges: [self-preload-link, self-meta-refresh, self-resource-load, js-pow-sha256] + challenges: [preload-link, meta-refresh, resource-load, js-pow-sha256] conditions: - '($is-generic-browser)' diff --git a/examples/snippets/bot-bingbot.yml b/examples/snippets/bot-bingbot.yml new file mode 100644 index 0000000..98fb3db --- /dev/null +++ b/examples/snippets/bot-bingbot.yml @@ -0,0 +1,8 @@ +networks: + bingbot: + - url: https://www.bing.com/toolbox/bingbot.json + jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)' + +conditions: + is-bot-bingbot: + - 
&is-bot-bingbot 'userAgent.contains("+http://www.bing.com/bingbot.htm") && remoteAddress.network("bingbot")' \ No newline at end of file diff --git a/examples/snippets/bot-duckduckbot.yml b/examples/snippets/bot-duckduckbot.yml new file mode 100644 index 0000000..b5f45e1 --- /dev/null +++ b/examples/snippets/bot-duckduckbot.yml @@ -0,0 +1,8 @@ +networks: + duckduckbot: + - url: https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot + regex: "
  • (?P[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)
  • " + +conditions: + is-bot-duckduckbot: + - &is-bot-duckduckbot 'userAgent.contains("+http://duckduckgo.com/duckduckbot.html") && remoteAddress.network("duckduckbot")' \ No newline at end of file diff --git a/examples/snippets/bot-googlebot.yml b/examples/snippets/bot-googlebot.yml new file mode 100644 index 0000000..da26884 --- /dev/null +++ b/examples/snippets/bot-googlebot.yml @@ -0,0 +1,8 @@ +networks: + googlebot: + - url: https://developers.google.com/static/search/apis/ipranges/googlebot.json + jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)' + +conditions: + is-bot-googlebot: + - &is-bot-googlebot '(userAgent.contains("+http://www.google.com/bot.html") || userAgent.contains("Google-PageRenderer") || userAgent.contains("Google-InspectionTool") || userAgent.contains("Googlebot")) && remoteAddress.network("googlebot")' \ No newline at end of file diff --git a/examples/snippets/bot-kagibot.yml b/examples/snippets/bot-kagibot.yml new file mode 100644 index 0000000..8edb3c3 --- /dev/null +++ b/examples/snippets/bot-kagibot.yml @@ -0,0 +1,8 @@ +networks: + kagibot: + - url: https://kagi.com/bot + regex: "\\n(?P[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+) " + +conditions: + is-bot-kagibot: + - &is-bot-kagibot 'userAgent.contains("+https://kagi.com/bot") && remoteAddress.network("kagibot")' \ No newline at end of file diff --git a/examples/snippets/bot-qwantbot.yml b/examples/snippets/bot-qwantbot.yml new file mode 100644 index 0000000..095e1be --- /dev/null +++ b/examples/snippets/bot-qwantbot.yml @@ -0,0 +1,8 @@ +networks: + qwantbot: + - url: https://help.qwant.com/wp-content/uploads/sites/2/2025/01/qwantbot.json + jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)' + +conditions: + is-bot-qwantbot: + - &is-bot-qwantbot 'userAgent.contains("+https://help.qwant.com/bot/") && remoteAddress.network("qwantbot")' \ No newline at 
end of file diff --git a/examples/snippets/bot-yandexbot.yml b/examples/snippets/bot-yandexbot.yml new file mode 100644 index 0000000..981ee42 --- /dev/null +++ b/examples/snippets/bot-yandexbot.yml @@ -0,0 +1,24 @@ +networks: + yandexbot: + # todo: detected as bot + # - url: https://yandex.com/ips + # regex: "(?P(([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)|([0-9a-f:]+::))/[0-9]+)[ \\\\t]*
    " + - prefixes: + - "5.45.192.0/18" + - "5.255.192.0/18" + - "37.9.64.0/18" + - "37.140.128.0/18" + - "77.88.0.0/18" + - "84.252.160.0/19" + - "87.250.224.0/19" + - "90.156.176.0/22" + - "93.158.128.0/18" + - "95.108.128.0/17" + - "141.8.128.0/18" + - "178.154.128.0/18" + - "185.32.187.0/24" + - "2a02:6b8::/29" + +conditions: + is-bot-yandexbot: + - &is-bot-yandexbot 'userAgent.contains("+http://yandex.com/bots") && remoteAddress.network("yandexbot")' \ No newline at end of file diff --git a/examples/snippets/challenge-dnsbl.yml b/examples/snippets/challenge-dnsbl.yml new file mode 100644 index 0000000..893116c --- /dev/null +++ b/examples/snippets/challenge-dnsbl.yml @@ -0,0 +1,6 @@ +challenges: + dnsbl: + runtime: dnsbl + parameters: + dnsbl-decay: 1h + dnsbl-timeout: 1s \ No newline at end of file diff --git a/examples/snippets/challenge-js-pow-sha256.yml b/examples/snippets/challenge-js-pow-sha256.yml new file mode 100644 index 0000000..1e98d0f --- /dev/null +++ b/examples/snippets/challenge-js-pow-sha256.yml @@ -0,0 +1,15 @@ +challenges: + js-pow-sha256: + runtime: js + parameters: + # specifies the folder path that assets are under + # can be either embedded or external path + # defaults to name of challenge + path: "js-pow-sha256" + # needs to be under static folder + js-loader: load.mjs + # needs to be under runtime folder + wasm-runtime: runtime.wasm + wasm-runtime-settings: + difficulty: 20 + verify-probability: 0.02 \ No newline at end of file diff --git a/examples/snippets/challenges-non-js.yml b/examples/snippets/challenges-non-js.yml new file mode 100644 index 0000000..ae94056 --- /dev/null +++ b/examples/snippets/challenges-non-js.yml @@ -0,0 +1,28 @@ +challenges: + # Challenges with a cookie, self redirect (non-JS, requires HTTP parsing) + cookie: + runtime: "cookie" + + # Challenges with a redirect via Link header with rel=preload and early hints (non-JS, requires HTTP parsing, fetching and logic) + # Works on HTTP/2 and above! 
+ preload-link: + condition: '"Sec-Fetch-Mode" in headers && headers["Sec-Fetch-Mode"] == "navigate"' + runtime: "preload-link" + parameters: + preload-early-hint-deadline: 3s + + # Challenges with a redirect via Refresh header (non-JS, requires HTTP parsing and logic) + header-refresh: + runtime: "refresh" + parameters: + refresh-via: "header" + + # Challenges with a redirect via Refresh meta (non-JS, requires HTML parsing and logic) + meta-refresh: + runtime: "refresh" + parameters: + refresh-via: "meta" + + # Challenges with loading a random CSS or image document (non-JS, requires HTML parsing and logic) + resource-load: + runtime: "resource-load" \ No newline at end of file diff --git a/examples/snippets/conditions-generic.yml b/examples/snippets/conditions-generic.yml new file mode 100644 index 0000000..0c93d75 --- /dev/null +++ b/examples/snippets/conditions-generic.yml @@ -0,0 +1,45 @@ +conditions: + is-well-known-asset: + - 'path == "/robots.txt"' + - 'path == "/favicon.ico"' + - 'path.startsWith("/.well-known")' + + is-git-ua: + - 'userAgent.startsWith("git/") || userAgent.contains("libgit")' + - 'userAgent.startsWith("go-git")' + - 'userAgent.startsWith("JGit/") || userAgent.startsWith("JGit-")' + # Golang proxy and initial fetch + - 'userAgent.startsWith("GoModuleMirror/")' + - 'userAgent.startsWith("Go-http-client/") && "go-get" in query && query["go-get"] == "1"' + - '"Git-Protocol" in headers && headers["Git-Protocol"] == "version=2"' + + is-generic-browser: + - 'userAgent.startsWith("Mozilla/") || userAgent.startsWith("Opera/")' + + is-generic-robot-ua: + - 'userAgent.matches("compatible[;)]") && !userAgent.contains("Trident/")' + - 'userAgent.matches("\\+https?://")' + - 'userAgent.contains("@")' + - 'userAgent.matches("[bB]ot/[0-9]")' + + is-tool-ua: + - 'userAgent.startsWith("python-requests/")' + - 'userAgent.startsWith("Python-urllib/")' + - 'userAgent.startsWith("python-httpx/")' + - 'userAgent.contains("aoihttp/")' + - 
'userAgent.startsWith("http.rb/")' + - 'userAgent.startsWith("curl/")' + - 'userAgent.startsWith("Wget/")' + - 'userAgent.startsWith("libcurl/")' + - 'userAgent.startsWith("okhttp/")' + - 'userAgent.startsWith("Java/")' + - 'userAgent.startsWith("Apache-HttpClient//")' + - 'userAgent.startsWith("Go-http-client/")' + - 'userAgent.startsWith("node-fetch/")' + - 'userAgent.startsWith("reqwest/")' + + # Checks to detect a headless chromium via headers only + is-headless-chromium: + - 'userAgent.contains("HeadlessChrome") || userAgent.contains("HeadlessChromium")' + - '"Sec-Ch-Ua" in headers && (headers["Sec-Ch-Ua"].contains("HeadlessChrome") || headers["Sec-Ch-Ua"].contains("HeadlessChromium"))' + #- '(userAgent.contains("Chrome/") || userAgent.contains("Chromium/")) && (!("Accept-Language" in headers) || !("Accept-Encoding" in headers))' \ No newline at end of file diff --git a/examples/snippets/networks-other.yml b/examples/snippets/networks-other.yml new file mode 100644 index 0000000..3952e1f --- /dev/null +++ b/examples/snippets/networks-other.yml @@ -0,0 +1,37 @@ +networks: +# aws-cloud: +# - url: https://ip-ranges.amazonaws.com/ip-ranges.json +# jq-path: '(.prefixes[] | select(has("ip_prefix")) | .ip_prefix), (.prefixes[] | select(has("ipv6_prefix")) | .ipv6_prefix)' +# google-cloud: +# - url: https://www.gstatic.com/ipranges/cloud.json +# jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)' +# oracle-cloud: +# - url: https://docs.oracle.com/en-us/iaas/tools/public_ip_ranges.json +# jq-path: '.regions[] | .cidrs[] | .cidr' +# azure-cloud: +# # todo: https://www.microsoft.com/en-us/download/details.aspx?id=56519 does not provide direct JSON +# - url: https://raw.githubusercontent.com/femueller/cloud-ip-ranges/refs/heads/master/microsoft-azure-ip-ranges.json +# jq-path: '.values[] | .properties.addressPrefixes[]' +# +# digitalocean: +# - url: https://www.digitalocean.com/geo/google.csv +# 
regex: "(?P(([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)|([0-9a-f:]+::))/[0-9]+)," +# linode: +# - url: https://geoip.linode.com/ +# regex: "(?P(([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)|([0-9a-f:]+::))/[0-9]+)," +# vultr: +# - url: "https://geofeed.constant.com/?json" +# jq-path: '.subnets[] | .ip_prefix' +# cloudflare: +# - url: https://www.cloudflare.com/ips-v4 +# regex: "(?P[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+/[0-9]+)" +# - url: https://www.cloudflare.com/ips-v6 +# regex: "(?P[0-9a-f:]+::/[0-9]+)" +# +# icloud-private-relay: +# - url: https://mask-api.icloud.com/egress-ip-ranges.csv +# regex: "(?P(([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)|([0-9a-f:]+::))/[0-9]+)," +# tunnelbroker-relay: +# # HE Tunnelbroker +# - url: https://tunnelbroker.net/export/google +# regex: "(?P([0-9a-f:]+::)/[0-9]+)," diff --git a/lib/policy/network.go b/lib/policy/network.go index e287870..d741320 100644 --- a/lib/policy/network.go +++ b/lib/policy/network.go @@ -115,3 +115,30 @@ func (n Network) FetchPrefixes(c *http.Client) (output []net.IPNet, err error) { } return output, nil } + +func parseCIDROrIP(value string) (net.IPNet, error) { + _, ipNet, err := net.ParseCIDR(value) + if err != nil { + ip := net.ParseIP(value) + if ip == nil { + return net.IPNet{}, fmt.Errorf("failed to parse CIDR: %s", err) + } + + if ip4 := ip.To4(); ip4 != nil { + return net.IPNet{ + IP: ip4, + // single ip + Mask: net.CIDRMask(len(ip4)*8, len(ip4)*8), + }, nil + } + return net.IPNet{ + IP: ip, + // single ip + Mask: net.CIDRMask(len(ip)*8, len(ip)*8), + }, nil + } else if ipNet != nil { + return *ipNet, nil + } else { + return net.IPNet{}, errors.New("invalid CIDR") + } +} diff --git a/lib/policy/policy.go b/lib/policy/policy.go index 33eca0b..f60a6eb 100644 --- a/lib/policy/policy.go +++ b/lib/policy/policy.go @@ -1,38 +1,13 @@ package policy import ( - "errors" - "fmt" - "net" + "bytes" + "github.com/goccy/go-yaml" + "io" + "os" + "path" ) -func parseCIDROrIP(value string) (net.IPNet, error) { - _, ipNet, err := 
net.ParseCIDR(value) - if err != nil { - ip := net.ParseIP(value) - if ip == nil { - return net.IPNet{}, fmt.Errorf("failed to parse CIDR: %s", err) - } - - if ip4 := ip.To4(); ip4 != nil { - return net.IPNet{ - IP: ip4, - // single ip - Mask: net.CIDRMask(len(ip4)*8, len(ip4)*8), - }, nil - } - return net.IPNet{ - IP: ip, - // single ip - Mask: net.CIDRMask(len(ip)*8, len(ip)*8), - }, nil - } else if ipNet != nil { - return *ipNet, nil - } else { - return net.IPNet{}, errors.New("invalid CIDR") - } -} - type Policy struct { // Networks map of networks and prefixes to be loaded @@ -43,8 +18,70 @@ type Policy struct { Challenges map[string]Challenge `yaml:"challenges"` Rules []Rule `yaml:"rules"` - - // Backends - // Deprecated - Backends map[string]string `json:"backends"` +} + +func NewPolicy(r io.Reader, snippetsDirectory string) (*Policy, error) { + var p Policy + p.Networks = make(map[string][]Network) + p.Conditions = make(map[string][]string) + p.Challenges = make(map[string]Challenge) + + if snippetsDirectory == "" { + err := yaml.NewDecoder(r).Decode(&p) + if err != nil { + return nil, err + } + } else { + err := yaml.NewDecoder(r, yaml.ReferenceDirs(snippetsDirectory)).Decode(&p) + if err != nil { + return nil, err + } + + // add specific entries from snippets + entries, err := os.ReadDir(snippetsDirectory) + if err != nil { + return nil, err + } + for _, entry := range entries { + var entryPolicy Policy + if !entry.IsDir() { + entryData, err := os.ReadFile(path.Join(snippetsDirectory, entry.Name())) + if err != nil { + return nil, err + } + err = yaml.NewDecoder(bytes.NewReader(entryData), yaml.ReferenceDirs(snippetsDirectory)).Decode(&entryPolicy) + if err != nil { + return nil, err + } + + // add networks / conditions / challenges definitions if they don't exist already + + for k, v := range entryPolicy.Networks { + // add network if policy entry does not exist + _, ok := p.Networks[k] + if !ok { + p.Networks[k] = v + } + } + + for k, v := range 
entryPolicy.Conditions { + // add condition if policy entry does not exist + _, ok := p.Conditions[k] + if !ok { + p.Conditions[k] = v + } + } + + for k, v := range entryPolicy.Challenges { + // add challenge if policy entry does not exist + _, ok := p.Challenges[k] + if !ok { + p.Challenges[k] = v + } + } + + } + } + } + return &p, nil }