Files
go-away/examples/generic.yml
Alan Orth c16f0863ae examples/generic.yml: use path.matches in condition
The string here uses a character set with path.contains, which will
not work in CEL. We need to use path.matches to use regex syntax.
2025-05-17 23:50:36 +03:00

187 lines
6.0 KiB
YAML

# Example cmdline (forward requests from upstream to port :8080)
# $ go-away --bind :8080 --backend site.example.com=http://site:3000 --policy examples/generic.yml --policy-snippets examples/snippets/ --challenge-template anubis
# Define networks to be used later below.
networks:
  # Nothing is declared inline here: networks will get included from snippets.
challenges:
  # Nothing is declared inline here: challenges will get included from snippets.
conditions:
  # A condition defined here is substituted into the AST of any rule that
  # references it as ($condition-name).
  # Further conditions will get included from snippets.

  # Static files: the two Apple touch icon paths, plus any path ending in one
  # of the listed manifest/font/image/video/stylesheet/script/wasm extensions.
  is-static-asset:
  - 'path == "/apple-touch-icon.png"'
  - 'path == "/apple-touch-icon-precomposed.png"'
  - 'path.matches("\\.(manifest|ttf|woff|woff2|jpg|jpeg|gif|png|webp|avif|svg|mp4|webm|css|js|mjs|wasm)$")'

  # User agents that advertise outdated browsers, engines or platforms.
  is-suspicious-crawler:
  # Presto/ or Trident/ engine tokens
  - 'userAgent.contains("Presto/") || userAgent.contains("Trident/")'
  # Old IE browsers (MSIE 2 through 11)
  - 'userAgent.matches("MSIE ([2-9]|10|11)\\.")'
  # Old Linux / FreeBSD browsers (i386 / i686 builds)
  - 'userAgent.matches("Linux i[63]86") || userAgent.matches("FreeBSD i[63]86")'
  # Old Windows browsers (3, 95, 98, CE, and NT 1.x through 5.x)
  - 'userAgent.matches("Windows (3|95|98|CE)") || userAgent.matches("Windows NT [1-5]\\.")'
  # Old mobile browsers (Android 1-5, iPad/iPhone OS 1-9)
  - 'userAgent.matches("Android [1-5]\\.") || userAgent.matches("(iPad|iPhone) OS [1-9]_")'
  # Old generic browsers
  - 'userAgent.startsWith("Opera/")'
  # Disabled entry, kept for reference:
  # - 'userAgent.matches("Gecko/(201[0-9]|200[0-9])")'
  # User agents starting with Mozilla/1 through Mozilla/4
  - 'userAgent.matches("^Mozilla/[1-4]")'
# Rules are evaluated one after another, from the top of this list down.
rules:
# Pass requests matching the is-well-known-asset condition. That condition is
# not defined in this file; presumably it is supplied by a snippet.
- name: allow-well-known-resources
  conditions:
  - '($is-well-known-asset)'
  action: pass
# Pass requests matching the is-static-asset condition.
- name: allow-static-resources
  conditions:
  - '($is-static-asset)'
  action: pass
# Pass the crawlers we want to allow.
# NOTE(review): the *is-bot-... aliases below have no matching anchor in this
# file; presumably the anchors are provided by the snippets -- confirm.
- name: desired-crawlers
  conditions:
  - *is-bot-googlebot
  - *is-bot-bingbot
  - *is-bot-duckduckbot
  - *is-bot-kagibot
  - *is-bot-qwantbot
  - *is-bot-yandexbot
  action: pass
# Matches private networks and localhost.
# Uncomment this if you want to let your own tools through this way
# - name: allow-private-networks
# conditions:
# # Allows localhost and private networks CIDR
# - *is-network-localhost
# - *is-network-private
# action: pass
# Drop requests from crawlers we never want to serve.
- name: undesired-crawlers
  conditions:
  # Headless Chromium; this condition is not defined in this file and is
  # presumably supplied by a snippet.
  - '($is-headless-chromium)'
  - 'userAgent.startsWith("Lightpanda/")'
  - 'userAgent.startsWith("masscan/")'
  # Botnet with a typo'd Opera user agent: the version is followed by ".("
  - 'userAgent.matches("^Opera/[0-9.]+\\.\\(")'

  # AI crawlers: they do not respect robots.txt even while they read it.
  # TikTok / Bytedance AI training
  - 'userAgent.contains("Bytedance") || userAgent.contains("Bytespider") || userAgent.contains("TikTokSpider")'
  # Meta AI training. The Meta-ExternalAgent crawler crawls the web for use
  # cases such as training AI models or improving products by indexing
  # content directly.
  - 'userAgent.contains("meta-externalagent/") || userAgent.contains("meta-externalfetcher/") || userAgent.contains("FacebookBot")'
  # Anthropic AI training and usage
  - 'userAgent.contains("ClaudeBot") || userAgent.contains("Claude-User")|| userAgent.contains("Claude-SearchBot")'
  # Common Crawl AI crawlers
  - 'userAgent.contains("CCBot")'
  # ChatGPT AI crawlers, see https://platform.openai.com/docs/bots
  - 'userAgent.contains("GPTBot") || userAgent.contains("OAI-SearchBot") || userAgent.contains("ChatGPT-User")'
  # Other AI crawlers
  - 'userAgent.contains("Amazonbot") || userAgent.contains("Google-Extended") || userAgent.contains("PanguBot") || userAgent.contains("AI2Bot") || userAgent.contains("Diffbot") || userAgent.contains("cohere-training-data-crawler") || userAgent.contains("Applebot-Extended")'

  # SEO / ads and marketing
  - 'userAgent.contains("BLEXBot")'
  action: drop
# Deny requests whose user agent is the empty string.
- name: unknown-crawlers
  conditions:
  - 'userAgent == ""'
  action: deny
# Check a sequence of challenges for clients matching the
# is-suspicious-crawler condition. The parent rule's own action is `none`;
# the checks are listed on its child rules.
- name: suspicious-crawlers
  conditions:
  - '($is-suspicious-crawler)'
  action: none
  children:
  # Child names are quoted so they load as strings rather than as integers.
  - name: "0"
    action: check
    settings:
      challenges: [js-refresh]
  - name: "1"
    action: check
    settings:
      challenges: [preload-link, resource-load]
  - name: "2"
    action: check
    settings:
      challenges: [header-refresh]
# Pass requests for the site root ("/").
- name: homesite
  conditions:
  - 'path == "/"'
  action: pass
# Check the DNSBL and serve harder challenges on failure.
# TODO: make this specific to score
- name: undesired-dnsbl
  action: check
  settings:
    challenges: [dnsbl]
    # When the DNSBL check fails, check additional challenges
    fail: check
    fail-settings:
      challenges: [js-refresh]
# Check js-refresh for the facebookexternalhit/ and facebookcatalog/ fetchers.
- name: suspicious-fetchers
  conditions:
  - 'userAgent.contains("facebookexternalhit/") || userAgent.contains("facebookcatalog/")'
  action: check
  settings:
    challenges: [js-refresh]
# Pass every request whose method is neither HEAD nor GET
# (PUT/DELETE/PATCH/POST and so on).
- name: non-get-request
  conditions:
  - '!(method == "HEAD" || method == "GET")'
  action: pass
# Enable fetching OpenGraph and other tags from the backend.
# This rule lists no conditions of its own.
- name: enable-meta-tags
  action: context
  settings:
    context-set:
      # Map OpenGraph or similar <meta> tags back to the reply, even if
      # denied/challenged
      proxy-meta-tags: "true"

    # Set additional response headers (disabled example)
    # response-headers:
    #   X-Clacks-Overhead:
    #   - GNU Terry Pratchett
# Challenge the Lynx text browser with meta-refresh and cookie.
- name: plaintext-browser
  conditions:
  - 'userAgent.startsWith("Lynx/")'
  action: challenge
  settings:
    challenges: [meta-refresh, cookie]
# Uncomment this rule to challenge tool-like user agents
#- name: standard-tools
# action: challenge
# settings:
# challenges: [cookie]
# conditions:
# - '($is-generic-robot-ua)'
# - '($is-tool-ua)'
# - '!($is-generic-browser)'
# Challenge clients matching the is-generic-browser condition. That condition
# is not defined in this file; presumably it is supplied by a snippet.
- name: standard-browser
  conditions:
  - '($is-generic-browser)'
  action: challenge
  settings:
    challenges: [preload-link, meta-refresh, resource-load, js-refresh]
# If end of rules is reached, default is PASS