From d83fe3653a6e4cd00a99b009f882ee9cbbbab0e9 Mon Sep 17 00:00:00 2001 From: WeebDataHoarder Date: Wed, 23 Apr 2025 07:25:06 +0200 Subject: [PATCH] examples: update bot matches, allow badges to be fetched --- examples/forgejo.yml | 6 ++---- examples/generic.yml | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/forgejo.yml b/examples/forgejo.yml index 0c78257..eb2d46b 100644 --- a/examples/forgejo.yml +++ b/examples/forgejo.yml @@ -193,7 +193,7 @@ conditions: - 'path.matches("^/[^/]+/[^/]+/(git-upload-pack|git-receive-pack|HEAD|info/refs|info/lfs|objects)")' is-generic-robot-ua: - - 'userAgent.contains("compatible;") && !userAgent.contains("Trident/")' + - 'userAgent.matches("compatible[;)]") && !userAgent.contains("Trident/")' - 'userAgent.matches("\\+https?://")' - 'userAgent.contains("@")' - 'userAgent.matches("[bB]ot/[0-9]")' @@ -383,7 +383,7 @@ rules: # generic /*/*/ match gave too many options for scrapers to trigger random endpoints # this is a negative match of endpoints that Forgejo holds as reserved as users or orgs # see https://codeberg.org/forgejo/forgejo/src/branch/forgejo/models/user/user.go#L582 - - '(path.matches("^/[^/]+/[^/]+/?$") || path.matches("^/[^/]+/[^/]+/(issues|pulls)/[0-9]+$") || (path.matches("^/[^/]+/?$") && size(query) == 0)) && !path.matches("(?i)^/(api|metrics|v2|assets|attachments|avatar|avatars|repo-avatars|captcha|login|org|repo|user|admin|devtest|explore|issues|pulls|milestones|notifications|ghost)(/|$)")' + - '(path.matches("^/[^/]+/[^/]+/?$") || path.matches("^/[^/]+/[^/]+/badges/") || path.matches("^/[^/]+/[^/]+/(issues|pulls)/[0-9]+$") || (path.matches("^/[^/]+/?$") && size(query) == 0)) && !path.matches("(?i)^/(api|metrics|v2|assets|attachments|avatar|avatars|repo-avatars|captcha|login|org|repo|user|admin|devtest|explore|issues|pulls|milestones|notifications|ghost)(/|$)")' action: pass - name: desired-crawlers @@ -409,8 +409,6 @@ rules: action: check settings: challenges: [ self-resource-load, js-pow-sha256, http-cookie-check ] - settings: - challenges: [self-preload-link, self-header-refresh, js-pow-sha256, http-cookie-check] - name: standard-bots action: check diff --git a/examples/generic.yml b/examples/generic.yml index 5523bab..cf3a5c0 100644 --- a/examples/generic.yml +++ b/examples/generic.yml @@ -112,7 +112,7 @@ conditions: is-generic-robot-ua: - - 'userAgent.contains("compatible;") && !userAgent.contains("Trident/")' + - 'userAgent.matches("compatible[;)]") && !userAgent.contains("Trident/")' - 'userAgent.matches("\\+https?://")' - 'userAgent.contains("@")' - 'userAgent.matches("[bB]ot/[0-9]")'