From 31c8aeba934e4ae2864a77abda4fd2b66d1e145e Mon Sep 17 00:00:00 2001
From: NoPlagiarism <NoPlagiarism@noreply.codeberg.org>
Date: Thu, 5 Sep 2024 13:33:04 +0500
Subject: [PATCH] instances: add non-clearnet to json, add retries

---
 instances.json    | 24 +++++++++++++++++++
 instances2json.py | 61 +++++++++++++++++++++++++++++++++++------------
 2 files changed, 70 insertions(+), 15 deletions(-)

diff --git a/instances.json b/instances.json
index 4323b7e..222942c 100644
--- a/instances.json
+++ b/instances.json
@@ -64,5 +64,29 @@
         "link": "https://translate.privacyredirect.com",
         "cloudflare": false,
         "host": "Private WebHost"
+    },
+    {
+        "country": "USA",
+        "onion": "http://mozhi.wsuno6lnjdcsiok5mrxvl6e2bdex7nhsqqav6ux7tkwrqiqnulejfbyd.onion",
+        "cloudflare": false,
+        "host": "Hetzner"
+    },
+    {
+        "country": "Germany",
+        "onion": "http://42i2bzogwkph3dvoo2bm6srskf7vvabsphw7uzftymbjjlzgfluhnmid.onion",
+        "cloudflare": false,
+        "host": "Datalix"
+    },
+    {
+        "country": "Germany",
+        "onion": "http://3mu2almmcv7rd7wlwhmkbwqgttntgpqu3hdanutxbv2v72wzbxe5ixqd.onion",
+        "cloudflare": false,
+        "host": "pawhost.de"
+    },
+    {
+        "country": "Germany",
+        "i2p": "http://74lptlnvaukcjnmqefedgna35ahkqexqzq2qq3k7utc2ep4jotcq.b32.i2p",
+        "cloudflare": false,
+        "host": "pawhost.de"
     }
 ]
\ No newline at end of file
diff --git a/instances2json.py b/instances2json.py
index 3d85fc3..d0f895e 100644
--- a/instances2json.py
+++ b/instances2json.py
@@ -1,31 +1,62 @@
 #!/usr/bin/python3
-import requests
 import json
-from bs4 import BeautifulSoup
+import sys
+
+from bs4 import BeautifulSoup
+from urllib3.util import Retry
+import requests
+from requests.adapters import HTTPAdapter
 
-print("Getting HTML")
 
 headers = {
     'User-Agent': 'Mozilla/5.0 MozhiInstanceFetcher/1.0 (+codeberg.org/aryak/mozhi)'
 }
 
+session = requests.Session()
+session.headers.update(headers)
+retries = Retry(total=5,
+                connect=3,       # ConnectTimeoutError
+                read=False,      # ReadTimeoutError or ProtocolError
+                redirect=False,  # any redirects
+                status=2,        # NOTE(review): looks inert without a status_forcelist - confirm
+                backoff_factor=1,
+                backoff_max=30,  # Upper bound on one backoff sleep (kwarg presumably needs urllib3 >= 2 - verify)
+                respect_retry_after_header=False)
+http_adapter = HTTPAdapter(max_retries=retries)
+session.mount('https://', http_adapter)
+session.mount('http://', http_adapter)
+
+
+print("Getting HTML")
 # Get the HTML from the page
-r = requests.get('https://codeberg.org/aryak/mozhi', headers=headers)
+r = session.get('https://codeberg.org/aryak/mozhi/src/branch/master/README.md')  # XXX: relies on Codeberg serving this README as HTML
 
 # Parse the HTML
 soup = BeautifulSoup(r.text, 'html.parser')
 
 print("Scraping started")
 
-# Get tables
-tables = soup.find_all('table')
-
-# Get table with header 'Master Branch'
-table = tables[1]
+# Get table after Instances header. Both lookups return None on a miss, so
+# check the results: a missing table would otherwise crash on table.find_all().
+instances_h2 = soup.find("h2", string="Instances")
+table = instances_h2.find_next_sibling("table") if instances_h2 is not None else None
+if table is None:
+    print("Instances header not found" if instances_h2 is None else "Instances table not found")
+    sys.exit(-1)
 
 # Get all rows and columns. Skip the first row because it's the header
 rows = table.find_all('tr')[1:]
 
+def get_net_type(url: str) -> str:
+    """Return the JSON key for url: "onion", "i2p", "lokinet", or "link" for clearnet."""
+    from urllib.parse import urlsplit
+    try:  # match on the hostname so a path or port after the TLD cannot hide it
+        host = urlsplit(url).hostname or url.strip("/")
+    except ValueError:  # malformed URL: fall back to the raw string
+        host = url.strip("/")
+    nets = {".onion": "onion", ".i2p": "i2p", ".loki": "lokinet"}
+    return next((net for tld, net in nets.items() if host.lower().endswith(tld)), "link")
+
 theJson = []
 
 for row in rows:
@@ -36,13 +67,13 @@ for row in rows:
     host = row.find_all('td')[3].text
 
     print("Scraping " + row.find_all('td')[0].find('a')['href'] + ' instance...')
-    if cloudflare == 'Yes':
-        isCloudflare = True
-    else:
-        isCloudflare = False
+    isCloudflare = cloudflare == "Yes"
 
     try:
-        r = requests.get(link + '/', headers=headers)
-        if r.status_code != 200:
+        # Probe clearnet mirrors only; r is None otherwise so a stale response is never re-checked.
+        r = session.get(link + '/', headers=headers) if get_net_type(url=link) == "link" else None
+        if r is None:
+            print(f"Non-clearnet mirror [{row.find_all('td')[0].find('a').get_text()}]. Skipping check")
+        elif r.status_code != 200:
             print("Error while fetching " + link + '/. We got a ' + str(r.status_code) + ' status code. Skipping...')
             continue
@@ -52,7 +83,7 @@ for row in rows:
 
     theJson.append({
         'country': country,
-        'link': link,
+        get_net_type(url=link): link,
         'cloudflare': isCloudflare,
         'host': host,
     })