From 31c8aeba934e4ae2864a77abda4fd2b66d1e145e Mon Sep 17 00:00:00 2001
From: NoPlagiarism
Date: Thu, 5 Sep 2024 13:33:04 +0500
Subject: [PATCH] instances: add non-clearnet to json, add retries

---
Review notes (ignored by git-am, not part of the commit message):
- Non-clearnet mirrors no longer fall through to a status check on the
  response left over from a previous request: `r` is reset to None when no
  request is made, and the status check is skipped for None.
- NOTE(review): line breaks and indentation were restored from a
  whitespace-collapsed copy of this patch. Blank context lines and the JSON
  indent width are assumptions; verify them against the tree before applying.
- NOTE(review): the post-image blob id on the instances2json.py index line
  predates the fix above.

 instances.json    | 24 ++++++++++++++++++
 instances2json.py | 64 ++++++++++++++++++++++++++++++++++++------------
 2 files changed, 72 insertions(+), 16 deletions(-)

diff --git a/instances.json b/instances.json
index 4323b7e..222942c 100644
--- a/instances.json
+++ b/instances.json
@@ -64,5 +64,29 @@
         "link": "https://translate.privacyredirect.com",
         "cloudflare": false,
         "host": "Private WebHost"
+    },
+    {
+        "country": "USA",
+        "onion": "http://mozhi.wsuno6lnjdcsiok5mrxvl6e2bdex7nhsqqav6ux7tkwrqiqnulejfbyd.onion",
+        "cloudflare": false,
+        "host": "Hetzner"
+    },
+    {
+        "country": "Germany",
+        "onion": "http://42i2bzogwkph3dvoo2bm6srskf7vvabsphw7uzftymbjjlzgfluhnmid.onion",
+        "cloudflare": false,
+        "host": "Datalix"
+    },
+    {
+        "country": "Germany",
+        "onion": "http://3mu2almmcv7rd7wlwhmkbwqgttntgpqu3hdanutxbv2v72wzbxe5ixqd.onion",
+        "cloudflare": false,
+        "host": "pawhost.de"
+    },
+    {
+        "country": "Germany",
+        "i2p": "http://74lptlnvaukcjnmqefedgna35ahkqexqzq2qq3k7utc2ep4jotcq.b32.i2p",
+        "cloudflare": false,
+        "host": "pawhost.de"
     }
 ]
\ No newline at end of file
diff --git a/instances2json.py b/instances2json.py
index 3d85fc3..d0f895e 100644
--- a/instances2json.py
+++ b/instances2json.py
@@ -1,31 +1,62 @@
 #!/usr/bin/python3
-import requests
 import json
-from bs4 import BeautifulSoup
+import sys
+
+from bs4 import BeautifulSoup
+from urllib3.util import Retry
+import requests
+from requests.adapters import HTTPAdapter
 
-print("Getting HTML")
 headers = {
     'User-Agent': 'Mozilla/5.0 MozhiInstanceFetcher/1.0 (+codeberg.org/aryak/mozhi)'
 }
 
+session = requests.Session()
+session.headers.update(headers)
+retries = Retry(total=5,
+                connect=3,  # ConnectTimeoutError
+                read=False,  # ReadTimeoutError or ProtocolError
+                redirect=False,  # obvi, any redirections
+                status=2,  # Status codes by server
+                backoff_factor=1,
+                backoff_max=30,  # Just to be sure that script don't go sleep for a minute
+                respect_retry_after_header=False)
+http_adapter = HTTPAdapter(max_retries=retries)
+session.mount('https://', http_adapter)
+session.mount('http://', http_adapter)
+
+
+print("Getting HTML")
 # Get the HTML from the page
 
-r = requests.get('https://codeberg.org/aryak/mozhi', headers=headers)
+r = session.get('https://codeberg.org/aryak/mozhi/src/branch/master/README.md')  # XXX: relied on Codeberg
 
 # Parse the HTML
 
 soup = BeautifulSoup(r.text, 'html.parser')
 
 print("Scraping started")
-# Get tables
-tables = soup.find_all('table')
-
-# Get table with header 'Master Branch'
-table = tables[1]
+# Get table after Instances header
+instances_h2 = soup.find("h2", string="Instances")
+try:
+    table = instances_h2.find_next_sibling("table")
+except AttributeError:
+    print("Instances header not found")
+    sys.exit(-1)
 
 # Get all rows and columns. Skip the first row because it's the header
 rows = table.find_all('tr')[1:]
 
+def get_net_type(url: str):
+    url = url.strip("/")
+    if url.endswith(".onion"):
+        return "onion"
+    elif url.endswith(".i2p"):
+        return "i2p"
+    elif url.endswith(".loki"):
+        return "lokinet"
+    return "link"
+
 theJson = []
 
 for row in rows:
@@ -36,13 +67,14 @@ for row in rows:
     host = row.find_all('td')[3].text
 
     print("Scraping " + row.find_all('td')[0].find('a')['href'] + ' instance...')
-    if cloudflare == 'Yes':
-        isCloudflare = True
-    else:
-        isCloudflare = False
+    isCloudflare = cloudflare == "Yes"
 
     try:
-        r = requests.get(link + '/', headers=headers)
-        if r.status_code != 200:
+        if get_net_type(url=link) == "link":
+            r = session.get(link + '/', headers=headers)
+        else:
+            print(f"Non-clearnet mirror [{row.find_all('td')[0].find('a').get_text()}]. Skipping check")
+            r = None  # nothing was fetched; never reuse the previous response
+        if r is not None and r.status_code != 200:
             print("Error while fetching " + link + '/. We got a ' + str(r.status_code) + ' status code. Skipping...')
             continue
@@ -52,7 +84,7 @@ for row in rows:
 
     theJson.append({
         'country': country,
-        'link': link,
+        get_net_type(url=link): link,
         'cloudflare': isCloudflare,
         'host': host,
     })