Compare commits

...

2 Commits

Author SHA1 Message Date
306f23592b Merge pull request 'instances: add non-clearnet to json, add retries' (#51) from NoPlagiarism/mozhi:tor_i2p_json into master
All checks were successful
mozhi pipeline / Push Docker image to Codeberg docker registry (push) Successful in 18m12s
mozhi pipeline / Build and publish artifacts (push) Successful in 58m53s
Reviewed-on: https://codeberg.org/aryak/mozhi/pulls/51
2024-09-05 16:41:45 +00:00
NoPlagiarism
31c8aeba93 instances: add non-clearnet to json, add retries 2024-09-05 13:33:04 +05:00
2 changed files with 70 additions and 15 deletions

View File

@@ -64,5 +64,29 @@
"link": "https://translate.privacyredirect.com", "link": "https://translate.privacyredirect.com",
"cloudflare": false, "cloudflare": false,
"host": "Private WebHost" "host": "Private WebHost"
},
{
"country": "USA",
"onion": "http://mozhi.wsuno6lnjdcsiok5mrxvl6e2bdex7nhsqqav6ux7tkwrqiqnulejfbyd.onion",
"cloudflare": false,
"host": "Hetzner"
},
{
"country": "Germany",
"onion": "http://42i2bzogwkph3dvoo2bm6srskf7vvabsphw7uzftymbjjlzgfluhnmid.onion",
"cloudflare": false,
"host": "Datalix"
},
{
"country": "Germany",
"onion": "http://3mu2almmcv7rd7wlwhmkbwqgttntgpqu3hdanutxbv2v72wzbxe5ixqd.onion",
"cloudflare": false,
"host": "pawhost.de"
},
{
"country": "Germany",
"i2p": "http://74lptlnvaukcjnmqefedgna35ahkqexqzq2qq3k7utc2ep4jotcq.b32.i2p",
"cloudflare": false,
"host": "pawhost.de"
}
]

View File

@@ -1,31 +1,62 @@
#!/usr/bin/python3 #!/usr/bin/python3
import requests
import json import json
from bs4 import BeautifulSoup import sys
from bs4 import BeautifulSoup
from urllib3.util import Retry
import requests
from requests.adapters import HTTPAdapter
print("Getting HTML")
headers = { headers = {
'User-Agent': 'Mozilla/5.0 MozhiInstanceFetcher/1.0 (+codeberg.org/aryak/mozhi)' 'User-Agent': 'Mozilla/5.0 MozhiInstanceFetcher/1.0 (+codeberg.org/aryak/mozhi)'
} }
session = requests.Session()
session.headers.update(headers)
retries = Retry(total=5,
connect=3, # ConnectTimeoutError
read=False, # ReadTimeoutError or ProtocolError
redirect=False, # obvi, any redirections
status=2, # Status codes by server
backoff_factor=1,
backoff_max=30, # Just to be sure that script don't go sleep for a minute
respect_retry_after_header=False)
http_adapter = HTTPAdapter(max_retries=retries)
session.mount('https://', http_adapter)
session.mount('http://', http_adapter)
print("Getting HTML")
# Get the HTML from the page # Get the HTML from the page
r = requests.get('https://codeberg.org/aryak/mozhi', headers=headers) r = session.get('https://codeberg.org/aryak/mozhi/src/branch/master/README.md') # XXX: relied on Codeberg
# Parse the HTML # Parse the HTML
soup = BeautifulSoup(r.text, 'html.parser') soup = BeautifulSoup(r.text, 'html.parser')
print("Scraping started") print("Scraping started")
# Get tables # Get table after Instances header
tables = soup.find_all('table') instances_h2 = soup.find("h2", string="Instances")
try:
# Get table with header 'Master Branch' table = instances_h2.find_next_sibling("table")
table = tables[1] except AttributeError:
print("Instances header not found")
sys.exit(-1)
# Get all rows and columns. Skip the first row because it's the header # Get all rows and columns. Skip the first row because it's the header
rows = table.find_all('tr')[1:] rows = table.find_all('tr')[1:]
def get_net_type(url: str):
    """Classify *url* by the network it lives on.

    Returns "onion", "i2p", or "lokinet" when the hostname ends in the
    corresponding hidden-service TLD, and "link" for any clearnet address.
    Trailing slashes are ignored so "http://x.onion/" still matches.
    """
    suffix_to_kind = (
        (".onion", "onion"),
        (".i2p", "i2p"),
        (".loki", "lokinet"),
    )
    trimmed = url.strip("/")
    for suffix, kind in suffix_to_kind:
        if trimmed.endswith(suffix):
            return kind
    return "link"
theJson = [] theJson = []
for row in rows: for row in rows:
@@ -36,13 +67,13 @@ for row in rows:
host = row.find_all('td')[3].text host = row.find_all('td')[3].text
print("Scraping " + row.find_all('td')[0].find('a')['href'] + ' instance...') print("Scraping " + row.find_all('td')[0].find('a')['href'] + ' instance...')
if cloudflare == 'Yes': isCloudflare = cloudflare == "Yes"
isCloudflare = True
else:
isCloudflare = False
try: try:
r = requests.get(link + '/', headers=headers) if get_net_type(url=link) == "link":
r = session.get(link + '/', headers=headers)
else:
print(f"Non-clearnet mirror [{row.find_all('td')[0].find('a').get_text()}]. Skipping check")
if r.status_code != 200: if r.status_code != 200:
print("Error while fetching " + link + '/. We got a ' + str(r.status_code) + ' status code. Skipping...') print("Error while fetching " + link + '/. We got a ' + str(r.status_code) + ' status code. Skipping...')
continue continue
@@ -52,7 +83,7 @@ for row in rows:
theJson.append({ theJson.append({
'country': country, 'country': country,
'link': link, get_net_type(url=link): link,
'cloudflare': isCloudflare, 'cloudflare': isCloudflare,
'host': host, 'host': host,
}) })