mirror of
https://codeberg.org/aryak/mozhi
synced 2024-11-22 17:22:58 +05:30
Merge pull request 'instances: add non-clearnet to json, add retries' (#51) from NoPlagiarism/mozhi:tor_i2p_json into master
Reviewed-on: https://codeberg.org/aryak/mozhi/pulls/51
This commit is contained in:
commit
306f23592b
@ -64,5 +64,29 @@
|
||||
"link": "https://translate.privacyredirect.com",
|
||||
"cloudflare": false,
|
||||
"host": "Private WebHost"
|
||||
},
|
||||
{
|
||||
"country": "USA",
|
||||
"onion": "http://mozhi.wsuno6lnjdcsiok5mrxvl6e2bdex7nhsqqav6ux7tkwrqiqnulejfbyd.onion",
|
||||
"cloudflare": false,
|
||||
"host": "Hetzner"
|
||||
},
|
||||
{
|
||||
"country": "Germany",
|
||||
"onion": "http://42i2bzogwkph3dvoo2bm6srskf7vvabsphw7uzftymbjjlzgfluhnmid.onion",
|
||||
"cloudflare": false,
|
||||
"host": "Datalix"
|
||||
},
|
||||
{
|
||||
"country": "Germany",
|
||||
"onion": "http://3mu2almmcv7rd7wlwhmkbwqgttntgpqu3hdanutxbv2v72wzbxe5ixqd.onion",
|
||||
"cloudflare": false,
|
||||
"host": "pawhost.de"
|
||||
},
|
||||
{
|
||||
"country": "Germany",
|
||||
"i2p": "http://74lptlnvaukcjnmqefedgna35ahkqexqzq2qq3k7utc2ep4jotcq.b32.i2p",
|
||||
"cloudflare": false,
|
||||
"host": "pawhost.de"
|
||||
}
|
||||
]
|
@ -1,31 +1,62 @@
|
||||
#!/usr/bin/python3
|
||||
import requests
|
||||
import json
|
||||
from bs4 import BeautifulSoup
|
||||
import sys
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib3.util import Retry
|
||||
import requests
|
||||
from requests.adapters import HTTPAdapter
|
||||
|
||||
print("Getting HTML")
|
||||
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 MozhiInstanceFetcher/1.0 (+codeberg.org/aryak/mozhi)'
|
||||
}
|
||||
|
||||
session = requests.Session()
|
||||
session.headers.update(headers)
|
||||
retries = Retry(total=5,
|
||||
connect=3, # ConnectTimeoutError
|
||||
read=False, # ReadTimeoutError or ProtocolError
|
||||
redirect=False, # obvi, any redirections
|
||||
status=2, # Status codes by server
|
||||
backoff_factor=1,
|
||||
backoff_max=30, # Just to be sure that script don't go sleep for a minute
|
||||
respect_retry_after_header=False)
|
||||
http_adapter = HTTPAdapter(max_retries=retries)
|
||||
session.mount('https://', http_adapter)
|
||||
session.mount('http://', http_adapter)
|
||||
|
||||
|
||||
print("Getting HTML")
|
||||
# Get the HTML from the page
|
||||
r = requests.get('https://codeberg.org/aryak/mozhi', headers=headers)
|
||||
r = session.get('https://codeberg.org/aryak/mozhi/src/branch/master/README.md') # XXX: relied on Codeberg
|
||||
|
||||
# Parse the HTML
soup = BeautifulSoup(r.text, 'html.parser')

print("Scraping started")

# Get table after Instances header
instances_h2 = soup.find("h2", string="Instances")
try:
    table = instances_h2.find_next_sibling("table")
except AttributeError:
    # soup.find() returned None: the "Instances" header is missing entirely.
    print("Instances header not found")
    sys.exit(-1)
if table is None:
    # Header exists but no <table> follows it; bail out instead of crashing
    # later on table.find_all with a NoneType error.
    print("Instances header not found")
    sys.exit(-1)

# Get all rows and columns. Skip the first row because it's the header
rows = table.find_all('tr')[1:]
|
||||
|
||||
def get_net_type(url: str) -> str:
    """Classify an instance URL by overlay network.

    Returns "onion", "i2p", or "lokinet" for the matching hidden-service
    suffix, and "link" for ordinary clearnet addresses. Trailing slashes
    are ignored so "http://x.onion/" classifies the same as "http://x.onion".
    """
    trimmed = url.strip("/")
    suffix_map = (
        (".onion", "onion"),
        (".i2p", "i2p"),
        (".loki", "lokinet"),
    )
    for suffix, net_type in suffix_map:
        if trimmed.endswith(suffix):
            return net_type
    return "link"
|
||||
|
||||
theJson = []
|
||||
|
||||
for row in rows:
|
||||
@ -36,13 +67,13 @@ for row in rows:
|
||||
host = row.find_all('td')[3].text
|
||||
|
||||
print("Scraping " + row.find_all('td')[0].find('a')['href'] + ' instance...')
|
||||
if cloudflare == 'Yes':
|
||||
isCloudflare = True
|
||||
else:
|
||||
isCloudflare = False
|
||||
isCloudflare = cloudflare == "Yes"
|
||||
|
||||
try:
|
||||
r = requests.get(link + '/', headers=headers)
|
||||
if get_net_type(url=link) == "link":
|
||||
r = session.get(link + '/', headers=headers)
|
||||
else:
|
||||
print(f"Non-clearnet mirror [{row.find_all('td')[0].find('a').get_text()}]. Skipping check")
|
||||
if r.status_code != 200:
|
||||
print("Error while fetching " + link + '/. We got a ' + str(r.status_code) + ' status code. Skipping...')
|
||||
continue
|
||||
@ -52,7 +83,7 @@ for row in rows:
|
||||
|
||||
theJson.append({
|
||||
'country': country,
|
||||
'link': link,
|
||||
get_net_type(url=link): link,
|
||||
'cloudflare': isCloudflare,
|
||||
'host': host,
|
||||
})
|
||||
|
Loading…
Reference in New Issue
Block a user