mirror of
https://codeberg.org/aryak/mozhi
synced 2024-11-23 04:12:58 +05:30
Merge pull request 'instances: add non-clearnet to json, add retries' (#51) from NoPlagiarism/mozhi:tor_i2p_json into master
Reviewed-on: https://codeberg.org/aryak/mozhi/pulls/51
This commit is contained in:
commit
306f23592b
@ -64,5 +64,29 @@
|
|||||||
"link": "https://translate.privacyredirect.com",
|
"link": "https://translate.privacyredirect.com",
|
||||||
"cloudflare": false,
|
"cloudflare": false,
|
||||||
"host": "Private WebHost"
|
"host": "Private WebHost"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"country": "USA",
|
||||||
|
"onion": "http://mozhi.wsuno6lnjdcsiok5mrxvl6e2bdex7nhsqqav6ux7tkwrqiqnulejfbyd.onion",
|
||||||
|
"cloudflare": false,
|
||||||
|
"host": "Hetzner"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"country": "Germany",
|
||||||
|
"onion": "http://42i2bzogwkph3dvoo2bm6srskf7vvabsphw7uzftymbjjlzgfluhnmid.onion",
|
||||||
|
"cloudflare": false,
|
||||||
|
"host": "Datalix"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"country": "Germany",
|
||||||
|
"onion": "http://3mu2almmcv7rd7wlwhmkbwqgttntgpqu3hdanutxbv2v72wzbxe5ixqd.onion",
|
||||||
|
"cloudflare": false,
|
||||||
|
"host": "pawhost.de"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"country": "Germany",
|
||||||
|
"i2p": "http://74lptlnvaukcjnmqefedgna35ahkqexqzq2qq3k7utc2ep4jotcq.b32.i2p",
|
||||||
|
"cloudflare": false,
|
||||||
|
"host": "pawhost.de"
|
||||||
}
|
}
|
||||||
]
|
]
|
@ -1,31 +1,62 @@
|
|||||||
#!/usr/bin/python3
|
#!/usr/bin/python3
|
||||||
import requests
|
|
||||||
import json
|
import json
|
||||||
from bs4 import BeautifulSoup
|
import sys
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from urllib3.util import Retry
|
||||||
|
import requests
|
||||||
|
from requests.adapters import HTTPAdapter
|
||||||
|
|
||||||
print("Getting HTML")
|
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
'User-Agent': 'Mozilla/5.0 MozhiInstanceFetcher/1.0 (+codeberg.org/aryak/mozhi)'
|
'User-Agent': 'Mozilla/5.0 MozhiInstanceFetcher/1.0 (+codeberg.org/aryak/mozhi)'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
session = requests.Session()
|
||||||
|
session.headers.update(headers)
|
||||||
|
retries = Retry(total=5,
|
||||||
|
connect=3, # ConnectTimeoutError
|
||||||
|
read=False, # ReadTimeoutError or ProtocolError
|
||||||
|
redirect=False, # obvi, any redirections
|
||||||
|
status=2, # Status codes by server
|
||||||
|
backoff_factor=1,
|
||||||
|
backoff_max=30, # Just to be sure that script don't go sleep for a minute
|
||||||
|
respect_retry_after_header=False)
|
||||||
|
http_adapter = HTTPAdapter(max_retries=retries)
|
||||||
|
session.mount('https://', http_adapter)
|
||||||
|
session.mount('http://', http_adapter)
|
||||||
|
|
||||||
|
|
||||||
|
print("Getting HTML")
|
||||||
# Get the HTML from the page
|
# Get the HTML from the page
|
||||||
r = requests.get('https://codeberg.org/aryak/mozhi', headers=headers)
|
r = session.get('https://codeberg.org/aryak/mozhi/src/branch/master/README.md') # XXX: relied on Codeberg
|
||||||
|
|
||||||
# Parse the HTML
|
# Parse the HTML
|
||||||
soup = BeautifulSoup(r.text, 'html.parser')
|
soup = BeautifulSoup(r.text, 'html.parser')
|
||||||
|
|
||||||
print("Scraping started")
|
print("Scraping started")
|
||||||
|
|
||||||
# Get tables
|
# Get table after Instances header
|
||||||
tables = soup.find_all('table')
|
instances_h2 = soup.find("h2", string="Instances")
|
||||||
|
try:
|
||||||
# Get table with header 'Master Branch'
|
table = instances_h2.find_next_sibling("table")
|
||||||
table = tables[1]
|
except AttributeError:
|
||||||
|
print("Instances header not found")
|
||||||
|
sys.exit(-1)
|
||||||
|
|
||||||
# Get all rows and columns. Skip the first row because it's the header
|
# Get all rows and columns. Skip the first row because it's the header
|
||||||
rows = table.find_all('tr')[1:]
|
rows = table.find_all('tr')[1:]
|
||||||
|
|
||||||
|
def get_net_type(url: str) -> str:
    """Return the JSON key under which an instance URL should be stored.

    The key is chosen from the top-level domain of the URL's host:
    ``"onion"`` for ``.onion``, ``"i2p"`` for ``.i2p``, ``"lokinet"`` for
    ``.loki`` and ``"link"`` for everything else.

    The decision is made on the parsed hostname rather than on the raw
    string, so a port, path, query string, upper-case letters or
    surrounding whitespace do not change the result, and a clearnet URL
    whose *path* merely ends in ``.onion`` is not misclassified.

    Args:
        url: Instance address, with or without a scheme.

    Returns:
        One of ``"onion"``, ``"i2p"``, ``"lokinet"`` or ``"link"``.
    """
    # Imported locally so the function does not rely on a file-level import.
    from urllib.parse import urlsplit

    candidate = url.strip()
    # urlsplit() only recognises the host when the authority part is
    # introduced by "//"; add it for scheme-less input such as "abc.onion".
    if "://" not in candidate and not candidate.startswith("//"):
        candidate = "//" + candidate

    try:
        host = urlsplit(candidate).hostname or ""
    except ValueError:
        # Malformed authority (e.g. an unbalanced IPv6 bracket).
        host = ""

    if not host:
        # Nothing parseable: fall back to matching on the raw string.
        host = url.strip("/")

    # A fully-qualified name may carry a trailing dot; case is irrelevant.
    host = host.rstrip(".").lower()

    suffix_to_type = (
        (".onion", "onion"),
        (".i2p", "i2p"),
        (".loki", "lokinet"),
    )
    for suffix, net_type in suffix_to_type:
        if host.endswith(suffix):
            return net_type
    return "link"
|
||||||
|
|
||||||
theJson = []
|
theJson = []
|
||||||
|
|
||||||
for row in rows:
|
for row in rows:
|
||||||
@ -36,13 +67,13 @@ for row in rows:
|
|||||||
host = row.find_all('td')[3].text
|
host = row.find_all('td')[3].text
|
||||||
|
|
||||||
print("Scraping " + row.find_all('td')[0].find('a')['href'] + ' instance...')
|
print("Scraping " + row.find_all('td')[0].find('a')['href'] + ' instance...')
|
||||||
if cloudflare == 'Yes':
|
isCloudflare = cloudflare == "Yes"
|
||||||
isCloudflare = True
|
|
||||||
else:
|
|
||||||
isCloudflare = False
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
r = requests.get(link + '/', headers=headers)
|
if get_net_type(url=link) == "link":
|
||||||
|
r = session.get(link + '/', headers=headers)
|
||||||
|
else:
|
||||||
|
print(f"Non-clearnet mirror [{row.find_all('td')[0].find('a').get_text()}]. Skipping check")
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
print("Error while fetching " + link + '/. We got a ' + str(r.status_code) + ' status code. Skipping...')
|
print("Error while fetching " + link + '/. We got a ' + str(r.status_code) + ' status code. Skipping...')
|
||||||
continue
|
continue
|
||||||
@ -52,7 +83,7 @@ for row in rows:
|
|||||||
|
|
||||||
theJson.append({
|
theJson.append({
|
||||||
'country': country,
|
'country': country,
|
||||||
'link': link,
|
get_net_type(url=link): link,
|
||||||
'cloudflare': isCloudflare,
|
'cloudflare': isCloudflare,
|
||||||
'host': host,
|
'host': host,
|
||||||
})
|
})
|
||||||
|
Loading…
Reference in New Issue
Block a user