# vnp/sites/ReceivesmsCo.py
# 160 lines, 4.5 KiB, Python
# Last modified: 2023-06-12 05:13:51 +05:30
import re
import WebSiteParser
class ReceivesmsCo(WebSiteParser.WebSiteParser):
    """Site parser for receivesms.co, a free disposable-SMS-number listing."""

    # Domain this parser is responsible for; the parent class is keyed on it.
    SITE_DOMAIN = "receivesms.co"

    def __init__(self):
        super().__init__(self.SITE_DOMAIN)
def Parse(self):
    """Crawl the whole site.

    Walks the main country list, then each country's number list, then
    each number's message-history page; the collected raw records are
    normalized into self.Output by ProcessRawNumbers. Sets
    self.ParseDone on completion.
    """
    # Main page: one URI per country listing.
    country_uris = self.ParseAllFromPage(
        r"<td.*table_link.*>[\s]*<a.*href=.*phone-numbers/.*>.*</a>",
        to_remove=r"(<td.*table_link.*>[\s]*<a.*href=(\"|'))|((\"|').*>.*</a>)"
    )
    numbers = []
    for country_uri in country_uris:
        # Country page: one URI per phone number. startswith() guard also
        # tolerates an empty URI (the old [0] index would raise IndexError).
        country_number_uris = self.ParseAllFromPage(
            r"<td>[\s]*<a.*href=.*target=['\"]_self['\"]>.*</a>[\s]*</td>",
            to_remove=r"(<td>[\s]*<a.*href=(\"|'))|((\"|').*target=(\"|')_self(\"|')>.*</a>[\s]*</td>)",
            location=country_uri[1:] if country_uri.startswith("/") else country_uri
        )
        for num_uri in country_number_uris:
            numbers.append({
                "uri_raw": num_uri[1:] if num_uri.startswith("/") else num_uri
            })
        break  # TODO: remove — debug limiter, crawls only the first country
    # Enrich every raw record with the details from its own page.
    for number in numbers:
        number.update(self.ParseNumberPage(number["uri_raw"]))
    self.ProcessRawNumbers(numbers)
    self.ParseDone = True
    self.Log("parsing done")
def ParseNumberPage(self, uri):
    """Parse a single number's message-history page.

    Returns a dict with "number", "country", "last_use_raw",
    "born_raw" and "times_used"; fields that cannot be parsed keep
    the self.EMPTY placeholder.
    """
    def die(text):
        # Log a parse failure together with the page URI for debugging.
        self.Log(text, f"ParseNumberPage(self at {id(self)}, \"{uri}\")")
    result = {
        "number": self.EMPTY,
        "country": self.EMPTY,
        "last_use_raw": self.EMPTY,
        "born_raw": self.EMPTY,
        "times_used": self.EMPTY
    }
    markup = self.RequestPage(uri).text
    # Country name lives in the page heading: "<h3>... Country - ... Phone Number</h3>".
    country = re.findall(
        r"<h3>[\s]*<a.*href=.*>[\s]*<i.*class=.*></i></a>.*[\s]-[\s].*Phone Number[\s]*</h3>",
        markup
    )
    country = re.sub(
        r"(<h3>[\s]*<a.*href=.*>[\s]*<i.*class=.*></i></a>[\s]*)|([\s]-[\s].*Phone Number[\s]*</h3>)",
        "",
        str(country[0]) if country else ""
    )
    if not country:
        die("error: page parsing failed, country is empty")
        return result
    result["country"] = country
    # The number itself is rendered in an "h3"-styled <div>.
    number = re.findall(
        r"<div.*class=[\"'].*h3.*[\"'].*>+.*</div",
        markup
    )
    number = re.sub(
        r"(<div.*class=[\"'].*h3.*[\"'].*>)|(</div)",
        "",
        str(number[0]) if number else ""
    )
    if not number:
        die("error: page parsing failed, number is empty")
        return result
    result["number"] = number
    # Message header: 'From <sender> (<time> ago)'; exp_head matches a whole
    # header, exp_head_trim strips everything but the "<time> ago" part.
    exp_head = r"<div .*class=[\"'].*message_head.*[\"'].*>[\s]*From[\s]*<a.*>.*</a>[\s]*.*[(].+ago[)][\s]*</div"
    exp_head_trim = r"(<div .*class=[\"'].*message_head.*[\"'].*>[\s]*From[\s]*<a.*>.*</a>[\s]*.*[(])|([)][\s]*</div)"
    msg_heads = re.findall(exp_head, markup)
    if msg_heads:
        # Newest message comes first: its age is the number's last use.
        result["last_use_raw"] = re.sub(exp_head_trim, "", msg_heads[0])
    # Pagination bar: one <li class="page-item"> per link; when more than one
    # entry exists, the first and last are the prev/next arrows.
    nav_links = re.findall(
        r"<li class=[\"']page-item[a-z- ]*[\"']>[\s]*<a class=[\"'][a-zA-Z0-9 -]*[\"'] href=[\"'][a-zA-Z0-9/#-]*[\"']",
        markup
    )
    if nav_links:
        pages_amount = len(nav_links)
        if len(nav_links) > 1:
            pages_amount -= 2  # discard the prev/next arrow entries
        msgs_at_last_page = 0
        # Extract the href of the last (highest-numbered) page.
        last_page_uri = re.sub(
            r"(^<li class=[\"']page-item[a-z- ]*[\"']>[\s]*<a class=[\"'][a-zA-Z0-9 -]*[\"'] href=[\"'])|([\"']$)",
            "",
            nav_links[-1]
        )
        msg_heads_last = self.ParseAllFromPage(
            exp_head,
            exp_head_trim,
            last_page_uri
        )
        if msg_heads_last:
            # Oldest message on the last page: the number's "birth".
            result["born_raw"] = msg_heads_last[-1]
            msgs_at_last_page = len(msg_heads_last)
        # Full pages each carry len(msg_heads) messages, plus the last page.
        result["times_used"] = (pages_amount - 1) * len(msg_heads) + msgs_at_last_page
    else:
        # No pagination: this single page holds the whole history. The old
        # formula produced a negative count ((0 - 1) * len(msg_heads)) and
        # never filled born_raw in this case.
        if msg_heads:
            result["born_raw"] = re.sub(exp_head_trim, "", msg_heads[-1])
        result["times_used"] = len(msg_heads)
    return result
def ProcessRawNumbers(self, raws):
    """Normalize raw parsed number records in place and bucket them.

    Each record gains "last_use", "born" and "url" keys and is appended
    to one of self.Output's buckets based on its "times_used" count.
    Records whose phone number is malformed are logged and skipped.
    """
    # Log context is invariant across the loop; build it once.
    context = f"ProcessRawNumbers(self at {id(self)}, raws at {id(raws)})"
    for raw in raws:
        # A usable number must contain "+<digits>"; skip anything else.
        if not re.search(r"\+[0-9]+", raw["number"]):
            self.Log(f"error: incorrect number: \"{raw['number']}\"", context)
            continue
        # Replace unusable free-text date fields with the EMPTY placeholder.
        if not re.search(r"[a-zA-Z0-9]+", raw["last_use_raw"]):
            self.Log(f"warning: malformed last_use_raw field: \"{raw['last_use_raw']}\"", context)
            raw["last_use_raw"] = self.EMPTY
        raw["last_use"] = raw["last_use_raw"]
        if not re.search(r"[a-zA-Z0-9]+", raw["born_raw"]):
            self.Log(f"warning: malformed born_raw field: \"{raw['born_raw']}\"", context)
            raw["born_raw"] = self.EMPTY
        raw["born"] = raw["born_raw"]
        raw["url"] = self.WebSiteFullURL + raw["uri_raw"]
        # Bucket by usage count.
        if raw["times_used"] == 0:
            self.Output["not_used"].append(raw)
        elif raw["times_used"] < self.AlmostNotUsedThreshold:
            self.Output["almost_not_used"].append(raw)
        else:
            self.Output["heavily_used"].append(raw)