160 lines
4.5 KiB
Python
160 lines
4.5 KiB
Python
|
import re
|
||
|
import WebSiteParser
|
||
|
|
||
|
|
||
|
|
||
|
class ReceivesmsCo(WebSiteParser.WebSiteParser):
    """Scraper for the receivesms.co site, built on ``WebSiteParser.WebSiteParser``.

    The base class is expected to provide ``ParseAllFromPage``, ``RequestPage``,
    ``Log``, ``EMPTY``, ``Output``, ``WebSiteFullURL`` and
    ``AlmostNotUsedThreshold``; their exact contracts live outside this class.
    """

    # All patterns are raw strings so that ``\s`` and ``\+`` reach the regex
    # engine untouched instead of being (invalid) Python string escapes.
    # Triple quotes are used wherever the pattern itself contains ``"``.

    # Main page: table cells linking to a per-country list of phone numbers.
    _COUNTRY_LINK_RE = r"<td.*table_link.*>[\s]*<a.*href=.*phone-numbers/.*>.*</a>"
    _COUNTRY_LINK_STRIP_RE = r"""(<td.*table_link.*>[\s]*<a.*href=("|'))|(("|').*>.*</a>)"""

    # Country page: table cells linking to a single number's message history.
    _NUMBER_LINK_RE = r"""<td>[\s]*<a.*href=.*target=['"]_self['"]>.*</a>[\s]*</td>"""
    _NUMBER_LINK_STRIP_RE = (
        r"""(<td>[\s]*<a.*href=("|'))|(("|').*target=("|')_self("|')>.*</a>[\s]*</td>)"""
    )

    # Number page: "<country> - ... Phone Number" heading.
    _COUNTRY_RE = (
        r"<h3>[\s]*<a.*href=.*>[\s]*<i.*class=.*></i></a>.*[\s]-[\s].*Phone Number[\s]*</h3>"
    )
    _COUNTRY_STRIP_RE = (
        r"(<h3>[\s]*<a.*href=.*>[\s]*<i.*class=.*></i></a>[\s]*)"
        r"|([\s]-[\s].*Phone Number[\s]*</h3>)"
    )

    # Number page: element whose class contains "h3" and holds the number.
    _NUMBER_RE = r"""<div.*class=["'].*h3.*["'].*>+.*</div"""
    _NUMBER_STRIP_RE = r"""(<div.*class=["'].*h3.*["'].*>)|(</div)"""

    # Number page: header of one received message, ending in "(... ago)".
    _MSG_HEAD_RE = (
        r"""<div .*class=["'].*message_head.*["'].*>[\s]*From[\s]*<a.*>.*</a>[\s]*.*[(].+ago[)][\s]*</div"""
    )
    _MSG_HEAD_STRIP_RE = (
        r"""(<div .*class=["'].*message_head.*["'].*>[\s]*From[\s]*<a.*>.*</a>[\s]*.*[(])|([)][\s]*</div)"""
    )

    # Number page: one pagination entry, up to and including its href value.
    _NAV_LINK_RE = (
        r"""<li class=["']page-item[a-z- ]*["']>[\s]*<a class=["'][a-zA-Z0-9 -]*["'] href=["'][a-zA-Z0-9/#-]*["']"""
    )
    _NAV_LINK_STRIP_RE = (
        r"""(^<li class=["']page-item[a-z- ]*["']>[\s]*<a class=["'][a-zA-Z0-9 -]*["'] href=["'])|(["']$)"""
    )

    # Validation patterns used when post-processing the raw records.
    _PHONE_RE = r"\+[0-9]+"
    _ALNUM_RE = r"[a-zA-Z0-9]+"

    def __init__(self):
        super().__init__("receivesms.co")

    @staticmethod
    def _strip_leading_slash(uri: str) -> str:
        """Return *uri* without a single leading "/"; safe on an empty string."""
        return uri[1:] if uri.startswith("/") else uri

    def Parse(self) -> None:
        """Perform parsing of the entire web site.

        Walks main page -> country pages -> number pages, collects one raw
        record per number, hands the records to ``ProcessRawNumbers`` and
        finally sets ``self.ParseDone``.
        """
        # Main page: one link per country.
        country_uris = self.ParseAllFromPage(
            self._COUNTRY_LINK_RE,
            to_remove=self._COUNTRY_LINK_STRIP_RE,
        )

        numbers = []
        for country_uri in country_uris:
            # Country page: one link per phone number.
            number_uris = self.ParseAllFromPage(
                self._NUMBER_LINK_RE,
                to_remove=self._NUMBER_LINK_STRIP_RE,
                location=self._strip_leading_slash(country_uri),
            )
            numbers.extend(
                {"uri_raw": self._strip_leading_slash(number_uri)}
                for number_uri in number_uris
            )

        # Enrich every record with the details scraped from its own page.
        for entry in numbers:
            entry.update(self.ParseNumberPage(entry["uri_raw"]))

        self.ProcessRawNumbers(numbers)
        self.ParseDone = True
        self.Log("parsing done")

    def ParseNumberPage(self, uri: str) -> dict:
        """Parse the message-history page of a single number.

        Args:
            uri: Location of the number page, passed to ``self.RequestPage``.

        Returns:
            A dict with the keys ``number``, ``country``, ``last_use_raw``,
            ``born_raw`` and ``times_used``. Any field that could not be
            extracted keeps the value ``self.EMPTY``; if the country or the
            number is missing, the failure is logged and the partially filled
            dict is returned immediately.
        """

        def log_failure(text):
            self.Log(text, f"ParseNumberPage(self at {id(self)}, \"{uri}\")")

        result = {
            "number": self.EMPTY,
            "country": self.EMPTY,
            "last_use_raw": self.EMPTY,
            "born_raw": self.EMPTY,
            "times_used": self.EMPTY,
        }

        markup = self.RequestPage(uri).text

        # Country: take the first matching heading and strip the markup
        # around the country name.
        heading = re.search(self._COUNTRY_RE, markup)
        country = re.sub(
            self._COUNTRY_STRIP_RE, "", heading.group(0) if heading else ""
        )
        if not country:
            log_failure("error: page parsing failed, country is empty")
            return result
        result["country"] = country

        # Number: same approach as for the country.
        number_tag = re.search(self._NUMBER_RE, markup)
        number = re.sub(
            self._NUMBER_STRIP_RE, "", number_tag.group(0) if number_tag else ""
        )
        if not number:
            log_failure("error: page parsing failed, number is empty")
            return result
        result["number"] = number

        # Message headers on the first page; the first one is the latest use.
        msg_heads = re.findall(self._MSG_HEAD_RE, markup)
        if msg_heads:
            result["last_use_raw"] = re.sub(
                self._MSG_HEAD_STRIP_RE, "", msg_heads[0]
            )

        nav_links = re.findall(self._NAV_LINK_RE, markup)
        if not nav_links:
            # No pagination: this page is the only one, so its messages are
            # the whole history (avoids a negative count from the formula
            # used for paginated histories).
            result["times_used"] = len(msg_heads)
            return result

        pages_amount = len(nav_links)
        if pages_amount > 1:
            # NOTE(review): presumably discounts the "previous"/"next"
            # entries of the pager - confirm against the site's markup.
            pages_amount -= 2

        # The last pagination entry points at the last (oldest) page.
        # Normalized the same way Parse normalizes URIs before handing them
        # to the base-class helpers.
        last_page_uri = self._strip_leading_slash(
            re.sub(self._NAV_LINK_STRIP_RE, "", nav_links[-1])
        )
        msg_heads_last = self.ParseAllFromPage(
            self._MSG_HEAD_RE,
            self._MSG_HEAD_STRIP_RE,
            last_page_uri,
        )

        msgs_at_last_page = 0
        if msg_heads_last:
            # The final header on the final page is the oldest message.
            result["born_raw"] = msg_heads_last[-1]
            msgs_at_last_page = len(msg_heads_last)

        # Every page but the last is assumed to hold as many messages as the
        # first page; clamp so a short pager can never yield a negative total.
        full_pages = max(pages_amount - 1, 0)
        result["times_used"] = full_pages * len(msg_heads) + msgs_at_last_page

        return result

    def ProcessRawNumbers(self, raws: list) -> None:
        """Validate raw number records and sort them into ``self.Output``.

        Each record is modified in place: ``last_use``, ``born`` and ``url``
        are added, and malformed ``last_use_raw`` / ``born_raw`` values are
        reset to ``self.EMPTY`` after a warning. Records whose ``number`` does
        not contain "+<digits>" are logged and skipped. The remaining records
        are appended to the ``not_used``, ``almost_not_used`` or
        ``heavily_used`` list of ``self.Output`` according to ``times_used``.

        Args:
            raws: List of dicts as produced by ``Parse`` / ``ParseNumberPage``.
        """
        context = f"ProcessRawNumbers(self at {id(self)}, raws at {id(raws)})"

        for raw in raws:
            if not re.search(self._PHONE_RE, raw["number"]):
                self.Log(f"error: incorrect number: \"{raw['number']}\"", context)
                continue

            if not re.search(self._ALNUM_RE, raw["last_use_raw"]):
                self.Log(
                    f"warning: malformed last_use_raw field: \"{raw['last_use_raw']}\"",
                    context,
                )
                raw["last_use_raw"] = self.EMPTY
            raw["last_use"] = raw["last_use_raw"]

            if not re.search(self._ALNUM_RE, raw["born_raw"]):
                self.Log(
                    f"warning: malformed born_raw field: \"{raw['born_raw']}\"",
                    context,
                )
                raw["born_raw"] = self.EMPTY
            raw["born"] = raw["born_raw"]

            raw["url"] = self.WebSiteFullURL + raw["uri_raw"]

            # Bucket by how often the number has been used.
            if raw["times_used"] == 0:
                bucket = "not_used"
            elif raw["times_used"] < self.AlmostNotUsedThreshold:
                bucket = "almost_not_used"
            else:
                bucket = "heavily_used"
            self.Output[bucket].append(raw)