vnp/sites/FreesmscenterCom.py

"""Module for parsing freesmscenter.com
NOTICE: this can be rewritten in a cleaner manner in the future.
"""

import re
import WebSiteParser


class FreesmscenterCom(WebSiteParser.WebSiteParser):
	def __init__(self):
		"""Set up the parser by passing the site name to the base class."""
		site_name = "freesmscenter.com"
		super().__init__(site_name)


	def Parse(self):
		"""Perform parsing of entire web site.
		"""

		# Parsing main page with list of countries
		country_uris = self.ParseURIsFromPage(
			"<a href=\"/FreeSms/.+-Phone-Number\" class=\"btn btn-primary\">Show Phone Numbers</a>",
			"<a href=\"/",
			"\" class=\"btn btn-primary\">Show Phone Numbers</a>"
		)

		# Parsing all single number from page with country's number list
		numbers_raw = []
		for country_uri in country_uris:
			tmp = country_uri.replace("FreeSms", "SmsReceive")
			number_uris = self.ParseURIsFromPage(
				f"<a href=\"/SmsReceive/.+/[0-9]+\" class=\"btn btn-primary\">Receive SMS</a>",
				"<a href=\"/",
				"\" class=\"btn btn-primary\">Receive SMS</a>",
				country_uri
			)
			if not number_uris:
				number_uris  = self.ParseURIsFromPage(
					f"<a href=\"/SmsReceive/.+/[0-9]+\" class=\"btn btn-primary\">Receive SMS</a>",
					"<a href=\"/",
					"\" class=\"btn btn-primary\">Receive SMS</a>",
					country_uri
				)
			for number_uri in number_uris:
				numbers_raw.append({
					"uri_raw": number_uri,
					"country": country_uri.replace("FreeSms/", "").replace("-Phone-Number", "").replace("%20", " ")
				})

		# Parse number's page
		for i in range(len(numbers_raw)):
			numbers_raw[i].update(
				self.ParseNumberPage(numbers_raw[i]["uri_raw"])
			)

		self.ProcessRawNumbers(numbers_raw)
		self.ParseDone = True
		self.Log("parsing done")


	def ParseURIsFromPage(self, exp, r1, r2, uri=""):
		"""Parse all URIs from selected page.
		"""

		markup = self.RequestPage(uri).text
		peaces_of_markup = re.findall(exp, markup)
		result_uris = []
		for peace in peaces_of_markup:
			result_uris.append(
				peace.replace(r1, "").replace(r2, "")
			)
		return result_uris


	def ParseNumberPage(self, uri):
		"""Parse page with history of messages, related to single number.

		NOTICE: this can be better,
		but a lot of hardcoded sheat is required then,
		just because original site has a lot of bugs.
		However, i fixed most of major things.
		"""

		def die(text):
			self.Log(text, f"ParseNumberPage(self at {id(self)}, \"{uri}\")")

		def p(l):
			for i in range(len(l)):
				if not l[i]:
					l[i] = self.EMPTY
			return l

		result = {
			"number": self.EMPTY,
			"last_use_raw": self.EMPTY,
			"born_raw": self.EMPTY,
			"times_used": self.EMPTY
		}
		numExp = "<h1 id=\"numberget\" class=\"cw\">\+[0-9]+</h1>"
		tableCellExp = "<td.*\">.*</td>"
		cellFromExp = "<td.*datesize3.*>*</td>"
		cellContExp = "<td.*datesize5.*>.*</td>"
		cellTimeExp = "<td.*datesize4.*>.*</td>"

		markup = self.RequestPage(uri).text
		number = re.findall(numExp, markup)
		if not number:
			die("error: can't parse page, number is empty")
			self.ErrorsWhileParsing += 1
			return result
		result["number"] = re.findall("\+[0-9]+", number[0])[0]
		tableCellsRaw = re.findall(tableCellExp, markup)
		if not len(tableCellsRaw):
			die("error: can't parse page, tableCellsRaw is empty")
			self.ErrorsWhileParsing += 1
			return result
		elif len(tableCellsRaw) % 3:
			die("warning: length tableCellsRaw should be divisible by 3, incorrect results are expected")

		tableLines = []
		tmpLine = [None, None, None]

		for i in range(0, len(tableCellsRaw)):
			if re.search(cellFromExp, tableCellsRaw[i]) and (i < len(tableCellsRaw) - 3):
				if re.search(cellContExp, tableCellsRaw[i+1]):
					if re.search(cellTimeExp, tableCellsRaw[i+2]):
						tableLines.append([
							tableCellsRaw[i],
							tableCellsRaw[i+1],
							tableCellsRaw[i+2]
						])
						tmpLine = [None, None, None]
						i += 2
						continue

			if re.search(cellFromExp, tableCellsRaw[i]) and not tmpLine[0]:
				if not re.search("[a-zA-Z0-9]", tableCellsRaw[i]):
					tmpLine[0] = self.EMPTY
				else:
					tmpLine[0] = tableCellsRaw[i]
			elif tmpLine[0]:
				tmpLine = p(tmpLine)
			if re.search(cellContExp, tableCellsRaw[i]) and not tmpLine[1]:
				# if not re.search("[a-zA-Z0-9]", tableCellsRaw[i]):
				# 	tmpLine[1] = self.EMPTY
				# else:
				tmpLine[1] = tableCellsRaw[i]
			elif tmpLine[1]:
				tmpLine = p(tmpLine)
			if re.search(cellTimeExp, tableCellsRaw[i]) and not tmpLine[2]:
				if not re.search("[a-zA-Z0-9]", tableCellsRaw[i]):
					tmpLine[2] = self.EMPTY
				else:
					tmpLine[2] = tableCellsRaw[i]
			elif tmpLine[2]:
				tmpLine = p(tmpLine)

			if tmpLine[0] and tmpLine[1] and tmpLine[2]:
				tableLines.append(tmpLine)
				tmpLine = [None, None, None]

		if not tableLines:
			die("error: can't parse page, tableLines is empty")
			self.ErrorsWhileParsing += 1
			return result

		for i in range(len(tableLines)):
			if tableLines[i][2] != self.EMPTY:
				result["last_use_raw"] = re.sub(
					"(<td.*\">)|(</td>)",
					"",
					tableLines[i][2]
				)
				break

		for i in range(len(tableLines)-1, -1, -1):
			if tableLines[i][2] != self.EMPTY:
				result["born_raw"] = re.sub(
					"(<td.*\">)|(</td>)",
					"",
					tableLines[i][2]
				)
				break
		result["times_used"] = len(tableLines)

		return result


	def ProcessRawNumbers(self, raws):
		"""Process list of raw parsed numbers and make them look cool ayo.
		"""

		for i in range(0, len(raws)):
			if not re.search("\+[0-9]+", raws[i]["number"]):
				self.Log(f"error: incorrect number: \"{raws[i]['number']}\"", f"ProcessRawNumbers(self at {id(self)}, raws at {id(raws)})")
				continue

			if not re.search("[a-zA-Z0-9]+", raws[i]["last_use_raw"]):
				self.Log(f"warning: malformed last_use_raw field: \"{raws[i]['last_use_raw']}\"", f"ProcessRawNumbers(self at {id(self)}, raws at {id(raws)})")
				raws[i]["last_use_raw"] = self.EMPTY

			raws[i]["last_use"] = raws[i]["last_use_raw"]

			if not re.search("[a-zA-Z0-9]+", raws[i]["born_raw"]):
				self.Log(f"warning: malformed born_raw field: \"{raws[i]['born_raw']}\"", f"ProcessRawNumbers(self at {id(self)}, raws at {id(raws)})")
				raws[i]["born_raw"] = self.EMPTY

			raws[i]["born"] = raws[i]["born_raw"]

			raws[i]["url"] = self.WebSiteFullURL + raws[i]["uri_raw"]

			if raws[i]["times_used"] == 0:
				self.Output["not_used"].append(raws[i])
			elif raws[i]["times_used"] < self.AlmostNotUsedThreshold:
				self.Output["almost_not_used"].append(raws[i])
			else:
				self.Output["heavily_used"].append(raws[i])
Added some peaces 2023-06-12 05:13:51 +05:30			`"""Module for parsing freesmscenter.com`
			`NOTICE: this can be rewritten in more clean manner in future`
			`"""`

			`import re`
			`import WebSiteParser`



			`class FreesmscenterCom(WebSiteParser.WebSiteParser):`
			`def __init__(self):`
			`super().__init__("freesmscenter.com")`


			`def Parse(self):`
			`"""Perform parsing of entire web site.`
			`"""`

			`# Parsing main page with list of countries`
			`country_uris = self.ParseURIsFromPage(`
			`"<a href=\"/FreeSms/.+-Phone-Number\" class=\"btn btn-primary\">Show Phone Numbers</a>",`
			`"<a href=\"/",`
			`"\" class=\"btn btn-primary\">Show Phone Numbers</a>"`
			`)`

			`# Parsing all single number from page with country's number list`
			`numbers_raw = []`
			`for country_uri in country_uris:`
			`tmp = country_uri.replace("FreeSms", "SmsReceive")`
			`number_uris = self.ParseURIsFromPage(`
			`f"<a href=\"/SmsReceive/.+/[0-9]+\" class=\"btn btn-primary\">Receive SMS</a>",`
			`"<a href=\"/",`
			`"\" class=\"btn btn-primary\">Receive SMS</a>",`
			`country_uri`
			`)`
			`if not number_uris:`
			`number_uris = self.ParseURIsFromPage(`
			`f"<a href=\"/SmsReceive/.+/[0-9]+\" class=\"btn btn-primary\">Receive SMS</a>",`
			`"<a href=\"/",`
			`"\" class=\"btn btn-primary\">Receive SMS</a>",`
			`country_uri`
			`)`
			`for number_uri in number_uris:`
			`numbers_raw.append({`
			`"uri_raw": number_uri,`
			`"country": country_uri.replace("FreeSms/", "").replace("-Phone-Number", "").replace("%20", " ")`
			`})`

			`# Parse number's page`
			`for i in range(len(numbers_raw)):`
			`numbers_raw[i].update(`
			`self.ParseNumberPage(numbers_raw[i]["uri_raw"])`
			`)`

			`self.ProcessRawNumbers(numbers_raw)`
			`self.ParseDone = True`
			`self.Log("parsing done")`


			`def ParseURIsFromPage(self, exp, r1, r2, uri=""):`
			`"""Parse all URIs from selected page.`
			`"""`

			`markup = self.RequestPage(uri).text`
			`peaces_of_markup = re.findall(exp, markup)`
			`result_uris = []`
			`for peace in peaces_of_markup:`
			`result_uris.append(`
			`peace.replace(r1, "").replace(r2, "")`
			`)`
			`return result_uris`


			`def ParseNumberPage(self, uri):`
			`"""Parse page with history of messages, related to single number.`

			`NOTICE: this can be better,`
			`but a lot of hardcoded sheat is required then,`
			`just because original site has a lot of bugs.`
			`However, i fixed most of major things.`
			`"""`

			`def die(text):`
			`self.Log(text, f"ParseNumberPage(self at {id(self)}, \"{uri}\")")`

			`def p(l):`
			`for i in range(len(l)):`
			`if not l[i]:`
			`l[i] = self.EMPTY`
			`return l`

			`result = {`
			`"number": self.EMPTY,`
			`"last_use_raw": self.EMPTY,`
			`"born_raw": self.EMPTY,`
			`"times_used": self.EMPTY`
			`}`
			`numExp = "<h1 id=\"numberget\" class=\"cw\">\+[0-9]+</h1>"`
			`tableCellExp = "<td.\">.</td>"`
			`cellFromExp = "<td.datesize3.>*</td>"`
			`cellContExp = "<td.datesize5.>.*</td>"`
			`cellTimeExp = "<td.datesize4.>.*</td>"`

			`markup = self.RequestPage(uri).text`
			`number = re.findall(numExp, markup)`
			`if not number:`
			`die("error: can't parse page, number is empty")`
			`self.ErrorsWhileParsing += 1`
			`return result`
			`result["number"] = re.findall("\+[0-9]+", number[0])[0]`
			`tableCellsRaw = re.findall(tableCellExp, markup)`
			`if not len(tableCellsRaw):`
			`die("error: can't parse page, tableCellsRaw is empty")`
			`self.ErrorsWhileParsing += 1`
			`return result`
			`elif len(tableCellsRaw) % 3:`
			`die("warning: length tableCellsRaw should be divisible by 3, incorrect results are expected")`

			`tableLines = []`
			`tmpLine = [None, None, None]`

			`for i in range(0, len(tableCellsRaw)):`
			`if re.search(cellFromExp, tableCellsRaw[i]) and (i < len(tableCellsRaw) - 3):`
			`if re.search(cellContExp, tableCellsRaw[i+1]):`
			`if re.search(cellTimeExp, tableCellsRaw[i+2]):`
			`tableLines.append([`
			`tableCellsRaw[i],`
			`tableCellsRaw[i+1],`
			`tableCellsRaw[i+2]`
			`])`
			`tmpLine = [None, None, None]`
			`i += 2`
			`continue`

			`if re.search(cellFromExp, tableCellsRaw[i]) and not tmpLine[0]:`
			`if not re.search("[a-zA-Z0-9]", tableCellsRaw[i]):`
			`tmpLine[0] = self.EMPTY`
			`else:`
			`tmpLine[0] = tableCellsRaw[i]`
			`elif tmpLine[0]:`
			`tmpLine = p(tmpLine)`
			`if re.search(cellContExp, tableCellsRaw[i]) and not tmpLine[1]:`
			`# if not re.search("[a-zA-Z0-9]", tableCellsRaw[i]):`
			`# tmpLine[1] = self.EMPTY`
			`# else:`
			`tmpLine[1] = tableCellsRaw[i]`
			`elif tmpLine[1]:`
			`tmpLine = p(tmpLine)`
			`if re.search(cellTimeExp, tableCellsRaw[i]) and not tmpLine[2]:`
			`if not re.search("[a-zA-Z0-9]", tableCellsRaw[i]):`
			`tmpLine[2] = self.EMPTY`
			`else:`
			`tmpLine[2] = tableCellsRaw[i]`
			`elif tmpLine[2]:`
			`tmpLine = p(tmpLine)`

			`if tmpLine[0] and tmpLine[1] and tmpLine[2]:`
			`tableLines.append(tmpLine)`
			`tmpLine = [None, None, None]`

			`if not tableLines:`
			`die("error: can't parse page, tableLines is empty")`
			`self.ErrorsWhileParsing += 1`
			`return result`

			`for i in range(len(tableLines)):`
			`if tableLines[i][2] != self.EMPTY:`
			`result["last_use_raw"] = re.sub(`
			`"(<td.*\">)\|(</td>)",`
			`"",`
			`tableLines[i][2]`
			`)`
			`break`

			`for i in range(len(tableLines)-1, -1, -1):`
			`if tableLines[i][2] != self.EMPTY:`
			`result["born_raw"] = re.sub(`
			`"(<td.*\">)\|(</td>)",`
			`"",`
			`tableLines[i][2]`
			`)`
			`break`
			`result["times_used"] = len(tableLines)`

			`return result`


			`def ProcessRawNumbers(self, raws):`
			`"""Process list of raw parsed numbers and make them look cool ayo.`
			`"""`

			`for i in range(0, len(raws)):`
			`if not re.search("\+[0-9]+", raws[i]["number"]):`
			`self.Log(f"error: incorrect number: \"{raws[i]['number']}\"", f"ProcessRawNumbers(self at {id(self)}, raws at {id(raws)})")`
			`continue`

			`if not re.search("[a-zA-Z0-9]+", raws[i]["last_use_raw"]):`
			`self.Log(f"warning: malformed last_use_raw field: \"{raws[i]['last_use_raw']}\"", f"ProcessRawNumbers(self at {id(self)}, raws at {id(raws)})")`
			`raws[i]["last_use_raw"] = self.EMPTY`

			`raws[i]["last_use"] = raws[i]["last_use_raw"]`

			`if not re.search("[a-zA-Z0-9]+", raws[i]["born_raw"]):`
			`self.Log(f"warning: malformed born_raw field: \"{raws[i]['born_raw']}\"", f"ProcessRawNumbers(self at {id(self)}, raws at {id(raws)})")`
			`raws[i]["born_raw"] = self.EMPTY`

			`raws[i]["born"] = raws[i]["born_raw"]`

			`raws[i]["url"] = self.WebSiteFullURL + raws[i]["uri_raw"]`

			`if raws[i]["times_used"] == 0:`
			`self.Output["not_used"].append(raws[i])`
			`elif raws[i]["times_used"] < self.AlmostNotUsedThreshold:`
			`self.Output["almost_not_used"].append(raws[i])`
			`else:`
			`self.Output["heavily_used"].append(raws[i])`