vnp/sites/FreesmscenterCom.py

"""Module for parsing freesmscenter.com
NOTE: this could be rewritten in a cleaner manner in the future.
"""

import re
import WebSiteParser


class FreesmscenterCom(WebSiteParser.WebSiteParser):
	def __init__(self):
		"""Initialise the parser, passing the site's host name to the base class."""
		site_host = "freesmscenter.com"
		super().__init__(site_host)


	def Parse(self):
		"""Perform parsing of entire web site.
		"""

		# Parsing main page with list of countries
		country_uris = self.ParseURIsFromPage(
			"<a href=\"/FreeSms/.+-Phone-Number\" class=\"btn btn-primary\">Show Phone Numbers</a>",
			"<a href=\"/",
			"\" class=\"btn btn-primary\">Show Phone Numbers</a>"
		)

		# Parsing all single number from page with country's number list
		numbers_raw = []
		for country_uri in country_uris:
			tmp = country_uri.replace("FreeSms", "SmsReceive")
			number_uris = self.ParseURIsFromPage(
				f"<a href=\"/SmsReceive/.+/[0-9]+\" class=\"btn btn-primary\">Receive SMS</a>",
				"<a href=\"/",
				"\" class=\"btn btn-primary\">Receive SMS</a>",
				country_uri
			)
			if not number_uris:
				number_uris  = self.ParseURIsFromPage(
					f"<a href=\"/SmsReceive/.+/[0-9]+\" class=\"btn btn-primary\">Receive SMS</a>",
					"<a href=\"/",
					"\" class=\"btn btn-primary\">Receive SMS</a>",
					country_uri
				)
			for number_uri in number_uris:
				numbers_raw.append({
					"uri_raw": number_uri,
					"country": country_uri.replace("FreeSms/", "").replace("-Phone-Number", "").replace("%20", " ")
				})

		# Parse number's page
		for i in range(len(numbers_raw)):
			numbers_raw[i].update(
				self.ParseNumberPage(numbers_raw[i]["uri_raw"])
			)

		self.ProcessRawNumbers(numbers_raw)
		self.ParseDone = True
		self.Log("parsing done")


	def ParseURIsFromPage(self, exp, r1, r2, uri=""):
		"""Parse all URIs from selected page.
		"""

		markup = self.RequestPage(uri).text
		peaces_of_markup = re.findall(exp, markup)
		result_uris = []
		for peace in peaces_of_markup:
			result_uris.append(
				peace.replace(r1, "").replace(r2, "")
			)
		return result_uris


	def ParseNumberPage(self, uri):
		"""Parse page with history of messages, related to single number.

		NOTICE: this can be better,
		but a lot of hardcoded sheat is required then,
		just because original site has a lot of bugs.
		However, i fixed most of major things.
		"""

		def die(text):
			self.Log(text, f"ParseNumberPage(self at {id(self)}, \"{uri}\")")

		def p(l):
			for i in range(len(l)):
				if not l[i]:
					l[i] = self.EMPTY
			return l

		result = {
			"number": self.EMPTY,
			"last_use_raw": self.EMPTY,
			"born_raw": self.EMPTY,
			"times_used": self.EMPTY
		}
		numExp = "<h1 id=\"numberget\" class=\"cw\">\+[0-9]+</h1>"
		tableCellExp = "<td.*\">.*</td>"
		cellFromExp = "<td.*datesize3.*>*</td>"
		cellContExp = "<td.*datesize5.*>.*</td>"
		cellTimeExp = "<td.*datesize4.*>.*</td>"

		markup = self.RequestPage(uri).text
		number = re.findall(numExp, markup)
		if not number:
			die("error: can't parse page, number is empty")
			self.ErrorsWhileParsing += 1
			return result
		result["number"] = re.findall("\+[0-9]+", number[0])[0]
		tableCellsRaw = re.findall(tableCellExp, markup)
		if not len(tableCellsRaw):
			die("error: can't parse page, tableCellsRaw is empty")
			self.ErrorsWhileParsing += 1
			return result
		elif len(tableCellsRaw) % 3:
			die("warning: length tableCellsRaw should be divisible by 3, incorrect results are expected")

		tableLines = []
		tmpLine = [None, None, None]

		for i in range(0, len(tableCellsRaw)):
			if re.search(cellFromExp, tableCellsRaw[i]) and (i < len(tableCellsRaw) - 3):
				if re.search(cellContExp, tableCellsRaw[i+1]):
					if re.search(cellTimeExp, tableCellsRaw[i+2]):
						tableLines.append([
							tableCellsRaw[i],
							tableCellsRaw[i+1],
							tableCellsRaw[i+2]
						])
						tmpLine = [None, None, None]
						i += 2
						continue

			if re.search(cellFromExp, tableCellsRaw[i]) and not tmpLine[0]:
				if not re.search("[a-zA-Z0-9]", tableCellsRaw[i]):
					tmpLine[0] = self.EMPTY
				else:
					tmpLine[0] = tableCellsRaw[i]
			elif tmpLine[0]:
				tmpLine = p(tmpLine)
			if re.search(cellContExp, tableCellsRaw[i]) and not tmpLine[1]:
				# if not re.search("[a-zA-Z0-9]", tableCellsRaw[i]):
				# 	tmpLine[1] = self.EMPTY
				# else:
				tmpLine[1] = tableCellsRaw[i]
			elif tmpLine[1]:
				tmpLine = p(tmpLine)
			if re.search(cellTimeExp, tableCellsRaw[i]) and not tmpLine[2]:
				if not re.search("[a-zA-Z0-9]", tableCellsRaw[i]):
					tmpLine[2] = self.EMPTY
				else:
					tmpLine[2] = tableCellsRaw[i]
			elif tmpLine[2]:
				tmpLine = p(tmpLine)

			if tmpLine[0] and tmpLine[1] and tmpLine[2]:
				tableLines.append(tmpLine)
				tmpLine = [None, None, None]

		if not tableLines:
			die("error: can't parse page, tableLines is empty")
			self.ErrorsWhileParsing += 1
			return result

		for i in range(len(tableLines)):
			if tableLines[i][2] != self.EMPTY:
				result["last_use_raw"] = re.sub(
					"(<td.*\">)|(</td>)",
					"",
					tableLines[i][2]
				)
				break

		for i in range(len(tableLines)-1, -1, -1):
			if tableLines[i][2] != self.EMPTY:
				result["born_raw"] = re.sub(
					"(<td.*\">)|(</td>)",
					"",
					tableLines[i][2]
				)
				break
		result["times_used"] = len(tableLines)

		return result


	def ProcessRawNumbers(self, raws):
		"""Process list of raw parsed numbers and make them look cool ayo.
		"""

		for i in range(0, len(raws)):
			if not re.search("\+[0-9]+", raws[i]["number"]):
				self.Log(f"error: incorrect number: \"{raws[i]['number']}\"", f"ProcessRawNumbers(self at {id(self)}, raws at {id(raws)})")
				continue

			if not re.search("[a-zA-Z0-9]+", raws[i]["last_use_raw"]):
				self.Log(f"warning: malformed last_use_raw field: \"{raws[i]['last_use_raw']}\"", f"ProcessRawNumbers(self at {id(self)}, raws at {id(raws)})")
				raws[i]["last_use_raw"] = self.EMPTY

			raws[i]["last_use"] = raws[i]["last_use_raw"]

			if not re.search("[a-zA-Z0-9]+", raws[i]["born_raw"]):
				self.Log(f"warning: malformed born_raw field: \"{raws[i]['born_raw']}\"", f"ProcessRawNumbers(self at {id(self)}, raws at {id(raws)})")
				raws[i]["born_raw"] = self.EMPTY

			raws[i]["born"] = raws[i]["born_raw"]

			raws[i]["url"] = self.WebSiteFullURL + raws[i]["uri_raw"]

			if raws[i]["times_used"] == 0:
				self.Output["not_used"].append(raws[i])
			elif raws[i]["times_used"] < self.AlmostNotUsedThreshold:
				self.Output["almost_not_used"].append(raws[i])
			else:
				self.Output["heavily_used"].append(raws[i])