"""Module for parsing freesmscenter.com.

NOTICE: this can be rewritten in a cleaner manner in the future.
"""

import re

import WebSiteParser


class FreesmscenterCom(WebSiteParser.WebSiteParser):
    """Parser that collects phone numbers listed on freesmscenter.com.

    NOTE(review): several string literals in this class (the link patterns
    passed to ParseURIsFromPage and the expressions in ParseNumberPage) look
    as if their HTML markup was stripped at some point. They are reproduced
    as found and must be restored from the site's real markup before this
    parser can work.
    """

    def __init__(self):
        super().__init__("freesmscenter.com")

    def Parse(self):
        """Perform parsing of the entire web site.

        Walks the main page for country pages, each country page for number
        pages, parses every number page, hands the collected records to
        ProcessRawNumbers, then sets ``self.ParseDone`` and logs completion.
        """
        # Main page: list of countries.
        # NOTE(review): ParseURIsFromPage requires (exp, r1, r2); this call
        # supplies only two positional arguments — confirm the intended
        # pattern and replacement strings.
        country_uris = self.ParseURIsFromPage(
            "Show Phone Numbers", "Show Phone Numbers"
        )

        # Country pages: collect the URI of every single number.
        numbers_raw = []
        for country_uri in country_uris:
            # NOTE(review): here country_uri lands in the ``r2`` slot and
            # ``uri`` keeps its default — presumably it was meant to be the
            # page URI; confirm.
            number_uris = self.ParseURIsFromPage(
                "Receive SMS", "Receive SMS", country_uri
            )
            if not number_uris:
                # Second attempt with the same arguments.
                # NOTE(review): an unused local,
                # country_uri.replace("FreeSms", "SmsReceive"), used to be
                # computed before these calls; presumably this fallback was
                # meant to use it — confirm.
                number_uris = self.ParseURIsFromPage(
                    "Receive SMS", "Receive SMS", country_uri
                )

            country = (
                country_uri.replace("FreeSms/", "")
                .replace("-Phone-Number", "")
                .replace("%20", " ")
            )
            numbers_raw.extend(
                {"uri_raw": number_uri, "country": country}
                for number_uri in number_uris
            )

        # Number pages: merge the parsed fields into each record.
        for entry in numbers_raw:
            entry.update(self.ParseNumberPage(entry["uri_raw"]))

        self.ProcessRawNumbers(numbers_raw)
        self.ParseDone = True
        self.Log("parsing done")

    def ParseURIsFromPage(self, exp, r1, r2, uri=""):
        """Parse all URIs from the selected page.

        Args:
            exp: regular expression; every match in the page text is a
                candidate URI.
            r1: substring removed from each match.
            r2: second substring removed from each match.
            uri: value passed to ``self.RequestPage``.

        Returns:
            List of matches with ``r1`` and ``r2`` stripped out.
        """
        markup = self.RequestPage(uri).text
        return [
            piece.replace(r1, "").replace(r2, "")
            for piece in re.findall(exp, markup)
        ]

    def ParseNumberPage(self, uri):
        """Parse the message-history page that belongs to a single number.

        NOTICE: this could be better, but that would require a lot of
        hardcoded special cases because the original site has many bugs.
        Most of the major problems are handled.

        Args:
            uri: value passed to ``self.RequestPage``.

        Returns:
            Dict with keys ``number``, ``last_use_raw``, ``born_raw`` and
            ``times_used``. Fields that could not be parsed keep
            ``self.EMPTY``; on a parse error ``self.ErrorsWhileParsing`` is
            incremented and the partially filled dict is returned.
        """
        def report(text):
            self.Log(text, f'ParseNumberPage(self at {id(self)}, "{uri}")')

        def fill_blanks(slots):
            # Replace every falsy slot with self.EMPTY, in place.
            for pos, value in enumerate(slots):
                if not value:
                    slots[pos] = self.EMPTY
            return slots

        result = {
            "number": self.EMPTY,
            "last_use_raw": self.EMPTY,
            "born_raw": self.EMPTY,
            "times_used": self.EMPTY,
        }

        # NOTE(review): these expressions appear to have lost their HTML
        # tags; the values are kept exactly as found. As written "*" is not
        # a valid regular expression and "()|()" matches only empty strings.
        numExp = "\n\n\\+[0-9]+\n\n"
        tableCellExp = ".*"
        cellFromExp = "*"
        cellContExp = ".*"
        cellTimeExp = ".*"
        timeStripExp = "()|()"
        alnumExp = "[a-zA-Z0-9]"

        markup = self.RequestPage(uri).text

        number = re.findall(numExp, markup)
        if not number:
            report("error: can't parse page, number is empty")
            self.ErrorsWhileParsing += 1
            return result
        result["number"] = re.findall(r"\+[0-9]+", number[0])[0]

        tableCellsRaw = re.findall(tableCellExp, markup)
        if not tableCellsRaw:
            report("error: can't parse page, tableCellsRaw is empty")
            self.ErrorsWhileParsing += 1
            return result
        if len(tableCellsRaw) % 3:
            report("warning: length tableCellsRaw should be divisible by 3, incorrect results are expected")

        tableLines = []
        pending = [None, None, None]
        total = len(tableCellsRaw)
        i = 0
        # An explicit index is used so that a complete row really skips its
        # three cells; reassigning the loop variable of a for/range loop has
        # no effect on the next iteration.
        while i < total:
            cell = tableCellsRaw[i]

            # Fast path: three consecutive cells form a well-formed row.
            # NOTE(review): the bound excludes the final complete triple
            # (i == total - 3); kept as found — confirm whether intended.
            if (
                re.search(cellFromExp, cell)
                and i < total - 3
                and re.search(cellContExp, tableCellsRaw[i + 1])
                and re.search(cellTimeExp, tableCellsRaw[i + 2])
            ):
                tableLines.append(
                    [cell, tableCellsRaw[i + 1], tableCellsRaw[i + 2]]
                )
                pending = [None, None, None]
                i += 3
                continue

            # Slow path: assemble a row cell by cell, padding the missing
            # slots with self.EMPTY as soon as a cell arrives out of order.
            if re.search(cellFromExp, cell) and not pending[0]:
                pending[0] = cell if re.search(alnumExp, cell) else self.EMPTY
            elif pending[0]:
                fill_blanks(pending)

            if re.search(cellContExp, cell) and not pending[1]:
                pending[1] = cell
            elif pending[1]:
                fill_blanks(pending)

            if re.search(cellTimeExp, cell) and not pending[2]:
                pending[2] = cell if re.search(alnumExp, cell) else self.EMPTY
            elif pending[2]:
                fill_blanks(pending)

            if pending[0] and pending[1] and pending[2]:
                tableLines.append(pending)
                pending = [None, None, None]

            i += 1

        if not tableLines:
            report("error: can't parse page, tableLines is empty")
            self.ErrorsWhileParsing += 1
            return result

        # First row with a usable time gives last_use_raw, last one gives
        # born_raw.
        times = [row[2] for row in tableLines if row[2] != self.EMPTY]
        if times:
            result["last_use_raw"] = re.sub(timeStripExp, "", times[0])
            result["born_raw"] = re.sub(timeStripExp, "", times[-1])

        result["times_used"] = len(tableLines)
        return result

    def ProcessRawNumbers(self, raws):
        """Validate raw parsed numbers and sort them into ``self.Output``.

        Records whose ``number`` does not contain ``+<digits>`` are logged
        and skipped. For the others, ``last_use`` / ``born`` are derived from
        the ``*_raw`` fields (reset to ``self.EMPTY`` when they contain no
        alphanumeric character), ``url`` is built from
        ``self.WebSiteFullURL``, and the record is appended to the
        ``not_used``, ``almost_not_used`` or ``heavily_used`` list depending
        on ``times_used`` and ``self.AlmostNotUsedThreshold``.

        Args:
            raws: list of dicts as assembled by Parse; modified in place.
        """
        context = f"ProcessRawNumbers(self at {id(self)}, raws at {id(raws)})"

        for raw in raws:
            if not re.search(r"\+[0-9]+", raw["number"]):
                self.Log(f'error: incorrect number: "{raw["number"]}"', context)
                continue

            for field in ("last_use", "born"):
                raw_key = f"{field}_raw"
                if not re.search("[a-zA-Z0-9]+", raw[raw_key]):
                    self.Log(
                        f'warning: malformed {raw_key} field: "{raw[raw_key]}"',
                        context,
                    )
                    raw[raw_key] = self.EMPTY
                raw[field] = raw[raw_key]

            raw["url"] = self.WebSiteFullURL + raw["uri_raw"]

            # NOTE(review): times_used may still be self.EMPTY when the
            # number page had no table rows; confirm that the comparison
            # below is valid for the type of self.EMPTY.
            times_used = raw["times_used"]
            if times_used == 0:
                bucket = "not_used"
            elif times_used < self.AlmostNotUsedThreshold:
                bucket = "almost_not_used"
            else:
                bucket = "heavily_used"
            self.Output[bucket].append(raw)