From 075f2ccb7be86c299e4baa4026ae5413bcd7d137 Mon Sep 17 00:00:00 2001
From: kulath
Date: Tue, 24 Mar 2015 15:24:28 +0000
Subject: [PATCH] 3082, 4439, 7134, 8279 Various fixes for GEDCOM import.

0003082: 1/4 and 1/2 ANSEL characters not supported on importing ANSEL GEDCOM
0004439: [Info]: characters ignored on a Gedcom encoded ANSI (cp1252 West Europe, USA)
0007134: Failure importing ANSEL encoded gedcom file.
0008279: GEDCOM import fails for ANSI file incorrectly opened with the utf8 locale

Fixed GEDCOM import not working properly under Python 3; other problems were
also corrected, including the inability to import Windows CP1252 encoded
files. Also a more consistent fix for 8014. Ensured that any error messages
are not lost. Fixed a few ANSEL characters that were not translated.
---
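Note (kept below the '---' so it stays out of the commit message): the new
AnsiReader.readline() warns when a line contains DEL or C1 bytes, since those
code points are unused in real Latin-1 text but common in cp1252. A minimal
standalone sketch of that heuristic follows; the helper name looks_like_cp1252
is illustrative only and not part of this patch, which merely reports a
warning and does not switch encodings:

    # Code points removed by str.translate() below: DEL (0x7F) and the
    # C1 control range (0x80-0x9F); every key maps to None.
    DEL_AND_C1 = dict.fromkeys(range(0x7F, 0xA0))

    def looks_like_cp1252(raw_line):
        text = raw_line.decode('latin-1', errors='replace')
        # If translate() dropped anything, the line held DEL/C1 bytes,
        # which in practice usually means the file is really cp1252.
        return text.translate(DEL_AND_C1) != text

    print(looks_like_cp1252(b'caf\x93'))   # True: 0x93 is a cp1252 curly quote
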
 gramps/plugins/importer/importgedcom.py |  10 +-
 gramps/plugins/lib/libgedcom.py         | 159 ++++++++++++++++--------
 2 files changed, 118 insertions(+), 51 deletions(-)

diff --git a/gramps/plugins/importer/importgedcom.py b/gramps/plugins/importer/importgedcom.py
index edaff50e1..da257a764 100644
--- a/gramps/plugins/importer/importgedcom.py
+++ b/gramps/plugins/importer/importgedcom.py
@@ -66,14 +66,20 @@ def importData(database, filename, user):
                                        database.__class__.__bases__
 
     try:
-        ifile = open(filename, "r")
+        ifile = open(filename, "rb")
     except IOError:
         return
 
     ansel = False
     gramps = False
     for index in range(50):
-        line = ifile.readline().split()
+        # Treat the file as though it is UTF-8 since this is the more modern
+        # option; and anyway it doesn't really matter as we are only trying to
+        # detect a CHAR or SOUR line which is only 7-bit ASCII anyway, and we
+        # ignore anything that can't be translated.
+        line = ifile.readline()
+        line = line.decode(encoding='utf-8', errors='replace')
+        line = line.split()
         if len(line) == 0:
             break
         if len(line) > 2 and line[1][0:4] == 'CHAR' and line[2] == "ANSEL":
diff --git a/gramps/plugins/lib/libgedcom.py b/gramps/plugins/lib/libgedcom.py
index e83926164..19a148f1e 100644
--- a/gramps/plugins/lib/libgedcom.py
+++ b/gramps/plugins/lib/libgedcom.py
@@ -564,6 +564,10 @@ LDS_STATUS = {
 # table for skipping illegal control chars in GEDCOM import
 # Only 09, 0A, 0D are allowed.
 STRIP_DICT = dict.fromkeys(list(range(9))+list(range(11, 13))+list(range(14, 32)))
+# The C1 Control characters are not treated in Latin-1 (ISO-8859-1) as
+# undefined, but if they have been used, the file is probably supposed to be
+# cp1252
+DEL_AND_C1 = dict.fromkeys(list(range(0x7F, 0xA0)))
 
 #-------------------------------------------------------------------------
 #
@@ -675,7 +679,7 @@ class GedcomDateParser(DateParser):
 #-------------------------------------------------------------------------
 class Lexer(object):
 
-    def __init__(self, ifile):
+    def __init__(self, ifile, __add_msg):
         self.ifile = ifile
         self.current_list = []
         self.eof = False
@@ -686,6 +690,7 @@ class Lexer(object):
             TOKEN_CONT : self.__fix_token_cont,
             TOKEN_CONC : self.__fix_token_conc,
             }
+        self.__add_msg = __add_msg
 
     def readline(self):
         if len(self.current_list) <= 1 and not self.eof:
@@ -724,6 +729,7 @@ class Lexer(object):
                 self.eof = True
                 return
 
+            original_line = line
             try:
                 # According to the GEDCOM 5.5 standard,
                 # Chapter 1 subsection Grammar
@@ -757,6 +763,13 @@ class Lexer(object):
                 tag = line[0]
                 line_value = line[2]
             except:
+                problem = _("Line ignored ")
+                text = original_line.rstrip('\n\r')
+                prob_width = 66
+                problem = problem.ljust(prob_width)[0:(prob_width-1)]
+                text = text.replace("\n", "\n".ljust(prob_width + 22))
+                message = "%s %s" % (problem, text)
+                self.__add_msg(message)
                 continue
 
             token = TOKENS.get(tag, TOKEN_UNKNOWN)
@@ -1213,22 +1226,29 @@ class GedInfoParser(object):
 #
 #-------------------------------------------------------------------------
 class BaseReader(object):
-    def __init__(self, ifile, encoding):
+    def __init__(self, ifile, encoding, __add_msg):
         self.ifile = ifile
         self.enc = encoding
+        self.__add_msg = __add_msg
 
     def reset(self):
         self.ifile.seek(0)
 
     def readline(self):
-        line = self.ifile.readline()
-        line = line.decode(self.enc, errors='replace')
-        return line.translate(STRIP_DICT)
+        raise NotImplementedError
+
+    def report_error(self, problem, line):
+        line = line.rstrip('\n\r')
+        prob_width = 66
+        problem = problem.ljust(prob_width)[0:(prob_width-1)]
+        text = line.replace("\n", "\n".ljust(prob_width + 22))
+        message = "%s %s" % (problem, text)
+        self.__add_msg(message)
 
 class UTF8Reader(BaseReader):
-    def __init__(self, ifile):
-        BaseReader.__init__(self, ifile, 'utf8')
+    def __init__(self, ifile, __add_msg):
+        BaseReader.__init__(self, ifile, 'utf8', __add_msg)
         self.reset()
 
     def reset(self):
@@ -1244,23 +1264,38 @@ class UTF16Reader(BaseReader):
-    def __init__(self, ifile):
+    def __init__(self, ifile, __add_msg):
         new_file = codecs.EncodedFile(ifile, 'utf8', 'utf16')
-        BaseReader.__init__(self, new_file, 'utf16')
+        BaseReader.__init__(self, new_file, '', __add_msg)
         self.reset()
 
     def readline(self):
-        l = self.ifile.readline()
-        if l.strip():
-            return l
-        else:
-            return self.ifile.readline()
+        line = self.ifile.readline()
+        line = line.decode('utf8', errors='replace')
+        return line.translate(STRIP_DICT)
 
 class AnsiReader(BaseReader):
-    def __init__(self, ifile):
-        BaseReader.__init__(self, ifile, 'latin1')
-
+    def __init__(self, ifile, __add_msg):
+        BaseReader.__init__(self, ifile, 'latin1', __add_msg)
+
+    def readline(self):
+        line = self.ifile.readline()
+        line = line.decode(self.enc, errors='replace')
+        if line.translate(DEL_AND_C1) != line:
+            self.report_error("DEL or C1 control chars in line did you mean CHAR cp1252??", line)
+        return line.translate(STRIP_DICT)
+
+class CP1252Reader(BaseReader):
+
+    def __init__(self, ifile, __add_msg):
+        BaseReader.__init__(self, ifile, 'cp1252', __add_msg)
+
+    def readline(self):
+        line = self.ifile.readline()
+        line = line.decode(self.enc, errors='replace')
+        return line.translate(STRIP_DICT)
+
 class AnselReader(BaseReader):
     """
     ANSEL to Unicode Conversion
@@ -1280,7 +1315,8 @@ class AnselReader(BaseReader):
     TODO: should we allow TAB, as a Gramps extension?
     """
     __printable_ascii = list(map(chr, list(range(32, 127)))) # note: up thru 126
-    __use_ASCII = list(map(chr, [10, 27, 29 , 30, 31])) + __printable_ascii
+    # LF CR Esc GS RS US
+    __use_ASCII = list(map(chr, [10, 13, 27, 29 , 30, 31])) + __printable_ascii
 
     # mappings of single byte ANSEL codes to unicode
     __onebyte = {
@@ -1293,9 +1329,11 @@ class AnselReader(BaseReader):
         b'\xB4' : '\u00fe', b'\xB5' : '\u00e6', b'\xB6' : '\u0153',
         b'\xB7' : '\u02ba', b'\xB8' : '\u0131', b'\xB9' : '\u00a3',
         b'\xBA' : '\u00f0', b'\xBC' : '\u01a1', b'\xBD' : '\u01b0',
+        b'\xBE' : '\u25a1', b'\xBF' : '\u25a0',
         b'\xC0' : '\u00b0', b'\xC1' : '\u2113', b'\xC2' : '\u2117',
         b'\xC3' : '\u00a9', b'\xC4' : '\u266f', b'\xC5' : '\u00bf',
         b'\xC6' : '\u00a1', b'\xC7' : '\u00df', b'\xC8' : '\u20ac',
+        b'\xCD' : '\u0065', b'\xCE' : '\u006f', b'\xCF' : '\u00df',
         }
 
     # combining forms (in ANSEL, they precede the modified ASCII character
@@ -1316,6 +1354,7 @@ class AnselReader(BaseReader):
         b'\xF3' : '\u0324', b'\xF4' : '\u0325', b'\xF5' : '\u0333',
         b'\xF6' : '\u0332', b'\xF7' : '\u0326', b'\xF8' : '\u031c',
         b'\xF9' : '\u032e', b'\xFA' : '\ufe22', b'\xFB' : '\ufe23',
+        b'\xFC' : '\u0338',
         b'\xFE' : '\u0313',
         }
 
@@ -1473,49 +1512,56 @@ class AnselReader(BaseReader):
         b'\xF9\x48' : '\u1e2a', b'\xF9\x68' : '\u1e2b',
         }
 
-    @staticmethod
-    def __ansel_to_unicode(s):
+    def __ansel_to_unicode(self, s):
         """ Convert an ANSEL encoded string to unicode """
 
         buff = StringIO()
+        error = ""
         while s:
-            if ord(s[0]) < 128:
-                if s[0] in AnselReader.__use_ASCII:
-                    head = s[0]
+            if s[0] < 128:
+                if chr(s[0]) in AnselReader.__use_ASCII:
+                    head = chr(s[0])
                 else:
                     # substitute space for disallowed (control) chars
+                    error += " (%#X)" % s[0]
                     head = ' '
                 s = s[1:]
            else:
                 if s[0:2] in AnselReader.__twobyte:
                     head = AnselReader.__twobyte[s[0:2]]
                     s = s[2:]
-                elif s[0] in AnselReader.__onebyte:
-                    head = AnselReader.__onebyte[s[0]]
+                elif bytes([s[0]]) in AnselReader.__onebyte:
+                    head = AnselReader.__onebyte[bytes([s[0]])]
                     s = s[1:]
-                elif s[0] in AnselReader.__acombiners:
-                    c = AnselReader.__acombiners[s[0]]
+                elif bytes([s[0]]) in AnselReader.__acombiners:
+                    c = AnselReader.__acombiners[bytes([s[0]])]
                     # always consume the combiner
                     s = s[1:]
-                    next = s[0]
-                    if next in AnselReader.__printable_ascii:
+                    next_byte = s[0]
+                    if next_byte < 128 and chr(next_byte) in AnselReader.__printable_ascii:
                         # consume next as well
                         s = s[1:]
                         # unicode: combiner follows base-char
-                        head = next + c
+                        head = chr(next_byte) + c
                     else:
                         # just drop the unexpected combiner
+                        error += " (%#X)" % s[0]
                         continue
                 else:
+                    error += " (%#X)" % s[0]
                     head = '\ufffd' # "Replacement Char"
                     s = s[1:]
-            buff.write(head.encode("utf-8"))
-        ans = buff.getvalue().decode("utf-8")
+            buff.write(head)
+        ans = buff.getvalue()
+
+        if error:
+            # e.g. Illegal character (0xAB) (0xCB)... 1 NOTE xyz?pqr?lmn
+            self.report_error(_("Illegal character%s") % error, ans)
         buff.close()
         return ans
 
-    def __init__(self, ifile):
-        BaseReader.__init__(self, ifile, "")
+    def __init__(self, ifile, __add_msg):
+        BaseReader.__init__(self, ifile, "", __add_msg)
 
     def readline(self):
         return self.__ansel_to_unicode(self.ifile.readline())
@@ -2617,15 +2663,17 @@ class GedcomParser(UpdateCallback):
         enc = stage_one.get_encoding()
 
         if enc == "ANSEL":
-            rdr = AnselReader(ifile)
+            rdr = AnselReader(ifile, self.__add_msg)
         elif enc in ("UTF-8", "UTF8"):
-            rdr = UTF8Reader(ifile)
-        elif enc in ("UTF-16", "UTF16", "UNICODE"):
-            rdr = UTF16Reader(ifile)
+            rdr = UTF8Reader(ifile, self.__add_msg)
+        elif enc in ("UTF-16LE", "UTF-16BE", "UTF16", "UNICODE"):
+            rdr = UTF16Reader(ifile, self.__add_msg)
+        elif enc in ("CP1252", "WINDOWS-1252"):
+            rdr = CP1252Reader(ifile, self.__add_msg)
         else:
-            rdr = AnsiReader(ifile)
+            rdr = AnsiReader(ifile, self.__add_msg)
 
-        self.lexer = Lexer(rdr)
+        self.lexer = Lexer(rdr, self.__add_msg)
         self.filename = filename
         self.backoff = False
@@ -7129,8 +7177,13 @@ class GedcomParser(UpdateCallback):
                 sattr.set_value(line.data)
                 self.def_src.add_attribute(sattr)
         elif line.token == TOKEN_FORM:
-            if line.data != "LINEAGE-LINKED":
-                self.__add_msg(_("GEDCOM form not supported"), line, state)
+            if line.data == "LINEAGE-LINKED":
+                pass
+            elif line.data.upper() == "LINEAGE-LINKED":
+                # Allow Lineage-Linked etc. though it should be in uppercase
+                self.__add_msg(_("GEDCOM FORM should be in uppercase"), line, state)
+            else:
+                self.__add_msg(_("GEDCOM FORM not supported"), line, state)
             if self.use_def_src:
                 sattr = SrcAttribute()
                 sattr.set_type(_('GEDCOM form'))
@@ -7675,7 +7728,7 @@ class GedcomStageOne(object):
             input_file.read(1)
            self.enc = "UTF8"
            return input_file
-        elif line == b"\xff\xfe":
+        elif line == b"\xff\xfe" or line == b"\xfe\xff":
            self.enc = "UTF16"
            input_file.seek(0)
            return codecs.EncodedFile(input_file, 'utf8', 'utf16')
@@ -7696,25 +7749,30 @@ class GedcomStageOne(object):
         reader = self.__detect_file_decoder(self.ifile)
 
         for line in reader:
+            # Treat the file as though it is UTF-8 since this will be right if a
+            # BOM was detected; it is the more modern option; and anyway it
+            # doesn't really matter as we are only trying to detect a CHAR line
+            # which is only 7-bit ASCII anyway, and we ignore anything that
+            # can't be translated.
+            line = line.decode(encoding='utf-8', errors='replace')
            line = line.strip()
             if not line:
                 continue
             self.lcnt += 1
 
-            data = line.split(None, 2) + ['']
             try:
+                data = line.split(None, 2) + ['']
                 (level, key, value) = data[:3]
                 level = int(level)
-                key = conv_to_unicode(key.strip())
-                value = conv_to_unicode(value.strip())
+                key = key.strip()
+                value = value.strip()
             except:
-                LOG.warn(_("Invalid line %d in GEDCOM file.") % self.lcnt)
                 continue
 
             if level == 0 and key[0] == '@':
-                if value == ("FAM", "FAMILY") :
+                if value in ("FAM", "FAMILY") :
                     current_family_id = key.strip()[1:-1]
-                elif value == ("INDI", "INDIVIDUAL"):
+                elif value in ("INDI", "INDIVIDUAL"):
                     self.pcnt += 1
             elif key in ("HUSB", "HUSBAND", "WIFE") and \
                  self.__is_xref_value(value):
@@ -7724,6 +7782,9 @@ class GedcomStageOne(object):
             elif key == 'CHAR' and not self.enc:
                 assert(isinstance(value, str))
                 self.enc = value
+        LOG.debug("parse pcnt %d" % self.pcnt)
+        LOG.debug("parse famc %s" % dict(self.famc))
+        LOG.debug("parse fams %s" % dict(self.fams))
 
     def get_famc_map(self):
         """
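
Note on the AnselReader change: ANSEL places a combining diacritic before the
base character, while Unicode places the combining mark after it, which is why
__ansel_to_unicode emits chr(next_byte) + c. A tiny illustration, assuming the
standard ANSEL mapping in which 0xE1 is the combining grave accent (U+0300);
this snippet is explanatory only and not part of the patch:

    import unicodedata

    ansel = b'\xE1a'                 # ANSEL: combining grave, then base 'a'
    out = chr(ansel[1]) + '\u0300'   # Unicode: base 'a', then U+0300
    print(unicodedata.normalize('NFC', out))   # prints: à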