From be6715cd99bc4483ff7c63b1f78a456af5bdd52c Mon Sep 17 00:00:00 2001 From: prculley Date: Fri, 3 Jun 2016 10:57:29 -0500 Subject: [PATCH] 9124: GEDCOM doesn't accept CR as a line terminator Original code used readline() when file was in binary mode; this works only if file contains '\n', true only for CRLF and LF line endings. Switched to file text mode with correct encoding and universal newline support. --- gramps/plugins/importer/importgedcom.py | 54 +++++++++-------- gramps/plugins/lib/libgedcom.py | 77 ++++++++++++++----------- 2 files changed, 73 insertions(+), 58 deletions(-) diff --git a/gramps/plugins/importer/importgedcom.py b/gramps/plugins/importer/importgedcom.py index 659a58ea5..e0020186a 100644 --- a/gramps/plugins/importer/importgedcom.py +++ b/gramps/plugins/importer/importgedcom.py @@ -31,7 +31,7 @@ LOG = logging.getLogger(".GedcomImport") #------------------------------------------------------------------------ # -# GRAMPS modules +# Gramps modules # #------------------------------------------------------------------------ from gramps.gen.const import GRAMPS_LOCALE as glocale @@ -51,7 +51,7 @@ import imp imp.reload(module) from gramps.gen.config import config - + #------------------------------------------------------------------------- # # importData @@ -65,30 +65,34 @@ def importData(database, filename, user): if DbMixin not in database.__class__.__bases__: database.__class__.__bases__ = (DbMixin,) + \ database.__class__.__bases__ - try: - ifile = open(filename, "rb") + # Opening in utf-8 with universal newline to allow cr, lf, and crlf + # If the file is really UTF16 or a variant, the next block of code will + # not find anything even if it is there, but this is ok since it + # won't be ANSEL, or is inconsistent... 
+ with open(filename, "r", encoding='utf-8', errors='replace', + newline=None) as ifile: + ansel = False + gramps = False + for index in range(50): + # Treat the file as though it is UTF-8 since this is the more + # modern option; and anyway it doesn't really matter as we are + # only trying to detect a CHAR or SOUR line which is only + # 7-bit ASCII anyway, and we ignore anything that can't be + # translated. + line = ifile.readline() + line = line.split() + if len(line) == 0: + break + if len(line) > 2 and line[1][0:4] == 'CHAR' \ + and line[2] == "ANSEL": + ansel = True + if len(line) > 2 and line[1][0:4] == 'SOUR' \ + and line[2] == "GRAMPS": + gramps = True except IOError: return - ansel = False - gramps = False - for index in range(50): - # Treat the file as though it is UTF-8 since this is the more modern - # option; and anyway it doesn't really matter as we are only trying to - # detect a CHAR or SOUR line which is only 7-bit ASCII anyway, and we - # ignore anything that can't be translated. 
- line = ifile.readline() - line = line.decode(encoding='utf-8', errors='replace') - line = line.split() - if len(line) == 0: - break - if len(line) > 2 and line[1][0:4] == 'CHAR' and line[2] == "ANSEL": - ansel = True - if len(line) > 2 and line[1][0:4] == 'SOUR' and line[2] == "GRAMPS": - gramps = True - ifile.close() - if not gramps and ansel and user.uistate: top = Glade() code = top.get_object('codeset') @@ -116,15 +120,15 @@ def importData(database, filename, user): database, ifile, filename, user, stage_one, None, None) else: gedparse = libgedcom.GedcomParser( - database, ifile, filename, user, stage_one, + database, ifile, filename, user, stage_one, config.get('preferences.default-source'), - (config.get('preferences.tag-on-import-format') if + (config.get('preferences.tag-on-import-format') if config.get('preferences.tag-on-import') else None)) except IOError as msg: user.notify_error(_("%s could not be opened\n") % filename, str(msg)) return except GedcomError as msg: - user.notify_error(_("Invalid GEDCOM file"), + user.notify_error(_("Invalid GEDCOM file"), _("%s could not be imported") % filename + "\n" + str(msg)) return diff --git a/gramps/plugins/lib/libgedcom.py b/gramps/plugins/lib/libgedcom.py index 314b39e06..6a9679686 100755 --- a/gramps/plugins/lib/libgedcom.py +++ b/gramps/plugins/lib/libgedcom.py @@ -94,7 +94,7 @@ import codecs from xml.parsers.expat import ParserCreate from collections import defaultdict import string -from io import StringIO +from io import StringIO, TextIOWrapper from urllib.parse import urlparse #------------------------------------------------------------------------ @@ -1251,41 +1251,41 @@ class BaseReader(object): class UTF8Reader(BaseReader): - def __init__(self, ifile, __add_msg): - BaseReader.__init__(self, ifile, 'utf8', __add_msg) + def __init__(self, ifile, __add_msg, enc): + BaseReader.__init__(self, ifile, enc, __add_msg) self.reset() - - def reset(self): - self.ifile.seek(0) - data = self.ifile.read(3) - if 
data != b"\xef\xbb\xbf": - self.ifile.seek(0) + if enc == 'UTF_8_SIG': + self.ifile = TextIOWrapper(ifile, encoding='utf_8_sig', + errors='replace', newline=None) + else: + self.ifile = TextIOWrapper(ifile, encoding='utf_8', + errors='replace', newline=None) def readline(self): line = self.ifile.readline() - line = line.decode(self.enc, errors='replace') return line.translate(STRIP_DICT) class UTF16Reader(BaseReader): def __init__(self, ifile, __add_msg): - new_file = codecs.EncodedFile(ifile, 'utf8', 'utf16') - BaseReader.__init__(self, new_file, '', __add_msg) + BaseReader.__init__(self, ifile, 'UTF16', __add_msg) + self.ifile = TextIOWrapper(ifile, encoding='utf_16', + errors='replace', newline=None) self.reset() def readline(self): line = self.ifile.readline() - line = line.decode('utf8', errors='replace') return line.translate(STRIP_DICT) class AnsiReader(BaseReader): def __init__(self, ifile, __add_msg): BaseReader.__init__(self, ifile, 'latin1', __add_msg) + self.ifile = TextIOWrapper(ifile, encoding='latin1', + errors='replace', newline=None) def readline(self): line = self.ifile.readline() - line = line.decode(self.enc, errors='replace') if line.translate(DEL_AND_C1) != line: self.report_error("DEL or C1 control chars in line did you mean CHAR cp1252??", line) return line.translate(STRIP_DICT) @@ -1294,10 +1294,11 @@ class CP1252Reader(BaseReader): def __init__(self, ifile, __add_msg): BaseReader.__init__(self, ifile, 'cp1252', __add_msg) + self.ifile = TextIOWrapper(ifile, encoding='cp1252', + errors='replace', newline=None) def readline(self): line = self.ifile.readline() - line = line.decode(self.enc, errors='replace') return line.translate(STRIP_DICT) class AnselReader(BaseReader): @@ -1565,10 +1566,16 @@ class AnselReader(BaseReader): return ans def __init__(self, ifile, __add_msg): - BaseReader.__init__(self, ifile, "", __add_msg) + BaseReader.__init__(self, ifile, "ANSEL", __add_msg) + # In theory, we should have been able to skip the encode/decode 
from + # ascii. But this way allows us to use Python's universal newline + self.ifile = TextIOWrapper(ifile, encoding='ascii', + errors='surrogateescape', newline=None) def readline(self): - return self.__ansel_to_unicode(self.ifile.readline()) + line = self.ifile.readline() + linebytes = line.encode(encoding='ascii', errors='surrogateescape') + return self.__ansel_to_unicode(linebytes) #------------------------------------------------------------------------- # @@ -2676,8 +2683,8 @@ class GedcomParser(UpdateCallback): if enc == "ANSEL": rdr = AnselReader(ifile, self.__add_msg) - elif enc in ("UTF-8", "UTF8"): - rdr = UTF8Reader(ifile, self.__add_msg) + elif enc in ("UTF-8", "UTF8", "UTF_8_SIG"): + rdr = UTF8Reader(ifile, self.__add_msg, enc) elif enc in ("UTF-16LE", "UTF-16BE", "UTF16", "UNICODE"): rdr = UTF16Reader(ifile, self.__add_msg) elif enc in ("CP1252", "WINDOWS-1252"): @@ -7774,26 +7781,33 @@ class GedcomStageOne(object): def __detect_file_decoder(self, input_file): """ Detects the file encoding of the file by looking for a BOM - (byte order marker) in the GEDCOM file. If we detect a UTF-16 - encoded file, we must connect to a wrapper using the codecs - package. + (byte order marker) in the GEDCOM file. If we detect a UTF-16 or + UTF-8-BOM encoded file, we choose appropriate decoders. If no BOM + is detected, we return in UTF-8 mode since it is the more modern option; + and anyway it doesn't really matter as we are only looking for GEDCOM + keywords which are only 7-bit ASCII anyway. + In any case, we always return the file in text mode with transparent + newline (CR, LF, or CRLF). 
""" line = input_file.read(2) if line == b"\xef\xbb": input_file.read(1) - self.enc = "UTF8" - return input_file + self.enc = "utf_8_sig" + return TextIOWrapper(input_file, encoding='utf_8_sig', + errors='replace', newline=None) elif line == b"\xff\xfe" or line == b"\xfe\xff": self.enc = "UTF16" input_file.seek(0) - return codecs.EncodedFile(input_file, 'utf8', 'utf16') + return TextIOWrapper(input_file, encoding='utf_16', + errors='replace', newline=None) elif not line : raise GedcomError(self.__EMPTY_GED) - elif line[0] == b"\x00" or line[1] == b"\x00": + elif line == b"\x30\x00" or line == b"\x00\x30": raise GedcomError(self.__BAD_UTF16) else: input_file.seek(0) - return input_file + return TextIOWrapper(input_file, encoding='utf-8', + errors='replace', newline=None) def parse(self): """ @@ -7804,12 +7818,8 @@ class GedcomStageOne(object): reader = self.__detect_file_decoder(self.ifile) for line in reader: - # Treat the file as though it is UTF-8 since this will be right if a - # BOM was detected; it is the more modern option; and anyway it - # doesn't really matter as we are only trying to detect a CHAR line - # which is only 7-bit ASCII anyway, and we ignore anything that - # can't be translated. - line = line.decode(encoding='utf-8', errors='replace') + # Scan for a few items, keep counts. Also look for actual CHAR + # keyword to figure out actual encoding for non-unicode file types line = line.strip() if not line: continue @@ -7840,6 +7850,7 @@ class GedcomStageOne(object): LOG.debug("parse pcnt %d" % self.pcnt) LOG.debug("parse famc %s" % dict(self.famc)) LOG.debug("parse fams %s" % dict(self.fams)) + self.ifile = reader # need this to keep Python from autoclosing file def get_famc_map(self): """