9124: GEDCOM doesn't accept CR as a line terminator

The original code called readline() on a file opened in binary mode. In binary mode readline() only splits on '\n', so it works for CRLF and LF line endings but not for bare CR. Switched to opening the file in text mode with the correct encoding and universal newline support.
2016-06-03 10:57:29 -05:00 · 2016-06-03 10:57:29 -05:00 · be6715cd99
commit be6715cd99
parent 16e2ed4f54
2 changed files with 73 additions and 58 deletions
--- a/gramps/plugins/importer/importgedcom.py
+++ b/gramps/plugins/importer/importgedcom.py
@ -31,7 +31,7 @@ LOG = logging.getLogger(".GedcomImport")
 #------------------------------------------------------------------------
 #
-# GRAMPS modules
+# Gramps modules
 #
 #------------------------------------------------------------------------
 from gramps.gen.const import GRAMPS_LOCALE as glocale
@ -51,7 +51,7 @@ import imp
 imp.reload(module)
 from gramps.gen.config import config
-    
+
 #-------------------------------------------------------------------------
 #
 # importData
@ -65,30 +65,34 @@ def importData(database, filename, user):
    if DbMixin not in database.__class__.__bases__:
        database.__class__.__bases__ = (DbMixin,) +  \
                                        database.__class__.__bases__
    try:
-        ifile = open(filename, "rb")
+        # Opening in utf-8 with universal newline to allow cr, lf, and crlf
        # If the file is really UTF16 or a variant, the next block of code will
        # not find anything even if it is there, but this is ok since it
        # won't be ANSEL, or is inconsistent...
        with open(filename, "r", encoding='utf-8', errors='replace',
                  newline=None) as ifile:
            ansel = False
            gramps = False
            for index in range(50):
                # Treat the file as though it is UTF-8 since this is the more
                # modern option; and anyway it doesn't really matter as we are
                # only trying to detect a CHAR or SOUR line which is only
                # 7-bit ASCII anyway,  and we ignore anything that can't be
                # translated.
                line = ifile.readline()
                line = line.split()
                if len(line) == 0:
                    break
                if len(line) > 2 and line[1][0:4] == 'CHAR' \
                                 and line[2] == "ANSEL":
                    ansel = True
                if len(line) > 2 and line[1][0:4] == 'SOUR' \
                                 and line[2] == "GRAMPS":
                    gramps = True
    except IOError:
        return
    ansel = False
    gramps = False
    for index in range(50):
        # Treat the file as though it is UTF-8 since this is the more modern
        # option; and anyway it doesn't really matter as we are only trying to
        # detect a CHAR or SOUR line which is only 7-bit ASCII anyway,  and we
        # ignore anything that can't be translated.
        line = ifile.readline()
        line = line.decode(encoding='utf-8', errors='replace')
        line = line.split()
        if len(line) == 0:
            break
        if len(line) > 2 and line[1][0:4] == 'CHAR' and line[2] == "ANSEL":
            ansel = True
        if len(line) > 2 and line[1][0:4] == 'SOUR' and line[2] == "GRAMPS":
            gramps = True
    ifile.close()
    if not gramps and ansel and user.uistate:
        top = Glade()
        code = top.get_object('codeset')
@ -116,15 +120,15 @@ def importData(database, filename, user):
                database, ifile, filename, user, stage_one, None, None)
        else:
            gedparse = libgedcom.GedcomParser(
-                database, ifile, filename, user, stage_one, 
+                database, ifile, filename, user, stage_one,
                config.get('preferences.default-source'),
-                (config.get('preferences.tag-on-import-format') if 
+                (config.get('preferences.tag-on-import-format') if
                 config.get('preferences.tag-on-import') else None))
    except IOError as msg:
        user.notify_error(_("%s could not be opened\n") % filename, str(msg))
        return
    except GedcomError as msg:
-        user.notify_error(_("Invalid GEDCOM file"), 
+        user.notify_error(_("Invalid GEDCOM file"),
                          _("%s could not be imported") % filename + "\n" + str(msg))
        return
--- a/gramps/plugins/lib/libgedcom.py
+++ b/gramps/plugins/lib/libgedcom.py
@ -94,7 +94,7 @@ import codecs
 from xml.parsers.expat import ParserCreate
 from collections import defaultdict
 import string
-from io import StringIO
+from io import StringIO, TextIOWrapper
 from urllib.parse import urlparse
 #------------------------------------------------------------------------
@ -1251,41 +1251,41 @@ class BaseReader(object):
 class UTF8Reader(BaseReader):
-    def __init__(self, ifile, __add_msg):
+    def __init__(self, ifile, __add_msg, enc):
-        BaseReader.__init__(self, ifile, 'utf8', __add_msg)
+        BaseReader.__init__(self, ifile, enc, __add_msg)
        self.reset()
-
+        if enc == 'UTF_8_SIG':
-    def reset(self):
+            self.ifile = TextIOWrapper(ifile, encoding='utf_8_sig',
-        self.ifile.seek(0)
+                                       errors='replace', newline=None)
-        data = self.ifile.read(3)
+        else:
-        if data != b"\xef\xbb\xbf":
+            self.ifile = TextIOWrapper(ifile, encoding='utf_8',
-            self.ifile.seek(0)
+                                       errors='replace', newline=None)
    def readline(self):
        line = self.ifile.readline()
        line = line.decode(self.enc, errors='replace')
        return line.translate(STRIP_DICT)
 class UTF16Reader(BaseReader):
    def __init__(self, ifile, __add_msg):
-        new_file = codecs.EncodedFile(ifile, 'utf8', 'utf16')
+        BaseReader.__init__(self, ifile, 'UTF16', __add_msg)
-        BaseReader.__init__(self, new_file, '', __add_msg)
+        self.ifile = TextIOWrapper(ifile, encoding='utf_16',
                                   errors='replace', newline=None)
        self.reset()
    def readline(self):
        line = self.ifile.readline()
        line = line.decode('utf8', errors='replace')
        return line.translate(STRIP_DICT)
 class AnsiReader(BaseReader):
    def __init__(self, ifile, __add_msg):
        BaseReader.__init__(self, ifile, 'latin1', __add_msg)
        self.ifile = TextIOWrapper(ifile, encoding='latin1',
                                   errors='replace', newline=None)
    def readline(self):
        line = self.ifile.readline()
        line = line.decode(self.enc, errors='replace')
        if line.translate(DEL_AND_C1) != line:
            self.report_error("DEL or C1 control chars in line did you mean CHAR cp1252??", line)
        return line.translate(STRIP_DICT)
@ -1294,10 +1294,11 @@ class CP1252Reader(BaseReader):
    def __init__(self, ifile, __add_msg):
        BaseReader.__init__(self, ifile, 'cp1252', __add_msg)
        self.ifile = TextIOWrapper(ifile, encoding='cp1252',
                                   errors='replace', newline=None)
    def readline(self):
        line = self.ifile.readline()
        line = line.decode(self.enc, errors='replace')
        return line.translate(STRIP_DICT)
 class AnselReader(BaseReader):
@ -1565,10 +1566,16 @@ class AnselReader(BaseReader):
        return ans
    def __init__(self, ifile, __add_msg):
-        BaseReader.__init__(self, ifile, "", __add_msg)
+        BaseReader.__init__(self, ifile, "ANSEL", __add_msg)
        # In theory, we should have been able to skip the encode/decode from
        # ascii.  But this way allows us to use Python's universal newlines
        self.ifile = TextIOWrapper(ifile, encoding='ascii',
                                   errors='surrogateescape', newline=None)
    def readline(self):
-        return self.__ansel_to_unicode(self.ifile.readline())
+        line = self.ifile.readline()
        linebytes = line.encode(encoding='ascii', errors='surrogateescape')
        return self.__ansel_to_unicode(linebytes)
 #-------------------------------------------------------------------------
 #
@ -2676,8 +2683,8 @@ class GedcomParser(UpdateCallback):
        if enc == "ANSEL":
            rdr = AnselReader(ifile, self.__add_msg)
-        elif enc in ("UTF-8", "UTF8"):
+        elif enc in ("UTF-8", "UTF8", "UTF_8_SIG"):
-            rdr = UTF8Reader(ifile, self.__add_msg)
+            rdr = UTF8Reader(ifile, self.__add_msg, enc)
        elif enc in ("UTF-16LE", "UTF-16BE",  "UTF16", "UNICODE"):
            rdr = UTF16Reader(ifile, self.__add_msg)
        elif enc in ("CP1252", "WINDOWS-1252"):
@ -7774,26 +7781,33 @@ class GedcomStageOne(object):
    def __detect_file_decoder(self, input_file):
        """
        Detects the file encoding of the file by looking for a BOM 
-        (byte order marker) in the GEDCOM file. If we detect a UTF-16
+        (byte order marker) in the GEDCOM file. If we detect a UTF-16 or
-        encoded file, we must connect to a wrapper using the codecs
+        UTF-8-BOM encoded file, we choose appropriate decoders.  If no BOM
-        package.
+        is detected, we return in UTF-8 mode, since it is the more modern option;
        and anyway it doesn't really matter as we are only looking for GEDCOM
        keywords which are only 7-bit ASCII anyway.
        In any case, we always return the file in text mode with transparent
        newline (CR, LF, or CRLF).
        """
        line = input_file.read(2)
        if line == b"\xef\xbb":
            input_file.read(1)
-            self.enc = "UTF8"
+            self.enc = "utf_8_sig"
-            return input_file
+            return TextIOWrapper(input_file, encoding='utf_8_sig',
                                 errors='replace', newline=None)
        elif line == b"\xff\xfe" or line == b"\xfe\xff":
            self.enc = "UTF16"
            input_file.seek(0)
-            return codecs.EncodedFile(input_file, 'utf8', 'utf16')
+            return TextIOWrapper(input_file, encoding='utf_16',
                                 errors='replace', newline=None)
        elif not line :
            raise GedcomError(self.__EMPTY_GED)
-        elif line[0] == b"\x00" or line[1] == b"\x00":
+        elif line == b"\x30\x00" or line == b"\x00\x30":
            raise GedcomError(self.__BAD_UTF16)
        else:
            input_file.seek(0)
-            return input_file
+            return TextIOWrapper(input_file, encoding='utf-8',
                                 errors='replace', newline=None)
    def parse(self):
        """
@ -7804,12 +7818,8 @@ class GedcomStageOne(object):
        reader = self.__detect_file_decoder(self.ifile)
        for line in reader:
-            # Treat the file as though it is UTF-8 since this will be right if a
+            # Scan for a few items, keep counts.  Also look for actual CHAR
-            # BOM was detected; it is the more modern option; and anyway it
+            # keyword to figure out actual encoding for non-unicode file types
            # doesn't really matter as we are only trying to detect a CHAR line
            # which is only 7-bit ASCII anyway,  and we ignore anything that
            # can't be translated.
            line = line.decode(encoding='utf-8', errors='replace')
            line = line.strip()
            if not line:
                continue
@ -7840,6 +7850,7 @@ class GedcomStageOne(object):
        LOG.debug("parse pcnt %d" % self.pcnt)
        LOG.debug("parse famc %s" % dict(self.famc))
        LOG.debug("parse fams %s" % dict(self.fams))
        self.ifile = reader # need this to keep python from autoclosing file
    def get_famc_map(self):
        """