From be6715cd99bc4483ff7c63b1f78a456af5bdd52c Mon Sep 17 00:00:00 2001
From: prculley <paulr2787@gmail.com>
Date: Fri, 3 Jun 2016 10:57:29 -0500
Subject: [PATCH] 9124: GEDCOM doesn't accept CR as a line terminator

The original code called readline() on a file opened in binary mode, which
splits lines only at '\n'; that handles LF and CRLF line endings but not
bare CR.  The files are now opened in text mode with the correct encoding
and universal newline support.
---
 gramps/plugins/importer/importgedcom.py | 54 +++++++++--------
 gramps/plugins/lib/libgedcom.py         | 77 ++++++++++++++-----------
 2 files changed, 73 insertions(+), 58 deletions(-)

diff --git a/gramps/plugins/importer/importgedcom.py b/gramps/plugins/importer/importgedcom.py
index 659a58ea5..e0020186a 100644
--- a/gramps/plugins/importer/importgedcom.py
+++ b/gramps/plugins/importer/importgedcom.py
@@ -31,7 +31,7 @@ LOG = logging.getLogger(".GedcomImport")
 
 #------------------------------------------------------------------------
 #
-# GRAMPS modules
+# Gramps modules
 #
 #------------------------------------------------------------------------
 from gramps.gen.const import GRAMPS_LOCALE as glocale
@@ -51,7 +51,7 @@ import imp
 imp.reload(module)
 
 from gramps.gen.config import config
-    
+
 #-------------------------------------------------------------------------
 #
 # importData
@@ -65,30 +65,34 @@ def importData(database, filename, user):
     if DbMixin not in database.__class__.__bases__:
         database.__class__.__bases__ = (DbMixin,) +  \
                                         database.__class__.__bases__
-
     try:
-        ifile = open(filename, "rb")
+        # Open as UTF-8 with universal newlines so CR, LF, and CRLF all work.
+        # If the file is really UTF-16 or a variant, the scan below will not
+        # find anything even if it is there, but this is ok since such a
+        # file won't be ANSEL (or is inconsistent).
+        with open(filename, "r", encoding='utf-8', errors='replace',
+                  newline=None) as ifile:
+            ansel = False
+            gramps = False
+            for index in range(50):
+                # Treat the file as though it is UTF-8 since this is the more
+                # modern option; and anyway it doesn't really matter as we are
+                # only trying to detect a CHAR or SOUR line which is only
+                # 7-bit ASCII anyway,  and we ignore anything that can't be
+                # translated.
+                line = ifile.readline()
+                line = line.split()
+                if len(line) == 0:
+                    break
+                if len(line) > 2 and line[1][0:4] == 'CHAR' \
+                                 and line[2] == "ANSEL":
+                    ansel = True
+                if len(line) > 2 and line[1][0:4] == 'SOUR' \
+                                 and line[2] == "GRAMPS":
+                    gramps = True
     except IOError:
         return
 
-    ansel = False
-    gramps = False
-    for index in range(50):
-        # Treat the file as though it is UTF-8 since this is the more modern
-        # option; and anyway it doesn't really matter as we are only trying to
-        # detect a CHAR or SOUR line which is only 7-bit ASCII anyway,  and we
-        # ignore anything that can't be translated.
-        line = ifile.readline()
-        line = line.decode(encoding='utf-8', errors='replace')
-        line = line.split()
-        if len(line) == 0:
-            break
-        if len(line) > 2 and line[1][0:4] == 'CHAR' and line[2] == "ANSEL":
-            ansel = True
-        if len(line) > 2 and line[1][0:4] == 'SOUR' and line[2] == "GRAMPS":
-            gramps = True
-    ifile.close()
-
     if not gramps and ansel and user.uistate:
         top = Glade()
         code = top.get_object('codeset')
@@ -116,15 +120,15 @@ def importData(database, filename, user):
                 database, ifile, filename, user, stage_one, None, None)
         else:
             gedparse = libgedcom.GedcomParser(
-                database, ifile, filename, user, stage_one, 
+                database, ifile, filename, user, stage_one,
                 config.get('preferences.default-source'),
-                (config.get('preferences.tag-on-import-format') if 
+                (config.get('preferences.tag-on-import-format') if
                  config.get('preferences.tag-on-import') else None))
     except IOError as msg:
         user.notify_error(_("%s could not be opened\n") % filename, str(msg))
         return
     except GedcomError as msg:
-        user.notify_error(_("Invalid GEDCOM file"), 
+        user.notify_error(_("Invalid GEDCOM file"),
                           _("%s could not be imported") % filename + "\n" + str(msg))
         return
 
diff --git a/gramps/plugins/lib/libgedcom.py b/gramps/plugins/lib/libgedcom.py
index 314b39e06..6a9679686 100755
--- a/gramps/plugins/lib/libgedcom.py
+++ b/gramps/plugins/lib/libgedcom.py
@@ -94,7 +94,7 @@ import codecs
 from xml.parsers.expat import ParserCreate
 from collections import defaultdict
 import string
-from io import StringIO
+from io import StringIO, TextIOWrapper
 from urllib.parse import urlparse
 
 #------------------------------------------------------------------------
@@ -1251,41 +1251,41 @@ class BaseReader(object):
 
 class UTF8Reader(BaseReader):
 
-    def __init__(self, ifile, __add_msg):
-        BaseReader.__init__(self, ifile, 'utf8', __add_msg)
+    def __init__(self, ifile, __add_msg, enc):
+        BaseReader.__init__(self, ifile, enc, __add_msg)
         self.reset()
-
-    def reset(self):
-        self.ifile.seek(0)
-        data = self.ifile.read(3)
-        if data != b"\xef\xbb\xbf":
-            self.ifile.seek(0)
+        if enc == 'UTF_8_SIG':
+            self.ifile = TextIOWrapper(ifile, encoding='utf_8_sig',
+                                       errors='replace', newline=None)
+        else:
+            self.ifile = TextIOWrapper(ifile, encoding='utf_8',
+                                       errors='replace', newline=None)
 
     def readline(self):
         line = self.ifile.readline()
-        line = line.decode(self.enc, errors='replace')
         return line.translate(STRIP_DICT)
 
 class UTF16Reader(BaseReader):
 
     def __init__(self, ifile, __add_msg):
-        new_file = codecs.EncodedFile(ifile, 'utf8', 'utf16')
-        BaseReader.__init__(self, new_file, '', __add_msg)
+        BaseReader.__init__(self, ifile, 'UTF16', __add_msg)
+        self.ifile = TextIOWrapper(ifile, encoding='utf_16',
+                                   errors='replace', newline=None)
         self.reset()
 
     def readline(self):
         line = self.ifile.readline()
-        line = line.decode('utf8', errors='replace')
         return line.translate(STRIP_DICT)
 
 class AnsiReader(BaseReader):
 
     def __init__(self, ifile, __add_msg):
         BaseReader.__init__(self, ifile, 'latin1', __add_msg)
+        self.ifile = TextIOWrapper(ifile, encoding='latin1',
+                                   errors='replace', newline=None)
    
     def readline(self):
         line = self.ifile.readline()
-        line = line.decode(self.enc, errors='replace')
         if line.translate(DEL_AND_C1) != line:
             self.report_error("DEL or C1 control chars in line did you mean CHAR cp1252??", line)
         return line.translate(STRIP_DICT)
@@ -1294,10 +1294,11 @@ class CP1252Reader(BaseReader):
 
     def __init__(self, ifile, __add_msg):
         BaseReader.__init__(self, ifile, 'cp1252', __add_msg)
+        self.ifile = TextIOWrapper(ifile, encoding='cp1252',
+                                   errors='replace', newline=None)
    
     def readline(self):
         line = self.ifile.readline()
-        line = line.decode(self.enc, errors='replace')
         return line.translate(STRIP_DICT)
 
 class AnselReader(BaseReader):
@@ -1565,10 +1566,16 @@ class AnselReader(BaseReader):
         return ans
 
     def __init__(self, ifile, __add_msg):
-        BaseReader.__init__(self, ifile, "", __add_msg)
+        BaseReader.__init__(self, ifile, "ANSEL", __add_msg)
+        # In theory, we could skip the encode/decode round trip through
+        # ascii, but doing it lets us use Python's universal newline support.
+        self.ifile = TextIOWrapper(ifile, encoding='ascii',
+                                   errors='surrogateescape', newline=None)
 
     def readline(self):
-        return self.__ansel_to_unicode(self.ifile.readline())
+        line = self.ifile.readline()
+        linebytes = line.encode(encoding='ascii', errors='surrogateescape')
+        return self.__ansel_to_unicode(linebytes)
     
 #-------------------------------------------------------------------------
 #
@@ -2676,8 +2683,8 @@ class GedcomParser(UpdateCallback):
 
         if enc == "ANSEL":
             rdr = AnselReader(ifile, self.__add_msg)
-        elif enc in ("UTF-8", "UTF8"):
-            rdr = UTF8Reader(ifile, self.__add_msg)
+        elif enc in ("UTF-8", "UTF8", "UTF_8_SIG"):
+            rdr = UTF8Reader(ifile, self.__add_msg, enc)
         elif enc in ("UTF-16LE", "UTF-16BE",  "UTF16", "UNICODE"):
             rdr = UTF16Reader(ifile, self.__add_msg)
         elif enc in ("CP1252", "WINDOWS-1252"):
@@ -7774,26 +7781,33 @@ class GedcomStageOne(object):
     def __detect_file_decoder(self, input_file):
         """
         Detects the file encoding of the file by looking for a BOM 
-        (byte order marker) in the GEDCOM file. If we detect a UTF-16
-        encoded file, we must connect to a wrapper using the codecs
-        package.
+        (byte order marker) in the GEDCOM file. If we detect a UTF-16 or
+        UTF-8-BOM encoded file, we choose the appropriate decoder.  If no BOM
+        is detected, we return in UTF-8 mode, since it is the more modern
+        option; it doesn't really matter, as we are only looking for GEDCOM
+        keywords, which are 7-bit ASCII anyway.
+        In any case, we always return the file in text mode with transparent
+        newline handling (CR, LF, or CRLF).
         """
         line = input_file.read(2)
         if line == b"\xef\xbb":
             input_file.read(1)
-            self.enc = "UTF8"
-            return input_file
+            self.enc = "utf_8_sig"
+            return TextIOWrapper(input_file, encoding='utf_8_sig',
+                                 errors='replace', newline=None)
         elif line == b"\xff\xfe" or line == b"\xfe\xff":
             self.enc = "UTF16"
             input_file.seek(0)
-            return codecs.EncodedFile(input_file, 'utf8', 'utf16')
+            return TextIOWrapper(input_file, encoding='utf_16',
+                                 errors='replace', newline=None)
         elif not line :
             raise GedcomError(self.__EMPTY_GED)
-        elif line[0] == b"\x00" or line[1] == b"\x00":
+        elif line == b"\x30\x00" or line == b"\x00\x30":
             raise GedcomError(self.__BAD_UTF16)
         else:
             input_file.seek(0)
-            return input_file
+            return TextIOWrapper(input_file, encoding='utf-8',
+                                 errors='replace', newline=None)
 
     def parse(self):
         """
@@ -7804,12 +7818,8 @@ class GedcomStageOne(object):
         reader = self.__detect_file_decoder(self.ifile)
 
         for line in reader:
-            # Treat the file as though it is UTF-8 since this will be right if a
-            # BOM was detected; it is the more modern option; and anyway it
-            # doesn't really matter as we are only trying to detect a CHAR line
-            # which is only 7-bit ASCII anyway,  and we ignore anything that
-            # can't be translated.
-            line = line.decode(encoding='utf-8', errors='replace')
+            # Scan for a few items and keep counts.  Also look for the CHAR
+            # keyword to figure out the actual encoding of non-Unicode files.
             line = line.strip()
             if not line:
                 continue
@@ -7840,6 +7850,7 @@ class GedcomStageOne(object):
         LOG.debug("parse pcnt %d" % self.pcnt)
         LOG.debug("parse famc %s" % dict(self.famc))
         LOG.debug("parse fams %s" % dict(self.fams))
+        self.ifile = reader # need this to keep python from autoclosing file
 
     def get_famc_map(self):
         """