9124: GEDCOM doesn't accept CR as a line terminator

The original code called readline() on a file opened in binary mode; this
works only if the file contains '\n', which is true only for CRLF and LF
line endings (not for bare CR). Switched to opening the file in text mode
with the correct encoding and universal newline support.
This commit is contained in:
prculley 2016-06-03 10:57:29 -05:00 committed by Nick Hall
parent 16e2ed4f54
commit be6715cd99
2 changed files with 73 additions and 58 deletions

View File

@ -31,7 +31,7 @@ LOG = logging.getLogger(".GedcomImport")
#------------------------------------------------------------------------
#
# GRAMPS modules
# Gramps modules
#
#------------------------------------------------------------------------
from gramps.gen.const import GRAMPS_LOCALE as glocale
@ -51,7 +51,7 @@ import imp
imp.reload(module)
from gramps.gen.config import config
#-------------------------------------------------------------------------
#
# importData
@ -65,30 +65,34 @@ def importData(database, filename, user):
if DbMixin not in database.__class__.__bases__:
database.__class__.__bases__ = (DbMixin,) + \
database.__class__.__bases__
try:
ifile = open(filename, "rb")
# Open as UTF-8 with universal newlines to allow CR, LF, and CRLF.
# If the file is really UTF-16 or a variant, the next block of code will
# not find anything even if it is there, but this is OK since the file
# won't be ANSEL, or else it is inconsistent...
with open(filename, "r", encoding='utf-8', errors='replace',
newline=None) as ifile:
ansel = False
gramps = False
for index in range(50):
# Treat the file as though it is UTF-8 since this is the more
# modern option; and anyway it doesn't really matter as we are
# only trying to detect a CHAR or SOUR line which is only
# 7-bit ASCII anyway, and we ignore anything that can't be
# translated.
line = ifile.readline()
line = line.split()
if len(line) == 0:
break
if len(line) > 2 and line[1][0:4] == 'CHAR' \
and line[2] == "ANSEL":
ansel = True
if len(line) > 2 and line[1][0:4] == 'SOUR' \
and line[2] == "GRAMPS":
gramps = True
except IOError:
return
ansel = False
gramps = False
for index in range(50):
# Treat the file as though it is UTF-8 since this is the more modern
# option; and anyway it doesn't really matter as we are only trying to
# detect a CHAR or SOUR line which is only 7-bit ASCII anyway, and we
# ignore anything that can't be translated.
line = ifile.readline()
line = line.decode(encoding='utf-8', errors='replace')
line = line.split()
if len(line) == 0:
break
if len(line) > 2 and line[1][0:4] == 'CHAR' and line[2] == "ANSEL":
ansel = True
if len(line) > 2 and line[1][0:4] == 'SOUR' and line[2] == "GRAMPS":
gramps = True
ifile.close()
if not gramps and ansel and user.uistate:
top = Glade()
code = top.get_object('codeset')
@ -116,15 +120,15 @@ def importData(database, filename, user):
database, ifile, filename, user, stage_one, None, None)
else:
gedparse = libgedcom.GedcomParser(
database, ifile, filename, user, stage_one,
database, ifile, filename, user, stage_one,
config.get('preferences.default-source'),
(config.get('preferences.tag-on-import-format') if
(config.get('preferences.tag-on-import-format') if
config.get('preferences.tag-on-import') else None))
except IOError as msg:
user.notify_error(_("%s could not be opened\n") % filename, str(msg))
return
except GedcomError as msg:
user.notify_error(_("Invalid GEDCOM file"),
user.notify_error(_("Invalid GEDCOM file"),
_("%s could not be imported") % filename + "\n" + str(msg))
return

View File

@ -94,7 +94,7 @@ import codecs
from xml.parsers.expat import ParserCreate
from collections import defaultdict
import string
from io import StringIO
from io import StringIO, TextIOWrapper
from urllib.parse import urlparse
#------------------------------------------------------------------------
@ -1251,41 +1251,41 @@ class BaseReader(object):
class UTF8Reader(BaseReader):
def __init__(self, ifile, __add_msg):
BaseReader.__init__(self, ifile, 'utf8', __add_msg)
def __init__(self, ifile, __add_msg, enc):
BaseReader.__init__(self, ifile, enc, __add_msg)
self.reset()
def reset(self):
self.ifile.seek(0)
data = self.ifile.read(3)
if data != b"\xef\xbb\xbf":
self.ifile.seek(0)
if enc == 'UTF_8_SIG':
self.ifile = TextIOWrapper(ifile, encoding='utf_8_sig',
errors='replace', newline=None)
else:
self.ifile = TextIOWrapper(ifile, encoding='utf_8',
errors='replace', newline=None)
def readline(self):
line = self.ifile.readline()
line = line.decode(self.enc, errors='replace')
return line.translate(STRIP_DICT)
class UTF16Reader(BaseReader):
def __init__(self, ifile, __add_msg):
new_file = codecs.EncodedFile(ifile, 'utf8', 'utf16')
BaseReader.__init__(self, new_file, '', __add_msg)
BaseReader.__init__(self, ifile, 'UTF16', __add_msg)
self.ifile = TextIOWrapper(ifile, encoding='utf_16',
errors='replace', newline=None)
self.reset()
def readline(self):
line = self.ifile.readline()
line = line.decode('utf8', errors='replace')
return line.translate(STRIP_DICT)
class AnsiReader(BaseReader):
def __init__(self, ifile, __add_msg):
BaseReader.__init__(self, ifile, 'latin1', __add_msg)
self.ifile = TextIOWrapper(ifile, encoding='latin1',
errors='replace', newline=None)
def readline(self):
line = self.ifile.readline()
line = line.decode(self.enc, errors='replace')
if line.translate(DEL_AND_C1) != line:
self.report_error("DEL or C1 control chars in line did you mean CHAR cp1252??", line)
return line.translate(STRIP_DICT)
@ -1294,10 +1294,11 @@ class CP1252Reader(BaseReader):
def __init__(self, ifile, __add_msg):
BaseReader.__init__(self, ifile, 'cp1252', __add_msg)
self.ifile = TextIOWrapper(ifile, encoding='cp1252',
errors='replace', newline=None)
def readline(self):
line = self.ifile.readline()
line = line.decode(self.enc, errors='replace')
return line.translate(STRIP_DICT)
class AnselReader(BaseReader):
@ -1565,10 +1566,16 @@ class AnselReader(BaseReader):
return ans
def __init__(self, ifile, __add_msg):
BaseReader.__init__(self, ifile, "", __add_msg)
BaseReader.__init__(self, ifile, "ANSEL", __add_msg)
# In theory, we should be able to skip the decode from / encode back to
# ASCII, but doing it this way lets us use Python's universal newline
# handling.
self.ifile = TextIOWrapper(ifile, encoding='ascii',
errors='surrogateescape', newline=None)
def readline(self):
return self.__ansel_to_unicode(self.ifile.readline())
line = self.ifile.readline()
linebytes = line.encode(encoding='ascii', errors='surrogateescape')
return self.__ansel_to_unicode(linebytes)
#-------------------------------------------------------------------------
#
@ -2676,8 +2683,8 @@ class GedcomParser(UpdateCallback):
if enc == "ANSEL":
rdr = AnselReader(ifile, self.__add_msg)
elif enc in ("UTF-8", "UTF8"):
rdr = UTF8Reader(ifile, self.__add_msg)
elif enc in ("UTF-8", "UTF8", "UTF_8_SIG"):
rdr = UTF8Reader(ifile, self.__add_msg, enc)
elif enc in ("UTF-16LE", "UTF-16BE", "UTF16", "UNICODE"):
rdr = UTF16Reader(ifile, self.__add_msg)
elif enc in ("CP1252", "WINDOWS-1252"):
@ -7774,26 +7781,33 @@ class GedcomStageOne(object):
def __detect_file_decoder(self, input_file):
"""
Detects the file encoding of the file by looking for a BOM
(byte order marker) in the GEDCOM file. If we detect a UTF-16
encoded file, we must connect to a wrapper using the codecs
package.
(byte order marker) in the GEDCOM file. If we detect a UTF-16 or
UTF-8-BOM encoded file, we choose the appropriate decoder. If no BOM
is detected, we return the file in UTF-8 mode, since it is the more
modern option; it doesn't really matter anyway, as we are only looking
for GEDCOM keywords, which are 7-bit ASCII.
In any case, we always return the file in text mode with transparent
newline handling (CR, LF, or CRLF).
"""
line = input_file.read(2)
if line == b"\xef\xbb":
input_file.read(1)
self.enc = "UTF8"
return input_file
self.enc = "utf_8_sig"
return TextIOWrapper(input_file, encoding='utf_8_sig',
errors='replace', newline=None)
elif line == b"\xff\xfe" or line == b"\xfe\xff":
self.enc = "UTF16"
input_file.seek(0)
return codecs.EncodedFile(input_file, 'utf8', 'utf16')
return TextIOWrapper(input_file, encoding='utf_16',
errors='replace', newline=None)
elif not line :
raise GedcomError(self.__EMPTY_GED)
elif line[0] == b"\x00" or line[1] == b"\x00":
elif line == b"\x30\x00" or line == b"\x00\x30":
raise GedcomError(self.__BAD_UTF16)
else:
input_file.seek(0)
return input_file
return TextIOWrapper(input_file, encoding='utf-8',
errors='replace', newline=None)
def parse(self):
"""
@ -7804,12 +7818,8 @@ class GedcomStageOne(object):
reader = self.__detect_file_decoder(self.ifile)
for line in reader:
# Treat the file as though it is UTF-8 since this will be right if a
# BOM was detected; it is the more modern option; and anyway it
# doesn't really matter as we are only trying to detect a CHAR line
# which is only 7-bit ASCII anyway, and we ignore anything that
# can't be translated.
line = line.decode(encoding='utf-8', errors='replace')
# Scan for a few items and keep counts. Also look for the actual CHAR
# keyword to figure out the real encoding for non-Unicode file types.
line = line.strip()
if not line:
continue
@ -7840,6 +7850,7 @@ class GedcomStageOne(object):
LOG.debug("parse pcnt %d" % self.pcnt)
LOG.debug("parse famc %s" % dict(self.famc))
LOG.debug("parse fams %s" % dict(self.fams))
self.ifile = reader # need this to keep python from autoclosing file
def get_famc_map(self):
"""