9124: GEDCOM doesn't accept CR as a line terminator

The original code called readline() on a file opened in binary mode; this
only finds line breaks where the file contains '\n', which is true for CRLF
and LF line endings but not for bare CR. Switched to reading the file in text
mode with the correct encoding and universal newline support.
This commit is contained in:
prculley 2016-06-03 10:57:29 -05:00 committed by Nick Hall
parent 16e2ed4f54
commit be6715cd99
2 changed files with 73 additions and 58 deletions

View File

@ -31,7 +31,7 @@ LOG = logging.getLogger(".GedcomImport")
#------------------------------------------------------------------------ #------------------------------------------------------------------------
# #
# GRAMPS modules # Gramps modules
# #
#------------------------------------------------------------------------ #------------------------------------------------------------------------
from gramps.gen.const import GRAMPS_LOCALE as glocale from gramps.gen.const import GRAMPS_LOCALE as glocale
@ -51,7 +51,7 @@ import imp
imp.reload(module) imp.reload(module)
from gramps.gen.config import config from gramps.gen.config import config
#------------------------------------------------------------------------- #-------------------------------------------------------------------------
# #
# importData # importData
@ -65,30 +65,34 @@ def importData(database, filename, user):
if DbMixin not in database.__class__.__bases__: if DbMixin not in database.__class__.__bases__:
database.__class__.__bases__ = (DbMixin,) + \ database.__class__.__bases__ = (DbMixin,) + \
database.__class__.__bases__ database.__class__.__bases__
try: try:
ifile = open(filename, "rb") # Opening in utf-8 with universal newline to allow cr, lf, and crlf
# If the file is really UTF16 or a variant, the next block of code will
# not find anything even if it is there, but this is ok since it
# won't be ANSEL, or is inconsistent...
with open(filename, "r", encoding='utf-8', errors='replace',
newline=None) as ifile:
ansel = False
gramps = False
for index in range(50):
# Treat the file as though it is UTF-8 since this is the more
# modern option; and anyway it doesn't really matter as we are
# only trying to detect a CHAR or SOUR line which is only
# 7-bit ASCII anyway, and we ignore anything that can't be
# translated.
line = ifile.readline()
line = line.split()
if len(line) == 0:
break
if len(line) > 2 and line[1][0:4] == 'CHAR' \
and line[2] == "ANSEL":
ansel = True
if len(line) > 2 and line[1][0:4] == 'SOUR' \
and line[2] == "GRAMPS":
gramps = True
except IOError: except IOError:
return return
ansel = False
gramps = False
for index in range(50):
# Treat the file as though it is UTF-8 since this is the more modern
# option; and anyway it doesn't really matter as we are only trying to
# detect a CHAR or SOUR line which is only 7-bit ASCII anyway, and we
# ignore anything that can't be translated.
line = ifile.readline()
line = line.decode(encoding='utf-8', errors='replace')
line = line.split()
if len(line) == 0:
break
if len(line) > 2 and line[1][0:4] == 'CHAR' and line[2] == "ANSEL":
ansel = True
if len(line) > 2 and line[1][0:4] == 'SOUR' and line[2] == "GRAMPS":
gramps = True
ifile.close()
if not gramps and ansel and user.uistate: if not gramps and ansel and user.uistate:
top = Glade() top = Glade()
code = top.get_object('codeset') code = top.get_object('codeset')
@ -116,15 +120,15 @@ def importData(database, filename, user):
database, ifile, filename, user, stage_one, None, None) database, ifile, filename, user, stage_one, None, None)
else: else:
gedparse = libgedcom.GedcomParser( gedparse = libgedcom.GedcomParser(
database, ifile, filename, user, stage_one, database, ifile, filename, user, stage_one,
config.get('preferences.default-source'), config.get('preferences.default-source'),
(config.get('preferences.tag-on-import-format') if (config.get('preferences.tag-on-import-format') if
config.get('preferences.tag-on-import') else None)) config.get('preferences.tag-on-import') else None))
except IOError as msg: except IOError as msg:
user.notify_error(_("%s could not be opened\n") % filename, str(msg)) user.notify_error(_("%s could not be opened\n") % filename, str(msg))
return return
except GedcomError as msg: except GedcomError as msg:
user.notify_error(_("Invalid GEDCOM file"), user.notify_error(_("Invalid GEDCOM file"),
_("%s could not be imported") % filename + "\n" + str(msg)) _("%s could not be imported") % filename + "\n" + str(msg))
return return

View File

@ -94,7 +94,7 @@ import codecs
from xml.parsers.expat import ParserCreate from xml.parsers.expat import ParserCreate
from collections import defaultdict from collections import defaultdict
import string import string
from io import StringIO from io import StringIO, TextIOWrapper
from urllib.parse import urlparse from urllib.parse import urlparse
#------------------------------------------------------------------------ #------------------------------------------------------------------------
@ -1251,41 +1251,41 @@ class BaseReader(object):
class UTF8Reader(BaseReader): class UTF8Reader(BaseReader):
def __init__(self, ifile, __add_msg): def __init__(self, ifile, __add_msg, enc):
BaseReader.__init__(self, ifile, 'utf8', __add_msg) BaseReader.__init__(self, ifile, enc, __add_msg)
self.reset() self.reset()
if enc == 'UTF_8_SIG':
def reset(self): self.ifile = TextIOWrapper(ifile, encoding='utf_8_sig',
self.ifile.seek(0) errors='replace', newline=None)
data = self.ifile.read(3) else:
if data != b"\xef\xbb\xbf": self.ifile = TextIOWrapper(ifile, encoding='utf_8',
self.ifile.seek(0) errors='replace', newline=None)
def readline(self): def readline(self):
line = self.ifile.readline() line = self.ifile.readline()
line = line.decode(self.enc, errors='replace')
return line.translate(STRIP_DICT) return line.translate(STRIP_DICT)
class UTF16Reader(BaseReader): class UTF16Reader(BaseReader):
def __init__(self, ifile, __add_msg): def __init__(self, ifile, __add_msg):
new_file = codecs.EncodedFile(ifile, 'utf8', 'utf16') BaseReader.__init__(self, ifile, 'UTF16', __add_msg)
BaseReader.__init__(self, new_file, '', __add_msg) self.ifile = TextIOWrapper(ifile, encoding='utf_16',
errors='replace', newline=None)
self.reset() self.reset()
def readline(self): def readline(self):
line = self.ifile.readline() line = self.ifile.readline()
line = line.decode('utf8', errors='replace')
return line.translate(STRIP_DICT) return line.translate(STRIP_DICT)
class AnsiReader(BaseReader): class AnsiReader(BaseReader):
def __init__(self, ifile, __add_msg): def __init__(self, ifile, __add_msg):
BaseReader.__init__(self, ifile, 'latin1', __add_msg) BaseReader.__init__(self, ifile, 'latin1', __add_msg)
self.ifile = TextIOWrapper(ifile, encoding='latin1',
errors='replace', newline=None)
def readline(self): def readline(self):
line = self.ifile.readline() line = self.ifile.readline()
line = line.decode(self.enc, errors='replace')
if line.translate(DEL_AND_C1) != line: if line.translate(DEL_AND_C1) != line:
self.report_error("DEL or C1 control chars in line did you mean CHAR cp1252??", line) self.report_error("DEL or C1 control chars in line did you mean CHAR cp1252??", line)
return line.translate(STRIP_DICT) return line.translate(STRIP_DICT)
@ -1294,10 +1294,11 @@ class CP1252Reader(BaseReader):
def __init__(self, ifile, __add_msg): def __init__(self, ifile, __add_msg):
BaseReader.__init__(self, ifile, 'cp1252', __add_msg) BaseReader.__init__(self, ifile, 'cp1252', __add_msg)
self.ifile = TextIOWrapper(ifile, encoding='cp1252',
errors='replace', newline=None)
def readline(self): def readline(self):
line = self.ifile.readline() line = self.ifile.readline()
line = line.decode(self.enc, errors='replace')
return line.translate(STRIP_DICT) return line.translate(STRIP_DICT)
class AnselReader(BaseReader): class AnselReader(BaseReader):
@ -1565,10 +1566,16 @@ class AnselReader(BaseReader):
return ans return ans
def __init__(self, ifile, __add_msg): def __init__(self, ifile, __add_msg):
BaseReader.__init__(self, ifile, "", __add_msg) BaseReader.__init__(self, ifile, "ANSEL", __add_msg)
# In theory, we should have been able to skip the encode/decode from
# ascii. But this way allows us to use pythons universal newline
self.ifile = TextIOWrapper(ifile, encoding='ascii',
errors='surrogateescape', newline=None)
def readline(self): def readline(self):
return self.__ansel_to_unicode(self.ifile.readline()) line = self.ifile.readline()
linebytes = line.encode(encoding='ascii', errors='surrogateescape')
return self.__ansel_to_unicode(linebytes)
#------------------------------------------------------------------------- #-------------------------------------------------------------------------
# #
@ -2676,8 +2683,8 @@ class GedcomParser(UpdateCallback):
if enc == "ANSEL": if enc == "ANSEL":
rdr = AnselReader(ifile, self.__add_msg) rdr = AnselReader(ifile, self.__add_msg)
elif enc in ("UTF-8", "UTF8"): elif enc in ("UTF-8", "UTF8", "UTF_8_SIG"):
rdr = UTF8Reader(ifile, self.__add_msg) rdr = UTF8Reader(ifile, self.__add_msg, enc)
elif enc in ("UTF-16LE", "UTF-16BE", "UTF16", "UNICODE"): elif enc in ("UTF-16LE", "UTF-16BE", "UTF16", "UNICODE"):
rdr = UTF16Reader(ifile, self.__add_msg) rdr = UTF16Reader(ifile, self.__add_msg)
elif enc in ("CP1252", "WINDOWS-1252"): elif enc in ("CP1252", "WINDOWS-1252"):
@ -7774,26 +7781,33 @@ class GedcomStageOne(object):
def __detect_file_decoder(self, input_file): def __detect_file_decoder(self, input_file):
""" """
Detects the file encoding of the file by looking for a BOM Detects the file encoding of the file by looking for a BOM
(byte order marker) in the GEDCOM file. If we detect a UTF-16 (byte order marker) in the GEDCOM file. If we detect a UTF-16 or
encoded file, we must connect to a wrapper using the codecs UTF-8-BOM encoded file, we choose appropriate decoders. If no BOM
package. is detected, we return in UTF-8 mode since it is the more modern option;
and anyway it doesn't really matter as we are only looking for GEDCOM
keywords which are only 7-bit ASCII anyway.
In any case, we always return the file in text mode with transparent
newline (CR, LF, or CRLF).
""" """
line = input_file.read(2) line = input_file.read(2)
if line == b"\xef\xbb": if line == b"\xef\xbb":
input_file.read(1) input_file.read(1)
self.enc = "UTF8" self.enc = "utf_8_sig"
return input_file return TextIOWrapper(input_file, encoding='utf_8_sig',
errors='replace', newline=None)
elif line == b"\xff\xfe" or line == b"\xfe\xff": elif line == b"\xff\xfe" or line == b"\xfe\xff":
self.enc = "UTF16" self.enc = "UTF16"
input_file.seek(0) input_file.seek(0)
return codecs.EncodedFile(input_file, 'utf8', 'utf16') return TextIOWrapper(input_file, encoding='utf_16',
errors='replace', newline=None)
elif not line : elif not line :
raise GedcomError(self.__EMPTY_GED) raise GedcomError(self.__EMPTY_GED)
elif line[0] == b"\x00" or line[1] == b"\x00": elif line == b"\x30\x00" or line == b"\x00\x30":
raise GedcomError(self.__BAD_UTF16) raise GedcomError(self.__BAD_UTF16)
else: else:
input_file.seek(0) input_file.seek(0)
return input_file return TextIOWrapper(input_file, encoding='utf-8',
errors='replace', newline=None)
def parse(self): def parse(self):
""" """
@ -7804,12 +7818,8 @@ class GedcomStageOne(object):
reader = self.__detect_file_decoder(self.ifile) reader = self.__detect_file_decoder(self.ifile)
for line in reader: for line in reader:
# Treat the file as though it is UTF-8 since this will be right if a # Scan for a few items, keep counts. Also look for actual CHAR
# BOM was detected; it is the more modern option; and anyway it # Keyword to figure out actual encoding for non-unicode file types
# doesn't really matter as we are only trying to detect a CHAR line
# which is only 7-bit ASCII anyway, and we ignore anything that
# can't be translated.
line = line.decode(encoding='utf-8', errors='replace')
line = line.strip() line = line.strip()
if not line: if not line:
continue continue
@ -7840,6 +7850,7 @@ class GedcomStageOne(object):
LOG.debug("parse pcnt %d" % self.pcnt) LOG.debug("parse pcnt %d" % self.pcnt)
LOG.debug("parse famc %s" % dict(self.famc)) LOG.debug("parse famc %s" % dict(self.famc))
LOG.debug("parse fams %s" % dict(self.fams)) LOG.debug("parse fams %s" % dict(self.fams))
self.ifile = reader # need this to keep python from autoclosing file
def get_famc_map(self): def get_famc_map(self):
""" """