9124: GEDCOM doesn't accept CR as a line terminator
The original code called readline() on a file opened in binary mode; this works only if the file contains '\n', which is true only for CRLF and LF line endings. Switched to opening the file in text mode with the correct encoding and universal newline support.
This commit is contained in:
parent
16e2ed4f54
commit
be6715cd99
@ -31,7 +31,7 @@ LOG = logging.getLogger(".GedcomImport")
|
|||||||
|
|
||||||
#------------------------------------------------------------------------
|
#------------------------------------------------------------------------
|
||||||
#
|
#
|
||||||
# GRAMPS modules
|
# Gramps modules
|
||||||
#
|
#
|
||||||
#------------------------------------------------------------------------
|
#------------------------------------------------------------------------
|
||||||
from gramps.gen.const import GRAMPS_LOCALE as glocale
|
from gramps.gen.const import GRAMPS_LOCALE as glocale
|
||||||
@ -51,7 +51,7 @@ import imp
|
|||||||
imp.reload(module)
|
imp.reload(module)
|
||||||
|
|
||||||
from gramps.gen.config import config
|
from gramps.gen.config import config
|
||||||
|
|
||||||
#-------------------------------------------------------------------------
|
#-------------------------------------------------------------------------
|
||||||
#
|
#
|
||||||
# importData
|
# importData
|
||||||
@ -65,30 +65,34 @@ def importData(database, filename, user):
|
|||||||
if DbMixin not in database.__class__.__bases__:
|
if DbMixin not in database.__class__.__bases__:
|
||||||
database.__class__.__bases__ = (DbMixin,) + \
|
database.__class__.__bases__ = (DbMixin,) + \
|
||||||
database.__class__.__bases__
|
database.__class__.__bases__
|
||||||
|
|
||||||
try:
|
try:
|
||||||
ifile = open(filename, "rb")
|
# Opening in utf-8 with universal newline to allow cr, lf, and crlf
|
||||||
|
# If the file is really UTF16 or a variant, the next block of code will
|
||||||
|
# not find anything even if it is there, but this is ok since it
|
||||||
|
# won't be ANSEL, or is inconsistent...
|
||||||
|
with open(filename, "r", encoding='utf-8', errors='replace',
|
||||||
|
newline=None) as ifile:
|
||||||
|
ansel = False
|
||||||
|
gramps = False
|
||||||
|
for index in range(50):
|
||||||
|
# Treat the file as though it is UTF-8 since this is the more
|
||||||
|
# modern option; and anyway it doesn't really matter as we are
|
||||||
|
# only trying to detect a CHAR or SOUR line which is only
|
||||||
|
# 7-bit ASCII anyway, and we ignore anything that can't be
|
||||||
|
# translated.
|
||||||
|
line = ifile.readline()
|
||||||
|
line = line.split()
|
||||||
|
if len(line) == 0:
|
||||||
|
break
|
||||||
|
if len(line) > 2 and line[1][0:4] == 'CHAR' \
|
||||||
|
and line[2] == "ANSEL":
|
||||||
|
ansel = True
|
||||||
|
if len(line) > 2 and line[1][0:4] == 'SOUR' \
|
||||||
|
and line[2] == "GRAMPS":
|
||||||
|
gramps = True
|
||||||
except IOError:
|
except IOError:
|
||||||
return
|
return
|
||||||
|
|
||||||
ansel = False
|
|
||||||
gramps = False
|
|
||||||
for index in range(50):
|
|
||||||
# Treat the file as though it is UTF-8 since this is the more modern
|
|
||||||
# option; and anyway it doesn't really matter as we are only trying to
|
|
||||||
# detect a CHAR or SOUR line which is only 7-bit ASCII anyway, and we
|
|
||||||
# ignore anything that can't be translated.
|
|
||||||
line = ifile.readline()
|
|
||||||
line = line.decode(encoding='utf-8', errors='replace')
|
|
||||||
line = line.split()
|
|
||||||
if len(line) == 0:
|
|
||||||
break
|
|
||||||
if len(line) > 2 and line[1][0:4] == 'CHAR' and line[2] == "ANSEL":
|
|
||||||
ansel = True
|
|
||||||
if len(line) > 2 and line[1][0:4] == 'SOUR' and line[2] == "GRAMPS":
|
|
||||||
gramps = True
|
|
||||||
ifile.close()
|
|
||||||
|
|
||||||
if not gramps and ansel and user.uistate:
|
if not gramps and ansel and user.uistate:
|
||||||
top = Glade()
|
top = Glade()
|
||||||
code = top.get_object('codeset')
|
code = top.get_object('codeset')
|
||||||
@ -116,15 +120,15 @@ def importData(database, filename, user):
|
|||||||
database, ifile, filename, user, stage_one, None, None)
|
database, ifile, filename, user, stage_one, None, None)
|
||||||
else:
|
else:
|
||||||
gedparse = libgedcom.GedcomParser(
|
gedparse = libgedcom.GedcomParser(
|
||||||
database, ifile, filename, user, stage_one,
|
database, ifile, filename, user, stage_one,
|
||||||
config.get('preferences.default-source'),
|
config.get('preferences.default-source'),
|
||||||
(config.get('preferences.tag-on-import-format') if
|
(config.get('preferences.tag-on-import-format') if
|
||||||
config.get('preferences.tag-on-import') else None))
|
config.get('preferences.tag-on-import') else None))
|
||||||
except IOError as msg:
|
except IOError as msg:
|
||||||
user.notify_error(_("%s could not be opened\n") % filename, str(msg))
|
user.notify_error(_("%s could not be opened\n") % filename, str(msg))
|
||||||
return
|
return
|
||||||
except GedcomError as msg:
|
except GedcomError as msg:
|
||||||
user.notify_error(_("Invalid GEDCOM file"),
|
user.notify_error(_("Invalid GEDCOM file"),
|
||||||
_("%s could not be imported") % filename + "\n" + str(msg))
|
_("%s could not be imported") % filename + "\n" + str(msg))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -94,7 +94,7 @@ import codecs
|
|||||||
from xml.parsers.expat import ParserCreate
|
from xml.parsers.expat import ParserCreate
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
import string
|
import string
|
||||||
from io import StringIO
|
from io import StringIO, TextIOWrapper
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
#------------------------------------------------------------------------
|
#------------------------------------------------------------------------
|
||||||
@ -1251,41 +1251,41 @@ class BaseReader(object):
|
|||||||
|
|
||||||
class UTF8Reader(BaseReader):
|
class UTF8Reader(BaseReader):
|
||||||
|
|
||||||
def __init__(self, ifile, __add_msg):
|
def __init__(self, ifile, __add_msg, enc):
|
||||||
BaseReader.__init__(self, ifile, 'utf8', __add_msg)
|
BaseReader.__init__(self, ifile, enc, __add_msg)
|
||||||
self.reset()
|
self.reset()
|
||||||
|
if enc == 'UTF_8_SIG':
|
||||||
def reset(self):
|
self.ifile = TextIOWrapper(ifile, encoding='utf_8_sig',
|
||||||
self.ifile.seek(0)
|
errors='replace', newline=None)
|
||||||
data = self.ifile.read(3)
|
else:
|
||||||
if data != b"\xef\xbb\xbf":
|
self.ifile = TextIOWrapper(ifile, encoding='utf_8',
|
||||||
self.ifile.seek(0)
|
errors='replace', newline=None)
|
||||||
|
|
||||||
def readline(self):
|
def readline(self):
|
||||||
line = self.ifile.readline()
|
line = self.ifile.readline()
|
||||||
line = line.decode(self.enc, errors='replace')
|
|
||||||
return line.translate(STRIP_DICT)
|
return line.translate(STRIP_DICT)
|
||||||
|
|
||||||
class UTF16Reader(BaseReader):
|
class UTF16Reader(BaseReader):
|
||||||
|
|
||||||
def __init__(self, ifile, __add_msg):
|
def __init__(self, ifile, __add_msg):
|
||||||
new_file = codecs.EncodedFile(ifile, 'utf8', 'utf16')
|
BaseReader.__init__(self, ifile, 'UTF16', __add_msg)
|
||||||
BaseReader.__init__(self, new_file, '', __add_msg)
|
self.ifile = TextIOWrapper(ifile, encoding='utf_16',
|
||||||
|
errors='replace', newline=None)
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def readline(self):
|
def readline(self):
|
||||||
line = self.ifile.readline()
|
line = self.ifile.readline()
|
||||||
line = line.decode('utf8', errors='replace')
|
|
||||||
return line.translate(STRIP_DICT)
|
return line.translate(STRIP_DICT)
|
||||||
|
|
||||||
class AnsiReader(BaseReader):
|
class AnsiReader(BaseReader):
|
||||||
|
|
||||||
def __init__(self, ifile, __add_msg):
|
def __init__(self, ifile, __add_msg):
|
||||||
BaseReader.__init__(self, ifile, 'latin1', __add_msg)
|
BaseReader.__init__(self, ifile, 'latin1', __add_msg)
|
||||||
|
self.ifile = TextIOWrapper(ifile, encoding='latin1',
|
||||||
|
errors='replace', newline=None)
|
||||||
|
|
||||||
def readline(self):
|
def readline(self):
|
||||||
line = self.ifile.readline()
|
line = self.ifile.readline()
|
||||||
line = line.decode(self.enc, errors='replace')
|
|
||||||
if line.translate(DEL_AND_C1) != line:
|
if line.translate(DEL_AND_C1) != line:
|
||||||
self.report_error("DEL or C1 control chars in line did you mean CHAR cp1252??", line)
|
self.report_error("DEL or C1 control chars in line did you mean CHAR cp1252??", line)
|
||||||
return line.translate(STRIP_DICT)
|
return line.translate(STRIP_DICT)
|
||||||
@ -1294,10 +1294,11 @@ class CP1252Reader(BaseReader):
|
|||||||
|
|
||||||
def __init__(self, ifile, __add_msg):
|
def __init__(self, ifile, __add_msg):
|
||||||
BaseReader.__init__(self, ifile, 'cp1252', __add_msg)
|
BaseReader.__init__(self, ifile, 'cp1252', __add_msg)
|
||||||
|
self.ifile = TextIOWrapper(ifile, encoding='cp1252',
|
||||||
|
errors='replace', newline=None)
|
||||||
|
|
||||||
def readline(self):
|
def readline(self):
|
||||||
line = self.ifile.readline()
|
line = self.ifile.readline()
|
||||||
line = line.decode(self.enc, errors='replace')
|
|
||||||
return line.translate(STRIP_DICT)
|
return line.translate(STRIP_DICT)
|
||||||
|
|
||||||
class AnselReader(BaseReader):
|
class AnselReader(BaseReader):
|
||||||
@ -1565,10 +1566,16 @@ class AnselReader(BaseReader):
|
|||||||
return ans
|
return ans
|
||||||
|
|
||||||
def __init__(self, ifile, __add_msg):
|
def __init__(self, ifile, __add_msg):
|
||||||
BaseReader.__init__(self, ifile, "", __add_msg)
|
BaseReader.__init__(self, ifile, "ANSEL", __add_msg)
|
||||||
|
# In theory, we should have been able to skip the encode/decode from
|
||||||
|
# ascii. But this way allows us to use Python's universal newline support.
|
||||||
|
self.ifile = TextIOWrapper(ifile, encoding='ascii',
|
||||||
|
errors='surrogateescape', newline=None)
|
||||||
|
|
||||||
def readline(self):
|
def readline(self):
|
||||||
return self.__ansel_to_unicode(self.ifile.readline())
|
line = self.ifile.readline()
|
||||||
|
linebytes = line.encode(encoding='ascii', errors='surrogateescape')
|
||||||
|
return self.__ansel_to_unicode(linebytes)
|
||||||
|
|
||||||
#-------------------------------------------------------------------------
|
#-------------------------------------------------------------------------
|
||||||
#
|
#
|
||||||
@ -2676,8 +2683,8 @@ class GedcomParser(UpdateCallback):
|
|||||||
|
|
||||||
if enc == "ANSEL":
|
if enc == "ANSEL":
|
||||||
rdr = AnselReader(ifile, self.__add_msg)
|
rdr = AnselReader(ifile, self.__add_msg)
|
||||||
elif enc in ("UTF-8", "UTF8"):
|
elif enc in ("UTF-8", "UTF8", "UTF_8_SIG"):
|
||||||
rdr = UTF8Reader(ifile, self.__add_msg)
|
rdr = UTF8Reader(ifile, self.__add_msg, enc)
|
||||||
elif enc in ("UTF-16LE", "UTF-16BE", "UTF16", "UNICODE"):
|
elif enc in ("UTF-16LE", "UTF-16BE", "UTF16", "UNICODE"):
|
||||||
rdr = UTF16Reader(ifile, self.__add_msg)
|
rdr = UTF16Reader(ifile, self.__add_msg)
|
||||||
elif enc in ("CP1252", "WINDOWS-1252"):
|
elif enc in ("CP1252", "WINDOWS-1252"):
|
||||||
@ -7774,26 +7781,33 @@ class GedcomStageOne(object):
|
|||||||
def __detect_file_decoder(self, input_file):
|
def __detect_file_decoder(self, input_file):
|
||||||
"""
|
"""
|
||||||
Detects the file encoding of the file by looking for a BOM
|
Detects the file encoding of the file by looking for a BOM
|
||||||
(byte order marker) in the GEDCOM file. If we detect a UTF-16
|
(byte order marker) in the GEDCOM file. If we detect a UTF-16 or
|
||||||
encoded file, we must connect to a wrapper using the codecs
|
UTF-8-BOM encoded file, we choose appropriate decoders. If no BOM
|
||||||
package.
|
is detected, we return in UTF-8 mode since it is the more modern option;
|
||||||
|
and anyway it doesn't really matter as we are only looking for GEDCOM
|
||||||
|
keywords which are only 7-bit ASCII anyway.
|
||||||
|
In any case, we always return the file in text mode with transparent
|
||||||
|
newline (CR, LF, or CRLF).
|
||||||
"""
|
"""
|
||||||
line = input_file.read(2)
|
line = input_file.read(2)
|
||||||
if line == b"\xef\xbb":
|
if line == b"\xef\xbb":
|
||||||
input_file.read(1)
|
input_file.read(1)
|
||||||
self.enc = "UTF8"
|
self.enc = "utf_8_sig"
|
||||||
return input_file
|
return TextIOWrapper(input_file, encoding='utf_8_sig',
|
||||||
|
errors='replace', newline=None)
|
||||||
elif line == b"\xff\xfe" or line == b"\xfe\xff":
|
elif line == b"\xff\xfe" or line == b"\xfe\xff":
|
||||||
self.enc = "UTF16"
|
self.enc = "UTF16"
|
||||||
input_file.seek(0)
|
input_file.seek(0)
|
||||||
return codecs.EncodedFile(input_file, 'utf8', 'utf16')
|
return TextIOWrapper(input_file, encoding='utf_16',
|
||||||
|
errors='replace', newline=None)
|
||||||
elif not line :
|
elif not line :
|
||||||
raise GedcomError(self.__EMPTY_GED)
|
raise GedcomError(self.__EMPTY_GED)
|
||||||
elif line[0] == b"\x00" or line[1] == b"\x00":
|
elif line == b"\x30\x00" or line == b"\x00\x30":
|
||||||
raise GedcomError(self.__BAD_UTF16)
|
raise GedcomError(self.__BAD_UTF16)
|
||||||
else:
|
else:
|
||||||
input_file.seek(0)
|
input_file.seek(0)
|
||||||
return input_file
|
return TextIOWrapper(input_file, encoding='utf-8',
|
||||||
|
errors='replace', newline=None)
|
||||||
|
|
||||||
def parse(self):
|
def parse(self):
|
||||||
"""
|
"""
|
||||||
@ -7804,12 +7818,8 @@ class GedcomStageOne(object):
|
|||||||
reader = self.__detect_file_decoder(self.ifile)
|
reader = self.__detect_file_decoder(self.ifile)
|
||||||
|
|
||||||
for line in reader:
|
for line in reader:
|
||||||
# Treat the file as though it is UTF-8 since this will be right if a
|
# Scan for a few items, keep counts. Also look for actual CHAR
|
||||||
# BOM was detected; it is the more modern option; and anyway it
|
# keyword to figure out the actual encoding for non-Unicode file types
|
||||||
# doesn't really matter as we are only trying to detect a CHAR line
|
|
||||||
# which is only 7-bit ASCII anyway, and we ignore anything that
|
|
||||||
# can't be translated.
|
|
||||||
line = line.decode(encoding='utf-8', errors='replace')
|
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if not line:
|
if not line:
|
||||||
continue
|
continue
|
||||||
@ -7840,6 +7850,7 @@ class GedcomStageOne(object):
|
|||||||
LOG.debug("parse pcnt %d" % self.pcnt)
|
LOG.debug("parse pcnt %d" % self.pcnt)
|
||||||
LOG.debug("parse famc %s" % dict(self.famc))
|
LOG.debug("parse famc %s" % dict(self.famc))
|
||||||
LOG.debug("parse fams %s" % dict(self.fams))
|
LOG.debug("parse fams %s" % dict(self.fams))
|
||||||
|
self.ifile = reader # need this to keep python from autoclosing file
|
||||||
|
|
||||||
def get_famc_map(self):
|
def get_famc_map(self):
|
||||||
"""
|
"""
|
||||||
|
Loading…
Reference in New Issue
Block a user