3082, 4439, 7134, 8279 Various fixes for GEDCOM import.
0003082: 1/4 and 1/2 ANSEL characters not supported on importing ANSEL GEDCOM
0004439: [Info]: characters ignored on a Gedcom encoded ANSI (cp1252 West Europe, USA)
0007134: Failure importing ANSEL encoded gedcom file.
0008279: GEDCOM import fails for ANSI file incorrectly opened with the utf8 locale
Fixed GEDCOM import not working properly for Python3; other problems also corrected, including fixing the inability to import Windows CP1252 coded files. Also more consistent fix for 8014. Ensured any error messages are not lost. Fixed a few ANSEL characters that were not translated.
This commit is contained in:
parent
3f23c2c5cd
commit
075f2ccb7b
@ -66,14 +66,20 @@ def importData(database, filename, user):
|
|||||||
database.__class__.__bases__
|
database.__class__.__bases__
|
||||||
|
|
||||||
try:
|
try:
|
||||||
ifile = open(filename, "r")
|
ifile = open(filename, "rb")
|
||||||
except IOError:
|
except IOError:
|
||||||
return
|
return
|
||||||
|
|
||||||
ansel = False
|
ansel = False
|
||||||
gramps = False
|
gramps = False
|
||||||
for index in range(50):
|
for index in range(50):
|
||||||
line = ifile.readline().split()
|
# Treat the file as though it is UTF-8 since this is the more modern
|
||||||
|
# option; and anyway it doesn't really matter as we are only trying to
|
||||||
|
# detect a CHAR or SOUR line which is only 7-bit ASCII anyway, and we
|
||||||
|
# ignore anything that can't be translated.
|
||||||
|
line = ifile.readline()
|
||||||
|
line = line.decode(encoding='utf-8', errors='replace')
|
||||||
|
line = line.split()
|
||||||
if len(line) == 0:
|
if len(line) == 0:
|
||||||
break
|
break
|
||||||
if len(line) > 2 and line[1][0:4] == 'CHAR' and line[2] == "ANSEL":
|
if len(line) > 2 and line[1][0:4] == 'CHAR' and line[2] == "ANSEL":
|
||||||
|
@ -564,6 +564,10 @@ LDS_STATUS = {
|
|||||||
# table for skipping illegal control chars in GEDCOM import
|
# table for skipping illegal control chars in GEDCOM import
|
||||||
# Only 09, 0A, 0D are allowed.
|
# Only 09, 0A, 0D are allowed.
|
||||||
STRIP_DICT = dict.fromkeys(list(range(9))+list(range(11, 13))+list(range(14, 32)))
|
STRIP_DICT = dict.fromkeys(list(range(9))+list(range(11, 13))+list(range(14, 32)))
|
||||||
|
# The C1 Control characters are not treated in Latin-1 (ISO-8859-1) as
|
||||||
|
# undefined, but if they have been used, the file is probably supposed to be
|
||||||
|
# cp1252
|
||||||
|
DEL_AND_C1 = dict.fromkeys(list(range(0x7F, 0x9F)))
|
||||||
|
|
||||||
#-------------------------------------------------------------------------
|
#-------------------------------------------------------------------------
|
||||||
#
|
#
|
||||||
@ -675,7 +679,7 @@ class GedcomDateParser(DateParser):
|
|||||||
#-------------------------------------------------------------------------
|
#-------------------------------------------------------------------------
|
||||||
class Lexer(object):
|
class Lexer(object):
|
||||||
|
|
||||||
def __init__(self, ifile):
|
def __init__(self, ifile, __add_msg):
|
||||||
self.ifile = ifile
|
self.ifile = ifile
|
||||||
self.current_list = []
|
self.current_list = []
|
||||||
self.eof = False
|
self.eof = False
|
||||||
@ -686,6 +690,7 @@ class Lexer(object):
|
|||||||
TOKEN_CONT : self.__fix_token_cont,
|
TOKEN_CONT : self.__fix_token_cont,
|
||||||
TOKEN_CONC : self.__fix_token_conc,
|
TOKEN_CONC : self.__fix_token_conc,
|
||||||
}
|
}
|
||||||
|
self.__add_msg = __add_msg
|
||||||
|
|
||||||
def readline(self):
|
def readline(self):
|
||||||
if len(self.current_list) <= 1 and not self.eof:
|
if len(self.current_list) <= 1 and not self.eof:
|
||||||
@ -724,6 +729,7 @@ class Lexer(object):
|
|||||||
self.eof = True
|
self.eof = True
|
||||||
return
|
return
|
||||||
|
|
||||||
|
original_line = line
|
||||||
try:
|
try:
|
||||||
# According to the GEDCOM 5.5 standard,
|
# According to the GEDCOM 5.5 standard,
|
||||||
# Chapter 1 subsection Grammar
|
# Chapter 1 subsection Grammar
|
||||||
@ -757,6 +763,13 @@ class Lexer(object):
|
|||||||
tag = line[0]
|
tag = line[0]
|
||||||
line_value = line[2]
|
line_value = line[2]
|
||||||
except:
|
except:
|
||||||
|
problem = _("Line ignored ")
|
||||||
|
text = original_line.rstrip('\n\r')
|
||||||
|
prob_width = 66
|
||||||
|
problem = problem.ljust(prob_width)[0:(prob_width-1)]
|
||||||
|
text = text.replace("\n", "\n".ljust(prob_width + 22))
|
||||||
|
message = "%s %s" % (problem, text)
|
||||||
|
self.__add_msg(message)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
token = TOKENS.get(tag, TOKEN_UNKNOWN)
|
token = TOKENS.get(tag, TOKEN_UNKNOWN)
|
||||||
@ -1213,22 +1226,29 @@ class GedInfoParser(object):
|
|||||||
#
|
#
|
||||||
#-------------------------------------------------------------------------
|
#-------------------------------------------------------------------------
|
||||||
class BaseReader(object):
|
class BaseReader(object):
|
||||||
def __init__(self, ifile, encoding):
|
def __init__(self, ifile, encoding, __add_msg):
|
||||||
self.ifile = ifile
|
self.ifile = ifile
|
||||||
self.enc = encoding
|
self.enc = encoding
|
||||||
|
self.__add_msg = __add_msg
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
self.ifile.seek(0)
|
self.ifile.seek(0)
|
||||||
|
|
||||||
def readline(self):
|
def readline(self):
|
||||||
line = self.ifile.readline()
|
raise NotImplemented
|
||||||
line = line.decode(self.enc, errors='replace')
|
|
||||||
return line.translate(STRIP_DICT)
|
def report_error(self, problem, line):
|
||||||
|
line = line.rstrip('\n\r')
|
||||||
|
prob_width = 66
|
||||||
|
problem = problem.ljust(prob_width)[0:(prob_width-1)]
|
||||||
|
text = line.replace("\n", "\n".ljust(prob_width + 22))
|
||||||
|
message = "%s %s" % (problem, text)
|
||||||
|
self.__add_msg(message)
|
||||||
|
|
||||||
class UTF8Reader(BaseReader):
|
class UTF8Reader(BaseReader):
|
||||||
|
|
||||||
def __init__(self, ifile):
|
def __init__(self, ifile, __add_msg):
|
||||||
BaseReader.__init__(self, ifile, 'utf8')
|
BaseReader.__init__(self, ifile, 'utf8', __add_msg)
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
@ -1244,22 +1264,37 @@ class UTF8Reader(BaseReader):
|
|||||||
|
|
||||||
class UTF16Reader(BaseReader):
|
class UTF16Reader(BaseReader):
|
||||||
|
|
||||||
def __init__(self, ifile):
|
def __init__(self, ifile, __add_msg):
|
||||||
new_file = codecs.EncodedFile(ifile, 'utf8', 'utf16')
|
new_file = codecs.EncodedFile(ifile, 'utf8', 'utf16')
|
||||||
BaseReader.__init__(self, new_file, 'utf16')
|
BaseReader.__init__(self, new_file, '', __add_msg)
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def readline(self):
|
def readline(self):
|
||||||
l = self.ifile.readline()
|
line = self.ifile.readline()
|
||||||
if l.strip():
|
line = line.decode('utf8', errors='replace')
|
||||||
return l
|
return line.translate(STRIP_DICT)
|
||||||
else:
|
|
||||||
return self.ifile.readline()
|
|
||||||
|
|
||||||
class AnsiReader(BaseReader):
|
class AnsiReader(BaseReader):
|
||||||
|
|
||||||
def __init__(self, ifile):
|
def __init__(self, ifile, __add_msg):
|
||||||
BaseReader.__init__(self, ifile, 'latin1')
|
BaseReader.__init__(self, ifile, 'latin1', __add_msg)
|
||||||
|
|
||||||
|
def readline(self):
|
||||||
|
line = self.ifile.readline()
|
||||||
|
line = line.decode(self.enc, errors='replace')
|
||||||
|
if line.translate(DEL_AND_C1) != line:
|
||||||
|
self.report_error("DEL or C1 control chars in line did you mean CHAR cp1252??", line)
|
||||||
|
return line.translate(STRIP_DICT)
|
||||||
|
|
||||||
|
class CP1252Reader(BaseReader):
|
||||||
|
|
||||||
|
def __init__(self, ifile, __add_msg):
|
||||||
|
BaseReader.__init__(self, ifile, 'cp1252', __add_msg)
|
||||||
|
|
||||||
|
def readline(self):
|
||||||
|
line = self.ifile.readline()
|
||||||
|
line = line.decode(self.enc, errors='replace')
|
||||||
|
return line.translate(STRIP_DICT)
|
||||||
|
|
||||||
class AnselReader(BaseReader):
|
class AnselReader(BaseReader):
|
||||||
"""
|
"""
|
||||||
@ -1280,7 +1315,8 @@ class AnselReader(BaseReader):
|
|||||||
TODO: should we allow TAB, as a Gramps extension?
|
TODO: should we allow TAB, as a Gramps extension?
|
||||||
"""
|
"""
|
||||||
__printable_ascii = list(map(chr, list(range(32, 127)))) # note: up thru 126
|
__printable_ascii = list(map(chr, list(range(32, 127)))) # note: up thru 126
|
||||||
__use_ASCII = list(map(chr, [10, 27, 29 , 30, 31])) + __printable_ascii
|
# LF CR Esc GS RS US
|
||||||
|
__use_ASCII = list(map(chr, [10, 13, 27, 29 , 30, 31])) + __printable_ascii
|
||||||
|
|
||||||
# mappings of single byte ANSEL codes to unicode
|
# mappings of single byte ANSEL codes to unicode
|
||||||
__onebyte = {
|
__onebyte = {
|
||||||
@ -1293,9 +1329,11 @@ class AnselReader(BaseReader):
|
|||||||
b'\xB4' : '\u00fe', b'\xB5' : '\u00e6', b'\xB6' : '\u0153',
|
b'\xB4' : '\u00fe', b'\xB5' : '\u00e6', b'\xB6' : '\u0153',
|
||||||
b'\xB7' : '\u02ba', b'\xB8' : '\u0131', b'\xB9' : '\u00a3',
|
b'\xB7' : '\u02ba', b'\xB8' : '\u0131', b'\xB9' : '\u00a3',
|
||||||
b'\xBA' : '\u00f0', b'\xBC' : '\u01a1', b'\xBD' : '\u01b0',
|
b'\xBA' : '\u00f0', b'\xBC' : '\u01a1', b'\xBD' : '\u01b0',
|
||||||
|
b'\xBE' : '\u25a1', b'\xBF' : '\u25a0',
|
||||||
b'\xC0' : '\u00b0', b'\xC1' : '\u2113', b'\xC2' : '\u2117',
|
b'\xC0' : '\u00b0', b'\xC1' : '\u2113', b'\xC2' : '\u2117',
|
||||||
b'\xC3' : '\u00a9', b'\xC4' : '\u266f', b'\xC5' : '\u00bf',
|
b'\xC3' : '\u00a9', b'\xC4' : '\u266f', b'\xC5' : '\u00bf',
|
||||||
b'\xC6' : '\u00a1', b'\xC7' : '\u00df', b'\xC8' : '\u20ac',
|
b'\xC6' : '\u00a1', b'\xC7' : '\u00df', b'\xC8' : '\u20ac',
|
||||||
|
b'\xCD' : '\u0065', b'\xCE' : '\u006f', b'\xCF' : '\u00df',
|
||||||
}
|
}
|
||||||
|
|
||||||
# combining forms (in ANSEL, they precede the modified ASCII character
|
# combining forms (in ANSEL, they precede the modified ASCII character
|
||||||
@ -1316,6 +1354,7 @@ class AnselReader(BaseReader):
|
|||||||
b'\xF3' : '\u0324', b'\xF4' : '\u0325', b'\xF5' : '\u0333',
|
b'\xF3' : '\u0324', b'\xF4' : '\u0325', b'\xF5' : '\u0333',
|
||||||
b'\xF6' : '\u0332', b'\xF7' : '\u0326', b'\xF8' : '\u031c',
|
b'\xF6' : '\u0332', b'\xF7' : '\u0326', b'\xF8' : '\u031c',
|
||||||
b'\xF9' : '\u032e', b'\xFA' : '\ufe22', b'\xFB' : '\ufe23',
|
b'\xF9' : '\u032e', b'\xFA' : '\ufe22', b'\xFB' : '\ufe23',
|
||||||
|
b'\xFC' : '\u0338',
|
||||||
b'\xFE' : '\u0313',
|
b'\xFE' : '\u0313',
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1473,49 +1512,56 @@ class AnselReader(BaseReader):
|
|||||||
b'\xF9\x48' : '\u1e2a', b'\xF9\x68' : '\u1e2b',
|
b'\xF9\x48' : '\u1e2a', b'\xF9\x68' : '\u1e2b',
|
||||||
}
|
}
|
||||||
|
|
||||||
@staticmethod
|
def __ansel_to_unicode(self, s):
|
||||||
def __ansel_to_unicode(s):
|
|
||||||
""" Convert an ANSEL encoded string to unicode """
|
""" Convert an ANSEL encoded string to unicode """
|
||||||
|
|
||||||
buff = StringIO()
|
buff = StringIO()
|
||||||
|
error = ""
|
||||||
while s:
|
while s:
|
||||||
if ord(s[0]) < 128:
|
if s[0] < 128:
|
||||||
if s[0] in AnselReader.__use_ASCII:
|
if chr(s[0]) in AnselReader.__use_ASCII:
|
||||||
head = s[0]
|
head = chr(s[0])
|
||||||
else:
|
else:
|
||||||
# substitute space for disallowed (control) chars
|
# substitute space for disallowed (control) chars
|
||||||
|
error += " (%#X)" % s[0]
|
||||||
head = ' '
|
head = ' '
|
||||||
s = s[1:]
|
s = s[1:]
|
||||||
else:
|
else:
|
||||||
if s[0:2] in AnselReader.__twobyte:
|
if s[0:2] in AnselReader.__twobyte:
|
||||||
head = AnselReader.__twobyte[s[0:2]]
|
head = AnselReader.__twobyte[s[0:2]]
|
||||||
s = s[2:]
|
s = s[2:]
|
||||||
elif s[0] in AnselReader.__onebyte:
|
elif bytes([s[0]]) in AnselReader.__onebyte:
|
||||||
head = AnselReader.__onebyte[s[0]]
|
head = AnselReader.__onebyte[bytes([s[0]])]
|
||||||
s = s[1:]
|
s = s[1:]
|
||||||
elif s[0] in AnselReader.__acombiners:
|
elif bytes([s[0]]) in AnselReader.__acombiners:
|
||||||
c = AnselReader.__acombiners[s[0]]
|
c = AnselReader.__acombiners[bytes([s[0]])]
|
||||||
# always consume the combiner
|
# always consume the combiner
|
||||||
s = s[1:]
|
s = s[1:]
|
||||||
next = s[0]
|
next_byte = s[0]
|
||||||
if next in AnselReader.__printable_ascii:
|
if next_byte < 128 and chr(next_byte) in AnselReader.__printable_ascii:
|
||||||
# consume next as well
|
# consume next as well
|
||||||
s = s[1:]
|
s = s[1:]
|
||||||
# unicode: combiner follows base-char
|
# unicode: combiner follows base-char
|
||||||
head = next + c
|
head = chr(next_byte) + c
|
||||||
else:
|
else:
|
||||||
# just drop the unexpected combiner
|
# just drop the unexpected combiner
|
||||||
|
error += " (%#X)" % s[0]
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
|
error += " (%#X)" % s[0]
|
||||||
head = '\ufffd' # "Replacement Char"
|
head = '\ufffd' # "Replacement Char"
|
||||||
s = s[1:]
|
s = s[1:]
|
||||||
buff.write(head.encode("utf-8"))
|
buff.write(head)
|
||||||
ans = buff.getvalue().decode("utf-8")
|
ans = buff.getvalue()
|
||||||
|
|
||||||
|
if error:
|
||||||
|
# e.g. Illegal character (oxAB) (0xCB)... 1 NOTE xyz?pqr?lmn
|
||||||
|
self.report_error(_("Illegal character%s") % error, ans)
|
||||||
buff.close()
|
buff.close()
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
def __init__(self, ifile):
|
def __init__(self, ifile, __add_msg):
|
||||||
BaseReader.__init__(self, ifile, "")
|
BaseReader.__init__(self, ifile, "", __add_msg)
|
||||||
|
|
||||||
def readline(self):
|
def readline(self):
|
||||||
return self.__ansel_to_unicode(self.ifile.readline())
|
return self.__ansel_to_unicode(self.ifile.readline())
|
||||||
@ -2617,15 +2663,17 @@ class GedcomParser(UpdateCallback):
|
|||||||
enc = stage_one.get_encoding()
|
enc = stage_one.get_encoding()
|
||||||
|
|
||||||
if enc == "ANSEL":
|
if enc == "ANSEL":
|
||||||
rdr = AnselReader(ifile)
|
rdr = AnselReader(ifile, self.__add_msg)
|
||||||
elif enc in ("UTF-8", "UTF8"):
|
elif enc in ("UTF-8", "UTF8"):
|
||||||
rdr = UTF8Reader(ifile)
|
rdr = UTF8Reader(ifile, self.__add_msg)
|
||||||
elif enc in ("UTF-16", "UTF16", "UNICODE"):
|
elif enc in ("UTF-16LE", "UTF-16BE", "UTF16", "UNICODE"):
|
||||||
rdr = UTF16Reader(ifile)
|
rdr = UTF16Reader(ifile, self.__add_msg)
|
||||||
|
elif enc in ("CP1252", "WINDOWS-1252"):
|
||||||
|
rdr = CP1252Reader(ifile, self.__add_msg)
|
||||||
else:
|
else:
|
||||||
rdr = AnsiReader(ifile)
|
rdr = AnsiReader(ifile, self.__add_msg)
|
||||||
|
|
||||||
self.lexer = Lexer(rdr)
|
self.lexer = Lexer(rdr, self.__add_msg)
|
||||||
self.filename = filename
|
self.filename = filename
|
||||||
self.backoff = False
|
self.backoff = False
|
||||||
|
|
||||||
@ -7129,8 +7177,13 @@ class GedcomParser(UpdateCallback):
|
|||||||
sattr.set_value(line.data)
|
sattr.set_value(line.data)
|
||||||
self.def_src.add_attribute(sattr)
|
self.def_src.add_attribute(sattr)
|
||||||
elif line.token == TOKEN_FORM:
|
elif line.token == TOKEN_FORM:
|
||||||
if line.data != "LINEAGE-LINKED":
|
if line.data == "LINEAGE-LINKED":
|
||||||
self.__add_msg(_("GEDCOM form not supported"), line, state)
|
pass
|
||||||
|
elif line.data.upper() == "LINEAGE-LINKED":
|
||||||
|
# Allow Lineage-Linked etc. though it should be in uppercase
|
||||||
|
self.__add_msg(_("GEDCOM FORM should be in uppercase"), line, state)
|
||||||
|
else:
|
||||||
|
self.__add_msg(_("GEDCOM FORM not supported"), line, state)
|
||||||
if self.use_def_src:
|
if self.use_def_src:
|
||||||
sattr = SrcAttribute()
|
sattr = SrcAttribute()
|
||||||
sattr.set_type(_('GEDCOM form'))
|
sattr.set_type(_('GEDCOM form'))
|
||||||
@ -7675,7 +7728,7 @@ class GedcomStageOne(object):
|
|||||||
input_file.read(1)
|
input_file.read(1)
|
||||||
self.enc = "UTF8"
|
self.enc = "UTF8"
|
||||||
return input_file
|
return input_file
|
||||||
elif line == b"\xff\xfe":
|
elif line == b"\xff\xfe" or line == b"\xfe\xff":
|
||||||
self.enc = "UTF16"
|
self.enc = "UTF16"
|
||||||
input_file.seek(0)
|
input_file.seek(0)
|
||||||
return codecs.EncodedFile(input_file, 'utf8', 'utf16')
|
return codecs.EncodedFile(input_file, 'utf8', 'utf16')
|
||||||
@ -7696,25 +7749,30 @@ class GedcomStageOne(object):
|
|||||||
reader = self.__detect_file_decoder(self.ifile)
|
reader = self.__detect_file_decoder(self.ifile)
|
||||||
|
|
||||||
for line in reader:
|
for line in reader:
|
||||||
|
# Treat the file as though it is UTF-8 since this will be right if a
|
||||||
|
# BOM was detected; it is the more modern option; and anyway it
|
||||||
|
# doesn't really matter as we are only trying to detect a CHAR line
|
||||||
|
# which is only 7-bit ASCII anyway, and we ignore anything that
|
||||||
|
# can't be translated.
|
||||||
|
line = line.decode(encoding='utf-8', errors='replace')
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if not line:
|
if not line:
|
||||||
continue
|
continue
|
||||||
self.lcnt += 1
|
self.lcnt += 1
|
||||||
|
|
||||||
data = line.split(None, 2) + ['']
|
|
||||||
try:
|
try:
|
||||||
|
data = line.split(None, 2) + ['']
|
||||||
(level, key, value) = data[:3]
|
(level, key, value) = data[:3]
|
||||||
level = int(level)
|
level = int(level)
|
||||||
key = conv_to_unicode(key.strip())
|
key = key.strip()
|
||||||
value = conv_to_unicode(value.strip())
|
value = value.strip()
|
||||||
except:
|
except:
|
||||||
LOG.warn(_("Invalid line %d in GEDCOM file.") % self.lcnt)
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if level == 0 and key[0] == '@':
|
if level == 0 and key[0] == '@':
|
||||||
if value == ("FAM", "FAMILY") :
|
if value in ("FAM", "FAMILY") :
|
||||||
current_family_id = key.strip()[1:-1]
|
current_family_id = key.strip()[1:-1]
|
||||||
elif value == ("INDI", "INDIVIDUAL"):
|
elif value in ("INDI", "INDIVIDUAL"):
|
||||||
self.pcnt += 1
|
self.pcnt += 1
|
||||||
elif key in ("HUSB", "HUSBAND", "WIFE") and \
|
elif key in ("HUSB", "HUSBAND", "WIFE") and \
|
||||||
self.__is_xref_value(value):
|
self.__is_xref_value(value):
|
||||||
@ -7724,6 +7782,9 @@ class GedcomStageOne(object):
|
|||||||
elif key == 'CHAR' and not self.enc:
|
elif key == 'CHAR' and not self.enc:
|
||||||
assert(isinstance(value, str))
|
assert(isinstance(value, str))
|
||||||
self.enc = value
|
self.enc = value
|
||||||
|
LOG.debug("parse pcnt %d" % self.pcnt)
|
||||||
|
LOG.debug("parse famc %s" % dict(self.famc))
|
||||||
|
LOG.debug("parse fams %s" % dict(self.fams))
|
||||||
|
|
||||||
def get_famc_map(self):
|
def get_famc_map(self):
|
||||||
"""
|
"""
|
||||||
|
Loading…
Reference in New Issue
Block a user