Move GedcomStageOne.py into GedcomParse.py.

svn: r13936
This commit is contained in:
Brian Matherly 2009-12-29 05:37:20 +00:00
parent 4f3d8f3675
commit c7b595210f
5 changed files with 148 additions and 213 deletions

View File

@ -342,7 +342,6 @@ src/docgen/SpreadSheetDoc.py
src/docgen/TextBufDoc.py
# GrampsDbUtils package
src/GrampsDbUtils/_GedcomStageOne.py
src/GrampsDbUtils/_GedcomParse.py
src/GrampsDbUtils/_GedcomTokens.py
src/GrampsDbUtils/__init__.py

View File

@ -7,7 +7,6 @@ pkgdatadir = $(datadir)/@PACKAGE@/GrampsDbUtils
pkgdata_PYTHON = \
_GedcomParse.py\
_GedcomStageOne.py\
_GedcomTokens.py\
_GedcomUtils.py\
__init__.py

View File

@ -100,7 +100,7 @@ from xml.parsers.expat import ParserCreate
#
#------------------------------------------------------------------------
import logging
LOG = logging.getLogger(".GedcomImport")
LOG = logging.getLogger(".libgedcom")
#-------------------------------------------------------------------------
#
@ -5051,4 +5051,148 @@ class GedcomParser(UpdateCallback):
"""
state.res.set_phone(line.data)
#===eof===
#-------------------------------------------------------------------------
#
# GedcomStageOne
#
#-------------------------------------------------------------------------
class GedcomStageOne(object):
"""
The GedcomStageOne parser scans the file quickly, looking for a few things.
This includes:
1. Character set encoding
2. Number of people and families in the list
3. Child to family references, since Ancestry.com creates GEDCOM files
without the FAMC references.
"""
__BAD_UTF16 = _("Your GEDCOM file is corrupted. "
"The file appears to be encoded using the UTF16 "
"character set, but is missing the BOM marker.")
__EMPTY_GED = _("Your GEDCOM file is empty.")
@staticmethod
def __is_xref_value(value):
"""
Return True if value is in the form of a XREF value. We assume that
if we have a leading '@' character, then we are okay.
"""
return value and value[0] == '@'
@staticmethod
def __add_to_list(table, key, value):
"""
Add the value to the table entry associated with key. If the entry
does not exist, it is added.
"""
if key in table:
table[key].append(value)
else:
table[key] = [value]
def __init__(self, ifile):
self.ifile = ifile
self.famc = {}
self.fams = {}
self.enc = ""
self.pcnt = 0
self.lcnt = 0
def __detect_file_decoder(self, input_file):
"""
Detects the file encoding of the file by looking for a BOM
(byte order marker) in the GEDCOM file. If we detect a UTF-16
encoded file, we must connect to a wrapper using the codecs
package.
"""
line = input_file.read(2)
if line == "\xef\xbb":
input_file.read(1)
self.enc = "UTF8"
return input_file
elif line == "\xff\xfe":
self.enc = "UTF16"
input_file.seek(0)
return codecs.EncodedFile(input_file, 'utf8', 'utf16')
elif not line :
raise Errors.GedcomError(self.__EMPTY_GED)
elif line[0] == "\x00" or line[1] == "\x00":
raise Errors.GedcomError(self.__BAD_UTF16)
else:
input_file.seek(0)
return input_file
def parse(self):
"""
Parse the input file.
"""
current_family_id = ""
reader = self.__detect_file_decoder(self.ifile)
for line in reader:
line = line.strip()
if not line:
continue
self.lcnt += 1
data = line.split(None, 2) + ['']
try:
(level, key, value) = data[:3]
value = value.strip()
level = int(level)
key = key.strip()
except:
LOG.warn(_("Invalid line %d in GEDCOM file.") % self.lcnt)
continue
if level == 0 and key[0] == '@':
if value == ("FAM", "FAMILY") :
current_family_id = key.strip()[1:-1]
elif value == ("INDI", "INDIVIDUAL"):
self.pcnt += 1
elif key in ("HUSB", "HUSBAND", "WIFE") and \
self.__is_xref_value(value):
self.__add_to_list(self.fams, value[1:-1], current_family_id)
elif key in ("CHIL", "CHILD") and self.__is_xref_value(value):
self.__add_to_list(self.famc, value[1:-1], current_family_id)
elif key == 'CHAR' and not self.enc:
assert(isinstance(value, basestring))
self.enc = value
def get_famc_map(self):
"""
Return the Person to Child Family map
"""
return self.famc
def get_fams_map(self):
"""
Return the Person to Family map (where the person is a spouse)
"""
return self.fams
def get_encoding(self):
"""
Return the detected encoding
"""
return self.enc.upper()
def set_encoding(self, enc):
"""
Forces the encoding
"""
assert(isinstance(enc, basestring))
self.enc = enc
def get_person_count(self):
"""
Return the number of INDI records found
"""
return self.pcnt
def get_line_count(self):
"""
Return the number of lines in the file
"""
return self.lcnt

View File

@ -1,206 +0,0 @@
#
# Gramps - a GTK+/GNOME based genealogy program
#
# Copyright (C) 2000-2007 Donald N. Allingham
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# $Id$
"""
Import from GEDCOM
"""
#-------------------------------------------------------------------------
#
# standard python modules
#
#-------------------------------------------------------------------------
import codecs
from gettext import gettext as _
#-------------------------------------------------------------------------
#
# GRAMPS modules
#
#-------------------------------------------------------------------------
import Errors
#------------------------------------------------------------------------
#
# Set up logging
#
#------------------------------------------------------------------------
import logging
LOG = logging.getLogger(".GedcomImport")
#-------------------------------------------------------------------------
#
# Constants
#
#-------------------------------------------------------------------------
BAD_UTF16 = _("Your GEDCOM file is corrupted. "
"The file appears to be encoded using the UTF16 "
"character set, but is missing the BOM marker.")
EMPTY_GED = _("Your GEDCOM file is empty.")
#-------------------------------------------------------------------------
#
# is_xref_value
#
#-------------------------------------------------------------------------
def is_xref_value(value):
"""
Return True if value is in the form of a XREF value. We assume that
if we have a leading '@' character, then we are okay.
"""
return value and value[0] == '@'
#-------------------------------------------------------------------------
#
# add_to_list
#
#-------------------------------------------------------------------------
def add_to_list(table, key, value):
"""
Add the value to the table entry associated with key. If the entry
does not exist, it is added.
"""
if key in table:
table[key].append(value)
else:
table[key] = [value]
#-------------------------------------------------------------------------
#
# StageOne
#
#-------------------------------------------------------------------------
class StageOne(object):
"""
The StageOne parser scans the file quickly, looking for a few things. This
includes:
1. Character set encoding
2. Number of people and families in the list
3. Child to family references, since Ancestry.com creates GEDCOM files
without the FAMC references.
"""
def __init__(self, ifile):
self.ifile = ifile
self.famc = {}
self.fams = {}
self.enc = ""
self.pcnt = 0
self.lcnt = 0
def __detect_file_decoder(self, input_file):
"""
Detects the file encoding of the file by looking for a BOM
(byte order marker) in the GEDCOM file. If we detect a UTF-16
encoded file, we must connect to a wrapper using the codecs
package.
"""
line = input_file.read(2)
if line == "\xef\xbb":
input_file.read(1)
self.enc = "UTF8"
return input_file
elif line == "\xff\xfe":
self.enc = "UTF16"
input_file.seek(0)
return codecs.EncodedFile(input_file, 'utf8', 'utf16')
elif not line :
raise Errors.GedcomError(EMPTY_GED)
elif line[0] == "\x00" or line[1] == "\x00":
raise Errors.GedcomError(BAD_UTF16)
else:
input_file.seek(0)
return input_file
def parse(self):
"""
Parse the input file.
"""
current_family_id = ""
reader = self.__detect_file_decoder(self.ifile)
for line in reader:
line = line.strip()
if not line:
continue
self.lcnt += 1
data = line.split(None, 2) + ['']
try:
(level, key, value) = data[:3]
value = value.strip()
level = int(level)
key = key.strip()
except:
LOG.warn(_("Invalid line %d in GEDCOM file.") % self.lcnt)
continue
if level == 0 and key[0] == '@':
if value == ("FAM", "FAMILY") :
current_family_id = key.strip()[1:-1]
elif value == ("INDI", "INDIVIDUAL"):
self.pcnt += 1
elif key in ("HUSB", "HUSBAND", "WIFE") and is_xref_value(value):
add_to_list(self.fams, value[1:-1], current_family_id)
elif key in ("CHIL", "CHILD") and is_xref_value(value):
add_to_list(self.famc, value[1:-1], current_family_id)
elif key == 'CHAR' and not self.enc:
assert(isinstance(value, basestring))
self.enc = value
def get_famc_map(self):
"""
Return the Person to Child Family map
"""
return self.famc
def get_fams_map(self):
"""
Return the Person to Family map (where the person is a spouse)
"""
return self.fams
def get_encoding(self):
"""
Return the detected encoding
"""
return self.enc.upper()
def set_encoding(self, enc):
"""
Forces the encoding
"""
assert(isinstance(enc, basestring))
self.enc = enc
def get_person_count(self):
"""
Return the number of INDI records found
"""
return self.pcnt
def get_line_count(self):
"""
Return the number of lines in the file
"""
return self.lcnt

View File

@ -43,8 +43,7 @@ LOG = logging.getLogger(".GedcomImport")
#
#------------------------------------------------------------------------
import Errors
from GrampsDbUtils._GedcomParse import GedcomParser
from GrampsDbUtils._GedcomStageOne import StageOne
from GrampsDbUtils._GedcomParse import GedcomParser, GedcomStageOne
from QuestionDialog import ErrorDialog, DBErrorDialog
from glade import Glade
from libmixin import DbMixin
@ -103,7 +102,7 @@ def importData(database, filename, callback=None):
try:
ifile = open(filename, "rU")
stage_one = StageOne(ifile)
stage_one = GedcomStageOne(ifile)
stage_one.parse()
if code_set: