From 706916af15b3fdc7d83f595848389663e8e2cb3c Mon Sep 17 00:00:00 2001 From: Don Allingham Date: Sun, 25 Feb 2007 05:26:32 +0000 Subject: [PATCH] 2007-02-24 Don Allingham * src/DisplayTabs/_NoteModel.py: added * src/DisplayTabs/_NoteTab.py: support new list * src/GrampsDbUtils/_GedcomParse.py: enhancements to parsing * src/GrampsDbUtils/_ReadGedcom.py: handle encoding properly * src/GrampsDbUtils/_GedcomChar.py: new encoding interface * src/GrampsDbUtils/_GedcomLex.py: cleanup svn: r8231 --- ChangeLog | 8 ++ src/DisplayTabs/_NoteModel.py | 46 +++++++ src/DisplayTabs/_NoteTab.py | 173 +++++-------------------- src/GrampsDbUtils/_GedcomChar.py | 76 +++++++++++ src/GrampsDbUtils/_GedcomLex.py | 201 ++++++++++-------------------- src/GrampsDbUtils/_GedcomParse.py | 116 ++++++----------- src/GrampsDbUtils/_ReadGedcom.py | 7 +- 7 files changed, 268 insertions(+), 359 deletions(-) create mode 100644 src/DisplayTabs/_NoteModel.py create mode 100644 src/GrampsDbUtils/_GedcomChar.py diff --git a/ChangeLog b/ChangeLog index 145bc9a6c..6d7561a1b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2007-02-24 Don Allingham + * src/DisplayTabs/_NoteModel.py: added + * src/DisplayTabs/_NoteTab.py: support new list + * src/GrampsDbUtils/_GedcomParse.py: enhancements to parsing + * src/GrampsDbUtils/_ReadGedcom.py: handle encoding properly + * src/GrampsDbUtils/_GedcomChar.py: new encoding interface + * src/GrampsDbUtils/_GedcomLex.py: cleanup + 2007-02-24 Brian Matherly * src/docgen/SvgDrawDoc.py.py: Fix XML error in draw_text. diff --git a/src/DisplayTabs/_NoteModel.py b/src/DisplayTabs/_NoteModel.py new file mode 100644 index 000000000..f4a128c5e --- /dev/null +++ b/src/DisplayTabs/_NoteModel.py @@ -0,0 +1,46 @@ +# +# Gramps - a GTK+/GNOME based genealogy program +# +# Copyright (C) 2000-2006 Donald N. Allingham +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# + +# $Id: _NoteModel.py 7068 2006-07-24 23:06:49Z rshura $ + +#------------------------------------------------------------------------- +# +# GTK libraries +# +#------------------------------------------------------------------------- +import gtk + +#------------------------------------------------------------------------- +# +# NoteModel +# +#------------------------------------------------------------------------- +class NoteModel(gtk.ListStore): + + def __init__(self, note_list, db): + gtk.ListStore.__init__(self, str, str, object) + self.db = db + for handle in note_list: + note = self.db.get_note_from_handle(handle) + self.append(row=[ + str(note.get_type()), + note.get().replace('\n', ' ')[:80], + handle, + ]) diff --git a/src/DisplayTabs/_NoteTab.py b/src/DisplayTabs/_NoteTab.py index e5824a4bc..37422e87c 100644 --- a/src/DisplayTabs/_NoteTab.py +++ b/src/DisplayTabs/_NoteTab.py @@ -27,169 +27,60 @@ #------------------------------------------------------------------------- from gettext import gettext as _ -#------------------------------------------------------------------------- -# -# GTK libraries -# -#------------------------------------------------------------------------- -import gtk -import pango - #------------------------------------------------------------------------- # # GRAMPS classes # #------------------------------------------------------------------------- import Spell -from _GrampsTab import GrampsTab from DisplayTabs import log -from MarkupText import EditorBuffer +from _NoteModel import NoteModel +from _EmbeddedList import EmbeddedList #------------------------------------------------------------------------- # # NoteTab # #------------------------------------------------------------------------- -class NoteTab(GrampsTab): +class NoteTab(EmbeddedList): - def __init__(self, dbstate, uistate, track, note_list, title=_('Note')): - self.note_list = note_list - self.original = note_list[:] + _HANDLE_COL = 2 - GrampsTab.__init__(self, dbstate, uistate, track, title) - self.show_all() + _column_names = [ + (_('Type'), 0, 100), + (_('Preview'), 1, 200), + ] - def get_icon_name(self): - return 'gramps-notes' + def __init__(self, dbstate, uistate, track, data): + self.data = data + EmbeddedList.__init__(self, dbstate, uistate, track, + _("Notes"), NoteModel) - def _update_label(self, *obj): - cc = self.buf.get_char_count() - if cc == 0 and not self.empty: - self.empty = True - self._set_label() - elif cc != 0 and self.empty: - self.empty = False - self._set_label() + def get_editor(self): + pass - def is_empty(self): - """ - Indicates if the tab contains any data. This is used to determine - how the label should be displayed. - """ - return self.buf.get_char_count() == 0 + def get_user_values(self): + return [] - def build_interface(self): - BUTTON = [(_('Italic'),gtk.STOCK_ITALIC,'i','I'), - (_('Bold'),gtk.STOCK_BOLD,'b','B'), - (_('Underline'),gtk.STOCK_UNDERLINE,'u','U'), - #('Separator', None, None, None), - ] + def get_data(self): + return self.data - vbox = gtk.VBox() + def column_order(self): + return ((1, 0), (1, 1)) - self.text = gtk.TextView() - self.text.set_accepts_tab(True) - # Accelerator dictionary used for formatting shortcuts - # key: tuple(key, modifier) - # value: widget, to emit 'activate' signal on - self.accelerator = {} - self.text.connect('key-press-event', self._on_key_press_event) + def add_button_clicked(self, obj): + pass - self.flowed = gtk.RadioButton(None, _('Flowed')) - self.format = gtk.RadioButton(self.flowed, _('Formatted')) - -# if self.note_obj and self.note_obj.get_format(): -# self.format.set_active(True) -# self.text.set_wrap_mode(gtk.WRAP_NONE) -# else: -# self.flowed.set_active(True) -# self.text.set_wrap_mode(gtk.WRAP_WORD) - self.spellcheck = Spell.Spell(self.text) - - self.flowed.connect('toggled', self.flow_changed) - - scroll = gtk.ScrolledWindow() - scroll.set_policy(gtk.POLICY_AUTOMATIC, gtk.POLICY_AUTOMATIC) - scroll.add(self.text) - # FIXME: is this signal called at all - scroll.connect('focus-out-event', self.update) - - vbox.pack_start(scroll, True) - vbox.set_spacing(6) - vbox.set_border_width(6) - - hbox = gtk.HBox() - hbox.set_spacing(12) - hbox.set_border_width(6) - hbox.pack_start(self.flowed, False) - hbox.pack_start(self.format, False) - vbox.pack_start(hbox, False) - self.pack_start(vbox, True) - - self.buf = EditorBuffer() - self.text.set_buffer(self.buf) - tooltips = gtk.Tooltips() - for tip, stock, markup, accel in BUTTON: - if markup: - button = gtk.ToggleButton() - image = gtk.Image() - image.set_from_stock(stock, gtk.ICON_SIZE_MENU) - button.set_image(image) - button.set_relief(gtk.RELIEF_NONE) - tooltips.set_tip(button, tip) - self.buf.setup_widget_from_xml(button, markup) - key, mod = gtk.accelerator_parse(accel) - self.accelerator[(key, mod)] = button - hbox.pack_start(button, False) - else: - hbox.pack_start(gtk.VSeparator(), False) - hbox.pack_start(gtk.Label(_('Additional Notes:')),False) - self.menu = gtk.ComboBox() - hbox.pack_start(self.menu, True) - -# if self.note_obj: -# self.empty = False -# self.buf.set_text(self.note_obj.get(markup=True)) -# log.debug("Text: %s" % self.buf.get_text()) -# else: -# self.empty = True - - self.buf.connect('changed', self.update) - self.buf.connect_after('apply-tag', self.update) - self.buf.connect_after('remove-tag', self.update) + def add_callback(self, name): + self.get_data().append(name) + self.changed = True self.rebuild() - def _on_key_press_event(self, widget, event): - log.debug("Key %s (%d) was pressed on %s" % - (gtk.gdk.keyval_name(event.keyval), event.keyval, widget)) - key = event.keyval - mod = event.state - if self.accelerator.has_key((key, mod)): - self.accelerator[(key, mod)].emit('activate') - return True + def edit_button_clicked(self, obj): + note = self.get_selected() + if note: + print note - def update(self, obj, *args): -# if self.note_obj: -# start = self.buf.get_start_iter() -# stop = self.buf.get_end_iter() -# text = self.buf.get_text(start, stop) -# self.note_obj.set(text) -# else: -# print "NOTE OBJ DOES NOT EXIST" - self._update_label(obj) - return False - - def flow_changed(self, obj): - if obj.get_active(): - self.text.set_wrap_mode(gtk.WRAP_WORD) -# self.note_obj.set_format(0) - else: - self.text.set_wrap_mode(gtk.WRAP_NONE) -# self.note_obj.set_format(1) - - def rebuild(self): - self._set_label() - - def cancel(self): - pass -# self.note_obj.unserialize(self.original) + def edit_callback(self, name): + self.changed = True + self.rebuild() diff --git a/src/GrampsDbUtils/_GedcomChar.py b/src/GrampsDbUtils/_GedcomChar.py new file mode 100644 index 000000000..1e68410da --- /dev/null +++ b/src/GrampsDbUtils/_GedcomChar.py @@ -0,0 +1,76 @@ +# +# Gramps - a GTK+/GNOME based genealogy program +# +# Copyright (C) 2000-2005 Donald N. Allingham +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# + +from ansel_utf8 import ansel_to_utf8 + +class BaseReader: + def __init__(self, ifile, encoding): + self.ifile = ifile + self.enc = encoding + + def reset(self): + self.ifile.seek(0) + + def readline(self): + return unicode(self.ifile.readline(), + encoding=self.enc, + errors='replace').strip('\n\r') + +class UTF8Reader(BaseReader): + + def __init__(self, ifile): + BaseReader.__init__(self, ifile, 'utf8') + + def reset(self): + self.ifile.seek(0) + data = self.ifile.read(3) + if data != "\xef\xbb\xbf": + self.ifile.seek(0) + + def readline(self): + return unicode(self.ifile.readline(), + encoding=self.enc, + errors='replace').strip('\n\r') + +class UTF16Reader(BaseReader): + + def __init__(self, ifile): + BaseReader.__init__(self, ifile, 'utf16') + + def reset(self): + self.ifile.seek(0) + data = self.ifile.read(2) + if data != "\xff\xfe": + self.ifile.seek(0) + +class AnsiReader(BaseReader): + + def __init__(self, ifile): + BaseReader.__init__(self, ifile, 'latin1') + +class AnselReader(BaseReader): + + def __init__(self, ifile): + BaseReader.__init__(self, ifile, "") + + def readline(self): + return ansel_to_utf8(self.ifile.readline().strip('\n\r')) + + diff --git a/src/GrampsDbUtils/_GedcomLex.py b/src/GrampsDbUtils/_GedcomLex.py index 6c8ff988a..dc7a75d51 100644 --- a/src/GrampsDbUtils/_GedcomLex.py +++ b/src/GrampsDbUtils/_GedcomLex.py @@ -22,21 +22,22 @@ "Import from GEDCOM" +__revision__ = "$Revision: $" +__author__ = "Don Allingham" + #------------------------------------------------------------------------- # # standard python modules # #------------------------------------------------------------------------- + import re -import string -from gettext import gettext as _ #------------------------------------------------------------------------- # # GRAMPS modules # #------------------------------------------------------------------------- -from ansel_utf8 import ansel_to_utf8 from _GedcomInfo import * from _GedcomTokens import * @@ -45,60 +46,25 @@ from DateHandler._DateParser import DateParser #------------------------------------------------------------------------- # -# latin/utf8 conversions -# +# constants # #------------------------------------------------------------------------- -def utf8_to_latin(msg): - """ - Converts a string from unicode to iso-8859-1. If any illegal characters - are found, they are converted to ? - - @param msg: unicode string to convert - @type level: unicode - @return: Returns the string, converted to a ISO-8859-1 object - @rtype: str - """ - return msg.encode('iso-8859-1', 'replace') - -def latin_to_utf8(s): - if type(s) == unicode: - return s - else: - return unicode(s,'iso-8859-1') - -def nocnv(s): - return unicode(s,errors='replace') - -#------------------------------------------------------------------------- -# -# constants -# -#------------------------------------------------------------------------- -ANSEL = 1 -UNICODE = 2 -UPDATE = 25 - -_transtable = string.maketrans('','') -_delc = _transtable[0:8] + _transtable[10:31] -_transtable2 = _transtable[0:128] + ('?' * 128) - -ged2gramps = {} +GED2GRAMPS = {} for _val in personalConstantEvents.keys(): _key = personalConstantEvents[_val] if _key != "": - ged2gramps[_key] = _val + GED2GRAMPS[_key] = _val for _val in familyConstantEvents.keys(): _key = familyConstantEvents[_val] if _key != "": - ged2gramps[_key] = _val + GED2GRAMPS[_key] = _val -ged2attr = {} +GED2ATTR = {} for _val in personalConstantAttributes.keys(): _key = personalConstantAttributes[_val] if _key != "": - ged2attr[_key] = _val + GED2ATTR[_key] = _val #------------------------------------------------------------------------- # @@ -106,26 +72,24 @@ for _val in personalConstantAttributes.keys(): # #------------------------------------------------------------------------- -intRE = re.compile(r"\s*(\d+)\s*$") -modRegexp = re.compile(r"\s*(INT|EST|CAL)\s+(.*)$") -calRegexp = re.compile(r"\s*(ABT|BEF|AFT)?\s*@#D([^@]+)@\s*(.*)$") -rangeRegexp = re.compile(r"\s*BET\s+@#D([^@]+)@\s*(.*)\s+AND\s+@#D([^@]+)@\s*(.*)$") -spanRegexp = re.compile(r"\s*FROM\s+@#D([^@]+)@\s*(.*)\s+TO\s+@#D([^@]+)@\s*(.*)$") -intRegexp = re.compile(r"\s*INT\s+([^(]+)\((.*)\)$") +MOD = re.compile(r"\s*(INT|EST|CAL)\s+(.*)$") +CAL = re.compile(r"\s*(ABT|BEF|AFT)?\s*@#D([^@]+)@\s*(.*)$") +RANGE = re.compile(r"\s*BET\s+@#D([^@]+)@\s*(.*)\s+AND\s+@#D([^@]+)@\s*(.*)$") +SPAN = re.compile(r"\s*FROM\s+@#D([^@]+)@\s*(.*)\s+TO\s+@#D([^@]+)@\s*(.*)$") -_calendar_map = { +CALENDAR_MAP = { "FRENCH R" : RelLib.Date.CAL_FRENCH, "JULIAN" : RelLib.Date.CAL_JULIAN, "HEBREW" : RelLib.Date.CAL_HEBREW, } -_quality_map = { +QUALITY_MAP = { 'CAL' : RelLib.Date.QUAL_CALCULATED, 'INT' : RelLib.Date.QUAL_CALCULATED, 'EST' : RelLib.Date.QUAL_ESTIMATED, } -_sex_map = { +SEX_MAP = { 'F' : RelLib.Person.FEMALE, 'M' : RelLib.Person.MALE, } @@ -185,20 +149,21 @@ class GedLine: self.data = data[2] if self.level == 0: - if self.token_text and self.token_text[0] == '@' and self.token_text[-1] == '@': + if self.token_text and self.token_text[0] == '@' \ + and self.token_text[-1] == '@': self.token = TOKEN_ID self.token_text = self.token_text[1:-1] self.data = self.data.strip() else: - f = MAP_DATA.get(self.token) - if f: - f(self) + func = MAP_DATA.get(self.token) + if func: + func(self) def calc_sex(self): """ Converts the data field to a RelLib token indicating the gender """ - self.data = _sex_map.get(self.data.strip(),RelLib.Person.UNKNOWN) + self.data = SEX_MAP.get(self.data.strip(), RelLib.Person.UNKNOWN) def calc_date(self): """ @@ -212,12 +177,12 @@ class GedLine: change the type from UNKNOWN to TOKEN_GEVENT (gedcom event), and the data is assigned to the associated GRAMPS EventType """ - token = ged2gramps.get(self.token_text) + token = GED2GRAMPS.get(self.token_text) if token: self.token = TOKEN_GEVENT self.data = token else: - token = ged2attr.get(self.token_text) + token = GED2ATTR.get(self.token_text) if token: attr = RelLib.Attribute() attr.set_value(self.data) @@ -226,10 +191,10 @@ class GedLine: self.data = attr def calc_note(self): - d = self.data.strip() - if len(d) > 2 and d[0] == '@' and d[-1] == '@': + gid = self.data.strip() + if len(gid) > 2 and gid[0] == '@' and gid[-1] == '@': self.token = TOKEN_RNOTE - self.data = d[1:-1] + self.data = gid[1:-1] def calc_nchi(self): attr = RelLib.Attribute() @@ -245,10 +210,6 @@ class GedLine: self.data = attr self.token = TOKEN_ATTR - def calc_lds(self): - self.data = _ - self.token = TOKEN_ATTR - def __repr__(self): return "%d: %d (%d:%s) %s" % (self.line, self.level, self.token, self.token_text, self.data) @@ -276,7 +237,7 @@ MAP_DATA = { # #------------------------------------------------------------------------- -_dp = GedcomDateParser() +DATE_CNV = GedcomDateParser() def extract_date(text): """ @@ -285,54 +246,55 @@ def extract_date(text): dateobj = RelLib.Date() try: # extract out the MOD line - match = modRegexp.match(text) + match = MOD.match(text) if match: (mod, text) = match.groups() - qual = _quality_map.get(mod, RelLib.Date.QUAL_NONE) + qual = QUALITY_MAP.get(mod, RelLib.Date.QUAL_NONE) else: qual = RelLib.Date.QUAL_NONE # parse the range if we match, if so, return - match = rangeRegexp.match(text) + match = RANGE.match(text) if match: - (cal1,data1,cal2,data2) = match.groups() + (cal1, data1, cal2, data2) = match.groups() - cal = _calendar_map.get(cal1, RelLib.Date.CAL_GREGORIAN) + cal = CALENDAR_MAP.get(cal1, RelLib.Date.CAL_GREGORIAN) - start = _dp.parse(data1) - stop = _dp.parse(data2) + start = DATE_CNV.parse(data1) + stop = DATE_CNV.parse(data2) dateobj.set(RelLib.Date.QUAL_NONE, RelLib.Date.MOD_RANGE, cal, start.get_start_date() + stop.get_start_date()) dateobj.set_quality(qual) return dateobj # parse a span if we match - match = spanRegexp.match(text) + match = SPAN.match(text) if match: - (cal1,data1,cal2,data2) = match.groups() + (cal1, data1, cal2, data2) = match.groups() - cal = _calendar_map.get(cal1, RelLib.Date.CAL_GREGORIAN) + cal = CALENDAR_MAP.get(cal1, RelLib.Date.CAL_GREGORIAN) - start = _dp.parse(data1) - stop = _dp.parse(data2) + start = DATE_CNV.parse(data1) + stop = DATE_CNV.parse(data2) dateobj.set(RelLib.Date.QUAL_NONE, RelLib.Date.MOD_SPAN, cal, start.get_start_date() + stop.get_start_date()) dateobj.set_quality(qual) return dateobj - match = calRegexp.match(text) + match = CAL.match(text) if match: - (abt,cal,data) = match.groups() - dateobj = _dp.parse("%s %s" % (abt, data)) - dateobj.set_calendar(_calendar_map.get(cal, RelLib.Date.CAL_GREGORIAN)) + (abt, cal, data) = match.groups() + dateobj = DATE_CNV.parse("%s %s" % (abt, data)) + dateobj.set_calendar(CALENDAR_MAP.get(cal, + RelLib.Date.CAL_GREGORIAN)) dateobj.set_quality(qual) return dateobj - dateobj = _dp.parse(text) + dateobj = DATE_CNV.parse(text) dateobj.set_quality(qual) return dateobj except IOError: - return self.dp.set_text(text) + return DATE_CNV.set_text(text) #------------------------------------------------------------------------- # @@ -341,8 +303,8 @@ def extract_date(text): #------------------------------------------------------------------------- class Reader: - def __init__(self, f): - self.f = f + def __init__(self, ifile): + self.ifile = ifile self.current_list = [] self.eof = False self.cnv = None @@ -353,11 +315,7 @@ class Reader: TOKEN_CONC : self._fix_token_conc, } - def set_charset_fn(self,cnv): - print "Character set changed", cnv - self.cnv = cnv - - def set_broken_conc(self,broken): + def set_broken_conc(self, broken): self.func_map = { TOKEN_CONT : self._fix_token_cont, TOKEN_CONC : self._fix_token_broken_conc, @@ -372,46 +330,39 @@ class Reader: return None def _fix_token_cont(self, data): - l = self.current_list[0] - new_value = l[2]+'\n'+data[2] - self.current_list[0] = (l[0], l[1], new_value, l[3], l[4]) + line = self.current_list[0] + new_value = line[2]+'\n'+data[2] + self.current_list[0] = (line[0], line[1], new_value, line[3], line[4]) def _fix_token_conc(self, data): - l = self.current_list[0] - new_value = l[2] + data[2] - self.current_list[0] = (l[0], l[1], new_value, l[3], l[4]) + line = self.current_list[0] + new_value = line[2] + data[2] + self.current_list[0] = (line[0], line[1], new_value, line[3], line[4]) def _fix_token_broken_conc(self, data): - l = self.current_list[0] - new_value = u"%s %s" % (l[2], data[2]) - self.current_list[0] = (l[0], l[1], new_value, l[3], l[4]) + line = self.current_list[0] + new_value = u"%s %s" % (line[2], data[2]) + self.current_list[0] = (line[0], line[1], new_value, line[3], line[4]) def readahead(self): while len(self.current_list) < 5: - line = self.f.readline() + line = self.ifile.readline() self.index += 1 if not line: self.eof = True return - if self.cnv: - try: - line = self.cnv(line) - except: - line = self.cnv(line.translate(_transtable2)) - else: - line = unicode(line,errors='replace') + line = line.split(None, 2) + [''] - line = line.split(None,2) + [''] - - val = line[2].rstrip('\r\n') + val = line[2] try: level = int(line[0]) except: level = 0 - data = (level, tokens.get(line[1], TOKEN_UNKNOWN), val, line[1], self.index) + data = (level, tokens.get(line[1], TOKEN_UNKNOWN), val, line[1], + self.index) func = self.func_map.get(data[1]) if func: @@ -419,25 +370,3 @@ class Reader: else: self.current_list.insert(0, data) -if __name__ == "__main__": - import sys - - def run(): - print "Reading", sys.argv[1] - a = Reader(sys.argv[1]) - while True: - line = a.readline() - print line - if not line: break - -# import Utils -# Utils.profile(run) - run() - - print extract_date("20 JAN 2000") - print extract_date("EST 20 JAN 2000") - print extract_date("CAL 20 JAN 2000") - print extract_date("ABT 20 JAN 2000") - print extract_date("INT 20 JAN 2000") - print extract_date("BET 20 JAN 2000 AND FEB 2000") - print extract_date("FROM 20 JAN 2000 TO FEB 2000") diff --git a/src/GrampsDbUtils/_GedcomParse.py b/src/GrampsDbUtils/_GedcomParse.py index 127e85674..d7a02be2b 100644 --- a/src/GrampsDbUtils/_GedcomParse.py +++ b/src/GrampsDbUtils/_GedcomParse.py @@ -64,13 +64,11 @@ all tokens at the lower level. For example: - 1 BIRT 2 DATE 1 JAN 2000 2 UKNOWN TAG 3 NOTE DATA - The function parsing the individual at level 1, would encounter the BIRT tag. It would look up the BIRT token in the table to see if a function as defined for this TOKEN, and pass control to this function. This function would then @@ -81,7 +79,6 @@ the level 2 parser, which would then encounter the "UKNOWN" tag. Since this is not a valid token, it would not be in the table, and a function that would skip all lines until the next level 2 token is found (in this case, skipping the "3 NOTE DATA" line. - """ __revision__ = "$Revision: $" @@ -94,10 +91,8 @@ __author__ = "Don Allingham" #------------------------------------------------------------------------- import os import re -import string import time from gettext import gettext as _ -import copy #------------------------------------------------------------------------ # @@ -114,20 +109,19 @@ LOG = logging.getLogger(".GedcomImport") #------------------------------------------------------------------------- import Errors import RelLib -from BasicUtils import NameDisplay +from BasicUtils import NameDisplay, UpdateCallback import Utils import Mime import LdsUtils -from ansel_utf8 import ansel_to_utf8 from _GedcomInfo import * from _GedcomTokens import * from _GedcomLex import Reader +from _GedcomChar import * import _GedcomUtils as GedcomUtils from GrampsDb._GrampsDbConst import EVENT_KEY -from BasicUtils import UpdateCallback try: import Config @@ -145,53 +139,14 @@ ADDR_RE = re.compile('(.+)([\n\r]+)(.+)\s*,(.+)\s+(\d+)\s*(.*)') ADDR2_RE = re.compile('(.+)([\n\r]+)(.+)\s*,(.+)\s+(\d+)') ADDR3_RE = re.compile('(.+)([\n\r]+)(.+)\s*,(.+)') - TRUNC_MSG = _("Your GEDCOM file is corrupted. " "It appears to have been truncated.") -#------------------------------------------------------------------------- -# -# latin/utf8 conversions -# -#------------------------------------------------------------------------- - - -def latin_to_utf8(msg): - """ - Converts a string from iso-8859-1 to unicode. If the string is already - unicode, we do nothing. - - @param msg: string to convert - @type level: str - @return: Returns the string, converted to a unicode object - @rtype: unicode - """ - if type(msg) == unicode: - return msg - else: - return unicode(msg, 'iso-8859-1') - -def nocnv(msg): - """ - Null operation that makes sure that a unicode string remains a unicode - string - - @param msg: unicode to convert - @type level: unicode - @return: Returns the string, converted to a unicode object - @rtype: unicode - """ - return unicode(msg) - #------------------------------------------------------------------------- # # constants # #------------------------------------------------------------------------- -ANSEL = 1 -UNICODE = 2 -UPDATE = 25 - TYPE_BIRTH = RelLib.ChildRefType() TYPE_ADOPT = RelLib.ChildRefType(RelLib.ChildRefType.ADOPTED) TYPE_FOSTER = RelLib.ChildRefType(RelLib.ChildRefType.FOSTER) @@ -224,10 +179,6 @@ MIME_MAP = { EVENT_FAMILY_STR = _("%(event_name)s of %(family)s") EVENT_PERSON_STR = _("%(event_name)s of %(person)s") -TRANS_TABLE = string.maketrans('', '') -DEL_CHARS = TRANS_TABLE[0:8] + TRANS_TABLE[10:31] -TRANS_TABLE2 = TRANS_TABLE[0:128] + ('?' * 128) - FTW_BAD_PLACE = [ RelLib.EventType.OCCUPATION, RelLib.EventType.RELIGION, @@ -265,6 +216,7 @@ CONC_RE = re.compile(r"\s*\d+\s+CONC\s?(.*)$") PERSON_RE = re.compile(r"\s*\d+\s+\@(\S+)\@\s+INDI(.*)$") class StageOne: + def __init__(self, ifile): self.ifile = ifile self.famc = {} @@ -275,44 +227,47 @@ class StageOne: def parse(self): current = "" + + line = self.ifile.read(3) + if line == "\xef\xbb": + self.ifile.read(1) + self.enc = "UTF8" + else: + self.ifile.seek(0) + for line in self.ifile: self.lcnt +=1 + data = line.split(None,2) + [''] try: (level, key, value) = data[:3] - value = value.strip() - # convert the first value to an integer. We have to be a bit - # careful here, since some GEDCOM files have garbage characters - # at the front of the first file if they are unicode encoded. - # So, if we have a failure to convert, check the last character - # of the string, which shoul de a '0' try: level = int(level) except: - level = int(level[-1]) + level = 0 key = key.strip() except: raise Errors.GedcomError("Corrupted file at line %d" % self.lcnt) if level == 0 and key[0] == '@': - if value == "FAM": + if value == ("FAM", "FAMILY") : current = key.strip() current = current[1:-1] - elif value == "INDI": + elif value == ("INDI", "INDIVIDUAL"): self.pcnt += 1 - elif key in ("HUSB", "WIFE") and value and value[0] == '@': + elif key in ("HUSB", "HUSBAND", "WIFE") and value and value[0] == '@': value = value[1:-1] if self.fams.has_key(value): self.fams[value].append(current) else: self.fams[value] = [current] - elif key == "CHIL" and value and value[0] == '@': + elif key in ("CHIL", "CHILD") and value and value[0] == '@': value = value[1:-1] if self.famc.has_key(value): self.famc[value].append(current) else: self.famc[value] = [current] - elif key == 'CHAR': + elif key == 'CHAR' and not self.enc: self.enc = value def get_famc_map(self): @@ -322,7 +277,10 @@ class StageOne: return self.fams def get_encoding(self): - return self.enc + return self.enc.upper() + + def set_encoding(self, enc): + self.enc = enc def get_person_count(self): return self.pcnt @@ -806,16 +764,20 @@ class GedcomParser(UpdateCallback): data = cursor.next() cursor.close() - self.lexer = Reader(ifile) + enc = stage_one.get_encoding() + + if enc == "ANSEL": + rdr = AnselReader(ifile) + elif enc in ("UTF-8", "UTF8"): + rdr = UTF8Reader(ifile) + elif enc in ("UTF-16", "UTF16", "UNICODE"): + rdr = UTF16Reader(ifile) + else: + rdr = AnsiReader(ifile) + + self.lexer = Reader(rdr) self.filename = filename self.backoff = False - self.override = False -# -# if self.override != 0: -# if self.override == 1: -# self.lexer.set_charset_fn(ansel_to_utf8) -# elif self.override == 2: -# self.lexer.set_charset_fn(latin_to_utf8) fullpath = os.path.normpath(os.path.abspath(filename)) self.geddir = os.path.dirname(fullpath) @@ -1064,9 +1026,6 @@ class GedcomParser(UpdateCallback): """ text = self.groups.line msg = _("Line %d was not understood, so it was ignored.") % text - import traceback - traceback.print_stack() - print self.groups self.warn(msg) self.error_count += 1 self.skip_subordinate_levels(level) @@ -4039,11 +3998,8 @@ class GedcomParser(UpdateCallback): if genby == "GRAMPS": self.gedsource = self.gedmap.get_from_source_tag(line.data) self.lexer.set_broken_conc(self.gedsource.get_conc()) - elif line.token == TOKEN_CHAR and not self.override: - if line.data == "ANSEL": - self.lexer.set_charset_fn(ansel_to_utf8) - elif line.data not in ("UNICODE","UTF-8","UTF8"): - self.lexer.set_charset_fn(latin_to_utf8) + elif line.token == TOKEN_CHAR: + pass self.skip_subordinate_levels(2) elif line.token == TOKEN_GEDC: self.skip_subordinate_levels(2) diff --git a/src/GrampsDbUtils/_ReadGedcom.py b/src/GrampsDbUtils/_ReadGedcom.py index d2b767ab7..51f092ab5 100644 --- a/src/GrampsDbUtils/_ReadGedcom.py +++ b/src/GrampsDbUtils/_ReadGedcom.py @@ -66,6 +66,7 @@ def importData(database, filename, callback=None, use_trans=False): dialog.destroy() else: code_set = None + import2(database, filename, callback, code_set, use_trans) def import2(database, filename, callback, code_set, use_trans): @@ -74,7 +75,10 @@ def import2(database, filename, callback, code_set, use_trans): ifile = open(filename,"rU") np = StageOne(ifile) np.parse() - print np.get_encoding() + + if code_set: + np.set_encoding(code_set) + ifile.seek(0) gedparse = GedcomParser(database, ifile, filename, callback, np) except IOError, msg: @@ -85,7 +89,6 @@ def import2(database, filename, callback, code_set, use_trans): _("%s could not be imported") % filename + "\n" + str(msg)) return - if database.get_number_of_people() == 0: use_trans = False