2006-06-10 Don Allingham <don@gramps-project.org>

* src/GrampsDb/_ReadGedcom.py: fixes based of the gedcom torture test
	* src/GrampsDb/_GedTokeys.py: fixes based of the gedcom torture test



svn: r6880
This commit is contained in:
Don Allingham 2006-06-11 04:19:47 +00:00
parent 8940cf321d
commit 5810926898
3 changed files with 201 additions and 37 deletions

View File

@ -1,3 +1,7 @@
2006-06-10 Don Allingham <don@gramps-project.org>
* src/GrampsDb/_ReadGedcom.py: fixes based of the gedcom torture test
* src/GrampsDb/_GedTokeys.py: fixes based of the gedcom torture test
2006-06-09 Don Allingham <don@gramps-project.org> 2006-06-09 Don Allingham <don@gramps-project.org>
* src/GrampsDb/_ReadGedcom.py: yet even more funky fixes, pass Jerome's tests * src/GrampsDb/_ReadGedcom.py: yet even more funky fixes, pass Jerome's tests
* src/GrampsDb/_GedTokens.py: yet even more funky fixes, handle more * src/GrampsDb/_GedTokens.py: yet even more funky fixes, handle more

View File

@ -120,11 +120,13 @@ TOKEN_VERS = 100
TOKEN_WIFE = 101 TOKEN_WIFE = 101
TOKEN__WITN = 102 TOKEN__WITN = 102
TOKEN__WTN = 103 TOKEN__WTN = 103
TOKEN_AGNC = 104
TOKEN_HEAD = 105 TOKEN_HEAD = 105
TOKEN_CALN = 106 TOKEN_CALN = 106
TOKEN_MEDI = 107 TOKEN_MEDI = 107
TOKEN_RELA = 108 TOKEN_RELA = 108
TOKEN__LKD = 109 TOKEN__LKD = 109
TOKEN_BLOB = 110
tokens = { tokens = {
"HEAD" : TOKEN_HEAD, "MEDI" : TOKEN_MEDI, "HEAD" : TOKEN_HEAD, "MEDI" : TOKEN_MEDI,
@ -215,4 +217,5 @@ tokens = {
"_LKD" : TOKEN__LKD, "_DATE" : TOKEN_IGNORE, "_LKD" : TOKEN__LKD, "_DATE" : TOKEN_IGNORE,
"_SCBK" : TOKEN_IGNORE,"_TYPE" : TOKEN_IGNORE, "_SCBK" : TOKEN_IGNORE,"_TYPE" : TOKEN_IGNORE,
"_PRIM" : TOKEN_IGNORE,"_SSHOW" : TOKEN_IGNORE, "_PRIM" : TOKEN_IGNORE,"_SSHOW" : TOKEN_IGNORE,
"_PAREN" : TOKEN_IGNORE,"BLOB" : TOKEN_BLOB,
} }

View File

@ -133,6 +133,17 @@ pedi_type = {
'foster' : _TYPE_FOSTER, 'foster' : _TYPE_FOSTER,
} }
mime_map = {
'jpeg' : 'image/jpeg', 'jpg' : 'image/jpeg',
'rtf' : 'text/rtf', 'pdf' : 'application/pdf',
'mpeg' : 'video/mpeg', 'mpg' : 'video/mpeg',
'gif' : 'image/gif', 'bmp' : 'image/x-ms-bmp',
'tiff' : 'image/tiff', 'aif' : 'audio/x-aiff',
'text' : 'text/plain', 'w8bn' : 'application/msword',
'wav' : 'audio/x-wav', 'mov' : 'video/quicktime',
}
_event_family_str = _("%(event_name)s of %(family)s") _event_family_str = _("%(event_name)s of %(family)s")
_event_person_str = _("%(event_name)s of %(person)s") _event_person_str = _("%(event_name)s of %(person)s")
@ -239,7 +250,6 @@ def import2(database, filename, callback, codeset, use_trans):
'problem.')) 'problem.'))
return return
#------------------------------------------------------------------------- #-------------------------------------------------------------------------
# #
# #
@ -508,6 +518,7 @@ class GedcomParser(UpdateCallback):
self.is_ftw = 0 self.is_ftw = 0
self.idswap = {} self.idswap = {}
self.gid2id = {} self.gid2id = {}
self.oid2id = {}
self.sid2id = {} self.sid2id = {}
self.lid2id = {} self.lid2id = {}
self.fid2id = {} self.fid2id = {}
@ -516,6 +527,7 @@ class GedcomParser(UpdateCallback):
self.repo_func = { self.repo_func = {
TOKEN_NAME : self.func_repo_name, TOKEN_NAME : self.func_repo_name,
TOKEN_ADDR : self.func_repo_addr, TOKEN_ADDR : self.func_repo_addr,
TOKEN_RIN : self.func_repo_ignore,
} }
self.name_func = { self.name_func = {
@ -657,12 +669,26 @@ class GedcomParser(UpdateCallback):
TOKEN_NOTE : self.func_source_note, TOKEN_NOTE : self.func_source_note,
TOKEN_TEXT : self.func_source_text, TOKEN_TEXT : self.func_source_text,
TOKEN_ABBR : self.func_source_abbr, TOKEN_ABBR : self.func_source_abbr,
TOKEN_REFN : self.func_source_ignore,
TOKEN_RIN : self.func_source_ignore,
TOKEN_REPO : self.func_source_repo, TOKEN_REPO : self.func_source_repo,
TOKEN_OBJE : self.func_source_ignore, TOKEN_OBJE : self.func_source_object,
TOKEN_CHAN : self.func_source_ignore, TOKEN_CHAN : self.func_source_chan,
TOKEN_DATA : self.func_source_ignore,
TOKEN_IGNORE: self.func_source_ignore, TOKEN_IGNORE: self.func_source_ignore,
} }
self.obje_func = {
TOKEN_FORM : self.func_obje_form,
TOKEN_TITL : self.func_obje_title,
TOKEN_NOTE : self.func_obje_note,
TOKEN_BLOB : self.func_obje_blob,
TOKEN_REFN : self.func_obje_refn,
TOKEN_TYPE : self.func_obje_type,
TOKEN_RIN : self.func_obje_rin,
TOKEN_CHAN : self.func_obje_chan,
}
self.place_names = set() self.place_names = set()
cursor = dbase.get_place_cursor() cursor = dbase.get_place_cursor()
data = cursor.next() data = cursor.next()
@ -832,9 +858,12 @@ class GedcomParser(UpdateCallback):
self.db.request_rebuild() self.db.request_rebuild()
def parse_trailer(self): def parse_trailer(self):
try:
matches = self.get_next() matches = self.get_next()
if matches[0] >= 0 and matches[1] != TOKEN_TRLR: if matches[0] >= 0 and matches[1] != TOKEN_TRLR:
self.not_recognized(0) self.not_recognized(0)
except TypeError:
pass
def parse_header(self): def parse_header(self):
self.parse_header_head() self.parse_header_head()
@ -918,6 +947,7 @@ class GedcomParser(UpdateCallback):
def func_source_note(self, matches, source, level): def func_source_note(self, matches, source, level):
note = self.parse_note(matches, source, level+1, '') note = self.parse_note(matches, source, level+1, '')
source.set_note(note) source.set_note(note)
def func_source_auth(self, matches, source, level): def func_source_auth(self, matches, source, level):
source.set_author(matches[2]) source.set_author(matches[2])
@ -933,15 +963,52 @@ class GedcomParser(UpdateCallback):
if source.get_title() == "": if source.get_title() == "":
source.set_title(matches[2].replace('\n',' ')) source.set_title(matches[2].replace('\n',' '))
def func_obje_form(self, matches, media, level):
self.ignore_sub_junk(level+1)
def func_obje_file(self, matches, media, level):
(ok,path) = self.find_file(matches[2], self.dir_path)
if not ok:
self.warn(_("Could not import %s") % filename)
path = filename.replace('\\','/')
def func_obje_ignore(self, matches, media, level):
self.ignore_sub_junk(level+1)
def func_obje_title(self, matches, media, level):
media.set_description(matches[2])
def func_obje_note(self, matches, media, level):
note = self.parse_note(matches, media, level+1, '')
media.set_note(note)
def func_obje_blob(self, matches, media, level):
self.ignore_sub_junk(level+1)
def func_obje_refn(self, matches, media, level):
self.ignore_sub_junk(level+1)
def func_obje_type(self, matches, media, level):
self.ignore_sub_junk(level+1)
def func_obje_rin(self, matches, media, level):
self.ignore_sub_junk(level+1)
def func_obje_chan(self, matches, media, level):
self.ignore_sub_junk(level+1)
def parse_record(self): def parse_record(self):
while True: while True:
matches = self.get_next() matches = self.get_next()
key = matches[2].strip() key = matches[2].strip()
if matches[0] < 0 or matches[1] == TOKEN_TRLR:
break
if key == "FAM": if key == "FAM":
self.parse_FAM(matches) self.parse_FAM(matches)
elif key == "INDI": elif key == "INDI":
self.parse_INDI(matches) self.parse_INDI(matches)
elif key == "OBJE":
self.parse_OBJE(matches)
elif key == "REPO": elif key == "REPO":
self.repo_count += 1 self.repo_count += 1
self.repo = self.find_or_create_repository(matches[3][1:-1]) self.repo = self.find_or_create_repository(matches[3][1:-1])
@ -949,9 +1016,9 @@ class GedcomParser(UpdateCallback):
self.parse_repository(self.repo) self.parse_repository(self.repo)
self.db.commit_repository(self.repo, self.trans) self.db.commit_repository(self.repo, self.trans)
del self.repo del self.repo
elif key in ("SUBM","SUBN"): elif key in ("SUBM", "SUBN"):
self.ignore_sub_junk(1) self.ignore_sub_junk(1)
elif matches[1] in (TOKEN_SUBM,TOKEN_SUBN,TOKEN_OBJE,TOKEN_IGNORE): elif matches[1] in (TOKEN_SUBM, TOKEN_SUBN, TOKEN_IGNORE):
self.ignore_sub_junk(1) self.ignore_sub_junk(1)
elif key == "SOUR": elif key == "SOUR":
self.parse_source(matches[3],1) self.parse_source(matches[3],1)
@ -963,13 +1030,10 @@ class GedcomParser(UpdateCallback):
self.db.commit_source(source, self.trans) self.db.commit_source(source, self.trans)
elif key[0:4] == "NOTE": elif key[0:4] == "NOTE":
self.ignore_sub_junk(1) self.ignore_sub_junk(1)
elif key == "_LOC": elif key in ("_LOC") :
# TODO: Add support for extended Locations. self.ignore_sub_junk(1)
# See: http://en.wiki.genealogy.net/index.php/Gedcom_5.5EL elif matches[3] in ("_EVENT_DEFN") :
self.ignore_sub_junk(1) self.ignore_sub_junk(1)
elif matches[0] < 0 or matches[1] == TOKEN_TRLR:
self.backup()
return
else: else:
self.not_recognized(1) self.not_recognized(1)
@ -1004,6 +1068,13 @@ class GedcomParser(UpdateCallback):
self.gid2id[gramps_id] = intid self.gid2id[gramps_id] = intid
return intid return intid
def find_object_handle(self,gramps_id):
intid = self.oid2id.get(gramps_id)
if not intid:
intid = create_id()
self.oid2id[gramps_id] = intid
return intid
def find_or_create_family(self,gramps_id): def find_or_create_family(self,gramps_id):
family = RelLib.Family() family = RelLib.Family()
intid = self.fid2id.get(gramps_id) intid = self.fid2id.get(gramps_id)
@ -1026,6 +1097,17 @@ class GedcomParser(UpdateCallback):
repository.set_gramps_id(gramps_id) repository.set_gramps_id(gramps_id)
return repository return repository
def find_or_create_object(self, gramps_id):
obj = RelLib.MediaObject()
intid = self.oid2id.get(gramps_id)
if self.db.has_object_handle(intid):
obj.unserialize(self.db.get_raw_object_data(intid))
else:
intid = self.find_object_handle(gramps_id)
obj.set_handle(intid)
obj.set_gramps_id(gramps_id)
return obj
def find_repository_handle(self,gramps_id): def find_repository_handle(self,gramps_id):
intid = self.rid2id.get(gramps_id) intid = self.rid2id.get(gramps_id)
if not intid: if not intid:
@ -1248,7 +1330,7 @@ class GedcomParser(UpdateCallback):
def func_family_addr(self, family, matches, level): def func_family_addr(self, family, matches, level):
self.addr = RelLib.Address() self.addr = RelLib.Address()
self.addr.set_street(matches[2]) self.addr.set_street(matches[2])
self.parse_address(self.addr,2) self.parse_address(self.addr, level)
def func_family_chil(self, family, matches, level): def func_family_chil(self, family, matches, level):
mrel,frel = self.parse_ftw_relations(2) mrel,frel = self.parse_ftw_relations(2)
@ -1297,6 +1379,13 @@ class GedcomParser(UpdateCallback):
(form, filename, title, note) = self.parse_obje(level) (form, filename, title, note) = self.parse_obje(level)
self.build_media_object(event, form, filename, title, note) self.build_media_object(event, form, filename, title, note)
def func_source_object(self, matches, source, level):
if matches[2] and matches[2][0] == '@':
self.not_recognized(level)
else:
(form, filename, title, note) = self.parse_obje(level+1)
self.build_media_object(source, form, filename, title, note)
def parse_obje(self, level): def parse_obje(self, level):
""" """
n OBJE {1:1} n OBJE {1:1}
@ -1415,6 +1504,47 @@ class GedcomParser(UpdateCallback):
""" """
return value[1:-1] return value[1:-1]
def parse_OBJE(self, matches):
"""
n @XREF:OBJE@ OBJE {1:1}
+1 FORM <MULTIMEDIA_FORMAT> {1:1} p.*
+1 TITL <DESCRIPTIVE_TITLE> {0:1} p.*
+1 <<NOTE_STRUCTURE>> {0:M} p.*
+1 BLOB {1:1}
+2 CONT <ENCODED_MULTIMEDIA_LINE> {1:M} p.*
+1 OBJE @<XREF:OBJE>@ /* chain to continued object */ {0:1} p.*
+1 REFN <USER_REFERENCE_NUMBER> {0:M} p.*
+2 TYPE <USER_REFERENCE_TYPE> {0:1} p.*
+1 RIN <AUTOMATED_RECORD_ID> {0:1} p.*
+1 <<CHANGE_DATE>> {0:1} p.*
"""
gid = self.extract_gramps_id(matches[3])
self.media = self.find_or_create_object(self.map_gid(gid))
while True:
matches = self.get_next()
if self.level_is_finished(matches, 1):
break
else:
func = self.obje_func.get(matches[1], self.func_obje_ignore)
func(matches, self.media, 1)
# Add the default reference if no source has found
if len(self.media.get_source_references()) == 0:
sref = RelLib.SourceRef()
sref.set_reference_handle(self.def_src.handle)
self.media.add_source_reference(sref)
# commit the person to the database
if self.media.change:
self.db.commit_media_object(self.media, self.trans,
change_time=self.media.change)
else:
self.db.commit_media_object(self.media, self.trans)
del self.media
#---------------------------------------------------------------------- #----------------------------------------------------------------------
# #
# INDI parsing # INDI parsing
@ -1534,13 +1664,12 @@ class GedcomParser(UpdateCallback):
note = "" note = ""
while True: while True:
matches = self.get_next() matches = self.get_next()
if int(matches[0]) < level: if int(matches[0]) < level:
if matches[1] == TOKEN_PHON: if matches[1] == TOKEN_PHON:
address.set_phone(matches[2]) address.set_phone(matches[2])
else: else:
self.backup() self.backup()
return break
elif matches[1] in (TOKEN_ADDR, TOKEN_ADR1, TOKEN_ADR2): elif matches[1] in (TOKEN_ADDR, TOKEN_ADR1, TOKEN_ADR2):
val = address.get_street() val = address.get_street()
if first == 0: if first == 0:
@ -1561,10 +1690,14 @@ class GedcomParser(UpdateCallback):
address.set_country(matches[2]) address.set_country(matches[2])
elif matches[1] == TOKEN_PHON: elif matches[1] == TOKEN_PHON:
address.set_phone(matches[2]) address.set_phone(matches[2])
elif matches[1] == TOKEN_SOUR:
address.add_source_reference(self.handle_source(matches,level+1))
elif matches[1] == TOKEN_NOTE: elif matches[1] == TOKEN_NOTE:
note = self.parse_note(matches,address,level+1,'') note = self.parse_note(matches,address,level+1,'')
elif matches[1] in (TOKEN__LOC, TOKEN__NAME): elif matches[1] in (TOKEN__LOC, TOKEN__NAME):
pass # ignore unsupported extended location syntax pass # ignore unsupported extended location syntax
elif matches[1] in (TOKEN_IGNORE, TOKEN_TYPE, TOKEN_CAUS):
self.ignore_sub_junk(level+1)
else: else:
self.not_recognized(level+1) self.not_recognized(level+1)
@ -1853,12 +1986,12 @@ class GedcomParser(UpdateCallback):
info = self.parse_note(matches, attr, level+1, '') info = self.parse_note(matches, attr, level+1, '')
attr.set_note(info) attr.set_note(info)
def parse_source_reference(self,source,level): def parse_source_reference(self, source, level, handle):
"""Reads the data associated with a SOUR reference""" """Reads the data associated with a SOUR reference"""
while True: while True:
matches = self.get_next() matches = self.get_next()
if self.level_is_finished(matches, level): if self.level_is_finished(matches, level):
return break
elif matches[1] == TOKEN_PAGE: elif matches[1] == TOKEN_PAGE:
source.set_page(matches[2]) source.set_page(matches[2])
elif matches[1] == TOKEN_DATE: elif matches[1] == TOKEN_DATE:
@ -1869,7 +2002,16 @@ class GedcomParser(UpdateCallback):
d = self.dp.parse(date) d = self.dp.parse(date)
source.set_date_object(d) source.set_date_object(d)
source.set_text(text) source.set_text(text)
elif matches[1] in (TOKEN_OBJE, TOKEN_REFN, TOKEN_EVEN, elif matches[1] == TOKEN_OBJE:
if matches[2] and matches[2][0] == '@':
self.not_recognized(level)
else:
src = self.db.get_source_from_handle(handle)
(form, filename, title, note) = self.parse_obje(level)
self.build_media_object(src, form, filename,
title, note)
self.db.commit_source(src, self.trans)
elif matches[1] in (TOKEN_REFN, TOKEN_EVEN,
TOKEN_IGNORE, TOKEN__LKD): TOKEN_IGNORE, TOKEN__LKD):
self.ignore_sub_junk(level+1) self.ignore_sub_junk(level+1)
elif matches[1] == TOKEN_QUAY: elif matches[1] == TOKEN_QUAY:
@ -2043,6 +2185,8 @@ class GedcomParser(UpdateCallback):
matches = self.get_next() matches = self.get_next()
if self.level_is_finished(matches,level): if self.level_is_finished(matches,level):
break break
# else:
# print matches
def ignore_change_data(self,level): def ignore_change_data(self,level):
matches = self.get_next() matches = self.get_next()
@ -2147,10 +2291,10 @@ class GedcomParser(UpdateCallback):
note = '' note = ''
handle = self.inline_srcs.get((title,note),Utils.create_id()) handle = self.inline_srcs.get((title,note),Utils.create_id())
self.inline_srcs[(title,note)] = handle self.inline_srcs[(title,note)] = handle
self.parse_source_reference(source_ref,level) self.parse_source_reference(source_ref, level, handle)
else: else:
handle = self.find_or_create_source(matches[2][1:-1]).handle handle = self.find_or_create_source(matches[2][1:-1]).handle
self.parse_source_reference(source_ref,level) self.parse_source_reference(source_ref,level, handle)
source_ref.set_reference_handle(handle) source_ref.set_reference_handle(handle)
return source_ref return source_ref
@ -2258,6 +2402,9 @@ class GedcomParser(UpdateCallback):
def func_family_chan(self, family, matches, level): def func_family_chan(self, family, matches, level):
self.parse_change(matches, family, level+1) self.parse_change(matches, family, level+1)
def func_source_chan(self, matches, source, level):
self.parse_change(matches, source, level+1)
def parse_change(self, matches, obj, level): def parse_change(self, matches, obj, level):
""" """
CHANGE_DATE:= CHANGE_DATE:=
@ -2401,7 +2548,12 @@ class GedcomParser(UpdateCallback):
+1 FILE <MULTIMEDIA_FILE_REFERENCE> {1:1} +1 FILE <MULTIMEDIA_FILE_REFERENCE> {1:1}
+1 <<NOTE_STRUCTURE>> {0:M} +1 <<NOTE_STRUCTURE>> {0:M}
""" """
if matches[2] and matches[2][0] == '@':
ref = RelLib.MediaRef()
handle = self.find_object_handle(matches[2][1:-1])
ref.set_reference_handle(handle)
self.person.add_media_reference(ref)
else:
(form, filename, title, note) = self.parse_obje(state.level+1) (form, filename, title, note) = self.parse_obje(state.level+1)
self.build_media_object(state.person, form, filename, title, note) self.build_media_object(state.person, form, filename, title, note)
@ -2421,7 +2573,11 @@ class GedcomParser(UpdateCallback):
photo = RelLib.MediaObject() photo = RelLib.MediaObject()
photo.set_path(path) photo.set_path(path)
photo.set_description(title) photo.set_description(title)
photo.set_mime_type(Mime.get_type(os.path.abspath(path))) full_path = os.path.abspath(path)
if os.path.isfile(full_path):
photo.set_mime_type(Mime.get_type(full_path))
else:
photo.set_mime_type(mime_map.get(form.lower(),'unknown'))
self.db.add_object(photo, self.trans) self.db.add_object(photo, self.trans)
self.media_map[path] = photo.handle self.media_map[path] = photo.handle
else: else:
@ -2542,7 +2698,8 @@ class GedcomParser(UpdateCallback):
addr.set_street(matches[2]) addr.set_street(matches[2])
self.parse_address(addr, state.level+1) self.parse_address(addr, state.level+1)
elif matches[1] == TOKEN_SOUR: elif matches[1] == TOKEN_SOUR:
addr.add_source_reference(self.handle_source(matches, state.level+1)) addr.add_source_reference(self.handle_source(matches,
state.level+1))
elif matches[1] == TOKEN_PLAC: elif matches[1] == TOKEN_PLAC:
addr.set_street(matches[2]) addr.set_street(matches[2])
self.parse_address(addr, state.level+1) self.parse_address(addr, state.level+1)
@ -2802,16 +2959,16 @@ class GedcomParser(UpdateCallback):
if self.level_is_finished(matches, 1): if self.level_is_finished(matches, 1):
break break
else: else:
func = self.repo_func.get(matches[1],self.skip_record) func = self.repo_func.get(matches[1],self.func_repo_ignore)
func(matches,state) func(matches, repo, 1)
if state.get_text(): def func_repo_name(self, matches, repo, level):
state.repo.set_note(state.get_text()) repo.set_name(matches[2])
def func_repo_name(self, matches, state): def func_repo_ignore(self, matches, repo, level):
state.repo.set_name(matches[2]) self.ignore_sub_junk(level)
def func_repo_addr(self, matches, state): def func_repo_addr(self, matches, repo, level):
""" """
n ADDR <ADDRESS_LINE> {0:1} n ADDR <ADDRESS_LINE> {0:1}
+1 CONT <ADDRESS_LINE> {0:M} +1 CONT <ADDRESS_LINE> {0:M}
@ -2832,7 +2989,7 @@ class GedcomParser(UpdateCallback):
addr = RelLib.Address() addr = RelLib.Address()
matched = False matched = False
self.parse_address(addr, state.level+1) self.parse_address(addr, 2)
text = addr.get_street() text = addr.get_street()
if not addr.get_city() and not addr.get_state() and \ if not addr.get_city() and not addr.get_state() and \
@ -2865,7 +3022,7 @@ class GedcomParser(UpdateCallback):
addr.set_state(groups[3].strip()) addr.set_state(groups[3].strip())
matched = True matched = True
state.repo.add_address(addr) repo.add_address(addr)
def skip_record(self,matches,state): def skip_record(self,matches,state):
self.ignore_sub_junk(2) self.ignore_sub_junk(2)