From 70558272602c35969b8a4b624e88576a5f85ff16 Mon Sep 17 00:00:00 2001 From: Tim G L Lyons Date: Mon, 11 Mar 2013 18:45:09 +0000 Subject: [PATCH] Exportgedcom and importgedcom. 0006382: ADDR tag in GEDCOM export does not contain a full address. On export, all the elements of the structured address are output as ADDR/CONT. On import the structured address takes precedence (as at present), but a warning is only given if an element in the free-form address is missing from the structured address. svn: r21610 --- gramps/plugins/export/exportgedcom.py | 118 ++++++++------- gramps/plugins/importer/importgedcom.py | 7 +- gramps/plugins/lib/libgedcom.py | 183 +++++++++++++----------- 3 files changed, 168 insertions(+), 140 deletions(-) diff --git a/gramps/plugins/export/exportgedcom.py b/gramps/plugins/export/exportgedcom.py index 156f9f4f6..4ae2f1582 100644 --- a/gramps/plugins/export/exportgedcom.py +++ b/gramps/plugins/export/exportgedcom.py @@ -356,39 +356,19 @@ class GedcomWriter(UpdateCallback): """ owner = self.dbase.get_researcher() name = owner.get_name() - addr = owner.get_address() - adr2 = owner.get_locality() - city = owner.get_city() - state = owner.get_state() - ctry = owner.get_country() - post = owner.get_postal_code() phon = owner.get_phone() mail = owner.get_email() - if not name : - name = cuni('Not Provided') - if not addr : - addr = cuni('Not Provided') - self._writeln(0, "@SUBM@", "SUBM") self._writeln(1, "NAME", name) - self._writeln(1, "ADDR", addr) - if city and state and post: - self._writeln(2, "CONT", "%s, %s %s" % (city, state, post)) - else: - self._writeln(2, "CONT", cuni("Not Provided")) - if addr: - self._writeln(2, "ADR1", addr) - if adr2: - self._writeln(2, "ADR2", adr2) - if city: - self._writeln(2, "CITY", city) - if state: - self._writeln(2, "STAE", state) - if post: - self._writeln(2, "POST", post) - if ctry: - self._writeln(2, "CTRY", ctry) + + # Researcher is a sub-type of LocationBase, so get_city etc. 
which are
+        # used in __write_addr work fine. However, the database owner street is
+        # stored in address, so we need to temporarily copy it into street so
+        # __write_addr works properly
+        owner.set_street(owner.get_address())
+        self.__write_addr(1, owner)
+
         if phon:
             self._writeln(1, "PHON", phon)
         if mail:
@@ -683,20 +663,8 @@ class GedcomWriter(UpdateCallback):
         """
         for addr in person.get_address_list():
             self._writeln(1, 'RESI')
             self._date(2, addr.get_date_object())
-            self._writeln(2, "ADDR", addr.get_street())
-            if addr.get_street():
-                self._writeln(3, 'ADR1', addr.get_street())
-            if addr.get_locality():
-                self._writeln(3, 'ADR2', addr.get_locality())
-            if addr.get_city():
-                self._writeln(3, 'CITY', addr.get_city())
-            if addr.get_state():
-                self._writeln(3, 'STAE', addr.get_state())
-            if addr.get_postal_code():
-                self._writeln(3, 'POST', addr.get_postal_code())
-            if addr.get_country():
-                self._writeln(3, 'CTRY', addr.get_country())
+            self.__write_addr(2, addr)
             if addr.get_phone():
                 self._writeln(2, 'PHON', addr.get_phone())
 
@@ -1019,19 +987,7 @@ class GedcomWriter(UpdateCallback):
             if repo.get_name():
                 self._writeln(1, 'NAME', repo.get_name())
             for addr in repo.get_address_list():
-                self._writeln(1, "ADDR", addr.get_street())
-                if addr.get_street():
-                    self._writeln(2, 'ADR1', addr.get_street())
-                if addr.get_locality():
-                    self._writeln(2, 'ADR2', addr.get_locality())
-                if addr.get_city():
-                    self._writeln(2, 'CITY', addr.get_city())
-                if addr.get_state():
-                    self._writeln(2, 'STAE', addr.get_state())
-                if addr.get_postal_code():
-                    self._writeln(2, 'POST', addr.get_postal_code())
-                if addr.get_country():
-                    self._writeln(2, 'CTRY', addr.get_country())
+                self.__write_addr(1, addr)
                 if addr.get_phone():
                     self._writeln(1, 'PHON', addr.get_phone())
             for url in repo.get_url_list():
@@ -1432,6 +1388,58 @@ class GedcomWriter(UpdateCallback):
 
         self._note_references(place.get_note_list(), level+1)
 
+    def __write_addr(self, level, addr):
+        """
+        n ADDR {0:1}
+        +1 CONT {0:M}
+        +1 ADR1 {0:1} (Street)
+        +1 ADR2 {0:1} (Locality)
+        +1 CITY {0:1}
+        +1 STAE {0:1}
+        +1 POST {0:1}
+        +1 CTRY {0:1}
+
+        This is done along the lines suggested by Tamura Jones in
+        http://www.tamurajones.net/GEDCOMADDR.xhtml as a result of bug 6382.
+        "GEDCOM writers should always use the structured address format,
+        and use it for all addresses, including the submitter address and
+        their own corporate address." "Vendors that want their product to pass
+        even the strictest GEDCOM validation, should include export to the old
+        free-form format..." [This goes on to say the free-form should be an
+        option, but we have not made it an option in Gramps].
+
+        @param level: The level number for the ADDR tag
+        @type level: Integer
+        @param addr: The location or address
+        @type addr: [a sub-type of] LocationBase
+        """
+        if addr.get_street() or addr.get_locality() or addr.get_city() or \
+           addr.get_state() or addr.get_postal_code() or addr.get_country():
+            self._writeln(level, 'ADDR', addr.get_street())
+            if addr.get_locality():
+                self._writeln(level + 1, 'CONT', addr.get_locality())
+            if addr.get_city():
+                self._writeln(level + 1, 'CONT', addr.get_city())
+            if addr.get_state():
+                self._writeln(level + 1, 'CONT', addr.get_state())
+            if addr.get_postal_code():
+                self._writeln(level + 1, 'CONT', addr.get_postal_code())
+            if addr.get_country():
+                self._writeln(level + 1, 'CONT', addr.get_country())
+
+            if addr.get_street():
+                self._writeln(level + 1, 'ADR1', addr.get_street())
+            if addr.get_locality():
+                self._writeln(level + 1, 'ADR2', addr.get_locality())
+            if addr.get_city():
+                self._writeln(level + 1, 'CITY', addr.get_city())
+            if addr.get_state():
+                self._writeln(level + 1, 'STAE', addr.get_state())
+            if addr.get_postal_code():
+                self._writeln(level + 1, 'POST', addr.get_postal_code())
+            if addr.get_country():
+                self._writeln(level + 1, 'CTRY', addr.get_country())
+
 #-------------------------------------------------------------------------
# # diff --git a/gramps/plugins/importer/importgedcom.py b/gramps/plugins/importer/importgedcom.py index 70d53c838..f9dcc5e7c 100644 --- a/gramps/plugins/importer/importgedcom.py +++ b/gramps/plugins/importer/importgedcom.py @@ -49,7 +49,12 @@ from gramps.gen.errors import DbError, GedcomError from gramps.gui.glade import Glade from gramps.plugins.lib.libmixin import DbMixin from gramps.plugins.lib import libgedcom -module = __import__("gramps.plugins.lib.libgedcom") # why o why ?? +# The following code is necessary to ensure that when Help->Plugin +# Manager->Reload is executed, not only is the top-level exportgedcom file +# reloaded, but also the dependent libgedcom. This ensures that testing can have +# a quick turnround, without having to restart Gramps. +module = __import__("gramps.plugins.lib.libgedcom", + fromlist=["gramps.plugins.lib"]) # why o why ?? as above! if sys.version_info[0] < 3: reload (module) else: diff --git a/gramps/plugins/lib/libgedcom.py b/gramps/plugins/lib/libgedcom.py index 62ec8c136..89674886c 100644 --- a/gramps/plugins/lib/libgedcom.py +++ b/gramps/plugins/lib/libgedcom.py @@ -98,6 +98,7 @@ import time import codecs from xml.parsers.expat import ParserCreate from collections import defaultdict +import string if sys.version_info[0] < 3: from cStringIO import StringIO else: @@ -646,9 +647,6 @@ DATE_QUALITY = { # regular expressions # #------------------------------------------------------------------------- -ADDR_RE = re.compile('(.+)([\n\r]+)(.+)\s*, (.+)\s+(\d+)\s*(.*)') -ADDR2_RE = re.compile('(.+)([\n\r]+)(.+)\s*, (.+)\s+(\d+)') -ADDR3_RE = re.compile('(.+)([\n\r]+)(.+)\s*, (.+)') NOTE_RE = re.compile(r"\s*\d+\s+\@(\S+)\@\s+NOTE(.*)$") CONT_RE = re.compile(r"\s*\d+\s+CONT\s?(.*)$") CONC_RE = re.compile(r"\s*\d+\s+CONC\s?(.*)$") @@ -2134,6 +2132,7 @@ class GedcomParser(UpdateCallback): TOKEN_SOUR : self.__event_source, TOKEN_PLAC : self.__event_place, TOKEN_ADDR : self.__event_addr, + TOKEN_PHON : self.__event_phon, TOKEN_CAUS : 
self.__event_cause,
             TOKEN_AGNC   : self.__event_agnc,
             TOKEN_AGE    : self.__event_age,
@@ -2245,14 +2244,15 @@ class GedcomParser(UpdateCallback):
         self.func_list.append(self.object_parse_tbl)
 
         self.parse_loc_tbl = {
-            TOKEN_ADDR   : self.__location_addr,
             TOKEN_ADR1   : self.__location_adr1,
             TOKEN_ADR2   : self.__location_adr2,
-            TOKEN_DATE   : self.__location_date,
             TOKEN_CITY   : self.__location_city,
             TOKEN_STAE   : self.__location_stae,
             TOKEN_POST   : self.__location_post,
             TOKEN_CTRY   : self.__location_ctry,
+            # Not legal GEDCOM - not clear why these are included at this level
+            TOKEN_ADDR   : self.__ignore,
+            TOKEN_DATE   : self.__location_date,
             TOKEN_NOTE   : self.__location_note,
             TOKEN_RNOTE  : self.__location_note,
             TOKEN__LOC   : self.__ignore,
@@ -2310,8 +2310,7 @@ class GedcomParser(UpdateCallback):
             # +1 <> {0:1}
             TOKEN_CHAN   : self.__family_chan,
             TOKEN_ENDL   : self.__ignore,
-
-            TOKEN_ADDR   : self.__family_addr,
+            TOKEN_ADDR   : self.__ignore,
             TOKEN_RIN    : self.__family_cust_attr,
             TOKEN_SUBM   : self.__ignore,
             TOKEN_ATTR   : self.__family_attr,
@@ -3171,6 +3170,69 @@ class GedcomParser(UpdateCallback):
                 self.__add_msg(txt)
                 self.number_of_errors -= 1
 
+    def __merge_address(self, free_form_address, addr, line, state):
+        """
+        Merge freeform and structured addresses.
+        n ADDR {0:1}
+        +1 CONT {0:M}
+        +1 ADR1 {0:1} (Street)
+        +1 ADR2 {0:1} (Locality)
+        +1 CITY {0:1}
+        +1 STAE {0:1}
+        +1 POST {0:1}
+        +1 CTRY {0:1}
+
+        This is done along the lines suggested by Tamura Jones in
+        http://www.tamurajones.net/GEDCOMADDR.xhtml as a result of bug 6382.
+        "When a GEDCOM reader encounters a double address, it should read the
+        structured address. ... A GEDCOM reader that does verify that the
+        addresses are the same should issue an error if they are not".
+
+        This is called for SUBMitter addresses (__subm_addr), INDIvidual
+        addresses (__person_addr), REPO addresses and HEADer corp address
+        (__repo_addr) and EVENt addresses (__event_addr).
+
+        The structured address (if any) will have been accumulated into an
+        object of type LocationBase, which will either be a Location, or an
+        Address object.
+
+        If ADDR is provided, but none of ADR1, ADR2, CITY, STAE, or POST (not
+        CTRY), then Street is set to the freeform address. N.B. this is a change
+        for Repository addresses and HEADer Corp address where previously the
+        free-form address was deconstructed into different structured
+        components. N.B. PAF provides a free-form address and a country, so this
+        allows for that case.
+
+        If both forms of address are provided, then the structured address is
+        used, and if the ADDR/CONT contains anything not in the structured
+        address, a warning is issued.
+
+        If just ADR1, ADR2, CITY, STAE, POST or CTRY are provided (this is not
+        actually legal GEDCOM syntax, but may be possible by GEDCOM extensions)
+        then just the structured address is used.
+        """
+        if not (addr.get_street() or addr.get_locality() or
+                addr.get_city() or addr.get_state() or
+                addr.get_postal_code()):
+
+            addr.set_street(free_form_address)
+        else:
+            # structured address provided
+            addr_list = free_form_address.split("\n")
+            str_list = []
+            for func in (addr.get_street(), addr.get_locality(),
+                         addr.get_city(), addr.get_state(),
+                         addr.get_postal_code(), addr.get_country()):
+                str_list += [i.strip(',' + string.whitespace) for i in func.split("\n")]
+            for elmn in addr_list:
+                if elmn.strip(',' + string.whitespace) not in str_list:
+                    # message means that the element %s was ignored, but
+                    # expressed the wrong way round because the message is
+                    # truncated for output
+                    self.__add_msg(_("ADDR element ignored '%s'")
+                                   % elmn, line, state)
+            # The free-form address ADDR is discarded
+
     def __parse_trailer(self):
         """
         Looks for the expected TRLR token
@@ -3810,7 +3872,7 @@ class GedcomParser(UpdateCallback):
 
     def __person_addr(self, line, state):
         """
-        Parses the Address structure
+        Parses the INDIvidual address structure
 
         n ADDR {0:1}
         +1 CONT {0:M}
@@ -3827,13 +3889,16
@@ class GedcomParser(UpdateCallback): @param state: The current state @type state: CurrentState """ - sub_state = CurrentState() - sub_state.level = state.level+1 + free_form = line.data + + sub_state = CurrentState(level=state.level + 1) sub_state.addr = Address() - sub_state.addr.set_street(line.data) - state.person.add_address(sub_state.addr) + self.__parse_level(sub_state, self.parse_addr_tbl, self.__ignore) state.msg += sub_state.msg + + self.__merge_address(free_form, sub_state.addr, line, state) + state.person.add_address(sub_state.addr) def __person_phon(self, line, state): """ @@ -4928,17 +4993,6 @@ class GedcomParser(UpdateCallback): """ self.__parse_change(line, state.family, state.level+1, state) - def __family_addr(self, line, state): - """ - @param line: The current line in GedLine format - @type line: GedLine - @param state: The current state - @type state: CurrentState - """ - state.addr = Address() - state.addr.set_street(line.data) - self.__parse_level(state, self.parse_addr_tbl, self.__ignore) - def __family_attr(self, line, state): """ @param line: The current line in GedLine format @@ -5294,15 +5348,18 @@ class GedcomParser(UpdateCallback): @param state: The current state @type state: CurrentState """ + free_form = line.data + sub_state = CurrentState(level=state.level+1) sub_state.location = Location() - sub_state.location.set_street(line.data) sub_state.note = [] sub_state.event = state.event self.__parse_level(sub_state, self.parse_loc_tbl, self.__undefined) state.msg += sub_state.msg + self.__merge_address(free_form, sub_state.location, line, state) + location = sub_state.location note_list = sub_state.note @@ -5703,6 +5760,7 @@ class GedcomParser(UpdateCallback): @type state: CurrentState """ # The ADDR may already have been parsed by the level above + assert state.addr.get_street() == "" if state.addr.get_street() != "": self.__add_msg(_("Warn: ADDR overwritten"), line, state) state.addr.set_street(line.data) @@ -5973,6 +6031,7 @@ class 
GedcomParser(UpdateCallback): state = CurrentState() state.source = self.__find_or_create_source(self.sid_map[name]) + # SOURce with the given gramps_id had no title state.source.set_title(_("No title - ID %s") % state.source.get_gramps_id()) state.level = level @@ -6436,6 +6495,8 @@ class GedcomParser(UpdateCallback): def __repo_addr(self, line, state): """ + Parses the REPOsitory and HEADer COPR + n ADDR {0:1} +1 CONT {0:M} +1 ADR1 {0:1} (Street) @@ -6450,46 +6511,16 @@ class GedcomParser(UpdateCallback): instead they put everything on a single line. Try to determine if this happened, and try to fix it. """ + free_form = line.data - addr = Address() - addr.set_street(line.data) - - sub_state = CurrentState() - sub_state.level = state.level+1 - sub_state.addr = addr + sub_state = CurrentState(level=state.level + 1) + sub_state.addr = Address() self.__parse_level(sub_state, self.parse_addr_tbl, self.__ignore) state.msg += sub_state.msg - - text = addr.get_street() - if not (addr.get_city() or addr.get_state() or - addr.get_postal_code() or addr.get_country()): - match = ADDR_RE.match(text) - if match: - groups = match.groups() - addr.set_street(groups[0].strip()) - addr.set_city(groups[2].strip()) - addr.set_state(groups[3].strip()) - addr.set_postal_code(groups[4].strip()) - addr.set_country(groups[5].strip()) - - match = ADDR2_RE.match(text) - if match: - groups = match.groups() - addr.set_street(groups[0].strip()) - addr.set_city(groups[2].strip()) - addr.set_state(groups[3].strip()) - addr.set_postal_code(groups[4].strip()) - - match = ADDR3_RE.match(text) - if match: - groups = match.groups() - addr.set_street(groups[0].strip()) - addr.set_city(groups[2].strip()) - addr.set_state(groups[3].strip()) - - state.repo.add_address(addr) + self.__merge_address(free_form, sub_state.addr, line, state) + state.repo.add_address(sub_state.addr) def __repo_phon(self, line, state): """ @@ -6526,22 +6557,6 @@ class GedcomParser(UpdateCallback): 
url.set_type(UrlType(UrlType.EMAIL)) state.repo.add_url(url) - def __location_addr(self, line, state): - """ - @param line: The current line in GedLine format - @type line: GedLine - @param state: The current state - @type state: CurrentState - """ - if not state.location: - state.location = Location() - val = state.location.get_street() - if val: - val = "%s, %s" % (val, line.data.strip()) - else: - val = line.data.strip() - state.location.set_street(val.replace('\n', ' ')) - def __location_date(self, line, state): """ @param line: The current line in GedLine format @@ -7394,20 +7409,20 @@ class GedcomParser(UpdateCallback): @param state: The current state @type state: CurrentState """ + free_form = line.data + sub_state = CurrentState(level=state.level + 1) - sub_state.location = Location() - sub_state.location.set_street(line.data) + sub_state.location = state.res self.__parse_level(sub_state, self.parse_loc_tbl, self.__undefined) state.msg += sub_state.msg - location = sub_state.location - state.res.set_address(location.get_street()) - state.res.set_locality(location.get_locality()) - state.res.set_city(location.get_city()) - state.res.set_state(location.get_state()) - state.res.set_country(location.get_country()) - state.res.set_postal_code(location.get_postal_code()) + self.__merge_address(free_form, state.res, line, state) + # Researcher is a sub-type of LocationBase, so get_street and set_street + # which are used in routines called from self.parse_loc_tbl work fine. + # Unfortunately, Researcher also has get_address and set_address, so we + # need to copy the street into that. + state.res.set_address(state.res.get_street()) def __subm_phon(self, line, state): """