# # Gramps - a GTK+/GNOME based genealogy program # # Copyright (C) 2000-2007 Donald N. Allingham # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # $Id$ "Database Processing/Merge people" #------------------------------------------------------------------------- # # standard python models # #------------------------------------------------------------------------- import os from gettext import gettext as _ #------------------------------------------------------------------------- # # GNOME libraries # #------------------------------------------------------------------------- import gtk.glade #------------------------------------------------------------------------- # # GRAMPS modules # #------------------------------------------------------------------------- import gen.lib import Utils import soundex from BasicUtils import name_displayer import ListModel import Errors from Merge import PersonCompare import GrampsDisplay import ManagedWindow from PluginUtils import Tool, register_tool #------------------------------------------------------------------------- # # Constants # #------------------------------------------------------------------------- _val2label = { 0.25 : _("Low"), 1.0 : _("Medium"), 2.0 : _("High"), } #------------------------------------------------------------------------- # # # #------------------------------------------------------------------------- def is_initial(name): if len(name) > 2: return 0 elif len(name) == 2: if name[0] == name[0].upper() and name[1] == '.': return 1 else: return name[0] == name[0].upper() #------------------------------------------------------------------------- # # # #------------------------------------------------------------------------- class Merge(Tool.Tool,ManagedWindow.ManagedWindow): def __init__(self, dbstate, uistate, options_class, name, callback=None): Tool.Tool.__init__(self, dbstate, options_class, name) ManagedWindow.ManagedWindow.__init__(self, uistate, [], self.__class__) self.dbstate = dbstate self.uistate = uistate self.map = {} self.list = [] self.index = 0 self.merger = None self.mergee = None self.removed = {} self.update = callback self.use_soundex = 1 base = os.path.dirname(__file__) self.glade_file = "%s/%s" % (base,"merge.glade") top = gtk.glade.XML(self.glade_file,"dialog","gramps") # retrieve options threshold = self.options.handler.options_dict['threshold'] use_soundex = self.options.handler.options_dict['soundex'] my_menu = gtk.Menu() vals = _val2label.keys() vals.sort() for val in vals: item = gtk.MenuItem(_val2label[val]) item.set_data("v",val) item.show() my_menu.append(item) my_menu.set_active(vals.index(threshold)) self.soundex_obj = top.get_widget("soundex") self.soundex_obj.set_active(use_soundex) self.soundex_obj.show() self.menu = top.get_widget("menu") self.menu.set_menu(my_menu) window = top.get_widget('dialog') self.set_window(window, top.get_widget('title'), _('Find possible duplicate people')) top.signal_autoconnect({ "on_merge_ok_clicked" : self.on_merge_ok_clicked, "destroy_passed_object" : self.close, "on_help_clicked" : self.on_help_clicked, "on_delete_merge_event" : self.close, }) self.show() def build_menu_names(self,obj): return (_("Tool settings"),_("Find Duplicates tool")) def on_help_clicked(self,obj): """Display the relevant portion of GRAMPS manual""" GrampsDisplay.help('tools-db') def ancestors_of(self,p1_id,id_list): if (not p1_id) or (p1_id in id_list): return id_list.append(p1_id) p1 = self.db.get_person_from_handle(p1_id) f1_id = p1.get_main_parents_family_handle() if f1_id: f1 = self.db.get_family_from_handle(f1_id) self.ancestors_of(f1.get_father_handle(),id_list) self.ancestors_of(f1.get_mother_handle(),id_list) def on_merge_ok_clicked(self,obj): threshold = self.menu.get_menu().get_active().get_data("v") self.use_soundex = int(self.soundex_obj.get_active()) try: self.find_potentials(threshold) except AttributeError, msg: import QuestionDialog QuestionDialog.RunDatabaseRepair(str(msg)) return self.options.handler.options_dict['threshold'] = threshold self.options.handler.options_dict['soundex'] = self.use_soundex # Save options self.options.handler.save_options() if len(self.map) == 0: import QuestionDialog QuestionDialog.ErrorDialog( _("No matches found"), _("No potential duplicate people were found")) else: try: ShowMatches(self.dbstate,self.uistate,self.track, self.list,self.map,self.update) except Errors.WindowActiveError: pass def find_potentials(self,thresh): self.progress = Utils.ProgressMeter(_('Find duplicates'), _('Looking for duplicate people')) index = 0 males = {} females = {} self.person_list = self.db.get_person_handles(sort_handles=False) length = len(self.person_list) self.progress.set_pass(_('Pass 1: Building preliminary lists'), length) for p1_id in self.person_list: self.progress.step() p1 = self.db.get_person_from_handle(p1_id) key = self.gen_key(p1.get_primary_name().get_surname()) if p1.get_gender() == gen.lib.Person.MALE: if males.has_key(key): males[key].append(p1_id) else: males[key] = [p1_id] else: if females.has_key(key): females[key].append(p1_id) else: females[key] = [p1_id] self.progress.set_pass(_('Pass 2: Calculating potential matches'), length) for p1key in self.person_list: self.progress.step() p1 = self.db.get_person_from_handle(p1key) key = self.gen_key(p1.get_primary_name().get_surname()) if p1.get_gender() == gen.lib.Person.MALE: remaining = males[key] else: remaining = females[key] index = 0 for p2key in remaining: index = index + 1 if p1key == p2key: continue p2 = self.db.get_person_from_handle(p2key) if self.map.has_key(p2key): (v,c) = self.map[p2key] if v == p1key: continue chance = self.compare_people(p1,p2) if chance >= thresh: if self.map.has_key(p1key): val = self.map[p1key] if val[1] > chance: self.map[p1key] = (p2key,chance) else: self.map[p1key] = (p2key,chance) self.list = self.map.keys() self.list.sort() self.length = len(self.list) self.progress.close() def gen_key(self,val): if self.use_soundex: try: return soundex.soundex(val) except UnicodeEncodeError: return val else: return val def compare_people(self,p1,p2): name1 = p1.get_primary_name() name2 = p2.get_primary_name() chance = self.name_match(name1,name2) if chance == -1 : return -1 birth1_ref = p1.get_birth_ref() if birth1_ref: birth1 = self.db.get_event_from_handle(birth1_ref.ref) else: birth1 = gen.lib.Event() death1_ref = p1.get_death_ref() if death1_ref: death1 = self.db.get_event_from_handle(death1_ref.ref) else: death1 = gen.lib.Event() birth2_ref = p2.get_birth_ref() if birth2_ref: birth2 = self.db.get_event_from_handle(birth2_ref.ref) else: birth2 = gen.lib.Event() death2_ref = p2.get_death_ref() if death2_ref: death2 = self.db.get_event_from_handle(death2_ref.ref) else: death2 = gen.lib.Event() value = self.date_match(birth1.get_date_object(), birth2.get_date_object()) if value == -1 : return -1 chance = chance + value value = self.date_match(death1.get_date_object(), death2.get_date_object()) if value == -1 : return -1 chance = chance + value value = self.place_match(birth1.get_place_handle(), birth2.get_place_handle()) if value == -1 : return -1 chance = chance + value value = self.place_match(death1.get_place_handle(), death2.get_place_handle()) if value == -1 : return -1 chance = chance + value ancestors = [] self.ancestors_of(p1.get_handle(),ancestors) if p2.get_handle() in ancestors: return -1 ancestors = [] self.ancestors_of(p2.get_handle(),ancestors) if p1.get_handle() in ancestors: return -1 f1_id = p1.get_main_parents_family_handle() f2_id = p2.get_main_parents_family_handle() if f1_id and f2_id: f1 = self.db.get_family_from_handle(f1_id) f2 = self.db.get_family_from_handle(f2_id) dad1_id = f1.get_father_handle() if dad1_id: dad1 = get_name_obj(self.db.get_person_from_handle(dad1_id)) else: dad1 = None dad2_id = f2.get_father_handle() if dad2_id: dad2 = get_name_obj(self.db.get_person_from_handle(dad2_id)) else: dad2 = None value = self.name_match(dad1,dad2) if value == -1: return -1 chance = chance + value mom1_id = f1.get_mother_handle() if mom1_id: mom1 = get_name_obj(self.db.get_person_from_handle(mom1_id)) else: mom1 = None mom2_id = f2.get_mother_handle() if mom2_id: mom2 = get_name_obj(self.db.get_person_from_handle(mom2_id)) else: mom2 = None value = self.name_match(mom1,mom2) if value == -1: return -1 chance = chance + value for f1_id in p1.get_family_handle_list(): f1 = self.db.get_family_from_handle(f1_id) for f2_id in p2.get_family_handle_list(): f2 = self.db.get_family_from_handle(f2_id) if p1.get_gender() == gen.lib.Person.FEMALE: father1_id = f1.get_father_handle() father2_id = f2.get_father_handle() if father1_id and father2_id: if father1_id == father2_id: chance = chance + 1 else: father1 = self.db.get_person_from_handle(father1_id) father2 = self.db.get_person_from_handle(father2_id) fname1 = get_name_obj(father1) fname2 = get_name_obj(father2) value = self.name_match(fname1,fname2) if value != -1: chance = chance + value else: mother1_id = f1.get_mother_handle() mother2_id = f2.get_mother_handle() if mother1_id and mother2_id: if mother1_id == mother2_id: chance = chance + 1 else: mother1 = self.db.get_person_from_handle(mother1_id) mother2 = self.db.get_person_from_handle(mother2_id) mname1 = get_name_obj(mother1) mname2 = get_name_obj(mother2) value = self.name_match(mname1,mname2) if value != -1: chance = chance + value return chance def name_compare(self,s1,s2): if self.use_soundex: try: return soundex.compare(s1,s2) except UnicodeEncodeError: return s1 == s2 else: return s1 == s2 def date_match(self,date1,date2): if date1.is_empty() or date2.is_empty(): return 0 if date1.is_equal(date2): return 1 if date1.is_compound() or date2.is_compound(): return self.range_compare(date1,date2) if date1.get_year() == date2.get_year(): if date1.get_month() == date2.get_month(): return 0.75 if not date1.get_month_valid() or not date2.get_month_valid(): return 0.75 else: return -1 else: return -1 def range_compare(self,date1,date2): start_date_1 = date1.get_start_date()[0:3] start_date_2 = date2.get_start_date()[0:3] stop_date_1 = date1.get_stop_date()[0:3] stop_date_2 = date2.get_stop_date()[0:3] if date1.is_compound() and date2.is_compound(): if start_date_1 >= start_date_2 and start_date_1 <= stop_date_2 or \ start_date_2 >= start_date_1 and start_date_2 <= stop_date_1 or \ stop_date_1 >= start_date_2 and stop_date_1 <= stop_date_2 or \ stop_date_2 >= start_date_1 and stop_date_2 <= stop_date_1: return 0.5 else: return -1 elif date2.is_compound(): if start_date_1 >= start_date_2 and start_date_1 <= stop_date_2: return 0.5 else: return -1 else: if start_date_2 >= start_date_1 and start_date_2 <= stop_date_1: return 0.5 else: return -1 def name_match(self,name,name1): if not name1 or not name: return 0 srn1 = name.get_surname() sfx1 = name.get_suffix() srn2 = name1.get_surname() sfx2 = name1.get_suffix() if not self.name_compare(srn1,srn2): return -1 if sfx1 != sfx2: if sfx1 != "" and sfx2 != "": return -1 if name.get_first_name() == name1.get_first_name(): return 1 else: list1 = name.get_first_name().split() list2 = name1.get_first_name().split() if len(list1) < len(list2): return self.list_reduce(list1,list2) else: return self.list_reduce(list2,list1) def place_match(self,p1_id,p2_id): if p1_id == p2_id: return 1 if not p1_id: name1 = "" else: p1 = self.db.get_place_from_handle(p1_id) name1 = p1.get_title() if not p2_id: name2 = "" else: p2 = self.db.get_place_from_handle(p2_id) name2 = p2.get_title() if not (name1 and name2): return 0 if name1 == name2: return 1 list1 = name1.replace(","," ").split() list2 = name2.replace(","," ").split() value = 0 for name in list1: for name2 in list2: if name == name2: value = value + 0.5 break if name[0] == name2[0] and self.name_compare(name,name2): value = value + 0.25 break if value == 0: return -1 else: return min(value,1) def list_reduce(self,list1,list2): value = 0 for name in list1: for name2 in list2: if is_initial(name) and name[0] == name2[0]: value = value + 0.25 break if is_initial(name2) and name2[0] == name[0]: value = value + 0.25 break if name == name2: value = value + 0.5 break if name[0] == name2[0] and self.name_compare(name,name2): value = value + 0.25 break if value == 0: return -1 else: return min(value,1) class ShowMatches(ManagedWindow.ManagedWindow): def __init__(self,dbstate,uistate,track,the_list,the_map,callback): ManagedWindow.ManagedWindow.__init__(self,uistate,track,self.__class__) self.dellist = {} self.list = the_list self.map = the_map self.length = len(self.list) self.update = callback self.db = dbstate.db self.dbstate = dbstate self.uistate = uistate base = os.path.dirname(__file__) self.glade_file = "%s/%s" % (base,"merge.glade") top = gtk.glade.XML(self.glade_file,"mergelist","gramps") window = top.get_widget("mergelist") self.set_window(window, top.get_widget('title'), _('Potential Merges')) self.mlist = top.get_widget("mlist") top.signal_autoconnect({ "destroy_passed_object" : self.close, "on_do_merge_clicked" : self.on_do_merge_clicked, "on_help_show_clicked" : self.on_help_clicked, "on_delete_show_event" : self.close, }) mtitles = [(_('Rating'),3,75),(_('First Person'),1,200), (_('Second Person'),2,200),('',-1,0)] self.list = ListModel.ListModel(self.mlist,mtitles, event_func=self.on_do_merge_clicked) self.redraw() self.show() def build_menu_names(self,obj): return (_("Merge candidates"),None) def on_help_clicked(self,obj): """Display the relevant portion of GRAMPS manual""" GrampsDisplay.help('tools-db') def redraw(self): list = [] for p1key in self.map.keys(): if self.dellist.has_key(p1key): continue (p2key,c) = self.map[p1key] if p1key == p2key: continue list.append((c,p1key,p2key)) self.list.clear() for (c,p1key,p2key) in list: c1 = "%5.2f" % c c2 = "%5.2f" % (100-c) p1 = self.db.get_person_from_handle(p1key) p2 = self.db.get_person_from_handle(p2key) if not p1 or not p2: continue pn1 = name_displayer.display(p1) pn2 = name_displayer.display(p2) self.list.add([c1, pn1, pn2,c2],(p1key,p2key)) def on_do_merge_clicked(self,obj): store,iter = self.list.selection.get_selected() if not iter: return (self.p1,self.p2) = self.list.get_object(iter) pn1 = self.db.get_person_from_handle(self.p1) pn2 = self.db.get_person_from_handle(self.p2) PersonCompare(self.dbstate,self.uistate,pn1,pn2,self.on_update) def on_update(self): self.dellist[self.p2] = self.p1 for key in self.dellist.keys(): if self.dellist[key] == self.p2: self.dellist[key] = self.p1 self.update() self.redraw() def update_and_destroy(self,obj): self.update(1) self.close() #------------------------------------------------------------------------- # # # #------------------------------------------------------------------------- def name_of(p): if not p: return "" return "%s (%s)" % (name_displayer.display(p),p.get_handle()) def get_name_obj(person): if person: return person.get_primary_name() else: return None #------------------------------------------------------------------------- # # # #------------------------------------------------------------------------- def by_id(p1,p2): return cmp(p1.get_handle(),p2.get_handle()) #------------------------------------------------------------------------ # # # #------------------------------------------------------------------------ class MergeOptions(Tool.ToolOptions): """ Defines options and provides handling interface. """ def __init__(self,name,person_id=None): Tool.ToolOptions.__init__(self,name,person_id) # Options specific for this report self.options_dict = { 'soundex' : 1, 'threshold' : 0.25, } self.options_help = { 'soundex' : ("=0/1","Whether to use SoundEx codes", ["Do not use SoundEx","Use SoundEx"], True), 'threshold' : ("=num","Threshold for tolerance", "Floating point number") } #------------------------------------------------------------------------- # # # #------------------------------------------------------------------------- register_tool( name = 'dupfind', category = Tool.TOOL_DBPROC, tool_class = Merge, options_class = MergeOptions, modes = Tool.MODE_GUI, translated_name = _("Find possible duplicate people"), status = _("Stable"), author_name = "Donald N. Allingham", author_email = "don@gramps-project.org", description=_("Searches the entire database, looking for " "individual entries that may represent the same person.") )