# # Gramps - a GTK+/GNOME based genealogy program # # Copyright (C) 2000-2007 Donald N. Allingham # Copyright (C) 2008 Brian G. Matherly # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # $Id$ """Tools/Database Processing/Find Possible Duplicate People""" #------------------------------------------------------------------------- # # GNOME libraries # #------------------------------------------------------------------------- import gtk #------------------------------------------------------------------------- # # GRAMPS modules # #------------------------------------------------------------------------- import const import gen.lib import Utils import soundex from BasicUtils import name_displayer from QuestionDialog import OkDialog import ListModel import Errors from Merge import PersonCompare import GrampsDisplay import ManagedWindow from PluginUtils import Tool from gen.plug import PluginManager from QuestionDialog import ErrorDialog, RunDatabaseRepair from TransUtils import sgettext as _ from glade import Glade #------------------------------------------------------------------------- # # Constants # #------------------------------------------------------------------------- _val2label = { 0.25 : _("Low"), 1.0 : _("Medium"), 2.0 : _("High"), } WIKI_HELP_PAGE = '%s_-_Tools' % const.URL_MANUAL_PAGE WIKI_HELP_SEC = _('manual|Find_Possible_Duplicate_People...') #------------------------------------------------------------------------- # # # #------------------------------------------------------------------------- def is_initial(name): if len(name) > 2: return 0 elif len(name) == 2: if name[0] == name[0].upper() and name[1] == '.': return 1 else: return name[0] == name[0].upper() #------------------------------------------------------------------------- # # The Actual tool. # #------------------------------------------------------------------------- class Merge(Tool.Tool,ManagedWindow.ManagedWindow): def __init__(self, dbstate, uistate, options_class, name, callback=None): Tool.Tool.__init__(self, dbstate, options_class, name) ManagedWindow.ManagedWindow.__init__(self, uistate, [], self.__class__) self.dbstate = dbstate self.uistate = uistate self.map = {} self.list = [] self.index = 0 self.merger = None self.mergee = None self.removed = {} self.update = callback self.use_soundex = 1 top = Glade() # retrieve options threshold = self.options.handler.options_dict['threshold'] use_soundex = self.options.handler.options_dict['soundex'] my_menu = gtk.ListStore(str, object) vals = _val2label.keys() vals.sort() for val in vals: my_menu.append([_val2label[val], val]) self.soundex_obj = top.get_object("soundex") self.soundex_obj.set_active(use_soundex) self.soundex_obj.show() self.menu = top.get_object("menu") self.menu.set_model(my_menu) self.menu.set_active(0) window = top.toplevel window.show() self.set_window(window, top.get_object('title'), _('Find Possible Duplicate People')) top.connect_signals({ "on_merge_ok_clicked" : self.on_merge_ok_clicked, "destroy_passed_object" : self.close, "on_help_clicked" : self.on_help_clicked, "on_delete_merge_event" : self.close, }) self.show() def build_menu_names(self, obj): return (_("Tool settings"),_("Find Duplicates tool")) def on_help_clicked(self, obj): """Display the relevant portion of GRAMPS manual""" GrampsDisplay.help(WIKI_HELP_PAGE , WIKI_HELP_SEC) def ancestors_of(self,p1_id,id_list): if (not p1_id) or (p1_id in id_list): return id_list.append(p1_id) p1 = self.db.get_person_from_handle(p1_id) f1_id = p1.get_main_parents_family_handle() if f1_id: f1 = self.db.get_family_from_handle(f1_id) self.ancestors_of(f1.get_father_handle(),id_list) self.ancestors_of(f1.get_mother_handle(),id_list) def on_merge_ok_clicked(self, obj): threshold = self.menu.get_model()[self.menu.get_active()][1] self.use_soundex = int(self.soundex_obj.get_active()) try: self.find_potentials(threshold) except AttributeError, msg: RunDatabaseRepair(str(msg)) return self.options.handler.options_dict['threshold'] = threshold self.options.handler.options_dict['soundex'] = self.use_soundex # Save options self.options.handler.save_options() if len(self.map) == 0: OkDialog( _("No matches found"), _("No potential duplicate people were found")) else: try: ShowMatches(self.dbstate,self.uistate,self.track, self.list,self.map,self.update) except Errors.WindowActiveError: pass def find_potentials(self,thresh): self.progress = Utils.ProgressMeter(_('Find Duplicates'), _('Looking for duplicate people')) index = 0 males = {} females = {} self.person_list = self.db.get_person_handles(sort_handles=False) length = len(self.person_list) self.progress.set_pass(_('Pass 1: Building preliminary lists'), length) for p1_id in self.person_list: self.progress.step() p1 = self.db.get_person_from_handle(p1_id) key = self.gen_key(p1.get_primary_name().get_surname()) if p1.get_gender() == gen.lib.Person.MALE: if key in males: males[key].append(p1_id) else: males[key] = [p1_id] else: if key in females: females[key].append(p1_id) else: females[key] = [p1_id] self.progress.set_pass(_('Pass 2: Calculating potential matches'), length) for p1key in self.person_list: self.progress.step() p1 = self.db.get_person_from_handle(p1key) key = self.gen_key(p1.get_primary_name().get_surname()) if p1.get_gender() == gen.lib.Person.MALE: remaining = males[key] else: remaining = females[key] index = 0 for p2key in remaining: index += 1 if p1key == p2key: continue p2 = self.db.get_person_from_handle(p2key) if p2key in self.map: (v,c) = self.map[p2key] if v == p1key: continue chance = self.compare_people(p1,p2) if chance >= thresh: if p1key in self.map: val = self.map[p1key] if val[1] > chance: self.map[p1key] = (p2key,chance) else: self.map[p1key] = (p2key,chance) self.list = self.map.keys() self.list.sort() self.length = len(self.list) self.progress.close() def gen_key(self,val): if self.use_soundex: try: return soundex.soundex(val) except UnicodeEncodeError: return val else: return val def compare_people(self,p1,p2): name1 = p1.get_primary_name() name2 = p2.get_primary_name() chance = self.name_match(name1, name2) if chance == -1 : return -1 birth1_ref = p1.get_birth_ref() if birth1_ref: birth1 = self.db.get_event_from_handle(birth1_ref.ref) else: birth1 = gen.lib.Event() death1_ref = p1.get_death_ref() if death1_ref: death1 = self.db.get_event_from_handle(death1_ref.ref) else: death1 = gen.lib.Event() birth2_ref = p2.get_birth_ref() if birth2_ref: birth2 = self.db.get_event_from_handle(birth2_ref.ref) else: birth2 = gen.lib.Event() death2_ref = p2.get_death_ref() if death2_ref: death2 = self.db.get_event_from_handle(death2_ref.ref) else: death2 = gen.lib.Event() value = self.date_match(birth1.get_date_object(), birth2.get_date_object()) if value == -1 : return -1 chance =+ value value = self.date_match(death1.get_date_object(), death2.get_date_object()) if value == -1 : return -1 chance =+ value value = self.place_match(birth1.get_place_handle(), birth2.get_place_handle()) if value == -1 : return -1 chance =+ value value = self.place_match(death1.get_place_handle(), death2.get_place_handle()) if value == -1 : return -1 chance =+ value ancestors = [] self.ancestors_of(p1.get_handle(),ancestors) if p2.get_handle() in ancestors: return -1 ancestors = [] self.ancestors_of(p2.get_handle(),ancestors) if p1.get_handle() in ancestors: return -1 f1_id = p1.get_main_parents_family_handle() f2_id = p2.get_main_parents_family_handle() if f1_id and f2_id: f1 = self.db.get_family_from_handle(f1_id) f2 = self.db.get_family_from_handle(f2_id) dad1_id = f1.get_father_handle() if dad1_id: dad1 = get_name_obj(self.db.get_person_from_handle(dad1_id)) else: dad1 = None dad2_id = f2.get_father_handle() if dad2_id: dad2 = get_name_obj(self.db.get_person_from_handle(dad2_id)) else: dad2 = None value = self.name_match(dad1,dad2) if value == -1: return -1 chance += value mom1_id = f1.get_mother_handle() if mom1_id: mom1 = get_name_obj(self.db.get_person_from_handle(mom1_id)) else: mom1 = None mom2_id = f2.get_mother_handle() if mom2_id: mom2 = get_name_obj(self.db.get_person_from_handle(mom2_id)) else: mom2 = None value = self.name_match(mom1,mom2) if value == -1: return -1 chance += value for f1_id in p1.get_family_handle_list(): f1 = self.db.get_family_from_handle(f1_id) for f2_id in p2.get_family_handle_list(): f2 = self.db.get_family_from_handle(f2_id) if p1.get_gender() == gen.lib.Person.FEMALE: father1_id = f1.get_father_handle() father2_id = f2.get_father_handle() if father1_id and father2_id: if father1_id == father2_id: chance += 1 else: father1 = self.db.get_person_from_handle(father1_id) father2 = self.db.get_person_from_handle(father2_id) fname1 = get_name_obj(father1) fname2 = get_name_obj(father2) value = self.name_match(fname1,fname2) if value != -1: chance += value else: mother1_id = f1.get_mother_handle() mother2_id = f2.get_mother_handle() if mother1_id and mother2_id: if mother1_id == mother2_id: chance += 1 else: mother1 = self.db.get_person_from_handle(mother1_id) mother2 = self.db.get_person_from_handle(mother2_id) mname1 = get_name_obj(mother1) mname2 = get_name_obj(mother2) value = self.name_match(mname1,mname2) if value != -1: chance += value return chance def name_compare(self,s1,s2): if self.use_soundex: try: return soundex.compare(s1,s2) except UnicodeEncodeError: return s1 == s2 else: return s1 == s2 def date_match(self,date1,date2): if date1.is_empty() or date2.is_empty(): return 0 if date1.is_equal(date2): return 1 if date1.is_compound() or date2.is_compound(): return self.range_compare(date1,date2) if date1.get_year() == date2.get_year(): if date1.get_month() == date2.get_month(): return 0.75 if not date1.get_month_valid() or not date2.get_month_valid(): return 0.75 else: return -1 else: return -1 def range_compare(self,date1,date2): start_date_1 = date1.get_start_date()[0:3] start_date_2 = date2.get_start_date()[0:3] stop_date_1 = date1.get_stop_date()[0:3] stop_date_2 = date2.get_stop_date()[0:3] if date1.is_compound() and date2.is_compound(): if start_date_2 <= start_date_1 <= stop_date_2 or \ start_date_1 <= start_date_2 <= stop_date_1 or \ start_date_2 <= stop_date_1 <= stop_date_2 or \ start-date_1 <= stop_date_2 <= stop_date_1: return 0.5 else: return -1 elif date2.is_compound(): if start_date_2 <= start_date_1 <= stop_date_2: return 0.5 else: return -1 else: if start_date_1 <= start_date_2 <= stop_date_1: return 0.5 else: return -1 def name_match(self, name, name1): if not name1 or not name: return 0 srn1 = name.get_surname() sfx1 = name.get_suffix() srn2 = name1.get_surname() sfx2 = name1.get_suffix() if not self.name_compare(srn1,srn2): return -1 if sfx1 != sfx2: if sfx1 != "" and sfx2 != "": return -1 if name.get_first_name() == name1.get_first_name(): return 1 else: list1 = name.get_first_name().split() list2 = name1.get_first_name().split() if len(list1) < len(list2): return self.list_reduce(list1,list2) else: return self.list_reduce(list2,list1) def place_match(self,p1_id,p2_id): if p1_id == p2_id: return 1 if not p1_id: name1 = "" else: p1 = self.db.get_place_from_handle(p1_id) name1 = p1.get_title() if not p2_id: name2 = "" else: p2 = self.db.get_place_from_handle(p2_id) name2 = p2.get_title() if not (name1 and name2): return 0 if name1 == name2: return 1 list1 = name1.replace(","," ").split() list2 = name2.replace(","," ").split() value = 0 for name in list1: for name2 in list2: if name == name2: value += 0.5 elif name[0] == name2[0] and self.name_compare(name, name2): value += 0.25 return min(value,1) if value else -1 def list_reduce(self,list1,list2): value = 0 for name in list1: for name2 in list2: if is_initial(name) and name[0] == name2[0]: value += 0.25 elif is_initial(name2) and name2[0] == name[0]: value += 0.25 elif name == name2: value += 0.5 elif name[0] == name2[0] and self.name_compare(name, name2): value += 0.25 return min(value,1) if value else -1 class ShowMatches(ManagedWindow.ManagedWindow): def __init__(self,dbstate,uistate,track,the_list,the_map,callback): ManagedWindow.ManagedWindow.__init__(self,uistate,track,self.__class__) self.dellist = {} self.list = the_list self.map = the_map self.length = len(self.list) self.update = callback self.db = dbstate.db self.dbstate = dbstate self.uistate = uistate top = Glade(toplevel="mergelist") window = top.toplevel window.show() self.set_window(window, top.get_object('title'), _('Potential Merges')) self.mlist = top.get_object("mlist") top.connect_signals({ "destroy_passed_object" : self.close, "on_do_merge_clicked" : self.on_do_merge_clicked, "on_help_show_clicked" : self.on_help_clicked, "on_delete_show_event" : self.close, }) mtitles = [ (_('Rating'),3,75), (_('First Person'),1,200), (_('Second Person'),2,200), ('',-1,0) ] self.list = ListModel.ListModel(self.mlist,mtitles, event_func=self.on_do_merge_clicked) self.redraw() self.show() def build_menu_names(self, obj): return (_("Merge candidates"),None) def on_help_clicked(self, obj): """Display the relevant portion of GRAMPS manual""" GrampsDisplay.help(WIKI_HELP_PAGE , WIKI_HELP_SEC) def redraw(self): list = [] for p1key in self.map.keys(): if p1key in self.dellist: continue (p2key,c) = self.map[p1key] if p1key == p2key: continue list.append((c,p1key,p2key)) self.list.clear() for (c,p1key,p2key) in list: c1 = "%5.2f" % c c2 = "%5.2f" % (100-c) p1 = self.db.get_person_from_handle(p1key) p2 = self.db.get_person_from_handle(p2key) if not p1 or not p2: continue pn1 = name_displayer.display(p1) pn2 = name_displayer.display(p2) self.list.add([c1, pn1, pn2,c2],(p1key,p2key)) def on_do_merge_clicked(self, obj): store,iter = self.list.selection.get_selected() if not iter: return (self.p1,self.p2) = self.list.get_object(iter) pn1 = self.db.get_person_from_handle(self.p1) pn2 = self.db.get_person_from_handle(self.p2) PersonCompare(self.dbstate,self.uistate,pn1,pn2,self.on_update) def on_update(self): self.dellist[self.p2] = self.p1 for key in self.dellist.keys(): if self.dellist[key] == self.p2: self.dellist[key] = self.p1 self.update() self.redraw() def update_and_destroy(self, obj): self.update(1) self.close() #------------------------------------------------------------------------- # # # #------------------------------------------------------------------------- def name_of(p): if not p: return "" return "%s (%s)" % (name_displayer.display(p),p.get_handle()) def get_name_obj(person): if person: return person.get_primary_name() else: return None #------------------------------------------------------------------------- # # # #------------------------------------------------------------------------- def by_id(p1,p2): return cmp(p1.get_handle(),p2.get_handle()) #------------------------------------------------------------------------ # # # #------------------------------------------------------------------------ class MergeOptions(Tool.ToolOptions): """ Defines options and provides handling interface. """ def __init__(self, name,person_id=None): Tool.ToolOptions.__init__(self, name,person_id) # Options specific for this report self.options_dict = { 'soundex' : 1, 'threshold' : 0.25, } self.options_help = { 'soundex' : ("=0/1","Whether to use SoundEx codes", ["Do not use SoundEx","Use SoundEx"], True), 'threshold' : ("=num","Threshold for tolerance", "Floating point number") } #------------------------------------------------------------------------- # # # #------------------------------------------------------------------------- pmgr = PluginManager.get_instance() pmgr.register_tool( name = 'dupfind', category = Tool.TOOL_DBPROC, tool_class = Merge, options_class = MergeOptions, modes = PluginManager.TOOL_MODE_GUI, translated_name = _("Find Possible Duplicate People"), status = _("Stable"), author_name = "Donald N. Allingham", author_email = "don@gramps-project.org", description = _("Searches the entire database, looking for " "individual entries that may represent the same person.") )