gramps/src/plugins/tool/FindDupes.py
2009-10-24 13:53:20 +00:00

667 lines
21 KiB
Python

#
# Gramps - a GTK+/GNOME based genealogy program
#
# Copyright (C) 2000-2007 Donald N. Allingham
# Copyright (C) 2008 Brian G. Matherly
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# $Id$
"""Tools/Database Processing/Find Possible Duplicate People"""
#-------------------------------------------------------------------------
#
# GNOME libraries
#
#-------------------------------------------------------------------------
import gtk
#-------------------------------------------------------------------------
#
# GRAMPS modules
#
#-------------------------------------------------------------------------
import const
import gen.lib
from gui.utils import ProgressMeter
import soundex
from BasicUtils import name_displayer
from QuestionDialog import OkDialog
import ListModel
import Errors
from Merge import PersonCompare
import GrampsDisplay
import ManagedWindow
from PluginUtils import Tool
from QuestionDialog import ErrorDialog, RunDatabaseRepair
from TransUtils import sgettext as _
from glade import Glade
#-------------------------------------------------------------------------
#
# Constants
#
#-------------------------------------------------------------------------
# Threshold values offered in the tool dialog, keyed by the numeric score
# a pair of people must reach, mapped to the translated label shown to the
# user.
_val2label = dict([
    (0.25, _("Low")),
    (1.0, _("Medium")),
    (2.0, _("High")),
])

# Manual page and section opened by the Help buttons of both windows.
WIKI_HELP_PAGE = '%s_-_Tools' % (const.URL_MANUAL_PAGE,)
WIKI_HELP_SEC = _('manual|Find_Possible_Duplicate_People...')
#-------------------------------------------------------------------------
#
#
#
#-------------------------------------------------------------------------
def is_initial(name):
    """
    Tell whether a name token looks like an initial.

    A token counts as an initial when it is a single character that is
    unchanged by upper-casing ("J"), or two characters where the first is
    unchanged by upper-casing and the second is a period ("J.").

    name -- the token to test (a string; may be empty).

    Returns True or False.  Empty strings and tokens longer than two
    characters are never initials.
    """
    # Guard against the empty string: indexing name[0] would raise.
    if not name or len(name) > 2:
        return False
    if name[0] != name[0].upper():
        return False
    return len(name) == 1 or name[1] == '.'
#-------------------------------------------------------------------------
#
# The Actual tool.
#
#-------------------------------------------------------------------------
class Merge(Tool.Tool, ManagedWindow.ManagedWindow):
    """
    Settings dialog and matching engine of the duplicate-people finder.

    The dialog offers a match threshold and a SoundEx switch.  Pressing OK
    scores pairs of people that share a gender bucket and a surname key and
    opens a ShowMatches window when at least one pair reaches the threshold.

    Throughout the scoring methods a return value of -1 means "these two
    cannot be the same", 0 means "no evidence either way" and a positive
    number is evidence in favour of a match.
    """

    def __init__(self, dbstate, uistate, options_class, name, callback=None):
        """
        Build the settings dialog and show it.

        callback -- optional callable stored as self.update and handed on
                    to the ShowMatches window.
        """
        Tool.Tool.__init__(self, dbstate, options_class, name)
        ManagedWindow.ManagedWindow.__init__(self, uistate, [],
                                             self.__class__)
        self.dbstate = dbstate
        self.uistate = uistate
        self.map = {}
        self.list = []
        self.index = 0
        self.merger = None
        self.mergee = None
        self.removed = {}
        self.update = callback
        self.use_soundex = 1

        top = Glade()

        # retrieve options
        # NOTE(review): the saved 'threshold' option is written back in
        # on_merge_ok_clicked but is not used to preselect the menu here;
        # the menu always starts on its first entry.
        use_soundex = self.options.handler.options_dict['soundex']

        my_menu = gtk.ListStore(str, object)
        for val in sorted(_val2label):
            my_menu.append([_val2label[val], val])

        self.soundex_obj = top.get_object("soundex")
        self.soundex_obj.set_active(use_soundex)
        self.soundex_obj.show()

        self.menu = top.get_object("menu")
        self.menu.set_model(my_menu)
        self.menu.set_active(0)

        window = top.toplevel
        window.show()
        self.set_window(window, top.get_object('title'),
                        _('Find Possible Duplicate People'))

        top.connect_signals({
            "on_merge_ok_clicked"   : self.on_merge_ok_clicked,
            "destroy_passed_object" : self.close,
            "on_help_clicked"       : self.on_help_clicked,
            "on_delete_merge_event" : self.close,
            })

        self.show()

    def build_menu_names(self, obj):
        """Return the pair of menu labels used for this window."""
        return (_("Tool settings"),_("Find Duplicates tool"))

    def on_help_clicked(self, obj):
        """Display the relevant portion of GRAMPS manual"""
        GrampsDisplay.help(WIKI_HELP_PAGE , WIKI_HELP_SEC)

    def ancestors_of(self, p1_id, id_list):
        """
        Append p1_id and the handles of its ancestors to id_list.

        Ancestors are followed through each person's main parents family.
        A handle that is empty or already present in id_list ends that
        branch, which also stops the recursion on cyclic data.
        """
        if (not p1_id) or (p1_id in id_list):
            return
        id_list.append(p1_id)
        p1 = self.db.get_person_from_handle(p1_id)
        f1_id = p1.get_main_parents_family_handle()
        if f1_id:
            f1 = self.db.get_family_from_handle(f1_id)
            self.ancestors_of(f1.get_father_handle(), id_list)
            self.ancestors_of(f1.get_mother_handle(), id_list)

    def on_merge_ok_clicked(self, obj):
        """
        Run the search with the settings chosen in the dialog.

        An AttributeError raised during the search is reported through
        RunDatabaseRepair and aborts the run.  Otherwise the settings are
        saved and either a "no matches" dialog or the ShowMatches window
        is shown.
        """
        # Local import: sys.exc_info() avoids the "except X, e" versus
        # "except X as e" syntax split between Python versions.
        import sys

        threshold = self.menu.get_model()[self.menu.get_active()][1]
        self.use_soundex = int(self.soundex_obj.get_active())
        try:
            self.find_potentials(threshold)
        except AttributeError:
            RunDatabaseRepair(str(sys.exc_info()[1]))
            return

        self.options.handler.options_dict['threshold'] = threshold
        self.options.handler.options_dict['soundex'] = self.use_soundex
        # Save options
        self.options.handler.save_options()

        if not self.map:
            OkDialog(
                _("No matches found"),
                _("No potential duplicate people were found"))
        else:
            try:
                ShowMatches(self.dbstate,self.uistate,self.track,
                            self.list,self.map,self.update)
            except Errors.WindowActiveError:
                pass

    def find_potentials(self, thresh):
        """
        Fill self.map with candidate duplicates scoring at least thresh.

        self.map maps a person handle to a (other_handle, score) tuple;
        self.list receives the sorted keys of self.map and self.length
        their count.  A progress meter is shown while the two passes run
        and is closed even when the search raises.
        """
        self.progress = ProgressMeter(_('Find Duplicates'),
                                      _('Looking for duplicate people'))
        # Start every run from an empty result so that matches from an
        # earlier run (possibly with other settings) cannot leak in.  A
        # new dict is bound rather than clearing the old one because the
        # old one has been handed to ShowMatches.
        self.map = {}
        try:
            length = self.db.get_number_of_people()

            self.progress.set_pass(_('Pass 1: Building preliminary lists'),
                                   length)
            males, females = self._group_by_surname_key()

            self.progress.set_pass(_('Pass 2: Calculating potential matches'),
                                   length)
            self._score_candidates(males, females, thresh)

            self.list = sorted(self.map)
            self.length = len(self.list)
        finally:
            self.progress.close()

    def _group_by_surname_key(self):
        """Pass 1: bucket person handles by gender group and surname key."""
        males = {}
        females = {}
        for p1_id in self.db.iter_person_handles():
            self.progress.step()
            p1 = self.db.get_person_from_handle(p1_id)
            key = self.gen_key(p1.get_primary_name().get_surname())
            # Everyone who is not MALE shares the second bucket.
            if p1.get_gender() == gen.lib.Person.MALE:
                bucket = males
            else:
                bucket = females
            bucket.setdefault(key, []).append(p1_id)
        return males, females

    def _score_candidates(self, males, females, thresh):
        """Pass 2: compare each person with the others in the same bucket."""
        for p1key in self.db.iter_person_handles():
            self.progress.step()
            p1 = self.db.get_person_from_handle(p1key)
            key = self.gen_key(p1.get_primary_name().get_surname())
            if p1.get_gender() == gen.lib.Person.MALE:
                remaining = males[key]
            else:
                remaining = females[key]

            for p2key in remaining:
                if p1key == p2key:
                    continue
                # The pair is already recorded from the other side.
                if p2key in self.map and self.map[p2key][0] == p1key:
                    continue

                p2 = self.db.get_person_from_handle(p2key)
                chance = self.compare_people(p1, p2)
                if chance < thresh:
                    continue

                # NOTE(review): an existing entry is replaced only when
                # the new score is LOWER, so the weakest qualifying match
                # is kept.  This looks inverted but is preserved as-is;
                # confirm the intent before changing it.
                if p1key not in self.map or self.map[p1key][1] > chance:
                    self.map[p1key] = (p2key, chance)

    def gen_key(self, val):
        """
        Return the bucket key for a surname.

        With SoundEx enabled this is soundex.soundex(val), falling back to
        the surname itself when that raises UnicodeEncodeError; otherwise
        it is the surname unchanged.
        """
        if self.use_soundex:
            try:
                return soundex.soundex(val)
            except UnicodeEncodeError:
                return val
        return val

    def _event_or_blank(self, event_ref):
        """Return the referenced event, or an empty Event when ref is unset."""
        if event_ref:
            return self.db.get_event_from_handle(event_ref.ref)
        return gen.lib.Event()

    def _parent_name(self, person_handle):
        """Return the primary name for a handle, or None for no handle."""
        if person_handle:
            return get_name_obj(self.db.get_person_from_handle(person_handle))
        return None

    def compare_people(self, p1, p2):
        """
        Score how likely p1 and p2 are the same person.

        The score is the sum of the name, birth/death date, birth/death
        place, parent-name and spouse contributions.  Returns -1 as soon
        as the names, dates, places or parent names rule the pair out, or
        when one person is an ancestor of the other.
        """
        name1 = p1.get_primary_name()
        name2 = p2.get_primary_name()

        chance = self.name_match(name1, name2)
        if chance == -1:
            return -1

        birth1 = self._event_or_blank(p1.get_birth_ref())
        death1 = self._event_or_blank(p1.get_death_ref())
        birth2 = self._event_or_blank(p2.get_birth_ref())
        death2 = self._event_or_blank(p2.get_death_ref())

        # Each contribution is ADDED to the running score.  (The previous
        # "chance =+ value" spelling assigned +value and discarded what
        # had been accumulated so far.)
        value = self.date_match(birth1.get_date_object(),
                                birth2.get_date_object())
        if value == -1:
            return -1
        chance += value

        value = self.date_match(death1.get_date_object(),
                                death2.get_date_object())
        if value == -1:
            return -1
        chance += value

        value = self.place_match(birth1.get_place_handle(),
                                 birth2.get_place_handle())
        if value == -1:
            return -1
        chance += value

        value = self.place_match(death1.get_place_handle(),
                                 death2.get_place_handle())
        if value == -1:
            return -1
        chance += value

        # A person cannot be a duplicate of one of their own ancestors.
        ancestors = []
        self.ancestors_of(p1.get_handle(), ancestors)
        if p2.get_handle() in ancestors:
            return -1

        ancestors = []
        self.ancestors_of(p2.get_handle(), ancestors)
        if p1.get_handle() in ancestors:
            return -1

        # Compare the names of the parents when both have a parent family.
        f1_id = p1.get_main_parents_family_handle()
        f2_id = p2.get_main_parents_family_handle()

        if f1_id and f2_id:
            f1 = self.db.get_family_from_handle(f1_id)
            f2 = self.db.get_family_from_handle(f2_id)

            dad1 = self._parent_name(f1.get_father_handle())
            dad2 = self._parent_name(f2.get_father_handle())
            value = self.name_match(dad1, dad2)
            if value == -1:
                return -1
            chance += value

            mom1 = self._parent_name(f1.get_mother_handle())
            mom2 = self._parent_name(f2.get_mother_handle())
            value = self.name_match(mom1, mom2)
            if value == -1:
                return -1
            chance += value

        # Compare partners across every pair of families: the father of
        # the family when p1 is FEMALE, the mother otherwise.  A partner
        # mismatch is not a veto, it merely adds nothing.
        use_father = p1.get_gender() == gen.lib.Person.FEMALE
        for f1_id in p1.get_family_handle_list():
            f1 = self.db.get_family_from_handle(f1_id)
            for f2_id in p2.get_family_handle_list():
                f2 = self.db.get_family_from_handle(f2_id)
                if use_father:
                    spouse1_id = f1.get_father_handle()
                    spouse2_id = f2.get_father_handle()
                else:
                    spouse1_id = f1.get_mother_handle()
                    spouse2_id = f2.get_mother_handle()

                if not (spouse1_id and spouse2_id):
                    continue
                if spouse1_id == spouse2_id:
                    chance += 1
                    continue

                spouse1 = self.db.get_person_from_handle(spouse1_id)
                spouse2 = self.db.get_person_from_handle(spouse2_id)
                sname1 = get_name_obj(spouse1)
                sname2 = get_name_obj(spouse2)
                value = self.name_match(sname1, sname2)
                if value != -1:
                    chance += value
        return chance

    def name_compare(self, s1, s2):
        """
        Compare two name strings.

        With SoundEx enabled the result is soundex.compare(s1, s2), falling
        back to plain equality when that raises UnicodeEncodeError;
        otherwise it is plain equality.
        """
        if self.use_soundex:
            try:
                return soundex.compare(s1,s2)
            except UnicodeEncodeError:
                return s1 == s2
        return s1 == s2

    def date_match(self, date1, date2):
        """
        Score two date objects.

        Returns 0 when either date is empty, 1 when they are equal, the
        result of range_compare when either is compound, 0.75 when the
        years agree and the months either agree or one of them is not
        valid, and -1 otherwise.
        """
        if date1.is_empty() or date2.is_empty():
            return 0
        if date1.is_equal(date2):
            return 1

        if date1.is_compound() or date2.is_compound():
            return self.range_compare(date1,date2)

        if date1.get_year() != date2.get_year():
            return -1
        if date1.get_month() == date2.get_month():
            return 0.75
        if not date1.get_month_valid() or not date2.get_month_valid():
            return 0.75
        return -1

    def range_compare(self, date1, date2):
        """
        Score two dates of which at least one is expected to be compound.

        Returns 0.5 when the ranges overlap (both compound) or when the
        simple date's start falls inside the compound date's range, and -1
        otherwise.

        NOTE(review): the first three items of the start/stop values are
        compared as sequences, element by element in stored order; confirm
        that this order is most-significant first.
        """
        start_date_1 = date1.get_start_date()[0:3]
        start_date_2 = date2.get_start_date()[0:3]
        stop_date_1 = date1.get_stop_date()[0:3]
        stop_date_2 = date2.get_stop_date()[0:3]
        if date1.is_compound() and date2.is_compound():
            if start_date_2 <= start_date_1 <= stop_date_2 or \
               start_date_1 <= start_date_2 <= stop_date_1 or \
               start_date_2 <= stop_date_1 <= stop_date_2 or \
               start_date_1 <= stop_date_2 <= stop_date_1:
                return 0.5
            else:
                return -1
        elif date2.is_compound():
            if start_date_2 <= start_date_1 <= stop_date_2:
                return 0.5
            else:
                return -1
        else:
            if start_date_1 <= start_date_2 <= stop_date_1:
                return 0.5
            else:
                return -1

    def name_match(self, name, name1):
        """
        Score two name objects.

        Returns 0 when either name is missing, -1 when the surnames do not
        compare equal or both carry different non-empty suffixes, 1 when
        the first names are identical, and otherwise the list_reduce score
        of the first names split into words.
        """
        if not name1 or not name:
            return 0

        srn1 = name.get_surname()
        sfx1 = name.get_suffix()
        srn2 = name1.get_surname()
        sfx2 = name1.get_suffix()

        if not self.name_compare(srn1,srn2):
            return -1
        # A suffix on only one side is tolerated; two different ones are not.
        if sfx1 != sfx2 and sfx1 != "" and sfx2 != "":
            return -1

        if name.get_first_name() == name1.get_first_name():
            return 1

        list1 = name.get_first_name().split()
        list2 = name1.get_first_name().split()
        # list_reduce is always called with the shorter list first.
        if len(list1) < len(list2):
            return self.list_reduce(list1,list2)
        return self.list_reduce(list2,list1)

    def place_match(self, p1_id, p2_id):
        """
        Score two place handles.

        Returns 1 for identical handles or identical titles, 0 when either
        title is missing, and otherwise a word-by-word score of the titles
        capped at 1, or -1 when no word matches at all.
        """
        if p1_id == p2_id:
            return 1

        if not p1_id:
            name1 = ""
        else:
            p1 = self.db.get_place_from_handle(p1_id)
            name1 = p1.get_title()

        if not p2_id:
            name2 = ""
        else:
            p2 = self.db.get_place_from_handle(p2_id)
            name2 = p2.get_title()

        if not (name1 and name2):
            return 0
        if name1 == name2:
            return 1

        list1 = name1.replace(","," ").split()
        list2 = name2.replace(","," ").split()

        value = 0
        for word1 in list1:
            for word2 in list2:
                if word1 == word2:
                    value += 0.5
                elif word1[0] == word2[0] and self.name_compare(word1, word2):
                    value += 0.25
        return min(value,1) if value else -1

    def list_reduce(self, list1, list2):
        """
        Score two lists of first-name words against each other.

        Every pair of words adds 0.25 when one is an initial with the same
        first character as the other, 0.5 when the words are equal, and
        0.25 when they share the first character and compare equal through
        name_compare.  Returns the sum capped at 1, or -1 when it is 0.
        """
        value = 0
        for name in list1:
            for name2 in list2:
                if is_initial(name) and name[0] == name2[0]:
                    value += 0.25
                elif is_initial(name2) and name2[0] == name[0]:
                    value += 0.25
                elif name == name2:
                    value += 0.5
                elif name[0] == name2[0] and self.name_compare(name, name2):
                    value += 0.25
        return min(value,1) if value else -1
class ShowMatches(ManagedWindow.ManagedWindow):
    """
    Window listing candidate duplicate pairs and starting merges.

    Each row shows a rating and the display names of the two people; the
    selected pair is handed to PersonCompare.
    """

    def __init__(self, dbstate, uistate, track, the_list, the_map, callback):
        """
        Build the list window and fill it from the_map.

        the_list -- sequence of candidate handles; only its length is kept
                    (self.list is rebound to the ListModel further down).
        the_map  -- dict mapping a person handle to (other_handle, score).
        callback -- callable invoked after a merge, or None.
        """
        ManagedWindow.ManagedWindow.__init__(self,uistate,track,self.__class__)

        # Maps the handle recorded as merged away to the handle it was
        # paired with; see on_update.
        self.dellist = {}
        self.list = the_list
        self.map = the_map
        self.length = len(self.list)
        self.update = callback
        self.db = dbstate.db
        self.dbstate = dbstate
        self.uistate = uistate

        top = Glade(toplevel="mergelist")
        window = top.toplevel
        window.show()
        self.set_window(window, top.get_object('title'),
                        _('Potential Merges'))

        self.mlist = top.get_object("mlist")
        top.connect_signals({
            "destroy_passed_object" : self.close,
            "on_do_merge_clicked"   : self.on_do_merge_clicked,
            "on_help_show_clicked"  : self.on_help_clicked,
            "on_delete_show_event"  : self.close,
            })

        mtitles = [
            (_('Rating'),3,75),
            (_('First Person'),1,200),
            (_('Second Person'),2,200),
            ('',-1,0)
            ]
        self.list = ListModel.ListModel(self.mlist,mtitles,
                                        event_func=self.on_do_merge_clicked)

        self.redraw()
        self.show()

    def build_menu_names(self, obj):
        """Return the pair of menu labels used for this window."""
        return (_("Merge candidates"),None)

    def on_help_clicked(self, obj):
        """Display the relevant portion of GRAMPS manual"""
        GrampsDisplay.help(WIKI_HELP_PAGE , WIKI_HELP_SEC)

    def redraw(self):
        """
        Rebuild the rows of the list from self.map.

        Pairs whose first handle is in self.dellist, pairs of a person
        with themselves, and pairs for which either person can no longer
        be loaded are left out.
        """
        rows = []
        for p1key, (p2key, c) in self.map.items():
            if p1key in self.dellist:
                continue
            if p1key == p2key:
                continue
            rows.append((c, p1key, p2key))

        self.list.clear()
        for (c, p1key, p2key) in rows:
            c1 = "%5.2f" % c
            # Fourth row value; the 'Rating' title names column 3,
            # presumably as its sort column -- verify against ListModel.
            c2 = "%5.2f" % (100-c)
            p1 = self.db.get_person_from_handle(p1key)
            p2 = self.db.get_person_from_handle(p2key)
            if not p1 or not p2:
                continue
            pn1 = name_displayer.display(p1)
            pn2 = name_displayer.display(p2)
            self.list.add([c1, pn1, pn2,c2],(p1key,p2key))

    def on_do_merge_clicked(self, obj):
        """Open PersonCompare for the selected pair; no-op without selection."""
        store, node = self.list.selection.get_selected()
        if not node:
            return

        (self.p1,self.p2) = self.list.get_object(node)
        pn1 = self.db.get_person_from_handle(self.p1)
        pn2 = self.db.get_person_from_handle(self.p2)
        PersonCompare(self.dbstate,self.uistate,pn1,pn2,self.on_update)

    def on_update(self):
        """
        Record the last pair as handled and refresh the list.

        self.p2 is entered in self.dellist as pointing at self.p1, and
        every entry that pointed at self.p2 is redirected to self.p1.
        """
        self.dellist[self.p2] = self.p1
        # Only values of existing keys are reassigned, so the dict does
        # not change size while it is being walked.
        for key, data in self.dellist.items():
            if data == self.p2:
                self.dellist[key] = self.p1
        # The callback is optional (the tool defaults it to None).
        if self.update:
            self.update()
        self.redraw()

    def update_and_destroy(self, obj):
        """Invoke the callback (when there is one) with 1, then close."""
        if self.update:
            self.update(1)
        self.close()
#-------------------------------------------------------------------------
#
#
#
#-------------------------------------------------------------------------
def name_of(p):
    """
    Return "<display name> (<handle>)" for a person.

    A false value for p (for instance None) gives the empty string.
    """
    if p:
        return "%s (%s)" % (name_displayer.display(p),p.get_handle())
    return ""
def get_name_obj(person):
    """Return the primary name of person, or None when person is false."""
    return person.get_primary_name() if person else None
#-------------------------------------------------------------------------
#
#
#
#-------------------------------------------------------------------------
def by_id(p1,p2):
    """
    Three-way comparison of two objects by their handles.

    Returns -1, 0 or 1 when the handle of p1 is respectively smaller than,
    equal to or greater than the handle of p2.
    """
    handle1 = p1.get_handle()
    handle2 = p2.get_handle()
    # Spelled out instead of cmp(), which newer Python versions no longer
    # provide; the result is the same.
    return (handle1 > handle2) - (handle1 < handle2)
#------------------------------------------------------------------------
#
#
#
#------------------------------------------------------------------------
class MergeOptions(Tool.ToolOptions):
    """
    Option set of the duplicate-people finder.

    Declares the 'soundex' and 'threshold' options with their defaults and
    help entries.
    """

    def __init__(self, name,person_id=None):
        Tool.ToolOptions.__init__(self, name,person_id)

        # Defaults for the options this tool understands
        self.options_dict = dict(soundex=1, threshold=0.25)

        # Help entry for each option
        soundex_help = ("=0/1","Whether to use SoundEx codes",
                        ["Do not use SoundEx","Use SoundEx"],
                        True)
        threshold_help = ("=num","Threshold for tolerance",
                          "Floating point number")
        self.options_help = {
            'soundex'   : soundex_help,
            'threshold' : threshold_help,
            }