gramps/src/plugins/ImportCsv.py

800 lines
33 KiB
Python

#
# Gramps - a GTK+/GNOME based genealogy program
#
# Copyright (C) 2007 Douglas S. Blank
# Copyright (C) 2000-2007 Donald N. Allingham
# Copyright (C) 2008 Raphael Ackerman
# Copyright (C) 2008 Brian G. Matherly
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# $Id$
"Import from CSV Spreadsheet"
#-------------------------------------------------------------------------
#
# Standard Python Modules
#
#-------------------------------------------------------------------------
import time
from gettext import gettext as _
import csv
import codecs
import cStringIO
#------------------------------------------------------------------------
#
# Set up logging
#
#------------------------------------------------------------------------
import logging
log = logging.getLogger(".ImportCSV")
#-------------------------------------------------------------------------
#
# GRAMPS modules
#
#-------------------------------------------------------------------------
import gen.lib
from QuestionDialog import ErrorDialog
from DateHandler import parser as _dp
from gen.plug import PluginManager, ImportPlugin
from Utils import gender as gender_map
from Utils import ProgressMeter
#-------------------------------------------------------------------------
#
# Encoding support for CSV, from http://docs.python.org/lib/csv-examples.html
#
#-------------------------------------------------------------------------
class UTF8Recoder:
    """
    Wrap a byte stream written in some encoding and hand it back, one line
    at a time, as UTF-8 encoded byte strings.
    """

    def __init__(self, f, encoding):
        # codecs.getreader() returns a StreamReader factory for *encoding*;
        # calling it on the raw stream yields a decoding reader.
        reader_factory = codecs.getreader(encoding)
        self.reader = reader_factory(f)

    def next(self):
        # Pull the next decoded line, then re-encode it as UTF-8.
        decoded_line = self.reader.next()
        return decoded_line.encode("utf-8")

    def __iter__(self):
        return self
class UnicodeReader:
    """
    Iterate over the rows of CSV file "f", which is stored in the given
    encoding, yielding each row as a list of unicode strings.
    """

    def __init__(self, f, encoding="utf-8", **kwds):
        # The csv module is fed UTF-8 bytes; UTF8Recoder converts the
        # source encoding on the fly.
        utf8_stream = UTF8Recoder(f, encoding)
        self.reader = csv.reader(utf8_stream, **kwds)

    def __iter__(self):
        return self

    def next(self):
        # Decode every UTF-8 cell of the next row back into unicode.
        cells = self.reader.next()
        return [unicode(cell, "utf-8") for cell in cells]
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, **kwds)
self.stream = f
self.encoder = codecs.getencoder(encoding)
def writerow(self, row):
self.writer.writerow([s.encode('utf-8') for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode('utf-8')
# ... and reencode it into the target encoding
data, length = self.encoder(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
def close(self):
self.stream.close()
#-------------------------------------------------------------------------
#
# Support and main functions
#
#-------------------------------------------------------------------------
def rd(line_number, row, col, key, default = None):
    """
    Return row data by column name.

    line_number -- line of the CSV file, used only in the warning message
    row         -- list of cell strings for the current line
    col         -- dict mapping a column name to its index in a row
    key         -- column name to fetch
    default     -- value returned when the column is unknown, the row is
                   too short to contain it, or the cell is blank

    The cell is returned stripped of surrounding whitespace.
    """
    if key not in col:
        return default
    index = col[key]
    if index >= len(row):
        # Short row: the header promised this column but the line ends early.
        print("Warning: missing '%s' on line %d" % (key, line_number))
        return default
    value = row[index].strip()
    if value == "":
        return default
    return value
def cleanup_column_name(column):
    """
    Handle column aliases for CSV spreadsheet import and SQL.

    Returns the canonical lower-case column name for a recognised alias
    (English or translated, Title-case or lower-case) and returns *column*
    unchanged when it is not recognised. Matching is exact: no case
    folding or whitespace stripping is applied.
    """
    # Ordered (canonical name, accepted spellings) pairs. The first entry
    # that contains the column wins, so the order matters if a translation
    # happens to collide with another alias. The _() calls are kept literal
    # so the strings remain visible to message-extraction tools.
    aliases = (
        # Title case:
        ("surname", ("Lastname",
                     "Surname", _("Surname"))),
        ("firstname", ("Firstname",
                       "Given name", _("Given name"),
                       "Given", _("Given"))),
        ("callname", ("Callname",
                      "Call name", _("Call name"),
                      "Call", _("Call"))),
        ("title", ("Title", _("Title"))),
        ("prefix", ("Prefix", _("Prefix"))),
        ("suffix", ("Suffix", _("Suffix"))),
        ("gender", ("Gender", _("Gender"))),
        ("source", ("Source", _("Source"))),
        ("note", ("Note", _("Note"))),
        ("birthplace", ("Birthplace",
                        "Birth place", _("Birth place"))),
        ("birthdate", ("Birthdate",
                       "Birth date", _("Birth date"))),
        ("birthsource", ("Birthsource",
                         "Birth source", _("Birth source"))),
        ("deathplace", ("Deathplace",
                        "Death place", _("Death place"))),
        ("deathdate", ("Deathdate",
                       "Death date", _("Death date"))),
        ("deathsource", ("Deathsource",
                         "Death source", _("Death source"))),
        ("deathcause", ("Deathcause",
                        "Death cause", _("Death cause"))),
        ("grampsid", ("Grampsid", "ID",
                      "Gramps id", _("Gramps id"))),
        ("person", ("Person", _("Person"))),
        # ----------------------------------
        ("child", ("Child", _("Child"))),
        ("family", ("Family", _("Family"))),
        # ----------------------------------
        ("wife", ("Mother", _("Mother"),
                  "Wife", _("Wife"),
                  "Parent2", _("Parent2"))),
        ("husband", ("Father", _("Father"),
                     "Husband", _("Husband"),
                     "Parent1", _("Parent1"))),
        ("marriage", ("Marriage", _("Marriage"))),
        ("date", ("Date", _("Date"))),
        ("place", ("Place", _("Place"))),
        # lowercase
        ("surname", ("lastname", "last_name",
                     "surname", _("surname"))),
        ("firstname", ("firstname", "first_name", "given_name",
                       "given name", _("given name"),
                       "given", _("given"))),
        ("callname", ("callname", "call_name",
                      "call name",
                      "call", _("call"))),
        ("title", ("title", _("title"))),
        ("prefix", ("prefix", _("prefix"))),
        ("suffix", ("suffix", _("suffix"))),
        ("gender", ("gender", _("gender"))),
        ("source", ("source", _("source"))),
        ("note", ("note", _("note"))),
        ("birthplace", ("birthplace", "birth_place",
                        "birth place", _("birth place"))),
        ("birthdate", ("birthdate", "birth_date",
                       "birth date", _("birth date"))),
        ("birthsource", ("birthsource", "birth_source",
                         "birth source", _("birth source"))),
        ("deathplace", ("deathplace", "death_place",
                        "death place", _("death place"))),
        ("deathdate", ("deathdate", "death_date",
                       "death date", _("death date"))),
        ("deathsource", ("deathsource", "death_source",
                         "death source", _("death source"))),
        ("deathcause", ("deathcause", "death_cause",
                        "death cause", _("death cause"))),
        ("grampsid", ("grampsid", "id", "gramps_id",
                      "gramps id", _("gramps id"))),
        ("person", ("person", _("person"))),
        # ----------------------------------
        ("child", ("child", _("child"))),
        ("family", ("family", _("family"))),
        # ----------------------------------
        ("wife", ("mother", _("mother"),
                  "wife", _("wife"),
                  "parent2", _("parent2"))),
        ("husband", ("father", _("father"),
                     "husband", _("husband"),
                     "parent1", _("parent1"))),
        ("marriage", ("marriage", _("marriage"))),
        ("date", ("date", _("date"))),
        ("place", ("place", _("place"))),
    )
    for canonical, spellings in aliases:
        if column in spellings:
            return canonical
    #----------------------------------------------------
    return column
def importData(db, filename, callback=None):
    """
    Plugin entry point: build a CSVParser for *filename* and run its
    process() method against database *db*.
    """
    CSVParser(db, filename, callback).process()
#-------------------------------------------------------------------------
#
# CSV Parser
#
#-------------------------------------------------------------------------
class CSVParser:
    """
    Import people, marriages and family/child links from a CSV file.

    The file may hold several "tables" separated by blank lines. The first
    non-blank line of each table is its header; the header's column names
    decide whether the following rows are treated as marriage rows, family
    (child) rows or person rows.

    References written in square brackets, e.g. "[I0001]", are looked up in
    the database by gramps id; any other reference is local to the sheet and
    resolved through the pref/fref dictionaries.
    """

    def __init__(self, db, filename, callback):
        self.db = db
        self.filename = filename
        self.callback = callback
        self.debug = 0
        # Populated again at the start of every process() run; initialised
        # here so lookup()/storeup() are usable on a fresh instance.
        self.trans = None
        self.fam_count = 0
        self.indi_count = 0
        self.pref = {} # person ref, internal to this sheet
        self.fref = {} # family ref, internal to this sheet

    def readCSV(self):
        """
        Read the whole CSV file.

        Returns a list of rows, each a list of whitespace-stripped cells,
        or None after showing an ErrorDialog when the file cannot be opened
        or parsed. The file is always closed before returning.
        """
        # Local import: sys.exc_info() is used instead of "except X as e"
        # or "except X, e" so the same source works across Python versions.
        import sys
        try:
            fp = open(self.filename, "rb")
        except IOError:
            msg = sys.exc_info()[1]
            errmsg = _("%s could not be opened\n") % self.filename
            ErrorDialog(errmsg, str(msg))
            return None
        try:
            reader = UnicodeReader(fp)
            try:
                return [[r.strip() for r in row] for row in reader]
            except csv.Error:
                err = sys.exc_info()[1]
                # line_num lives on the wrapped csv reader, not on the
                # UnicodeReader wrapper itself.
                ErrorDialog(_('format error: file %s, line %d: %s') %
                            (self.filename, reader.reader.line_num, err))
                return None
        finally:
            fp.close()

    def lookup(self, type, id):
        """
        Resolve a reference to a "person" or "family" object.

        A bracketed id is first looked up in the database by gramps id; if
        the database has no such object the unbracketed id is retried as a
        sheet-local reference. Returns None when nothing matches, when id
        is None, or when type is not "person"/"family".
        """
        if id is None:
            return None
        if type == "family":
            getter_name, local_refs = "get_family_from_gramps_id", self.fref
        elif type == "person":
            getter_name, local_refs = "get_person_from_gramps_id", self.pref
        else:
            print("error: invalid lookup type in CSV import: '%s'" % type)
            return None
        if id.startswith("[") and id.endswith("]"):
            id = id[1:-1]
            db_lookup = getattr(self.db, getter_name)(id)
            if db_lookup is None:
                # Not in the database: fall back to the sheet-local refs.
                return self.lookup(type, id)
            return db_lookup
        return local_refs.get(id.lower())

    def storeup(self, type, id, object):
        """
        Remember *object* under the sheet-local reference *id* (brackets
        removed, case-insensitive) for later lookup() calls.
        """
        if id.startswith("[") and id.endswith("]"):
            id = id[1:-1]
            #return # do not store gramps people; go look them up
        if type == "person":
            self.pref[id.lower()] = object
        elif type == "family":
            self.fref[id.lower()] = object
        else:
            print("error: invalid storeup type in CSV import: '%s'" % type)

    def process(self):
        """
        Read the CSV file and import every row inside one batch
        transaction. Returns None; does nothing beyond closing the progress
        dialog when the file could not be read.
        """
        progress = ProgressMeter(_('CSV Import'))
        progress.set_pass(_('Reading data...'), 1)
        data = self.readCSV()
        if data is None:
            # readCSV() has already told the user what went wrong.
            progress.close()
            return None
        progress.set_pass(_('Importing data...'), len(data))
        self.trans = self.db.transaction_begin("",batch=True)
        self.db.disable_signals()
        t = time.time()
        self.lineno = 0
        self.index = 0
        self.fam_count = 0
        self.indi_count = 0
        self.pref = {} # person ref, internal to this sheet
        self.fref = {} # family ref, internal to this sheet
        header = None
        col = {}
        line_number = 0
        for row in data:
            progress.step()
            line_number += 1
            if "".join(row) == "": # no blanks are allowed inside a table
                header = None # clear headers, ready for next "table"
                continue
            ######################################
            if header is None:
                header = [cleanup_column_name(r) for r in row]
                # column name -> index; a repeated name keeps its last index
                col = dict((key, index) for index, key in enumerate(header))
                continue
            # three different kinds of data: person, family, and marriage
            if (("marriage" in header) or
                ("husband" in header) or
                ("wife" in header)):
                self._process_marriage_row(line_number, row, col)
            elif "family" in header:
                self._process_family_row(line_number, row, col)
            elif "surname" in header: # person data
                self._process_person_row(line_number, row, col)
            else:
                print("Warning: ignoring line %d" % line_number)
        t = time.time() - t
        msg = _('Import Complete: %d seconds') % t
        self.db.transaction_commit(self.trans,_("CSV import"))
        self.db.enable_signals()
        self.db.request_rebuild()
        print(msg)
        print("New Families: %d" % self.fam_count)
        print("New Individuals: %d" % self.indi_count)
        progress.close()
        return None

    def _process_marriage_row(self, line_number, row, col):
        """ Import one row of a marriage table: marriage, husband, wife """
        marriage_ref = rd(line_number, row, col, "marriage")
        husband = rd(line_number, row, col, "husband")
        wife = rd(line_number, row, col, "wife")
        marriagedate = rd(line_number, row, col, "date")
        marriageplace = rd(line_number, row, col, "place")
        marriagesource = rd(line_number, row, col, "source")
        note = rd(line_number, row, col, "note")
        wife = self.lookup("person", wife)
        husband = self.lookup("person", husband)
        if husband is None and wife is None:
            # might have children, so go ahead and add
            print("Warning: no parents on line %d; adding family anyway"
                  % line_number)
        family = self.get_or_create_family(marriage_ref, husband, wife)
        # adjust gender, if not already provided
        if husband:
            # this is just a guess, if unknown
            if husband.get_gender() == gen.lib.Person.UNKNOWN:
                husband.set_gender(gen.lib.Person.MALE)
                self.db.commit_person(husband, self.trans)
        if wife:
            # this is just a guess, if unknown
            if wife.get_gender() == gen.lib.Person.UNKNOWN:
                wife.set_gender(gen.lib.Person.FEMALE)
                self.db.commit_person(wife, self.trans)
        if marriage_ref:
            self.storeup("family", marriage_ref.lower(), family)
        if marriagesource:
            # add, if new
            marriagesource = self.get_or_create_source(marriagesource)[1]
        if marriageplace:
            # add, if new
            marriageplace = self.get_or_create_place(marriageplace)[1]
        if marriagedate:
            marriagedate = _dp.parse(marriagedate)
        if marriagedate or marriageplace or marriagesource:
            # add, if new; replace, if different
            new, marriage = self.get_or_create_event(
                family, gen.lib.EventType.MARRIAGE,
                marriagedate, marriageplace, marriagesource)
            if new:
                mar_ref = gen.lib.EventRef()
                mar_ref.set_reference_handle(marriage.get_handle())
                family.add_event_ref(mar_ref)
                self.db.commit_family(family, self.trans)
            # only add note to event: without a marriage event there is
            # nothing to attach the note to.
            if note:
                self._append_note(marriage, note)
                self.db.commit_event(marriage, self.trans)

    def _process_family_row(self, line_number, row, col):
        """ Import one row of a family table: family, child """
        family_ref = rd(line_number, row, col, "family")
        if family_ref is None:
            print("Error: no family reference found for family on line %d"
                  % line_number)
            return # required
        child = rd(line_number, row, col, "child")
        source = rd(line_number, row, col, "source")
        note = rd(line_number, row, col, "note")
        gender = rd(line_number, row, col, "gender")
        child = self.lookup("person", child)
        family = self.lookup("family", family_ref)
        if family is None:
            print("Error: no matching family reference found for family "
                  "on line %d" % line_number)
            return
        if child is None:
            print("Error: no matching child reference found for family "
                  "on line %d" % line_number)
            return
        # is this child already in this family? If so, don't add
        child_handles = [ref.ref for ref in family.get_child_ref_list()]
        if self.debug:
            print("children: %s" % (child_handles,))
            print("looking for: %s" % (child.get_handle(),))
        if child.get_handle() not in child_handles:
            # add child to family
            if self.debug:
                print(" adding child to family %s %s"
                      % (child.get_gramps_id(), family.get_gramps_id()))
            childref = gen.lib.ChildRef()
            childref.set_reference_handle(child.get_handle())
            family.add_child_ref(childref)
            self.db.commit_family(family, self.trans)
            child.add_parent_family_handle(family.get_handle())
        if gender:
            # replace
            child.set_gender(self._parse_gender(gender))
        if source:
            # add, if new
            source = self.get_or_create_source(source)[1]
            self._add_source_ref(child, source, "child: looking for ref:")
        # put note on child
        if note:
            if self.debug:
                print(" previous note: %s" % (child.get_note(),))
            self._append_note(child, note)
        self.db.commit_person(child, self.trans)

    def _process_person_row(self, line_number, row, col):
        """ Import one row of a person table: surname plus optional data """
        surname = rd(line_number, row, col, "surname")
        firstname = rd(line_number, row, col, "firstname", "")
        callname = rd(line_number, row, col, "callname")
        title = rd(line_number, row, col, "title")
        prefix = rd(line_number, row, col, "prefix")
        suffix = rd(line_number, row, col, "suffix")
        gender = rd(line_number, row, col, "gender")
        source = rd(line_number, row, col, "source")
        note = rd(line_number, row, col, "note")
        birthplace = rd(line_number, row, col, "birthplace")
        birthdate = rd(line_number, row, col, "birthdate")
        birthsource = rd(line_number, row, col, "birthsource")
        deathplace = rd(line_number, row, col, "deathplace")
        deathdate = rd(line_number, row, col, "deathdate")
        deathsource = rd(line_number, row, col, "deathsource")
        deathcause = rd(line_number, row, col, "deathcause")
        grampsid = rd(line_number, row, col, "grampsid")
        person_ref = rd(line_number, row, col, "person")
        #########################################################
        # if this person already exists, don't create them
        person = self.lookup("person", person_ref)
        if person is None:
            if surname is None:
                print("Warning: empty surname for new person on line %d"
                      % line_number)
                surname = ""
            # new person
            person = self.create_person(firstname, surname)
            name = gen.lib.Name()
            name.set_type(gen.lib.NameType(gen.lib.NameType.BIRTH))
            name.set_first_name(firstname)
            name.set_surname(surname)
            person.set_primary_name(name)
        else:
            name = person.get_primary_name()
        #########################################################
        if person_ref is not None:
            self.storeup("person", person_ref, person)
        # replace
        if callname is not None:
            name.set_call_name(callname)
        if title is not None:
            name.set_title(title)
        if prefix is not None:
            name.prefix = prefix
            name.group_as = '' # HELP? what should I do here?
        if suffix is not None:
            name.set_suffix(suffix)
        if note is not None:
            self._append_note(person, note)
        if grampsid is not None:
            person.gramps_id = grampsid
        elif person_ref is not None:
            if person_ref.startswith("[") and person_ref.endswith("]"):
                person.gramps_id = person_ref[1:-1]
        if (person.get_gender() == gen.lib.Person.UNKNOWN
                and gender is not None):
            person.set_gender(self._parse_gender(gender))
        #########################################################
        # add if new, replace if different
        if birthdate is not None:
            birthdate = _dp.parse(birthdate)
        if birthplace is not None:
            birthplace = self.get_or_create_place(birthplace)[1]
        if birthsource is not None:
            birthsource = self.get_or_create_source(birthsource)[1]
        if birthdate or birthplace or birthsource:
            birth = self.get_or_create_event(
                person, gen.lib.EventType.BIRTH,
                birthdate, birthplace, birthsource)[1]
            birth_ref = person.get_birth_ref()
            if birth_ref is None:
                # new
                birth_ref = gen.lib.EventRef()
            birth_ref.set_reference_handle(birth.get_handle())
            person.set_birth_ref(birth_ref)
        if deathdate is not None:
            deathdate = _dp.parse(deathdate)
        if deathplace is not None:
            deathplace = self.get_or_create_place(deathplace)[1]
        if deathsource is not None:
            deathsource = self.get_or_create_source(deathsource)[1]
        if deathdate or deathplace or deathsource or deathcause:
            death = self.get_or_create_event(
                person, gen.lib.EventType.DEATH,
                deathdate, deathplace, deathsource)[1]
            if deathcause:
                death.set_description(deathcause)
                self.db.commit_event(death, self.trans)
            death_ref = person.get_death_ref()
            if death_ref is None:
                # new
                death_ref = gen.lib.EventRef()
            death_ref.set_reference_handle(death.get_handle())
            person.set_death_ref(death_ref)
        if source:
            # add, if new
            source = self.get_or_create_source(source)[1]
            self._add_source_ref(person, source, "person: looking for ref:")
        self.db.commit_person(person, self.trans)

    def _parse_gender(self, text):
        """ Map a gender cell to a gen.lib.Person gender constant """
        text = text.lower()
        if text == gender_map[gen.lib.Person.MALE]:
            return gen.lib.Person.MALE
        if text == gender_map[gen.lib.Person.FEMALE]:
            return gen.lib.Person.FEMALE
        return gen.lib.Person.UNKNOWN

    def _append_note(self, target, note):
        """
        Set *note* on target, appending it to any existing note text.

        Existing text that already contains *note* is left untouched, so
        importing the same sheet twice neither duplicates nor truncates it.
        """
        # NOTE(review): relies on target offering get_note()/set_note(), as
        # the code this helper replaces did -- confirm against gen.lib.
        previous_notes = target.get_note()
        if previous_notes == "":
            target.set_note(note)
        elif note not in previous_notes:
            target.set_note(previous_notes + "\n" + note)

    def _add_source_ref(self, target, source, debug_label):
        """ Reference source from target unless it is referenced already """
        handle = source.get_handle()
        found = False
        for ref in target.get_source_references():
            if self.debug:
                print("%s %s %s" % (debug_label, ref.ref, handle))
            if ref.ref == handle:
                found = True
        if not found:
            sref = gen.lib.SourceRef()
            sref.set_reference_handle(handle)
            target.add_source_reference(sref)

    def get_or_create_family(self, family_ref, husband, wife):
        """
        Return the family for family_ref, creating it when necessary.

        family_ref may be None (no marriage reference on the row). When it
        is a bracketed gramps id of an existing family, that family is
        returned with husband/wife filled in where they differ; otherwise a
        new family is added to the database and counted in fam_count.
        """
        if self.debug: print("get_or_create_family")
        gramps_id = None
        if (family_ref is not None and
                family_ref.startswith("[") and family_ref.endswith("]")):
            gramps_id = family_ref[1:-1]
        # if a gramps_id and exists:
        if gramps_id is not None:
            family = self.db.get_family_from_gramps_id(gramps_id)
            if family:
                # don't delete, only add
                fam_husband_handle = family.get_father_handle()
                fam_wife_handle = family.get_mother_handle()
                if husband:
                    if husband.get_handle() != fam_husband_handle:
                        # this husband is not the same old one! Add him!
                        family.set_father_handle(husband.get_handle())
                if wife:
                    if wife.get_handle() != fam_wife_handle:
                        # this wife is not the same old one! Add her!
                        family.set_mother_handle(wife.get_handle())
                if self.debug: print(" returning existing family")
                return family
        # if not, create one:
        family = gen.lib.Family()
        # was marked with a gramps_id, but didn't exist, so we'll use it:
        if gramps_id is not None:
            family.set_gramps_id(gramps_id)
        # add it:
        self.db.add_family(family, self.trans)
        if husband:
            family.set_father_handle(husband.get_handle())
            husband.add_family_handle(family.get_handle())
        if wife:
            family.set_mother_handle(wife.get_handle())
            wife.add_family_handle(family.get_handle())
        self.db.commit_family(family,self.trans)
        if husband:
            self.db.commit_person(husband, self.trans)
        if wife:
            self.db.commit_person(wife, self.trans)
        self.fam_count += 1
        return family

    def get_or_create_event(self, object, type, date=None, place=None, source=None):
        """
        Add or find a type event on object.

        Returns (0, event) when an event of that type was already
        referenced by object (it is updated with any date, place and source
        given) and (1, event) when a new event was created. A new event is
        added to the database but not attached to object; that is left to
        the caller.
        """
        # first, see if it exists
        if self.debug: print("get_or_create_event")
        ref_list = object.get_event_ref_list()
        if self.debug: print("refs: %s" % (ref_list,))
        # look for a match, and possible correction
        for ref in ref_list:
            event = self.db.get_event_from_handle(ref.ref)
            if self.debug:
                print(" compare event type %s %s"
                      % (int(event.get_type()), type))
            if int(event.get_type()) == type:
                # Match! Let's update
                if date:
                    event.set_date_object(date)
                if place:
                    event.set_place_handle(place.get_handle())
                if source:
                    self._add_source_ref(
                        event, source,
                        "get_or_create_event: looking for ref:")
                self.db.commit_event(event,self.trans)
                if self.debug: print(" returning existing event")
                return (0, event)
        # else create it:
        if self.debug: print(" creating event")
        event = gen.lib.Event()
        if type:
            event.set_type(gen.lib.EventType(type))
        if date:
            event.set_date_object(date)
        if place:
            event.set_place_handle(place.get_handle())
        if source:
            self._add_source_ref(event, source, "looking for ref:")
        self.db.add_event(event,self.trans)
        self.db.commit_event(event,self.trans)
        return (1, event)

    def create_person(self,firstname,lastname):
        """
        Used to create a new person we know doesn't exist.

        The names are not stored here; the caller sets the primary name on
        the returned person.
        """
        person = gen.lib.Person()
        self.db.add_person(person,self.trans)
        self.db.commit_person(person,self.trans)
        self.indi_count += 1
        return person

    def get_or_create_place(self,place_name):
        """
        Return (0, place) for the first database place titled place_name,
        or (1, place) after adding a new place with that title.
        """
        place_list = self.db.get_place_handles()
        if self.debug:
            print("get_or_create_place: list: %s" % (place_list,))
            print("get_or_create_place: looking for: %s" % (place_name,))
        for place_handle in place_list:
            place = self.db.get_place_from_handle(place_handle)
            if place.get_title() == place_name:
                return (0, place)
        place = gen.lib.Place()
        place.set_title(place_name)
        self.db.add_place(place,self.trans)
        self.db.commit_place(place,self.trans)
        return (1, place)

    def get_or_create_source(self, source_text):
        """
        Return (0, source) for the first database source titled
        source_text, or (1, source) after adding a new source with that
        title.
        """
        source_list = self.db.get_source_handles()
        if self.debug:
            print("get_or_create_source: list: %s" % (source_list,))
            print("get_or_create_source: looking for: %s" % (source_text,))
        for source_handle in source_list:
            source = self.db.get_source_from_handle(source_handle)
            if source.get_title() == source_text:
                return (0, source)
        source = gen.lib.Source()
        source.set_title(source_text)
        self.db.add_source(source, self.trans)
        self.db.commit_source(source, self.trans)
        return (1, source)
#-------------------------------------------------------------------------
#
# Register the plugin
#
#-------------------------------------------------------------------------
_mime_type = "text/x-comma-separated-values" # CSV Document
_mime_type_rfc_4180 = "text/csv" # CSV Document See rfc4180 for mime type
# Register importData with the plugin manager as the importer for files
# with the "csv" extension.
pmgr = PluginManager.get_instance()
plugin = ImportPlugin(name = _('CSV Spreadsheet'),
                      description = _("Import data from CSV files"),
                      import_function = importData,
                      extension = "csv")
pmgr.register_plugin(plugin)