* src/DateParser.py: Switch from utf8 strings to unicode.
* src/DateDisplay.py: Switch from utf8 strings to unicode.
* src/dates/Date_ru.py: Switch from utf8 strings to unicode.
* src/dates/Date_fr.py: Switch from utf8 strings to unicode.
svn: r3733
This commit is contained in:
parent
003e8e7190
commit
2eace67e3f
@ -4,6 +4,11 @@
|
|||||||
2004-11-16 Alex Roitman <shura@alex.neuro.umn.edu>
|
2004-11-16 Alex Roitman <shura@alex.neuro.umn.edu>
|
||||||
* src/DateParser.py: Typo.
|
* src/DateParser.py: Typo.
|
||||||
|
|
||||||
|
* src/DateParser.py: Switch from utf8 strings to unicode.
|
||||||
|
* src/DateDisplay.py: Switch from utf8 strings to unicode.
|
||||||
|
* src/dates/Date_ru.py: Switch from utf8 strings to unicode.
|
||||||
|
* src/dates/Date_fr.py: Switch from utf8 strings to unicode.
|
||||||
|
|
||||||
2004-11-15 Alex Roitman <shura@alex.neuro.umn.edu>
|
2004-11-15 Alex Roitman <shura@alex.neuro.umn.edu>
|
||||||
* src/DateDisplay.py: Remove localized displayers.
|
* src/DateDisplay.py: Remove localized displayers.
|
||||||
* src/DateParser.py: Remove localized parsers.
|
* src/DateParser.py: Remove localized parsers.
|
||||||
|
@ -98,9 +98,9 @@ class DateDisplay:
|
|||||||
)
|
)
|
||||||
|
|
||||||
_french = (
|
_french = (
|
||||||
'', 'Vend\xc3\xa9miaire', 'Brumaire',
|
'', u'Vend\xc3\xa9miaire', 'Brumaire',
|
||||||
'Frimaire', 'Niv\xc3\xb4se', 'Pluvi\xc3\xb4se',
|
'Frimaire', u'Niv\xc3\xb4se', u'Pluvi\xc3\xb4se',
|
||||||
'Vent\xc3\xb4se', 'Germinal', 'Flor\xc3\xa9al',
|
u'Vent\xc3\xb4se', 'Germinal', u'Flor\xc3\xa9al',
|
||||||
'Prairial', 'Messidor', 'Thermidor',
|
'Prairial', 'Messidor', 'Thermidor',
|
||||||
'Fructidor', 'Extra'
|
'Fructidor', 'Extra'
|
||||||
)
|
)
|
||||||
|
@ -137,10 +137,10 @@ class DateParser:
|
|||||||
}
|
}
|
||||||
|
|
||||||
french_to_int = {
|
french_to_int = {
|
||||||
'vend\xc3\xa9miaire' : 1, 'brumaire' : 2,
|
u'vend\xc3\xa9miaire' : 1, 'brumaire' : 2,
|
||||||
'frimaire' : 3, 'niv\xc3\xb4se ': 4,
|
'frimaire' : 3, u'niv\xc3\xb4se ': 4,
|
||||||
'pluvi\xc3\xb4se' : 5, 'vent\xc3\xb4se' : 6,
|
u'pluvi\xc3\xb4se' : 5, u'vent\xc3\xb4se' : 6,
|
||||||
'germinal' : 7, 'flor\xc3\xa9al' : 8,
|
'germinal' : 7, u'flor\xc3\xa9al' : 8,
|
||||||
'prairial' : 9, 'messidor' : 10,
|
'prairial' : 9, 'messidor' : 10,
|
||||||
'thermidor' : 11, 'fructidor' : 12,
|
'thermidor' : 11, 'fructidor' : 12,
|
||||||
'extra' : 13
|
'extra' : 13
|
||||||
@ -239,7 +239,12 @@ class DateParser:
|
|||||||
self._mod_str = '(' + '|'.join(
|
self._mod_str = '(' + '|'.join(
|
||||||
[ key.replace('.','\.') for key in self.modifier_to_int.keys() ]
|
[ key.replace('.','\.') for key in self.modifier_to_int.keys() ]
|
||||||
) + ')'
|
) + ')'
|
||||||
self._mon_str = '(' + '|'.join(self.month_to_int.keys()) + ')'
|
# Need to reverse-sort the keys, so that April matches before Apr does.
|
||||||
|
# Otherwise, 'april 2000' would be matched as 'apr' + garbage ('il 2000')
|
||||||
|
_month_keys = self.month_to_int.keys()
|
||||||
|
_month_keys.sort()
|
||||||
|
_month_keys.reverse()
|
||||||
|
self._mon_str = '(' + '|'.join(_month_keys) + ')'
|
||||||
self._jmon_str = '(' + '|'.join(self.hebrew_to_int.keys()) + ')'
|
self._jmon_str = '(' + '|'.join(self.hebrew_to_int.keys()) + ')'
|
||||||
self._fmon_str = '(' + '|'.join(self.french_to_int.keys()) + ')'
|
self._fmon_str = '(' + '|'.join(self.french_to_int.keys()) + ')'
|
||||||
self._pmon_str = '(' + '|'.join(self.persian_to_int.keys()) + ')'
|
self._pmon_str = '(' + '|'.join(self.persian_to_int.keys()) + ')'
|
||||||
@ -316,7 +321,7 @@ class DateParser:
|
|||||||
self.month_to_int,gregorian_valid)
|
self.month_to_int,gregorian_valid)
|
||||||
|
|
||||||
def _parse_calendar(self,text,regex1,regex2,mmap,check=None):
|
def _parse_calendar(self,text,regex1,regex2,mmap,check=None):
|
||||||
match = regex1.match(text)
|
match = regex1.match(text.lower())
|
||||||
if match:
|
if match:
|
||||||
groups = match.groups()
|
groups = match.groups()
|
||||||
if groups[0] == None:
|
if groups[0] == None:
|
||||||
@ -337,9 +342,10 @@ class DateParser:
|
|||||||
value = Date.EMPTY
|
value = Date.EMPTY
|
||||||
return value
|
return value
|
||||||
|
|
||||||
match = regex2.match(text)
|
match = regex2.match(text.lower())
|
||||||
if match:
|
if match:
|
||||||
groups = match.groups()
|
groups = match.groups()
|
||||||
|
print groups #[ g.encode('utf8') for g in groups ]
|
||||||
if groups[1] == None:
|
if groups[1] == None:
|
||||||
m = 0
|
m = 0
|
||||||
else:
|
else:
|
||||||
@ -421,8 +427,6 @@ class DateParser:
|
|||||||
qual = Date.QUAL_NONE
|
qual = Date.QUAL_NONE
|
||||||
cal = Date.CAL_GREGORIAN
|
cal = Date.CAL_GREGORIAN
|
||||||
|
|
||||||
text = text.encode('utf8')
|
|
||||||
|
|
||||||
match = self._cal.match(text)
|
match = self._cal.match(text)
|
||||||
if match:
|
if match:
|
||||||
grps = match.groups()
|
grps = match.groups()
|
||||||
|
@ -50,48 +50,54 @@ from DateDisplay import DateDisplay
|
|||||||
class DateParserFR(DateParser):
|
class DateParserFR(DateParser):
|
||||||
|
|
||||||
modifier_to_int = {
|
modifier_to_int = {
|
||||||
'avant' : Date.MOD_BEFORE,
|
u'avant' : Date.MOD_BEFORE,
|
||||||
'av.' : Date.MOD_BEFORE,
|
u'av.' : Date.MOD_BEFORE,
|
||||||
'av' : Date.MOD_BEFORE,
|
u'av' : Date.MOD_BEFORE,
|
||||||
'après' : Date.MOD_AFTER,
|
u'après' : Date.MOD_AFTER,
|
||||||
'ap.' : Date.MOD_AFTER,
|
u'ap.' : Date.MOD_AFTER,
|
||||||
'ap' : Date.MOD_AFTER,
|
u'ap' : Date.MOD_AFTER,
|
||||||
'env.' : Date.MOD_ABOUT,
|
u'env.' : Date.MOD_ABOUT,
|
||||||
'env' : Date.MOD_ABOUT,
|
u'env' : Date.MOD_ABOUT,
|
||||||
'circa' : Date.MOD_ABOUT,
|
u'circa' : Date.MOD_ABOUT,
|
||||||
'c.' : Date.MOD_ABOUT,
|
u'c.' : Date.MOD_ABOUT,
|
||||||
'vers' : Date.MOD_ABOUT,
|
u'vers' : Date.MOD_ABOUT,
|
||||||
}
|
}
|
||||||
|
|
||||||
calendar_to_int = {
|
calendar_to_int = {
|
||||||
'grégorien' : Date.CAL_GREGORIAN,
|
u'grégorien' : Date.CAL_GREGORIAN,
|
||||||
'g' : Date.CAL_GREGORIAN,
|
u'g' : Date.CAL_GREGORIAN,
|
||||||
'julien' : Date.CAL_JULIAN,
|
u'julien' : Date.CAL_JULIAN,
|
||||||
'j' : Date.CAL_JULIAN,
|
u'j' : Date.CAL_JULIAN,
|
||||||
'hébreu' : Date.CAL_HEBREW,
|
u'hébreu' : Date.CAL_HEBREW,
|
||||||
'h' : Date.CAL_HEBREW,
|
u'h' : Date.CAL_HEBREW,
|
||||||
'islamique' : Date.CAL_ISLAMIC,
|
u'islamique' : Date.CAL_ISLAMIC,
|
||||||
'i' : Date.CAL_ISLAMIC,
|
u'i' : Date.CAL_ISLAMIC,
|
||||||
'révolutionnaire': Date.CAL_FRENCH,
|
u'révolutionnaire': Date.CAL_FRENCH,
|
||||||
'r' : Date.CAL_FRENCH,
|
u'r' : Date.CAL_FRENCH,
|
||||||
'perse' : Date.CAL_PERSIAN,
|
u'perse' : Date.CAL_PERSIAN,
|
||||||
'p' : Date.CAL_PERSIAN,
|
u'p' : Date.CAL_PERSIAN,
|
||||||
}
|
}
|
||||||
|
|
||||||
quality_to_int = {
|
quality_to_int = {
|
||||||
'estimated' : Date.QUAL_ESTIMATED,
|
u'estimated' : Date.QUAL_ESTIMATED,
|
||||||
'est.' : Date.QUAL_ESTIMATED,
|
u'est.' : Date.QUAL_ESTIMATED,
|
||||||
'est' : Date.QUAL_ESTIMATED,
|
u'est' : Date.QUAL_ESTIMATED,
|
||||||
'calc.' : Date.QUAL_CALCULATED,
|
u'calc.' : Date.QUAL_CALCULATED,
|
||||||
'calc' : Date.QUAL_CALCULATED,
|
u'calc' : Date.QUAL_CALCULATED,
|
||||||
'calculated' : Date.QUAL_CALCULATED,
|
u'calculated' : Date.QUAL_CALCULATED,
|
||||||
}
|
}
|
||||||
|
|
||||||
def init_strings(self):
|
def init_strings(self):
|
||||||
DateParser.init_strings(self)
|
DateParser.init_strings(self)
|
||||||
self._span = re.compile("(de)\s+(.+)\s+(à)\s+(.+)",
|
_span_1 = [u'de']
|
||||||
|
_span_2 = [u'à']
|
||||||
|
_range_1 = [u'ent.',u'ent',u'entre']
|
||||||
|
_range_2 = [u'et']
|
||||||
|
self._span = re.compile("(%s)\s+(.+)\s+(%s)\s+(.+)" %
|
||||||
|
('|'.join(_span_1),'|'.join(_span_2)),
|
||||||
re.IGNORECASE)
|
re.IGNORECASE)
|
||||||
self._range = re.compile("(ent.|ent|entre)\s+(.+)\s+(et)\s+(.+)",
|
self._range = re.compile("(%s)\s+(.+)\s+(%s)\s+(.+)" %
|
||||||
|
('|'.join(_range_1),'|'.join(_range_2)),
|
||||||
re.IGNORECASE)
|
re.IGNORECASE)
|
||||||
|
|
||||||
#-------------------------------------------------------------------------
|
#-------------------------------------------------------------------------
|
||||||
@ -102,11 +108,11 @@ class DateParserFR(DateParser):
|
|||||||
class DateDisplayFR(DateDisplay):
|
class DateDisplayFR(DateDisplay):
|
||||||
|
|
||||||
calendar = (
|
calendar = (
|
||||||
"", " (Julien)", " (Hébreu)",
|
"", u" (Julien)", u" (Hébreu)",
|
||||||
" (Révolutionnaire)", " (Perse)", " (Islamique)"
|
u" (Révolutionnaire)", u" (Perse)", u" (Islamique)"
|
||||||
)
|
)
|
||||||
|
|
||||||
_mod_str = ("","avant ","après ","vers ","","","")
|
_mod_str = ("",u"avant ",u"après ",u"vers ","","","")
|
||||||
|
|
||||||
def display(self,date):
|
def display(self,date):
|
||||||
"""
|
"""
|
||||||
@ -126,11 +132,11 @@ class DateDisplayFR(DateDisplay):
|
|||||||
elif mod == Date.MOD_SPAN:
|
elif mod == Date.MOD_SPAN:
|
||||||
d1 = self.display_cal[cal](start)
|
d1 = self.display_cal[cal](start)
|
||||||
d2 = self.display_cal[cal](date.get_stop_date())
|
d2 = self.display_cal[cal](date.get_stop_date())
|
||||||
return "%sde %s à %s%s" % (qual_str,d1,d2,self.calendar[cal])
|
return "%s%s %s %s %s%s" % (qual_str,u'de',d1,u'à',d2,self.calendar[cal])
|
||||||
elif mod == Date.MOD_RANGE:
|
elif mod == Date.MOD_RANGE:
|
||||||
d1 = self.display_cal[cal](start)
|
d1 = self.display_cal[cal](start)
|
||||||
d2 = self.display_cal[cal](date.get_stop_date())
|
d2 = self.display_cal[cal](date.get_stop_date())
|
||||||
return "%sentre %s et %s%s" % (qual_str,d1,d2,self.calendar[cal])
|
return "%s%s %s %s %s%s" % (qual_str,u'entre',d1,u'et',d2,self.calendar[cal])
|
||||||
else:
|
else:
|
||||||
text = self.display_cal[date.get_calendar()](start)
|
text = self.display_cal[date.get_calendar()](start)
|
||||||
return "%s%s%s%s" % (qual_str,self._mod_str[mod],text,self.calendar[cal])
|
return "%s%s%s%s" % (qual_str,self._mod_str[mod],text,self.calendar[cal])
|
||||||
|
@ -50,58 +50,64 @@ from DateDisplay import DateDisplay
|
|||||||
class DateParserRU(DateParser):
|
class DateParserRU(DateParser):
|
||||||
|
|
||||||
modifier_to_int = {
|
modifier_to_int = {
|
||||||
'до' : Date.MOD_BEFORE,
|
u'до' : Date.MOD_BEFORE,
|
||||||
'по' : Date.MOD_BEFORE,
|
u'по' : Date.MOD_BEFORE,
|
||||||
'после' : Date.MOD_AFTER,
|
u'после' : Date.MOD_AFTER,
|
||||||
'п.' : Date.MOD_AFTER,
|
u'п.' : Date.MOD_AFTER,
|
||||||
'п' : Date.MOD_AFTER,
|
u'п' : Date.MOD_AFTER,
|
||||||
'с' : Date.MOD_AFTER,
|
u'с' : Date.MOD_AFTER,
|
||||||
'ок' : Date.MOD_ABOUT,
|
u'ок' : Date.MOD_ABOUT,
|
||||||
'ок.' : Date.MOD_ABOUT,
|
u'ок.' : Date.MOD_ABOUT,
|
||||||
'около' : Date.MOD_ABOUT,
|
u'около' : Date.MOD_ABOUT,
|
||||||
'примерно' : Date.MOD_ABOUT,
|
u'примерно' : Date.MOD_ABOUT,
|
||||||
'прим' : Date.MOD_ABOUT,
|
u'прим' : Date.MOD_ABOUT,
|
||||||
'прим.' : Date.MOD_ABOUT,
|
u'прим.' : Date.MOD_ABOUT,
|
||||||
'приблизительно' : Date.MOD_ABOUT,
|
u'приблизительно' : Date.MOD_ABOUT,
|
||||||
'приб.' : Date.MOD_ABOUT,
|
u'приб.' : Date.MOD_ABOUT,
|
||||||
'прибл.' : Date.MOD_ABOUT,
|
u'прибл.' : Date.MOD_ABOUT,
|
||||||
'приб' : Date.MOD_ABOUT,
|
u'приб' : Date.MOD_ABOUT,
|
||||||
'прибл' : Date.MOD_ABOUT,
|
u'прибл' : Date.MOD_ABOUT,
|
||||||
}
|
}
|
||||||
|
|
||||||
calendar_to_int = {
|
calendar_to_int = {
|
||||||
'григорианский' : Date.CAL_GREGORIAN,
|
u'григорианский' : Date.CAL_GREGORIAN,
|
||||||
'г' : Date.CAL_GREGORIAN,
|
u'г' : Date.CAL_GREGORIAN,
|
||||||
'юлианский' : Date.CAL_JULIAN,
|
u'юлианский' : Date.CAL_JULIAN,
|
||||||
'ю' : Date.CAL_JULIAN,
|
u'ю' : Date.CAL_JULIAN,
|
||||||
'еврейский' : Date.CAL_HEBREW,
|
u'еврейский' : Date.CAL_HEBREW,
|
||||||
'е' : Date.CAL_HEBREW,
|
u'е' : Date.CAL_HEBREW,
|
||||||
'исламский' : Date.CAL_ISLAMIC,
|
u'исламский' : Date.CAL_ISLAMIC,
|
||||||
'и' : Date.CAL_ISLAMIC,
|
u'и' : Date.CAL_ISLAMIC,
|
||||||
'республиканский': Date.CAL_FRENCH,
|
u'республиканский': Date.CAL_FRENCH,
|
||||||
'р' : Date.CAL_FRENCH,
|
u'р' : Date.CAL_FRENCH,
|
||||||
'персидский' : Date.CAL_PERSIAN,
|
u'персидский' : Date.CAL_PERSIAN,
|
||||||
'п' : Date.CAL_PERSIAN,
|
u'п' : Date.CAL_PERSIAN,
|
||||||
}
|
}
|
||||||
|
|
||||||
quality_to_int = {
|
quality_to_int = {
|
||||||
'оценено' : Date.QUAL_ESTIMATED,
|
u'оценено' : Date.QUAL_ESTIMATED,
|
||||||
'оцен.' : Date.QUAL_ESTIMATED,
|
u'оцен.' : Date.QUAL_ESTIMATED,
|
||||||
'оц.' : Date.QUAL_ESTIMATED,
|
u'оц.' : Date.QUAL_ESTIMATED,
|
||||||
'оцен' : Date.QUAL_ESTIMATED,
|
u'оцен' : Date.QUAL_ESTIMATED,
|
||||||
'оц' : Date.QUAL_ESTIMATED,
|
u'оц' : Date.QUAL_ESTIMATED,
|
||||||
'вычислено' : Date.QUAL_CALCULATED,
|
u'вычислено' : Date.QUAL_CALCULATED,
|
||||||
'вычисл.' : Date.QUAL_CALCULATED,
|
u'вычисл.' : Date.QUAL_CALCULATED,
|
||||||
'выч.' : Date.QUAL_CALCULATED,
|
u'выч.' : Date.QUAL_CALCULATED,
|
||||||
'вычисл' : Date.QUAL_CALCULATED,
|
u'вычисл' : Date.QUAL_CALCULATED,
|
||||||
'выч' : Date.QUAL_CALCULATED,
|
u'выч' : Date.QUAL_CALCULATED,
|
||||||
}
|
}
|
||||||
|
|
||||||
def init_strings(self):
|
def init_strings(self):
|
||||||
DateParser.init_strings(self)
|
DateParser.init_strings(self)
|
||||||
self._span = re.compile("(с|от)\s+(.+)\s+(по|до)\s+(.+)",
|
_span_1 = [u'с',u'от']
|
||||||
|
_span_2 = [u'по',u'до']
|
||||||
|
_range_1 = [u'между',u'меж',u'меж.']
|
||||||
|
_range_2 = [u'и']
|
||||||
|
self._span = re.compile("(%s)\s+(.+)\s+(%s)\s+(.+)" %
|
||||||
|
('|'.join(_span_1),'|'.join(_span_2)),
|
||||||
re.IGNORECASE)
|
re.IGNORECASE)
|
||||||
self._range = re.compile("(между|меж|меж.)\s+(.+)\s+(и)\s+(.+)",
|
self._range = re.compile("(%s)\s+(.+)\s+(%s)\s+(.+)" %
|
||||||
|
('|'.join(_range_1),'|'.join(_range_2)),
|
||||||
re.IGNORECASE)
|
re.IGNORECASE)
|
||||||
|
|
||||||
#-------------------------------------------------------------------------
|
#-------------------------------------------------------------------------
|
||||||
@ -112,16 +118,16 @@ class DateParserRU(DateParser):
|
|||||||
class DateDisplayRU(DateDisplay):
|
class DateDisplayRU(DateDisplay):
|
||||||
|
|
||||||
calendar = (
|
calendar = (
|
||||||
"", " (юлианский)",
|
"", u" (юлианский)",
|
||||||
" (еврейский)",
|
u" (еврейский)",
|
||||||
" (республиканский)",
|
u" (республиканский)",
|
||||||
" (персидский)",
|
u" (персидский)",
|
||||||
" (исламский)"
|
u" (исламский)"
|
||||||
)
|
)
|
||||||
|
|
||||||
_mod_str = ("","до ",
|
_mod_str = ("",u"до ",
|
||||||
"после ",
|
u"после ",
|
||||||
"около ","","","")
|
u"около ","","","")
|
||||||
|
|
||||||
def display(self,date):
|
def display(self,date):
|
||||||
"""
|
"""
|
||||||
@ -141,11 +147,11 @@ class DateDisplayRU(DateDisplay):
|
|||||||
elif mod == Date.MOD_SPAN:
|
elif mod == Date.MOD_SPAN:
|
||||||
d1 = self.display_cal[cal](start)
|
d1 = self.display_cal[cal](start)
|
||||||
d2 = self.display_cal[cal](date.get_stop_date())
|
d2 = self.display_cal[cal](date.get_stop_date())
|
||||||
return "%sс %s по %s%s" % (qual_str,d1,d2,self.calendar[cal])
|
return "%sс %s %s %s%s" % (qual_str,d1,u'по',d2,self.calendar[cal])
|
||||||
elif mod == Date.MOD_RANGE:
|
elif mod == Date.MOD_RANGE:
|
||||||
d1 = self.display_cal[cal](start)
|
d1 = self.display_cal[cal](start)
|
||||||
d2 = self.display_cal[cal](date.get_stop_date())
|
d2 = self.display_cal[cal](date.get_stop_date())
|
||||||
return "%sмежду %s и %s%s" % (qual_str,d1,d2,self.calendar[cal])
|
return "%s%s %s %s %s%s" % (qual_str,u'между',d1,u'и',d2,self.calendar[cal])
|
||||||
else:
|
else:
|
||||||
text = self.display_cal[date.get_calendar()](start)
|
text = self.display_cal[date.get_calendar()](start)
|
||||||
return "%s%s%s%s" % (qual_str,self._mod_str[mod],text,self.calendar[cal])
|
return "%s%s%s%s" % (qual_str,self._mod_str[mod],text,self.calendar[cal])
|
||||||
|
Loading…
Reference in New Issue
Block a user