#! /usr/bin/env python
#
# check_po - a gramps tool to check validity of po files
#
# Copyright (C) 2006-2006 Kees Bakker
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# TODO
#
# * Check for HTML text in msgstr when there is none in msgid
# * Check for matching HTML tag/endtag in msgstr
#
# NOTE: every print() call below takes exactly one argument, so the
# script produces the same output under Python 2 and Python 3.

import sys
import re
from optparse import OptionParser

all_total = {}
all_fuzzy = {}
all_untranslated = {}
all_percent_s = {}
all_named_s = {}
all_bnamed_s = {}
all_context = {}
all_coverage = {}
all_template_coverage = {}


def strip_quotes(st):
    """Return st without its enclosing double quotes.

    The string is returned unchanged unless it both starts and ends
    with a double quote and is at least two characters long.
    """
    if len(st) >= 2 and st[0] == '"' and st[-1] == '"':
        return st[1:-1]
    return st


class CheckException(Exception):
    """Raised by read_msgs() when the PO input cannot be parsed."""
    pass


def _msg_pairs(msg):
    """Return the (source text, translation) pairs of msg to compare.

    The first pair is msgid / msgstr[0].  When the message has a plural
    form and at least two translations, msgid_plural / msgstr[1] is
    added.  A message without any msgstr yields no pairs.
    """
    pairs = []
    if msg.msgstr:
        pairs.append((msg.msgid, msg.msgstr[0]))
    if msg.msgidp and len(msg.msgstr) >= 2:
        pairs.append((msg.msgidp, msg.msgstr[1]))
    return pairs


# This is a base class for all checks
class Check(object):
    """Collects the messages that fail one particular check.

    Subclasses set diag_header and summary_text and either implement
    is_problem() (called for each source/translation pair) or override
    process() entirely.
    """

    def __init__(self):
        self.msgs = []      # The messages that failed this check

    def is_problem(self, msgid, msgstr):
        """Return True when the translation msgstr fails the check."""
        return False

    def process(self, msg):
        """Run the check on msg; record msg once per failing pair."""
        for msgid, msgstr in _msg_pairs(msg):
            if self.is_problem(msgid, msgstr):
                self.msgs.append(msg)

    def diag(self):
        """Print the header and every failing message, if there are any."""
        if self.msgs:
            print("")
            print(self.diag_header)
            for m in self.msgs:
                m.diag()

    def summary(self):
        """Print one summary line with the number of failures."""
        print("%-20s%d" % (self.summary_text, len(self.msgs)))


class Check_fmt(Check):
    """The format fmt (e.g. '%s') must occur equally often in both texts."""

    def __init__(self, fmt):
        Check.__init__(self)
        self.diag_header = "-------- %s mismatches --------------" % fmt
        self.summary_text = "%s mismatches:" % fmt
        self.fmt = fmt

    def is_problem(self, msgid, msgstr):
        return msgid.count(self.fmt) != msgstr.count(self.fmt)


class Check_named_fmt(Check):
    """The named formats %(name)X must be the same set in both texts."""

    # A pattern to find all %()
    find_named_fmt_pat = re.compile(r'% \( \w+ \) \d* \D', re.VERBOSE)

    def __init__(self):
        Check.__init__(self)
        self.diag_header = "-------- %() name mismatches --------------"
        self.summary_text = "%() name mismatches:"

    def is_problem(self, msgid, msgstr):
        # Comparing the sorted lists covers both a different number of
        # named formats and different names.
        fmts1 = sorted(self.find_named_fmt_pat.findall(msgid))
        fmts2 = sorted(self.find_named_fmt_pat.findall(msgstr))
        return fmts1 != fmts2


class Check_missing_sd(Check):
    """Every %(name) in a translation must be followed by 's' or 'd'."""

    # A pattern to find %() without s or d.  The group captures the
    # character after the optional width (or '' at the end of the text).
    find_named_fmt_pat2 = re.compile(r'% \( \w+ \) \d* (\D|$)', re.VERBOSE)

    def __init__(self):
        Check.__init__(self)
        self.diag_header = "-------- %() without 's' or 'd' mismatches --------------"
        self.summary_text = "%() missing s/d:"

    def process(self, msg):
        # Only the translations are inspected, all plural forms included.
        for msgstr in msg.msgstr:
            conversions = self.find_named_fmt_pat2.findall(msgstr)
            if any(c not in ('s', 'd') for c in conversions):
                self.msgs.append(msg)


class Check_runaway(Check):
    """The 'context|' prefix of a msgid must not be translated along."""

    def __init__(self):
        Check.__init__(self)
        self.diag_header = "-------- Runaway context in translation ---------"
        self.summary_text = "Runaway context:"

    def is_problem(self, msgid, msgstr):
        # Runaway context. In the translated part we only want to see
        # the translation of the word after the |
        return '|' in msgid and '|' in msgstr and msgid != msgstr


class Check_xml_chars(Check):
    """Translations for tips.xml must not contain raw XML characters."""

    # Special XML characters
    # It is not allowed to have a quote, an ampersand or an angle bracket
    xml_chars_pat = re.compile(r'(?<=\W) > | " | & (?!(quot|nbsp|gt|amp);)', re.VERBOSE)

    def __init__(self):
        Check.__init__(self)
        self.diag_header = "-------- unescaped XML special characters ---------"
        self.summary_text = "XML special chars:"

    def process(self, msg):
        # XML errors
        # Only look at messages in the tips.xml, and only at the first
        # translation.
        if msg.is_tips_xml and msg.msgstr:
            if self.xml_chars_pat.search(msg.msgstr[0]):
                self.msgs.append(msg)


class Check_last_char(Check):
    """Trailing white space and a trailing period must be kept."""

    def __init__(self):
        Check.__init__(self)
        self.diag_header = "-------- last character not identical ---------"
        self.summary_text = "Last character:"

    def is_problem(self, msgid, msgstr):
        # Slices instead of indexing, so that empty texts are handled.
        msgid_last = msgid[-1:]
        msgstr_last = msgstr[-1:]
        if msgid_last.isspace() != msgstr_last.isspace():
            return True
        return (msgid_last == '.') != (msgstr_last == '.')

    def process(self, msg):
        # Fuzzy translations are not checked for their last character.
        if msg.is_fuzzy:
            return
        Check.process(self, msg)


class Check_shortcut_trans(Check):
    """A translation must not add a shortcut key ('_') the msgid lacks."""

    def __init__(self):
        Check.__init__(self)
        self.diag_header = "-------- shortcut key in translation ---------"
        self.summary_text = "Shortcut in msgstr:"

    def is_problem(self, msgid, msgstr):
        return '_' not in msgid and '_' in msgstr


def _unquote(line, keyword_pat, lineno):
    """Return the text between the quotes of one PO line.

    keyword_pat removes the leading keyword (msgid, msgstr, ...).  It is
    anchored at the start of the line and applied once, so the same
    words inside the quoted text are left alone.  A line that is not
    properly quoted is reported on stdout and returned as is.
    """
    text = keyword_pat.sub('', line, 1).strip()
    # Slices instead of text[0]: text may be empty here.
    if text[:1] != '"' or text[-1:] != '"':
        print("ERROR at line %d: Missing quote." % lineno)
    return strip_quotes(text)


class Msgid:
    """One entry of a PO file: comments, msgid, msgid_plural and msgstr."""

    # NOTE(review): this matches the word 'fuzzy' in any comment line,
    # not only in '#,' flag lines.
    fuzzy_pat = re.compile('fuzzy')
    tips_xml_pat = re.compile(r'tips\.xml')

    # The keywords that introduce the quoted text of a line
    _msgid_kw_pat = re.compile(r'^\s*msgid\s+')
    _msgidp_kw_pat = re.compile(r'^\s*msgid_plural\s+')
    _msgstr_kw_pat = re.compile(r'^\s*msgstr(\[\d\])?\s+')

    def __init__(self, msgnr, lineno):
        self._msgid = []     # For debugging purpose the original text
        self._msgidp = []    # For debugging purpose the original text
        self._msgstr = []    # For debugging purpose the original text
        self.msgid = ''
        self.msgidp = ''
        self.msgstr = []     # This is a list to support plural
        self._cmnt = []
        self.nr = msgnr
        self.lineno = lineno
        self.is_fuzzy = 0
        self.is_tips_xml = 0

    def diag(self):
        """Print the message number, line number and original lines."""
        print("")
        fuzzy = " (fuzzy)" if self.is_fuzzy else ""
        print("msg nr: %d, lineno: %d%s" % (self.nr, self.lineno, fuzzy))
        sys.stdout.write(''.join(self._msgid))
        sys.stdout.write(''.join(self._msgidp))
        sys.stdout.write(''.join(self._msgstr))

    def add_msgid(self, line, lineno):
        """Append a 'msgid' line or one of its continuation lines."""
        self._msgid.append(line)
        self.msgid += _unquote(line, self._msgid_kw_pat, lineno)

    def add_msgidp(self, line, lineno):
        """Append a 'msgid_plural' line or one of its continuation lines."""
        self._msgidp.append(line)
        self.msgidp += _unquote(line, self._msgidp_kw_pat, lineno)

    def add_new_msgstr(self, line, lineno):
        """Start a new translation (the next plural form) with line."""
        self.msgstr.append('')      # Start a new msgstr
        self.add_msgstr(line, lineno)

    def add_msgstr(self, line, lineno):
        """Append line to the translation started last."""
        self._msgstr.append(line)
        self.msgstr[-1] += _unquote(line, self._msgstr_kw_pat, lineno)

    def add_cmnt(self, line):
        """Store a comment line and update the fuzzy / tips.xml flags."""
        self._cmnt.append(line)
        if not self.is_fuzzy and self.fuzzy_pat.search(line):
            self.is_fuzzy = 1
        if not self.is_tips_xml and self.tips_xml_pat.search(line):
            self.is_tips_xml = 1


msgs = []
msgnr = 0   # This is the message number of the next message to read. The first real message is 1.


def create_new_Msgid(lineno):
    """Create the next Msgid, add it to the global msgs and return it."""
    global msgnr
    msg = Msgid(msgnr, lineno)
    msgnr += 1
    msgs.append(msg)
    return msg


# The states of the parser in read_msgs().  Apart from _NONE these are
# also the kinds of input lines.
_NONE = 'NONE'              # Nothing detected, yet
_CMNT = 'CMNT'              # Inside comment part
_MSGID = 'msgid'            # Inside msgid part
_MSGIDP = 'msgid_plural'    # Inside msgid_plural part
_MSGSTR = 'msgstr'          # Inside msgstr part
_STR = 'STR'                # A continuation string
_OLD = 'OLD'                # An old pattern with #~

_EMPTY_PAT = re.compile(r'^ \s* $', re.VERBOSE)

# Tried in this order; obsolete lines ('#~') are comments too, so they
# have to be recognized first.
_LINE_KINDS = (
    (_OLD, re.compile(r'\#~ \s+ ', re.VERBOSE)),
    (_CMNT, re.compile(r'\#', re.VERBOSE)),
    (_MSGID, re.compile(r'msgid \s+ "', re.VERBOSE)),
    (_MSGIDP, re.compile(r'msgid_plural \s+ "', re.VERBOSE)),
    (_MSGSTR, re.compile(r'msgstr (\[\d\])? \s+ "', re.VERBOSE)),
    (_STR, re.compile(r'"', re.VERBOSE)),
)


def _classify(line):
    """Return the kind of the PO line, or _NONE when it is not recognized."""
    for kind, pat in _LINE_KINDS:
        if pat.match(line):
            return kind
    return _NONE


def read_msgs(fname):
    """Parse the PO (or POT) file fname and return its list of Msgid's.

    The global msgs and msgnr are reset first, so each call returns only
    the messages of fname, numbered from 0.  Entries consisting of just
    comments are dropped from the result.  Recoverable oddities are
    reported on stdout as WARNING lines.

    Raises CheckException on input that does not fit the PO structure.
    """
    global msgs, msgnr
    # Start from scratch; otherwise the messages of the previous file
    # would be returned (and counted) once more.
    msgs = []
    msgnr = 0

    with open(fname) as f:
        lines = f.readlines()

    # parse it like a statemachine
    state = _NONE
    msg = None
    for ix, line in enumerate(lines):
        # Use line numbers for messages
        lineno = ix + 1

        if _EMPTY_PAT.match(line):
            continue    # Empty lines are not interesting

        # What's the next state?
        next_state = _classify(line)

        if next_state == _NONE:
            print('WARNING: Unexpected input at %s:%d' % (fname, lineno))
            raise CheckException('Unexpected state in po parsing (state = %s)' % state)

        elif next_state == _OLD:
            # Obsolete entries are skipped.  The current item ends here;
            # going back to _NONE makes sure that whatever follows
            # starts a new item instead of using the dropped one.
            msg = None
            state = _NONE

        elif next_state == _CMNT:
            if state in (_MSGID, _MSGIDP):
                # Hmmm. A comment here?
                print('WARNING: Unexpected comment at %s:%d' % (fname, lineno))
            else:
                if state != _CMNT:
                    # A comment after a msgstr (or at the start) begins
                    # a new item
                    msg = create_new_Msgid(lineno)
                    state = _CMNT
                msg.add_cmnt(line)

        elif next_state == _MSGID:
            if state in (_MSGID, _MSGIDP):
                raise CheckException('Unexpected %s at %s:%d' % (next_state, fname, lineno))
            if state != _CMNT:
                # No preceding comments, so the item starts here
                msg = create_new_Msgid(lineno)
            state = _MSGID
            msg.add_msgid(line, lineno)

        elif next_state == _MSGIDP:
            # Only allowed directly after the msgid
            if state != _MSGID:
                raise CheckException('Unexpected %s at %s:%d' % (next_state, fname, lineno))
            state = _MSGIDP
            msg.add_msgidp(line, lineno)

        elif next_state == _MSGSTR:
            if state in (_NONE, _CMNT):
                # A msgstr without msgid; it gets an item of its own
                print('WARNING: Wild msgstr at %s:%d' % (fname, lineno))
                msg = create_new_Msgid(lineno)
            # In state _MSGSTR this is the next plural form
            state = _MSGSTR
            msg.add_new_msgstr(line, lineno)

        else:
            # A continuation string; it belongs to whatever came last
            if state == _MSGID:
                msg.add_msgid(line, lineno)
            elif state == _MSGIDP:
                msg.add_msgidp(line, lineno)
            elif state == _MSGSTR:
                msg.add_msgstr(line, lineno)
            else:
                print('WARNING: Wild string at %s:%d' % (fname, lineno))

    # Strip items with just comments.
    msgs = [m for m in msgs if m.msgid or m.msgstr]
    return msgs


def analyze_msgs(options, fname, msgs, nr_templates=None, nth=0):
    """Run all checks on msgs and print the summary for the file fname.

    options       needs the attributes skip_fuzzy and only_summary
    fname         the file name, only used in the output
    msgs          the list of Msgid's of the file
    nr_templates  the number of messages in the template; when None the
                  template lines of the summary are left out
    nth           the index of the file; from the second file on a
                  separator is printed first

    Untranslated messages are counted and not checked.  Fuzzy messages
    are counted, and checked unless options.skip_fuzzy is set.  The
    details of the failed checks are printed unless options.only_summary
    is set.
    """
    nr_fuzzy = 0
    nr_untranslated = 0

    checks = [
        Check_fmt('%s'),
        Check_fmt('%d'),
        Check_named_fmt(),
        Check_missing_sd(),
        Check_runaway(),
        Check_xml_chars(),
        Check_last_char(),
        Check_shortcut_trans(),
    ]

    for msg in msgs:
        if ''.join(msg.msgstr) == '':
            nr_untranslated += 1
            continue

        if msg.is_fuzzy:
            nr_fuzzy += 1
            if options.skip_fuzzy:
                continue

        for c in checks:
            c.process(msg)

    nr_msgs = len(msgs)
    if nth > 0:
        print("")
        print("=====================================")
    print("%-20s%s" % ("File:", fname))
    if nr_templates is not None:
        print("%-20s%d" % ("Template total:", nr_templates))
    print("%-20s%d" % ("PO total:", nr_msgs))
    print("%-20s%d" % ("Fuzzy:", nr_fuzzy))
    print("%-20s%d" % ("Untranslated:", nr_untranslated))

    for c in checks:
        c.summary()

    # An empty file or an empty template counts as 0% coverage instead
    # of dividing by zero.
    if nr_msgs:
        po_coverage = (1.0 - (float(nr_untranslated) / float(nr_msgs))) * 100
    else:
        po_coverage = 0.0
    print("%-20s%5.2f%%" % ("PO Coverage:", po_coverage))

    if nr_templates is not None:
        if nr_templates:
            template_coverage = po_coverage * float(nr_msgs) / float(nr_templates)
        else:
            template_coverage = 0.0
        print("%-20s%5.2f%%" % ("Template Coverage:", template_coverage))

    if not options.only_summary:
        for c in checks:
            c.diag()


def main():
    """Check the PO files on the command line against gramps.pot.

    The template gramps.pot is read from the current directory.
    """
    parser = OptionParser(description='This program validates a PO file for GRAMPS.',
                          usage='%prog [options] po-file...')

    parser.add_option("--skip-fuzzy",
                      action="store_true", dest="skip_fuzzy", default=False,
                      help="skip fuzzies")

    parser.add_option("-s", "--only-summary",
                      action="store_true", dest="only_summary", default=False,
                      help="only give the summary")

    (options, args) = parser.parse_args()

    try:
        pot_msgs = read_msgs('gramps.pot')
        nr_templates = len(pot_msgs)
        for nth, fname in enumerate(args):
            msgs = read_msgs(fname)
            analyze_msgs(options, fname, msgs, nr_templates, nth)

    except CheckException as e:
        print('Oops. %s' % e)
        print('Bailing out')


if __name__ == "__main__":
    main()