#! /usr/bin/env python
#
# check_po - a gramps tool to check validity of po files
#
# Copyright (C) 2006-2006  Kees Bakker
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# [header lines 9-17 lie between two patch hunks and are not shown there]
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

# $Id:$

# Provenance: post-patch state of po/check_po from commit 0d321299
# (svn r7583, Kees Bakker, 2006-11-08): "This is a rewrite of the check_po
# script. A few checks have been improved and a new check was added. The
# new check is for XML special characters."

"""Check gettext .po files against the checks listed in analyze_msgs().

Usage: check_po FILE.po [FILE.po ...]

The template 'gramps.pot' is read from the current directory to obtain the
number of template messages.  The code is written so that it runs unchanged
on Python 2 and Python 3 (single-argument print calls, sys.exc_info()).
"""

import sys
import re

# Module-level result tables inherited from the previous version of the
# script.  Nothing in the code below reads or writes them; they are kept
# because they are part of the file.  The patch does not show new-file
# lines 29-32, which sit between the two groups.
all_total = {}
all_fuzzy = {}
all_untranslated = {}
all_context = {}
all_coverage = {}
all_template_coverage = {}

# Line classification patterns; all are applied with match(), i.e. they
# are anchored at the start of the line.
_EMPTY_PAT = re.compile(r'^ \s* $', re.VERBOSE)
_COMMENT_PAT = re.compile(r'\#', re.VERBOSE)
_MSGID_PAT = re.compile(r'msgid \s+ "', re.VERBOSE)
_MSGSTR_PAT = re.compile(r'msgstr \s+ "', re.VERBOSE)
_STR_PAT = re.compile(r'"', re.VERBOSE)
_OLD_PAT = re.compile(r'\#~ \s+ ', re.VERBOSE)

# Parser states / line kinds.
_NONE = 0      # Nothing detected, yet
_CMNT = 1      # Inside comment part
_MSGID = 2     # Inside msgid part
_MSGSTR = 3    # Inside msgstr part
_STR = 4       # A continuation string
_OLD = 5       # An old (obsolete) entry line starting with #~

# Finds every named format, e.g. '%(name)s'.
_NAMED_FMT_PAT = re.compile(r'% \( \w+ \) \d* \D', re.VERBOSE)

# Captures the character following '%(name)<digits>': the format letter,
# or '' when the string ends there.  Quick manual check:
#   _NAMED_FMT_LETTER_PAT.findall('%(event_name)s: %(place)s%(endnotes)s. ')
_NAMED_FMT_LETTER_PAT = re.compile(r'% \( \w+ \) \d* (\D|$)', re.VERBOSE)

# Special XML characters: a '>' preceded by a non-word character, any
# double quote, or an '&' that does not start one of the listed entities.
# NOTE(review): '&lt;' is not in the entity list, so it is reported as an
# error - confirm whether that is intended.
_XML_CHARS_PAT = re.compile(
    r'(?<=\W) > | " | & (?!(quot|nbsp|gt|amp);)', re.VERBOSE)


def strip_quotes(st):
    """Return st stripped of surrounding whitespace and one pair of
    enclosing double quotes, if both quotes are present."""
    st = st.strip()
    if len(st) >= 2 and st[0] == '"' and st[-1] == '"':
        st = st[1:-1]
    return st


class Msgid(object):
    """One entry of a .po file: its comment, msgid and msgstr lines plus
    the error flags set by analyze_msgs()."""

    fuzzy_pat = re.compile('fuzzy')
    tips_xml_pat = re.compile(r'tips\.xml')

    # The keyword is removed only at the start of a line and only once, so
    # the words "msgid"/"msgstr" inside the quoted text are left alone.
    _msgid_kw_pat = re.compile(r'^msgid\s+')
    _msgstr_kw_pat = re.compile(r'^msgstr\s+')

    def __init__(self, lineno):
        self._msgid = []        # raw msgid line and its continuation lines
        self._msgstr = []       # raw msgstr line and its continuation lines
        self._cmnt = []         # raw comment lines
        self.lineno = lineno    # line number where this entry starts
        self.is_fuzzy = 0
        self.has_sfmt_mismatch = 0
        # Never set or read anywhere in this file; kept so the attribute
        # set of the class does not change.
        self.has_named_sfmt_mismatch = 0
        self.has_fmt_missing_sd = 0
        self.has_context_error = 0
        self.has_named_fmt_mismatch = 0
        self.has_xml_error = 0

    def diag(self):
        """Print this entry as "<lineno> '<msgid>' : '<msgstr>'"."""
        # Compatible with the old check_po
        print("%d '%s' : '%s'" % (self.lineno, self.msgid(), self.msgstr()))

    def _joined(self, lines, keyword_pat):
        """Join the quoted parts of lines; None when there are no lines."""
        if not lines:
            return None
        return ''.join(
            [strip_quotes(keyword_pat.sub('', l, 1)) for l in lines])

    def msgid(self):
        """Return the complete msgid text, or None if no msgid was seen."""
        return self._joined(self._msgid, self._msgid_kw_pat)

    def add_msgid(self, line):
        self._msgid.append(line)

    def msgstr(self):
        """Return the complete msgstr text, or None if no msgstr was seen."""
        return self._joined(self._msgstr, self._msgstr_kw_pat)

    def add_msgstr(self, line):
        self._msgstr.append(line)

    def add_cmnt(self, line):
        """Store a comment line; a line containing 'fuzzy' marks the entry
        as fuzzy."""
        self._cmnt.append(line)
        if not self.is_fuzzy and self.fuzzy_pat.search(line):
            self.is_fuzzy = 1

    def is_tips_xml(self):
        """Return 1 if any comment line mentions tips.xml, else 0."""
        for c in self._cmnt:
            if self.tips_xml_pat.search(c):
                return 1
        return 0

    def set_sfmt_mismatch(self):
        self.has_sfmt_mismatch = 1

    def set_named_fmt_mismatch(self):
        self.has_named_fmt_mismatch = 1

    def set_fmt_missing_sd(self):
        self.has_fmt_missing_sd = 1

    def set_context_error(self):
        self.has_context_error = 1

    def set_xml_error(self):
        self.has_xml_error = 1


def _classify(line):
    """Return the kind (_OLD, _CMNT, _MSGID, _MSGSTR, _STR or _NONE) of a
    non-empty line.  _OLD is tested first because '#~' also looks like a
    comment."""
    if _OLD_PAT.match(line):
        return _OLD
    if _COMMENT_PAT.match(line):
        return _CMNT
    if _MSGID_PAT.match(line):
        return _MSGID
    if _MSGSTR_PAT.match(line):
        return _MSGSTR
    if _STR_PAT.match(line):
        return _STR
    return _NONE


def _new_item(msgs, lineno):
    """Append an empty Msgid starting at lineno to msgs and return it."""
    msg = Msgid(lineno)
    msgs.append(msg)
    return msg


def read_msgs(fname):
    """Parse the po/pot file fname and return its entries as Msgid objects.

    Entries that have neither msgid nor msgstr text are dropped.  Warnings
    about misplaced lines are printed to stdout.

    Raises Exception for a msgid directly following a msgid, or a msgstr
    directly following a msgstr.
    """
    f = open(fname)
    try:
        lines = f.readlines()
    finally:
        f.close()

    # parse it like a statemachine
    state = _NONE
    msg = None
    msgs = []

    for ix, line in enumerate(lines):  # Use line numbers for messages
        lineno = ix + 1

        if _EMPTY_PAT.match(line):
            continue  # Empty lines are not interesting

        kind = _classify(line)

        if kind == _OLD:
            # An obsolete line ends the current item.  The state is reset
            # as well, so that a later continuation string or msgstr is
            # reported as "wild" instead of being added to a None item.
            msg = None
            state = _NONE
            continue

        if state == _NONE:
            # expect msgid or comment
            if kind == _CMNT:
                state = _CMNT
                msg = _new_item(msgs, lineno)
                msg.add_cmnt(line)
            elif kind == _MSGID:
                state = _MSGID
                msg = _new_item(msgs, lineno)
                msg.add_msgid(line)
            elif kind == _MSGSTR:
                print('WARNING: Wild msgstr at %s:%d' % (fname, lineno))
                state = _MSGSTR
                msg = _new_item(msgs, lineno)
                msg.add_msgstr(line)
            elif kind == _STR:
                print('WARNING: Wild string at %s:%d' % (fname, lineno))

        elif state == _CMNT:
            if kind == _CMNT:
                if msg:
                    msg.add_cmnt(line)
            elif kind == _MSGID:
                state = _MSGID
                if not msg:
                    msg = _new_item(msgs, lineno)
                msg.add_msgid(line)
            elif kind == _MSGSTR:
                print('WARNING: Wild msgstr at %s:%d' % (fname, lineno))
                state = _MSGSTR
                msg = _new_item(msgs, lineno)
                msg.add_msgstr(line)
            elif kind == _STR:
                print('WARNING: Wild string at %s:%d' % (fname, lineno))

        elif state == _MSGID:
            if kind == _CMNT:
                # Hmmm. A comment here?
                print('WARNING: Unexpected comment at %s:%d'
                      % (fname, lineno))
            elif kind == _MSGID:
                raise Exception('Unexpected msgid at %s:%d'
                                % (fname, lineno))
            elif kind == _MSGSTR:
                state = _MSGSTR
                msg.add_msgstr(line)
            elif kind == _STR:
                msg.add_msgid(line)

        elif state == _MSGSTR:
            if kind == _CMNT:
                # A comment probably starts a new item
                state = _CMNT
                msg = _new_item(msgs, lineno)
                msg.add_cmnt(line)
            elif kind == _MSGID:
                state = _MSGID
                msg = _new_item(msgs, lineno)
                msg.add_msgid(line)
            elif kind == _MSGSTR:
                raise Exception('Unexpected msgstr at %s:%d'
                                % (fname, lineno))
            elif kind == _STR:
                msg.add_msgstr(line)

        else:
            raise Exception(
                'Unexpected state in po parsing (state = %d)' % state)

    # Strip items with just comments.
    return [m for m in msgs if m.msgid() or m.msgstr()]


def _print_flagged(title, msgs, flag):
    """Print a blank line, title, and diag() of every item whose attribute
    named flag is set."""
    print("")
    print(title)
    for m in msgs:
        if getattr(m, flag):
            m.diag()


def analyze_msgs(fname, msgs, nr_templates=None, nth=0):
    """Run all checks on msgs, set the error flags on the items and print
    a report for fname to stdout.

    fname        -- file name shown in the report
    msgs         -- list of Msgid objects as returned by read_msgs()
    nr_templates -- number of messages in the template; when None the
                    number of messages in msgs is used
    nth          -- index of this file in the run; a separator is printed
                    before every report except the first (nth == 0)

    Checks: '%s' count mismatches, named format count and name mismatches,
    named formats without 's' or 'd', runaway context ('|' in both msgid
    and msgstr) and, for tips.xml items, unescaped XML special characters.
    Untranslated items are counted and not checked any further.
    """
    nr_fuzzy = 0
    nr_untranslated = 0
    nr_sfmt_mismatches = 0
    nr_named_fmt_mismatches = 0
    nr_fmt_missing_sd = 0
    nr_context_errors = 0
    nr_xml_errors = 0

    for msg in msgs:
        msgid = msg.msgid()
        msgstr = msg.msgstr()

        if not msgstr:
            nr_untranslated += 1
            continue

        if msgid is None:
            # A "wild" msgstr without msgid (read_msgs already warned);
            # compare against an empty msgid instead of failing on None.
            msgid = ''

        if msg.is_fuzzy:
            nr_fuzzy += 1
            # Fuzzy items are still checked below.

        if msgid.count('%s') != msgstr.count('%s'):
            nr_sfmt_mismatches += 1
            msg.set_sfmt_mismatch()

        # Same number of named formats?
        fmts1 = _NAMED_FMT_PAT.findall(msgid)
        fmts2 = _NAMED_FMT_PAT.findall(msgstr)
        if len(fmts1) != len(fmts2):
            # Counted under the %s mismatches, at most once per item.
            if not msg.has_sfmt_mismatch:
                nr_sfmt_mismatches += 1
                msg.set_sfmt_mismatch()

        # Do we have the same named formats?
        if sorted(fmts1) != sorted(fmts2):
            nr_named_fmt_mismatches += 1
            msg.set_named_fmt_mismatch()

        # Any formats missing format letter?
        for letter in _NAMED_FMT_LETTER_PAT.findall(msgstr):
            if letter not in ('s', 'd'):
                nr_fmt_missing_sd += 1
                msg.set_fmt_missing_sd()
                break

        # Runaway context. In the translated part we only want to see
        # the translation of the word after the |
        if '|' in msgid and '|' in msgstr and msgid != msgstr:
            nr_context_errors += 1
            msg.set_context_error()

        # XML errors
        # Only look at messages in the tips.xml
        if msg.is_tips_xml() and _XML_CHARS_PAT.search(msgstr):
            nr_xml_errors += 1
            msg.set_xml_error()

    nr_msgs = len(msgs)
    if nr_templates is None:
        nr_templates = nr_msgs

    if nth > 0:
        print("")
        print("=====================================")
    print("%-20s%s" % ("File:", fname))
    print("%-20s%d" % ("Template total:", nr_templates))
    print("%-20s%d" % ("PO total:", nr_msgs))
    print("%-20s%d" % ("Fuzzy:", nr_fuzzy))
    print("%-20s%d" % ("Untranslated:", nr_untranslated))
    print("%-20s%d" % ("%s mismatches:", nr_sfmt_mismatches))
    print("%-20s%d" % ("%() name mismatches:", nr_named_fmt_mismatches))
    print("%-20s%d" % ("%() missing s/d:", nr_fmt_missing_sd))
    print("%-20s%d" % ("Runaway context:", nr_context_errors))
    print("%-20s%d" % ("XML special chars:", nr_xml_errors))

    # An empty file or an empty template yields 0% instead of a
    # ZeroDivisionError.
    if nr_msgs:
        po_coverage = (1.0 - (float(nr_untranslated) / float(nr_msgs))) * 100
    else:
        po_coverage = 0.0
    print("%-20s%5.2f%%" % ("PO Coverage:", po_coverage))

    if nr_templates:
        template_coverage = po_coverage * float(nr_msgs) / float(nr_templates)
    else:
        template_coverage = 0.0
    print("%-20s%5.2f%%" % ("Template Coverage:", template_coverage))

    if nr_sfmt_mismatches:
        _print_flagged("-------- %s mismatches --------------",
                       msgs, 'has_sfmt_mismatch')
    if nr_named_fmt_mismatches:
        _print_flagged("-------- %() name mismatches --------------",
                       msgs, 'has_named_fmt_mismatch')
    if nr_fmt_missing_sd:
        _print_flagged(
            "-------- %() without 's' or 'd' mismatches --------------",
            msgs, 'has_fmt_missing_sd')
    if nr_context_errors:
        _print_flagged("-------- Runaway context in translation ---------",
                       msgs, 'has_context_error')
    if nr_xml_errors:
        _print_flagged("-------- unescaped XML special characters ---------",
                       msgs, 'has_xml_error')


def main():
    """Check every po file named on the command line against gramps.pot.

    Returns 0 on success and 1 when reading or parsing failed; the error
    message is printed to stdout, as before.
    """
    try:
        pot_msgs = read_msgs('gramps.pot')
        nr_templates = len(pot_msgs)
        for nth, fname in enumerate(sys.argv[1:]):
            msgs = read_msgs(fname)
            analyze_msgs(fname, msgs, nr_templates, nth)
    except Exception:
        # sys.exc_info() instead of "except Exception, e" / "as e": this
        # spelling is valid on every Python 2 and Python 3 version.
        print(sys.exc_info()[1])
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())