Previous 199869 Revisions Next

r45103 Sunday 21st February, 2016 at 13:45:38 UTC by Miodrag Milanović
Placed official version, and removed other files since work only with parsing python (nw)
[scripts/build]check_po.py msgfmt.py msgmerge.py pygettext.py

trunk/scripts/build/check_po.py
r253614r253615
1#! /usr/bin/env python
2#
3# check_po - a gramps tool to check validity of po files
4#
5# Copyright (C) 2006-2006  Kees Bakker
6#
7# This program is free software; you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation; either version 2 of the License, or
10# (at your option) any later version.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with this program; if not, write to the Free Software
19# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
20
21#
22# TODO
23#
24# * Check for HTML text in msgstr when there is none in msgid
25# * Check for matching HTML tag/endtag in msgstr
26#
27
28# Adapted for Umit by Guilherme Polo, original file:
29# https://gramps.svn.sourceforge.net/svnroot/gramps/branches/gramps22/po/check_po
30
31import re
32import sys
33from optparse import OptionParser
34
35APP = "Umit"
36
37all_total = {}
38all_fuzzy = {}
39all_untranslated = {}
40all_percent_s = {}
41all_named_s = {}
42all_bnamed_s = {}
43all_context = {}
44all_coverage = {}
45all_template_coverage = {}
46
def strip_quotes(st):
    """Return `st` without its enclosing double quotes.

    Only a string of at least two characters that both starts and ends
    with '"' is changed; anything else is handed back untouched.
    """
    quoted = len(st) >= 2 and st[0] == '"' and st[-1] == '"'
    if not quoted:
        return st
    return st.strip()[1:-1]
51
52# This is a base class for all checks
class Check:
    """Shared behaviour of every check: collect offending messages, report them.

    Subclasses provide `diag_header` and `summary_text` and append the
    messages they object to onto `self.msgs` from their process() method.
    """
    def __init__( self ):
        self.msgs = []

    def diag( self ):
        """Print the header followed by each collected message; silent if none."""
        if not self.msgs:
            return
        print("")
        print(self.diag_header)
        for offender in self.msgs:
            offender.diag()

    def summary( self ):
        """Print one aligned line: the check's label and its number of hits."""
        print("%-20s%d" % ( self.summary_text, len(self.msgs) ))
64
class Check_fmt( Check ):
    """Flag messages where a format (e.g. '%s') occurs a different number
    of times in msgid and msgstr."""
    def __init__( self, fmt ):
        Check.__init__( self )
        self.fmt = fmt
        self.diag_header = "-------- %s mismatches --------------" % fmt
        self.summary_text = "%s mismatches:" % fmt

    def process( self, msg ):
        """Record `msg` when the occurrence counts of the format differ."""
        in_source = msg.msgid.count( self.fmt )
        in_translation = msg.msgstr.count( self.fmt )
        if in_source != in_translation:
            self.msgs.append( msg )
78
class Check_named_fmt( Check ):
    """Flag messages whose named formats (%(name)s ...) differ between
    msgid and msgstr, ignoring their order."""
    # A pattern to find all %()
    find_named_fmt_pat = re.compile(r'% \( \w+ \) \d* \D', re.VERBOSE)

    def __init__( self ):
        Check.__init__( self )
        self.diag_header = "-------- %() name mismatches --------------"
        self.summary_text = "%() name mismatches:"

    def process( self, msg ):
        """Record `msg` unless both sides hold exactly the same named formats."""
        # Comparing the sorted lists covers both a differing count and
        # differing names in one step.
        wanted = sorted( self.find_named_fmt_pat.findall( msg.msgid ) )
        found = sorted( self.find_named_fmt_pat.findall( msg.msgstr ) )
        if wanted != found:
            self.msgs.append( msg )
101
class Check_missing_sd( Check ):
    """Flag translations holding a %(name) that is not followed by 's' or 'd'."""
    # A pattern to find %() without s or d.  The group captures whatever
    # follows the (optional) width: one non-digit character, or the empty
    # string at the end of the text.
    find_named_fmt_pat2 = re.compile(r'% \( \w+ \) \d* (\D|$)', re.VERBOSE)

    def __init__( self ):
        Check.__init__( self )
        self.diag_header = "-------- %() without 's' or 'd' mismatches --------------"
        self.summary_text = "%() missing s/d:"

    def process( self, msg ):
        """Record `msg` once if any named format in msgstr lacks its s/d."""
        conversions = self.find_named_fmt_pat2.findall( msg.msgstr )
        if any( conv not in ('s', 'd') for conv in conversions ):
            self.msgs.append( msg )
119
class Check_runaway( Check ):
    """Flag translations that still carry the 'context|' part of the msgid."""
    def __init__( self ):
        Check.__init__( self )
        self.diag_header = "-------- Runaway context in translation ---------"
        self.summary_text = "Runaway context:"

    def process( self, msg ):
        """Record `msg` when msgid and msgstr both contain '|' yet differ."""
        # Runaway context. In the translated part we only want to see
        # the translation of the word after the |
        both_have_bar = '|' in msg.msgid and '|' in msg.msgstr
        if both_have_bar and msg.msgid != msg.msgstr:
            self.msgs.append( msg )
133
class Check_xml_chars( Check ):
    """Flag tips.xml translations that match the unescaped-XML-character pattern."""
    # Special XML characters
    # It is not allowed to have a quote, an ampersand or an angle bracket
    xml_chars_pat = re.compile( r'(?<=\W) > | " | & (?!(quot|nbsp|gt|amp);)', re.VERBOSE )

    def __init__( self ):
        Check.__init__( self )
        self.diag_header = "-------- unescaped XML special characters ---------"
        self.summary_text = "XML special chars:"

    def process( self, msg ):
        """Record `msg` if it stems from tips.xml and its msgstr matches the pattern."""
        # Only messages referenced from tips.xml are looked at
        if not msg.is_tips_xml:
            return
        if self.xml_chars_pat.search( msg.msgstr ):
            self.msgs.append( msg )
152
class Check_last_char( Check ):
    """Flag non-fuzzy messages whose msgid and msgstr end differently
    (trailing white space or trailing period on one side only)."""
    def __init__( self ):
        Check.__init__( self )
        self.diag_header = "-------- last character not identical ---------"
        self.summary_text = "Last character:"

    def process( self, msg ):
        """Record `msg` when the final characters disagree on space or '.'."""
        if msg.is_fuzzy:
            return

        # Slices (not [-1]) so that empty strings simply give ''
        id_tail = msg.msgid[-1:]
        str_tail = msg.msgstr[-1:]
        space_differs = id_tail.isspace() != str_tail.isspace()
        period_differs = (id_tail == '.') != (str_tail == '.')
        if space_differs or period_differs:
            self.msgs.append( msg )
172
class Check_shortcut_trans( Check ):
    """Flag translations containing '_' when the msgid has none."""
    def __init__( self ):
        Check.__init__( self )
        self.diag_header = "-------- shortcut key in translation ---------"
        self.summary_text = "Shortcut in msgstr:"

    def process( self, msg ):
        """Record `msg` when only the msgstr contains an underscore."""
        if '_' not in msg.msgid and '_' in msg.msgstr:
            self.msgs.append( msg )
184
class Msgid:
    """One entry of a PO file: its comments, msgid and msgstr.

    `msgid` / `msgstr` hold the concatenated, unquoted text; `_msgid`,
    `_msgstr` and `_cmnt` keep the raw input lines.  `is_fuzzy` and
    `is_tips_xml` are 0/1 flags derived from the comment lines.
    """
    # Only a flag comment ("#, fuzzy") marks an entry as fuzzy.  Searching
    # every comment for the bare word would also trigger on references such
    # as "#: src/fuzzy.py:12" or on translator remarks.
    fuzzy_pat = re.compile( r'^#,.*\bfuzzy\b' )
    tips_xml_pat = re.compile( r'tips\.xml' )

    def __init__( self, msgnr, lineno ):
        self._msgid = []
        self._msgstr = []
        self.msgid = ''
        self.msgstr = ''
        self._cmnt = []
        self.nr = msgnr
        self.lineno = lineno
        self.is_fuzzy = 0
        self.is_tips_xml = 0

    def diag( self ):
        """Print the entry's number and line, then its raw msgid/msgstr lines."""
        if self.is_fuzzy:
            fuzzy_mark = " (fuzzy)"
        else:
            fuzzy_mark = ""
        print("")
        print("msg nr: %d, lineno: %d%s" % ( self.nr, self.lineno, fuzzy_mark ))
        sys.stdout.write( ''.join( self._msgid ) )
        sys.stdout.write( ''.join( self._msgstr ) )

    def _unquote( self, keyword, line, lineno ):
        """Strip a leading `keyword`, report a missing quote, return the bare text."""
        # Anchored: the keyword is removed only at the start of the line,
        # never from inside the message text itself.
        text = re.sub( r'^' + keyword + r'\s+', '', line ).strip()
        # Slices rather than text[0]: an empty remainder is reported
        # instead of raising IndexError.
        if text[:1] != '"' or text[-1:] != '"':
            print("ERROR at line %d: Missing quote." % lineno)
        return strip_quotes( text )

    def add_msgid( self, line, lineno ):
        """Append a msgid line (first or continuation) to this entry."""
        self._msgid.append( line )
        self.msgid += self._unquote( 'msgid', line, lineno )

    def add_msgstr( self, line, lineno ):
        """Append a msgstr line (first or continuation) to this entry."""
        self._msgstr.append( line )
        self.msgstr += self._unquote( 'msgstr', line, lineno )

    def add_cmnt( self, line ):
        """Store a comment line and update the fuzzy / tips.xml flags from it."""
        self._cmnt.append( line )
        if not self.is_fuzzy and self.fuzzy_pat.search( line ):
            self.is_fuzzy = 1
        if not self.is_tips_xml and self.tips_xml_pat.search( line ):
            self.is_tips_xml = 1
233
def read_msgs( fname ):
    """Parse the PO/POT file `fname` and return its entries as Msgid objects.

    The file is walked line by line with a small state machine.  Blank
    lines are skipped, lines starting with '#~ ' (obsolete entries) are
    ignored, and malformed input is reported with a WARNING on stdout.
    Entries that ended up with neither msgid nor msgstr text are dropped
    from the result.

    Raises Exception when a msgid directly follows a msgid, or a msgstr
    directly follows a msgstr.
    """
    empty_pat   = re.compile( r'^ \s* $',      re.VERBOSE )
    comment_pat = re.compile( r'\#',           re.VERBOSE )
    msgid_pat   = re.compile( r'msgid \s+ "',  re.VERBOSE )
    msgstr_pat  = re.compile( r'msgstr \s+ "', re.VERBOSE )
    str_pat     = re.compile( r'"',            re.VERBOSE )
    old_pat     = re.compile( r'\#~ \s+ ',     re.VERBOSE )

    # Read everything up front and release the file handle immediately
    f = open( fname )
    try:
        lines = f.readlines()
    finally:
        f.close()

    # parse it like a statemachine
    NONE   = 0         # Nothing detected, yet
    CMNT   = 1         # Inside comment part
    MSGID  = 2         # Inside msgid part
    MSGSTR = 3         # Inside msgstr part
    STR    = 4         # A continuation string
    OLD    = 5         # An old pattern with #~

    state = NONE
    msg = None
    msgs = []

    def start_item( lineno ):
        # Every created item is appended, so the number of the next
        # message is simply the count so far (the first one gets 0).
        item = Msgid( len(msgs), lineno )
        msgs.append( item )
        return item

    for ix, line in enumerate( lines ):   # Use line numbers for messages
        lineno = ix + 1
        where = '%s:%d' % ( fname, lineno )

        if empty_pat.match( line ):
            continue   # Empty lines are not interesting

        # What's the next state?  The obsolete test must come before the
        # comment test because '#~ ' lines also start with '#'.
        if old_pat.match( line ):
            next_state = OLD
        elif comment_pat.match( line ):
            next_state = CMNT
        elif msgid_pat.match( line ):
            next_state = MSGID
        elif msgstr_pat.match( line ):
            next_state = MSGSTR
        elif str_pat.match( line ):
            next_state = STR
        else:
            print('WARNING: Unexpected input at ' + where)
            next_state = NONE

        if state == NONE:
            # expect msgid or comment or old stuff
            if next_state == CMNT:
                state = CMNT
                msg = start_item( lineno )
                msg.add_cmnt( line )

            elif next_state == MSGID:
                state = MSGID
                msg = start_item( lineno )
                msg.add_msgid( line, lineno )

            elif next_state == MSGSTR:
                print('WARNING: Wild msgstr at ' + where)
                state = MSGSTR
                msg = start_item( lineno )
                msg.add_msgstr( line, lineno )

            elif next_state == STR:
                print('WARNING: Wild string at ' + where)

            # OLD (and unrecognised input): just skip

        elif state == CMNT:
            if next_state == CMNT:
                # Comments seen after an obsolete line have no item to
                # attach to; they are skipped for now.
                if msg:
                    msg.add_cmnt( line )

            elif next_state == MSGID:
                state = MSGID
                if not msg:
                    msg = start_item( lineno )
                msg.add_msgid( line, lineno )

            elif next_state == MSGSTR:
                print('WARNING: Wild msgstr at ' + where)
                state = MSGSTR
                msg = start_item( lineno )
                msg.add_msgstr( line, lineno )

            elif next_state == STR:
                print('WARNING: Wild string at ' + where)

            elif next_state == OLD:
                msg = None

        elif state == MSGID:
            if next_state == CMNT:
                # Hmmm. A comment here?
                print('WARNING: Unexpted comment at ' + where)

            elif next_state == MSGID:
                raise Exception( 'Unexpected msgid at ' + where )

            elif next_state == MSGSTR:
                state = MSGSTR
                msg.add_msgstr( line, lineno )

            elif next_state == STR:
                msg.add_msgid( line, lineno )

            elif next_state == OLD:
                # The current item is finished.  Going back to NONE keeps
                # a later string/msgstr from being added to msg == None.
                msg = None
                state = NONE

        elif state == MSGSTR:
            if next_state == CMNT:
                # A comment probably starts a new item
                state = CMNT
                msg = start_item( lineno )
                msg.add_cmnt( line )

            elif next_state == MSGID:
                state = MSGID
                msg = start_item( lineno )
                msg.add_msgid( line, lineno )

            elif next_state == MSGSTR:
                raise Exception( 'Unexpected msgstr at ' + where )

            elif next_state == STR:
                msg.add_msgstr( line, lineno )

            elif next_state == OLD:
                # Same reasoning as in the MSGID state above
                msg = None
                state = NONE

        else:
            raise Exception( 'Unexpected state in po parsing (state = %d)' % state )

    # Strip items with just comments. (Can this happen?)
    return [ m for m in msgs if m.msgid or m.msgstr ]
402
def analyze_msgs( options, fname, msgs, nr_templates = None, nth = 0 ):
    """Run every check over `msgs` and print the report for one PO file.

    options      -- object with the booleans `skip_fuzzy` and `only_summary`
    fname        -- file name shown in the report
    msgs         -- list of Msgid objects read from that file
    nr_templates -- number of entries in the template; when None the two
                    template lines are left out of the report
    nth          -- index of this file in the run; a separator is printed
                    before every report but the first

    Untranslated messages (empty msgstr) are counted and not checked.
    Fuzzy messages are counted, and checked unless options.skip_fuzzy.
    """
    checks = [
        Check_fmt( '%s' ),
        Check_fmt( '%d' ),
        Check_named_fmt(),
        Check_missing_sd(),
        Check_runaway(),
        Check_xml_chars(),
        Check_last_char(),
        Check_shortcut_trans(),
    ]

    nr_fuzzy = 0
    nr_untranslated = 0
    for msg in msgs:
        if not msg.msgstr:
            nr_untranslated += 1
            continue

        if msg.is_fuzzy:
            nr_fuzzy += 1
            if options.skip_fuzzy:
                continue

        for c in checks:
            c.process( msg )

    nr_msgs = len(msgs)
    if nth > 0:
        print("")
        print("=====================================")
    print("%-20s%s"     % ( "File:",              fname ))
    if nr_templates is not None:
        print("%-20s%d" % ( "Template total:",    nr_templates ))
    print("%-20s%d"     % ( "PO total:",          nr_msgs ))
    print("%-20s%d"     % ( "Fuzzy:",             nr_fuzzy ))
    print("%-20s%d"     % ( "Untranslated:",      nr_untranslated ))

    for c in checks:
        c.summary()

    # A file without entries has nothing translated: report 0% rather
    # than dividing by zero.
    if nr_msgs:
        po_coverage = (1.0 - (float(nr_untranslated) / float(nr_msgs))) * 100
    else:
        po_coverage = 0.0
    print("%-20s%5.2f%%" % ( "PO Coverage:",       po_coverage ))

    if nr_templates is not None:
        if nr_templates:
            template_coverage = po_coverage * float(nr_msgs) / float(nr_templates)
        else:
            template_coverage = 0.0   # empty template, same reasoning
        print("%-20s%5.2f%%" % ( "Template Coverage:", template_coverage ))

    if not options.only_summary:
        for c in checks:
            c.diag()
458
def main(args):
    """Command line entry point: check PO files against a template.

    `args` is the full argument vector including the program name
    (sys.argv).  The first positional argument is the template (.pot)
    file, whose entry count is used for the template coverage.  Every
    positional argument, the template included, is then analyzed and
    reported in order.

    Exits with status 1 when no file is given or when reading/analyzing
    a file fails; the failure message goes to stderr.
    """
    parser = OptionParser(description="This program validates a PO file for "
                          "%s." % APP, usage='%prog [options] po-file...' )

    parser.add_option("", "--skip-fuzzy",
                      action="store_true", dest="skip_fuzzy", default=False,
                      help="skip fuzzies")

    parser.add_option("-s", "--only-summary",
                      action="store_true", dest="only_summary", default=False,
                      help="only give the summary")

    # Parse first, then pick the template from the positional arguments:
    # taking sys.argv[1] blindly would mistake a leading option such as
    # "-s" for the template file.
    options, files = parser.parse_args( args[1:] )

    if not files:
        print("Error: Especify the umit.pot file path")
        sys.exit(1)

    try:
        nr_templates = len( read_msgs( files[0] ) )
        for nth, fname in enumerate( files ):
            analyze_msgs( options, fname, read_msgs( fname ), nr_templates, nth )

    except Exception as e:
        # Top-level boundary: report the problem and signal failure to
        # the caller instead of exiting with status 0.
        sys.stderr.write( "%s\n" % e )
        sys.exit(1)
488
489if __name__ == "__main__":
490    main(sys.argv)
trunk/scripts/build/msgfmt.py
r253614r253615
1#! /usr/bin/env python
1#!/usr/bin/env python2
22# -*- coding: iso-8859-1 -*-
33# Written by Martin v. Löwis <loewis@informatik.hu-berlin.de>
4#
5# Changelog: (Guilherme Polo)
6#   2008-04-11
7#    - Support for files with BOM UTF8 mark.
8#
9#   2008-04-10
10#    - Support for fuzzy strings in output.
11#    - Bumped to version 1.1.1
124
135"""Generate binary message catalog from textual translation description.
146
r253614r253615
2416        Specify the output file to write to.  If omitted, output will go to a
2517        file named filename.mo (based off the input file name).
2618
27    -f
28    --use-fuzzy
29        Use fuzzy entries in output
30
3119    -h
3220    --help
3321        Print this message and exit.
r253614r253615
3523    -V
3624    --version
3725        Display version information and exit.
38
39Before using the -f (fuzzy) option, read this:
40    http://www.finesheer.com:8457/cgi-bin/info2html?(gettext)Fuzzy%20Entries&lang=en
4126"""
4227
28import os
4329import sys
44import os
30import ast
4531import getopt
4632import struct
4733import array
48import codecs
4934
50__version__ = "1.1.1"
35__version__ = "1.1"
5136
5237MESSAGES = {}
5338
5439
40 
5541def usage(code, msg=''):
5642    print >> sys.stderr, __doc__
5743    if msg:
r253614r253615
5945    sys.exit(code)
6046
6147
62def add(id, str, fuzzy, use_fuzzy):
63    "Add a translation to the dictionary."
48 
49def add(id, str, fuzzy):
50    "Add a non-fuzzy translation to the dictionary."
6451    global MESSAGES
65    if (not fuzzy or use_fuzzy) and str:
52    if not fuzzy and str:
6653        MESSAGES[id] = str
6754
6855
56 
6957def generate():
7058    "Return the generated output."
7159    global MESSAGES
r253614r253615
10896    return output
10997
11098
111def make(filename, outfile, use_fuzzy):
99 
100def make(filename, outfile):
112101    ID = 1
113102    STR = 2
114103
r253614r253615
122111
123112    try:
124113        lines = open(infile).readlines()
125        if lines[0].startswith(codecs.BOM_UTF8):
126            lines[0] = lines[0][len(codecs.BOM_UTF8):]
127114    except IOError, msg:
128115        print >> sys.stderr, msg
129116        sys.exit(1)
r253614r253615
137124        lno += 1
138125        # If we get a comment line after a msgstr, this is a new entry
139126        if l[0] == '#' and section == STR:
140            add(msgid, msgstr, fuzzy, use_fuzzy)
127            add(msgid, msgstr, fuzzy)
141128            section = None
142129            fuzzy = 0
143130        # Record a fuzzy mark
r253614r253615
147134        if l[0] == '#':
148135            continue
149136        # Now we are in a msgid section, output previous section
150        if l.startswith('msgid'):
137        if l.startswith('msgid') and not l.startswith('msgid_plural'):
151138            if section == STR:
152                add(msgid, msgstr, fuzzy, use_fuzzy)
139                add(msgid, msgstr, fuzzy)
153140            section = ID
154141            l = l[5:]
155142            msgid = msgstr = ''
143            is_plural = False
144        # This is a message with plural forms
145        elif l.startswith('msgid_plural'):
146            if section != ID:
147                print >> sys.stderr, 'msgid_plural not preceded by msgid on %s:%d' %\
148                    (infile, lno)
149                sys.exit(1)
150            l = l[12:]
151            msgid += '\0' # separator of singular and plural
152            is_plural = True
156153        # Now we are in a msgstr section
157154        elif l.startswith('msgstr'):
158155            section = STR
159            l = l[6:]
156            if l.startswith('msgstr['):
157                if not is_plural:
158                    print >> sys.stderr, 'plural without msgid_plural on %s:%d' %\
159                        (infile, lno)
160                    sys.exit(1)
161                l = l.split(']', 1)[1]
162                if msgstr:
163                    msgstr += '\0' # Separator of the various plural forms
164            else:
165                if is_plural:
166                    print >> sys.stderr, 'indexed msgstr required for plural on  %s:%d' %\
167                        (infile, lno)
168                    sys.exit(1)
169                l = l[6:]
160170        # Skip empty lines
161171        l = l.strip()
162172        if not l:
163173            continue
164        # XXX: Does this always follow Python escape semantics?
165        l = eval(l)
174        l = ast.literal_eval(l)
166175        if section == ID:
167176            msgid += l
168177        elif section == STR:
r253614r253615
174183            sys.exit(1)
175184    # Add last entry
176185    if section == STR:
177        add(msgid, msgstr, fuzzy, use_fuzzy)
186        add(msgid, msgstr, fuzzy)
178187
179188    # Compute output
180189    output = generate()
r253614r253615
185194        print >> sys.stderr, msg
186195
187196
197 
188198def main():
189199    try:
190        opts, args = getopt.getopt(sys.argv[1:], 'hVo:f',
191            ['help', 'version', 'output-file=', 'use-fuzzy'])
200        opts, args = getopt.getopt(sys.argv[1:], 'hVo:',
201                                   ['help', 'version', 'output-file='])
192202    except getopt.error, msg:
193203        usage(1, msg)
194204
195205    outfile = None
196    use_fuzzy = False
197206    # parse options
198207    for opt, arg in opts:
199208        if opt in ('-h', '--help'):
r253614r253615
201210        elif opt in ('-V', '--version'):
202211            print >> sys.stderr, "msgfmt.py", __version__
203212            sys.exit(0)
204        elif opt in ('-f', '--use-fuzzy'):
205            use_fuzzy = True
206213        elif opt in ('-o', '--output-file'):
207214            outfile = arg
208215    # do it
r253614r253615
212219        return
213220
214221    for filename in args:
215        make(filename, outfile, use_fuzzy)
222        make(filename, outfile)
216223
217224
218225if __name__ == '__main__':
trunk/scripts/build/msgmerge.py
r253614r253615
1#! /usr/bin/env python
2# -*- coding: iso-8859-1 -*-
3#
4# Copyright Terje Røsten <terjeros@phys.ntnu.no> Nov. 2003.
5#
6'''Merge two Uniforum style .po files together.
7
8This is a implementation (not complete) in Python of the GNU
9msgmerge(1) program. It can be used on the command line (or as a Python
10module).
11
12Usage: msgmerge.py [OPTIONS] def.po ref.pot
13
14The def.po file is an existing PO file with translations. The ref.pot
15file is the last created PO file with up-to-date source references but
16old translations, or a PO Template file.
17
18Options:
19  -U, --update           update def.po,
20                         do nothing if def.po is already up to date.
21  -o, --output-file=FILE write output to file FILE. Output is written to
22                         stdout if set to - or if the option is not present.
23  -D, --docstrings       don\'t remove docstring flag.
24  -h, --help             display help text and exit.
25  -V, --version          display version and exit.
26  -q, --quiet, --silent  suppress progress indicators.
27'''
28from __future__ import generators
29
30if not __name__ == '__main__':
31    __doc__ += '''\
32
33When used as module the interesting functions are merge() and
34merge_dir().
35
36The merge() function does the same as the command line version, and
37the arguments are as follows. The first argument is the def.po file,
38then the ref.pot file. The third argument controls whether do work in
39update mode or not, then the next argument sets the output file. Set
40the next argument to False to remove docstring flags. The last
41argument can be used to suppress progress indicators. The default is
42to work in update mode with progress indicators.
43
44Example:
45 merge("def.po", "ref.pot")
46  merge the files def.po and ref.pot and write output to def.po if
47  there are any changes.
48 merge("def.po", "red.pot", docstrings = False, verbose = False,
49       update = False, outfile = "-")
50  merge the files def.po and ref.pot and write output to stdout,
51  remove docstring flag and be quiet.
52
53The merge_dir() function is useful when merging a directory of po
54files. The only required argument is the name of the directory with po
55files and the pot file. It will use simple glob to find the files. The
56second argument can be used to specify the pot file (in the
57directory). Third argument is a list of po files (then globbing will
58not be used) and the next argument is list of filename to exclude. The
59last argument can be used to suppress progress indicators. Docstring
60flag will not be removed.
61
62Example:
63 merge_dir("po")
64  merge (and update) all po files in directory po with the single pot
65  file in the same directory.
66
67The module raises the MsgmergeError exception in case of error.
68'''
69__revision__ = '$Id: msgmerge.py,v 1.41 2003/11/18 19:10:42 terjeros Exp $'
70__version__ = '0.1'
71name = 'msgmerge.py'
72
73__all__ = [ 'merge', 'merge_dir', 'MsgmergeError' ]
74
75import sys
76import re
77import string
78import getopt
79import difflib
80import glob
81import os.path
82import codecs
83
84try:
85    True, False
86except NameError:
87    True, False = 1, 0
88
class Msgs:
    '''Holds one message: id, translation, comments, flag and usage count.'''
    width = 80
    file = ''

    def __init__(self, msgid, msgstr, flag, lno, entry, **kwds):
        self.id, self.str = msgid, msgstr
        self.flag, self.lno, self.entry = flag, lno, entry
        # Optional comment fields, empty unless passed as keywords
        self.cmt = kwds.get('cmt', '')
        self.ref = kwds.get('ref', '')
        self.autocmt = kwds.get('autocmt', '')
        self.count = 0

    def wash(self):
        '''Run both fields through wash() using the current width.'''
        self.id = wash(self.id, width = self.width,
                       filename = self.file, lno = self.lno)
        self.str = wash(self.str, 'msgstr', width = self.width,
                        filename = self.file, lno = self.lno)

    def used(self):
        '''Count one more use of this message.'''
        self.count = self.count + 1

    def get_clean_id(self):
        '''Return the id with its first 'msgid "' removed.'''
        return self.id.replace('msgid "', '', 1)

    def obsolete(self):
        '''Re-wash with room for the prefix, then prefix every line with "#~ ".'''
        # Narrow the width first so the washed lines still fit once
        # the prefix has been put in front of them.
        self.width = self.width - len('#~ ')
        self.wash()
        self.id = ''.join([ '#~ %s\n' % part for part in self.id.splitlines() ])
        self.str = ''.join([ '#~ %s\n' % part for part in self.str.splitlines() ])
119
class Options:
    '''Holds the settings of one merge run.

    With cmdline true the command line defaults are used and keyword
    arguments are ignored; otherwise each setting is taken from the
    keywords, falling back to the module defaults.
    '''
    def __init__(self, cmdline = False, **kwds):
        if cmdline:
            self.update = False
            self.outfile = False
            self.docstrings = False
            self.verbose = True
            self.suffix = '~'
            self.backup = True
            return

        lookup = kwds.get
        self.update = lookup('update', True)
        self.outfile = lookup('outfile', '-')
        self.docstrings = lookup('docstrings', True)
        self.verbose = lookup('verbose', False)
        self.suffix = lookup('suffix', '~')
        self.backup = lookup('backup', True)
137
class MsgmergeError(Exception):
    '''Error type raised by the msgmerge code.'''
    pass
140
def gen(lines):
    '''
    Generator over the list of lines in <lines>: yields each line with
    its first obsolete prefix ("#~ ") removed, together with its line
    number (starting at 1).

    After the last line one extra item is yielded: the last line again,
    unmodified, with the same line number.
    NOTE(review): presumably a sentinel so that a consumer reading one
    line ahead does not run into StopIteration -- confirm with callers.
    For empty input that extra item is ('', 0).
    '''
    lno = 0
    l = ''   # keeps the trailing yield valid when <lines> is empty
    for l in lines:
        lno += 1
        yield l.replace('#~ ', '', 1), lno
    yield l, lno
152
def slurp(s, g, sign):
    '''
    Pull (line, line number) pairs from iterator <g> through its next()
    method and append each line to <s> for as long as the line starts
    with <sign>.  When <sign> is "# ", a line holding nothing but "#"
    is accepted as well.

    Returns the first line that was not accepted, its line number, the
    iterator <g> and the (possibly) extended string <s>.
    '''
    while True:
        l, lno = g.next()
        accepted = l.startswith(sign) or (sign == '# ' and l.strip() == '#')
        if not accepted:
            return l, lno, g, s
        s += l
166
def splitted_fit(chunk, line, width, break_always, break_after_space):
    '''
    Look, from the end of <chunk> backwards, for a position where
    <chunk> can be split.  Returns the tuple (head, rest); head is ''
    and rest the whole chunk when no split position exists.

    A position qualifies when
     - its character is in <break_always> and head + <line> fits into
       <width>, or
     - its character is in <break_after_space>, it is not the first
       character and the character before it is white space, or
     - it starts an escaped quote (backslash followed by ") and
       head + <line> fits into <width>.
    '''
    size = len(chunk)
    for pos in range(size - 1, -1, -1):
        char = chunk[pos]
        head = chunk[:pos]
        fits = len(head + line) <= width
        escaped_quote = char == '\\' and pos + 1 < size and chunk[pos + 1] == '"'
        after_space = char in break_after_space and pos and chunk[pos - 1].strip() == ''
        if (char in break_always and fits) or after_space \
               or (escaped_quote and fits):
            return head, chunk[pos:]
    return '', chunk
188
def wrap(msg, width):
    '''
    Wrap the string <msg> to lines of about <width> characters.  Every
    resulting line is enclosed in a pair of " and the lines are joined
    with newlines.  An empty or all-white-space <msg> is returned as a
    single quoted line.
    '''
    if not msg or msg.isspace():
        return '"%s"' % msg

    # \ and " is here, but " is special in po files.
    break_always = '$%+({['
    # XXX what about other symbols (guillemets, copyright sign, ...)?
    break_after_space = '_-=^`~\'<|>&*#@'
    enders = '.:,;!?/])}|%-'
    # All punctuation that does not end a piece
    extra = ''.join([ c for c in string.punctuation if c not in enders ])
    escaped = { 'enders' : re.escape(enders),
                'extra'  : re.escape(extra) }
    regex = r'([\w%(extra)s]*[\s%(enders)s)]+[\s%(enders)s]*)' % escaped
    splitter = re.compile(regex, re.UNICODE)
    pieces = [ p for p in splitter.split(msg) if p != '' ]

    out = []
    current = pieces.pop(0)

    # Handle \n on end of line: glue a lone 'n' back onto its backslash
    if len(pieces) > 1 and pieces[-1] == 'n' and len(pieces[-2]) > 0 \
           and pieces[-2][-1] == '\\':
        tail = pieces.pop()
        pieces[-1] += tail
    # Do not allow a single \n on a line
    if len(pieces) > 2 and pieces[-1] == '\\n':
        tail = pieces.pop()
        pieces[-1] += tail

    for piece in pieces:
        if len(current) + len(piece) > width:
            # Too long: keep what still fits, start a new line with the rest
            fit, rest = splitted_fit(piece, current, width, break_always,
                                     break_after_space)
            out.append(current + fit)
            current = rest
        else:
            current += piece
    out.append(current)
    return '\n'.join([ '"%s"' % l for l in out ])
237
def normalize(lines):
    '''
    Put every embedded \\n of the po field <lines> on a line of its
    own, e.g "\\n\\nText\\n\\n" becomes:
    "\\n"
    "\\n"
    "Text\\n"
    "\\n"
    The field is returned unchanged when its first \\n is missing, at
    the very start, or within the last three characters.
    '''
    first_newline = lines.find('\\n')
    if not (0 < first_newline < len(lines) - 3):
        return lines

    # A closing \n" must survive as-is, so it is cut off before the
    # replacement and glued back afterwards.
    if lines.endswith('\\n"'):
        body, tail = lines[:-3], '\\n"'
    else:
        body, tail = lines, ''
    return body.replace('\\n', '\\n"\n"').replace('""\n', '') + tail
253
def wash(msg, idx = 'msgid', width = 80, **kwds):
    '''
    Do washing on the msgstr or msgid fields. Wrap the text to fit in
    width <width>. <msg> is the raw text of the field (one or more
    lines). <idx> indicate msgid or msgstr, <width> holds the width.
    <filename> and <lno> (line number) are picked up from <kwds> and
    are only used to build the error message.
    Returns the washed field as a string. Raises MsgmergeError when a
    line of the field is not a properly quoted string.
    '''
    msg = normalize(msg)
    lines = msg.splitlines()
    size = len(lines)
    if not (size > 1 or len(msg) > width):
        # Short single-line field: nothing to rewrap.
        return msg

    washed = []
    # The first line is special: it carries the msgid/msgstr keyword.
    m = re.match(r'^%s "(.*)"$' % (idx, ), lines[0])
    if not m:
        sys.stdout.write('%s\n' % (lines[0], ))
        # Point <lno> back at the offending line of the field.
        kwds['lno'] -= size + 1
        raise MsgmergeError('parse error: %(filename)s:%(lno)s.'
                            % kwds)
    washed.append(m.group(1))
    if m.group(1).endswith(r'\n'):
        washed.append('')
    i = 0
    for line in lines[1:]:
        m = re.match(r'^"(\s*.*)"$', line)
        i += 1
        if not m:
            sys.stdout.write('%s\n' % (line, ))
            kwds['lno'] -= size - i + 1
            raise MsgmergeError('parse error: %(filename)s:%(lno)s.'
                                % kwds)
        # Continuation lines are glued together until a \n ends the
        # logical line.
        washed[-1] += m.group(1)
        if m.group(1).endswith(r'\n'):
            washed.append('')
    if washed[0] == '':
        washed.pop(0)
    if washed[-1] == '':
        washed.pop()

    washed = [ wrap(w, width - 3) for w in washed ] # " and \n removed.

    # One line or multiline
    if len(washed) == 1 and len('%s %s\n' % (idx, washed[0])) < width:
        return '%s %s\n' % (idx, washed[0])
    return '%s ""\n%s\n' % (idx, '\n'.join(washed))
305
def parse(filename, entry):
    '''
    Parse po or pot file with name <filename>. Set the variable
    <entry> to msgid/msgstr to indicate pot/po file.  The return value
    is a dict with msgid (washed) as key and Msgs instances as
    values. Raises MsgmergeError on a line that is neither blank nor
    part of a recognised field.
    '''
    fo = io(filename)
    try:
        lines = fo.readlines()
    finally:
        # Do not keep the file open for the lifetime of the process.
        fo.close()
    Msgs.file = filename
    messages = {}
    last = len(lines)
    g = gen(lines)
    cmt = autocmt = ref = flag = ''
    msgid = False
    # Initialised so that an entry without msgstr field cannot trigger
    # a NameError below.
    msgstr = ''
    lno = 0
    while not lno == last:
        l, lno = next(g)
        # Each slurp() consumes one field and hands back the first
        # line following it, so the checks below are deliberately
        # sequential and not an if/elif chain.
        if l.startswith('# '):
            l, lno, g, cmt  = slurp(l, g, '# ')
        if l.startswith('#.'):
            l, lno, g, autocmt = slurp(l, g, '#.')
        if l.startswith('#:'):
            l, lno, g, ref = slurp(l, g, '#:')
        if l.startswith('#,'):
            l, lno, g, flag = slurp(l, g, '#,')
        if l.startswith('msgid'):
            l, lno, g, msgid = slurp(l, g, '"')
        if l.startswith('msgstr'):
            l, lno, g, msgstr = slurp(l, g, '"')

        # Whatever is left over must be the blank separator line.
        if not lno == last and not l.strip() == '':
            raise MsgmergeError('parse error: %s:%s.' % (filename, lno))

        if msgid and entry == 'msgstr':
            idx = wash(msgid, filename = filename, lno = lno)
            messages[idx] = Msgs(msgid, msgstr, flag, lno, entry, cmt = cmt)
            msgid = False; msgstr = cmt = autocmt = ref = flag = ''
        elif msgid and entry == 'msgid':
            idx = wash(msgid, filename = filename, lno = lno)
            messages[idx] = Msgs(msgid, msgstr, flag, lno, entry,
                                 autocmt = autocmt, ref = ref)
            msgid = False; msgstr = cmt = autocmt = ref = flag = ''

    for m in messages.values():
        m.wash()
    return messages
352
def fuzzy_match(pot, defs):
    '''
    Look for the Msgs object in the dict <defs> whose clean id is
    most similar (difflib ratio) to the clean id of the Msgs object
    <pot>. Entries with an empty translation or an empty msgid are
    never considered, and a candidate needs a ratio above 0.59 to be
    accepted. Returns the best candidate, or False if there is none.
    '''
    limit = 0.6
    best_ratio, best = limit - 0.01, False
    wanted = pot.get_clean_id()
    wanted_len = len(wanted)
    matcher = difflib.SequenceMatcher(lambda x: x == ' "', '', wanted)
    for candidate in defs.values():
        # Skip empty translations and the header (empty msgid).
        if candidate.str == 'msgstr ""\n' or candidate.id == 'msgid ""\n':
            continue
        candidate_id = candidate.get_clean_id()
        candidate_len = len(candidate_id)
        # Simple and fast tests first: very different lengths can
        # never give a good ratio.
        if wanted_len > 2 * candidate_len or candidate_len > 1.5 * wanted_len:
            continue
        matcher.set_seq1(candidate_id)
        if matcher.quick_ratio() < best_ratio:
            continue
        score = matcher.ratio()                  # This is expensive
        if score > best_ratio:
            best_ratio, best = score, candidate
    return best
379
def flags(po, pot, fuzzy = False, obs = False):
    '''
    Build the flag field ("#, ..." line) for an entry from the flag
    fields of the Msgs objects <po> and <pot>. When <fuzzy> is true
    <po>\'s flags are ignored and the fuzzy flag is added. If <obs> is
    set the c-format, python-format and docstring flags are dropped.
    Unless the global variable option.docstrings is set the docstring
    flag is dropped as well. The return value is the combined flag
    line, or an empty string when no flag remains.
    '''
    global option
    if not (po.flag or pot.flag or fuzzy):
        return ''

    if fuzzy:
        leading = '#, fuzzy'
    else:
        leading = po.flag.strip()
    combined = '%s, %s' % (leading, pot.flag.strip())

    # Drop empty items and duplicates while keeping the first-seen order.
    unique = []
    for item in combined.split(', '):
        if item and item not in unique:
            unique.append(item)

    unwanted = []
    if not option.docstrings:
        unwanted.append('docstring')
    if obs:
        unwanted.extend(['c-format', 'python-format', 'docstring'])
    for name in unwanted:
        if name in unique:
            unique.remove(name)

    # Put fuzzy first (right after the leading '#')
    if 'fuzzy' in unique and not unique.index('fuzzy') == 1:
        pos = unique.index('fuzzy')
        unique[1], unique[pos] = unique[pos], unique[1]

    # Only the '#' marker left: there is nothing to flag.
    if len(unique) == 1:
        return ''
    return ', '.join(unique) + '\n'
421
def add(pot, po, fuzzy = False):
    '''
    Build a new entry from the Msgs objects <pot> and <po>: the
    translator comment and msgstr are taken from <po>, the automatic
    comment, references and msgid from <pot>, and the flags are
    combined by flags(). If <fuzzy> is true, <po>\'s flag field is
    ignored (in flags()). Returns the entry as a multiline string.
    '''
    pieces = (po.cmt,
              pot.autocmt,
              pot.ref,
              flags(po, pot, fuzzy = fuzzy),
              pot.id,
              po.str)
    return ''.join(pieces)
436
def header(pot, defs):
    '''
    Update date in header entry. <pot> is the header Msgs object of
    the pot file, <defs> the dict of Msgs objects of the po file. The
    POT-Creation-Date of the po header is replaced with the one found
    in <pot>. Returns the updated header entry of the po file. Raises
    MsgmergeError if the po file does not hold exactly one header or
    if either header lacks the POT-Creation-Date field.
    '''
    headers = [ d for d in defs.values() if d.id == 'msgid ""\n' ]
    if len(headers) != 1:
        raise MsgmergeError('Error: did not find header in po file.')
    po = headers[0]

    r = re.compile(r'(.*^"POT-Creation-Date:\s+)(.*?)(\\n"$.*)',
                   re.MULTILINE | re.DOTALL)
    m = r.match(pot.str)
    if not m:
        raise MsgmergeError(
            'Error: did not find POT-Creation-Date field in pot file.')

    creation_date = m.group(2)
    # A function is used as replacement so that the date is inserted
    # literally, whatever characters it contains.
    updated, count = r.subn(
        lambda mo: mo.group(1) + creation_date + mo.group(3), po.str)
    if not count == 1:
        raise MsgmergeError(
            'Error: did not find POT-Creation-Date field in po file.')
    # Store the result; previously it was computed and thrown away so
    # the date in the po header never changed.
    po.str = updated
    return po
459
def match(defs, refs):
    '''
    Try to match Msgs objects in <refs> (pot file) with Msgs objects
    in <defs> (po file). The entries of <refs> are handled in line
    number order; the first one is taken to be the header. Obsolete
    translations are appended at the end. The return value is a list
    with po entries.
    '''
    global option
    matches = []
    empty = Msgs('msgid ""\n', 'msgstr ""\n', '', -1, 'str')
    ordered = [ (ref.lno, ref) for ref in refs.values() ]
    ordered.sort()

    # Header entry
    po = header(ordered.pop(0)[1], defs)
    matches.append(add(empty, po))
    po.used()

    for _lno, pot in ordered:
        if option.verbose:
            sys.stderr.write('.')
        fuzzy = False
        po = defs.get(pot.id, False)        # Perfect match
        if not po:
            po = fuzzy_match(pot, defs)     # Fuzzy match
            fuzzy = True
        if po:
            matches.append(add(pot, po, fuzzy = fuzzy))
            po.used(); pot.used()
        else:
            matches.append(add(pot, empty)) # No match

    obsolete(defs, matches)
    return matches
491
def obsolete(defs, matches):
    '''
    Handle obsolete translations: every translated entry of <defs>
    that was never used is marked obsolete and appended, in line
    number order, to the list <matches>.
    '''
    unused = [ (d.lno, d) for d in defs.values()
               if d.count == 0 and not d.str == 'msgstr ""\n' ]
    unused.sort()
    empty = Msgs('msgid ""\n', 'msgstr ""\n', '', -1, 'str')
    for _lno, entry in unused:
        entry.flag = flags(entry, empty, obs = True)
        entry.obsolete()
        matches.append('%s%s%s' % (entry.flag, entry.id, entry.str))
503
def help():
    '''Print help text (the module docstring) and exit with status 0.'''
    # sys.stdout.write() instead of the print statement so that the
    # module also parses under Python 3.
    sys.stdout.write('%s\n' % (__doc__, ))
    sys.exit(0)
508
def cmdline():
    '''
    Parse options and arguments from command line and run merge().
    Exits with status 1 on bad usage or when the merge fails.
    '''
    advice = 'Try `%(name)s --help\' for more information.'
    try:
        # --suffix and --backup take a value (it is stored below), so
        # they must be declared with a trailing '='.
        long_opt = ['help', 'version', 'update', 'output-file=',
                    'quiet', 'silent', 'docstrings', 'suffix=', 'backup=']
        opts, args = getopt.getopt(sys.argv[1:], 'hVUo:qD', long_opt)
    except getopt.error as msg:
        sys.stdout.write(
            ('%s: %s\n%s' % ('%(name)s', msg, advice) % globals()) + '\n')
        sys.exit(1)

    option = Options(cmdline = True)
    for opt, arg in opts:
        if opt in ['-h', '--help']:
            help()
        elif opt in ['-V', '--version']:
            sys.stdout.write('%(name)s %(__version__)s' % globals() + '\n')
            sys.exit(0)
        elif opt in ['-o', '--output-file']:
            option.outfile = arg
        elif opt in ['-U', '--update']:
            option.update = True
        elif opt in ['-q', '--silent', '--quiet']:
            option.verbose = False
        elif opt in ['-D', '--docstrings']:
            option.docstrings = True
        elif opt in ['--suffix']:
            option.suffix = arg
        elif opt in ['--backup']:
            option.backup = arg

    # Sanity checks
    warn = False
    if option.update and option.outfile:
        warn = '--update and --output-file are mutually exclusive.'
    if len(args) == 0:
        warn = 'no input files given.'
    elif len(args) == 1 or len(args) > 2:
        warn = 'exactly 2 input files required.'
    if warn:
        sys.stdout.write(
            ('%s: %s\n%s' % ('%(name)s', warn, advice) % globals()) + '\n')
        sys.exit(1)

    # In update mode the po file itself is rewritten, otherwise the
    # result goes to standard out unless an output file was given.
    if option.update:
        option.outfile = args[0]
    elif not option.outfile:
        option.outfile = '-'

    defs, refs = args

    try:
        merge(defs, refs, option = option)
    except MsgmergeError as err:
        sys.stdout.write('%(name)s: ' % globals() + '%s' % err + '\n')
        sys.exit(1)
564
def io(iofile, mode = 'rU'):
    '''
    Wrapper around open(). Opens <iofile> with mode <mode> and, when
    reading, positions the file just after a leading UTF-8 byte order
    mark if there is one. Returns the open file object. Raises
    MsgmergeError if the file cannot be opened or read.
    '''
    if sys.version_info[0] >= 3:
        # Universal newlines are the default for text files in
        # Python 3 and the 'U' flag is rejected by recent versions.
        mode = mode.replace('U', '') or 'r'
    try:
        fo = open(iofile, mode)
    except IOError as err:
        raise MsgmergeError('error while opening file: %s: %s.' %
                            (err.strerror, iofile))
    if 'r' in mode:
        try:
            head = fo.read(3)
            if isinstance(head, bytes):
                bom_len = 3
                has_bom = head == codecs.BOM_UTF8
            else:
                # Text mode under Python 3: a decoded BOM is a single
                # character.
                bom_len = 1
                has_bom = head.startswith(u'\ufeff')
            fo.seek(0)
            if has_bom:
                fo.read(bom_len)
        except IOError as err:
            # Do not leak the handle when the file cannot be read.
            fo.close()
            raise MsgmergeError('error while opening file: %s: %s.' %
                                (err.strerror, iofile))
    return fo
576
def backup(infile):
    '''
    Handle backup of files in update mode.

    NOTE: this is only a stub. It works out the name the backup file
    would get (<infile> plus the suffix found in the environment
    variable SIMPLE_BACKUP_SUFFIX, `~\' by default) but neither
    creates the file nor returns the name.
    '''
    environ = os.environ
    # Looked up but not acted upon.
    environ.get('VERSION_CONTROL', '')
    backup_file = '%s%s' % (infile, environ.get('SIMPLE_BACKUP_SUFFIX', '~'))
583   
def changes(new, old):
    '''
    Compare the merged entries <new> (a list of strings, joined with
    newlines) with the lines <old> of the existing file (joined as
    is). Returns 0 when the two texts are equal, otherwise -1 or 1
    depending on how the old text sorts relative to the new one.
    '''
    before = ''.join(old)
    after = '\n'.join(new)
    # Same result as cmp(before, after), which no longer exists in
    # Python 3.
    return (before > after) - (before < after)
586
def write(matches, outfile):
    '''
    Write the list <matches>, joined with newlines, to the file
    <outfile>. If <outfile> is `-\' the text goes to standard out.
    '''
    text = '\n'.join(matches)
    if outfile == '-':
        sys.stdout.write(text)
        return
    fd = io(outfile, 'w')
    try:
        fd.write(text)
    finally:
        # Close explicitly so that the data is flushed to disk and
        # the handle is released.
        fd.close()
594   
def merge(def_file, ref_file, update = True, outfile = '-',
          docstrings = True, suffix = '~', backup = True,
          verbose = True, **kwds):
    '''
    Merge po file <def_file> with pot file <ref_file> . If <update> is
    set to True then only update if there are changes to the po
    file. Set outfile to write updated po file to an another file. Set
    to `-\' for writing to standard out. If docstrings is False
    docstrings flag will removed. Set verbose to False to suppress
    progress indicators. <kwds> is used to pass options from the
    command line interface: when it holds an `option\' object, that
    object is used instead of the keyword arguments above.
    '''
    global option
    option = kwds.get('option', Options(update = update,
                                        outfile = outfile,
                                        docstrings = docstrings,
                                        suffix = suffix,
                                        backup = backup,
                                        verbose = verbose))
    def_msgs = parse(def_file, 'msgstr')
    ref_msgs = parse(ref_file, 'msgid')
    if verbose and not __name__ == '__main__':
        sys.stderr.write('Merging %s with %s\n' % (ref_file, def_file))
    updated_lines = match(def_msgs, ref_msgs)
    if option.verbose:
        sys.stderr.write(' done.\n')

    if not option.update:
        write(updated_lines, option.outfile)
        return

    # Update mode: only rewrite the po file when the merge changed it.
    fo = io(def_file)
    try:
        current_lines = fo.readlines()
    finally:
        fo.close()
    if changes(updated_lines, current_lines):
        write(updated_lines, def_file)
625       
def merge_dir(directory, pot = False, include = [], exclude = [],
              verbose = True):
    '''
    Tries to merge a directory of po files. Uses simple glob to find
    po files and pot file. The parameter <pot> can be used to specify
    the pot file in the directory. If the list <include> is given only
    files in this list is merged. Use the list <exclude> to exclude
    files to be merged. This function is only useful if po files and
    pot file are in the same directory. Set <verbose> to get
    information when running.
    Raises MsgmergeError if no (or more than one) pot file or no po
    file is found. A failure to merge a single po file is reported on
    standard error and does not stop the remaining files.
    '''
    if directory.endswith('/'):
        directory = os.path.dirname(directory)
    if pot:
        pot = os.path.basename(pot)
    else:
        pot = glob.glob('%s/*.pot' % directory)
        if not pot:
            raise MsgmergeError('No pot file found.')
        elif len(pot) > 1:
            raise MsgmergeError('More than one pot file found: %s.' % pot)
        pot = os.path.basename(pot[0])

    if not include:
        pos = glob.glob('%s/*po' % directory)
        # A single po file is a perfectly valid directory content.
        if not pos:
            raise MsgmergeError('No po file(s) found.')
        pos = [ os.path.basename(po) for po in pos ]
    else:
        pos = [ os.path.basename(po) for po in include ]

    for po in exclude:
        try:
            pos.remove(po)
        except ValueError:
            # Excluding a file that is not there is not an error.
            pass
    path_format = '%s/%s'
    for po in pos:
        try:
            merge(path_format % (directory, po),
                  path_format % (directory, pot),
                  update = True, verbose = verbose,
                  outfile = path_format % (directory, po))
        except MsgmergeError as err:
            if verbose:
                sys.stderr.write('%s Not updated.\n' % err)
            else:
                sys.stderr.write('%s %s not updated.\n' % (err, po))
673
674if __name__ == '__main__':
675    cmdline()
trunk/scripts/build/pygettext.py
r253614r253615
1#! /usr/bin/env python
2# -*- coding: iso-8859-1 -*-
3# Originally written by Barry Warsaw <barry@zope.com>
4#
5# Minimally patched to make it even more xgettext compatible
6# by Peter Funk <pf@artcom-gmbh.de>
7#
8# 2002-11-22 Jürgen Hermann <jh@web.de>
9# Added checks that _() only contains string literals, and
10# command line args are resolved to module lists, i.e. you
11# can now pass a filename, a module or package name, or a
12# directory (including globbing chars, important for Win32).
13# Made docstring fit in 80 chars wide displays using pydoc.
14#
15
16# for selftesting
17try:
18    import fintl
19    _ = fintl.gettext
20except ImportError:
21    _ = lambda s: s
22
23__doc__ = _("""pygettext -- Python equivalent of xgettext(1)
24
25Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
26internationalization of C programs. Most of these tools are independent of
27the programming language and can be used from within Python programs.
28Martin von Loewis' work[1] helps considerably in this regard.
29
30There's one problem though; xgettext is the program that scans source code
31looking for message strings, but it groks only C (or C++). Python
32introduces a few wrinkles, such as dual quoting characters, triple quoted
33strings, and raw strings. xgettext understands none of this.
34
35Enter pygettext, which uses Python's standard tokenize module to scan
36Python source code, generating .pot files identical to what GNU xgettext[2]
37generates for C and C++ code. From there, the standard GNU tools can be
38used.
39
40A word about marking Python strings as candidates for translation. GNU
41xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
42and gettext_noop. But those can be a lot of text to include all over your
43code. C and C++ have a trick: they use the C preprocessor. Most
44internationalized C source includes a #define for gettext() to _() so that
45what has to be written in the source is much less. Thus these are both
46translatable strings:
47
48    gettext("Translatable String")
49    _("Translatable String")
50
51Python of course has no preprocessor so this doesn't work so well.  Thus,
52pygettext searches only for _() by default, but see the -k/--keyword flag
53below for how to augment this.
54
55 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
56 [2] http://www.gnu.org/software/gettext/gettext.html
57
58NOTE: pygettext attempts to be option and feature compatible with GNU
59xgettext where ever possible. However some options are still missing or are
60not fully implemented. Also, xgettext's use of command line switches with
61option arguments is broken, and in these cases, pygettext just defines
62additional switches.
63
64Usage: pygettext [options] inputfile ...
65
66Options:
67
68    -a
69    --extract-all
70        Extract all strings.
71
72    -d name
73    --default-domain=name
74        Rename the default output file from messages.pot to name.pot.
75
76    -E
77    --escape
78        Replace non-ASCII characters with octal escape sequences.
79
80    -D
81    --docstrings
82        Extract module, class, method, and function docstrings.  These do
83        not need to be wrapped in _() markers, and in fact cannot be for
84        Python to consider them docstrings. (See also the -X option).
85
86    -h
87    --help
88        Print this help message and exit.
89
90    -k word
91    --keyword=word
92        Keywords to look for in addition to the default set, which are:
93        %(DEFAULTKEYWORDS)s
94
95        You can have multiple -k flags on the command line.
96
97    -K
98    --no-default-keywords
99        Disable the default set of keywords (see above).  Any keywords
100        explicitly added with the -k/--keyword option are still recognized.
101
102    --no-location
103        Do not write filename/lineno location comments.
104
105    -n
106    --add-location
107        Write filename/lineno location comments indicating where each
108        extracted string is found in the source.  These lines appear before
109        each msgid.  The style of comments is controlled by the -S/--style
110        option.  This is the default.
111
112    -o filename
113    --output=filename
114        Rename the default output file from messages.pot to filename.  If
115        filename is `-' then the output is sent to standard out.
116
117    -p dir
118    --output-dir=dir
119        Output files will be placed in directory dir.
120
121    -S stylename
122    --style stylename
123        Specify which style to use for location comments.  Two styles are
124        supported:
125
126        Solaris  # File: filename, line: line-number
127        GNU      #: filename:line
128
129        The style name is case insensitive.  GNU style is the default.
130
131    -v
132    --verbose
133        Print the names of the files being processed.
134
135    -V
136    --version
137        Print the version of pygettext and exit.
138
139    -w columns
140    --width=columns
141        Set width of output to columns.
142
143    -x filename
144    --exclude-file=filename
145        Specify a file that contains a list of strings that are not be
146        extracted from the input files.  Each string to be excluded must
147        appear on a line by itself in the file.
148
149    -X filename
150    --no-docstrings=filename
151        Specify a file that contains a list of files (one per line) that
152        should not have their docstrings extracted.  This is only useful in
153        conjunction with the -D option above.
154
155If `inputfile' is -, standard input is read.
156""")
157
158import os
159import imp
160import sys
161import glob
162import time
163import getopt
164import token
165import tokenize
166import operator
167
168from umit.pm.core.const import PM_VERSION
169
170__version__ = '1.5'
171
172default_keywords = ['_']
173DEFAULTKEYWORDS = ', '.join(default_keywords)
174
175EMPTYSTRING = ''
176
177
178 
179# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
180# there.
181pot_header = _('''\
182# PacketManipulator catalog.
183# Copyright (C) 2009 Adriano Montero Marques
184# Francesco Piccinno <stack.box@gmail.com>, 2009
185#
186msgid ""
187msgstr ""
188"Project-Id-Version: PacketManipulator %(pm_version)s\\n"
189"POT-Creation-Date: %(time)s\\n"
190"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
191"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
192"Language-Team: LANGUAGE <LL@li.org>\\n"
193"MIME-Version: 1.0\\n"
194"Content-Type: text/plain; charset=UTF-8\\n"
195"Content-Transfer-Encoding: 8bit\\n"
196"Generated-By: pygettext.py %(version)s\\n"
197
198''')
199
200 
def usage(code, msg=''):
    """Print the help text, and <msg> if given, to stderr; exit with <code>.

    The help text is the module docstring with its %(...)s
    placeholders filled in from the module globals.
    """
    # sys.stderr.write() instead of `print >>' so that the module also
    # parses under Python 3.
    sys.stderr.write('%s\n' % (__doc__ % globals(),))
    if msg:
        sys.stderr.write('%s\n' % (msg,))
    sys.exit(code)
206
207
208 
escapes = []

def make_escapes(pass_iso8859):
    """(Re)build the global character escape table `escapes'.

    Entry i of the table is the text to emit for the character with
    ordinal i: the character itself when it is printable, an octal
    escape otherwise, and the usual backslash sequences for backslash,
    tab, carriage return, newline and double quote.

    If pass_iso8859 is true, characters 160..254 are passed through
    unescaped as well.
    """
    global escapes
    if pass_iso8859:
        # Allow iso-8859 characters to pass through so that e.g. 'msgid
        # "H<o-umlaut>he"' would not result in 'msgid "H\366he"'.
        # Otherwise we escape any character outside the 32..126 range.
        mod = 128
    else:
        mod = 256
    table = []
    for i in range(256):
        if 32 <= (i % mod) <= 126:
            table.append(chr(i))
        else:
            table.append("\\%03o" % i)
    table[ord('\\')] = '\\\\'
    table[ord('\t')] = '\\t'
    table[ord('\r')] = '\\r'
    table[ord('\n')] = '\\n'
    table[ord('\"')] = '\\"'
    # Replace the content in place: appending would leave the entries
    # of an earlier call in front, making a second call ineffective.
    escapes[:] = table
230
231
def escape(s):
    """Return s with every character replaced by its entry in the
    global table `escapes' (indexed by character ordinal)."""
    global escapes
    return EMPTYSTRING.join([escapes[ord(char)] for char in s])
238
239
def safe_eval(s):
    """Evaluate the string literal source text s and return its value."""
    # unwrap quotes, safely: no builtins and no names are available to
    # the evaluated expression.
    # NOTE(review): eval() with emptied builtins is not a real sandbox;
    # this is acceptable only as long as s is a STRING token taken from
    # source files the user chose to scan.
    no_builtins = {'__builtins__': {}}
    no_locals = {}
    return eval(s, no_builtins, no_locals)
243
244
def normalize(s):
    """Return s as a quoted, escaped .po file string.

    This converts the various Python string types into a format that
    is appropriate for .po files, namely much closer to C style. A
    string without newline becomes one quoted line; anything else
    becomes an empty first line followed by one quoted line per line
    of s.
    """
    pieces = s.split('\n')
    if len(pieces) == 1:
        return '"' + escape(s) + '"'

    if not pieces[-1]:
        # s ends with a newline: fold it back into the last real line
        # so that it gets escaped along with it.
        del pieces[-1]
        pieces[-1] = pieces[-1] + '\n'
    escaped = [escape(piece) for piece in pieces]
    lineterm = '\\n"\n"'
    return '""\n"' + lineterm.join(escaped) + '"'
260
261 
def containsAny(str, set):
    """Return True if 'str' contains at least one of the chars in
    'set', False otherwise."""
    for char in set:
        if char in str:
            return True
    return False
265
266
267def _visit_pyfiles(list, dirname, names):
268    """Helper for getFilesForName()."""
269    # get extension for python source files
270    if not globals().has_key('_py_ext'):
271        global _py_ext
272        _py_ext = [triple[0] for triple in imp.get_suffixes()
273                   if triple[2] == imp.PY_SOURCE][0]
274
275    # don't recurse into CVS directories
276    if 'CVS' in names:
277        names.remove('CVS')
278
279    # add all *.py files to list
280    list.extend(
281        [os.path.join(dirname, file) for file in names
282         if os.path.splitext(file)[1] == _py_ext]
283        )
284
285
286def _get_modpkg_path(dotted_name, pathlist=None):
287    """Get the filesystem path for a module or a package.
288
289    Return the file system path to a file for a module, and to a directory for
290    a package. Return None if the name is not found, or is a builtin or
291    extension module.
292    """
293    # split off top-most name
294    parts = dotted_name.split('.', 1)
295
296    if len(parts) > 1:
297        # we have a dotted path, import top-level package
298        try:
299            file, pathname, description = imp.find_module(parts[0], pathlist)
300            if file: file.close()
301        except ImportError:
302            return None
303
304        # check if it's indeed a package
305        if description[2] == imp.PKG_DIRECTORY:
306            # recursively handle the remaining name parts
307            pathname = _get_modpkg_path(parts[1], [pathname])
308        else:
309            pathname = None
310    else:
311        # plain name
312        try:
313            file, pathname, description = imp.find_module(
314                dotted_name, pathlist)
315            if file:
316                file.close()
317            if description[2] not in [imp.PY_SOURCE, imp.PKG_DIRECTORY]:
318                pathname = None
319        except ImportError:
320            pathname = None
321
322    return pathname
323
324
def getFilesForName(name):
    """Get a list of module files for a filename, a module or package name,
    or a directory.

    A name that does not exist on disk is first tried as a glob
    pattern and then as a dotted module or package name. An empty
    list is returned when nothing is found.
    """
    if not os.path.exists(name):
        # check for glob chars
        if containsAny(name, "*?[]"):
            found = []
            for match in glob.glob(name):
                found.extend(getFilesForName(match))
            return found

        # try to find module or package
        name = _get_modpkg_path(name)
        if not name:
            return []

    if os.path.isdir(name):
        # find all python files in directory; os.walk() is used since
        # os.path.walk() does not exist in Python 3
        found = []
        for dirpath, dirnames, filenames in os.walk(name):
            names = dirnames + filenames
            _visit_pyfiles(found, dirpath, names)
            # honour the directories the visitor removed from names
            dirnames[:] = [d for d in dirnames if d in names]
        return found
    elif os.path.exists(name):
        # a single file
        return [name]

    return []
353
354 
355class TokenEater:
356    def __init__(self, options):
357        self.__options = options
358        self.__messages = {}
359        self.__state = self.__waiting
360        self.__data = []
361        self.__lineno = -1
362        self.__freshmodule = 1
363        self.__curfile = None
364
365    def __call__(self, ttype, tstring, stup, etup, line):
366        # dispatch
367##        import token
368##        print >> sys.stderr, 'ttype:', token.tok_name[ttype], \
369##              'tstring:', tstring
370        self.__state(ttype, tstring, stup[0])
371
372    def __waiting(self, ttype, tstring, lineno):
373        opts = self.__options
374        # Do docstring extractions, if enabled
375        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
376            # module docstring?
377            if self.__freshmodule:
378                if ttype == tokenize.STRING:
379                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
380                    self.__freshmodule = 0
381                elif ttype not in (tokenize.COMMENT, tokenize.NL):
382                    self.__freshmodule = 0
383                return
384            # class docstring?
385            if ttype == tokenize.NAME and tstring in ('class', 'def'):
386                self.__state = self.__suiteseen
387                return
388        if ttype == tokenize.NAME and tstring in opts.keywords:
389            self.__state = self.__keywordseen
390
391    def __suiteseen(self, ttype, tstring, lineno):
392        # ignore anything until we see the colon
393        if ttype == tokenize.OP and tstring == ':':
394            self.__state = self.__suitedocstring
395
396    def __suitedocstring(self, ttype, tstring, lineno):
397        # ignore any intervening noise
398        if ttype == tokenize.STRING:
399            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
400            self.__state = self.__waiting
401        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
402                           tokenize.COMMENT):
403            # there was no class docstring
404            self.__state = self.__waiting
405
406    def __keywordseen(self, ttype, tstring, lineno):
407        if ttype == tokenize.OP and tstring == '(':
408            self.__data = []
409            self.__lineno = lineno
410            self.__state = self.__openseen
411        else:
412            self.__state = self.__waiting
413
def __openseen(self, ttype, tstring, lineno):
    """State handler used after the opening parenthesis has been seen.

    STRING tokens are evaluated with safe_eval() and appended to
    self.__data.  A closing ')' ends the collection: the pieces are joined
    with EMPTYSTRING and passed to __addentry() (nothing is recorded when
    no string was collected), then the state returns to __waiting.
    Comments and layout tokens are skipped.  Any other token prints a
    warning to stderr and also returns the state to __waiting.

    ttype   -- token type constant from the tokenize module
    tstring -- text of the token
    lineno  -- line number of the token (unused here)
    """
    if ttype == tokenize.STRING:
        self.__data.append(safe_eval(tstring))
        return

    if ttype == tokenize.OP and tstring == ')':
        # We've seen the last of the translatable strings.  Record the
        # entry (its line number defaults to the one stored in
        # self.__lineno) and reset state for the next batch.  If there
        # were no strings inside _(), then just ignore this entry.
        if self.__data:
            self.__addentry(EMPTYSTRING.join(self.__data))
        self.__state = self.__waiting
        return

    layout_tokens = [tokenize.COMMENT, token.INDENT, token.DEDENT,
                     token.NEWLINE, tokenize.NL]
    if ttype in layout_tokens:
        return

    # warn if we see anything else than STRING or whitespace
    print >> sys.stderr, _(
        '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
        ) % {
        'token': tstring,
        'file': self.__curfile,
        'lineno': self.__lineno
        }
    self.__state = self.__waiting
436
def __addentry(self, msg, lineno=None, isdocstring=0):
    """Record one occurrence of `msg` unless it is on the exclusion list.

    msg         -- the message string to record
    lineno      -- line of the occurrence; defaults to self.__lineno
    isdocstring -- flag stored with the occurrence (callers in this file
                   pass 1 for docstrings and leave 0 otherwise)

    Occurrences are kept in self.__messages as
    {msg: {(filename, lineno): isdocstring}}, the file name being the
    one last stored in self.__curfile.
    """
    if lineno is None:
        lineno = self.__lineno
    if msg in self.__options.toexclude:
        return
    where = (self.__curfile, lineno)
    occurrences = self.__messages.setdefault(msg, {})
    occurrences[where] = isdocstring
443
def set_filename(self, filename):
    """Declare `filename` as the file whose tokens are fed in next.

    The name is stored for use in later entries and warnings, and the
    fresh-module flag is raised.
    """
    # NOTE(review): presumably consumed by the waiting-state handler to
    # treat the next tokens as the start of a module -- confirm there.
    self.__freshmodule = 1
    self.__curfile = filename
447
def write(self, fp):
    """Write the collected messages to `fp` as a PO template.

    The header built from pot_header comes first.  Messages are then
    grouped by their sorted tuple of (filename, lineno) locations; groups
    are emitted in sorted location order and the messages inside a group
    in sorted order.  Each message produces, in this order:

    * location comments, if options.writelocations is set, in Solaris
      style (one '# File:' line per location) or GNU style ('#:' lines
      filled up to options.width characters);
    * a '#, docstring' line if any occurrence was flagged as a docstring;
    * the 'msgid' line and an empty 'msgstr'.

    fp -- writable file object receiving the output
    """
    options = self.__options
    # The time stamp in the header doesn't have the same format as that
    # generated by xgettext...
    header_values = {
        'time': time.strftime('%Y-%m-%d %H:%M+%Z'),
        'version': __version__,
        'pm_version': PM_VERSION,
    }
    print >> fp, pot_header % header_values

    # Group the messages by the sorted tuple of places they occur in, so
    # that the output can be ordered by location.
    grouped = {}
    for msgid, places in self.__messages.items():
        group_key = tuple(sorted(places.keys()))
        grouped.setdefault(group_key, []).append((msgid, places))

    for group_key in sorted(grouped.keys()):
        for msgid, places in sorted(grouped[group_key]):
            # If the entry was gleaned out of a docstring, then add a
            # comment stating so.  This is to aid translators who may wish
            # to skip translating some unimportant docstrings.
            from_docstring = any(places.values())
            # msgid is the message string, places is a dictionary-set of
            # (filename, lineno) tuples; order them by file name and then
            # by line number.
            locations = sorted(places.keys())
            if options.writelocations:
                # location comments are different b/w Solaris and GNU:
                if options.locationstyle == options.SOLARIS:
                    for filename, lineno in locations:
                        d = {'filename': filename, 'lineno': lineno}
                        print >> fp, _(
                            '# File: %(filename)s, line: %(lineno)d') % d
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceed 'options.width'
                    pending = '#:'
                    for filename, lineno in locations:
                        d = {'filename': filename, 'lineno': lineno}
                        ref = _(' %(filename)s:%(lineno)d') % d
                        if len(pending) + len(ref) > options.width:
                            print >> fp, pending
                            pending = "#:" + ref
                        else:
                            pending = pending + ref
                    # flush the last, partially filled line
                    if len(pending) > 2:
                        print >> fp, pending
            if from_docstring:
                print >> fp, '#, docstring'
            print >> fp, 'msgid', normalize(msgid)
            print >> fp, 'msgstr ""\n'
505
506
507 
def main():
    """Command-line entry point: parse options, scan inputs, write the catalog.

    Options are read from sys.argv[1:] with getopt.  Each remaining
    argument is expanded with getFilesForName() ('-' stands for standard
    input), every resulting file is tokenized into one shared TokenEater,
    and the collected messages are finally written to the output file
    ('-' stands for standard output).

    Option errors are reported through usage().  An unreadable
    --exclude-file prints a message to stderr and exits with status 1.
    Tokenizer errors in an input file are reported on stderr and the run
    continues with the next file.
    """
    global default_keywords
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:DEhk:Kno:p:S:Vvw:x:X:',
            ['extract-all', 'default-domain=', 'escape', 'help',
             'keyword=', 'no-default-keywords',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
             # --no-docstrings names a file to open, exactly like its
             # short form -X (declared as 'X:' above), so the long option
             # has to accept an argument as well.
             'docstrings', 'no-docstrings=',
             ])
    except getopt.error as msg:
        # NOTE(review): the code below relies on usage() not returning
        # here (opts/args would be undefined) -- confirm it exits.
        usage(1, msg)

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0 # FIXME: currently this option has no effect at all.
        escape = 0
        keywords = []
        outpath = ''
        outfile = 'messages.pot'
        writelocations = 1
        locationstyle = GNU
        verbose = 0
        width = 78
        excludefilename = ''
        docstrings = 0
        nodocstrings = {}

    options = Options()
    # maps the --style argument (lower-cased) to a location style constant
    locations = {'gnu' : options.GNU,
                 'solaris' : options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-D', '--docstrings'):
            options.docstrings = 1
        elif opt in ('-k', '--keyword'):
            options.keywords.append(arg)
        elif opt in ('-K', '--no-default-keywords'):
            default_keywords = []
        elif opt in ('-n', '--add-location'):
            options.writelocations = 1
        elif opt in ('--no-location',):
            options.writelocations = 0
        elif opt in ('-S', '--style'):
            options.locationstyle = locations.get(arg.lower())
            if options.locationstyle is None:
                usage(1, _('Invalid value for --style: %s') % arg)
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            print >> sys.stdout, _(
                'pygettext.py (xgettext for Python) %s') % __version__
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                usage(1, _('--width argument must be an integer: %s') % arg)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg
        elif opt in ('-X', '--no-docstrings'):
            # The file lists one name per line.
            fp = open(arg)
            try:
                for line in fp:
                    # Strip only the line terminator: slicing off the last
                    # character would truncate a final line that has no
                    # trailing newline.
                    options.nodocstrings[line.rstrip('\r\n')] = 1
            finally:
                fp.close()

    # calculate escapes
    make_escapes(options.escape)

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            fp = open(options.excludefilename)
            try:
                # NOTE(review): readlines() keeps each line's trailing
                # newline, so an entry only matches a message that itself
                # ends with a newline -- confirm this is intended.
                options.toexclude = fp.readlines()
            finally:
                fp.close()
        except IOError:
            print >> sys.stderr, _(
                "Can't read --exclude-file: %s") % options.excludefilename
            sys.exit(1)
    else:
        options.toexclude = []

    # resolve args to module lists
    expanded = []
    for arg in args:
        if arg == '-':
            expanded.append(arg)
        else:
            expanded.extend(getFilesForName(arg))
    args = expanded

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if filename == '-':
            if options.verbose:
                print >> sys.stdout, _('Reading standard input')
            fp = sys.stdin
            closep = 0
        else:
            if options.verbose:
                print >> sys.stdout, _('Working on %s') % filename
            fp = open(filename)
            closep = 1
        try:
            eater.set_filename(filename)
            try:
                tokenize.tokenize(fp.readline, eater)
            except tokenize.TokenError as e:
                # e.args is (message, (line, column))
                print >> sys.stderr, '%s: %s, line %d, column %d' % (
                    e.args[0], filename, e.args[1][0], e.args[1][1])
        finally:
            # never close standard input
            if closep:
                fp.close()

    # write the output
    if options.outfile == '-':
        fp = sys.stdout
        closep = 0
    else:
        if options.outpath:
            options.outfile = os.path.join(options.outpath, options.outfile)
        fp = open(options.outfile, 'w')
        closep = 1
    try:
        eater.write(fp)
    finally:
        # never close standard output
        if closep:
            fp.close()
664
665 
666if __name__ == '__main__':
       # Script entry point: run the command-line tool.
667    main()
668    # some more test strings
       # (these calls run after main() returns and their results are
       # discarded; presumably they exist so that running the tool on its
       # own source has extra _() call shapes to scan -- confirm)
669    _(u'a unicode string')
670    # this one creates a warning
       # NOTE(review): only a string sits between the parentheses here, so
       # it is not obvious which token would trigger the warning -- confirm
       # that the comment above is still accurate.
671    _('*** Seen unexpected token "%(token)s"') % {'token': 'test'}
       # several adjacent string literals inside one _() call
672    _('more' 'than' 'one' 'string')


Previous 199869 Revisions Next


© 1997-2024 The MAME Team