gh-126700: pygettext: Support more gettext functions (GH-126912)

Support multi-argument gettext functions: ngettext(), pgettext(), dgettext(), etc.
This commit is contained in:
Tomas R. 2024-11-22 15:52:16 +01:00 committed by GitHub
parent f83ca6962a
commit 0a1944cda8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 260 additions and 90 deletions

View File

@ -15,53 +15,75 @@ msgstr ""
"Generated-By: pygettext.py 1.5\n" "Generated-By: pygettext.py 1.5\n"
#: messages.py:5 #: messages.py:16
msgid "" msgid ""
msgstr "" msgstr ""
#: messages.py:8 messages.py:9 #: messages.py:19 messages.py:20
msgid "parentheses" msgid "parentheses"
msgstr "" msgstr ""
#: messages.py:12 #: messages.py:23
msgid "Hello, world!" msgid "Hello, world!"
msgstr "" msgstr ""
#: messages.py:15 #: messages.py:26
msgid "" msgid ""
"Hello,\n" "Hello,\n"
" multiline!\n" " multiline!\n"
msgstr "" msgstr ""
#: messages.py:29 #: messages.py:46 messages.py:89 messages.py:90 messages.py:93 messages.py:94
#: messages.py:99
msgid "foo"
msgid_plural "foos"
msgstr[0] ""
msgstr[1] ""
#: messages.py:47
msgid "something"
msgstr ""
#: messages.py:50
msgid "Hello, {}!" msgid "Hello, {}!"
msgstr "" msgstr ""
#: messages.py:33 #: messages.py:54
msgid "1" msgid "1"
msgstr "" msgstr ""
#: messages.py:33 #: messages.py:54
msgid "2" msgid "2"
msgstr "" msgstr ""
#: messages.py:34 messages.py:35 #: messages.py:55 messages.py:56
msgid "A" msgid "A"
msgstr "" msgstr ""
#: messages.py:34 messages.py:35 #: messages.py:55 messages.py:56
msgid "B" msgid "B"
msgstr "" msgstr ""
#: messages.py:36 #: messages.py:57
msgid "set" msgid "set"
msgstr "" msgstr ""
#: messages.py:42 #: messages.py:63
msgid "nested string" msgid "nested string"
msgstr "" msgstr ""
#: messages.py:47 #: messages.py:68
msgid "baz" msgid "baz"
msgstr "" msgstr ""
#: messages.py:91 messages.py:92 messages.py:95 messages.py:96
msgctxt "context"
msgid "foo"
msgid_plural "foos"
msgstr[0] ""
msgstr[1] ""
#: messages.py:100
msgid "domain foo"
msgstr ""

View File

@ -1,5 +1,16 @@
# Test message extraction # Test message extraction
from gettext import gettext as _ from gettext import (
gettext,
ngettext,
pgettext,
npgettext,
dgettext,
dngettext,
dpgettext,
dnpgettext
)
_ = gettext
# Empty string # Empty string
_("") _("")
@ -21,13 +32,23 @@ _()
_(None) _(None)
_(1) _(1)
_(False) _(False)
_(x="kwargs are not allowed") _(("invalid"))
_(["invalid"])
_({"invalid"})
_("string"[3])
_("string"[:3])
_({"string": "foo"})
# pygettext does not allow keyword arguments, but both xgettext and pybabel do
_(x="kwargs work!")
# Unusual, but valid arguments
_("foo", "bar") _("foo", "bar")
_("something", x="something else") _("something", x="something else")
# .format() # .format()
_("Hello, {}!").format("world") # valid _("Hello, {}!").format("world") # valid
_("Hello, {}!".format("world")) # invalid _("Hello, {}!".format("world")) # invalid, but xgettext and pybabel extract the first string
# Nested structures # Nested structures
_("1"), _("2") _("1"), _("2")
@ -62,3 +83,28 @@ def _(x):
def _(x="don't extract me"): def _(x="don't extract me"):
pass pass
# Other gettext functions
gettext("foo")
ngettext("foo", "foos", 1)
pgettext("context", "foo")
npgettext("context", "foo", "foos", 1)
dgettext("domain", "foo")
dngettext("domain", "foo", "foos", 1)
dpgettext("domain", "context", "foo")
dnpgettext("domain", "context", "foo", "foos", 1)
# Complex arguments
ngettext("foo", "foos", 42 + (10 - 20))
dgettext(["some", {"complex"}, ("argument",)], "domain foo")
# Invalid calls which are not extracted
gettext()
ngettext('foo')
pgettext('context')
npgettext('context', 'foo')
dgettext('domain')
dngettext('domain', 'foo')
dpgettext('domain', 'context')
dnpgettext('domain', 'context', 'foo')

View File

@ -332,14 +332,14 @@ class Test_pygettext(unittest.TestCase):
msgids = self.extract_docstrings_from_str(dedent('''\ msgids = self.extract_docstrings_from_str(dedent('''\
f"{_('foo', 'bar')}" f"{_('foo', 'bar')}"
''')) '''))
self.assertNotIn('foo', msgids) self.assertIn('foo', msgids)
self.assertNotIn('bar', msgids) self.assertNotIn('bar', msgids)
def test_calls_in_fstring_with_keyword_args(self): def test_calls_in_fstring_with_keyword_args(self):
msgids = self.extract_docstrings_from_str(dedent('''\ msgids = self.extract_docstrings_from_str(dedent('''\
f"{_('foo', bar='baz')}" f"{_('foo', bar='baz')}"
''')) '''))
self.assertNotIn('foo', msgids) self.assertIn('foo', msgids)
self.assertNotIn('bar', msgids) self.assertNotIn('bar', msgids)
self.assertNotIn('baz', msgids) self.assertNotIn('baz', msgids)

View File

@ -8,6 +8,8 @@ argument %(argument_name)s: %(message)s
argument '%(argument_name)s' is deprecated argument '%(argument_name)s' is deprecated
can't open '%(filename)s': %(error)s can't open '%(filename)s': %(error)s
command '%(parser_name)s' is deprecated command '%(parser_name)s' is deprecated
conflicting option string: %s
expected %s argument
expected at least one argument expected at least one argument
expected at most one argument expected at most one argument
expected one argument expected one argument

View File

@ -1,3 +1,4 @@
%(option)s option requires %(number)d argument
%prog [options] %prog [options]
%s option does not take a value %s option does not take a value
Options Options

View File

@ -0,0 +1 @@
Add support for multi-argument :mod:`gettext` functions in :program:`pygettext.py`.

View File

@ -163,16 +163,13 @@ import glob
import time import time
import getopt import getopt
import ast import ast
import token
import tokenize import tokenize
from collections import defaultdict
from dataclasses import dataclass, field
from operator import itemgetter
__version__ = '1.5' __version__ = '1.5'
default_keywords = ['_']
DEFAULTKEYWORDS = ', '.join(default_keywords)
EMPTYSTRING = ''
# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's # The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there. # there.
@ -306,12 +303,64 @@ def getFilesForName(name):
return [] return []
# Default keyword specifications.
# Key is the function name, value is a dictionary mapping argument positions to the
# type of the argument. The type is one of 'msgid', 'msgid_plural', or 'msgctxt'.
# Argument positions are zero-based. A position that is absent from a spec has
# no message part assigned to it; for the d*gettext() entries this is position 0
# (presumably the translation domain argument).
DEFAULTKEYWORDS = {
    '_': {0: 'msgid'},
    'gettext': {0: 'msgid'},
    'ngettext': {0: 'msgid', 1: 'msgid_plural'},
    'pgettext': {0: 'msgctxt', 1: 'msgid'},
    'npgettext': {0: 'msgctxt', 1: 'msgid', 2: 'msgid_plural'},
    'dgettext': {1: 'msgid'},
    'dngettext': {1: 'msgid', 2: 'msgid_plural'},
    'dpgettext': {1: 'msgctxt', 2: 'msgid'},
    'dnpgettext': {1: 'msgctxt', 2: 'msgid', 3: 'msgid_plural'},
}
def matches_spec(message, spec):
    """Tell whether *message* provides every part that *spec* asks for.

    *spec* maps argument positions to part names ('msgid', 'msgid_plural',
    'msgctxt'); *message* is a container of the part names that were
    actually collected.  Returns True only if no required part is missing
    (an empty spec is trivially satisfied).
    """
    for required_part in spec.values():
        if required_part not in message:
            return False
    return True
@dataclass(frozen=True, order=True)
class Location:
    """A source position (file name and line number) where a message occurs.

    Instances are immutable and hashable, so they can be kept in a set.
    The ordering methods are generated by ``dataclass(order=True)`` and
    compare ``(filename, lineno)`` in field order.  Comparing with an
    object of another type returns ``NotImplemented`` (so the operator
    raises ``TypeError``) rather than failing with ``AttributeError``.
    """
    # Field order matters: it defines the sort key (file name first).
    filename: str
    lineno: int
@dataclass
class Message:
    """One extracted message together with every place it was found."""
    # The singular message text.
    msgid: str
    # The plural form, or None when no plural form has been recorded.
    msgid_plural: str | None
    # The message context, or None when the message has no context.
    msgctxt: str | None
    # Every (filename, lineno) position at which the message was seen.
    locations: set[Location] = field(default_factory=set)
    # Set once any occurrence of the message was recorded as a docstring.
    is_docstring: bool = False

    def add_location(self, filename, lineno, msgid_plural=None, *, is_docstring=False):
        """Record another occurrence of this message.

        The position is added to ``locations`` and the docstring flag is
        OR-ed with *is_docstring*.  *msgid_plural* is stored only while no
        plural form is known yet; an already recorded plural is kept.
        """
        self.locations.add(Location(filename, lineno))
        self.is_docstring = self.is_docstring | is_docstring
        if self.msgid_plural is None:
            self.msgid_plural = msgid_plural
def key_for(msgid, msgctxt=None):
    """Build the dictionary key under which a message is stored.

    Without a context the key is the bare *msgid*; with one it is the
    ``(msgctxt, msgid)`` tuple, so the same text in different contexts
    gets separate entries.  An empty-string context still counts as a
    context.
    """
    return msgid if msgctxt is None else (msgctxt, msgid)
class TokenEater: class TokenEater:
def __init__(self, options): def __init__(self, options):
self.__options = options self.__options = options
self.__messages = {} self.__messages = {}
self.__state = self.__waiting self.__state = self.__waiting
self.__data = [] self.__data = defaultdict(str)
self.__curr_arg = 0
self.__curr_keyword = None
self.__lineno = -1 self.__lineno = -1
self.__freshmodule = 1 self.__freshmodule = 1
self.__curfile = None self.__curfile = None
@ -331,7 +380,7 @@ class TokenEater:
# module docstring? # module docstring?
if self.__freshmodule: if self.__freshmodule:
if ttype == tokenize.STRING and is_literal_string(tstring): if ttype == tokenize.STRING and is_literal_string(tstring):
self.__addentry(safe_eval(tstring), lineno, isdocstring=1) self.__addentry({'msgid': safe_eval(tstring)}, lineno, is_docstring=True)
self.__freshmodule = 0 self.__freshmodule = 0
return return
if ttype in (tokenize.COMMENT, tokenize.NL, tokenize.ENCODING): if ttype in (tokenize.COMMENT, tokenize.NL, tokenize.ENCODING):
@ -346,6 +395,7 @@ class TokenEater:
return return
if ttype == tokenize.NAME and tstring in opts.keywords: if ttype == tokenize.NAME and tstring in opts.keywords:
self.__state = self.__keywordseen self.__state = self.__keywordseen
self.__curr_keyword = tstring
return return
if ttype == tokenize.STRING: if ttype == tokenize.STRING:
maybe_fstring = ast.parse(tstring, mode='eval').body maybe_fstring = ast.parse(tstring, mode='eval').body
@ -397,7 +447,8 @@ class TokenEater:
}, file=sys.stderr) }, file=sys.stderr)
continue continue
if isinstance(arg.value, str): if isinstance(arg.value, str):
self.__addentry(arg.value, lineno) self.__curr_keyword = func_name
self.__addentry({'msgid': arg.value}, lineno)
def __suiteseen(self, ttype, tstring, lineno): def __suiteseen(self, ttype, tstring, lineno):
# skip over any enclosure pairs until we see the colon # skip over any enclosure pairs until we see the colon
@ -413,7 +464,7 @@ class TokenEater:
def __suitedocstring(self, ttype, tstring, lineno): def __suitedocstring(self, ttype, tstring, lineno):
# ignore any intervening noise # ignore any intervening noise
if ttype == tokenize.STRING and is_literal_string(tstring): if ttype == tokenize.STRING and is_literal_string(tstring):
self.__addentry(safe_eval(tstring), lineno, isdocstring=1) self.__addentry({'msgid': safe_eval(tstring)}, lineno, is_docstring=True)
self.__state = self.__waiting self.__state = self.__waiting
elif ttype not in (tokenize.NEWLINE, tokenize.INDENT, elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
tokenize.COMMENT): tokenize.COMMENT):
@ -422,44 +473,90 @@ class TokenEater:
def __keywordseen(self, ttype, tstring, lineno): def __keywordseen(self, ttype, tstring, lineno):
if ttype == tokenize.OP and tstring == '(': if ttype == tokenize.OP and tstring == '(':
self.__data = [] self.__data.clear()
self.__curr_arg = 0
self.__enclosurecount = 0
self.__lineno = lineno self.__lineno = lineno
self.__state = self.__openseen self.__state = self.__openseen
else: else:
self.__state = self.__waiting self.__state = self.__waiting
def __openseen(self, ttype, tstring, lineno): def __openseen(self, ttype, tstring, lineno):
if ttype == tokenize.OP and tstring == ')': spec = self.__options.keywords[self.__curr_keyword]
# We've seen the last of the translatable strings. Record the arg_type = spec.get(self.__curr_arg)
# line number of the first line of the strings and update the list expect_string_literal = arg_type is not None
# of messages seen. Reset state for the next batch. If there
# were no strings inside _(), then just ignore this entry. if ttype == tokenize.OP and self.__enclosurecount == 0:
if self.__data: if tstring == ')':
self.__addentry(EMPTYSTRING.join(self.__data)) # We've seen the last of the translatable strings. Record the
self.__state = self.__waiting # line number of the first line of the strings and update the list
elif ttype == tokenize.STRING and is_literal_string(tstring): # of messages seen. Reset state for the next batch. If there
self.__data.append(safe_eval(tstring)) # were no strings inside _(), then just ignore this entry.
elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT, if self.__data:
token.NEWLINE, tokenize.NL]: self.__addentry(self.__data)
# warn if we see anything else than STRING or whitespace self.__state = self.__waiting
print(_( return
'*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"' elif tstring == ',':
) % { # Advance to the next argument
'token': tstring, self.__curr_arg += 1
'file': self.__curfile, return
'lineno': self.__lineno
}, file=sys.stderr) if expect_string_literal:
self.__state = self.__waiting if ttype == tokenize.STRING and is_literal_string(tstring):
self.__data[arg_type] += safe_eval(tstring)
elif ttype not in (tokenize.COMMENT, tokenize.INDENT, tokenize.DEDENT,
tokenize.NEWLINE, tokenize.NL):
# We are inside an argument which is a translatable string and
# we encountered a token that is not a string. This is an error.
self.warn_unexpected_token(tstring)
self.__enclosurecount = 0
self.__state = self.__waiting
elif ttype == tokenize.OP:
if tstring in '([{':
self.__enclosurecount += 1
elif tstring in ')]}':
self.__enclosurecount -= 1
def __ignorenext(self, ttype, tstring, lineno): def __ignorenext(self, ttype, tstring, lineno):
self.__state = self.__waiting self.__state = self.__waiting
def __addentry(self, msg, lineno=None, isdocstring=0): def __addentry(self, msg, lineno=None, *, is_docstring=False):
msgid = msg.get('msgid')
if msgid in self.__options.toexclude:
return
if not is_docstring:
spec = self.__options.keywords[self.__curr_keyword]
if not matches_spec(msg, spec):
return
if lineno is None: if lineno is None:
lineno = self.__lineno lineno = self.__lineno
if not msg in self.__options.toexclude: msgctxt = msg.get('msgctxt')
entry = (self.__curfile, lineno) msgid_plural = msg.get('msgid_plural')
self.__messages.setdefault(msg, {})[entry] = isdocstring key = key_for(msgid, msgctxt)
if key in self.__messages:
self.__messages[key].add_location(
self.__curfile,
lineno,
msgid_plural,
is_docstring=is_docstring,
)
else:
self.__messages[key] = Message(
msgid=msgid,
msgid_plural=msgid_plural,
msgctxt=msgctxt,
locations={Location(self.__curfile, lineno)},
is_docstring=is_docstring,
)
def warn_unexpected_token(self, token):
    """Print a warning to stderr about an unexpected *token*.

    The warning names the file currently being processed and the line
    number stored on this object.
    """
    template = _('*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"')
    details = {
        'token': token,
        'file': self.__curfile,
        'lineno': self.__lineno
    }
    print(template % details, file=sys.stderr)
def set_filename(self, filename): def set_filename(self, filename):
self.__curfile = filename self.__curfile = filename
@ -472,55 +569,54 @@ class TokenEater:
print(pot_header % {'time': timestamp, 'version': __version__, print(pot_header % {'time': timestamp, 'version': __version__,
'charset': encoding, 'charset': encoding,
'encoding': '8bit'}, file=fp) 'encoding': '8bit'}, file=fp)
# Sort the entries. First sort each particular entry's keys, then
# sort all the entries by their first item. # Sort locations within each message by filename and lineno
reverse = {} sorted_keys = [
for k, v in self.__messages.items(): (key, sorted(msg.locations))
keys = sorted(v.keys()) for key, msg in self.__messages.items()
reverse.setdefault(tuple(keys), []).append((k, v)) ]
rkeys = sorted(reverse.keys()) # Sort messages by locations
for rkey in rkeys: # For example, a message with locations [('test.py', 1), ('test.py', 2)] will
rentries = reverse[rkey] # appear before a message with locations [('test.py', 1), ('test.py', 3)]
rentries.sort() sorted_keys.sort(key=itemgetter(1))
for k, v in rentries:
# If the entry was gleaned out of a docstring, then add a for key, locations in sorted_keys:
# comment stating so. This is to aid translators who may wish msg = self.__messages[key]
# to skip translating some unimportant docstrings. if options.writelocations:
isdocstring = any(v.values())
# k is the message string, v is a dictionary-set of (filename,
# lineno) tuples. We want to sort the entries in v first by
# file name and then by line number.
v = sorted(v.keys())
if not options.writelocations:
pass
# location comments are different b/w Solaris and GNU: # location comments are different b/w Solaris and GNU:
elif options.locationstyle == options.SOLARIS: if options.locationstyle == options.SOLARIS:
for filename, lineno in v: for location in locations:
d = {'filename': filename, 'lineno': lineno} print(f'# File: {location.filename}, line: {location.lineno}', file=fp)
print(_(
'# File: %(filename)s, line: %(lineno)d') % d, file=fp)
elif options.locationstyle == options.GNU: elif options.locationstyle == options.GNU:
# fit as many locations on one line, as long as the # fit as many locations on one line, as long as the
# resulting line length doesn't exceed 'options.width' # resulting line length doesn't exceed 'options.width'
locline = '#:' locline = '#:'
for filename, lineno in v: for location in locations:
d = {'filename': filename, 'lineno': lineno} s = f' {location.filename}:{location.lineno}'
s = _(' %(filename)s:%(lineno)d') % d
if len(locline) + len(s) <= options.width: if len(locline) + len(s) <= options.width:
locline = locline + s locline = locline + s
else: else:
print(locline, file=fp) print(locline, file=fp)
locline = "#:" + s locline = f'#:{s}'
if len(locline) > 2: if len(locline) > 2:
print(locline, file=fp) print(locline, file=fp)
if isdocstring: if msg.is_docstring:
print('#, docstring', file=fp) # If the entry was gleaned out of a docstring, then add a
print('msgid', normalize(k, encoding), file=fp) # comment stating so. This is to aid translators who may wish
# to skip translating some unimportant docstrings.
print('#, docstring', file=fp)
if msg.msgctxt is not None:
print('msgctxt', normalize(msg.msgctxt, encoding), file=fp)
print('msgid', normalize(msg.msgid, encoding), file=fp)
if msg.msgid_plural is not None:
print('msgid_plural', normalize(msg.msgid_plural, encoding), file=fp)
print('msgstr[0] ""', file=fp)
print('msgstr[1] ""\n', file=fp)
else:
print('msgstr ""\n', file=fp) print('msgstr ""\n', file=fp)
def main(): def main():
global default_keywords
try: try:
opts, args = getopt.getopt( opts, args = getopt.getopt(
sys.argv[1:], sys.argv[1:],
@ -557,7 +653,7 @@ def main():
locations = {'gnu' : options.GNU, locations = {'gnu' : options.GNU,
'solaris' : options.SOLARIS, 'solaris' : options.SOLARIS,
} }
no_default_keywords = False
# parse options # parse options
for opt, arg in opts: for opt, arg in opts:
if opt in ('-h', '--help'): if opt in ('-h', '--help'):
@ -573,7 +669,7 @@ def main():
elif opt in ('-k', '--keyword'): elif opt in ('-k', '--keyword'):
options.keywords.append(arg) options.keywords.append(arg)
elif opt in ('-K', '--no-default-keywords'): elif opt in ('-K', '--no-default-keywords'):
default_keywords = [] no_default_keywords = True
elif opt in ('-n', '--add-location'): elif opt in ('-n', '--add-location'):
options.writelocations = 1 options.writelocations = 1
elif opt in ('--no-location',): elif opt in ('--no-location',):
@ -613,7 +709,9 @@ def main():
make_escapes(not options.escape) make_escapes(not options.escape)
# calculate all keywords # calculate all keywords
options.keywords.extend(default_keywords) options.keywords = {kw: {0: 'msgid'} for kw in options.keywords}
if not no_default_keywords:
options.keywords |= DEFAULTKEYWORDS
# initialize list of strings to exclude # initialize list of strings to exclude
if options.excludefilename: if options.excludefilename: