bpo-25324: copy tok_name before changing it (#1608)
* add test to check if we're modifying token
* copy list so importing tokenize doesn't have side effects on token
* shorten line
* add tokenize tokens to token.h to get them to show up in token
* move ERRORTOKEN back to its previous location, and fix nitpick
* copy comments from token.h automatically
* fix whitespace and make more pythonic
* change to fix comments from @haypo
* update token.rst and Misc/NEWS
* change wording
* some more wording changes
This commit is contained in:
parent 85aba238e4
commit fc354f0785
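For context, a minimal sketch (a hypothetical check, not part of the commit) of the side effect being removed: before this change, importing tokenize patched COMMENT, NL and ENCODING into the shared token.tok_name dict; after it, the token module is left untouched.

    import token

    snapshot = dict(token.tok_name)   # copy the shared {value: name} table

    import tokenize                   # before this commit, this mutated tok_name
                                      # by adding COMMENT, NL and ENCODING entries

    assert token.tok_name == snapshot   # holds with this commit applied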
Doc/library/token.rst
@@ -101,6 +101,9 @@ The token constants are:
           AWAIT
           ASYNC
           ERRORTOKEN
+          COMMENT
+          NL
+          ENCODING
           N_TOKENS
           NT_OFFSET

@@ -108,3 +111,8 @@ The token constants are:
    Added :data:`AWAIT` and :data:`ASYNC` tokens. Starting with
    Python 3.7, "async" and "await" will be tokenized as :data:`NAME`
    tokens, and :data:`AWAIT` and :data:`ASYNC` will be removed.
+
+.. versionchanged:: 3.7
+   Added :data:`COMMENT`, :data:`NL` and :data:`ENCODING` to bring
+   the tokens in the C code in line with the tokens needed in
+   :mod:`tokenize` module. These tokens aren't used by the C tokenizer.
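A quick illustration (a sketch, not part of the commit) of what the newly documented constants correspond to: they are the extra token types that tokenize emits but the C tokenizer never produces.

    import io, token, tokenize

    toks = list(tokenize.generate_tokens(io.StringIO("# hi\n").readline))
    print([token.tok_name[t.type] for t in toks])
    # expected along the lines of: ['COMMENT', 'NL', 'ENDMARKER']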
Include/token.h
@@ -67,7 +67,11 @@ extern "C" {
 #define AWAIT 54
 #define ASYNC 55
 #define ERRORTOKEN 56
-#define N_TOKENS 57
+/* These aren't used by the C tokenizer but are needed for tokenize.py */
+#define COMMENT 57
+#define NL 58
+#define ENCODING 59
+#define N_TOKENS 60

 /* Special definitions for cooperation with parser */

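The renumbering (N_TOKENS moves from 57 to 60) is mirrored in Lib/token.py below; a quick consistency check, sketched in Python with the values taken from this diff:

    import token

    assert (token.COMMENT, token.NL, token.ENCODING) == (57, 58, 59)
    assert token.N_TOKENS == 60 and token.NT_OFFSET == 256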
Lib/test/test_tokenize.py
@@ -1343,13 +1343,13 @@ class TestTokenize(TestCase):
         tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
         num_optypes = len(optypes)
         self.assertEqual(len(tokens), 2 + num_optypes)
-        self.assertEqual(token.tok_name[tokens[0].exact_type],
-                         token.tok_name[ENCODING])
+        self.assertEqual(tok_name[tokens[0].exact_type],
+                         tok_name[ENCODING])
         for i in range(num_optypes):
-            self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
-                             token.tok_name[optypes[i]])
-        self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
-                         token.tok_name[token.ENDMARKER])
+            self.assertEqual(tok_name[tokens[i + 1].exact_type],
+                             tok_name[optypes[i]])
+        self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type],
+                         tok_name[token.ENDMARKER])

     def test_exact_type(self):
         self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
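The hunk above only switches these assertions to the shared tok_name; the commit message also mentions a new test that importing tokenize leaves the token module alone, which is not shown on this page. A hypothetical standalone version of such a test might look like:

    import subprocess
    import sys
    import unittest

    class TokenizeSideEffectTest(unittest.TestCase):
        # Hypothetical name; the real test lives in Lib/test/test_tokenize.py.
        def test_import_tokenize_leaves_token_untouched(self):
            code = ("import token; before = dict(token.tok_name); "
                    "import tokenize; "
                    "assert token.tok_name == before")
            subprocess.run([sys.executable, "-c", code], check=True)

    if __name__ == "__main__":
        unittest.main()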
Lib/token.py (29 lines changed)
@@ -63,11 +63,17 @@ AT = 49
 ATEQUAL = 50
 RARROW = 51
 ELLIPSIS = 52
+# Don't forget to update the table _PyParser_TokenNames in tokenizer.c!
 OP = 53
 AWAIT = 54
 ASYNC = 55
 ERRORTOKEN = 56
-N_TOKENS = 57
+# These aren't used by the C tokenizer but are needed for tokenize.py
+COMMENT = 57
+NL = 58
+ENCODING = 59
+N_TOKENS = 60
+# Special definitions for cooperation with parser
 NT_OFFSET = 256
 #--end constants--

@@ -102,15 +108,26 @@ def _main():
     with fp:
         lines = fp.read().split("\n")
     prog = re.compile(
-        "#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
+        r"#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
         re.IGNORECASE)
+    comment_regex = re.compile(
+        r"^\s*/\*\s*(.+?)\s*\*/\s*$",
+        re.IGNORECASE)
+
     tokens = {}
+    prev_val = None
     for line in lines:
         match = prog.match(line)
         if match:
             name, val = match.group(1, 2)
             val = int(val)
-            tokens[val] = name          # reverse so we can sort them...
+            tokens[val] = {'token': name}          # reverse so we can sort them...
+            prev_val = val
+        else:
+            comment_match = comment_regex.match(line)
+            if comment_match and prev_val is not None:
+                comment = comment_match.group(1)
+                tokens[prev_val]['comment'] = comment
     keys = sorted(tokens.keys())
     # load the output skeleton from the target:
     try:
@@ -127,8 +144,10 @@ def _main():
         sys.stderr.write("target does not contain format markers")
         sys.exit(3)
     lines = []
-    for val in keys:
-        lines.append("%s = %d" % (tokens[val], val))
+    for key in keys:
+        lines.append("%s = %d" % (tokens[key]["token"], key))
+        if "comment" in tokens[key]:
+            lines.append("# %s" % tokens[key]["comment"])
     format[start:end] = lines
     try:
         fp = open(outFileName, 'w')
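The new comment handling in _main() attaches a /* ... */ line to the #define that precedes it and re-emits it right after that constant when regenerating token.py, which is why the header comment lands between ERRORTOKEN and COMMENT above. A self-contained sketch of that mechanism (simplified regexes, not the real _main()):

    import re

    header = """\
    #define ERRORTOKEN      56
    /* These aren't used by the C tokenizer but are needed for tokenize.py */
    #define COMMENT         57
    """

    define_re = re.compile(r"\s*#define[ \t]+([A-Z0-9][A-Z0-9_]*)[ \t]+([0-9]+)")
    comment_re = re.compile(r"^\s*/\*\s*(.+?)\s*\*/\s*$")

    tokens, prev_val = {}, None
    for line in header.splitlines():
        m = define_re.match(line)
        if m:
            name, val = m.group(1), int(m.group(2))
            tokens[val] = {'token': name}     # keyed by value so they sort numerically
            prev_val = val
        else:
            c = comment_re.match(line)
            if c and prev_val is not None:    # comment annotates the previous #define
                tokens[prev_val]['comment'] = c.group(1)

    for val in sorted(tokens):
        print("%s = %d" % (tokens[val]['token'], val))
        if 'comment' in tokens[val]:
            print("# %s" % tokens[val]['comment'])
    # prints:
    # ERRORTOKEN = 56
    # # These aren't used by the C tokenizer but are needed for tokenize.py
    # COMMENT = 57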
Lib/tokenize.py
@@ -38,17 +38,10 @@ cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
 blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

 import token
-__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
-                           "NL", "untokenize", "ENCODING", "TokenInfo"]
+__all__ = token.__all__ + ["tokenize", "detect_encoding",
+                           "untokenize", "TokenInfo"]
 del token

-COMMENT = N_TOKENS
-tok_name[COMMENT] = 'COMMENT'
-NL = N_TOKENS + 1
-tok_name[NL] = 'NL'
-ENCODING = N_TOKENS + 2
-tok_name[ENCODING] = 'ENCODING'
-N_TOKENS += 3
 EXACT_TOKEN_TYPES = {
     '(': LPAR,
     ')': RPAR,
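With the constants now defined in token.py and pulled in through tokenize's existing "from token import *", nothing needs to be patched at import time; the two modules simply agree. A quick check (sketch):

    import token
    import tokenize

    assert tokenize.COMMENT == token.COMMENT == 57
    assert token.tok_name[tokenize.NL] == 'NL'
    assert token.tok_name[tokenize.ENCODING] == 'ENCODING'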
Misc/NEWS
@@ -10,6 +10,10 @@ What's New in Python 3.7.0 alpha 1?
 Core and Builtins
 -----------------

+- bpo-25324: Tokens needed for parsing in Python moved to C. ``COMMENT``,
+  ``NL`` and ``ENCODING``. This way the tokens and tok_names in the token
+  module don't get changed when you import the tokenize module.
+
 - bpo-29104: Fixed parsing backslashes in f-strings.

 - bpo-27945: Fixed various segfaults with dict when input collections are
Parser/tokenizer.c
@@ -106,6 +106,9 @@ const char *_PyParser_TokenNames[] = {
     "AWAIT",
     "ASYNC",
     "<ERRORTOKEN>",
+    "COMMENT",
+    "NL",
+    "ENCODING"
     "<N_TOKENS>"
 };

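The C name table has to stay aligned with the numeric values in token.h; on the Python side the analogous invariant can be spot-checked through tok_name (a sketch, not part of the commit):

    import token

    # every token value below N_TOKENS should have a name after this change
    assert all(v in token.tok_name for v in range(token.N_TOKENS))
    assert token.tok_name[token.ENCODING] == 'ENCODING'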