bpo-25324: copy tok_name before changing it (#1608)

* add test to check if we're modifying token

* copy list so import tokenize doesn't have side effects on token

* shorten line

* add tokenize tokens to token.h to get them to show up in token

* move ERRORTOKEN back to its previous location, and fix nitpick

* copy comments from token.h automatically

* fix whitespace and make more pythonic

* change to fix comments from @haypo

* update token.rst and Misc/NEWS

* change wording

* some more wording changes
Albert-Jan Nijburg 2017-05-31 15:00:21 +01:00 committed by Victor Stinner
parent 85aba238e4
commit fc354f0785
7 changed files with 52 additions and 21 deletions
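
Before this commit, importing tokenize patched extra COMMENT/NL/ENCODING entries into the tok_name dictionary it shares with the token module. The snippet below is a minimal sketch, not part of the commit, of the behaviour the new test asserts on Python 3.7 and later: importing tokenize leaves the token module untouched.

    import token

    # Snapshot the name table before tokenize has been imported.
    names_before = dict(token.tok_name)

    import tokenize  # used to insert COMMENT/NL/ENCODING into the shared dict

    # With this change the token module is no longer modified as a side effect.
    assert token.tok_name == names_before
    print("importing tokenize leaves token.tok_name unchanged")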

Doc/library/token.rst

@@ -101,6 +101,9 @@ The token constants are:
    AWAIT
    ASYNC
    ERRORTOKEN
+   COMMENT
+   NL
+   ENCODING
    N_TOKENS
    NT_OFFSET
@@ -108,3 +111,8 @@ The token constants are:
    Added :data:`AWAIT` and :data:`ASYNC` tokens. Starting with
    Python 3.7, "async" and "await" will be tokenized as :data:`NAME`
    tokens, and :data:`AWAIT` and :data:`ASYNC` will be removed.
+
+.. versionchanged:: 3.7
+   Added :data:`COMMENT`, :data:`NL` and :data:`ENCODING` to bring
+   the tokens in the C code in line with the tokens needed in
+   :mod:`tokenize` module. These tokens aren't used by the C tokenizer.
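
The versionchanged entry above means that, from Python 3.7 on, COMMENT, NL and ENCODING are ordinary constants of the token module, even though the C tokenizer never emits them. A quick check, not part of the commit, on a 3.7+ interpreter:

    import token

    # The three tokenize-only token types now live in token, with tok_name entries.
    for name in ("COMMENT", "NL", "ENCODING"):
        value = getattr(token, name)
        assert token.tok_name[value] == name
        print(name, "=", value)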

Include/token.h

@@ -67,7 +67,11 @@ extern "C" {
 #define AWAIT 54
 #define ASYNC 55
 #define ERRORTOKEN 56
-#define N_TOKENS 57
+/* These aren't used by the C tokenizer but are needed for tokenize.py */
+#define COMMENT 57
+#define NL 58
+#define ENCODING 59
+#define N_TOKENS 60
 
 /* Special definitions for cooperation with parser */

Lib/test/test_tokenize.py

@@ -1343,13 +1343,13 @@ class TestTokenize(TestCase):
         tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
         num_optypes = len(optypes)
         self.assertEqual(len(tokens), 2 + num_optypes)
-        self.assertEqual(token.tok_name[tokens[0].exact_type],
-                         token.tok_name[ENCODING])
+        self.assertEqual(tok_name[tokens[0].exact_type],
+                         tok_name[ENCODING])
         for i in range(num_optypes):
-            self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
-                             token.tok_name[optypes[i]])
-        self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
-                         token.tok_name[token.ENDMARKER])
+            self.assertEqual(tok_name[tokens[i + 1].exact_type],
+                             tok_name[optypes[i]])
+        self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type],
+                         tok_name[token.ENDMARKER])
 
     def test_exact_type(self):
         self.assertExactTypeEqual('()', token.LPAR, token.RPAR)

Lib/token.py

@@ -63,11 +63,17 @@ AT = 49
 ATEQUAL = 50
 RARROW = 51
 ELLIPSIS = 52
+# Don't forget to update the table _PyParser_TokenNames in tokenizer.c!
 OP = 53
 AWAIT = 54
 ASYNC = 55
 ERRORTOKEN = 56
-N_TOKENS = 57
+# These aren't used by the C tokenizer but are needed for tokenize.py
+COMMENT = 57
+NL = 58
+ENCODING = 59
+N_TOKENS = 60
+# Special definitions for cooperation with parser
 NT_OFFSET = 256
 #--end constants--
@@ -102,15 +108,26 @@ def _main():
     with fp:
         lines = fp.read().split("\n")
     prog = re.compile(
-        "#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
+        r"#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
         re.IGNORECASE)
+    comment_regex = re.compile(
+        r"^\s*/\*\s*(.+?)\s*\*/\s*$",
+        re.IGNORECASE)
+
     tokens = {}
+    prev_val = None
     for line in lines:
         match = prog.match(line)
         if match:
             name, val = match.group(1, 2)
             val = int(val)
-            tokens[val] = name  # reverse so we can sort them...
+            tokens[val] = {'token': name}  # reverse so we can sort them...
+            prev_val = val
+        else:
+            comment_match = comment_regex.match(line)
+            if comment_match and prev_val is not None:
+                comment = comment_match.group(1)
+                tokens[prev_val]['comment'] = comment
     keys = sorted(tokens.keys())
     # load the output skeleton from the target:
     try:
@@ -127,8 +144,10 @@ def _main():
         sys.stderr.write("target does not contain format markers")
         sys.exit(3)
     lines = []
-    for val in keys:
-        lines.append("%s = %d" % (tokens[val], val))
+    for key in keys:
+        lines.append("%s = %d" % (tokens[key]["token"], key))
+        if "comment" in tokens[key]:
+            lines.append("# %s" % tokens[key]["comment"])
     format[start:end] = lines
     try:
         fp = open(outFileName, 'w')
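
The _main() changes above are what copy the comments automatically: a bare /* ... */ line in Include/token.h is attached to the #define seen just before it and re-emitted as a # comment after that constant in the regenerated Lib/token.py. The standalone sketch below reuses the two regexes from the diff; the sample header lines and variable names are only illustrative.

    import re

    # The #define and standalone-comment patterns from the updated _main().
    define_re = re.compile(
        r"#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
        re.IGNORECASE)
    comment_re = re.compile(r"^\s*/\*\s*(.+?)\s*\*/\s*$", re.IGNORECASE)

    # Hypothetical excerpt of Include/token.h, for illustration only.
    sample_lines = [
        "#define ERRORTOKEN 56",
        "/* These aren't used by the C tokenizer but are needed for tokenize.py */",
        "#define COMMENT 57",
        "#define NL 58",
    ]

    tokens = {}
    prev_val = None
    for line in sample_lines:
        match = define_re.match(line)
        if match:
            name, val = match.group(1, 2)
            prev_val = int(val)
            tokens[prev_val] = {'token': name}
        else:
            comment_match = comment_re.match(line)
            if comment_match and prev_val is not None:
                # A bare C comment is attached to the #define directly above it.
                tokens[prev_val]['comment'] = comment_match.group(1)

    # Emit the constants the way the regenerated Lib/token.py lays them out.
    for val in sorted(tokens):
        print("%s = %d" % (tokens[val]['token'], val))
        if 'comment' in tokens[val]:
            print("# %s" % tokens[val]['comment'])

Running it prints ERRORTOKEN = 56, then the copied comment, then COMMENT = 57 and NL = 58, which matches the layout visible in the Lib/token.py hunk above.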

Lib/tokenize.py

@@ -38,17 +38,10 @@ cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
 blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 
 import token
-__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
-                           "NL", "untokenize", "ENCODING", "TokenInfo"]
+__all__ = token.__all__ + ["tokenize", "detect_encoding",
+                           "untokenize", "TokenInfo"]
 del token
 
-COMMENT = N_TOKENS
-tok_name[COMMENT] = 'COMMENT'
-NL = N_TOKENS + 1
-tok_name[NL] = 'NL'
-ENCODING = N_TOKENS + 2
-tok_name[ENCODING] = 'ENCODING'
-N_TOKENS += 3
-
 EXACT_TOKEN_TYPES = {
     '(': LPAR,
     ')': RPAR,
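
With the local definitions removed, tokenize now picks COMMENT, NL and ENCODING up from the token module through its existing star import, so the two modules can no longer drift apart. A small check, not part of the commit, on Python 3.7 or later:

    import token
    import tokenize

    # The constants are defined once, in token, and merely re-exported here.
    assert tokenize.COMMENT == token.COMMENT
    assert tokenize.NL == token.NL
    assert tokenize.ENCODING == token.ENCODING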

Misc/NEWS

@@ -10,6 +10,10 @@ What's New in Python 3.7.0 alpha 1?
 Core and Builtins
 -----------------
 
+- bpo-25324: Tokens needed for parsing in Python moved to C. ``COMMENT``,
+  ``NL`` and ``ENCODING``. This way the tokens and tok_names in the token
+  module don't get changed when you import the tokenize module.
+
 - bpo-29104: Fixed parsing backslashes in f-strings.
 
 - bpo-27945: Fixed various segfaults with dict when input collections are

Parser/tokenizer.c

@@ -106,6 +106,9 @@ const char *_PyParser_TokenNames[] = {
     "AWAIT",
     "ASYNC",
     "<ERRORTOKEN>",
+    "COMMENT",
+    "NL",
+    "ENCODING"
     "<N_TOKENS>"
 };