diff --git a/.gitattributes b/.gitattributes
index 16237bb2b3a..c9a54fbd472 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -55,3 +55,7 @@ Include/opcode.h linguist-generated=true
Python/opcode_targets.h linguist-generated=true
Objects/typeslots.inc linguist-generated=true
Modules/unicodedata_db.h linguist-generated=true
+Doc/library/token-list.inc linguist-generated=true
+Include/token.h linguist-generated=true
+Lib/token.py linguist-generated=true
+Parser/token.c linguist-generated=true
diff --git a/Doc/library/token-list.inc b/Doc/library/token-list.inc
new file mode 100644
index 00000000000..cd6e0f26968
--- /dev/null
+++ b/Doc/library/token-list.inc
@@ -0,0 +1,206 @@
+.. Auto-generated by Tools/scripts/generate_token.py
+.. data:: ENDMARKER
+
+.. data:: NAME
+
+.. data:: NUMBER
+
+.. data:: STRING
+
+.. data:: NEWLINE
+
+.. data:: INDENT
+
+.. data:: DEDENT
+
+.. data:: LPAR
+
+ Token value for ``"("``.
+
+.. data:: RPAR
+
+ Token value for ``")"``.
+
+.. data:: LSQB
+
+ Token value for ``"["``.
+
+.. data:: RSQB
+
+ Token value for ``"]"``.
+
+.. data:: COLON
+
+ Token value for ``":"``.
+
+.. data:: COMMA
+
+ Token value for ``","``.
+
+.. data:: SEMI
+
+ Token value for ``";"``.
+
+.. data:: PLUS
+
+ Token value for ``"+"``.
+
+.. data:: MINUS
+
+ Token value for ``"-"``.
+
+.. data:: STAR
+
+ Token value for ``"*"``.
+
+.. data:: SLASH
+
+ Token value for ``"/"``.
+
+.. data:: VBAR
+
+ Token value for ``"|"``.
+
+.. data:: AMPER
+
+ Token value for ``"&"``.
+
+.. data:: LESS
+
+ Token value for ``"<"``.
+
+.. data:: GREATER
+
+ Token value for ``">"``.
+
+.. data:: EQUAL
+
+ Token value for ``"="``.
+
+.. data:: DOT
+
+ Token value for ``"."``.
+
+.. data:: PERCENT
+
+ Token value for ``"%"``.
+
+.. data:: LBRACE
+
+ Token value for ``"{"``.
+
+.. data:: RBRACE
+
+ Token value for ``"}"``.
+
+.. data:: EQEQUAL
+
+ Token value for ``"=="``.
+
+.. data:: NOTEQUAL
+
+ Token value for ``"!="``.
+
+.. data:: LESSEQUAL
+
+ Token value for ``"<="``.
+
+.. data:: GREATEREQUAL
+
+ Token value for ``">="``.
+
+.. data:: TILDE
+
+ Token value for ``"~"``.
+
+.. data:: CIRCUMFLEX
+
+ Token value for ``"^"``.
+
+.. data:: LEFTSHIFT
+
+ Token value for ``"<<"``.
+
+.. data:: RIGHTSHIFT
+
+ Token value for ``">>"``.
+
+.. data:: DOUBLESTAR
+
+ Token value for ``"**"``.
+
+.. data:: PLUSEQUAL
+
+ Token value for ``"+="``.
+
+.. data:: MINEQUAL
+
+ Token value for ``"-="``.
+
+.. data:: STAREQUAL
+
+ Token value for ``"*="``.
+
+.. data:: SLASHEQUAL
+
+ Token value for ``"/="``.
+
+.. data:: PERCENTEQUAL
+
+ Token value for ``"%="``.
+
+.. data:: AMPEREQUAL
+
+ Token value for ``"&="``.
+
+.. data:: VBAREQUAL
+
+ Token value for ``"|="``.
+
+.. data:: CIRCUMFLEXEQUAL
+
+ Token value for ``"^="``.
+
+.. data:: LEFTSHIFTEQUAL
+
+ Token value for ``"<<="``.
+
+.. data:: RIGHTSHIFTEQUAL
+
+ Token value for ``">>="``.
+
+.. data:: DOUBLESTAREQUAL
+
+ Token value for ``"**="``.
+
+.. data:: DOUBLESLASH
+
+ Token value for ``"//"``.
+
+.. data:: DOUBLESLASHEQUAL
+
+ Token value for ``"//="``.
+
+.. data:: AT
+
+ Token value for ``"@"``.
+
+.. data:: ATEQUAL
+
+ Token value for ``"@="``.
+
+.. data:: RARROW
+
+ Token value for ``"->"``.
+
+.. data:: ELLIPSIS
+
+ Token value for ``"..."``.
+
+.. data:: OP
+
+.. data:: ERRORTOKEN
+
+.. data:: N_TOKENS
+
+.. data:: NT_OFFSET
diff --git a/Doc/library/token.rst b/Doc/library/token.rst
index 373991027e4..5358eb5a291 100644
--- a/Doc/library/token.rst
+++ b/Doc/library/token.rst
@@ -44,64 +44,7 @@ functions. The functions mirror definitions in the Python C header files.
The token constants are:
-.. data:: ENDMARKER
- NAME
- NUMBER
- STRING
- NEWLINE
- INDENT
- DEDENT
- LPAR
- RPAR
- LSQB
- RSQB
- COLON
- COMMA
- SEMI
- PLUS
- MINUS
- STAR
- SLASH
- VBAR
- AMPER
- LESS
- GREATER
- EQUAL
- DOT
- PERCENT
- LBRACE
- RBRACE
- EQEQUAL
- NOTEQUAL
- LESSEQUAL
- GREATEREQUAL
- TILDE
- CIRCUMFLEX
- LEFTSHIFT
- RIGHTSHIFT
- DOUBLESTAR
- PLUSEQUAL
- MINEQUAL
- STAREQUAL
- SLASHEQUAL
- PERCENTEQUAL
- AMPEREQUAL
- VBAREQUAL
- CIRCUMFLEXEQUAL
- LEFTSHIFTEQUAL
- RIGHTSHIFTEQUAL
- DOUBLESTAREQUAL
- DOUBLESLASH
- DOUBLESLASHEQUAL
- AT
- ATEQUAL
- RARROW
- ELLIPSIS
- OP
- ERRORTOKEN
- N_TOKENS
- NT_OFFSET
-
+.. include:: token-list.inc
The following token type values aren't used by the C tokenizer but are needed for
the :mod:`tokenize` module.
diff --git a/Grammar/Tokens b/Grammar/Tokens
new file mode 100644
index 00000000000..9595673a5af
--- /dev/null
+++ b/Grammar/Tokens
@@ -0,0 +1,62 @@
+ENDMARKER
+NAME
+NUMBER
+STRING
+NEWLINE
+INDENT
+DEDENT
+
+LPAR '('
+RPAR ')'
+LSQB '['
+RSQB ']'
+COLON ':'
+COMMA ','
+SEMI ';'
+PLUS '+'
+MINUS '-'
+STAR '*'
+SLASH '/'
+VBAR '|'
+AMPER '&'
+LESS '<'
+GREATER '>'
+EQUAL '='
+DOT '.'
+PERCENT '%'
+LBRACE '{'
+RBRACE '}'
+EQEQUAL '=='
+NOTEQUAL '!='
+LESSEQUAL '<='
+GREATEREQUAL '>='
+TILDE '~'
+CIRCUMFLEX '^'
+LEFTSHIFT '<<'
+RIGHTSHIFT '>>'
+DOUBLESTAR '**'
+PLUSEQUAL '+='
+MINEQUAL '-='
+STAREQUAL '*='
+SLASHEQUAL '/='
+PERCENTEQUAL '%='
+AMPEREQUAL '&='
+VBAREQUAL '|='
+CIRCUMFLEXEQUAL '^='
+LEFTSHIFTEQUAL '<<='
+RIGHTSHIFTEQUAL '>>='
+DOUBLESTAREQUAL '**='
+DOUBLESLASH '//'
+DOUBLESLASHEQUAL '//='
+AT '@'
+ATEQUAL '@='
+RARROW '->'
+ELLIPSIS '...'
+
+OP
+ERRORTOKEN
+
+# These aren't used by the C tokenizer but are needed for tokenize.py
+COMMENT
+NL
+ENCODING
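Each non-blank, non-comment line above pairs a token name with an optional quoted operator string; a token's numeric value is simply its position in the file. A hedged sketch of how one such line can be split, mirroring load_tokens() in Tools/scripts/generate_token.py later in this patch (ast.literal_eval stands in here for the generator's plain eval()):

    # Hedged sketch: parse one Grammar/Tokens line into (name, optional string).
    import ast

    def parse_token_line(line):
        line = line.split('#', 1)[0].strip()   # drop trailing comments
        if not line:
            return None                        # blank or comment-only line
        fields = line.split()
        name = fields[0]
        string = ast.literal_eval(fields[1]) if len(fields) > 1 else None
        return name, string

    assert parse_token_line("DOUBLESTAREQUAL  '**='") == ('DOUBLESTAREQUAL', '**=')
    assert parse_token_line("ENDMARKER") == ('ENDMARKER', None)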
diff --git a/Include/token.h b/Include/token.h
index cd1cd00f09c..2d491e6927d 100644
--- a/Include/token.h
+++ b/Include/token.h
@@ -1,3 +1,4 @@
+/* Auto-generated by Tools/scripts/generate_token.py */
/* Token types */
#ifndef Py_LIMITED_API
@@ -62,25 +63,19 @@ extern "C" {
#define ATEQUAL 50
#define RARROW 51
#define ELLIPSIS 52
-/* Don't forget to update the table _PyParser_TokenNames in tokenizer.c! */
#define OP 53
#define ERRORTOKEN 54
-/* These aren't used by the C tokenizer but are needed for tokenize.py */
-#define COMMENT 55
-#define NL 56
-#define ENCODING 57
#define N_TOKENS 58
+#define NT_OFFSET 256
/* Special definitions for cooperation with parser */
-#define NT_OFFSET 256
-
#define ISTERMINAL(x) ((x) < NT_OFFSET)
#define ISNONTERMINAL(x) ((x) >= NT_OFFSET)
#define ISEOF(x) ((x) == ENDMARKER)
-PyAPI_DATA(const char *) _PyParser_TokenNames[]; /* Token names */
+PyAPI_DATA(const char * const) _PyParser_TokenNames[]; /* Token names */
PyAPI_FUNC(int) PyToken_OneChar(int);
PyAPI_FUNC(int) PyToken_TwoChars(int, int);
PyAPI_FUNC(int) PyToken_ThreeChars(int, int, int);
diff --git a/Lib/symbol.py b/Lib/symbol.py
old mode 100755
new mode 100644
index dc7dcba5e4d..40d0ed1e035
--- a/Lib/symbol.py
+++ b/Lib/symbol.py
@@ -1,5 +1,3 @@
-#! /usr/bin/env python3
-
"""Non-terminal symbols of Python grammar (from "graminit.h")."""
# This file is automatically generated; please don't muck it up!
@@ -7,7 +5,11 @@
# To update the symbols in this file, 'cd' to the top directory of
# the python source tree after building the interpreter and run:
#
-# ./python Lib/symbol.py
+# python3 Tools/scripts/generate_symbol_py.py Include/graminit.h Lib/symbol.py
+#
+# or just
+#
+# make regen-symbol
#--start constants--
single_input = 256
@@ -103,14 +105,4 @@ sym_name = {}
for _name, _value in list(globals().items()):
if type(_value) is type(0):
sym_name[_value] = _name
-
-
-def _main():
- import sys
- import token
- if len(sys.argv) == 1:
- sys.argv = sys.argv + ["Include/graminit.h", "Lib/symbol.py"]
- token._main()
-
-if __name__ == "__main__":
- _main()
+del _name, _value
diff --git a/Lib/test/test_symbol.py b/Lib/test/test_symbol.py
index c1306f54327..ed86aec36b8 100644
--- a/Lib/test/test_symbol.py
+++ b/Lib/test/test_symbol.py
@@ -6,6 +6,9 @@ import subprocess
SYMBOL_FILE = support.findfile('symbol.py')
+GEN_SYMBOL_FILE = os.path.join(os.path.dirname(__file__),
+ '..', '..', 'Tools', 'scripts',
+ 'generate_symbol_py.py')
GRAMMAR_FILE = os.path.join(os.path.dirname(__file__),
'..', '..', 'Include', 'graminit.h')
TEST_PY_FILE = 'symbol_test.py'
@@ -22,7 +25,7 @@ class TestSymbolGeneration(unittest.TestCase):
def _generate_symbols(self, grammar_file, target_symbol_py_file):
proc = subprocess.Popen([sys.executable,
- SYMBOL_FILE,
+ GEN_SYMBOL_FILE,
grammar_file,
target_symbol_py_file], stderr=subprocess.PIPE)
stderr = proc.communicate()[1]
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index ff144795494..04a12542c6a 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1619,6 +1619,8 @@ class TestRoundtrip(TestCase):
testfiles = random.sample(testfiles, 10)
for testfile in testfiles:
+ if support.verbose >= 2:
+ print('tokenize', testfile)
with open(testfile, 'rb') as f:
with self.subTest(file=testfile):
self.check_roundtrip(f)
diff --git a/Lib/token.py b/Lib/token.py
index ba132059abf..5af7e6b91ea 100644
--- a/Lib/token.py
+++ b/Lib/token.py
@@ -1,15 +1,8 @@
-"""Token constants (from "token.h")."""
+"""Token constants."""
+# Auto-generated by Tools/scripts/generate_token.py
__all__ = ['tok_name', 'ISTERMINAL', 'ISNONTERMINAL', 'ISEOF']
-# This file is automatically generated; please don't muck it up!
-#
-# To update the symbols in this file, 'cd' to the top directory of
-# the python source tree after building the interpreter and run:
-#
-# ./python Lib/token.py
-
-#--start constants--
ENDMARKER = 0
NAME = 1
NUMBER = 2
@@ -63,23 +56,70 @@ AT = 49
ATEQUAL = 50
RARROW = 51
ELLIPSIS = 52
-# Don't forget to update the table _PyParser_TokenNames in tokenizer.c!
OP = 53
-ERRORTOKEN = 54
# These aren't used by the C tokenizer but are needed for tokenize.py
+ERRORTOKEN = 54
COMMENT = 55
NL = 56
ENCODING = 57
N_TOKENS = 58
# Special definitions for cooperation with parser
NT_OFFSET = 256
-#--end constants--
tok_name = {value: name
for name, value in globals().items()
if isinstance(value, int) and not name.startswith('_')}
__all__.extend(tok_name.values())
+EXACT_TOKEN_TYPES = {
+ '!=': NOTEQUAL,
+ '%': PERCENT,
+ '%=': PERCENTEQUAL,
+ '&': AMPER,
+ '&=': AMPEREQUAL,
+ '(': LPAR,
+ ')': RPAR,
+ '*': STAR,
+ '**': DOUBLESTAR,
+ '**=': DOUBLESTAREQUAL,
+ '*=': STAREQUAL,
+ '+': PLUS,
+ '+=': PLUSEQUAL,
+ ',': COMMA,
+ '-': MINUS,
+ '-=': MINEQUAL,
+ '->': RARROW,
+ '.': DOT,
+ '...': ELLIPSIS,
+ '/': SLASH,
+ '//': DOUBLESLASH,
+ '//=': DOUBLESLASHEQUAL,
+ '/=': SLASHEQUAL,
+ ':': COLON,
+ ';': SEMI,
+ '<': LESS,
+ '<<': LEFTSHIFT,
+ '<<=': LEFTSHIFTEQUAL,
+ '<=': LESSEQUAL,
+ '=': EQUAL,
+ '==': EQEQUAL,
+ '>': GREATER,
+ '>=': GREATEREQUAL,
+ '>>': RIGHTSHIFT,
+ '>>=': RIGHTSHIFTEQUAL,
+ '@': AT,
+ '@=': ATEQUAL,
+ '[': LSQB,
+ ']': RSQB,
+ '^': CIRCUMFLEX,
+ '^=': CIRCUMFLEXEQUAL,
+ '{': LBRACE,
+ '|': VBAR,
+ '|=': VBAREQUAL,
+ '}': RBRACE,
+ '~': TILDE,
+}
+
def ISTERMINAL(x):
return x < NT_OFFSET
@@ -88,73 +128,3 @@ def ISNONTERMINAL(x):
def ISEOF(x):
return x == ENDMARKER
-
-
-def _main():
- import re
- import sys
- args = sys.argv[1:]
- inFileName = args and args[0] or "Include/token.h"
- outFileName = "Lib/token.py"
- if len(args) > 1:
- outFileName = args[1]
- try:
- fp = open(inFileName)
- except OSError as err:
- sys.stdout.write("I/O error: %s\n" % str(err))
- sys.exit(1)
- with fp:
- lines = fp.read().split("\n")
- prog = re.compile(
- r"#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
- re.IGNORECASE)
- comment_regex = re.compile(
- r"^\s*/\*\s*(.+?)\s*\*/\s*$",
- re.IGNORECASE)
-
- tokens = {}
- prev_val = None
- for line in lines:
- match = prog.match(line)
- if match:
- name, val = match.group(1, 2)
- val = int(val)
- tokens[val] = {'token': name} # reverse so we can sort them...
- prev_val = val
- else:
- comment_match = comment_regex.match(line)
- if comment_match and prev_val is not None:
- comment = comment_match.group(1)
- tokens[prev_val]['comment'] = comment
- keys = sorted(tokens.keys())
- # load the output skeleton from the target:
- try:
- fp = open(outFileName)
- except OSError as err:
- sys.stderr.write("I/O error: %s\n" % str(err))
- sys.exit(2)
- with fp:
- format = fp.read().split("\n")
- try:
- start = format.index("#--start constants--") + 1
- end = format.index("#--end constants--")
- except ValueError:
- sys.stderr.write("target does not contain format markers")
- sys.exit(3)
- lines = []
- for key in keys:
- lines.append("%s = %d" % (tokens[key]["token"], key))
- if "comment" in tokens[key]:
- lines.append("# %s" % tokens[key]["comment"])
- format[start:end] = lines
- try:
- fp = open(outFileName, 'w')
- except OSError as err:
- sys.stderr.write("I/O error: %s\n" % str(err))
- sys.exit(4)
- with fp:
- fp.write("\n".join(format))
-
-
-if __name__ == "__main__":
- _main()
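EXACT_TOKEN_TYPES, previously private to tokenize.py, is now part of the generated token.py and maps an operator's literal text to its token type. A small usage sketch against the regenerated module (the numeric value follows from the table above):

    import token

    op = token.EXACT_TOKEN_TYPES['**=']
    print(op, token.tok_name[op])              # 46 DOUBLESTAREQUAL, per the table above
    assert token.ISTERMINAL(op) and not token.ISEOF(op)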
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index fce010bc5e7..cf1ecc99a94 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -32,6 +32,7 @@ import itertools as _itertools
import re
import sys
from token import *
+from token import EXACT_TOKEN_TYPES
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
@@ -41,55 +42,6 @@ __all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
"untokenize", "TokenInfo"]
del token
-EXACT_TOKEN_TYPES = {
- '(': LPAR,
- ')': RPAR,
- '[': LSQB,
- ']': RSQB,
- ':': COLON,
- ',': COMMA,
- ';': SEMI,
- '+': PLUS,
- '-': MINUS,
- '*': STAR,
- '/': SLASH,
- '|': VBAR,
- '&': AMPER,
- '<': LESS,
- '>': GREATER,
- '=': EQUAL,
- '.': DOT,
- '%': PERCENT,
- '{': LBRACE,
- '}': RBRACE,
- '==': EQEQUAL,
- '!=': NOTEQUAL,
- '<=': LESSEQUAL,
- '>=': GREATEREQUAL,
- '~': TILDE,
- '^': CIRCUMFLEX,
- '<<': LEFTSHIFT,
- '>>': RIGHTSHIFT,
- '**': DOUBLESTAR,
- '+=': PLUSEQUAL,
- '-=': MINEQUAL,
- '*=': STAREQUAL,
- '/=': SLASHEQUAL,
- '%=': PERCENTEQUAL,
- '&=': AMPEREQUAL,
- '|=': VBAREQUAL,
- '^=': CIRCUMFLEXEQUAL,
- '<<=': LEFTSHIFTEQUAL,
- '>>=': RIGHTSHIFTEQUAL,
- '**=': DOUBLESTAREQUAL,
- '//': DOUBLESLASH,
- '//=': DOUBLESLASHEQUAL,
- '...': ELLIPSIS,
- '->': RARROW,
- '@': AT,
- '@=': ATEQUAL,
-}
-
class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
def __repr__(self):
annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
@@ -163,17 +115,11 @@ Triple = group(StringPrefix + "'''", StringPrefix + '"""')
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
-# Because of leftmost-then-longest match semantics, be sure to put the
-# longest operators first (e.g., if = came before ==, == would get
-# recognized as two instances of =).
-Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
- r"//=?", r"->",
- r"[+\-*/%&@|^=<>]=?",
- r"~")
-
-Bracket = '[][(){}]'
-Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
-Funny = group(Operator, Bracket, Special)
+# Sorting in reverse order puts the long operators before their prefixes.
+# Otherwise if = came before ==, == would get recognized as two instances
+# of =.
+Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
+Funny = group(r'\r?\n', Special)
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
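The reverse sort matters because re picks the leftmost matching alternative: if '=' preceded '==', the tokenizer would read '==' as two '=' tokens. A minimal sketch of the effect, reusing the same trivial group() helper that tokenize.py defines:

    import re

    def group(*choices):
        return '(' + '|'.join(choices) + ')'

    naive = group(re.escape('='), re.escape('=='))                     # short prefix first
    fixed = group(*map(re.escape, sorted(['=', '=='], reverse=True)))  # '==' before '='

    assert re.match(naive, '==').group() == '='    # '==' mis-read as '=' then '='
    assert re.match(fixed, '==').group() == '=='   # longest operator wins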
diff --git a/Makefile.pre.in b/Makefile.pre.in
index 518602b1a22..04312e1be1c 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -302,6 +302,7 @@ POBJS= \
Parser/metagrammar.o \
Parser/firstsets.o \
Parser/grammar.o \
+ Parser/token.o \
Parser/pgen.o
PARSER_OBJS= $(POBJS) Parser/myreadline.o Parser/parsetok.o Parser/tokenizer.o
@@ -559,7 +560,7 @@ coverage-lcov:
@echo
# Force regeneration of parser and importlib
-coverage-report: regen-grammar regen-importlib
+coverage-report: regen-grammar regen-token regen-importlib
@ # build with coverage info
$(MAKE) coverage
@ # run tests, ignore failures
@@ -741,7 +742,7 @@ regen-importlib: Programs/_freeze_importlib
# Regenerate all generated files
regen-all: regen-opcode regen-opcode-targets regen-typeslots regen-grammar \
- regen-ast regen-importlib clinic
+ regen-token regen-symbol regen-ast regen-importlib clinic
############################################################################
# Special rules for object files
@@ -849,6 +850,37 @@ regen-opcode:
$(srcdir)/Include/opcode.h.new
$(UPDATE_FILE) $(srcdir)/Include/opcode.h $(srcdir)/Include/opcode.h.new
+.PHONY: regen-token
+regen-token:
+ # Regenerate Doc/library/token-list.inc from Grammar/Tokens
+ # using Tools/scripts/generate_token.py
+ $(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py rst \
+ $(srcdir)/Grammar/Tokens \
+ $(srcdir)/Doc/library/token-list.inc
+ # Regenerate Include/token.h from Grammar/Tokens
+ # using Tools/scripts/generate_token.py
+ $(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py h \
+ $(srcdir)/Grammar/Tokens \
+ $(srcdir)/Include/token.h
+ # Regenerate Parser/token.c from Grammar/Tokens
+ # using Tools/scripts/generate_token.py
+ $(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py c \
+ $(srcdir)/Grammar/Tokens \
+ $(srcdir)/Parser/token.c
+ # Regenerate Lib/token.py from Grammar/Tokens
+ # using Tools/scripts/generate_token.py
+ $(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py py \
+ $(srcdir)/Grammar/Tokens \
+ $(srcdir)/Lib/token.py
+
+.PHONY: regen-symbol
+regen-symbol: $(srcdir)/Include/graminit.h
+ # Regenerate Lib/symbol.py from Include/graminit.h
+ # using Tools/scripts/generate_symbol_py.py
+ $(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_symbol_py.py \
+ $(srcdir)/Include/graminit.h \
+ $(srcdir)/Lib/symbol.py
+
Python/compile.o Python/symtable.o Python/ast_unparse.o Python/ast.o: $(srcdir)/Include/graminit.h $(srcdir)/Include/Python-ast.h
Python/getplatform.o: $(srcdir)/Python/getplatform.c
diff --git a/Misc/NEWS.d/next/Core and Builtins/2018-04-14-11-02-57.bpo-30455.ANRwjo.rst b/Misc/NEWS.d/next/Core and Builtins/2018-04-14-11-02-57.bpo-30455.ANRwjo.rst
new file mode 100644
index 00000000000..21182525a2d
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2018-04-14-11-02-57.bpo-30455.ANRwjo.rst
@@ -0,0 +1,2 @@
+The C and Python code and the documentation related to tokens are now generated
+from a single source file :file:`Grammar/Tokens`.
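Since every artifact is now derived from Grammar/Tokens, consistency can be spot-checked mechanically. A hedged sketch, assuming it is run from a source checkout with the paths used in this patch: token values are assigned by position in Grammar/Tokens, so the regenerated Lib/token.py should agree with it name for name.

    import token

    # Names listed in Grammar/Tokens (skipping blanks and comments), in order.
    with open('Grammar/Tokens') as f:
        names = [line.split()[0] for line in f
                 if line.strip() and not line.startswith('#')]

    for value, name in enumerate(names):
        assert getattr(token, name) == value, (name, value)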
diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj
index 78ec9a16efa..ddf7f49d7a8 100644
--- a/PCbuild/pythoncore.vcxproj
+++ b/PCbuild/pythoncore.vcxproj
@@ -367,6 +367,7 @@
+    <ClCompile Include="..\Parser\token.c" />
diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters
index 5a43a9951b6..77b018ffb4e 100644
--- a/PCbuild/pythoncore.vcxproj.filters
+++ b/PCbuild/pythoncore.vcxproj.filters
@@ -866,6 +866,9 @@
       <Filter>Parser</Filter>
+    <ClCompile Include="..\Parser\token.c">
+      <Filter>Parser</Filter>
+    </ClCompile>
       <Filter>PC</Filter>
diff --git a/Parser/token.c b/Parser/token.c
new file mode 100644
index 00000000000..35519aa4b61
--- /dev/null
+++ b/Parser/token.c
@@ -0,0 +1,233 @@
+/* Auto-generated by Tools/scripts/generate_token.py */
+
+#include "Python.h"
+#include "token.h"
+
+/* Token names */
+
+const char * const _PyParser_TokenNames[] = {
+ "ENDMARKER",
+ "NAME",
+ "NUMBER",
+ "STRING",
+ "NEWLINE",
+ "INDENT",
+ "DEDENT",
+ "LPAR",
+ "RPAR",
+ "LSQB",
+ "RSQB",
+ "COLON",
+ "COMMA",
+ "SEMI",
+ "PLUS",
+ "MINUS",
+ "STAR",
+ "SLASH",
+ "VBAR",
+ "AMPER",
+ "LESS",
+ "GREATER",
+ "EQUAL",
+ "DOT",
+ "PERCENT",
+ "LBRACE",
+ "RBRACE",
+ "EQEQUAL",
+ "NOTEQUAL",
+ "LESSEQUAL",
+ "GREATEREQUAL",
+ "TILDE",
+ "CIRCUMFLEX",
+ "LEFTSHIFT",
+ "RIGHTSHIFT",
+ "DOUBLESTAR",
+ "PLUSEQUAL",
+ "MINEQUAL",
+ "STAREQUAL",
+ "SLASHEQUAL",
+ "PERCENTEQUAL",
+ "AMPEREQUAL",
+ "VBAREQUAL",
+ "CIRCUMFLEXEQUAL",
+ "LEFTSHIFTEQUAL",
+ "RIGHTSHIFTEQUAL",
+ "DOUBLESTAREQUAL",
+ "DOUBLESLASH",
+ "DOUBLESLASHEQUAL",
+ "AT",
+ "ATEQUAL",
+ "RARROW",
+ "ELLIPSIS",
+ "OP",
+    "<ERRORTOKEN>",
+    "<COMMENT>",
+    "<NL>",
+    "<ENCODING>",
+    "",
+};
+
+/* Return the token corresponding to a single character */
+
+int
+PyToken_OneChar(int c1)
+{
+ switch (c1) {
+ case '%': return PERCENT;
+ case '&': return AMPER;
+ case '(': return LPAR;
+ case ')': return RPAR;
+ case '*': return STAR;
+ case '+': return PLUS;
+ case ',': return COMMA;
+ case '-': return MINUS;
+ case '.': return DOT;
+ case '/': return SLASH;
+ case ':': return COLON;
+ case ';': return SEMI;
+ case '<': return LESS;
+ case '=': return EQUAL;
+ case '>': return GREATER;
+ case '@': return AT;
+ case '[': return LSQB;
+ case ']': return RSQB;
+ case '^': return CIRCUMFLEX;
+ case '{': return LBRACE;
+ case '|': return VBAR;
+ case '}': return RBRACE;
+ case '~': return TILDE;
+ }
+ return OP;
+}
+
+int
+PyToken_TwoChars(int c1, int c2)
+{
+ switch (c1) {
+ case '!':
+ switch (c2) {
+ case '=': return NOTEQUAL;
+ }
+ break;
+ case '%':
+ switch (c2) {
+ case '=': return PERCENTEQUAL;
+ }
+ break;
+ case '&':
+ switch (c2) {
+ case '=': return AMPEREQUAL;
+ }
+ break;
+ case '*':
+ switch (c2) {
+ case '*': return DOUBLESTAR;
+ case '=': return STAREQUAL;
+ }
+ break;
+ case '+':
+ switch (c2) {
+ case '=': return PLUSEQUAL;
+ }
+ break;
+ case '-':
+ switch (c2) {
+ case '=': return MINEQUAL;
+ case '>': return RARROW;
+ }
+ break;
+ case '/':
+ switch (c2) {
+ case '/': return DOUBLESLASH;
+ case '=': return SLASHEQUAL;
+ }
+ break;
+ case '<':
+ switch (c2) {
+ case '<': return LEFTSHIFT;
+ case '=': return LESSEQUAL;
+ case '>': return NOTEQUAL;
+ }
+ break;
+ case '=':
+ switch (c2) {
+ case '=': return EQEQUAL;
+ }
+ break;
+ case '>':
+ switch (c2) {
+ case '=': return GREATEREQUAL;
+ case '>': return RIGHTSHIFT;
+ }
+ break;
+ case '@':
+ switch (c2) {
+ case '=': return ATEQUAL;
+ }
+ break;
+ case '^':
+ switch (c2) {
+ case '=': return CIRCUMFLEXEQUAL;
+ }
+ break;
+ case '|':
+ switch (c2) {
+ case '=': return VBAREQUAL;
+ }
+ break;
+ }
+ return OP;
+}
+
+int
+PyToken_ThreeChars(int c1, int c2, int c3)
+{
+ switch (c1) {
+ case '*':
+ switch (c2) {
+ case '*':
+ switch (c3) {
+ case '=': return DOUBLESTAREQUAL;
+ }
+ break;
+ }
+ break;
+ case '.':
+ switch (c2) {
+ case '.':
+ switch (c3) {
+ case '.': return ELLIPSIS;
+ }
+ break;
+ }
+ break;
+ case '/':
+ switch (c2) {
+ case '/':
+ switch (c3) {
+ case '=': return DOUBLESLASHEQUAL;
+ }
+ break;
+ }
+ break;
+ case '<':
+ switch (c2) {
+ case '<':
+ switch (c3) {
+ case '=': return LEFTSHIFTEQUAL;
+ }
+ break;
+ }
+ break;
+ case '>':
+ switch (c2) {
+ case '>':
+ switch (c3) {
+ case '=': return RIGHTSHIFTEQUAL;
+ }
+ break;
+ }
+ break;
+ }
+ return OP;
+}
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index c246ee204c5..0e6c1a85e03 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -48,72 +48,6 @@ static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
-/* Token names */
-
-const char *_PyParser_TokenNames[] = {
- "ENDMARKER",
- "NAME",
- "NUMBER",
- "STRING",
- "NEWLINE",
- "INDENT",
- "DEDENT",
- "LPAR",
- "RPAR",
- "LSQB",
- "RSQB",
- "COLON",
- "COMMA",
- "SEMI",
- "PLUS",
- "MINUS",
- "STAR",
- "SLASH",
- "VBAR",
- "AMPER",
- "LESS",
- "GREATER",
- "EQUAL",
- "DOT",
- "PERCENT",
- "LBRACE",
- "RBRACE",
- "EQEQUAL",
- "NOTEQUAL",
- "LESSEQUAL",
- "GREATEREQUAL",
- "TILDE",
- "CIRCUMFLEX",
- "LEFTSHIFT",
- "RIGHTSHIFT",
- "DOUBLESTAR",
- "PLUSEQUAL",
- "MINEQUAL",
- "STAREQUAL",
- "SLASHEQUAL",
- "PERCENTEQUAL",
- "AMPEREQUAL",
- "VBAREQUAL",
- "CIRCUMFLEXEQUAL",
- "LEFTSHIFTEQUAL",
- "RIGHTSHIFTEQUAL",
- "DOUBLESTAREQUAL",
- "DOUBLESLASH",
- "DOUBLESLASHEQUAL",
- "AT",
- "ATEQUAL",
- "RARROW",
- "ELLIPSIS",
- /* This table must match the #defines in token.h! */
- "OP",
- "",
- "COMMENT",
- "NL",
- "ENCODING",
- ""
-};
-
-
/* Create and initialize a new tok_state structure */
static struct tok_state *
@@ -1114,177 +1048,6 @@ tok_backup(struct tok_state *tok, int c)
}
-/* Return the token corresponding to a single character */
-
-int
-PyToken_OneChar(int c)
-{
- switch (c) {
- case '(': return LPAR;
- case ')': return RPAR;
- case '[': return LSQB;
- case ']': return RSQB;
- case ':': return COLON;
- case ',': return COMMA;
- case ';': return SEMI;
- case '+': return PLUS;
- case '-': return MINUS;
- case '*': return STAR;
- case '/': return SLASH;
- case '|': return VBAR;
- case '&': return AMPER;
- case '<': return LESS;
- case '>': return GREATER;
- case '=': return EQUAL;
- case '.': return DOT;
- case '%': return PERCENT;
- case '{': return LBRACE;
- case '}': return RBRACE;
- case '^': return CIRCUMFLEX;
- case '~': return TILDE;
- case '@': return AT;
- default: return OP;
- }
-}
-
-
-int
-PyToken_TwoChars(int c1, int c2)
-{
- switch (c1) {
- case '=':
- switch (c2) {
- case '=': return EQEQUAL;
- }
- break;
- case '!':
- switch (c2) {
- case '=': return NOTEQUAL;
- }
- break;
- case '<':
- switch (c2) {
- case '>': return NOTEQUAL;
- case '=': return LESSEQUAL;
- case '<': return LEFTSHIFT;
- }
- break;
- case '>':
- switch (c2) {
- case '=': return GREATEREQUAL;
- case '>': return RIGHTSHIFT;
- }
- break;
- case '+':
- switch (c2) {
- case '=': return PLUSEQUAL;
- }
- break;
- case '-':
- switch (c2) {
- case '=': return MINEQUAL;
- case '>': return RARROW;
- }
- break;
- case '*':
- switch (c2) {
- case '*': return DOUBLESTAR;
- case '=': return STAREQUAL;
- }
- break;
- case '/':
- switch (c2) {
- case '/': return DOUBLESLASH;
- case '=': return SLASHEQUAL;
- }
- break;
- case '|':
- switch (c2) {
- case '=': return VBAREQUAL;
- }
- break;
- case '%':
- switch (c2) {
- case '=': return PERCENTEQUAL;
- }
- break;
- case '&':
- switch (c2) {
- case '=': return AMPEREQUAL;
- }
- break;
- case '^':
- switch (c2) {
- case '=': return CIRCUMFLEXEQUAL;
- }
- break;
- case '@':
- switch (c2) {
- case '=': return ATEQUAL;
- }
- break;
- }
- return OP;
-}
-
-int
-PyToken_ThreeChars(int c1, int c2, int c3)
-{
- switch (c1) {
- case '<':
- switch (c2) {
- case '<':
- switch (c3) {
- case '=':
- return LEFTSHIFTEQUAL;
- }
- break;
- }
- break;
- case '>':
- switch (c2) {
- case '>':
- switch (c3) {
- case '=':
- return RIGHTSHIFTEQUAL;
- }
- break;
- }
- break;
- case '*':
- switch (c2) {
- case '*':
- switch (c3) {
- case '=':
- return DOUBLESTAREQUAL;
- }
- break;
- }
- break;
- case '/':
- switch (c2) {
- case '/':
- switch (c3) {
- case '=':
- return DOUBLESLASHEQUAL;
- }
- break;
- }
- break;
- case '.':
- switch (c2) {
- case '.':
- switch (c3) {
- case '.':
- return ELLIPSIS;
- }
- break;
- }
- break;
- }
- return OP;
-}
-
static int
syntaxerror(struct tok_state *tok, const char *format, ...)
{
diff --git a/Tools/scripts/generate_symbol_py.py b/Tools/scripts/generate_symbol_py.py
new file mode 100755
index 00000000000..9219b096e4d
--- /dev/null
+++ b/Tools/scripts/generate_symbol_py.py
@@ -0,0 +1,53 @@
+#! /usr/bin/env python3
+# This script generates the symbol.py source file.
+
+import sys
+import re
+
+def main(inFileName="Include/graminit.h", outFileName="Lib/symbol.py"):
+ try:
+ fp = open(inFileName)
+ except OSError as err:
+ sys.stderr.write("I/O error: %s\n" % str(err))
+ sys.exit(1)
+ with fp:
+ lines = fp.read().split("\n")
+ prog = re.compile(
+ "#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
+ re.IGNORECASE)
+ tokens = {}
+ for line in lines:
+ match = prog.match(line)
+ if match:
+ name, val = match.group(1, 2)
+ val = int(val)
+ tokens[val] = name # reverse so we can sort them...
+ keys = sorted(tokens.keys())
+ # load the output skeleton from the target:
+ try:
+ fp = open(outFileName)
+ except OSError as err:
+ sys.stderr.write("I/O error: %s\n" % str(err))
+ sys.exit(2)
+ with fp:
+ format = fp.read().split("\n")
+ try:
+ start = format.index("#--start constants--") + 1
+ end = format.index("#--end constants--")
+ except ValueError:
+ sys.stderr.write("target does not contain format markers")
+ sys.exit(3)
+ lines = []
+ for val in keys:
+ lines.append("%s = %d" % (tokens[val], val))
+ format[start:end] = lines
+ try:
+ fp = open(outFileName, 'w')
+ except OSError as err:
+ sys.stderr.write("I/O error: %s\n" % str(err))
+ sys.exit(4)
+ with fp:
+ fp.write("\n".join(format))
+
+if __name__ == '__main__':
+ main(*sys.argv[1:])
diff --git a/Tools/scripts/generate_token.py b/Tools/scripts/generate_token.py
new file mode 100644
index 00000000000..f2745e8353f
--- /dev/null
+++ b/Tools/scripts/generate_token.py
@@ -0,0 +1,268 @@
+#! /usr/bin/env python3
+# This script generates token-related files from Grammar/Tokens:
+#
+# Doc/library/token-list.inc
+# Include/token.h
+# Parser/token.c
+# Lib/token.py
+
+
+NT_OFFSET = 256
+
+def load_tokens(path):
+ tok_names = []
+ string_to_tok = {}
+ ERRORTOKEN = None
+ with open(path) as fp:
+ for line in fp:
+ line = line.strip()
+ # strip comments
+ i = line.find('#')
+ if i >= 0:
+ line = line[:i].strip()
+ if not line:
+ continue
+ fields = line.split()
+ name = fields[0]
+ value = len(tok_names)
+ if name == 'ERRORTOKEN':
+ ERRORTOKEN = value
+ string = fields[1] if len(fields) > 1 else None
+ if string:
+ string = eval(string)
+ string_to_tok[string] = value
+ tok_names.append(name)
+ return tok_names, ERRORTOKEN, string_to_tok
+
+
+def update_file(file, content):
+ try:
+ with open(file, 'r') as fobj:
+ if fobj.read() == content:
+ return False
+ except (OSError, ValueError):
+ pass
+ with open(file, 'w') as fobj:
+ fobj.write(content)
+ return True
+
+
+token_h_template = """\
+/* Auto-generated by Tools/scripts/generate_token.py */
+
+/* Token types */
+#ifndef Py_LIMITED_API
+#ifndef Py_TOKEN_H
+#define Py_TOKEN_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#undef TILDE /* Prevent clash of our definition with system macro. Ex AIX, ioctl.h */
+
+%s\
+#define N_TOKENS %d
+#define NT_OFFSET %d
+
+/* Special definitions for cooperation with parser */
+
+#define ISTERMINAL(x) ((x) < NT_OFFSET)
+#define ISNONTERMINAL(x) ((x) >= NT_OFFSET)
+#define ISEOF(x) ((x) == ENDMARKER)
+
+
+PyAPI_DATA(const char * const) _PyParser_TokenNames[]; /* Token names */
+PyAPI_FUNC(int) PyToken_OneChar(int);
+PyAPI_FUNC(int) PyToken_TwoChars(int, int);
+PyAPI_FUNC(int) PyToken_ThreeChars(int, int, int);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_TOKEN_H */
+#endif /* Py_LIMITED_API */
+"""
+
+def make_h(infile, outfile='Include/token.h'):
+ tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
+
+ defines = []
+ for value, name in enumerate(tok_names[:ERRORTOKEN + 1]):
+ defines.append("#define %-15s %d\n" % (name, value))
+
+ if update_file(outfile, token_h_template % (
+ ''.join(defines),
+ len(tok_names),
+ NT_OFFSET
+ )):
+ print("%s regenerated from %s" % (outfile, infile))
+
+
+token_c_template = """\
+/* Auto-generated by Tools/scripts/generate_token.py */
+
+#include "Python.h"
+#include "token.h"
+
+/* Token names */
+
+const char * const _PyParser_TokenNames[] = {
+%s\
+};
+
+/* Return the token corresponding to a single character */
+
+int
+PyToken_OneChar(int c1)
+{
+%s\
+ return OP;
+}
+
+int
+PyToken_TwoChars(int c1, int c2)
+{
+%s\
+ return OP;
+}
+
+int
+PyToken_ThreeChars(int c1, int c2, int c3)
+{
+%s\
+ return OP;
+}
+"""
+
+def generate_chars_to_token(mapping, n=1):
+ result = []
+ write = result.append
+ indent = ' ' * n
+ write(indent)
+ write('switch (c%d) {\n' % (n,))
+ for c in sorted(mapping):
+ write(indent)
+ value = mapping[c]
+ if isinstance(value, dict):
+ write("case '%s':\n" % (c,))
+ write(generate_chars_to_token(value, n + 1))
+ write(indent)
+ write(' break;\n')
+ else:
+ write("case '%s': return %s;\n" % (c, value))
+ write(indent)
+ write('}\n')
+ return ''.join(result)
+
+def make_c(infile, outfile='Parser/token.c'):
+ tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
+ string_to_tok['<>'] = string_to_tok['!=']
+ chars_to_token = {}
+ for string, value in string_to_tok.items():
+ assert 1 <= len(string) <= 3
+ name = tok_names[value]
+ m = chars_to_token.setdefault(len(string), {})
+ for c in string[:-1]:
+ m = m.setdefault(c, {})
+ m[string[-1]] = name
+
+ names = []
+ for value, name in enumerate(tok_names):
+ if value >= ERRORTOKEN:
+ name = '<%s>' % name
+ names.append(' "%s",\n' % name)
+ names.append(' "",\n')
+
+ if update_file(outfile, token_c_template % (
+ ''.join(names),
+ generate_chars_to_token(chars_to_token[1]),
+ generate_chars_to_token(chars_to_token[2]),
+ generate_chars_to_token(chars_to_token[3])
+ )):
+ print("%s regenerated from %s" % (outfile, infile))
+
+
+token_inc_template = """\
+.. Auto-generated by Tools/scripts/generate_token.py
+%s
+.. data:: N_TOKENS
+
+.. data:: NT_OFFSET
+"""
+
+def make_rst(infile, outfile='Doc/library/token-list.inc'):
+ tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
+ tok_to_string = {value: s for s, value in string_to_tok.items()}
+
+ names = []
+ for value, name in enumerate(tok_names[:ERRORTOKEN + 1]):
+ names.append('.. data:: %s' % (name,))
+ if value in tok_to_string:
+ names.append('')
+ names.append(' Token value for ``"%s"``.' % tok_to_string[value])
+ names.append('')
+
+ if update_file(outfile, token_inc_template % '\n'.join(names)):
+ print("%s regenerated from %s" % (outfile, infile))
+
+
+token_py_template = '''\
+"""Token constants."""
+# Auto-generated by Tools/scripts/generate_token.py
+
+__all__ = ['tok_name', 'ISTERMINAL', 'ISNONTERMINAL', 'ISEOF']
+
+%s
+N_TOKENS = %d
+# Special definitions for cooperation with parser
+NT_OFFSET = %d
+
+tok_name = {value: name
+ for name, value in globals().items()
+ if isinstance(value, int) and not name.startswith('_')}
+__all__.extend(tok_name.values())
+
+EXACT_TOKEN_TYPES = {
+%s
+}
+
+def ISTERMINAL(x):
+ return x < NT_OFFSET
+
+def ISNONTERMINAL(x):
+ return x >= NT_OFFSET
+
+def ISEOF(x):
+ return x == ENDMARKER
+'''
+
+def make_py(infile, outfile='Lib/token.py'):
+ tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
+
+ constants = []
+ for value, name in enumerate(tok_names):
+ constants.append('%s = %d' % (name, value))
+ constants.insert(ERRORTOKEN,
+ "# These aren't used by the C tokenizer but are needed for tokenize.py")
+
+ token_types = []
+ for s, value in sorted(string_to_tok.items()):
+ token_types.append(' %r: %s,' % (s, tok_names[value]))
+
+ if update_file(outfile, token_py_template % (
+ '\n'.join(constants),
+ len(tok_names),
+ NT_OFFSET,
+ '\n'.join(token_types),
+ )):
+ print("%s regenerated from %s" % (outfile, infile))
+
+
+def main(op, infile='Grammar/Tokens', *args):
+ make = globals()['make_' + op]
+ make(infile, *args)
+
+
+if __name__ == '__main__':
+ import sys
+ main(*sys.argv[1:])
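main() dispatches on its first argument to one of the make_* functions above, so each generated file comes from one invocation. A hedged usage sketch, equivalent to what the regen-token rule in Makefile.pre.in runs (assuming Tools/scripts is importable and the working directory is the repository root):

    import generate_token   # this script

    generate_token.main('h',   'Grammar/Tokens', 'Include/token.h')
    generate_token.main('c',   'Grammar/Tokens', 'Parser/token.c')
    generate_token.main('py',  'Grammar/Tokens', 'Lib/token.py')
    generate_token.main('rst', 'Grammar/Tokens', 'Doc/library/token-list.inc')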