bpo-30455: Generate all token related code and docs from Grammar/Tokens. (GH-10370)

"Include/token.h", "Lib/token.py" (containing now some data moved from
"Lib/tokenize.py") and new files "Parser/token.c" (containing the code
moved from "Parser/tokenizer.c") and "Doc/library/token-list.inc" (included
in "Doc/library/token.rst") are now generated from "Grammar/Tokens" by
"Tools/scripts/generate_token.py". The script overwrites files only if
needed and can be used on the read-only sources tree.

"Lib/symbol.py" is now generated by "Tools/scripts/generate_symbol_py.py"
instead of being an executable script itself.

Added new make targets "regen-token" and "regen-symbol", which are now
dependencies of "regen-all".

The documentation now contains the strings for operator and punctuation tokens.
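
As an illustration (not part of the commit), here is a minimal sketch of what the regenerated "Lib/token.py" exposes, based on the diff below; it assumes an interpreter built with this change:

    import token

    # Exact operator/punctuation strings now map to their token constants
    # directly in the token module (this table previously lived in Lib/tokenize.py).
    token.EXACT_TOKEN_TYPES['**=']    # token.DOUBLESTAREQUAL
    token.tok_name[token.ELLIPSIS]    # 'ELLIPSIS'
    token.ISTERMINAL(token.NAME)      # True
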
Serhiy Storchaka 2018-12-22 11:18:40 +02:00 committed by GitHub
parent c1b4b0f616
commit 8ac658114d
18 changed files with 940 additions and 462 deletions

.gitattributes vendored

@ -55,3 +55,7 @@ Include/opcode.h linguist-generated=true
Python/opcode_targets.h linguist-generated=true
Objects/typeslots.inc linguist-generated=true
Modules/unicodedata_db.h linguist-generated=true
Doc/library/token-list.inc linguist-generated=true
Include/token.h linguist-generated=true
Lib/token.py linguist-generated=true
Parser/token.c linguist-generated=true

Doc/library/token-list.inc generated Normal file

@ -0,0 +1,206 @@
.. Auto-generated by Tools/scripts/generate_token.py
.. data:: ENDMARKER
.. data:: NAME
.. data:: NUMBER
.. data:: STRING
.. data:: NEWLINE
.. data:: INDENT
.. data:: DEDENT
.. data:: LPAR
Token value for ``"("``.
.. data:: RPAR
Token value for ``")"``.
.. data:: LSQB
Token value for ``"["``.
.. data:: RSQB
Token value for ``"]"``.
.. data:: COLON
Token value for ``":"``.
.. data:: COMMA
Token value for ``","``.
.. data:: SEMI
Token value for ``";"``.
.. data:: PLUS
Token value for ``"+"``.
.. data:: MINUS
Token value for ``"-"``.
.. data:: STAR
Token value for ``"*"``.
.. data:: SLASH
Token value for ``"/"``.
.. data:: VBAR
Token value for ``"|"``.
.. data:: AMPER
Token value for ``"&"``.
.. data:: LESS
Token value for ``"<"``.
.. data:: GREATER
Token value for ``">"``.
.. data:: EQUAL
Token value for ``"="``.
.. data:: DOT
Token value for ``"."``.
.. data:: PERCENT
Token value for ``"%"``.
.. data:: LBRACE
Token value for ``"{"``.
.. data:: RBRACE
Token value for ``"}"``.
.. data:: EQEQUAL
Token value for ``"=="``.
.. data:: NOTEQUAL
Token value for ``"!="``.
.. data:: LESSEQUAL
Token value for ``"<="``.
.. data:: GREATEREQUAL
Token value for ``">="``.
.. data:: TILDE
Token value for ``"~"``.
.. data:: CIRCUMFLEX
Token value for ``"^"``.
.. data:: LEFTSHIFT
Token value for ``"<<"``.
.. data:: RIGHTSHIFT
Token value for ``">>"``.
.. data:: DOUBLESTAR
Token value for ``"**"``.
.. data:: PLUSEQUAL
Token value for ``"+="``.
.. data:: MINEQUAL
Token value for ``"-="``.
.. data:: STAREQUAL
Token value for ``"*="``.
.. data:: SLASHEQUAL
Token value for ``"/="``.
.. data:: PERCENTEQUAL
Token value for ``"%="``.
.. data:: AMPEREQUAL
Token value for ``"&="``.
.. data:: VBAREQUAL
Token value for ``"|="``.
.. data:: CIRCUMFLEXEQUAL
Token value for ``"^="``.
.. data:: LEFTSHIFTEQUAL
Token value for ``"<<="``.
.. data:: RIGHTSHIFTEQUAL
Token value for ``">>="``.
.. data:: DOUBLESTAREQUAL
Token value for ``"**="``.
.. data:: DOUBLESLASH
Token value for ``"//"``.
.. data:: DOUBLESLASHEQUAL
Token value for ``"//="``.
.. data:: AT
Token value for ``"@"``.
.. data:: ATEQUAL
Token value for ``"@="``.
.. data:: RARROW
Token value for ``"->"``.
.. data:: ELLIPSIS
Token value for ``"..."``.
.. data:: OP
.. data:: ERRORTOKEN
.. data:: N_TOKENS
.. data:: NT_OFFSET

Doc/library/token.rst

@ -44,64 +44,7 @@ functions. The functions mirror definitions in the Python C header files.
The token constants are:
.. data:: ENDMARKER
NAME
NUMBER
STRING
NEWLINE
INDENT
DEDENT
LPAR
RPAR
LSQB
RSQB
COLON
COMMA
SEMI
PLUS
MINUS
STAR
SLASH
VBAR
AMPER
LESS
GREATER
EQUAL
DOT
PERCENT
LBRACE
RBRACE
EQEQUAL
NOTEQUAL
LESSEQUAL
GREATEREQUAL
TILDE
CIRCUMFLEX
LEFTSHIFT
RIGHTSHIFT
DOUBLESTAR
PLUSEQUAL
MINEQUAL
STAREQUAL
SLASHEQUAL
PERCENTEQUAL
AMPEREQUAL
VBAREQUAL
CIRCUMFLEXEQUAL
LEFTSHIFTEQUAL
RIGHTSHIFTEQUAL
DOUBLESTAREQUAL
DOUBLESLASH
DOUBLESLASHEQUAL
AT
ATEQUAL
RARROW
ELLIPSIS
OP
ERRORTOKEN
N_TOKENS
NT_OFFSET
.. include:: token-list.inc
The following token type values aren't used by the C tokenizer but are needed for
the :mod:`tokenize` module.

Grammar/Tokens Normal file

@ -0,0 +1,62 @@
ENDMARKER
NAME
NUMBER
STRING
NEWLINE
INDENT
DEDENT
LPAR '('
RPAR ')'
LSQB '['
RSQB ']'
COLON ':'
COMMA ','
SEMI ';'
PLUS '+'
MINUS '-'
STAR '*'
SLASH '/'
VBAR '|'
AMPER '&'
LESS '<'
GREATER '>'
EQUAL '='
DOT '.'
PERCENT '%'
LBRACE '{'
RBRACE '}'
EQEQUAL '=='
NOTEQUAL '!='
LESSEQUAL '<='
GREATEREQUAL '>='
TILDE '~'
CIRCUMFLEX '^'
LEFTSHIFT '<<'
RIGHTSHIFT '>>'
DOUBLESTAR '**'
PLUSEQUAL '+='
MINEQUAL '-='
STAREQUAL '*='
SLASHEQUAL '/='
PERCENTEQUAL '%='
AMPEREQUAL '&='
VBAREQUAL '|='
CIRCUMFLEXEQUAL '^='
LEFTSHIFTEQUAL '<<='
RIGHTSHIFTEQUAL '>>='
DOUBLESTAREQUAL '**='
DOUBLESLASH '//'
DOUBLESLASHEQUAL '//='
AT '@'
ATEQUAL '@='
RARROW '->'
ELLIPSIS '...'
OP
ERRORTOKEN
# These aren't used by the C tokenizer but are needed for tokenize.py
COMMENT
NL
ENCODING
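
A minimal sketch (not part of the commit) of how one line of this file is interpreted; the real parsing is done by load_tokens() in "Tools/scripts/generate_token.py" further below:

    import ast

    def parse_token_line(line):
        """Return (name, string or None) for one Grammar/Tokens line."""
        line = line.split('#', 1)[0].strip()      # drop comments and blank lines
        if not line:
            return None
        fields = line.split()
        name = fields[0]
        # The optional second field is a quoted operator/punctuation string.
        string = ast.literal_eval(fields[1]) if len(fields) > 1 else None
        return name, string

    parse_token_line("LEFTSHIFTEQUAL '<<='")   # ('LEFTSHIFTEQUAL', '<<=')
    parse_token_line("ENDMARKER")              # ('ENDMARKER', None)
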

Include/token.h generated

@ -1,3 +1,4 @@
/* Auto-generated by Tools/scripts/generate_token.py */
/* Token types */
#ifndef Py_LIMITED_API
@ -62,25 +63,19 @@ extern "C" {
#define ATEQUAL 50
#define RARROW 51
#define ELLIPSIS 52
/* Don't forget to update the table _PyParser_TokenNames in tokenizer.c! */
#define OP 53
#define ERRORTOKEN 54
/* These aren't used by the C tokenizer but are needed for tokenize.py */
#define COMMENT 55
#define NL 56
#define ENCODING 57
#define N_TOKENS 58
#define NT_OFFSET 256
/* Special definitions for cooperation with parser */
#define NT_OFFSET 256
#define ISTERMINAL(x) ((x) < NT_OFFSET)
#define ISNONTERMINAL(x) ((x) >= NT_OFFSET)
#define ISEOF(x) ((x) == ENDMARKER)
PyAPI_DATA(const char *) _PyParser_TokenNames[]; /* Token names */
PyAPI_DATA(const char * const) _PyParser_TokenNames[]; /* Token names */
PyAPI_FUNC(int) PyToken_OneChar(int);
PyAPI_FUNC(int) PyToken_TwoChars(int, int);
PyAPI_FUNC(int) PyToken_ThreeChars(int, int, int);

Lib/symbol.py Executable file → Normal file

@ -1,5 +1,3 @@
#! /usr/bin/env python3
"""Non-terminal symbols of Python grammar (from "graminit.h").""" """Non-terminal symbols of Python grammar (from "graminit.h")."""
# This file is automatically generated; please don't muck it up!
@ -7,7 +5,11 @@
# To update the symbols in this file, 'cd' to the top directory of
# the python source tree after building the interpreter and run:
#
# ./python Lib/symbol.py
# python3 Tools/scripts/generate_symbol_py.py Include/graminit.h Lib/symbol.py
#
# or just
#
# make regen-symbol
#--start constants--
single_input = 256
@ -103,14 +105,4 @@ sym_name = {}
for _name, _value in list(globals().items()):
if type(_value) is type(0):
sym_name[_value] = _name
del _name, _value
def _main():
import sys
import token
if len(sys.argv) == 1:
sys.argv = sys.argv + ["Include/graminit.h", "Lib/symbol.py"]
token._main()
if __name__ == "__main__":
_main()

Lib/test/test_symbol.py

@ -6,6 +6,9 @@ import subprocess
SYMBOL_FILE = support.findfile('symbol.py')
GEN_SYMBOL_FILE = os.path.join(os.path.dirname(__file__),
'..', '..', 'Tools', 'scripts',
'generate_symbol_py.py')
GRAMMAR_FILE = os.path.join(os.path.dirname(__file__),
'..', '..', 'Include', 'graminit.h')
TEST_PY_FILE = 'symbol_test.py'
@ -22,7 +25,7 @@ class TestSymbolGeneration(unittest.TestCase):
def _generate_symbols(self, grammar_file, target_symbol_py_file):
proc = subprocess.Popen([sys.executable,
SYMBOL_FILE,
GEN_SYMBOL_FILE,
grammar_file,
target_symbol_py_file], stderr=subprocess.PIPE)
stderr = proc.communicate()[1]

Lib/test/test_tokenize.py

@ -1619,6 +1619,8 @@ class TestRoundtrip(TestCase):
testfiles = random.sample(testfiles, 10)
for testfile in testfiles:
if support.verbose >= 2:
print('tokenize', testfile)
with open(testfile, 'rb') as f:
with self.subTest(file=testfile):
self.check_roundtrip(f)

Lib/token.py generated

@ -1,15 +1,8 @@
"""Token constants (from "token.h").""" """Token constants."""
# Auto-generated by Tools/scripts/generate_token.py
__all__ = ['tok_name', 'ISTERMINAL', 'ISNONTERMINAL', 'ISEOF']
# This file is automatically generated; please don't muck it up!
#
# To update the symbols in this file, 'cd' to the top directory of
# the python source tree after building the interpreter and run:
#
# ./python Lib/token.py
#--start constants--
ENDMARKER = 0
NAME = 1
NUMBER = 2
@ -63,23 +56,70 @@ AT = 49
ATEQUAL = 50
RARROW = 51
ELLIPSIS = 52
# Don't forget to update the table _PyParser_TokenNames in tokenizer.c!
OP = 53
ERRORTOKEN = 54
# These aren't used by the C tokenizer but are needed for tokenize.py
ERRORTOKEN = 54
COMMENT = 55
NL = 56
ENCODING = 57
N_TOKENS = 58
# Special definitions for cooperation with parser
NT_OFFSET = 256
#--end constants--
tok_name = {value: name
for name, value in globals().items()
if isinstance(value, int) and not name.startswith('_')}
__all__.extend(tok_name.values())
EXACT_TOKEN_TYPES = {
'!=': NOTEQUAL,
'%': PERCENT,
'%=': PERCENTEQUAL,
'&': AMPER,
'&=': AMPEREQUAL,
'(': LPAR,
')': RPAR,
'*': STAR,
'**': DOUBLESTAR,
'**=': DOUBLESTAREQUAL,
'*=': STAREQUAL,
'+': PLUS,
'+=': PLUSEQUAL,
',': COMMA,
'-': MINUS,
'-=': MINEQUAL,
'->': RARROW,
'.': DOT,
'...': ELLIPSIS,
'/': SLASH,
'//': DOUBLESLASH,
'//=': DOUBLESLASHEQUAL,
'/=': SLASHEQUAL,
':': COLON,
';': SEMI,
'<': LESS,
'<<': LEFTSHIFT,
'<<=': LEFTSHIFTEQUAL,
'<=': LESSEQUAL,
'=': EQUAL,
'==': EQEQUAL,
'>': GREATER,
'>=': GREATEREQUAL,
'>>': RIGHTSHIFT,
'>>=': RIGHTSHIFTEQUAL,
'@': AT,
'@=': ATEQUAL,
'[': LSQB,
']': RSQB,
'^': CIRCUMFLEX,
'^=': CIRCUMFLEXEQUAL,
'{': LBRACE,
'|': VBAR,
'|=': VBAREQUAL,
'}': RBRACE,
'~': TILDE,
}
def ISTERMINAL(x):
return x < NT_OFFSET
@ -88,73 +128,3 @@ def ISNONTERMINAL(x):
def ISEOF(x):
return x == ENDMARKER
def _main():
import re
import sys
args = sys.argv[1:]
inFileName = args and args[0] or "Include/token.h"
outFileName = "Lib/token.py"
if len(args) > 1:
outFileName = args[1]
try:
fp = open(inFileName)
except OSError as err:
sys.stdout.write("I/O error: %s\n" % str(err))
sys.exit(1)
with fp:
lines = fp.read().split("\n")
prog = re.compile(
r"#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
re.IGNORECASE)
comment_regex = re.compile(
r"^\s*/\*\s*(.+?)\s*\*/\s*$",
re.IGNORECASE)
tokens = {}
prev_val = None
for line in lines:
match = prog.match(line)
if match:
name, val = match.group(1, 2)
val = int(val)
tokens[val] = {'token': name} # reverse so we can sort them...
prev_val = val
else:
comment_match = comment_regex.match(line)
if comment_match and prev_val is not None:
comment = comment_match.group(1)
tokens[prev_val]['comment'] = comment
keys = sorted(tokens.keys())
# load the output skeleton from the target:
try:
fp = open(outFileName)
except OSError as err:
sys.stderr.write("I/O error: %s\n" % str(err))
sys.exit(2)
with fp:
format = fp.read().split("\n")
try:
start = format.index("#--start constants--") + 1
end = format.index("#--end constants--")
except ValueError:
sys.stderr.write("target does not contain format markers")
sys.exit(3)
lines = []
for key in keys:
lines.append("%s = %d" % (tokens[key]["token"], key))
if "comment" in tokens[key]:
lines.append("# %s" % tokens[key]["comment"])
format[start:end] = lines
try:
fp = open(outFileName, 'w')
except OSError as err:
sys.stderr.write("I/O error: %s\n" % str(err))
sys.exit(4)
with fp:
fp.write("\n".join(format))
if __name__ == "__main__":
_main()

Lib/tokenize.py

@ -32,6 +32,7 @@ import itertools as _itertools
import re
import sys
from token import *
from token import EXACT_TOKEN_TYPES
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
@ -41,55 +42,6 @@ __all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
"untokenize", "TokenInfo"] "untokenize", "TokenInfo"]
del token
EXACT_TOKEN_TYPES = {
'(': LPAR,
')': RPAR,
'[': LSQB,
']': RSQB,
':': COLON,
',': COMMA,
';': SEMI,
'+': PLUS,
'-': MINUS,
'*': STAR,
'/': SLASH,
'|': VBAR,
'&': AMPER,
'<': LESS,
'>': GREATER,
'=': EQUAL,
'.': DOT,
'%': PERCENT,
'{': LBRACE,
'}': RBRACE,
'==': EQEQUAL,
'!=': NOTEQUAL,
'<=': LESSEQUAL,
'>=': GREATEREQUAL,
'~': TILDE,
'^': CIRCUMFLEX,
'<<': LEFTSHIFT,
'>>': RIGHTSHIFT,
'**': DOUBLESTAR,
'+=': PLUSEQUAL,
'-=': MINEQUAL,
'*=': STAREQUAL,
'/=': SLASHEQUAL,
'%=': PERCENTEQUAL,
'&=': AMPEREQUAL,
'|=': VBAREQUAL,
'^=': CIRCUMFLEXEQUAL,
'<<=': LEFTSHIFTEQUAL,
'>>=': RIGHTSHIFTEQUAL,
'**=': DOUBLESTAREQUAL,
'//': DOUBLESLASH,
'//=': DOUBLESLASHEQUAL,
'...': ELLIPSIS,
'->': RARROW,
'@': AT,
'@=': ATEQUAL,
}
class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
def __repr__(self):
annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
@ -163,17 +115,11 @@ Triple = group(StringPrefix + "'''", StringPrefix + '"""')
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
r"//=?", r"->",
r"[+\-*/%&@|^=<>]=?",
r"~")
Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)
# Sorting in reverse order puts the long operators before their prefixes.
# Otherwise if = came before ==, == would get recognized as two instances
# of =.
Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
Funny = group(r'\r?\n', Special)
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
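
To see why the reverse sort matters, an illustration (not part of the commit) using the same group() helper idea as tokenize.py and the EXACT_TOKEN_TYPES table added to Lib/token.py; it assumes an interpreter that includes this change:

    import re
    from token import EXACT_TOKEN_TYPES

    def group(*choices):
        # same idea as tokenize's group() helper: a parenthesized alternation
        return '(' + '|'.join(choices) + ')'

    longest_first = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
    shortest_first = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES)))

    re.match(longest_first, '**=').group()   # '**='  (long operator wins)
    re.match(shortest_first, '**=').group()  # '*'    (the '*' alternative matches first)
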

Makefile.pre.in

@ -302,6 +302,7 @@ POBJS= \
Parser/metagrammar.o \
Parser/firstsets.o \
Parser/grammar.o \
Parser/token.o \
Parser/pgen.o
PARSER_OBJS= $(POBJS) Parser/myreadline.o Parser/parsetok.o Parser/tokenizer.o
@ -559,7 +560,7 @@ coverage-lcov:
@echo
# Force regeneration of parser and importlib
coverage-report: regen-grammar regen-importlib
coverage-report: regen-grammar regen-token regen-importlib
@ # build with coverage info
$(MAKE) coverage
@ # run tests, ignore failures @ # run tests, ignore failures
@ -741,7 +742,7 @@ regen-importlib: Programs/_freeze_importlib
# Regenerate all generated files
regen-all: regen-opcode regen-opcode-targets regen-typeslots regen-grammar \
regen-ast regen-importlib clinic
regen-token regen-symbol regen-ast regen-importlib clinic
############################################################################
# Special rules for object files
@ -849,6 +850,37 @@ regen-opcode:
$(srcdir)/Include/opcode.h.new
$(UPDATE_FILE) $(srcdir)/Include/opcode.h $(srcdir)/Include/opcode.h.new
.PHONY: regen-token
regen-token:
# Regenerate Doc/library/token-list.inc from Grammar/Tokens
# using Tools/scripts/generate_token.py
$(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py rst \
$(srcdir)/Grammar/Tokens \
$(srcdir)/Doc/library/token-list.inc
# Regenerate Include/token.h from Grammar/Tokens
# using Tools/scripts/generate_token.py
$(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py h \
$(srcdir)/Grammar/Tokens \
$(srcdir)/Include/token.h
# Regenerate Parser/token.c from Grammar/Tokens
# using Tools/scripts/generate_token.py
$(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py c \
$(srcdir)/Grammar/Tokens \
$(srcdir)/Parser/token.c
# Regenerate Lib/token.py from Grammar/Tokens
# using Tools/scripts/generate_token.py
$(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py py \
$(srcdir)/Grammar/Tokens \
$(srcdir)/Lib/token.py
.PHONY: regen-symbol
regen-symbol: $(srcdir)/Include/graminit.h
# Regenerate Lib/symbol.py from Include/graminit.h
# using Tools/scripts/generate_symbol_py.py
$(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_symbol_py.py \
$(srcdir)/Include/graminit.h \
$(srcdir)/Lib/symbol.py
Python/compile.o Python/symtable.o Python/ast_unparse.o Python/ast.o: $(srcdir)/Include/graminit.h $(srcdir)/Include/Python-ast.h
Python/getplatform.o: $(srcdir)/Python/getplatform.c
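
The regen-token and regen-symbol targets stay cheap on an unchanged tree because the generators only rewrite a target whose content would actually change; a rough sketch (mine, not part of the commit) of that pattern, mirroring update_file() in "Tools/scripts/generate_token.py" below:

    def write_if_changed(path, content):
        try:
            with open(path) as f:
                if f.read() == content:
                    return False      # already up to date: no write at all
        except OSError:
            pass                      # missing or unreadable: fall through and write
        with open(path, 'w') as f:
            f.write(content)
        return True
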

Misc/NEWS.d entry (new file)

@ -0,0 +1,2 @@
The C and Python code and the documentation related to tokens are now generated
from a single source file :file:`Grammar/Tokens`.

PCbuild/pythoncore.vcxproj

@ -367,6 +367,7 @@
<ClCompile Include="..\Parser\parser.c" />
<ClCompile Include="..\Parser\parsetok.c" />
<ClCompile Include="..\Parser\tokenizer.c" />
<ClCompile Include="..\Parser\token.c" />
<ClCompile Include="..\PC\invalid_parameter_handler.c" />
<ClCompile Include="..\PC\winreg.c" />
<ClCompile Include="..\PC\config.c" />

PCbuild/pythoncore.vcxproj.filters

@ -866,6 +866,9 @@
<ClCompile Include="..\Parser\tokenizer.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\token.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\PC\winreg.c">
<Filter>PC</Filter>
</ClCompile>

Parser/token.c generated Normal file

@ -0,0 +1,233 @@
/* Auto-generated by Tools/scripts/generate_token.py */
#include "Python.h"
#include "token.h"
/* Token names */
const char * const _PyParser_TokenNames[] = {
"ENDMARKER",
"NAME",
"NUMBER",
"STRING",
"NEWLINE",
"INDENT",
"DEDENT",
"LPAR",
"RPAR",
"LSQB",
"RSQB",
"COLON",
"COMMA",
"SEMI",
"PLUS",
"MINUS",
"STAR",
"SLASH",
"VBAR",
"AMPER",
"LESS",
"GREATER",
"EQUAL",
"DOT",
"PERCENT",
"LBRACE",
"RBRACE",
"EQEQUAL",
"NOTEQUAL",
"LESSEQUAL",
"GREATEREQUAL",
"TILDE",
"CIRCUMFLEX",
"LEFTSHIFT",
"RIGHTSHIFT",
"DOUBLESTAR",
"PLUSEQUAL",
"MINEQUAL",
"STAREQUAL",
"SLASHEQUAL",
"PERCENTEQUAL",
"AMPEREQUAL",
"VBAREQUAL",
"CIRCUMFLEXEQUAL",
"LEFTSHIFTEQUAL",
"RIGHTSHIFTEQUAL",
"DOUBLESTAREQUAL",
"DOUBLESLASH",
"DOUBLESLASHEQUAL",
"AT",
"ATEQUAL",
"RARROW",
"ELLIPSIS",
"OP",
"<ERRORTOKEN>",
"<COMMENT>",
"<NL>",
"<ENCODING>",
"<N_TOKENS>",
};
/* Return the token corresponding to a single character */
int
PyToken_OneChar(int c1)
{
switch (c1) {
case '%': return PERCENT;
case '&': return AMPER;
case '(': return LPAR;
case ')': return RPAR;
case '*': return STAR;
case '+': return PLUS;
case ',': return COMMA;
case '-': return MINUS;
case '.': return DOT;
case '/': return SLASH;
case ':': return COLON;
case ';': return SEMI;
case '<': return LESS;
case '=': return EQUAL;
case '>': return GREATER;
case '@': return AT;
case '[': return LSQB;
case ']': return RSQB;
case '^': return CIRCUMFLEX;
case '{': return LBRACE;
case '|': return VBAR;
case '}': return RBRACE;
case '~': return TILDE;
}
return OP;
}
int
PyToken_TwoChars(int c1, int c2)
{
switch (c1) {
case '!':
switch (c2) {
case '=': return NOTEQUAL;
}
break;
case '%':
switch (c2) {
case '=': return PERCENTEQUAL;
}
break;
case '&':
switch (c2) {
case '=': return AMPEREQUAL;
}
break;
case '*':
switch (c2) {
case '*': return DOUBLESTAR;
case '=': return STAREQUAL;
}
break;
case '+':
switch (c2) {
case '=': return PLUSEQUAL;
}
break;
case '-':
switch (c2) {
case '=': return MINEQUAL;
case '>': return RARROW;
}
break;
case '/':
switch (c2) {
case '/': return DOUBLESLASH;
case '=': return SLASHEQUAL;
}
break;
case '<':
switch (c2) {
case '<': return LEFTSHIFT;
case '=': return LESSEQUAL;
case '>': return NOTEQUAL;
}
break;
case '=':
switch (c2) {
case '=': return EQEQUAL;
}
break;
case '>':
switch (c2) {
case '=': return GREATEREQUAL;
case '>': return RIGHTSHIFT;
}
break;
case '@':
switch (c2) {
case '=': return ATEQUAL;
}
break;
case '^':
switch (c2) {
case '=': return CIRCUMFLEXEQUAL;
}
break;
case '|':
switch (c2) {
case '=': return VBAREQUAL;
}
break;
}
return OP;
}
int
PyToken_ThreeChars(int c1, int c2, int c3)
{
switch (c1) {
case '*':
switch (c2) {
case '*':
switch (c3) {
case '=': return DOUBLESTAREQUAL;
}
break;
}
break;
case '.':
switch (c2) {
case '.':
switch (c3) {
case '.': return ELLIPSIS;
}
break;
}
break;
case '/':
switch (c2) {
case '/':
switch (c3) {
case '=': return DOUBLESLASHEQUAL;
}
break;
}
break;
case '<':
switch (c2) {
case '<':
switch (c3) {
case '=': return LEFTSHIFTEQUAL;
}
break;
}
break;
case '>':
switch (c2) {
case '>':
switch (c3) {
case '=': return RIGHTSHIFTEQUAL;
}
break;
}
break;
}
return OP;
}
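
An illustration (not part of the commit): apart from the legacy '<>' spelling that make_c() adds by hand, these generated lookups agree with token.EXACT_TOKEN_TYPES, so the Python-level equivalent of the exact-match dispatch is simply:

    import token

    def exact_type(op):
        # what PyToken_OneChar/TwoChars/ThreeChars return for an exact operator match
        return token.EXACT_TOKEN_TYPES.get(op, token.OP)

    exact_type('->') == token.RARROW            # True
    exact_type('**=') == token.DOUBLESTAREQUAL  # True
    exact_type('$') == token.OP                 # True (no exact token: falls back to OP)
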

Parser/tokenizer.c

@ -48,72 +48,6 @@ static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
/* Token names */
const char *_PyParser_TokenNames[] = {
"ENDMARKER",
"NAME",
"NUMBER",
"STRING",
"NEWLINE",
"INDENT",
"DEDENT",
"LPAR",
"RPAR",
"LSQB",
"RSQB",
"COLON",
"COMMA",
"SEMI",
"PLUS",
"MINUS",
"STAR",
"SLASH",
"VBAR",
"AMPER",
"LESS",
"GREATER",
"EQUAL",
"DOT",
"PERCENT",
"LBRACE",
"RBRACE",
"EQEQUAL",
"NOTEQUAL",
"LESSEQUAL",
"GREATEREQUAL",
"TILDE",
"CIRCUMFLEX",
"LEFTSHIFT",
"RIGHTSHIFT",
"DOUBLESTAR",
"PLUSEQUAL",
"MINEQUAL",
"STAREQUAL",
"SLASHEQUAL",
"PERCENTEQUAL",
"AMPEREQUAL",
"VBAREQUAL",
"CIRCUMFLEXEQUAL",
"LEFTSHIFTEQUAL",
"RIGHTSHIFTEQUAL",
"DOUBLESTAREQUAL",
"DOUBLESLASH",
"DOUBLESLASHEQUAL",
"AT",
"ATEQUAL",
"RARROW",
"ELLIPSIS",
/* This table must match the #defines in token.h! */
"OP",
"<ERRORTOKEN>",
"COMMENT",
"NL",
"ENCODING",
"<N_TOKENS>"
};
/* Create and initialize a new tok_state structure */
static struct tok_state *
@ -1114,177 +1048,6 @@ tok_backup(struct tok_state *tok, int c)
}
/* Return the token corresponding to a single character */
int
PyToken_OneChar(int c)
{
switch (c) {
case '(': return LPAR;
case ')': return RPAR;
case '[': return LSQB;
case ']': return RSQB;
case ':': return COLON;
case ',': return COMMA;
case ';': return SEMI;
case '+': return PLUS;
case '-': return MINUS;
case '*': return STAR;
case '/': return SLASH;
case '|': return VBAR;
case '&': return AMPER;
case '<': return LESS;
case '>': return GREATER;
case '=': return EQUAL;
case '.': return DOT;
case '%': return PERCENT;
case '{': return LBRACE;
case '}': return RBRACE;
case '^': return CIRCUMFLEX;
case '~': return TILDE;
case '@': return AT;
default: return OP;
}
}
int
PyToken_TwoChars(int c1, int c2)
{
switch (c1) {
case '=':
switch (c2) {
case '=': return EQEQUAL;
}
break;
case '!':
switch (c2) {
case '=': return NOTEQUAL;
}
break;
case '<':
switch (c2) {
case '>': return NOTEQUAL;
case '=': return LESSEQUAL;
case '<': return LEFTSHIFT;
}
break;
case '>':
switch (c2) {
case '=': return GREATEREQUAL;
case '>': return RIGHTSHIFT;
}
break;
case '+':
switch (c2) {
case '=': return PLUSEQUAL;
}
break;
case '-':
switch (c2) {
case '=': return MINEQUAL;
case '>': return RARROW;
}
break;
case '*':
switch (c2) {
case '*': return DOUBLESTAR;
case '=': return STAREQUAL;
}
break;
case '/':
switch (c2) {
case '/': return DOUBLESLASH;
case '=': return SLASHEQUAL;
}
break;
case '|':
switch (c2) {
case '=': return VBAREQUAL;
}
break;
case '%':
switch (c2) {
case '=': return PERCENTEQUAL;
}
break;
case '&':
switch (c2) {
case '=': return AMPEREQUAL;
}
break;
case '^':
switch (c2) {
case '=': return CIRCUMFLEXEQUAL;
}
break;
case '@':
switch (c2) {
case '=': return ATEQUAL;
}
break;
}
return OP;
}
int
PyToken_ThreeChars(int c1, int c2, int c3)
{
switch (c1) {
case '<':
switch (c2) {
case '<':
switch (c3) {
case '=':
return LEFTSHIFTEQUAL;
}
break;
}
break;
case '>':
switch (c2) {
case '>':
switch (c3) {
case '=':
return RIGHTSHIFTEQUAL;
}
break;
}
break;
case '*':
switch (c2) {
case '*':
switch (c3) {
case '=':
return DOUBLESTAREQUAL;
}
break;
}
break;
case '/':
switch (c2) {
case '/':
switch (c3) {
case '=':
return DOUBLESLASHEQUAL;
}
break;
}
break;
case '.':
switch (c2) {
case '.':
switch (c3) {
case '.':
return ELLIPSIS;
}
break;
}
break;
}
return OP;
}
static int
syntaxerror(struct tok_state *tok, const char *format, ...)
{

Tools/scripts/generate_symbol_py.py (new file)

@ -0,0 +1,53 @@
#! /usr/bin/env python3
# This script generates the symbol.py source file.
import sys
import re
def main(inFileName="Include/graminit.h", outFileName="Lib/symbol.py"):
try:
fp = open(inFileName)
except OSError as err:
sys.stderr.write("I/O error: %s\n" % str(err))
sys.exit(1)
with fp:
lines = fp.read().split("\n")
prog = re.compile(
"#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
re.IGNORECASE)
tokens = {}
for line in lines:
match = prog.match(line)
if match:
name, val = match.group(1, 2)
val = int(val)
tokens[val] = name # reverse so we can sort them...
keys = sorted(tokens.keys())
# load the output skeleton from the target:
try:
fp = open(outFileName)
except OSError as err:
sys.stderr.write("I/O error: %s\n" % str(err))
sys.exit(2)
with fp:
format = fp.read().split("\n")
try:
start = format.index("#--start constants--") + 1
end = format.index("#--end constants--")
except ValueError:
sys.stderr.write("target does not contain format markers")
sys.exit(3)
lines = []
for val in keys:
lines.append("%s = %d" % (tokens[val], val))
format[start:end] = lines
try:
fp = open(outFileName, 'w')
except OSError as err:
sys.stderr.write("I/O error: %s\n" % str(err))
sys.exit(4)
with fp:
fp.write("\n".join(format))
if __name__ == '__main__':
main(*sys.argv[1:])
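
For context, an example (not part of the commit) of using the regenerated "Lib/symbol.py"; the module's public surface is unchanged:

    import symbol, token

    symbol.single_input        # 256 (non-terminal numbers start at token.NT_OFFSET)
    symbol.sym_name[256]       # 'single_input'
    token.ISNONTERMINAL(symbol.single_input)   # True
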

Tools/scripts/generate_token.py (new file)

@ -0,0 +1,268 @@
#! /usr/bin/env python3
# This script generates token related files from Grammar/Tokens:
#
# Doc/library/token-list.inc
# Include/token.h
# Parser/token.c
# Lib/token.py
NT_OFFSET = 256
def load_tokens(path):
tok_names = []
string_to_tok = {}
ERRORTOKEN = None
with open(path) as fp:
for line in fp:
line = line.strip()
# strip comments
i = line.find('#')
if i >= 0:
line = line[:i].strip()
if not line:
continue
fields = line.split()
name = fields[0]
value = len(tok_names)
if name == 'ERRORTOKEN':
ERRORTOKEN = value
string = fields[1] if len(fields) > 1 else None
if string:
string = eval(string)
string_to_tok[string] = value
tok_names.append(name)
return tok_names, ERRORTOKEN, string_to_tok
def update_file(file, content):
try:
with open(file, 'r') as fobj:
if fobj.read() == content:
return False
except (OSError, ValueError):
pass
with open(file, 'w') as fobj:
fobj.write(content)
return True
token_h_template = """\
/* Auto-generated by Tools/scripts/generate_token.py */
/* Token types */
#ifndef Py_LIMITED_API
#ifndef Py_TOKEN_H
#define Py_TOKEN_H
#ifdef __cplusplus
extern "C" {
#endif
#undef TILDE /* Prevent clash of our definition with system macro. Ex AIX, ioctl.h */
%s\
#define N_TOKENS %d
#define NT_OFFSET %d
/* Special definitions for cooperation with parser */
#define ISTERMINAL(x) ((x) < NT_OFFSET)
#define ISNONTERMINAL(x) ((x) >= NT_OFFSET)
#define ISEOF(x) ((x) == ENDMARKER)
PyAPI_DATA(const char * const) _PyParser_TokenNames[]; /* Token names */
PyAPI_FUNC(int) PyToken_OneChar(int);
PyAPI_FUNC(int) PyToken_TwoChars(int, int);
PyAPI_FUNC(int) PyToken_ThreeChars(int, int, int);
#ifdef __cplusplus
}
#endif
#endif /* !Py_TOKEN_H */
#endif /* Py_LIMITED_API */
"""
def make_h(infile, outfile='Include/token.h'):
tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
defines = []
for value, name in enumerate(tok_names[:ERRORTOKEN + 1]):
defines.append("#define %-15s %d\n" % (name, value))
if update_file(outfile, token_h_template % (
''.join(defines),
len(tok_names),
NT_OFFSET
)):
print("%s regenerated from %s" % (outfile, infile))
token_c_template = """\
/* Auto-generated by Tools/scripts/generate_token.py */
#include "Python.h"
#include "token.h"
/* Token names */
const char * const _PyParser_TokenNames[] = {
%s\
};
/* Return the token corresponding to a single character */
int
PyToken_OneChar(int c1)
{
%s\
return OP;
}
int
PyToken_TwoChars(int c1, int c2)
{
%s\
return OP;
}
int
PyToken_ThreeChars(int c1, int c2, int c3)
{
%s\
return OP;
}
"""
def generate_chars_to_token(mapping, n=1):
result = []
write = result.append
indent = ' ' * n
write(indent)
write('switch (c%d) {\n' % (n,))
for c in sorted(mapping):
write(indent)
value = mapping[c]
if isinstance(value, dict):
write("case '%s':\n" % (c,))
write(generate_chars_to_token(value, n + 1))
write(indent)
write(' break;\n')
else:
write("case '%s': return %s;\n" % (c, value))
write(indent)
write('}\n')
return ''.join(result)
def make_c(infile, outfile='Parser/token.c'):
tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
string_to_tok['<>'] = string_to_tok['!=']
chars_to_token = {}
for string, value in string_to_tok.items():
assert 1 <= len(string) <= 3
name = tok_names[value]
m = chars_to_token.setdefault(len(string), {})
for c in string[:-1]:
m = m.setdefault(c, {})
m[string[-1]] = name
names = []
for value, name in enumerate(tok_names):
if value >= ERRORTOKEN:
name = '<%s>' % name
names.append(' "%s",\n' % name)
names.append(' "<N_TOKENS>",\n')
if update_file(outfile, token_c_template % (
''.join(names),
generate_chars_to_token(chars_to_token[1]),
generate_chars_to_token(chars_to_token[2]),
generate_chars_to_token(chars_to_token[3])
)):
print("%s regenerated from %s" % (outfile, infile))
token_inc_template = """\
.. Auto-generated by Tools/scripts/generate_token.py
%s
.. data:: N_TOKENS
.. data:: NT_OFFSET
"""
def make_rst(infile, outfile='Doc/library/token-list.inc'):
tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
tok_to_string = {value: s for s, value in string_to_tok.items()}
names = []
for value, name in enumerate(tok_names[:ERRORTOKEN + 1]):
names.append('.. data:: %s' % (name,))
if value in tok_to_string:
names.append('')
names.append(' Token value for ``"%s"``.' % tok_to_string[value])
names.append('')
if update_file(outfile, token_inc_template % '\n'.join(names)):
print("%s regenerated from %s" % (outfile, infile))
token_py_template = '''\
"""Token constants."""
# Auto-generated by Tools/scripts/generate_token.py
__all__ = ['tok_name', 'ISTERMINAL', 'ISNONTERMINAL', 'ISEOF']
%s
N_TOKENS = %d
# Special definitions for cooperation with parser
NT_OFFSET = %d
tok_name = {value: name
for name, value in globals().items()
if isinstance(value, int) and not name.startswith('_')}
__all__.extend(tok_name.values())
EXACT_TOKEN_TYPES = {
%s
}
def ISTERMINAL(x):
return x < NT_OFFSET
def ISNONTERMINAL(x):
return x >= NT_OFFSET
def ISEOF(x):
return x == ENDMARKER
'''
def make_py(infile, outfile='Lib/token.py'):
tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
constants = []
for value, name in enumerate(tok_names):
constants.append('%s = %d' % (name, value))
constants.insert(ERRORTOKEN,
"# These aren't used by the C tokenizer but are needed for tokenize.py")
token_types = []
for s, value in sorted(string_to_tok.items()):
token_types.append(' %r: %s,' % (s, tok_names[value]))
if update_file(outfile, token_py_template % (
'\n'.join(constants),
len(tok_names),
NT_OFFSET,
'\n'.join(token_types),
)):
print("%s regenerated from %s" % (outfile, infile))
def main(op, infile='Grammar/Tokens', *args):
make = globals()['make_' + op]
make(infile, *args)
if __name__ == '__main__':
import sys
main(*sys.argv[1:])
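
A usage sketch (not part of the commit; the sys.path tweak is an assumption, since the script is a plain file rather than an installed module): from the top of a CPython checkout, the same steps "make regen-token" runs can be driven directly:

    import sys
    sys.path.insert(0, 'Tools/scripts')
    import generate_token

    names, errortoken, strings = generate_token.load_tokens('Grammar/Tokens')
    names[0], names[errortoken]   # ('ENDMARKER', 'ERRORTOKEN')
    names[strings['<<=']]         # 'LEFTSHIFTEQUAL'

    # Equivalent to the four steps of "make regen-token":
    for fmt in ('rst', 'h', 'c', 'py'):
        generate_token.main(fmt, 'Grammar/Tokens')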