cpython/Tools/scripts/generate_token.py

#! /usr/bin/env python3
# This script generates token related files from Grammar/Tokens:
#
#   Doc/library/token-list.inc
#   Include/token.h
#   Parser/token.c
#   Lib/token.py


# Value emitted as NT_OFFSET into the generated token.h and token.py; the
# generated ISTERMINAL()/ISNONTERMINAL() helpers compare token values against it.
NT_OFFSET = 256

def load_tokens(path):
    """Parse a token definition file such as Grammar/Tokens.

    Each non-blank line has the form ``NAME`` or ``NAME 'literal'``.  Token
    values are assigned sequentially, starting at 0, in file order.

    NOTE: everything from the first ``#`` on a line is discarded as a
    comment, so a quoted token literal cannot itself contain ``#``.

    Args:
        path: path of the token definition file to read.

    Returns:
        A ``(tok_names, ERRORTOKEN, string_to_tok)`` tuple where
        ``tok_names`` is the list of token names indexed by token value,
        ``ERRORTOKEN`` is the value of the token named ``ERRORTOKEN`` (or
        ``None`` if the file does not define it), and ``string_to_tok`` maps
        each token's literal string to its value.

    Raises:
        OSError: if the file cannot be opened.
        ValueError or SyntaxError: if the second field of a line is not a
            valid Python string literal.
    """
    # Function-scope import keeps this block self-contained.
    import ast

    tok_names = []
    string_to_tok = {}
    ERRORTOKEN = None
    with open(path) as fp:
        for lineno, raw_line in enumerate(fp, 1):
            # Strip the comment (if any) and surrounding whitespace.
            line = raw_line.partition('#')[0].strip()
            if not line:
                continue
            fields = line.split()
            name = fields[0]
            value = len(tok_names)
            if name == 'ERRORTOKEN':
                ERRORTOKEN = value
            if len(fields) > 1:
                # literal_eval only accepts literals, so arbitrary
                # expressions in the data file are rejected, not executed.
                string = ast.literal_eval(fields[1])
                if not isinstance(string, str):
                    raise ValueError(
                        '%s:%d: token string for %s must be a string '
                        'literal, got %r' % (path, lineno, name, fields[1]))
                string_to_tok[string] = value
            tok_names.append(name)
    return tok_names, ERRORTOKEN, string_to_tok


def update_file(file, content):
    """Write *content* to *file* unless the file already holds exactly that.

    Args:
        file: path of the file to (re)generate.
        content: the full text the file should contain.

    Returns:
        False if the existing file already matched *content* and was left
        untouched, True if the file was written.
    """
    try:
        with open(file, 'r') as existing:
            up_to_date = existing.read() == content
    except (OSError, ValueError):
        # Missing or unreadable file: fall through and rewrite it.
        up_to_date = False

    if up_to_date:
        return False

    with open(file, 'w') as target:
        target.write(content)
    return True


# %-format template for the generated C header.  make_h() fills the three
# placeholders, in order, with: the "#define NAME value" lines (%s), the
# total number of tokens (%d) and NT_OFFSET (%d).  The backslash after "%s"
# joins it to the next line, so the defines must end with their own newline.
token_h_template = """\
/* Auto-generated by Tools/scripts/generate_token.py */

/* Token types */
#ifndef Py_LIMITED_API
#ifndef Py_TOKEN_H
#define Py_TOKEN_H
#ifdef __cplusplus
extern "C" {
#endif

#undef TILDE   /* Prevent clash of our definition with system macro. Ex AIX, ioctl.h */

%s\
#define N_TOKENS        %d
#define NT_OFFSET       %d

/* Special definitions for cooperation with parser */

#define ISTERMINAL(x)           ((x) < NT_OFFSET)
#define ISNONTERMINAL(x)        ((x) >= NT_OFFSET)
#define ISEOF(x)                ((x) == ENDMARKER)
#define ISWHITESPACE(x)         ((x) == ENDMARKER || \\
                                 (x) == NEWLINE   || \\
                                 (x) == INDENT    || \\
                                 (x) == DEDENT)


PyAPI_DATA(const char * const) _PyParser_TokenNames[]; /* Token names */
PyAPI_FUNC(int) PyToken_OneChar(int);
PyAPI_FUNC(int) PyToken_TwoChars(int, int);
PyAPI_FUNC(int) PyToken_ThreeChars(int, int, int);

#ifdef __cplusplus
}
#endif
#endif /* !Py_TOKEN_H */
#endif /* Py_LIMITED_API */
"""

def make_h(infile, outfile='Include/token.h'):
    """Generate the C header with the token ``#define`` lines.

    Args:
        infile: path of the token definition file to read.
        outfile: path of the header to write.

    The file is rewritten only if its content changes; a message is printed
    when it is.
    """
    tok_names, ERRORTOKEN, _ = load_tokens(infile)

    # Only tokens up to and including ERRORTOKEN get a #define, while
    # N_TOKENS still counts every token in the file.
    defines = ''.join(
        "#define %-15s %d\n" % (name, value)
        for value, name in enumerate(tok_names[:ERRORTOKEN + 1])
    )
    header = token_h_template % (defines, len(tok_names), NT_OFFSET)

    if update_file(outfile, header):
        print("%s regenerated from %s" % (outfile, infile))


# %-format template for the generated C source.  make_c() fills the four %s
# placeholders, in order, with: the token-name array entries and the switch
# statements for one-, two- and three-character tokens.  Each "%s\" is joined
# to the following line, so every substituted chunk must end with a newline.
token_c_template = """\
/* Auto-generated by Tools/scripts/generate_token.py */

#include "Python.h"
#include "token.h"

/* Token names */

const char * const _PyParser_TokenNames[] = {
%s\
};

/* Return the token corresponding to a single character */

int
PyToken_OneChar(int c1)
{
%s\
    return OP;
}

int
PyToken_TwoChars(int c1, int c2)
{
%s\
    return OP;
}

int
PyToken_ThreeChars(int c1, int c2, int c3)
{
%s\
    return OP;
}
"""

def generate_chars_to_token(mapping, n=1):
    """Render a nested character mapping as C ``switch`` statements.

    Args:
        mapping: dict keyed by single characters.  A value is either a token
            name (emitted as ``return NAME;``) or another dict of the same
            shape describing the next character.
        n: 1-based position of the character being switched on; it selects
            the C variable (``c1``, ``c2``, ...) and the indentation depth.

    Returns:
        The C source text of the switch statement, ending with a newline.
    """
    pad = '    ' * n
    chunks = [pad + 'switch (c%d) {\n' % (n,)]
    # Cases are emitted in sorted character order.
    for char in sorted(mapping):
        target = mapping[char]
        if not isinstance(target, dict):
            chunks.append(pad + "case '%s': return %s;\n" % (char, target))
            continue
        # Longer token: dispatch on the next character in a nested switch.
        chunks.append(pad + "case '%s':\n" % (char,))
        chunks.append(generate_chars_to_token(target, n + 1))
        chunks.append(pad + '    break;\n')
    chunks.append(pad + '}\n')
    return ''.join(chunks)

def make_c(infile, outfile='Parser/token.c'):
    """Generate the C source with token names and character lookups.

    Args:
        infile: path of the token definition file to read.
        outfile: path of the C file to write.

    The file is rewritten only if its content changes; a message is printed
    when it is.
    """
    tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
    # '<>' is registered as an alias producing the same token as '!='.
    string_to_tok['<>'] = string_to_tok['!=']

    # Build one trie per token length: chars_to_token[length] maps the first
    # character to nested dicts, with the token name stored at the last one.
    chars_to_token = {}
    for op_string, tok_value in string_to_tok.items():
        assert 1 <= len(op_string) <= 3
        tok_name = tok_names[tok_value]
        node = chars_to_token.setdefault(len(op_string), {})
        for ch in op_string[:-1]:
            node = node.setdefault(ch, {})
        node[op_string[-1]] = tok_name

    # Names from ERRORTOKEN onwards are wrapped in angle brackets.
    names = [
        '    "%s",\n' % (name if value < ERRORTOKEN else '<%s>' % name)
        for value, name in enumerate(tok_names)
    ]
    names.append('    "<N_TOKENS>",\n')

    source = token_c_template % (
        ''.join(names),
        generate_chars_to_token(chars_to_token[1]),
        generate_chars_to_token(chars_to_token[2]),
        generate_chars_to_token(chars_to_token[3]),
    )
    if update_file(outfile, source):
        print("%s regenerated from %s" % (outfile, infile))


# %-format template for the generated reST include file.  make_rst() fills
# the single %s with the ".. data::" entries, one per documented token.
token_inc_template = """\
.. Auto-generated by Tools/scripts/generate_token.py
%s
.. data:: N_TOKENS

.. data:: NT_OFFSET
"""

def make_rst(infile, outfile='Doc/library/token-list.inc'):
    """Generate the reST include file listing the token constants.

    Args:
        infile: path of the token definition file to read.
        outfile: path of the ``.inc`` file to write.

    The file is rewritten only if its content changes; a message is printed
    when it is.
    """
    tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
    tok_to_string = {tok: text for text, tok in string_to_tok.items()}

    lines = []
    # Only tokens up to and including ERRORTOKEN are listed.
    for tok, name in enumerate(tok_names[:ERRORTOKEN + 1]):
        entry = ['.. data:: %s' % (name,)]
        if tok in tok_to_string:
            # Tokens with a literal string get a one-line description.
            entry += ['', '   Token value for ``"%s"``.' % tok_to_string[tok]]
        entry.append('')
        lines.extend(entry)

    document = token_inc_template % '\n'.join(lines)
    if update_file(outfile, document):
        print("%s regenerated from %s" % (outfile, infile))


# %-format template for the generated Python module.  make_py() fills the
# placeholders, in order, with: the "NAME = value" constant lines (%s), the
# total number of tokens (%d), NT_OFFSET (%d) and the entries of the
# EXACT_TOKEN_TYPES dict (%s).
token_py_template = '''\
"""Token constants."""
# Auto-generated by Tools/scripts/generate_token.py

__all__ = ['tok_name', 'ISTERMINAL', 'ISNONTERMINAL', 'ISEOF']

%s
N_TOKENS = %d
# Special definitions for cooperation with parser
NT_OFFSET = %d

tok_name = {value: name
            for name, value in globals().items()
            if isinstance(value, int) and not name.startswith('_')}
__all__.extend(tok_name.values())

EXACT_TOKEN_TYPES = {
%s
}

def ISTERMINAL(x):
    return x < NT_OFFSET

def ISNONTERMINAL(x):
    return x >= NT_OFFSET

def ISEOF(x):
    return x == ENDMARKER
'''

def make_py(infile, outfile='Lib/token.py'):
    """Generate the Python module with the token constants.

    Args:
        infile: path of the token definition file to read.
        outfile: path of the Python module to write.

    The file is rewritten only if its content changes; a message is printed
    when it is.
    """
    tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)

    constants = ['%s = %d' % (name, value)
                 for value, name in enumerate(tok_names)]
    # The explanatory comment goes immediately before the ERRORTOKEN line.
    constants.insert(ERRORTOKEN,
        "# These aren't used by the C tokenizer but are needed for tokenize.py")

    # EXACT_TOKEN_TYPES entries, ordered by the token's literal string.
    token_types = ['    %r: %s,' % (text, tok_names[tok])
                   for text, tok in sorted(string_to_tok.items())]

    module_source = token_py_template % (
        '\n'.join(constants),
        len(tok_names),
        NT_OFFSET,
        '\n'.join(token_types),
    )
    if update_file(outfile, module_source):
        print("%s regenerated from %s" % (outfile, infile))


def main(op, infile='Grammar/Tokens', *args):
    """Run the generator selected by *op*.

    Args:
        op: suffix of the module-level ``make_<op>`` function to call.
        infile: path of the token definition file passed to the generator.
        *args: extra positional arguments forwarded to the generator.

    Raises:
        KeyError: if no module-level name ``make_<op>`` exists.
    """
    generator_name = 'make_' + op
    generator = globals()[generator_name]
    generator(infile, *args)


# Command-line entry point: every argument after the script name is passed
# positionally to main(), i.e. <op> [infile [extra args for make_<op>]].
if __name__ == '__main__':
    import sys
    main(*sys.argv[1:])
bpo-30455: Generate all token-related code and docs from Grammar/Tokens. (GH-10370) "Include/token.h", "Lib/token.py" (now containing some data moved from "Lib/tokenize.py") and new files "Parser/token.c" (containing the code moved from "Parser/tokenizer.c") and "Doc/library/token-list.inc" (included in "Doc/library/token.rst") are now generated from "Grammar/Tokens" by "Tools/scripts/generate_token.py". The script overwrites files only if needed and can be used on the read-only source tree. "Lib/symbol.py" is now generated by "Tools/scripts/generate_symbol_py.py" instead of being executable itself. Added new make targets "regen-token" and "regen-symbol" which are now dependencies of "regen-all". The documentation now contains strings for operators and punctuation tokens. 2018-12-22 05:18:40 -04:00			`#! /usr/bin/env python3`
			`# This script generates token related files from Grammar/Tokens:`
			`#`
			`# Doc/library/token-list.inc`
			`# Include/token.h`
			`# Parser/token.c`
			`# Lib/token.py`


			`NT_OFFSET = 256`

			`def load_tokens(path):`
			`tok_names = []`
			`string_to_tok = {}`
			`ERRORTOKEN = None`
			`with open(path) as fp:`
			`for line in fp:`
			`line = line.strip()`
			`# strip comments`
			`i = line.find('#')`
			`if i >= 0:`
			`line = line[:i].strip()`
			`if not line:`
			`continue`
			`fields = line.split()`
			`name = fields[0]`
			`value = len(tok_names)`
			`if name == 'ERRORTOKEN':`
			`ERRORTOKEN = value`
			`string = fields[1] if len(fields) > 1 else None`
			`if string:`
			`string = eval(string)`
			`string_to_tok[string] = value`
			`tok_names.append(name)`
			`return tok_names, ERRORTOKEN, string_to_tok`


			`def update_file(file, content):`
			`try:`
			`with open(file, 'r') as fobj:`
			`if fobj.read() == content:`
			`return False`
			`except (OSError, ValueError):`
			`pass`
			`with open(file, 'w') as fobj:`
			`fobj.write(content)`
			`return True`


			`token_h_template = """\`
			`/* Auto-generated by Tools/scripts/generate_token.py */`

			`/* Token types */`
			`#ifndef Py_LIMITED_API`
			`#ifndef Py_TOKEN_H`
			`#define Py_TOKEN_H`
			`#ifdef __cplusplus`
			`extern "C" {`
			`#endif`

			`#undef TILDE /* Prevent clash of our definition with system macro. Ex AIX, ioctl.h */`

			`%s\`
			`#define N_TOKENS %d`
			`#define NT_OFFSET %d`

			`/* Special definitions for cooperation with parser */`

			`#define ISTERMINAL(x) ((x) < NT_OFFSET)`
			`#define ISNONTERMINAL(x) ((x) >= NT_OFFSET)`
			`#define ISEOF(x) ((x) == ENDMARKER)`
bpo-40267: Fix message when last input character produces a SyntaxError (GH-19521) When there is a SyntaxError after reading the last input character from the tokenizer and if no newline follows it, the error message used to be `unexpected EOF while parsing`, which is wrong. 2020-04-15 15:22:10 -03:00			`#define ISWHITESPACE(x) ((x) == ENDMARKER \|\| \\`
			`(x) == NEWLINE \|\| \\`
			`(x) == INDENT \|\| \\`
			`(x) == DEDENT)`
bpo-30455: Generate all token-related code and docs from Grammar/Tokens. (GH-10370) "Include/token.h", "Lib/token.py" (now containing some data moved from "Lib/tokenize.py") and new files "Parser/token.c" (containing the code moved from "Parser/tokenizer.c") and "Doc/library/token-list.inc" (included in "Doc/library/token.rst") are now generated from "Grammar/Tokens" by "Tools/scripts/generate_token.py". The script overwrites files only if needed and can be used on the read-only source tree. "Lib/symbol.py" is now generated by "Tools/scripts/generate_symbol_py.py" instead of being executable itself. Added new make targets "regen-token" and "regen-symbol" which are now dependencies of "regen-all". The documentation now contains strings for operators and punctuation tokens. 2018-12-22 05:18:40 -04:00

			`PyAPI_DATA(const char * const) _PyParser_TokenNames[]; /* Token names */`
			`PyAPI_FUNC(int) PyToken_OneChar(int);`
			`PyAPI_FUNC(int) PyToken_TwoChars(int, int);`
			`PyAPI_FUNC(int) PyToken_ThreeChars(int, int, int);`

			`#ifdef __cplusplus`
			`}`
			`#endif`
			`#endif /* !Py_TOKEN_H */`
			`#endif /* Py_LIMITED_API */`
			`"""`

			`def make_h(infile, outfile='Include/token.h'):`
			`tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)`

			`defines = []`
			`for value, name in enumerate(tok_names[:ERRORTOKEN + 1]):`
			`defines.append("#define %-15s %d\n" % (name, value))`

			`if update_file(outfile, token_h_template % (`
			`''.join(defines),`
			`len(tok_names),`
			`NT_OFFSET`
			`)):`
			`print("%s regenerated from %s" % (outfile, infile))`


			`token_c_template = """\`
			`/* Auto-generated by Tools/scripts/generate_token.py */`

			`#include "Python.h"`
			`#include "token.h"`

			`/* Token names */`

			`const char * const _PyParser_TokenNames[] = {`
			`%s\`
			`};`

			`/* Return the token corresponding to a single character */`

			`int`
			`PyToken_OneChar(int c1)`
			`{`
			`%s\`
			`return OP;`
			`}`

			`int`
			`PyToken_TwoChars(int c1, int c2)`
			`{`
			`%s\`
			`return OP;`
			`}`

			`int`
			`PyToken_ThreeChars(int c1, int c2, int c3)`
			`{`
			`%s\`
			`return OP;`
			`}`
			`"""`

			`def generate_chars_to_token(mapping, n=1):`
			`result = []`
			`write = result.append`
			`indent = ' ' * n`
			`write(indent)`
			`write('switch (c%d) {\n' % (n,))`
			`for c in sorted(mapping):`
			`write(indent)`
			`value = mapping[c]`
			`if isinstance(value, dict):`
			`write("case '%s':\n" % (c,))`
			`write(generate_chars_to_token(value, n + 1))`
			`write(indent)`
			`write(' break;\n')`
			`else:`
			`write("case '%s': return %s;\n" % (c, value))`
			`write(indent)`
			`write('}\n')`
			`return ''.join(result)`

			`def make_c(infile, outfile='Parser/token.c'):`
			`tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)`
			`string_to_tok['<>'] = string_to_tok['!=']`
			`chars_to_token = {}`
			`for string, value in string_to_tok.items():`
			`assert 1 <= len(string) <= 3`
			`name = tok_names[value]`
			`m = chars_to_token.setdefault(len(string), {})`
			`for c in string[:-1]:`
			`m = m.setdefault(c, {})`
			`m[string[-1]] = name`

			`names = []`
			`for value, name in enumerate(tok_names):`
			`if value >= ERRORTOKEN:`
			`name = '<%s>' % name`
			`names.append(' "%s",\n' % name)`
			`names.append(' "<N_TOKENS>",\n')`

			`if update_file(outfile, token_c_template % (`
			`''.join(names),`
			`generate_chars_to_token(chars_to_token[1]),`
			`generate_chars_to_token(chars_to_token[2]),`
			`generate_chars_to_token(chars_to_token[3])`
			`)):`
			`print("%s regenerated from %s" % (outfile, infile))`


			`token_inc_template = """\`
			`.. Auto-generated by Tools/scripts/generate_token.py`
			`%s`
			`.. data:: N_TOKENS`

			`.. data:: NT_OFFSET`
			`"""`

			`def make_rst(infile, outfile='Doc/library/token-list.inc'):`
			`tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)`
			`tok_to_string = {value: s for s, value in string_to_tok.items()}`

			`names = []`
			`for value, name in enumerate(tok_names[:ERRORTOKEN + 1]):`
			`names.append('.. data:: %s' % (name,))`
			`if value in tok_to_string:`
			`names.append('')`
			names.append(' Token value for ``"%s"``.' % tok_to_string[value])
			`names.append('')`

			`if update_file(outfile, token_inc_template % '\n'.join(names)):`
			`print("%s regenerated from %s" % (outfile, infile))`


			`token_py_template = '''\`
			`"""Token constants."""`
			`# Auto-generated by Tools/scripts/generate_token.py`

			`__all__ = ['tok_name', 'ISTERMINAL', 'ISNONTERMINAL', 'ISEOF']`

			`%s`
			`N_TOKENS = %d`
			`# Special definitions for cooperation with parser`
			`NT_OFFSET = %d`

			`tok_name = {value: name`
			`for name, value in globals().items()`
			`if isinstance(value, int) and not name.startswith('_')}`
			`__all__.extend(tok_name.values())`

			`EXACT_TOKEN_TYPES = {`
			`%s`
			`}`

			`def ISTERMINAL(x):`
			`return x < NT_OFFSET`

			`def ISNONTERMINAL(x):`
			`return x >= NT_OFFSET`

			`def ISEOF(x):`
			`return x == ENDMARKER`
			`'''`

			`def make_py(infile, outfile='Lib/token.py'):`
			`tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)`

			`constants = []`
			`for value, name in enumerate(tok_names):`
			`constants.append('%s = %d' % (name, value))`
			`constants.insert(ERRORTOKEN,`
			`"# These aren't used by the C tokenizer but are needed for tokenize.py")`

			`token_types = []`
			`for s, value in sorted(string_to_tok.items()):`
			`token_types.append(' %r: %s,' % (s, tok_names[value]))`

			`if update_file(outfile, token_py_template % (`
			`'\n'.join(constants),`
			`len(tok_names),`
			`NT_OFFSET,`
			`'\n'.join(token_types),`
			`)):`
			`print("%s regenerated from %s" % (outfile, infile))`


			`def main(op, infile='Grammar/Tokens', *args):`
			`make = globals()['make_' + op]`
			`make(infile, *args)`


			`if __name__ == '__main__':`
			`import sys`
			`main(*sys.argv[1:])`