mirror of https://github.com/python/cpython
gh-102856: Python tokenizer implementation for PEP 701 (#104323)
This commit replaces the Python implementation of the tokenize module with one that reuses the real C tokenizer via a private extension module. The tokenize module now implements a compatibility layer that transforms tokens from the C tokenizer into Python tokenize tokens for backward compatibility.

As the C tokenizer does not emit some tokens that the Python tokenizer provides (such as comments and non-semantic newlines), a new special mode has been added to the C tokenizer that is currently only used via the extension module that exposes it to the Python layer. This new mode forces the C tokenizer to emit these extra tokens and to add the metadata needed to match the old Python implementation.

Co-authored-by: Pablo Galindo <pablogsal@gmail.com>
parent 3ed57e4995
commit 6715f91edc
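A quick illustrative sketch (not part of this commit's diff): the public generate_tokens() API keeps its interface while now being backed by the C tokenizer, and the private _generate_tokens_from_c_tokenizer() helper gains an extra_tokens flag that asks the C tokenizer to also emit COMMENT and NL tokens. The snippet only assumes the standard tokenize module as changed by this commit.

    import io
    import tokenize

    source = "x = 1  # a comment\n"

    # Public API: same interface as before, now driven by the C tokenizer.
    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        print(tokenize.tok_name[tok.type], repr(tok.string))

    # Private helper touched by this commit: tokenize a str directly;
    # extra_tokens=True requests the COMMENT/NL tokens the C tokenizer
    # normally does not emit.
    for tok in tokenize._generate_tokens_from_c_tokenizer(source, extra_tokens=True):
        print(tok)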
@@ -223,6 +223,10 @@

.. data:: FSTRING_END

.. data:: COMMENT

.. data:: NL

.. data:: ERRORTOKEN

.. data:: N_TOKENS
@@ -50,11 +50,13 @@ The following token type values aren't used by the C tokenizer but are needed for
the :mod:`tokenize` module.

.. data:: COMMENT
:noindex:

Token value used to indicate a comment.

.. data:: NL
:noindex:

Token value used to indicate a non-terminating newline. The
:data:`NEWLINE` token indicates the end of a logical line of Python code;
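As an aside (not part of the patch), the NL vs. NEWLINE distinction documented above is easy to observe with the public tokenize API; a minimal check:

    import io
    import tokenize

    code = "x = [1,\n     2]\n\n# trailing comment\n"
    for tok in tokenize.generate_tokens(io.StringIO(code).readline):
        if tok.type in (tokenize.NEWLINE, tokenize.NL, tokenize.COMMENT):
            # NEWLINE ends the logical line; NL marks non-logical newlines.
            print(tokenize.tok_name[tok.type], tok.start)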
@@ -64,9 +64,9 @@ SOFT_KEYWORD
FSTRING_START
FSTRING_MIDDLE
FSTRING_END
COMMENT
NL
ERRORTOKEN

# These aren't used by the C tokenizer but are needed for tokenize.py
COMMENT
NL
ENCODING
@@ -918,6 +918,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) {
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exception));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exp));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(extend));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(extra_tokens));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(facility));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(factory));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(false));

@@ -406,6 +406,7 @@ struct _Py_global_strings {
STRUCT_FOR_ID(exception)
STRUCT_FOR_ID(exp)
STRUCT_FOR_ID(extend)
STRUCT_FOR_ID(extra_tokens)
STRUCT_FOR_ID(facility)
STRUCT_FOR_ID(factory)
STRUCT_FOR_ID(false)

@@ -912,6 +912,7 @@ extern "C" {
INIT_ID(exception), \
INIT_ID(exp), \
INIT_ID(extend), \
INIT_ID(extra_tokens), \
INIT_ID(facility), \
INIT_ID(factory), \
INIT_ID(false), \
@@ -77,7 +77,9 @@ extern "C" {
#define FSTRING_START 61
#define FSTRING_MIDDLE 62
#define FSTRING_END 63
#define ERRORTOKEN 64
#define COMMENT 64
#define NL 65
#define ERRORTOKEN 66
#define N_TOKENS 68
#define NT_OFFSET 256
@@ -1059,6 +1059,9 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
string = &_Py_ID(extend);
assert(_PyUnicode_CheckConsistency(string, 1));
_PyUnicode_InternInPlace(interp, &string);
string = &_Py_ID(extra_tokens);
assert(_PyUnicode_CheckConsistency(string, 1));
_PyUnicode_InternInPlace(interp, &string);
string = &_Py_ID(facility);
assert(_PyUnicode_CheckConsistency(string, 1));
_PyUnicode_InternInPlace(interp, &string);
@@ -2187,7 +2187,7 @@ def _signature_strip_non_python_syntax(signature):
if string == ',':
current_parameter += 1
if (type == ERRORTOKEN) and (string == '$'):
if (type == OP) and (string == '$'):
assert self_parameter is None
self_parameter = current_parameter
continue

@@ -2195,7 +2195,7 @@ def _signature_strip_non_python_syntax(signature):
add(string)
if (string == ','):
add(' ')
clean_signature = ''.join(text)
clean_signature = ''.join(text).strip()
return clean_signature, self_parameter
@@ -107,6 +107,10 @@ def check(file):
errprint("%r: Token Error: %s" % (file, msg))
return
except SyntaxError as msg:
errprint("%r: Token Error: %s" % (file, msg))
return
except IndentationError as msg:
errprint("%r: Indentation Error: %s" % (file, msg))
return

@@ -272,6 +276,12 @@ def format_witnesses(w):
return prefix + " " + ', '.join(firsts)

def process_tokens(tokens):
try:
_process_tokens(tokens)
except TabError as e:
raise NannyNag(e.lineno, e.msg, e.text)

def _process_tokens(tokens):
INDENT = tokenize.INDENT
DEDENT = tokenize.DEDENT
NEWLINE = tokenize.NEWLINE
@@ -223,7 +223,7 @@ class TestCheck(TestCase):
with TemporaryPyFile(SOURCE_CODES["nannynag_errored"]) as file_path:
out = f"{file_path!r}: *** Line 3: trouble in tab city! ***\n"
out += "offending line: '\\tprint(\"world\")\\n'\n"
out += "indent not equal e.g. at tab size 1\n"
out += "inconsistent use of tabs and spaces in indentation\n"
tabnanny.verbose = 1
self.verify_tabnanny_check(file_path, out=out)

@@ -315,7 +315,7 @@ class TestCommandLine(TestCase):
def test_with_errored_file(self):
"""Should displays error when errored python file is given."""
with TemporaryPyFile(SOURCE_CODES["wrong_indented"]) as file_path:
stderr = f"{file_path!r}: Indentation Error: "
stderr = f"{file_path!r}: Token Error: "
stderr += ('unindent does not match any outer indentation level'
' (<tokenize>, line 3)')
self.validate_cmd(file_path, stderr=stderr, expect_failure=True)
@@ -3,7 +3,7 @@ from test.support import os_helper
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
open as tokenize_open, Untokenizer, generate_tokens,
NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT)
NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo)
from io import BytesIO, StringIO
import unittest
from textwrap import dedent

@@ -82,7 +82,7 @@ class TokenizeTest(TestCase):
NAME 'False' (4, 11) (4, 16)
COMMENT '# NEWLINE' (4, 17) (4, 26)
NEWLINE '\\n' (4, 26) (4, 27)
DEDENT '' (5, 0) (5, 0)
DEDENT '' (4, 27) (4, 27)
""")
indent_error_file = b"""\
def k(x):

@@ -230,6 +230,10 @@ def k(x):
continue
self.assertEqual(number_token(lit), lit)
for lit in INVALID_UNDERSCORE_LITERALS:
try:
number_token(lit)
except SyntaxError:
continue
self.assertNotEqual(number_token(lit), lit)

def test_string(self):
@@ -381,21 +385,119 @@ c"""', """\
STRING 'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
""")
self.check_tokenize('f"abc"', """\
STRING 'f"abc"' (1, 0) (1, 6)
FSTRING_START 'f"' (1, 0) (1, 2)
FSTRING_MIDDLE 'abc' (1, 2) (1, 5)
FSTRING_END '"' (1, 5) (1, 6)
""")
self.check_tokenize('fR"a{b}c"', """\
STRING 'fR"a{b}c"' (1, 0) (1, 9)
FSTRING_START 'fR"' (1, 0) (1, 3)
FSTRING_MIDDLE 'a' (1, 3) (1, 4)
OP '{' (1, 4) (1, 5)
NAME 'b' (1, 5) (1, 6)
OP '}' (1, 6) (1, 7)
FSTRING_MIDDLE 'c' (1, 7) (1, 8)
FSTRING_END '"' (1, 8) (1, 9)
""")
self.check_tokenize('fR"a{{{b!r}}}c"', """\
FSTRING_START 'fR"' (1, 0) (1, 3)
FSTRING_MIDDLE 'a{' (1, 3) (1, 5)
OP '{' (1, 6) (1, 7)
NAME 'b' (1, 7) (1, 8)
OP '!' (1, 8) (1, 9)
NAME 'r' (1, 9) (1, 10)
OP '}' (1, 10) (1, 11)
FSTRING_MIDDLE '}' (1, 11) (1, 12)
FSTRING_MIDDLE 'c' (1, 13) (1, 14)
FSTRING_END '"' (1, 14) (1, 15)
""")
self.check_tokenize('f"{{{1+1}}}"', """\
FSTRING_START 'f"' (1, 0) (1, 2)
FSTRING_MIDDLE '{' (1, 2) (1, 3)
OP '{' (1, 4) (1, 5)
NUMBER '1' (1, 5) (1, 6)
OP '+' (1, 6) (1, 7)
NUMBER '1' (1, 7) (1, 8)
OP '}' (1, 8) (1, 9)
FSTRING_MIDDLE '}' (1, 9) (1, 10)
FSTRING_END '"' (1, 11) (1, 12)
""")
self.check_tokenize('f"""{f\'\'\'{f\'{f"{1+1}"}\'}\'\'\'}"""', """\
FSTRING_START 'f\"""' (1, 0) (1, 4)
OP '{' (1, 4) (1, 5)
FSTRING_START "f'''" (1, 5) (1, 9)
OP '{' (1, 9) (1, 10)
FSTRING_START "f'" (1, 10) (1, 12)
OP '{' (1, 12) (1, 13)
FSTRING_START 'f"' (1, 13) (1, 15)
OP '{' (1, 15) (1, 16)
NUMBER '1' (1, 16) (1, 17)
OP '+' (1, 17) (1, 18)
NUMBER '1' (1, 18) (1, 19)
OP '}' (1, 19) (1, 20)
FSTRING_END '"' (1, 20) (1, 21)
OP '}' (1, 21) (1, 22)
FSTRING_END "'" (1, 22) (1, 23)
OP '}' (1, 23) (1, 24)
FSTRING_END "'''" (1, 24) (1, 27)
OP '}' (1, 27) (1, 28)
FSTRING_END '\"""' (1, 28) (1, 31)
""")
self.check_tokenize('f""" x\nstr(data, encoding={invalid!r})\n"""', """\
FSTRING_START 'f\"""' (1, 0) (1, 4)
FSTRING_MIDDLE ' x\\nstr(data, encoding=' (1, 4) (2, 19)
OP '{' (2, 19) (2, 20)
NAME 'invalid' (2, 20) (2, 27)
OP '!' (2, 27) (2, 28)
NAME 'r' (2, 28) (2, 29)
OP '}' (2, 29) (2, 30)
FSTRING_MIDDLE ')\\n' (2, 30) (3, 0)
FSTRING_END '\"""' (3, 0) (3, 3)
""")
self.check_tokenize('f"""123456789\nsomething{None}bad"""', """\
FSTRING_START 'f\"""' (1, 0) (1, 4)
FSTRING_MIDDLE '123456789\\nsomething' (1, 4) (2, 9)
OP '{' (2, 9) (2, 10)
NAME 'None' (2, 10) (2, 14)
OP '}' (2, 14) (2, 15)
FSTRING_MIDDLE 'bad' (2, 15) (2, 18)
FSTRING_END '\"""' (2, 18) (2, 21)
""")
self.check_tokenize('f"""abc"""', """\
STRING 'f\"\"\"abc\"\"\"' (1, 0) (1, 10)
FSTRING_START 'f\"""' (1, 0) (1, 4)
FSTRING_MIDDLE 'abc' (1, 4) (1, 7)
FSTRING_END '\"""' (1, 7) (1, 10)
""")
self.check_tokenize(r'f"abc\
def"', """\
STRING 'f"abc\\\\\\ndef"' (1, 0) (2, 4)
FSTRING_START 'f"' (1, 0) (1, 2)
FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 2) (2, 3)
FSTRING_END '"' (2, 3) (2, 4)
""")
self.check_tokenize(r'Rf"abc\
def"', """\
STRING 'Rf"abc\\\\\\ndef"' (1, 0) (2, 4)
FSTRING_START 'Rf"' (1, 0) (1, 3)
FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 3) (2, 3)
FSTRING_END '"' (2, 3) (2, 4)
""")
self.check_tokenize("f'some words {a+b:.3f} more words {c+d=} final words'", """\
FSTRING_START "f'" (1, 0) (1, 2)
FSTRING_MIDDLE 'some words ' (1, 2) (1, 13)
OP '{' (1, 13) (1, 14)
NAME 'a' (1, 14) (1, 15)
OP '+' (1, 15) (1, 16)
NAME 'b' (1, 16) (1, 17)
OP ':' (1, 17) (1, 18)
FSTRING_MIDDLE '.3f' (1, 18) (1, 21)
OP '}' (1, 21) (1, 22)
FSTRING_MIDDLE ' more words ' (1, 22) (1, 34)
OP '{' (1, 34) (1, 35)
NAME 'c' (1, 35) (1, 36)
OP '+' (1, 36) (1, 37)
NAME 'd' (1, 37) (1, 38)
OP '=' (1, 38) (1, 39)
OP '}' (1, 39) (1, 40)
FSTRING_MIDDLE ' final words' (1, 40) (1, 52)
FSTRING_END "'" (1, 52) (1, 53)
""")

def test_function(self):
@@ -644,8 +746,8 @@ def"', """\
NEWLINE '\\n' (2, 5) (2, 6)
INDENT ' \\t' (3, 0) (3, 9)
NAME 'pass' (3, 9) (3, 13)
DEDENT '' (4, 0) (4, 0)
DEDENT '' (4, 0) (4, 0)
DEDENT '' (3, 14) (3, 14)
DEDENT '' (3, 14) (3, 14)
""")

def test_non_ascii_identifiers(self):

@@ -857,7 +959,7 @@ async def foo():
NUMBER '1' (2, 17) (2, 18)
OP ':' (2, 18) (2, 19)
NAME 'pass' (2, 20) (2, 24)
DEDENT '' (3, 0) (3, 0)
DEDENT '' (2, 25) (2, 25)
""")

self.check_tokenize('''async def foo(async): await''', """\

@@ -905,7 +1007,7 @@ def f():
NAME 'await' (6, 2) (6, 7)
OP '=' (6, 8) (6, 9)
NUMBER '2' (6, 10) (6, 11)
DEDENT '' (7, 0) (7, 0)
DEDENT '' (6, 12) (6, 12)
""")

self.check_tokenize('''\

@@ -943,7 +1045,7 @@ async def f():
NAME 'await' (6, 2) (6, 7)
OP '=' (6, 8) (6, 9)
NUMBER '2' (6, 10) (6, 11)
DEDENT '' (7, 0) (7, 0)
DEDENT '' (6, 12) (6, 12)
""")

class GenerateTokensTest(TokenizeTest):

@@ -968,7 +1070,7 @@ def decistmt(s):
])
else:
result.append((toknum, tokval))
return untokenize(result).decode('utf-8')
return untokenize(result).decode('utf-8').strip()

class TestMisc(TestCase):
@@ -1040,33 +1142,16 @@ class Test_Tokenize(TestCase):
nonlocal first
if not first:
first = True
return line
yield line
else:
return b''
yield b''

# skip the initial encoding token and the end tokens
tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2]
expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
tokens = list(_tokenize(readline(), encoding='utf-8'))[:-2]
expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')]
self.assertEqual(tokens, expected_tokens,
"bytes not decoded with encoding")

def test__tokenize_does_not_decode_with_encoding_none(self):
literal = '"ЉЊЈЁЂ"'
first = False
def readline():
nonlocal first
if not first:
first = True
return literal
else:
return b''

# skip the end tokens
tokens = list(_tokenize(readline, encoding=None))[:-2]
expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
self.assertEqual(tokens, expected_tokens,
"string not tokenized when encoding is None")

class TestDetectEncoding(TestCase):
@@ -1326,7 +1411,7 @@ class TestTokenize(TestCase):

def test_tokenize(self):
import tokenize as tokenize_module
encoding = object()
encoding = "utf-8"
encoding_used = None
def mock_detect_encoding(readline):
return encoding, [b'first', b'second']

@@ -1336,7 +1421,10 @@ class TestTokenize(TestCase):
encoding_used = encoding
out = []
while True:
next_line = readline()
try:
next_line = next(readline)
except StopIteration:
return out
if next_line:
out.append(next_line)
continue

@@ -1356,7 +1444,7 @@ class TestTokenize(TestCase):
tokenize_module._tokenize = mock__tokenize
try:
results = tokenize(mock_readline)
self.assertEqual(list(results),
self.assertEqual(list(results)[1:],
[b'first', b'second', b'1', b'2', b'3', b'4'])
finally:
tokenize_module.detect_encoding = orig_detect_encoding

@@ -1652,8 +1740,8 @@ class TestRoundtrip(TestCase):
if support.verbose >= 2:
print('tokenize', testfile)
with open(testfile, 'rb') as f:
with self.subTest(file=testfile):
self.check_roundtrip(f)
# with self.subTest(file=testfile):
self.check_roundtrip(f)

def roundtrip(self, code):
@@ -2496,13 +2584,13 @@ async def f():
def test_unicode(self):

self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
NAME 'Örter' (1, 0) (1, 6)
EQUAL '=' (1, 7) (1, 8)
STRING "u'places'" (1, 9) (1, 18)
NEWLINE '' (1, 18) (1, 18)
NAME 'grün' (2, 0) (2, 5)
EQUAL '=' (2, 6) (2, 7)
STRING "U'green'" (2, 8) (2, 16)
NAME 'Örter' (1, 0) (1, 5)
EQUAL '=' (1, 6) (1, 7)
STRING "u'places'" (1, 8) (1, 17)
NEWLINE '' (1, 17) (1, 17)
NAME 'grün' (2, 0) (2, 4)
EQUAL '=' (2, 5) (2, 6)
STRING "U'green'" (2, 7) (2, 15)
""")

def test_invalid_syntax(self):

@@ -2559,8 +2647,7 @@ async def f():
compile(valid, "<string>", "exec")

invalid = generate_source(MAXINDENT)
tokens = list(_generate_tokens_from_c_tokenizer(invalid))
self.assertEqual(tokens[-1].type, NEWLINE)
self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(invalid)))
self.assertRaises(
IndentationError, compile, invalid, "<string>", "exec"
)
@@ -67,10 +67,10 @@ SOFT_KEYWORD = 60
FSTRING_START = 61
FSTRING_MIDDLE = 62
FSTRING_END = 63
COMMENT = 64
NL = 65
# These aren't used by the C tokenizer but are needed for tokenize.py
ERRORTOKEN = 64
COMMENT = 65
NL = 66
ERRORTOKEN = 66
ENCODING = 67
N_TOKENS = 68
# Special definitions for cooperation with parser
335 Lib/tokenize.py
@@ -56,112 +56,11 @@ class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line'
else:
return self.type

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
Binnumber = r'0[bB](?:_?[01])+'
Octnumber = r'0[oO](?:_?[0-7])+'
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes():
# The valid string prefixes. Only contain the lower case versions,
# and don't contain any permutations (include 'fr', but not
# 'rf'). The various permutations will be generated.
_valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
# if we add binary f-strings, add: ['fb', 'fbr']
result = {''}
for prefix in _valid_string_prefixes:
for t in _itertools.permutations(prefix):
# create a list with upper and lower versions of each
# character
for u in _itertools.product(*[(c, c.upper()) for c in t]):
result.add(''.join(u))
return result

@functools.lru_cache
def _compile(expr):
return re.compile(expr, re.UNICODE)

# Note that since _all_string_prefixes includes the empty string,
# StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Sorting in reverse order puts the long operators before their prefixes.
# Otherwise if = came before ==, == would get recognized as two instances
# of =.
Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
Funny = group(r'\r?\n', Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
group("'", r'\\\r?\n'),
StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# For a given string prefix plus quotes, endpats maps it to a regex
# to match the remainder of that string. _prefix can be empty, for
# a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
endpats[_prefix + "'"] = Single
endpats[_prefix + '"'] = Double
endpats[_prefix + "'''"] = Single3
endpats[_prefix + '"""'] = Double3
del _prefix

# A set of all of the single and triple quoted string prefixes,
# including the opening quotes.
single_quoted = set()
triple_quoted = set()
for t in _all_string_prefixes():
for u in (t + '"', t + "'"):
single_quoted.add(u)
for u in (t + '"""', t + "'''"):
triple_quoted.add(u)
del t, u

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

class StopTokenizing(Exception): pass

class Untokenizer:
@@ -213,6 +112,14 @@ class Untokenizer:
self.tokens.append(indent)
self.prev_col = len(indent)
startline = False
elif tok_type == FSTRING_MIDDLE:
if '{' in token or '}' in token:
end_line, end_col = end
end = (end_line, end_col + token.count('{') + token.count('}'))
token = re.sub('{', '{{', token)
token = re.sub('}', '}}', token)

self.add_whitespace(start)
self.tokens.append(token)
self.prev_row, self.prev_col = end

@@ -255,6 +162,11 @@ class Untokenizer:
elif startline and indents:
toks_append(indents[-1])
startline = False
elif toknum == FSTRING_MIDDLE:
if '{' in tokval or '}' in tokval:
tokval = re.sub('{', '{{', tokval)
tokval = re.sub('}', '}}', tokval)

toks_append(tokval)

@@ -404,7 +316,6 @@ def open(filename):
buffer.close()
raise

def tokenize(readline):
"""
The tokenize() generator requires one argument, readline, which
@@ -425,192 +336,32 @@ def tokenize(readline):
which tells you which encoding was used to decode the bytes stream.
"""
encoding, consumed = detect_encoding(readline)
empty = _itertools.repeat(b"")
rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
return _tokenize(rl_gen.__next__, encoding)

def _tokenize(readline, encoding):
lnum = parenlev = continued = 0
numchars = '0123456789'
contstr, needcont = '', 0
contline = None
indents = [0]

rl_gen = _itertools.chain(consumed, iter(readline, b""))
if encoding is not None:
if encoding == "utf-8-sig":
# BOM will already have been stripped.
encoding = "utf-8"
yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
last_line = b''
line = b''
while True: # loop over lines in stream
try:
# We capture the value of the line variable here because
# readline uses the empty string '' to signal end of input,
# hence `line` itself will always be overwritten at the end
# of this loop.
last_line = line
line = readline()
except StopIteration:
line = b''
yield from _tokenize(rl_gen, encoding)

if encoding is not None:
line = line.decode(encoding)
lnum += 1
pos, max = 0, len(line)
def _tokenize(rl_gen, encoding):
source = b"".join(rl_gen).decode(encoding)
token = None
for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
# TODO: Marta -> limpiar esto
if 6 < token.type <= 54:
token = token._replace(type=OP)
if token.type in {ASYNC, AWAIT}:
token = token._replace(type=NAME)
if token.type == NEWLINE:
l_start, c_start = token.start
l_end, c_end = token.end
token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end+1))

if contstr: # continued string
if not line:
raise TokenError("EOF in multi-line string", strstart)
endmatch = endprog.match(line)
if endmatch:
pos = end = endmatch.end(0)
yield TokenInfo(STRING, contstr + line[:end],
strstart, (lnum, end), contline + line)
contstr, needcont = '', 0
contline = None
elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
yield TokenInfo(ERRORTOKEN, contstr + line,
strstart, (lnum, len(line)), contline)
contstr = ''
contline = None
continue
else:
contstr = contstr + line
contline = contline + line
continue

elif parenlev == 0 and not continued: # new statement
if not line: break
column = 0
while pos < max: # measure leading whitespace
if line[pos] == ' ':
column += 1
elif line[pos] == '\t':
column = (column//tabsize + 1)*tabsize
elif line[pos] == '\f':
column = 0
else:
break
pos += 1
if pos == max:
break

if line[pos] in '#\r\n': # skip comments or blank lines
if line[pos] == '#':
comment_token = line[pos:].rstrip('\r\n')
yield TokenInfo(COMMENT, comment_token,
(lnum, pos), (lnum, pos + len(comment_token)), line)
pos += len(comment_token)

yield TokenInfo(NL, line[pos:],
(lnum, pos), (lnum, len(line)), line)
continue

if column > indents[-1]: # count indents or dedents
indents.append(column)
yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
while column < indents[-1]:
if column not in indents:
raise IndentationError(
"unindent does not match any outer indentation level",
("<tokenize>", lnum, pos, line))
indents = indents[:-1]

yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

else: # continued statement
if not line:
raise TokenError("EOF in multi-line statement", (lnum, 0))
continued = 0

while pos < max:
pseudomatch = _compile(PseudoToken).match(line, pos)
if pseudomatch: # scan for tokens
start, end = pseudomatch.span(1)
spos, epos, pos = (lnum, start), (lnum, end), end
if start == end:
continue
token, initial = line[start:end], line[start]

if (initial in numchars or # ordinary number
(initial == '.' and token != '.' and token != '...')):
yield TokenInfo(NUMBER, token, spos, epos, line)
elif initial in '\r\n':
if parenlev > 0:
yield TokenInfo(NL, token, spos, epos, line)
else:
yield TokenInfo(NEWLINE, token, spos, epos, line)

elif initial == '#':
assert not token.endswith("\n")
yield TokenInfo(COMMENT, token, spos, epos, line)

elif token in triple_quoted:
endprog = _compile(endpats[token])
endmatch = endprog.match(line, pos)
if endmatch: # all on one line
pos = endmatch.end(0)
token = line[start:pos]
yield TokenInfo(STRING, token, spos, (lnum, pos), line)
else:
strstart = (lnum, start) # multiple lines
contstr = line[start:]
contline = line
break

# Check up to the first 3 chars of the token to see if
# they're in the single_quoted set. If so, they start
# a string.
# We're using the first 3, because we're looking for
# "rb'" (for example) at the start of the token. If
# we switch to longer prefixes, this needs to be
# adjusted.
# Note that initial == token[:1].
# Also note that single quote checking must come after
# triple quote checking (above).
elif (initial in single_quoted or
token[:2] in single_quoted or
token[:3] in single_quoted):
if token[-1] == '\n': # continued string
strstart = (lnum, start)
# Again, using the first 3 chars of the
# token. This is looking for the matching end
# regex for the correct type of quote
# character. So it's really looking for
# endpats["'"] or endpats['"'], by trying to
# skip string prefix characters, if any.
endprog = _compile(endpats.get(initial) or
endpats.get(token[1]) or
endpats.get(token[2]))
contstr, needcont = line[start:], 1
contline = line
break
else: # ordinary string
yield TokenInfo(STRING, token, spos, epos, line)

elif initial.isidentifier(): # ordinary name
yield TokenInfo(NAME, token, spos, epos, line)
elif initial == '\\': # continued stmt
continued = 1
else:
if initial in '([{':
parenlev += 1
elif initial in ')]}':
parenlev -= 1
yield TokenInfo(OP, token, spos, epos, line)
else:
yield TokenInfo(ERRORTOKEN, line[pos],
(lnum, pos), (lnum, pos+1), line)
pos += 1

# Add an implicit NEWLINE if the input doesn't end in one
if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"):
yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
for indent in indents[1:]: # pop remaining indent levels
yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
yield token
if token is not None:
last_line, _ = token.start
yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '')

def generate_tokens(readline):
@@ -619,7 +370,16 @@ def generate_tokens(readline):
This has the same API as tokenize(), except that it expects the *readline*
callable to return str objects instead of bytes.
"""
return _tokenize(readline, None)
def _gen():
while True:
try:
line = readline()
except StopIteration:
return
if not line:
return
yield line.encode()
return _tokenize(_gen(), 'utf-8')

def main():
import argparse

@@ -656,7 +416,10 @@ def main():
tokens = list(tokenize(f.readline))
else:
filename = "<stdin>"
tokens = _tokenize(sys.stdin.readline, None)
tokens = _tokenize(
(x.encode('utf-8') for x in iter(sys.stdin.readline, "")
), "utf-8")

# Output the tokenization
for token in tokens:

@@ -682,10 +445,10 @@ def main():
perror("unexpected error: %s" % err)
raise

def _generate_tokens_from_c_tokenizer(source):
def _generate_tokens_from_c_tokenizer(source, extra_tokens=False):
"""Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
import _tokenize as c_tokenizer
for info in c_tokenizer.TokenizerIter(source):
for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens):
tok, type, lineno, end_lineno, col_off, end_col_off, line = info
yield TokenInfo(type, tok, (lineno, col_off), (end_lineno, end_col_off), line)
@@ -0,0 +1 @@
Implement PEP 701 changes in the :mod:`tokenize` module. Patch by Marta Gómez Macías and Pablo Galindo Salgado
@@ -208,7 +208,7 @@ int
_PyPegen_fill_token(Parser *p)
{
struct token new_token;
new_token.metadata = NULL;
_PyToken_Init(&new_token);
int type = _PyTokenizer_Get(p->tok, &new_token);

// Record and skip '# type: ignore' comments

@@ -251,7 +251,7 @@ _PyPegen_fill_token(Parser *p)
Token *t = p->tokens[p->fill];
return initialize_token(p, t, &new_token, type);
error:
Py_XDECREF(new_token.metadata);
_PyToken_Free(&new_token);
return -1;
}
@@ -165,7 +165,7 @@ _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {

int ret = 0;
struct token new_token;
new_token.metadata = NULL;
_PyToken_Init(&new_token);

for (;;) {
switch (_PyTokenizer_Get(p->tok, &new_token)) {

@@ -193,7 +193,7 @@ _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {

exit:
Py_XDECREF(new_token.metadata);
_PyToken_Free(&new_token);
// If we're in an f-string, we want the syntax error in the expression part
// to propagate, so that tokenizer errors (like expecting '}') that happen afterwards
// do not swallow it.
@@ -70,9 +70,9 @@ const char * const _PyParser_TokenNames[] = {
"FSTRING_START",
"FSTRING_MIDDLE",
"FSTRING_END",
"COMMENT",
"NL",
"<ERRORTOKEN>",
"<COMMENT>",
"<NL>",
"<ENCODING>",
"<N_TOKENS>",
};
@@ -111,6 +111,8 @@ tok_new(void)
tok->interactive_underflow = IUNDERFLOW_NORMAL;
tok->str = NULL;
tok->report_warnings = 1;
tok->tok_extra_tokens = 0;
tok->comment_newline = 0;
tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .f_string_quote='\0', .f_string_quote_size = 0, .f_string_debug=0};
tok->tok_mode_stack_index = 0;
tok->tok_report_warnings = 1;

@@ -980,6 +982,16 @@ _PyTokenizer_Free(struct tok_state *tok)
PyMem_Free(tok);
}

void
_PyToken_Free(struct token *token) {
Py_XDECREF(token->metadata);
}

void
_PyToken_Init(struct token *token) {
token->metadata = NULL;
}

static int
tok_readline_raw(struct tok_state *tok)
{
@@ -1636,6 +1648,7 @@ token_setup(struct tok_state *tok, struct token *token, int type, const char *st
return type;
}

static int
tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
{

@@ -1649,6 +1662,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
tok->starting_col_offset = -1;
blankline = 0;

/* Get indentation level */
if (tok->atbol) {
int col = 0;

@@ -1749,12 +1763,20 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
tok->starting_col_offset = tok->col_offset;

/* Return pending indents/dedents */
if (tok->pendin != 0) {
if (tok->pendin != 0) {
if (tok->pendin < 0) {
if (tok->tok_extra_tokens) {
p_start = tok->cur;
p_end = tok->cur;
}
tok->pendin++;
return MAKE_TOKEN(DEDENT);
}
else {
if (tok->tok_extra_tokens) {
p_start = tok->buf;
p_end = tok->cur;
}
tok->pendin--;
return MAKE_TOKEN(INDENT);
}

@@ -1803,13 +1825,18 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
return MAKE_TOKEN(syntaxerror(tok, "f-string expression part cannot include '#'"));
}

const char *prefix, *p, *type_start;
const char* p = NULL;
const char *prefix, *type_start;
int current_starting_col_offset;

while (c != EOF && c != '\n') {
c = tok_nextc(tok);
}

if (tok->tok_extra_tokens) {
p = tok->start;
}

if (tok->type_comments) {
p = tok->start;
current_starting_col_offset = tok->starting_col_offset;

@@ -1864,6 +1891,13 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
}
}
}
if (tok->tok_extra_tokens) {
tok_backup(tok, c); /* don't eat the newline or EOF */
p_start = p;
p_end = tok->cur;
tok->comment_newline = blankline;
return MAKE_TOKEN(COMMENT);
}
}

if (tok->done == E_INTERACT_STOP) {
@@ -1949,6 +1983,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t

struct tok_state ahead_tok;
struct token ahead_token;
_PyToken_Init(&ahead_token);
int ahead_tok_kind;

memcpy(&ahead_tok, tok, sizeof(ahead_tok));

@@ -1964,8 +1999,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
returning a plain NAME token, return ASYNC. */
tok->async_def_indent = tok->indent;
tok->async_def = 1;
_PyToken_Free(&ahead_token);
return MAKE_TOKEN(ASYNC);
}
_PyToken_Free(&ahead_token);
}
}

@@ -1976,8 +2013,19 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
if (c == '\n') {
tok->atbol = 1;
if (blankline || tok->level > 0) {
if (tok->tok_extra_tokens) {
p_start = tok->start;
p_end = tok->cur;
return MAKE_TOKEN(NL);
}
goto nextline;
}
if (tok->comment_newline && tok->tok_extra_tokens) {
tok->comment_newline = 0;
p_start = tok->start;
p_end = tok->cur;
return MAKE_TOKEN(NL);
}
p_start = tok->start;
p_end = tok->cur - 1; /* Leave '\n' out of the string */
tok->cont_line = 0;

@@ -2563,6 +2611,9 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct

f_string_middle:

// TODO: This is a bit of a hack, but it works for now. We need to find a better way to handle
// this.
tok->multi_line_start = tok->line_start;
while (end_quote_size != current_tok->f_string_quote_size) {
int c = tok_nextc(tok);
if (tok->done == E_ERROR) {

@@ -2788,7 +2839,9 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
// if fetching the encoding shows a warning.
tok->report_warnings = 0;
while (tok->lineno < 2 && tok->done == E_OK) {
_PyToken_Init(&token);
_PyTokenizer_Get(tok, &token);
_PyToken_Free(&token);
}
fclose(fp);
if (tok->encoding) {
@@ -128,6 +128,8 @@ struct tok_state {
tokenizer_mode tok_mode_stack[MAXFSTRINGLEVEL];
int tok_mode_stack_index;
int tok_report_warnings;
int tok_extra_tokens;
int comment_newline;
#ifdef Py_DEBUG
int debug;
#endif

@@ -138,6 +140,8 @@ extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int);
extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
const char *, const char *);
extern void _PyTokenizer_Free(struct tok_state *);
extern void _PyToken_Free(struct token *);
extern void _PyToken_Init(struct token *);
extern int _PyTokenizer_Get(struct tok_state *, struct token *);

#define tok_dump _Py_tok_dump
@@ -1,5 +1,8 @@
#include "Python.h"
#include "errcode.h"
#include "../Parser/tokenizer.h"
#include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset()
#include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset()

static struct PyModuleDef _tokenizemodule;

@@ -34,11 +37,14 @@ typedef struct
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new

source: str
*
extra_tokens: bool
[clinic start generated code]*/

static PyObject *
tokenizeriter_new_impl(PyTypeObject *type, const char *source)
/*[clinic end generated code: output=7fd9f46cf9263cbb input=4384b368407375c6]*/
tokenizeriter_new_impl(PyTypeObject *type, const char *source,
int extra_tokens)
/*[clinic end generated code: output=f6f9d8b4beec8106 input=90dc5b6a5df180c2]*/
{
tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
if (self == NULL) {
@@ -54,20 +60,123 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source)
return NULL;
}
self->tok->filename = filename;
if (extra_tokens) {
self->tok->tok_extra_tokens = 1;
}
return (PyObject *)self;
}

static int
_tokenizer_error(struct tok_state *tok)
{
if (PyErr_Occurred()) {
return -1;
}

const char *msg = NULL;
PyObject* errtype = PyExc_SyntaxError;
switch (tok->done) {
case E_TOKEN:
msg = "invalid token";
break;
case E_EOF:
if (tok->level) {
PyErr_Format(PyExc_SyntaxError,
"parenthesis '%c' was never closed",
tok->parenstack[tok->level-1]);
} else {
PyErr_SetString(PyExc_SyntaxError, "unexpected EOF while parsing");
}
return -1;
case E_DEDENT:
PyErr_Format(PyExc_IndentationError,
"unindent does not match any outer indentation level "
"(<tokenize>, line %d)",
tok->lineno);
return -1;
case E_INTR:
if (!PyErr_Occurred()) {
PyErr_SetNone(PyExc_KeyboardInterrupt);
}
return -1;
case E_NOMEM:
PyErr_NoMemory();
return -1;
case E_TABSPACE:
errtype = PyExc_TabError;
msg = "inconsistent use of tabs and spaces in indentation";
break;
case E_TOODEEP:
errtype = PyExc_IndentationError;
msg = "too many levels of indentation";
break;
case E_LINECONT: {
msg = "unexpected character after line continuation character";
break;
}
default:
msg = "unknown tokenization error";
}

PyObject* errstr = NULL;
PyObject* error_line = NULL;
PyObject* tmp = NULL;
PyObject* value = NULL;
int result = 0;

Py_ssize_t size = tok->inp - tok->buf;
error_line = PyUnicode_DecodeUTF8(tok->buf, size, "replace");
if (!error_line) {
result = -1;
goto exit;
}

tmp = Py_BuildValue("(OnnOii)", tok->filename, tok->lineno, 0, error_line, 0, 0);
if (!tmp) {
result = -1;
goto exit;
}

errstr = PyUnicode_FromString(msg);
if (!errstr) {
result = -1;
goto exit;
}

value = PyTuple_Pack(2, errstr, tmp);
if (!value) {
result = -1;
goto exit;
}

PyErr_SetObject(errtype, value);

exit:
Py_XDECREF(errstr);
Py_XDECREF(error_line);
Py_XDECREF(tmp);
Py_XDECREF(value);
return result;
}

static PyObject *
tokenizeriter_next(tokenizeriterobject *it)
{
PyObject* result = NULL;
struct token token;
_PyToken_Init(&token);

int type = _PyTokenizer_Get(it->tok, &token);
if (type == ERRORTOKEN && PyErr_Occurred()) {
return NULL;
if (type == ERRORTOKEN) {
if(!PyErr_Occurred()) {
_tokenizer_error(it->tok);
assert(PyErr_Occurred());
}
goto exit;
}
if (type == ERRORTOKEN || type == ENDMARKER) {
PyErr_SetString(PyExc_StopIteration, "EOF");
return NULL;
goto exit;
}
PyObject *str = NULL;
if (token.start == NULL || token.end == NULL) {

@@ -77,28 +186,31 @@ tokenizeriter_next(tokenizeriterobject *it)
str = PyUnicode_FromStringAndSize(token.start, token.end - token.start);
}
if (str == NULL) {
return NULL;
goto exit;
}

Py_ssize_t size = it->tok->inp - it->tok->buf;
PyObject *line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
if (line == NULL) {
Py_DECREF(str);
return NULL;
goto exit;
}
const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
int lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
int end_lineno = it->tok->lineno;
int col_offset = -1;
int end_col_offset = -1;
Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
Py_ssize_t end_lineno = it->tok->lineno;
Py_ssize_t col_offset = -1;
Py_ssize_t end_col_offset = -1;
if (token.start != NULL && token.start >= line_start) {
col_offset = (int)(token.start - line_start);
col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start);
}
if (token.end != NULL && token.end >= it->tok->line_start) {
end_col_offset = (int)(token.end - it->tok->line_start);
end_col_offset = _PyPegen_byte_offset_to_character_offset(line, token.end - it->tok->line_start);
}

return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
result = Py_BuildValue("(NinnnnN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
exit:
_PyToken_Free(&token);
return result;
}

static void
@@ -9,7 +9,8 @@ preserve

static PyObject *
tokenizeriter_new_impl(PyTypeObject *type, const char *source);
tokenizeriter_new_impl(PyTypeObject *type, const char *source,
int extra_tokens);

static PyObject *
tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)

@@ -17,14 +18,14 @@ tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
PyObject *return_value = NULL;
#if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)

#define NUM_KEYWORDS 1
#define NUM_KEYWORDS 2
static struct {
PyGC_Head _this_is_not_used;
PyObject_VAR_HEAD
PyObject *ob_item[NUM_KEYWORDS];
} _kwtuple = {
.ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
.ob_item = { &_Py_ID(source), },
.ob_item = { &_Py_ID(source), &_Py_ID(extra_tokens), },
};
#undef NUM_KEYWORDS
#define KWTUPLE (&_kwtuple.ob_base.ob_base)

@@ -33,19 +34,20 @@ tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
# define KWTUPLE NULL
#endif // !Py_BUILD_CORE

static const char * const _keywords[] = {"source", NULL};
static const char * const _keywords[] = {"source", "extra_tokens", NULL};
static _PyArg_Parser _parser = {
.keywords = _keywords,
.fname = "tokenizeriter",
.kwtuple = KWTUPLE,
};
#undef KWTUPLE
PyObject *argsbuf[1];
PyObject *argsbuf[2];
PyObject * const *fastargs;
Py_ssize_t nargs = PyTuple_GET_SIZE(args);
const char *source;
int extra_tokens;

fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, 1, 1, 0, argsbuf);
fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, 1, 1, 1, argsbuf);
if (!fastargs) {
goto exit;
}

@@ -62,9 +64,13 @@ tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
PyErr_SetString(PyExc_ValueError, "embedded null character");
goto exit;
}
return_value = tokenizeriter_new_impl(type, source);
extra_tokens = PyObject_IsTrue(fastargs[1]);
if (extra_tokens < 0) {
goto exit;
}
return_value = tokenizeriter_new_impl(type, source, extra_tokens);

exit:
return return_value;
}
/*[clinic end generated code: output=8c2c09f651961986 input=a9049054013a1b77]*/
/*[clinic end generated code: output=940b564c67f6e0e2 input=a9049054013a1b77]*/