Add tests for the C tokenizer and expose it as a private module (GH-27924)

Author: Pablo Galindo Salgado, 2021-08-24 17:50:05 +01:00 (committed by GitHub)
parent 9ed523159c
commit a24676bedc
GPG Key ID: 4AEE18F83AFDEB23 (no known key found for this signature in database)
9 changed files with 1114 additions and 5 deletions
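For orientation, here is a minimal sketch of the private helper this commit adds (illustrative only; the helper is private and the public tokenize API is unchanged). It assumes a CPython build that contains this commit:

    from tokenize import _generate_tokens_from_c_tokenizer

    # Each item is a tokenize.TokenInfo built from the C tokenizer's output.
    for tok in _generate_tokens_from_c_tokenizer("0xff <= 255"):
        print(tok.type, tok.string, tok.start, tok.end)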


@@ -3,7 +3,7 @@ from test.support import os_helper
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
open as tokenize_open, Untokenizer, generate_tokens,
NEWLINE)
NEWLINE, _generate_tokens_from_c_tokenizer)
from io import BytesIO, StringIO
import unittest
from unittest import TestCase, mock
@@ -12,7 +12,6 @@ from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
import os
import token
# Converts a source string into a list of textual representation
# of the tokens such as:
# ` NAME 'if' (1, 0) (1, 2)`
@@ -1654,5 +1653,865 @@ class TestRoundtrip(TestCase):
self.check_roundtrip(code)
class CTokenizeTest(TestCase):
def check_tokenize(self, s, expected):
# Format the tokens in s in a table format.
# The ENDMARKER and final NEWLINE are omitted.
with self.subTest(source=s):
result = stringify_tokens_from_source(
_generate_tokens_from_c_tokenizer(s), s
)
self.assertEqual(result, expected.rstrip().splitlines())
def test_int(self):
self.check_tokenize('0xff <= 255', """\
NUMBER '0xff' (1, 0) (1, 4)
LESSEQUAL '<=' (1, 5) (1, 7)
NUMBER '255' (1, 8) (1, 11)
""")
self.check_tokenize('0b10 <= 255', """\
NUMBER '0b10' (1, 0) (1, 4)
LESSEQUAL '<=' (1, 5) (1, 7)
NUMBER '255' (1, 8) (1, 11)
""")
self.check_tokenize('0o123 <= 0O123', """\
NUMBER '0o123' (1, 0) (1, 5)
LESSEQUAL '<=' (1, 6) (1, 8)
NUMBER '0O123' (1, 9) (1, 14)
""")
self.check_tokenize('1234567 > ~0x15', """\
NUMBER '1234567' (1, 0) (1, 7)
GREATER '>' (1, 8) (1, 9)
TILDE '~' (1, 10) (1, 11)
NUMBER '0x15' (1, 11) (1, 15)
""")
self.check_tokenize('2134568 != 1231515', """\
NUMBER '2134568' (1, 0) (1, 7)
NOTEQUAL '!=' (1, 8) (1, 10)
NUMBER '1231515' (1, 11) (1, 18)
""")
self.check_tokenize('(-124561-1) & 200000000', """\
LPAR '(' (1, 0) (1, 1)
MINUS '-' (1, 1) (1, 2)
NUMBER '124561' (1, 2) (1, 8)
MINUS '-' (1, 8) (1, 9)
NUMBER '1' (1, 9) (1, 10)
RPAR ')' (1, 10) (1, 11)
AMPER '&' (1, 12) (1, 13)
NUMBER '200000000' (1, 14) (1, 23)
""")
self.check_tokenize('0xdeadbeef != -1', """\
NUMBER '0xdeadbeef' (1, 0) (1, 10)
NOTEQUAL '!=' (1, 11) (1, 13)
MINUS '-' (1, 14) (1, 15)
NUMBER '1' (1, 15) (1, 16)
""")
self.check_tokenize('0xdeadc0de & 12345', """\
NUMBER '0xdeadc0de' (1, 0) (1, 10)
AMPER '&' (1, 11) (1, 12)
NUMBER '12345' (1, 13) (1, 18)
""")
self.check_tokenize('0xFF & 0x15 | 1234', """\
NUMBER '0xFF' (1, 0) (1, 4)
AMPER '&' (1, 5) (1, 6)
NUMBER '0x15' (1, 7) (1, 11)
VBAR '|' (1, 12) (1, 13)
NUMBER '1234' (1, 14) (1, 18)
""")
def test_float(self):
self.check_tokenize('x = 3.14159', """\
NAME 'x' (1, 0) (1, 1)
EQUAL '=' (1, 2) (1, 3)
NUMBER '3.14159' (1, 4) (1, 11)
""")
self.check_tokenize('x = 314159.', """\
NAME 'x' (1, 0) (1, 1)
EQUAL '=' (1, 2) (1, 3)
NUMBER '314159.' (1, 4) (1, 11)
""")
self.check_tokenize('x = .314159', """\
NAME 'x' (1, 0) (1, 1)
EQUAL '=' (1, 2) (1, 3)
NUMBER '.314159' (1, 4) (1, 11)
""")
self.check_tokenize('x = 3e14159', """\
NAME 'x' (1, 0) (1, 1)
EQUAL '=' (1, 2) (1, 3)
NUMBER '3e14159' (1, 4) (1, 11)
""")
self.check_tokenize('x = 3E123', """\
NAME 'x' (1, 0) (1, 1)
EQUAL '=' (1, 2) (1, 3)
NUMBER '3E123' (1, 4) (1, 9)
""")
self.check_tokenize('x+y = 3e-1230', """\
NAME 'x' (1, 0) (1, 1)
PLUS '+' (1, 1) (1, 2)
NAME 'y' (1, 2) (1, 3)
EQUAL '=' (1, 4) (1, 5)
NUMBER '3e-1230' (1, 6) (1, 13)
""")
self.check_tokenize('x = 3.14e159', """\
NAME 'x' (1, 0) (1, 1)
EQUAL '=' (1, 2) (1, 3)
NUMBER '3.14e159' (1, 4) (1, 12)
""")
def test_string(self):
self.check_tokenize('x = \'\'; y = ""', """\
NAME 'x' (1, 0) (1, 1)
EQUAL '=' (1, 2) (1, 3)
STRING "''" (1, 4) (1, 6)
SEMI ';' (1, 6) (1, 7)
NAME 'y' (1, 8) (1, 9)
EQUAL '=' (1, 10) (1, 11)
STRING '""' (1, 12) (1, 14)
""")
self.check_tokenize('x = \'"\'; y = "\'"', """\
NAME 'x' (1, 0) (1, 1)
EQUAL '=' (1, 2) (1, 3)
STRING '\\'"\\'' (1, 4) (1, 7)
SEMI ';' (1, 7) (1, 8)
NAME 'y' (1, 9) (1, 10)
EQUAL '=' (1, 11) (1, 12)
STRING '"\\'"' (1, 13) (1, 16)
""")
self.check_tokenize('x = "doesn\'t "shrink", does it"', """\
NAME 'x' (1, 0) (1, 1)
EQUAL '=' (1, 2) (1, 3)
STRING '"doesn\\'t "' (1, 4) (1, 14)
NAME 'shrink' (1, 14) (1, 20)
STRING '", does it"' (1, 20) (1, 31)
""")
self.check_tokenize("x = 'abc' + 'ABC'", """\
NAME 'x' (1, 0) (1, 1)
EQUAL '=' (1, 2) (1, 3)
STRING "'abc'" (1, 4) (1, 9)
PLUS '+' (1, 10) (1, 11)
STRING "'ABC'" (1, 12) (1, 17)
""")
self.check_tokenize('y = "ABC" + "ABC"', """\
NAME 'y' (1, 0) (1, 1)
EQUAL '=' (1, 2) (1, 3)
STRING '"ABC"' (1, 4) (1, 9)
PLUS '+' (1, 10) (1, 11)
STRING '"ABC"' (1, 12) (1, 17)
""")
self.check_tokenize("x = r'abc' + r'ABC' + R'ABC' + R'ABC'", """\
NAME 'x' (1, 0) (1, 1)
EQUAL '=' (1, 2) (1, 3)
STRING "r'abc'" (1, 4) (1, 10)
PLUS '+' (1, 11) (1, 12)
STRING "r'ABC'" (1, 13) (1, 19)
PLUS '+' (1, 20) (1, 21)
STRING "R'ABC'" (1, 22) (1, 28)
PLUS '+' (1, 29) (1, 30)
STRING "R'ABC'" (1, 31) (1, 37)
""")
self.check_tokenize('y = r"abc" + r"ABC" + R"ABC" + R"ABC"', """\
NAME 'y' (1, 0) (1, 1)
EQUAL '=' (1, 2) (1, 3)
STRING 'r"abc"' (1, 4) (1, 10)
PLUS '+' (1, 11) (1, 12)
STRING 'r"ABC"' (1, 13) (1, 19)
PLUS '+' (1, 20) (1, 21)
STRING 'R"ABC"' (1, 22) (1, 28)
PLUS '+' (1, 29) (1, 30)
STRING 'R"ABC"' (1, 31) (1, 37)
""")
self.check_tokenize("u'abc' + U'abc'", """\
STRING "u'abc'" (1, 0) (1, 6)
PLUS '+' (1, 7) (1, 8)
STRING "U'abc'" (1, 9) (1, 15)
""")
self.check_tokenize('u"abc" + U"abc"', """\
STRING 'u"abc"' (1, 0) (1, 6)
PLUS '+' (1, 7) (1, 8)
STRING 'U"abc"' (1, 9) (1, 15)
""")
self.check_tokenize("b'abc' + B'abc'", """\
STRING "b'abc'" (1, 0) (1, 6)
PLUS '+' (1, 7) (1, 8)
STRING "B'abc'" (1, 9) (1, 15)
""")
self.check_tokenize('b"abc" + B"abc"', """\
STRING 'b"abc"' (1, 0) (1, 6)
PLUS '+' (1, 7) (1, 8)
STRING 'B"abc"' (1, 9) (1, 15)
""")
self.check_tokenize("br'abc' + bR'abc' + Br'abc' + BR'abc'", """\
STRING "br'abc'" (1, 0) (1, 7)
PLUS '+' (1, 8) (1, 9)
STRING "bR'abc'" (1, 10) (1, 17)
PLUS '+' (1, 18) (1, 19)
STRING "Br'abc'" (1, 20) (1, 27)
PLUS '+' (1, 28) (1, 29)
STRING "BR'abc'" (1, 30) (1, 37)
""")
self.check_tokenize('br"abc" + bR"abc" + Br"abc" + BR"abc"', """\
STRING 'br"abc"' (1, 0) (1, 7)
PLUS '+' (1, 8) (1, 9)
STRING 'bR"abc"' (1, 10) (1, 17)
PLUS '+' (1, 18) (1, 19)
STRING 'Br"abc"' (1, 20) (1, 27)
PLUS '+' (1, 28) (1, 29)
STRING 'BR"abc"' (1, 30) (1, 37)
""")
self.check_tokenize("rb'abc' + rB'abc' + Rb'abc' + RB'abc'", """\
STRING "rb'abc'" (1, 0) (1, 7)
PLUS '+' (1, 8) (1, 9)
STRING "rB'abc'" (1, 10) (1, 17)
PLUS '+' (1, 18) (1, 19)
STRING "Rb'abc'" (1, 20) (1, 27)
PLUS '+' (1, 28) (1, 29)
STRING "RB'abc'" (1, 30) (1, 37)
""")
self.check_tokenize('rb"abc" + rB"abc" + Rb"abc" + RB"abc"', """\
STRING 'rb"abc"' (1, 0) (1, 7)
PLUS '+' (1, 8) (1, 9)
STRING 'rB"abc"' (1, 10) (1, 17)
PLUS '+' (1, 18) (1, 19)
STRING 'Rb"abc"' (1, 20) (1, 27)
PLUS '+' (1, 28) (1, 29)
STRING 'RB"abc"' (1, 30) (1, 37)
""")
self.check_tokenize('"a\\\nde\\\nfg"', """\
STRING '"a\\\\\\nde\\\\\\nfg"\' (1, 0) (3, 3)
""")
self.check_tokenize('u"a\\\nde"', """\
STRING 'u"a\\\\\\nde"\' (1, 0) (2, 3)
""")
self.check_tokenize('rb"a\\\nd"', """\
STRING 'rb"a\\\\\\nd"\' (1, 0) (2, 2)
""")
self.check_tokenize(r'"""a\
b"""', """\
STRING '\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
""")
self.check_tokenize(r'u"""a\
b"""', """\
STRING 'u\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
""")
self.check_tokenize(r'rb"""a\
b\
c"""', """\
STRING 'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
""")
self.check_tokenize('f"abc"', """\
STRING 'f"abc"' (1, 0) (1, 6)
""")
self.check_tokenize('fR"a{b}c"', """\
STRING 'fR"a{b}c"' (1, 0) (1, 9)
""")
self.check_tokenize('f"""abc"""', """\
STRING 'f\"\"\"abc\"\"\"' (1, 0) (1, 10)
""")
self.check_tokenize(r'f"abc\
def"', """\
STRING 'f"abc\\\\\\ndef"' (1, 0) (2, 4)
""")
self.check_tokenize(r'Rf"abc\
def"', """\
STRING 'Rf"abc\\\\\\ndef"' (1, 0) (2, 4)
""")
def test_function(self):
self.check_tokenize('def d22(a, b, c=2, d=2, *k): pass', """\
NAME 'def' (1, 0) (1, 3)
NAME 'd22' (1, 4) (1, 7)
LPAR '(' (1, 7) (1, 8)
NAME 'a' (1, 8) (1, 9)
COMMA ',' (1, 9) (1, 10)
NAME 'b' (1, 11) (1, 12)
COMMA ',' (1, 12) (1, 13)
NAME 'c' (1, 14) (1, 15)
EQUAL '=' (1, 15) (1, 16)
NUMBER '2' (1, 16) (1, 17)
COMMA ',' (1, 17) (1, 18)
NAME 'd' (1, 19) (1, 20)
EQUAL '=' (1, 20) (1, 21)
NUMBER '2' (1, 21) (1, 22)
COMMA ',' (1, 22) (1, 23)
STAR '*' (1, 24) (1, 25)
NAME 'k' (1, 25) (1, 26)
RPAR ')' (1, 26) (1, 27)
COLON ':' (1, 27) (1, 28)
NAME 'pass' (1, 29) (1, 33)
""")
self.check_tokenize('def d01v_(a=1, *k, **w): pass', """\
NAME 'def' (1, 0) (1, 3)
NAME 'd01v_' (1, 4) (1, 9)
LPAR '(' (1, 9) (1, 10)
NAME 'a' (1, 10) (1, 11)
EQUAL '=' (1, 11) (1, 12)
NUMBER '1' (1, 12) (1, 13)
COMMA ',' (1, 13) (1, 14)
STAR '*' (1, 15) (1, 16)
NAME 'k' (1, 16) (1, 17)
COMMA ',' (1, 17) (1, 18)
DOUBLESTAR '**' (1, 19) (1, 21)
NAME 'w' (1, 21) (1, 22)
RPAR ')' (1, 22) (1, 23)
COLON ':' (1, 23) (1, 24)
NAME 'pass' (1, 25) (1, 29)
""")
self.check_tokenize('def d23(a: str, b: int=3) -> int: pass', """\
NAME 'def' (1, 0) (1, 3)
NAME 'd23' (1, 4) (1, 7)
LPAR '(' (1, 7) (1, 8)
NAME 'a' (1, 8) (1, 9)
COLON ':' (1, 9) (1, 10)
NAME 'str' (1, 11) (1, 14)
COMMA ',' (1, 14) (1, 15)
NAME 'b' (1, 16) (1, 17)
COLON ':' (1, 17) (1, 18)
NAME 'int' (1, 19) (1, 22)
EQUAL '=' (1, 22) (1, 23)
NUMBER '3' (1, 23) (1, 24)
RPAR ')' (1, 24) (1, 25)
RARROW '->' (1, 26) (1, 28)
NAME 'int' (1, 29) (1, 32)
COLON ':' (1, 32) (1, 33)
NAME 'pass' (1, 34) (1, 38)
""")
def test_comparison(self):
self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != "
"1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass", """\
NAME 'if' (1, 0) (1, 2)
NUMBER '1' (1, 3) (1, 4)
LESS '<' (1, 5) (1, 6)
NUMBER '1' (1, 7) (1, 8)
GREATER '>' (1, 9) (1, 10)
NUMBER '1' (1, 11) (1, 12)
EQEQUAL '==' (1, 13) (1, 15)
NUMBER '1' (1, 16) (1, 17)
GREATEREQUAL '>=' (1, 18) (1, 20)
NUMBER '5' (1, 21) (1, 22)
LESSEQUAL '<=' (1, 23) (1, 25)
NUMBER '0x15' (1, 26) (1, 30)
LESSEQUAL '<=' (1, 31) (1, 33)
NUMBER '0x12' (1, 34) (1, 38)
NOTEQUAL '!=' (1, 39) (1, 41)
NUMBER '1' (1, 42) (1, 43)
NAME 'and' (1, 44) (1, 47)
NUMBER '5' (1, 48) (1, 49)
NAME 'in' (1, 50) (1, 52)
NUMBER '1' (1, 53) (1, 54)
NAME 'not' (1, 55) (1, 58)
NAME 'in' (1, 59) (1, 61)
NUMBER '1' (1, 62) (1, 63)
NAME 'is' (1, 64) (1, 66)
NUMBER '1' (1, 67) (1, 68)
NAME 'or' (1, 69) (1, 71)
NUMBER '5' (1, 72) (1, 73)
NAME 'is' (1, 74) (1, 76)
NAME 'not' (1, 77) (1, 80)
NUMBER '1' (1, 81) (1, 82)
COLON ':' (1, 82) (1, 83)
NAME 'pass' (1, 84) (1, 88)
""")
def test_additive(self):
self.check_tokenize('x = 1 - y + 15 - 1 + 0x124 + z + a[5]', """\
NAME 'x' (1, 0) (1, 1)
EQUAL '=' (1, 2) (1, 3)
NUMBER '1' (1, 4) (1, 5)
MINUS '-' (1, 6) (1, 7)
NAME 'y' (1, 8) (1, 9)
PLUS '+' (1, 10) (1, 11)
NUMBER '15' (1, 12) (1, 14)
MINUS '-' (1, 15) (1, 16)
NUMBER '1' (1, 17) (1, 18)
PLUS '+' (1, 19) (1, 20)
NUMBER '0x124' (1, 21) (1, 26)
PLUS '+' (1, 27) (1, 28)
NAME 'z' (1, 29) (1, 30)
PLUS '+' (1, 31) (1, 32)
NAME 'a' (1, 33) (1, 34)
LSQB '[' (1, 34) (1, 35)
NUMBER '5' (1, 35) (1, 36)
RSQB ']' (1, 36) (1, 37)
""")
def test_multiplicative(self):
self.check_tokenize('x = 1//1*1/5*12%0x12@42', """\
NAME 'x' (1, 0) (1, 1)
EQUAL '=' (1, 2) (1, 3)
NUMBER '1' (1, 4) (1, 5)
DOUBLESLASH '//' (1, 5) (1, 7)
NUMBER '1' (1, 7) (1, 8)
STAR '*' (1, 8) (1, 9)
NUMBER '1' (1, 9) (1, 10)
SLASH '/' (1, 10) (1, 11)
NUMBER '5' (1, 11) (1, 12)
STAR '*' (1, 12) (1, 13)
NUMBER '12' (1, 13) (1, 15)
PERCENT '%' (1, 15) (1, 16)
NUMBER '0x12' (1, 16) (1, 20)
AT '@' (1, 20) (1, 21)
NUMBER '42' (1, 21) (1, 23)
""")
def test_unary(self):
self.check_tokenize('~1 ^ 1 & 1 |1 ^ -1', """\
TILDE '~' (1, 0) (1, 1)
NUMBER '1' (1, 1) (1, 2)
CIRCUMFLEX '^' (1, 3) (1, 4)
NUMBER '1' (1, 5) (1, 6)
AMPER '&' (1, 7) (1, 8)
NUMBER '1' (1, 9) (1, 10)
VBAR '|' (1, 11) (1, 12)
NUMBER '1' (1, 12) (1, 13)
CIRCUMFLEX '^' (1, 14) (1, 15)
MINUS '-' (1, 16) (1, 17)
NUMBER '1' (1, 17) (1, 18)
""")
self.check_tokenize('-1*1/1+1*1//1 - ---1**1', """\
MINUS '-' (1, 0) (1, 1)
NUMBER '1' (1, 1) (1, 2)
STAR '*' (1, 2) (1, 3)
NUMBER '1' (1, 3) (1, 4)
SLASH '/' (1, 4) (1, 5)
NUMBER '1' (1, 5) (1, 6)
PLUS '+' (1, 6) (1, 7)
NUMBER '1' (1, 7) (1, 8)
STAR '*' (1, 8) (1, 9)
NUMBER '1' (1, 9) (1, 10)
DOUBLESLASH '//' (1, 10) (1, 12)
NUMBER '1' (1, 12) (1, 13)
MINUS '-' (1, 14) (1, 15)
MINUS '-' (1, 16) (1, 17)
MINUS '-' (1, 17) (1, 18)
MINUS '-' (1, 18) (1, 19)
NUMBER '1' (1, 19) (1, 20)
DOUBLESTAR '**' (1, 20) (1, 22)
NUMBER '1' (1, 22) (1, 23)
""")
def test_selector(self):
self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\
NAME 'import' (1, 0) (1, 6)
NAME 'sys' (1, 7) (1, 10)
COMMA ',' (1, 10) (1, 11)
NAME 'time' (1, 12) (1, 16)
NEWLINE '' (1, 16) (1, 16)
NAME 'x' (2, 0) (2, 1)
EQUAL '=' (2, 2) (2, 3)
NAME 'sys' (2, 4) (2, 7)
DOT '.' (2, 7) (2, 8)
NAME 'modules' (2, 8) (2, 15)
LSQB '[' (2, 15) (2, 16)
STRING "'time'" (2, 16) (2, 22)
RSQB ']' (2, 22) (2, 23)
DOT '.' (2, 23) (2, 24)
NAME 'time' (2, 24) (2, 28)
LPAR '(' (2, 28) (2, 29)
RPAR ')' (2, 29) (2, 30)
""")
def test_method(self):
self.check_tokenize('@staticmethod\ndef foo(x,y): pass', """\
AT '@' (1, 0) (1, 1)
NAME 'staticmethod' (1, 1) (1, 13)
NEWLINE '' (1, 13) (1, 13)
NAME 'def' (2, 0) (2, 3)
NAME 'foo' (2, 4) (2, 7)
LPAR '(' (2, 7) (2, 8)
NAME 'x' (2, 8) (2, 9)
COMMA ',' (2, 9) (2, 10)
NAME 'y' (2, 10) (2, 11)
RPAR ')' (2, 11) (2, 12)
COLON ':' (2, 12) (2, 13)
NAME 'pass' (2, 14) (2, 18)
""")
def test_tabs(self):
self.check_tokenize('@staticmethod\ndef foo(x,y): pass', """\
AT '@' (1, 0) (1, 1)
NAME 'staticmethod' (1, 1) (1, 13)
NEWLINE '' (1, 13) (1, 13)
NAME 'def' (2, 0) (2, 3)
NAME 'foo' (2, 4) (2, 7)
LPAR '(' (2, 7) (2, 8)
NAME 'x' (2, 8) (2, 9)
COMMA ',' (2, 9) (2, 10)
NAME 'y' (2, 10) (2, 11)
RPAR ')' (2, 11) (2, 12)
COLON ':' (2, 12) (2, 13)
NAME 'pass' (2, 14) (2, 18)
""")
def test_async(self):
self.check_tokenize('async = 1', """\
ASYNC 'async' (1, 0) (1, 5)
EQUAL '=' (1, 6) (1, 7)
NUMBER '1' (1, 8) (1, 9)
""")
self.check_tokenize('a = (async = 1)', """\
NAME 'a' (1, 0) (1, 1)
EQUAL '=' (1, 2) (1, 3)
LPAR '(' (1, 4) (1, 5)
ASYNC 'async' (1, 5) (1, 10)
EQUAL '=' (1, 11) (1, 12)
NUMBER '1' (1, 13) (1, 14)
RPAR ')' (1, 14) (1, 15)
""")
self.check_tokenize('async()', """\
ASYNC 'async' (1, 0) (1, 5)
LPAR '(' (1, 5) (1, 6)
RPAR ')' (1, 6) (1, 7)
""")
self.check_tokenize('class async(Bar):pass', """\
NAME 'class' (1, 0) (1, 5)
ASYNC 'async' (1, 6) (1, 11)
LPAR '(' (1, 11) (1, 12)
NAME 'Bar' (1, 12) (1, 15)
RPAR ')' (1, 15) (1, 16)
COLON ':' (1, 16) (1, 17)
NAME 'pass' (1, 17) (1, 21)
""")
self.check_tokenize('class async:pass', """\
NAME 'class' (1, 0) (1, 5)
ASYNC 'async' (1, 6) (1, 11)
COLON ':' (1, 11) (1, 12)
NAME 'pass' (1, 12) (1, 16)
""")
self.check_tokenize('await = 1', """\
AWAIT 'await' (1, 0) (1, 5)
EQUAL '=' (1, 6) (1, 7)
NUMBER '1' (1, 8) (1, 9)
""")
self.check_tokenize('foo.async', """\
NAME 'foo' (1, 0) (1, 3)
DOT '.' (1, 3) (1, 4)
ASYNC 'async' (1, 4) (1, 9)
""")
self.check_tokenize('async for a in b: pass', """\
ASYNC 'async' (1, 0) (1, 5)
NAME 'for' (1, 6) (1, 9)
NAME 'a' (1, 10) (1, 11)
NAME 'in' (1, 12) (1, 14)
NAME 'b' (1, 15) (1, 16)
COLON ':' (1, 16) (1, 17)
NAME 'pass' (1, 18) (1, 22)
""")
self.check_tokenize('async with a as b: pass', """\
ASYNC 'async' (1, 0) (1, 5)
NAME 'with' (1, 6) (1, 10)
NAME 'a' (1, 11) (1, 12)
NAME 'as' (1, 13) (1, 15)
NAME 'b' (1, 16) (1, 17)
COLON ':' (1, 17) (1, 18)
NAME 'pass' (1, 19) (1, 23)
""")
self.check_tokenize('async.foo', """\
ASYNC 'async' (1, 0) (1, 5)
DOT '.' (1, 5) (1, 6)
NAME 'foo' (1, 6) (1, 9)
""")
self.check_tokenize('async', """\
ASYNC 'async' (1, 0) (1, 5)
""")
self.check_tokenize('async\n#comment\nawait', """\
ASYNC 'async' (1, 0) (1, 5)
NEWLINE '' (1, 5) (1, 5)
AWAIT 'await' (3, 0) (3, 5)
""")
self.check_tokenize('async\n...\nawait', """\
ASYNC 'async' (1, 0) (1, 5)
NEWLINE '' (1, 5) (1, 5)
ELLIPSIS '...' (2, 0) (2, 3)
NEWLINE '' (2, 3) (2, 3)
AWAIT 'await' (3, 0) (3, 5)
""")
self.check_tokenize('async\nawait', """\
ASYNC 'async' (1, 0) (1, 5)
NEWLINE '' (1, 5) (1, 5)
AWAIT 'await' (2, 0) (2, 5)
""")
self.check_tokenize('foo.async + 1', """\
NAME 'foo' (1, 0) (1, 3)
DOT '.' (1, 3) (1, 4)
ASYNC 'async' (1, 4) (1, 9)
PLUS '+' (1, 10) (1, 11)
NUMBER '1' (1, 12) (1, 13)
""")
self.check_tokenize('async def foo(): pass', """\
ASYNC 'async' (1, 0) (1, 5)
NAME 'def' (1, 6) (1, 9)
NAME 'foo' (1, 10) (1, 13)
LPAR '(' (1, 13) (1, 14)
RPAR ')' (1, 14) (1, 15)
COLON ':' (1, 15) (1, 16)
NAME 'pass' (1, 17) (1, 21)
""")
self.check_tokenize('''\
async def foo():
  def foo(await):
    await = 1
  if 1:
    await
async += 1
''', """\
ASYNC 'async' (1, 0) (1, 5)
NAME 'def' (1, 6) (1, 9)
NAME 'foo' (1, 10) (1, 13)
LPAR '(' (1, 13) (1, 14)
RPAR ')' (1, 14) (1, 15)
COLON ':' (1, 15) (1, 16)
NEWLINE '' (1, 16) (1, 16)
INDENT '' (2, -1) (2, -1)
NAME 'def' (2, 2) (2, 5)
NAME 'foo' (2, 6) (2, 9)
LPAR '(' (2, 9) (2, 10)
AWAIT 'await' (2, 10) (2, 15)
RPAR ')' (2, 15) (2, 16)
COLON ':' (2, 16) (2, 17)
NEWLINE '' (2, 17) (2, 17)
INDENT '' (3, -1) (3, -1)
AWAIT 'await' (3, 4) (3, 9)
EQUAL '=' (3, 10) (3, 11)
NUMBER '1' (3, 12) (3, 13)
NEWLINE '' (3, 13) (3, 13)
DEDENT '' (4, -1) (4, -1)
NAME 'if' (4, 2) (4, 4)
NUMBER '1' (4, 5) (4, 6)
COLON ':' (4, 6) (4, 7)
NEWLINE '' (4, 7) (4, 7)
INDENT '' (5, -1) (5, -1)
AWAIT 'await' (5, 4) (5, 9)
NEWLINE '' (5, 9) (5, 9)
DEDENT '' (6, -1) (6, -1)
DEDENT '' (6, -1) (6, -1)
ASYNC 'async' (6, 0) (6, 5)
PLUSEQUAL '+=' (6, 6) (6, 8)
NUMBER '1' (6, 9) (6, 10)
NEWLINE '' (6, 10) (6, 10)
""")
self.check_tokenize('async def foo():\n  async for i in 1: pass', """\
ASYNC 'async' (1, 0) (1, 5)
NAME 'def' (1, 6) (1, 9)
NAME 'foo' (1, 10) (1, 13)
LPAR '(' (1, 13) (1, 14)
RPAR ')' (1, 14) (1, 15)
COLON ':' (1, 15) (1, 16)
NEWLINE '' (1, 16) (1, 16)
INDENT '' (2, -1) (2, -1)
ASYNC 'async' (2, 2) (2, 7)
NAME 'for' (2, 8) (2, 11)
NAME 'i' (2, 12) (2, 13)
NAME 'in' (2, 14) (2, 16)
NUMBER '1' (2, 17) (2, 18)
COLON ':' (2, 18) (2, 19)
NAME 'pass' (2, 20) (2, 24)
DEDENT '' (2, -1) (2, -1)
""")
self.check_tokenize('async def foo(async): await', """\
ASYNC 'async' (1, 0) (1, 5)
NAME 'def' (1, 6) (1, 9)
NAME 'foo' (1, 10) (1, 13)
LPAR '(' (1, 13) (1, 14)
ASYNC 'async' (1, 14) (1, 19)
RPAR ')' (1, 19) (1, 20)
COLON ':' (1, 20) (1, 21)
AWAIT 'await' (1, 22) (1, 27)
""")
self.check_tokenize('''\
def f():

  def baz(): pass
  async def bar(): pass

  await = 2''', """\
NAME 'def' (1, 0) (1, 3)
NAME 'f' (1, 4) (1, 5)
LPAR '(' (1, 5) (1, 6)
RPAR ')' (1, 6) (1, 7)
COLON ':' (1, 7) (1, 8)
NEWLINE '' (1, 8) (1, 8)
INDENT '' (3, -1) (3, -1)
NAME 'def' (3, 2) (3, 5)
NAME 'baz' (3, 6) (3, 9)
LPAR '(' (3, 9) (3, 10)
RPAR ')' (3, 10) (3, 11)
COLON ':' (3, 11) (3, 12)
NAME 'pass' (3, 13) (3, 17)
NEWLINE '' (3, 17) (3, 17)
ASYNC 'async' (4, 2) (4, 7)
NAME 'def' (4, 8) (4, 11)
NAME 'bar' (4, 12) (4, 15)
LPAR '(' (4, 15) (4, 16)
RPAR ')' (4, 16) (4, 17)
COLON ':' (4, 17) (4, 18)
NAME 'pass' (4, 19) (4, 23)
NEWLINE '' (4, 23) (4, 23)
AWAIT 'await' (6, 2) (6, 7)
EQUAL '=' (6, 8) (6, 9)
NUMBER '2' (6, 10) (6, 11)
DEDENT '' (6, -1) (6, -1)
""")
self.check_tokenize('''\
async def f():

  def baz(): pass
  async def bar(): pass

  await = 2''', """\
ASYNC 'async' (1, 0) (1, 5)
NAME 'def' (1, 6) (1, 9)
NAME 'f' (1, 10) (1, 11)
LPAR '(' (1, 11) (1, 12)
RPAR ')' (1, 12) (1, 13)
COLON ':' (1, 13) (1, 14)
NEWLINE '' (1, 14) (1, 14)
INDENT '' (3, -1) (3, -1)
NAME 'def' (3, 2) (3, 5)
NAME 'baz' (3, 6) (3, 9)
LPAR '(' (3, 9) (3, 10)
RPAR ')' (3, 10) (3, 11)
COLON ':' (3, 11) (3, 12)
NAME 'pass' (3, 13) (3, 17)
NEWLINE '' (3, 17) (3, 17)
ASYNC 'async' (4, 2) (4, 7)
NAME 'def' (4, 8) (4, 11)
NAME 'bar' (4, 12) (4, 15)
LPAR '(' (4, 15) (4, 16)
RPAR ')' (4, 16) (4, 17)
COLON ':' (4, 17) (4, 18)
NAME 'pass' (4, 19) (4, 23)
NEWLINE '' (4, 23) (4, 23)
AWAIT 'await' (6, 2) (6, 7)
EQUAL '=' (6, 8) (6, 9)
NUMBER '2' (6, 10) (6, 11)
DEDENT '' (6, -1) (6, -1)
""")
def test_unicode(self):
self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
NAME 'Örter' (1, 0) (1, 6)
EQUAL '=' (1, 7) (1, 8)
STRING "u'places'" (1, 9) (1, 18)
NEWLINE '' (1, 18) (1, 18)
NAME 'grün' (2, 0) (2, 5)
EQUAL '=' (2, 6) (2, 7)
STRING "U'green'" (2, 8) (2, 16)
""")
def test_invalid_syntax(self):
def get_tokens(string):
return list(_generate_tokens_from_c_tokenizer(string))
self.assertRaises(SyntaxError, get_tokens, "(1+2]")
self.assertRaises(SyntaxError, get_tokens, "(1+2}")
self.assertRaises(SyntaxError, get_tokens, "{1+2]")
self.assertRaises(SyntaxError, get_tokens, "1_")
self.assertRaises(SyntaxError, get_tokens, "1.2_")
self.assertRaises(SyntaxError, get_tokens, "1e2_")
self.assertRaises(SyntaxError, get_tokens, "1e+")
self.assertRaises(SyntaxError, get_tokens, "\xa0")
self.assertRaises(SyntaxError, get_tokens, "")
self.assertRaises(SyntaxError, get_tokens, "0b12")
self.assertRaises(SyntaxError, get_tokens, "0b1_2")
self.assertRaises(SyntaxError, get_tokens, "0b2")
self.assertRaises(SyntaxError, get_tokens, "0b1_")
self.assertRaises(SyntaxError, get_tokens, "0b")
self.assertRaises(SyntaxError, get_tokens, "0o18")
self.assertRaises(SyntaxError, get_tokens, "0o1_8")
self.assertRaises(SyntaxError, get_tokens, "0o8")
self.assertRaises(SyntaxError, get_tokens, "0o1_")
self.assertRaises(SyntaxError, get_tokens, "0o")
self.assertRaises(SyntaxError, get_tokens, "0x1_")
self.assertRaises(SyntaxError, get_tokens, "0x")
self.assertRaises(SyntaxError, get_tokens, "1_")
self.assertRaises(SyntaxError, get_tokens, "012")
self.assertRaises(SyntaxError, get_tokens, "1.2_")
self.assertRaises(SyntaxError, get_tokens, "1e2_")
self.assertRaises(SyntaxError, get_tokens, "1e+")
self.assertRaises(SyntaxError, get_tokens, "'sdfsdf")
self.assertRaises(SyntaxError, get_tokens, "'''sdfsdf''")
self.assertRaises(SyntaxError, get_tokens, "("*1000+"a"+")"*1000)
self.assertRaises(SyntaxError, get_tokens, "]")
if __name__ == "__main__":
unittest.main()
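As test_invalid_syntax above shows, errors from the C tokenizer surface as SyntaxError as soon as the generator is consumed. A small usage sketch of that behaviour (illustrative, not part of the test suite):

    from tokenize import _generate_tokens_from_c_tokenizer

    try:
        list(_generate_tokens_from_c_tokenizer("(1+2]"))  # unbalanced brackets
    except SyntaxError as exc:
        print("rejected:", exc)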


@@ -680,5 +680,13 @@ def main():
perror("unexpected error: %s" % err)
raise
def _generate_tokens_from_c_tokenizer(source):
"""Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
import _tokenize as c_tokenizer
for info in c_tokenizer.TokenizerIter(source):
tok, type, lineno, end_lineno, col_off, end_col_off, line = info
yield TokenInfo(type, tok, (lineno, col_off), (end_lineno, end_col_off), line)
if __name__ == "__main__":
main()
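The helper above only repackages the raw 7-tuples produced by the C iterator into TokenInfo objects. A sketch of what driving the private _tokenize module directly looks like, based solely on the unpacking shown above (illustrative; the supported path is the tokenize-level helper):

    import _tokenize

    for info in _tokenize.TokenizerIter("x = 1"):
        tok, type, lineno, end_lineno, col_off, end_col_off, line = info
        print(type, repr(tok), (lineno, col_off), (end_lineno, end_col_off))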


@@ -339,6 +339,7 @@ PARSER_HEADERS= \
PYTHON_OBJS= \
Python/_warnings.o \
Python/Python-ast.o \
Python/Python-tokenize.o \
Python/asdl.o \
Python/ast.o \
Python/ast_opt.o \


@@ -28,6 +28,7 @@ extern PyObject* PyMarshal_Init(void);
extern PyObject* PyInit__imp(void);
extern PyObject* PyInit_gc(void);
extern PyObject* PyInit__ast(void);
extern PyObject* PyInit__tokenize(void);
extern PyObject* _PyWarnings_Init(void);
extern PyObject* PyInit__string(void);
@@ -44,6 +45,9 @@ struct _inittab _PyImport_Inittab[] = {
/* This lives in Python/Python-ast.c */
{"_ast", PyInit__ast},
/* This lives in Python/Python-tokenize.c */
{"_tokenize", PyInit__tokenize},
/* These entries are here for sys.builtin_module_names */
{"builtins", NULL},
{"sys", NULL},


@@ -72,9 +72,8 @@ extern PyObject* _PyWarnings_Init(void);
extern PyObject* PyInit__string(void);
extern PyObject* PyInit__stat(void);
extern PyObject* PyInit__opcode(void);
extern PyObject* PyInit__contextvars(void);
extern PyObject* PyInit__tokenize(void);
/* tools/freeze/makeconfig.py marker for additional "extern" */
/* -- ADDMODULE MARKER 1 -- */
@@ -83,7 +82,6 @@ extern PyObject* PyMarshal_Init(void);
extern PyObject* PyInit__imp(void);
struct _inittab _PyImport_Inittab[] = {
{"_abc", PyInit__abc},
{"array", PyInit_array},
{"_ast", PyInit__ast},
@@ -105,6 +103,7 @@ struct _inittab _PyImport_Inittab[] = {
{"_blake2", PyInit__blake2},
{"time", PyInit_time},
{"_thread", PyInit__thread},
{"_tokenize", PyInit__tokenize},
{"_typing", PyInit__typing},
{"_statistics", PyInit__statistics},
#ifdef WIN32


@@ -488,6 +488,7 @@
<ClCompile Include="..\Python\pystrtod.c" />
<ClCompile Include="..\Python\dtoa.c" />
<ClCompile Include="..\Python\Python-ast.c" />
<ClCompile Include="..\Python\Python-tokenize.c" />
<ClCompile Include="..\Python\pythonrun.c" />
<ClCompile Include="..\Python\specialize.c" />
<ClCompile Include="..\Python\suggestions.c" />

Python/Python-tokenize.c (new file, 195 lines)

@@ -0,0 +1,195 @@
#include "Python.h"
#include "../Parser/tokenizer.h"
static struct PyModuleDef _tokenizemodule;
typedef struct {
PyTypeObject* TokenizerIter;
} tokenize_state;
static tokenize_state*
get_tokenize_state(PyObject* module)
{
return (tokenize_state*)PyModule_GetState(module);
}
#define _tokenize_get_state_by_type(type) \
get_tokenize_state(_PyType_GetModuleByDef(type, &_tokenizemodule))
#include "clinic/Python-tokenize.c.h"
/*[clinic input]
module _tokenizer
class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/
typedef struct {
PyObject_HEAD
struct tok_state* tok;
} tokenizeriterobject;
/*[clinic input]
@classmethod
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new
source: str
[clinic start generated code]*/
static PyObject *
tokenizeriter_new_impl(PyTypeObject *type, const char *source)
/*[clinic end generated code: output=7fd9f46cf9263cbb input=4384b368407375c6]*/
{
tokenizeriterobject* self = (tokenizeriterobject*)type->tp_alloc(type, 0);
if (self == NULL) {
return NULL;
}
PyObject* filename = PyUnicode_FromString("<string>");
if (filename == NULL) {
return NULL;
}
self->tok = PyTokenizer_FromUTF8(source, 1);
if (self->tok == NULL) {
return NULL;
}
self->tok->filename = filename;
return (PyObject*)self;
}
static PyObject*
tokenizeriter_next(tokenizeriterobject* it)
{
const char* start;
const char* end;
int type = PyTokenizer_Get(it->tok, &start, &end);
if (type == ERRORTOKEN && PyErr_Occurred()) {
return NULL;
}
if (type == ERRORTOKEN || type == ENDMARKER) {
PyErr_SetString(PyExc_StopIteration, "EOF");
return NULL;
}
PyObject* str = NULL;
if (start == NULL || end == NULL) {
str = PyUnicode_FromString("");
} else {
str = PyUnicode_FromStringAndSize(start, end - start);
}
if (str == NULL) {
return NULL;
}
Py_ssize_t size = it->tok->inp - it->tok->buf;
PyObject* line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
if (line == NULL) {
Py_DECREF(str);
return NULL;
}
const char* line_start = type == STRING ? it->tok->multi_line_start : it->tok->line_start;
int lineno = type == STRING ? it->tok->first_lineno : it->tok->lineno;
int end_lineno = it->tok->lineno;
int col_offset = -1;
int end_col_offset = -1;
if (start != NULL && start >= line_start) {
col_offset = (int)(start - line_start);
}
if (end != NULL && end >= it->tok->line_start) {
end_col_offset = (int)(end - it->tok->line_start);
}
return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
}
static void
tokenizeriter_dealloc(tokenizeriterobject* it)
{
PyTypeObject* tp = Py_TYPE(it);
PyTokenizer_Free(it->tok);
tp->tp_free(it);
Py_DECREF(tp);
}
static PyType_Slot tokenizeriter_slots[] = {
{Py_tp_new, tokenizeriter_new},
{Py_tp_dealloc, tokenizeriter_dealloc},
{Py_tp_getattro, PyObject_GenericGetAttr},
{Py_tp_iter, PyObject_SelfIter},
{Py_tp_iternext, tokenizeriter_next},
{0, NULL},
};
static PyType_Spec tokenizeriter_spec = {
.name = "_tokenize.TokenizerIter",
.basicsize = sizeof(tokenizeriterobject),
.flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
.slots = tokenizeriter_slots,
};
static int
tokenizemodule_exec(PyObject* m)
{
tokenize_state* state = get_tokenize_state(m);
if (state == NULL) {
return -1;
}
state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(
m, &tokenizeriter_spec, NULL);
if (state->TokenizerIter == NULL) {
return -1;
}
if (PyModule_AddType(m, state->TokenizerIter) < 0) {
return -1;
}
return 0;
}
static PyMethodDef tokenize_methods[] = {
{NULL, NULL, 0, NULL} /* Sentinel */
};
static PyModuleDef_Slot tokenizemodule_slots[] = {
{Py_mod_exec, tokenizemodule_exec},
{0, NULL}
};
static int
tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
{
tokenize_state *state = get_tokenize_state(m);
Py_VISIT(state->TokenizerIter);
return 0;
}
static int
tokenizemodule_clear(PyObject *m)
{
tokenize_state *state = get_tokenize_state(m);
Py_CLEAR(state->TokenizerIter);
return 0;
}
static void
tokenizemodule_free(void *m)
{
tokenizemodule_clear((PyObject *)m);
}
static struct PyModuleDef _tokenizemodule = {
PyModuleDef_HEAD_INIT,
.m_name = "_tokenize",
.m_size = sizeof(tokenize_state),
.m_slots = tokenizemodule_slots,
.m_methods = tokenize_methods,
.m_traverse = tokenizemodule_traverse,
.m_clear = tokenizemodule_clear,
.m_free = tokenizemodule_free,
};
PyMODINIT_FUNC
PyInit__tokenize(void)
{
return PyModuleDef_Init(&_tokenizemodule);
}
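A note on the iteration protocol implemented by tokenizeriter_next above: the stream ends with StopIteration once ENDMARKER (or EOF) is reached, and tokenizer errors propagate as exceptions. A hedged Python-level sketch of that behaviour:

    import _tokenize

    it = _tokenize.TokenizerIter("pass")
    while True:
        try:
            # 7-tuple: (text, type, lineno, end_lineno, col_offset, end_col_offset, line)
            print(next(it))
        except StopIteration:  # raised by tokenizeriter_next at ENDMARKER
            break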

Python/clinic/Python-tokenize.c.h (generated, 41 lines)

@@ -0,0 +1,41 @@
/*[clinic input]
preserve
[clinic start generated code]*/
static PyObject *
tokenizeriter_new_impl(PyTypeObject *type, const char *source);
static PyObject *
tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
PyObject *return_value = NULL;
static const char * const _keywords[] = {"source", NULL};
static _PyArg_Parser _parser = {NULL, _keywords, "tokenizeriter", 0};
PyObject *argsbuf[1];
PyObject * const *fastargs;
Py_ssize_t nargs = PyTuple_GET_SIZE(args);
const char *source;
fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, 1, 1, 0, argsbuf);
if (!fastargs) {
goto exit;
}
if (!PyUnicode_Check(fastargs[0])) {
_PyArg_BadArgument("tokenizeriter", "argument 'source'", "str", fastargs[0]);
goto exit;
}
Py_ssize_t source_length;
source = PyUnicode_AsUTF8AndSize(fastargs[0], &source_length);
if (source == NULL) {
goto exit;
}
if (strlen(source) != (size_t)source_length) {
PyErr_SetString(PyExc_ValueError, "embedded null character");
goto exit;
}
return_value = tokenizeriter_new_impl(type, source);
exit:
return return_value;
}
/*[clinic end generated code: output=dfcd64774e01bfe6 input=a9049054013a1b77]*/
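The generated code above enforces the constructor's argument contract: source must be a str, passed by position or keyword, and may not contain an embedded NUL. A sketch of the resulting behaviour (illustrative; these cases are not exercised by the tests in this commit):

    import _tokenize

    _tokenize.TokenizerIter(source="x = 1")   # accepted
    try:
        _tokenize.TokenizerIter("x\0= 1")     # hits the embedded-null check above
    except ValueError as exc:
        print(exc)                            # "embedded null character"
    # A non-str argument (e.g. bytes) is rejected with TypeError by _PyArg_BadArgument.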


@@ -80,6 +80,7 @@ static const char* _Py_stdlib_module_names[] = {
"_thread",
"_threading_local",
"_tkinter",
"_tokenize",
"_tracemalloc",
"_typing",
"_uuid",