mirror of https://github.com/python/cpython
gh-104972: Ensure that line attributes in tokens in the tokenize module are correct (#104975)
parent 2cb445635e
commit 3fdb55c482
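As a quick illustration of the invariant this commit enforces (a minimal sketch; the sample source and variable names are mine, not part of the patch): every token's text should be recoverable by slicing its `line` attribute with the token's start and end columns, which is exactly what the new test below checks.

    import io
    import tokenize

    # Illustrative source; any snippet should satisfy the invariant.
    source = b"x = 1\nif x:\n    y = 2\n"
    for tok in tokenize.tokenize(io.BytesIO(source).readline):
        if tok.type in {tokenize.ENCODING, tokenize.ENDMARKER}:
            continue  # these synthetic tokens carry no source line
        # The token text must be recoverable from its `line` attribute.
        assert tok.string == tok.line[tok.start[1]:tok.end[1]]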
@@ -201,8 +201,8 @@ class IndentSearcherTest(unittest.TestCase):
         test_info = (# text, (block, indent))
             ("", (None, None)),
             ("[1,", (None, None)),  # TokenError
-            ("if 1:\n", ('if 1:', None)),
-            ("if 1:\n 2\n 3\n", ('if 1:', ' 2')),
+            ("if 1:\n", ('if 1:\n', None)),
+            ("if 1:\n 2\n 3\n", ('if 1:\n', ' 2\n')),
         )
         for code, expected_pair in test_info:
             with self.subTest(code=code):
@@ -1174,7 +1174,7 @@ class Test_Tokenize(TestCase):

         # skip the initial encoding token and the end tokens
         tokens = list(_tokenize(readline(), encoding='utf-8'))[:-2]
-        expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
+        expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')]
         self.assertEqual(tokens, expected_tokens,
                          "bytes not decoded with encoding")

@@ -1657,7 +1657,6 @@ class TestRoundtrip(TestCase):
             code = f.encode('utf-8')
         else:
             code = f.read()
-            f.close()
         readline = iter(code.splitlines(keepends=True)).__next__
         tokens5 = list(tokenize(readline))
         tokens2 = [tok[:2] for tok in tokens5]
@@ -1672,6 +1671,17 @@ class TestRoundtrip(TestCase):
         tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
         self.assertEqual(tokens2_from5, tokens2)

+    def check_line_extraction(self, f):
+        if isinstance(f, str):
+            code = f.encode('utf-8')
+        else:
+            code = f.read()
+        readline = iter(code.splitlines(keepends=True)).__next__
+        for tok in tokenize(readline):
+            if tok.type in {ENCODING, ENDMARKER}:
+                continue
+            self.assertEqual(tok.string, tok.line[tok.start[1]: tok.end[1]])
+
     def test_roundtrip(self):
         # There are some standard formatting practices that are easy to get right.

@@ -1768,6 +1778,7 @@ class TestRoundtrip(TestCase):
                 with open(testfile, 'rb') as f:
                     # with self.subTest(file=testfile):
                     self.check_roundtrip(f)
+                    self.check_line_extraction(f)


     def roundtrip(self, code):
@@ -0,0 +1,2 @@
+Ensure that the ``line`` attribute in :class:`tokenize.TokenInfo` objects in
+the :mod:`tokenize` module is always correct. Patch by Pablo Galindo
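Mirroring the updated expectation in the test hunk above, the `line` attribute now keeps its trailing newline. A small hedged check (the literal used here is illustrative, not from the patch):

    import io
    import tokenize

    toks = list(tokenize.tokenize(io.BytesIO(b'"abc"\n').readline))
    # toks[0] is the ENCODING token; toks[1] is the STRING token.
    print(repr(toks[1].line))  # expected: '"abc"\n', newline included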
@@ -194,15 +194,14 @@ tokenizeriter_next(tokenizeriterobject *it)
         goto exit;
     }

-    Py_ssize_t size = it->tok->inp - it->tok->buf;
-    assert(it->tok->buf[size-1] == '\n');
-    size -= 1; // Remove the newline character from the end of the line
-    PyObject *line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
+    const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
+    Py_ssize_t size = it->tok->inp - line_start;
+    PyObject *line = PyUnicode_DecodeUTF8(line_start, size, "replace");
     if (line == NULL) {
         Py_DECREF(str);
         goto exit;
     }
-    const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
+
     Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
     Py_ssize_t end_lineno = it->tok->lineno;
     Py_ssize_t col_offset = -1;
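The C change above computes `line` from `multi_line_start` for string literals, so for a multi-line string the `line` attribute should cover every physical line of the literal rather than just the last one. A sketch of that behavior, assuming I read the new code correctly (the sample source is mine):

    import io
    import tokenize

    src = b's = """a\nb"""\n'
    toks = list(tokenize.tokenize(io.BytesIO(src).readline))
    string_tok = next(t for t in toks if t.type == tokenize.STRING)
    # `line` spans from the line where the literal starts to where it ends:
    print(repr(string_tok.line))  # expected: 's = """a\nb"""\n'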