diff --git a/Lib/idlelib/idle_test/test_editor.py b/Lib/idlelib/idle_test/test_editor.py
index ba59c40dc6d..9296a6d235f 100644
--- a/Lib/idlelib/idle_test/test_editor.py
+++ b/Lib/idlelib/idle_test/test_editor.py
@@ -201,8 +201,8 @@ class IndentSearcherTest(unittest.TestCase):
         test_info = (# text, (block, indent))
                      ("", (None, None)),
                      ("[1,", (None, None)),  # TokenError
-                     ("if 1:\n", ('if 1:', None)),
-                     ("if 1:\n  2\n 3\n", ('if 1:', '  2')),
+                     ("if 1:\n", ('if 1:\n', None)),
+                     ("if 1:\n  2\n 3\n", ('if 1:\n', '  2\n')),
                      )
         for code, expected_pair in test_info:
             with self.subTest(code=code):
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 251ce2b864a..0b7c25838d6 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1174,7 +1174,7 @@ class Test_Tokenize(TestCase):
 
         # skip the initial encoding token and the end tokens
         tokens = list(_tokenize(readline(), encoding='utf-8'))[:-2]
-        expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
+        expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')]
         self.assertEqual(tokens, expected_tokens,
                          "bytes not decoded with encoding")
 
@@ -1657,7 +1657,6 @@ class TestRoundtrip(TestCase):
             code = f.encode('utf-8')
         else:
             code = f.read()
-            f.close()
         readline = iter(code.splitlines(keepends=True)).__next__
         tokens5 = list(tokenize(readline))
         tokens2 = [tok[:2] for tok in tokens5]
@@ -1672,6 +1671,17 @@ class TestRoundtrip(TestCase):
         tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
         self.assertEqual(tokens2_from5, tokens2)
 
+    def check_line_extraction(self, f):
+        if isinstance(f, str):
+            code = f.encode('utf-8')
+        else:
+            code = f.read()
+        readline = iter(code.splitlines(keepends=True)).__next__
+        for tok in tokenize(readline):
+            if tok.type in {ENCODING, ENDMARKER}:
+                continue
+            self.assertEqual(tok.string, tok.line[tok.start[1]: tok.end[1]])
+
     def test_roundtrip(self):
         # There are some standard formatting practices that are easy to get right.
 
@@ -1768,6 +1778,7 @@ class TestRoundtrip(TestCase):
             with open(testfile, 'rb') as f:
                 # with self.subTest(file=testfile):
                     self.check_roundtrip(f)
+                    self.check_line_extraction(f)
 
 
     def roundtrip(self, code):
diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-05-26-14-09-47.gh-issue-104972.El2UjE.rst b/Misc/NEWS.d/next/Core and Builtins/2023-05-26-14-09-47.gh-issue-104972.El2UjE.rst
new file mode 100644
index 00000000000..05d50c108c7
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2023-05-26-14-09-47.gh-issue-104972.El2UjE.rst
@@ -0,0 +1,2 @@
+Ensure that the ``line`` attribute in :class:`tokenize.TokenInfo` objects in
+the :mod:`tokenize` module is always correct. Patch by Pablo Galindo.
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
index 0023e303b96..88087c12562 100644
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -194,15 +194,14 @@ tokenizeriter_next(tokenizeriterobject *it)
         goto exit;
     }
 
-    Py_ssize_t size = it->tok->inp - it->tok->buf;
-    assert(it->tok->buf[size-1] == '\n');
-    size -= 1; // Remove the newline character from the end of the line
-    PyObject *line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
+    const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
+    Py_ssize_t size = it->tok->inp - line_start;
+    PyObject *line = PyUnicode_DecodeUTF8(line_start, size, "replace");
     if (line == NULL) {
         Py_DECREF(str);
         goto exit;
     }
-    const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
+
     Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
     Py_ssize_t end_lineno = it->tok->lineno;
     Py_ssize_t col_offset = -1;
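
The new check_line_extraction helper captures the invariant this patch establishes: a token's string can be recovered by slicing its `line` attribute with the token's column offsets. A minimal standalone sketch of that invariant follows (illustrative only, not part of the patch; assumes an interpreter built with this fix applied):

    import io
    from tokenize import tokenize, ENCODING, ENDMARKER

    # A small snippet with only single-line tokens; with this fix each
    # token's `line` holds the exact source line, trailing newline included.
    code = b'if 1:\n    x = "hello"\n'
    for tok in tokenize(io.BytesIO(code).readline):
        if tok.type in {ENCODING, ENDMARKER}:
            continue
        # Slicing `line` by the token's column offsets reproduces the
        # token's string, which is what check_line_extraction asserts.
        assert tok.string == tok.line[tok.start[1]:tok.end[1]]

For string literals, the C change above starts `line` at the tokenizer's multi_line_start rather than the current line, so a multi-line string's `line` attribute carries its full source span instead of only the final line.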