bpo-33899: Make tokenize module mirror "end-of-file is end-of-line" behavior (GH-7891)
Most of the change involves fixing up the test suite, which previously made the assumption that there wouldn't be a new line if the input didn't end in one. Contributed by Ammar Askar.
This commit is contained in:
parent
3c8aae9ffe
commit
c4ef4896ea
|
@ -1,7 +1,8 @@
|
|||
from test import support
|
||||
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
|
||||
STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
|
||||
open as tokenize_open, Untokenizer, generate_tokens)
|
||||
open as tokenize_open, Untokenizer, generate_tokens,
|
||||
NEWLINE)
|
||||
from io import BytesIO, StringIO
|
||||
import unittest
|
||||
from unittest import TestCase, mock
|
||||
|
@ -11,27 +12,51 @@ import os
|
|||
import token
|
||||
|
||||
|
||||
# Converts a source string into a list of textual representations
|
||||
# of the tokens such as:
|
||||
# ` NAME 'if' (1, 0) (1, 2)`
|
||||
# to make writing tests easier.
|
||||
def stringify_tokens_from_source(token_generator, source_string):
    """Render a token stream as a list of table rows, one string per token.

    token_generator: iterable of 5-tuples (type, string, start, end, line),
        as produced by tokenize() or generate_tokens().
    source_string: the source text the tokens were generated from; used to
        decide whether the last NEWLINE token was added implicitly.

    Returns a list of formatted rows.  Output stops at the first ENDMARKER
    (which is not included).  A NEWLINE token ending on the last source line
    is dropped when the source itself does not end in '\\r' or '\\n', so the
    expected tables in the tests do not have to spell out the implicit
    NEWLINE.
    """
    result = []
    num_lines = len(source_string.splitlines())
    # Guard the empty source: indexing source_string[-1] would raise
    # IndexError, and an empty input has no last line to lack a newline.
    missing_trailing_nl = (bool(source_string)
                           and not source_string.endswith(('\r', '\n')))

    for tok_type, token, start, end, line in token_generator:
        if tok_type == ENDMARKER:
            break
        # Ignore the new line on the last line if the input lacks one
        if missing_trailing_nl and tok_type == NEWLINE and end[0] == num_lines:
            continue
        type_name = tok_name[tok_type]
        result.append(f" {type_name:10} {token!r:13} {start} {end}")

    return result
|
||||
|
||||
class TokenizeTest(TestCase):
|
||||
# Tests for the tokenize module.
|
||||
|
||||
# The tests can be really simple. Given a small fragment of source
|
||||
# code, print out a table with tokens. The ENDMARKER is omitted for
|
||||
# brevity.
|
||||
# code, print out a table with tokens. The ENDMARKER, ENCODING and
|
||||
# final NEWLINE are omitted for brevity.
|
||||
|
||||
def check_tokenize(self, s, expected):
|
||||
# Format the tokens in s in a table format.
|
||||
# The ENDMARKER is omitted.
|
||||
result = []
|
||||
# The ENDMARKER and final NEWLINE are omitted.
|
||||
f = BytesIO(s.encode('utf-8'))
|
||||
for type, token, start, end, line in tokenize(f.readline):
|
||||
if type == ENDMARKER:
|
||||
break
|
||||
type = tok_name[type]
|
||||
result.append(f" {type:10} {token!r:13} {start} {end}")
|
||||
result = stringify_tokens_from_source(tokenize(f.readline), s)
|
||||
|
||||
self.assertEqual(result,
|
||||
[" ENCODING 'utf-8' (0, 0) (0, 0)"] +
|
||||
expected.rstrip().splitlines())
|
||||
|
||||
def test_implicit_newline(self):
|
||||
# Make sure that the tokenizer puts in an implicit NEWLINE
|
||||
# when the input lacks a trailing new line.
|
||||
f = BytesIO("x".encode('utf-8'))
|
||||
tokens = list(tokenize(f.readline))
|
||||
self.assertEqual(tokens[-2].type, NEWLINE)
|
||||
self.assertEqual(tokens[-1].type, ENDMARKER)
|
||||
|
||||
def test_basic(self):
|
||||
self.check_tokenize("1 + 1", """\
|
||||
NUMBER '1' (1, 0) (1, 1)
|
||||
|
@ -922,14 +947,9 @@ async def f():
|
|||
class GenerateTokensTest(TokenizeTest):
|
||||
def check_tokenize(self, s, expected):
|
||||
# Format the tokens in s in a table format.
|
||||
# The ENDMARKER is omitted.
|
||||
result = []
|
||||
# The ENDMARKER and final NEWLINE are omitted.
|
||||
f = StringIO(s)
|
||||
for type, token, start, end, line in generate_tokens(f.readline):
|
||||
if type == ENDMARKER:
|
||||
break
|
||||
type = tok_name[type]
|
||||
result.append(f" {type:10} {token!r:13} {start} {end}")
|
||||
result = stringify_tokens_from_source(generate_tokens(f.readline), s)
|
||||
self.assertEqual(result, expected.rstrip().splitlines())
|
||||
|
||||
|
||||
|
@ -1022,8 +1042,8 @@ class Test_Tokenize(TestCase):
|
|||
else:
|
||||
return b''
|
||||
|
||||
# skip the initial encoding token and the end token
|
||||
tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
|
||||
# skip the initial encoding token and the end tokens
|
||||
tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2]
|
||||
expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
|
||||
self.assertEqual(tokens, expected_tokens,
|
||||
"bytes not decoded with encoding")
|
||||
|
@ -1039,8 +1059,8 @@ class Test_Tokenize(TestCase):
|
|||
else:
|
||||
return b''
|
||||
|
||||
# skip the end token
|
||||
tokens = list(_tokenize(readline, encoding=None))[:-1]
|
||||
# skip the end tokens
|
||||
tokens = list(_tokenize(readline, encoding=None))[:-2]
|
||||
expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
|
||||
self.assertEqual(tokens, expected_tokens,
|
||||
"string not tokenized when encoding is None")
|
||||
|
@ -1351,18 +1371,21 @@ class TestTokenize(TestCase):
|
|||
|
||||
# Test that 500 consecutive, one-line defs are OK
|
||||
toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
|
||||
self.assertEqual(toks[-2].string, 'OK') # [-1] is always ENDMARKER
|
||||
self.assertEqual(toks[-3].string, 'OK') # [-1] is always ENDMARKER
|
||||
# [-2] is always NEWLINE
|
||||
|
||||
def assertExactTypeEqual(self, opstr, *optypes):
|
||||
tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
|
||||
num_optypes = len(optypes)
|
||||
self.assertEqual(len(tokens), 2 + num_optypes)
|
||||
self.assertEqual(len(tokens), 3 + num_optypes)
|
||||
self.assertEqual(tok_name[tokens[0].exact_type],
|
||||
tok_name[ENCODING])
|
||||
for i in range(num_optypes):
|
||||
self.assertEqual(tok_name[tokens[i + 1].exact_type],
|
||||
tok_name[optypes[i]])
|
||||
self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type],
|
||||
tok_name[token.NEWLINE])
|
||||
self.assertEqual(tok_name[tokens[2 + num_optypes].exact_type],
|
||||
tok_name[token.ENDMARKER])
|
||||
|
||||
def test_exact_type(self):
|
||||
|
@ -1515,7 +1538,7 @@ class TestRoundtrip(TestCase):
|
|||
self.check_roundtrip("if x == 1:\n"
|
||||
" print(x)\n")
|
||||
self.check_roundtrip("# This is a comment\n"
|
||||
"# This also")
|
||||
"# This also\n")
|
||||
|
||||
# Some people use different formatting conventions, which makes
|
||||
# untokenize a little trickier. Note that this test involves trailing
|
||||
|
|
|
@ -492,8 +492,15 @@ def _tokenize(readline, encoding):
|
|||
# BOM will already have been stripped.
|
||||
encoding = "utf-8"
|
||||
yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
|
||||
last_line = b''
|
||||
line = b''
|
||||
while True: # loop over lines in stream
|
||||
try:
|
||||
# We capture the value of the line variable here because
|
||||
# readline uses the empty string '' to signal end of input,
|
||||
# hence `line` itself will always be overwritten at the end
|
||||
# of this loop.
|
||||
last_line = line
|
||||
line = readline()
|
||||
except StopIteration:
|
||||
line = b''
|
||||
|
@ -648,6 +655,9 @@ def _tokenize(readline, encoding):
|
|||
(lnum, pos), (lnum, pos+1), line)
|
||||
pos += 1
|
||||
|
||||
# Add an implicit NEWLINE if the input doesn't end in one
|
||||
if last_line and last_line[-1] not in '\r\n':
|
||||
yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
|
||||
for indent in indents[1:]: # pop remaining indent levels
|
||||
yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
|
||||
yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
Tokenize module now implicitly emits a NEWLINE when provided with input that
|
||||
does not have a trailing new line. This behavior now matches what the C
|
||||
tokenizer does internally. Contributed by Ammar Askar.
|
Loading…
Reference in New Issue