mirror of https://github.com/python/cpython
gh-105390: Correctly raise TokenError instead of SyntaxError for tokenize errors (#105399)
This commit is contained in:
parent
27c68a6d8f
commit
ffd2654550
|
@ -139,11 +139,6 @@ function it uses to do this is available:
|
|||
2,
|
||||
3
|
||||
|
||||
Note that unclosed single-quoted strings do not cause an error to be
|
||||
raised. They are tokenized as :data:`~token.ERRORTOKEN`, followed by the
|
||||
tokenization of their contents.
|
||||
|
||||
|
||||
.. _tokenize-cli:
|
||||
|
||||
Command-Line Usage
|
||||
|
|
|
@ -1490,14 +1490,15 @@ Changes in the Python API
|
|||
Additionally, there may be some minor behavioral changes as a consequence of the
|
||||
changes required to support :pep:`701`. Some of these changes include:
|
||||
|
||||
* Some final ``DEDENT`` tokens are now emitted within the bounds of the
|
||||
input. This means that for a file containing 3 lines, the old version of the
|
||||
tokenizer returned a ``DEDENT`` token in line 4 whilst the new version returns
|
||||
the token in line 3.
|
||||
|
||||
* The ``type`` attribute of the tokens emitted when tokenizing some invalid Python
|
||||
characters such as ``!`` has changed from ``ERRORTOKEN`` to ``OP``.
|
||||
|
||||
* Incomplete single-line strings now also raise :exc:`tokenize.TokenError` as incomplete
|
||||
multiline strings do.
|
||||
|
||||
* Some incomplete or invalid Python code now raises :exc:`tokenize.TokenError` instead of
|
||||
returning arbitrary ``ERRORTOKEN`` tokens when tokenizing it.
|
||||
|
||||
Build Changes
|
||||
=============
|
||||
|
||||
|
|
|
@ -3,7 +3,8 @@ from test.support import os_helper
|
|||
from tokenize import (tokenize, untokenize, NUMBER, NAME, OP,
|
||||
STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
|
||||
open as tokenize_open, Untokenizer, generate_tokens,
|
||||
NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo)
|
||||
NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo,
|
||||
TokenError)
|
||||
from io import BytesIO, StringIO
|
||||
import unittest
|
||||
from textwrap import dedent
|
||||
|
@ -286,7 +287,7 @@ def k(x):
|
|||
for lit in INVALID_UNDERSCORE_LITERALS:
|
||||
try:
|
||||
number_token(lit)
|
||||
except SyntaxError:
|
||||
except TokenError:
|
||||
continue
|
||||
self.assertNotEqual(number_token(lit), lit)
|
||||
|
||||
|
@ -1379,7 +1380,7 @@ class TestDetectEncoding(TestCase):
|
|||
self.assertEqual(found, "iso-8859-1")
|
||||
|
||||
def test_syntaxerror_latin1(self):
|
||||
# Issue 14629: need to raise SyntaxError if the first
|
||||
# Issue 14629: need to raise TokenError if the first
|
||||
# line(s) have non-UTF-8 characters
|
||||
lines = (
|
||||
b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
|
||||
|
@ -2754,7 +2755,7 @@ async def f():
|
|||
"]",
|
||||
]:
|
||||
with self.subTest(case=case):
|
||||
self.assertRaises(SyntaxError, get_tokens, case)
|
||||
self.assertRaises(TokenError, get_tokens, case)
|
||||
|
||||
def test_max_indent(self):
|
||||
MAXINDENT = 100
|
||||
|
@ -2773,7 +2774,7 @@ async def f():
|
|||
|
||||
invalid = generate_source(MAXINDENT)
|
||||
the_input = StringIO(invalid)
|
||||
self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
|
||||
self.assertRaises(IndentationError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
|
||||
self.assertRaises(
|
||||
IndentationError, compile, invalid, "<string>", "exec"
|
||||
)
|
||||
|
|
|
@ -517,14 +517,30 @@ def main():
|
|||
perror("unexpected error: %s" % err)
|
||||
raise
|
||||
|
||||
def _transform_msg(msg):
|
||||
"""Transform error messages from the C tokenizer into the Python tokenize
|
||||
|
||||
The C tokenizer is more picky than the Python one, so we need to massage
|
||||
the error messages a bit for backwards compatibility.
|
||||
"""
|
||||
if "unterminated triple-quoted string literal" in msg:
|
||||
return "EOF in multi-line string"
|
||||
return msg
|
||||
|
||||
def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False):
|
||||
"""Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
|
||||
if encoding is None:
|
||||
it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens)
|
||||
else:
|
||||
it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens)
|
||||
for info in it:
|
||||
yield TokenInfo._make(info)
|
||||
try:
|
||||
for info in it:
|
||||
yield TokenInfo._make(info)
|
||||
except SyntaxError as e:
|
||||
if type(e) != SyntaxError:
|
||||
raise e from None
|
||||
msg = _transform_msg(e.msg)
|
||||
raise TokenError(msg, (e.lineno, e.offset)) from None
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
Correctly raise :exc:`tokenize.TokenError` exceptions instead of
|
||||
:exc:`SyntaxError` for tokenize errors such as incomplete input. Patch by
|
||||
Pablo Galindo
|
|
@ -84,13 +84,8 @@ _tokenizer_error(struct tok_state *tok)
|
|||
msg = "invalid token";
|
||||
break;
|
||||
case E_EOF:
|
||||
if (tok->level > 0) {
|
||||
PyErr_Format(PyExc_SyntaxError,
|
||||
"parenthesis '%c' was never closed",
|
||||
tok->parenstack[tok->level-1]);
|
||||
} else {
|
||||
PyErr_SetString(PyExc_SyntaxError, "unexpected EOF while parsing");
|
||||
}
|
||||
PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement");
|
||||
PyErr_SyntaxLocationObject(tok->filename, tok->lineno, tok->inp - tok->buf < 0 ? 0 : tok->inp - tok->buf);
|
||||
return -1;
|
||||
case E_DEDENT:
|
||||
msg = "unindent does not match any outer indentation level";
|
||||
|
|
Loading…
Reference in New Issue