From b2729e93e9d73503b1fda4ea4fecd77c58909091 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 7 Sep 2023 17:00:13 +0300 Subject: [PATCH] gh-88943: Improve syntax error for non-ASCII character that follows a numerical literal (GH-109081) It now points to the invalid non-ASCII character, not to the valid numerical literal. --- Lib/test/test_grammar.py | 4 ++++ .../2023-09-07-16-05-36.gh-issue-88943.rH_X3W.rst | 3 +++ Parser/tokenizer.c | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2023-09-07-16-05-36.gh-issue-88943.rH_X3W.rst diff --git a/Lib/test/test_grammar.py b/Lib/test/test_grammar.py index 7c15a23a691..8501006b799 100644 --- a/Lib/test/test_grammar.py +++ b/Lib/test/test_grammar.py @@ -236,6 +236,10 @@ class TokenTests(unittest.TestCase): check(f"[{num}for x in ()]") check(f"{num}spam", error=True) + # gh-88943: Invalid non-ASCII character following a numerical literal. + with self.assertRaisesRegex(SyntaxError, r"invalid character '⁄' \(U\+2044\)"): + compile(f"{num}⁄7", "", "eval") + with self.assertWarnsRegex(SyntaxWarning, r'invalid \w+ literal'): compile(f"{num}is x", "", "eval") with warnings.catch_warnings(): diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-09-07-16-05-36.gh-issue-88943.rH_X3W.rst b/Misc/NEWS.d/next/Core and Builtins/2023-09-07-16-05-36.gh-issue-88943.rH_X3W.rst new file mode 100644 index 00000000000..a99830fe422 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2023-09-07-16-05-36.gh-issue-88943.rH_X3W.rst @@ -0,0 +1,3 @@ +Improve syntax error for non-ASCII character that follows a numerical +literal. It now points on the invalid non-ASCII character, not on the valid +numerical literal. 
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 6ec24895785..46b7159ff05 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1642,7 +1642,7 @@ verify_end_of_number(struct tok_state *tok, int c, const char *kind) { tok_nextc(tok); } else /* In future releases, only error will remain. */ - if (is_potential_identifier_char(c)) { + if (c < 128 && is_potential_identifier_char(c)) { tok_backup(tok, c); syntaxerror(tok, "invalid %s literal", kind); return 0;