Issue #21765: Add support for non-ascii identifiers to HyperParser
This commit is contained in:
commit
2e4394ee0d
|
@ -6,11 +6,24 @@ the structure of code.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import string
|
import string
|
||||||
import keyword
|
from keyword import iskeyword
|
||||||
from idlelib import PyParse
|
from idlelib import PyParse
|
||||||
|
|
||||||
class HyperParser:
|
|
||||||
|
|
||||||
|
# all ASCII chars that may be in an identifier
|
||||||
|
_ASCII_ID_CHARS = frozenset(string.ascii_letters + string.digits + "_")
|
||||||
|
# all ASCII chars that may be the first char of an identifier
|
||||||
|
_ASCII_ID_FIRST_CHARS = frozenset(string.ascii_letters + "_")
|
||||||
|
|
||||||
|
# lookup table for whether 7-bit ASCII chars are valid in a Python identifier
|
||||||
|
_IS_ASCII_ID_CHAR = [(chr(x) in _ASCII_ID_CHARS) for x in range(128)]
|
||||||
|
# lookup table for whether 7-bit ASCII chars are valid as the first
|
||||||
|
# char in a Python identifier
|
||||||
|
_IS_ASCII_ID_FIRST_CHAR = \
|
||||||
|
[(chr(x) in _ASCII_ID_FIRST_CHARS) for x in range(128)]
|
||||||
|
|
||||||
|
|
||||||
|
class HyperParser:
|
||||||
def __init__(self, editwin, index):
|
def __init__(self, editwin, index):
|
||||||
"To initialize, analyze the surroundings of the given index."
|
"To initialize, analyze the surroundings of the given index."
|
||||||
|
|
||||||
|
@ -143,26 +156,70 @@ class HyperParser:
|
||||||
|
|
||||||
return beforeindex, afterindex
|
return beforeindex, afterindex
|
||||||
|
|
||||||
# Ascii chars that may be in a white space
|
# the set of built-in identifiers which are also keywords,
|
||||||
_whitespace_chars = " \t\n\\"
|
# i.e. keyword.iskeyword() returns True for them
|
||||||
# Ascii chars that may be in an identifier
|
_ID_KEYWORDS = frozenset({"True", "False", "None"})
|
||||||
_id_chars = string.ascii_letters + string.digits + "_"
|
|
||||||
# Ascii chars that may be the first char of an identifier
|
|
||||||
_id_first_chars = string.ascii_letters + "_"
|
|
||||||
|
|
||||||
# Given a string and pos, return the number of chars in the
|
@classmethod
|
||||||
# identifier which ends at pos, or 0 if there is no such one. Saved
|
def _eat_identifier(cls, str, limit, pos):
|
||||||
# words are not identifiers.
|
"""Given a string and pos, return the number of chars in the
|
||||||
def _eat_identifier(self, str, limit, pos):
|
identifier which ends at pos, or 0 if there is no such one.
|
||||||
|
|
||||||
|
Keywords are not considered identifiers here, except for True, False and None.
|
||||||
|
"""
|
||||||
|
is_ascii_id_char = _IS_ASCII_ID_CHAR
|
||||||
|
|
||||||
|
# Start at the end (pos) and work backwards.
|
||||||
i = pos
|
i = pos
|
||||||
while i > limit and str[i-1] in self._id_chars:
|
|
||||||
|
# Go backwards as long as the characters are valid ASCII
|
||||||
|
# identifier characters. This is an optimization, since it
|
||||||
|
# is faster in the common case where most of the characters
|
||||||
|
# are ASCII.
|
||||||
|
while i > limit and (
|
||||||
|
ord(str[i - 1]) < 128 and
|
||||||
|
is_ascii_id_char[ord(str[i - 1])]
|
||||||
|
):
|
||||||
i -= 1
|
i -= 1
|
||||||
if (i < pos and (str[i] not in self._id_first_chars or
|
|
||||||
(keyword.iskeyword(str[i:pos]) and
|
# If the above loop ended due to reaching a non-ASCII
|
||||||
str[i:pos] not in {'None', 'False', 'True'}))):
|
# character, continue going backwards using the most generic
|
||||||
i = pos
|
# test for whether a string contains only valid identifier
|
||||||
|
# characters.
|
||||||
|
if i > limit and ord(str[i - 1]) >= 128:
|
||||||
|
while i - 4 >= limit and ('a' + str[i - 4:pos]).isidentifier():
|
||||||
|
i -= 4
|
||||||
|
if i - 2 >= limit and ('a' + str[i - 2:pos]).isidentifier():
|
||||||
|
i -= 2
|
||||||
|
if i - 1 >= limit and ('a' + str[i - 1:pos]).isidentifier():
|
||||||
|
i -= 1
|
||||||
|
|
||||||
|
# The identifier candidate starts here. If it isn't a valid
|
||||||
|
# identifier, don't eat anything. At this point that is only
|
||||||
|
# possible if the first character isn't a valid first
|
||||||
|
# character for an identifier.
|
||||||
|
if not str[i:pos].isidentifier():
|
||||||
|
return 0
|
||||||
|
elif i < pos:
|
||||||
|
# All characters in str[i:pos] are valid ASCII identifier
|
||||||
|
# characters, so it is enough to check that the first is
|
||||||
|
# valid as the first character of an identifier.
|
||||||
|
if not _IS_ASCII_ID_FIRST_CHAR[ord(str[i])]:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
# All keywords are valid identifiers, but should not be
|
||||||
|
# considered identifiers here, except for True, False and None.
|
||||||
|
if i < pos and (
|
||||||
|
iskeyword(str[i:pos]) and
|
||||||
|
str[i:pos] not in cls._ID_KEYWORDS
|
||||||
|
):
|
||||||
|
return 0
|
||||||
|
|
||||||
return pos - i
|
return pos - i
|
||||||
|
|
||||||
|
# This string includes all chars that may be in a white space
|
||||||
|
_whitespace_chars = " \t\n\\"
|
||||||
|
|
||||||
def get_expression(self):
|
def get_expression(self):
|
||||||
"""Return a string with the Python expression which ends at the
|
"""Return a string with the Python expression which ends at the
|
||||||
given index, which is empty if there is no real one.
|
given index, which is empty if there is no real one.
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
from collections import Mapping
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
# Reason last stmt is continued (or C_NONE if it's not).
|
# Reason last stmt is continued (or C_NONE if it's not).
|
||||||
(C_NONE, C_BACKSLASH, C_STRING_FIRST_LINE,
|
(C_NONE, C_BACKSLASH, C_STRING_FIRST_LINE,
|
||||||
|
@ -91,19 +93,48 @@ _chew_ordinaryre = re.compile(r"""
|
||||||
[^[\](){}#'"\\]+
|
[^[\](){}#'"\\]+
|
||||||
""", re.VERBOSE).match
|
""", re.VERBOSE).match
|
||||||
|
|
||||||
# Build translation table to map uninteresting chars to "x", open
|
|
||||||
# brackets to "(", and close brackets to ")".
|
|
||||||
|
|
||||||
_tran = {}
|
class StringTranslatePseudoMapping(Mapping):
|
||||||
for i in range(256):
|
r"""Utility class to be used with str.translate()
|
||||||
_tran[i] = 'x'
|
|
||||||
for ch in "({[":
|
This Mapping class wraps a given dict. When a value for a key is
|
||||||
_tran[ord(ch)] = '('
|
requested via __getitem__() or get(), the key is looked up in the
|
||||||
for ch in ")}]":
|
given dict. If found there, the value from the dict is returned.
|
||||||
_tran[ord(ch)] = ')'
|
Otherwise, the default value given upon initialization is returned.
|
||||||
for ch in "\"'\\\n#":
|
|
||||||
_tran[ord(ch)] = ch
|
This allows using str.translate() to make some replacements, and to
|
||||||
del i, ch
|
replace all characters for which no replacement was specified with
|
||||||
|
a given character instead of leaving them as-is.
|
||||||
|
|
||||||
|
For example, to replace everything except whitespace with 'x':
|
||||||
|
|
||||||
|
>>> whitespace_chars = ' \t\n\r'
|
||||||
|
>>> preserve_dict = {ord(c): ord(c) for c in whitespace_chars}
|
||||||
|
>>> mapping = StringTranslatePseudoMapping(preserve_dict, ord('x'))
|
||||||
|
>>> text = "a + b\tc\nd"
|
||||||
|
>>> text.translate(mapping)
|
||||||
|
'x x x\tx\nx'
|
||||||
|
"""
|
||||||
|
def __init__(self, non_defaults, default_value):
|
||||||
|
self._non_defaults = non_defaults
|
||||||
|
self._default_value = default_value
|
||||||
|
|
||||||
|
def _get(key, _get=non_defaults.get, _default=default_value):
|
||||||
|
return _get(key, _default)
|
||||||
|
self._get = _get
|
||||||
|
|
||||||
|
def __getitem__(self, item):
|
||||||
|
return self._get(item)
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return len(self._non_defaults)
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
return iter(self._non_defaults)
|
||||||
|
|
||||||
|
def get(self, key, default=None):
|
||||||
|
return self._get(key)
|
||||||
|
|
||||||
|
|
||||||
class Parser:
|
class Parser:
|
||||||
|
|
||||||
|
@ -113,19 +144,6 @@ class Parser:
|
||||||
|
|
||||||
def set_str(self, s):
|
def set_str(self, s):
|
||||||
assert len(s) == 0 or s[-1] == '\n'
|
assert len(s) == 0 or s[-1] == '\n'
|
||||||
if isinstance(s, str):
|
|
||||||
# The parse functions have no idea what to do with Unicode, so
|
|
||||||
# replace all Unicode characters with "x". This is "safe"
|
|
||||||
# so long as the only characters germane to parsing the structure
|
|
||||||
# of Python are 7-bit ASCII. It's *necessary* because Unicode
|
|
||||||
# strings don't have a .translate() method that supports
|
|
||||||
# deletechars.
|
|
||||||
uniphooey = s
|
|
||||||
s = []
|
|
||||||
push = s.append
|
|
||||||
for raw in map(ord, uniphooey):
|
|
||||||
push(raw < 127 and chr(raw) or "x")
|
|
||||||
s = "".join(s)
|
|
||||||
self.str = s
|
self.str = s
|
||||||
self.study_level = 0
|
self.study_level = 0
|
||||||
|
|
||||||
|
@ -197,6 +215,16 @@ class Parser:
|
||||||
if lo > 0:
|
if lo > 0:
|
||||||
self.str = self.str[lo:]
|
self.str = self.str[lo:]
|
||||||
|
|
||||||
|
# Build a translation table to map uninteresting chars to 'x', open
|
||||||
|
# brackets to '(', close brackets to ')' while preserving quotes,
|
||||||
|
# backslashes, newlines and hashes. This is to be passed to
|
||||||
|
# str.translate() in _study1().
|
||||||
|
_tran = {}
|
||||||
|
_tran.update((ord(c), ord('(')) for c in "({[")
|
||||||
|
_tran.update((ord(c), ord(')')) for c in ")}]")
|
||||||
|
_tran.update((ord(c), ord(c)) for c in "\"'\\\n#")
|
||||||
|
_tran = StringTranslatePseudoMapping(_tran, default_value=ord('x'))
|
||||||
|
|
||||||
# As quickly as humanly possible <wink>, find the line numbers (0-
|
# As quickly as humanly possible <wink>, find the line numbers (0-
|
||||||
# based) of the non-continuation lines.
|
# based) of the non-continuation lines.
|
||||||
# Creates self.{goodlines, continuation}.
|
# Creates self.{goodlines, continuation}.
|
||||||
|
@ -211,7 +239,7 @@ class Parser:
|
||||||
# uninteresting characters. This can cut the number of chars
|
# uninteresting characters. This can cut the number of chars
|
||||||
# by a factor of 10-40, and so greatly speed the following loop.
|
# by a factor of 10-40, and so greatly speed the following loop.
|
||||||
str = self.str
|
str = self.str
|
||||||
str = str.translate(_tran)
|
str = str.translate(self._tran)
|
||||||
str = str.replace('xxxxxxxx', 'x')
|
str = str.replace('xxxxxxxx', 'x')
|
||||||
str = str.replace('xxxx', 'x')
|
str = str.replace('xxxx', 'x')
|
||||||
str = str.replace('xx', 'x')
|
str = str.replace('xx', 'x')
|
||||||
|
|
|
@ -30,6 +30,7 @@ class HyperParserTest(unittest.TestCase):
|
||||||
"z = ((r'asdf')+('a')))\n"
|
"z = ((r'asdf')+('a')))\n"
|
||||||
'[x for x in\n'
|
'[x for x in\n'
|
||||||
'for = False\n'
|
'for = False\n'
|
||||||
|
'cliché = "this is a string with unicode, what a cliché"'
|
||||||
)
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -93,6 +94,8 @@ class HyperParserTest(unittest.TestCase):
|
||||||
self.assertTrue(p.is_in_string())
|
self.assertTrue(p.is_in_string())
|
||||||
p = get('4.6')
|
p = get('4.6')
|
||||||
self.assertTrue(p.is_in_string())
|
self.assertTrue(p.is_in_string())
|
||||||
|
p = get('12.54')
|
||||||
|
self.assertTrue(p.is_in_string())
|
||||||
|
|
||||||
def test_is_in_code(self):
|
def test_is_in_code(self):
|
||||||
get = self.get_parser
|
get = self.get_parser
|
||||||
|
@ -180,12 +183,91 @@ class HyperParserTest(unittest.TestCase):
|
||||||
p = get('10.0')
|
p = get('10.0')
|
||||||
self.assertEqual(p.get_expression(), '')
|
self.assertEqual(p.get_expression(), '')
|
||||||
|
|
||||||
|
p = get('10.6')
|
||||||
|
self.assertEqual(p.get_expression(), '')
|
||||||
|
|
||||||
|
p = get('10.11')
|
||||||
|
self.assertEqual(p.get_expression(), '')
|
||||||
|
|
||||||
p = get('11.3')
|
p = get('11.3')
|
||||||
self.assertEqual(p.get_expression(), '')
|
self.assertEqual(p.get_expression(), '')
|
||||||
|
|
||||||
p = get('11.11')
|
p = get('11.11')
|
||||||
self.assertEqual(p.get_expression(), 'False')
|
self.assertEqual(p.get_expression(), 'False')
|
||||||
|
|
||||||
|
p = get('12.6')
|
||||||
|
self.assertEqual(p.get_expression(), 'cliché')
|
||||||
|
|
||||||
|
def test_eat_identifier(self):
|
||||||
|
def is_valid_id(candidate):
|
||||||
|
result = HyperParser._eat_identifier(candidate, 0, len(candidate))
|
||||||
|
if result == len(candidate):
|
||||||
|
return True
|
||||||
|
elif result == 0:
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
err_msg = "Unexpected result: {} (expected 0 or {}".format(
|
||||||
|
result, len(candidate)
|
||||||
|
)
|
||||||
|
raise Exception(err_msg)
|
||||||
|
|
||||||
|
# invalid first character which is valid elsewhere in an identifier
|
||||||
|
self.assertFalse(is_valid_id('2notid'))
|
||||||
|
|
||||||
|
# ASCII-only valid identifiers
|
||||||
|
self.assertTrue(is_valid_id('valid_id'))
|
||||||
|
self.assertTrue(is_valid_id('_valid_id'))
|
||||||
|
self.assertTrue(is_valid_id('valid_id_'))
|
||||||
|
self.assertTrue(is_valid_id('_2valid_id'))
|
||||||
|
|
||||||
|
# keywords which should be "eaten"
|
||||||
|
self.assertTrue(is_valid_id('True'))
|
||||||
|
self.assertTrue(is_valid_id('False'))
|
||||||
|
self.assertTrue(is_valid_id('None'))
|
||||||
|
|
||||||
|
# keywords which should not be "eaten"
|
||||||
|
self.assertFalse(is_valid_id('for'))
|
||||||
|
self.assertFalse(is_valid_id('import'))
|
||||||
|
self.assertFalse(is_valid_id('return'))
|
||||||
|
|
||||||
|
# valid unicode identifiers
|
||||||
|
self.assertTrue(is_valid_id('cliche'))
|
||||||
|
self.assertTrue(is_valid_id('cliché'))
|
||||||
|
self.assertTrue(is_valid_id('a٢'))
|
||||||
|
|
||||||
|
# invalid unicode identifiers
|
||||||
|
self.assertFalse(is_valid_id('2a'))
|
||||||
|
self.assertFalse(is_valid_id('٢a'))
|
||||||
|
self.assertFalse(is_valid_id('a²'))
|
||||||
|
|
||||||
|
# valid identifier after "punctuation"
|
||||||
|
self.assertEqual(HyperParser._eat_identifier('+ var', 0, 5), len('var'))
|
||||||
|
self.assertEqual(HyperParser._eat_identifier('+var', 0, 4), len('var'))
|
||||||
|
self.assertEqual(HyperParser._eat_identifier('.var', 0, 4), len('var'))
|
||||||
|
|
||||||
|
# invalid identifiers
|
||||||
|
self.assertFalse(is_valid_id('+'))
|
||||||
|
self.assertFalse(is_valid_id(' '))
|
||||||
|
self.assertFalse(is_valid_id(':'))
|
||||||
|
self.assertFalse(is_valid_id('?'))
|
||||||
|
self.assertFalse(is_valid_id('^'))
|
||||||
|
self.assertFalse(is_valid_id('\\'))
|
||||||
|
self.assertFalse(is_valid_id('"'))
|
||||||
|
self.assertFalse(is_valid_id('"a string"'))
|
||||||
|
|
||||||
|
def test_eat_identifier_various_lengths(self):
|
||||||
|
eat_id = HyperParser._eat_identifier
|
||||||
|
|
||||||
|
for length in range(1, 21):
|
||||||
|
self.assertEqual(eat_id('a' * length, 0, length), length)
|
||||||
|
self.assertEqual(eat_id('é' * length, 0, length), length)
|
||||||
|
self.assertEqual(eat_id('a' + '2' * (length - 1), 0, length), length)
|
||||||
|
self.assertEqual(eat_id('é' + '2' * (length - 1), 0, length), length)
|
||||||
|
self.assertEqual(eat_id('é' + 'a' * (length - 1), 0, length), length)
|
||||||
|
self.assertEqual(eat_id('é' * (length - 1) + 'a', 0, length), length)
|
||||||
|
self.assertEqual(eat_id('+' * length, 0, length), 0)
|
||||||
|
self.assertEqual(eat_id('2' + 'a' * (length - 1), 0, length), 0)
|
||||||
|
self.assertEqual(eat_id('2' + 'é' * (length - 1), 0, length), 0)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
unittest.main(verbosity=2)
|
unittest.main(verbosity=2)
|
||||||
|
|
|
@ -264,6 +264,8 @@ Library
|
||||||
- Issue #21455: Add a default backlog to socket.listen().
|
- Issue #21455: Add a default backlog to socket.listen().
|
||||||
|
|
||||||
- Issue #21525: Most Tkinter methods which accepted tuples now accept lists too.
|
- Issue #21525: Most Tkinter methods which accepted tuples now accept lists too.
|
||||||
|
- Issue #21765: Add support for non-ascii identifiers to HyperParser.
|
||||||
|
|
||||||
|
|
||||||
- Issue #10744: Fix PEP 3118 format strings on ctypes objects with a nontrivial
|
- Issue #10744: Fix PEP 3118 format strings on ctypes objects with a nontrivial
|
||||||
shape.
|
shape.
|
||||||
|
|
Loading…
Reference in New Issue