Issue #21765: Add support for non-ascii identifiers to HyperParser

This commit is contained in:
Tal Einat 2014-07-16 16:41:14 +03:00
commit 2e4394ee0d
4 changed files with 212 additions and 43 deletions

View File

@ -6,11 +6,24 @@ the structure of code.
"""
import string
import keyword
from keyword import iskeyword
from idlelib import PyParse
class HyperParser:
# Every 7-bit ASCII character permitted anywhere inside an identifier.
_ASCII_ID_CHARS = frozenset("_" + string.digits + string.ascii_letters)
# The subset of those characters permitted in the leading position.
_ASCII_ID_FIRST_CHARS = frozenset("_" + string.ascii_letters)
# Boolean tables indexed by code point (0-127).  Entry n tells whether
# chr(n) belongs to the corresponding set above, so a check on an ASCII
# character becomes a plain list lookup.
_IS_ASCII_ID_CHAR = [
    chr(code) in _ASCII_ID_CHARS for code in range(128)
]
_IS_ASCII_ID_FIRST_CHAR = [
    chr(code) in _ASCII_ID_FIRST_CHARS for code in range(128)
]
class HyperParser:
def __init__(self, editwin, index):
"To initialize, analyze the surroundings of the given index."
@ -143,26 +156,70 @@ class HyperParser:
return beforeindex, afterindex
# Ascii chars that may be in a white space
_whitespace_chars = " \t\n\\"
# Ascii chars that may be in an identifier
_id_chars = string.ascii_letters + string.digits + "_"
# Ascii chars that may be the first char of an identifier
_id_first_chars = string.ascii_letters + "_"
# the set of built-in identifiers which are also keywords,
# i.e. keyword.iskeyword() returns True for them
_ID_KEYWORDS = frozenset({"True", "False", "None"})
# Given a string and pos, return the number of chars in the
# identifier which ends at pos, or 0 if there is no such one. Saved
# words are not identifiers.
def _eat_identifier(self, str, limit, pos):
@classmethod
def _eat_identifier(cls, str, limit, pos):
"""Given a string and pos, return the number of chars in the
identifier which ends at pos, or 0 if there is no such one.
The search never goes back past index limit.
Keywords are not considered identifiers here, except for True,
False and None.
"""
is_ascii_id_char = _IS_ASCII_ID_CHAR
# Start at the end (pos) and work backwards.
i = pos
while i > limit and str[i-1] in self._id_chars:
# Go backwards as long as the characters are valid ASCII
# identifier characters. This is an optimization, since it
# is faster in the common case where most of the characters
# are ASCII.
while i > limit and (
ord(str[i - 1]) < 128 and
is_ascii_id_char[ord(str[i - 1])]
):
i -= 1
if (i < pos and (str[i] not in self._id_first_chars or
(keyword.iskeyword(str[i:pos]) and
str[i:pos] not in {'None', 'False', 'True'}))):
i = pos
# If the above loop ended due to reaching a non-ASCII
# character, continue going backwards using the most generic
# test for whether a string contains only valid identifier
# characters.
if i > limit and ord(str[i - 1]) >= 128:
# An 'a' is prepended to each candidate so that isidentifier()
# is applied with a known-valid first character; the candidate is
# extended backwards in steps of 4, then 2, then 1.
while i - 4 >= limit and ('a' + str[i - 4:pos]).isidentifier():
i -= 4
if i - 2 >= limit and ('a' + str[i - 2:pos]).isidentifier():
i -= 2
if i - 1 >= limit and ('a' + str[i - 1:pos]).isidentifier():
i -= 1
# The identifier candidate starts here. If it isn't a valid
# identifier, don't eat anything. At this point that is only
# possible if the first character isn't a valid first
# character for an identifier.
if not str[i:pos].isidentifier():
return 0
elif i < pos:
# All characters in str[i:pos] are valid ASCII identifier
# characters, so it is enough to check that the first is
# valid as the first character of an identifier.
if not _IS_ASCII_ID_FIRST_CHAR[ord(str[i])]:
return 0
# All keywords are valid identifiers, but should not be
# considered identifiers here, except for True, False and None.
if i < pos and (
iskeyword(str[i:pos]) and
str[i:pos] not in cls._ID_KEYWORDS
):
return 0
return pos - i
# This string includes all chars that may be in a white space
_whitespace_chars = " \t\n\\"
def get_expression(self):
"""Return a string with the Python expression which ends at the
given index, which is empty if there is no real one.

View File

@ -1,5 +1,7 @@
import re
import sys
from collections import Mapping
from functools import partial
# Reason last stmt is continued (or C_NONE if it's not).
(C_NONE, C_BACKSLASH, C_STRING_FIRST_LINE,
@ -91,19 +93,48 @@ _chew_ordinaryre = re.compile(r"""
[^[\](){}#'"\\]+
""", re.VERBOSE).match
# Build translation table to map uninteresting chars to "x", open
# brackets to "(", and close brackets to ")".
# Double quote, single quote, backslash, newline and "#" map to
# themselves.  Only code points 0-255 receive an entry.
_tran = {}
for i in range(256):
_tran[i] = 'x'
for ch in "({[":
_tran[ord(ch)] = '('
for ch in ")}]":
_tran[ord(ch)] = ')'
for ch in "\"'\\\n#":
_tran[ord(ch)] = ch
del i, ch
class StringTranslatePseudoMapping(Mapping):
    r"""Utility class to be used with str.translate()

    This Mapping class wraps a given dict. When a value for a key is
    requested via __getitem__() or get(), the key is looked up in the
    given dict. If found there, the value from the dict is returned.
    Otherwise, the default value given upon initialization is returned.

    This allows using str.translate() to make some replacements, and to
    replace all characters for which no replacement was specified with
    a given character instead of leaving them as-is.

    Note that len() and iteration only cover the keys of the wrapped
    dict, even though a lookup succeeds for every key.

    For example, to replace everything except whitespace with 'x':

    >>> whitespace_chars = ' \t\n\r'
    >>> preserve_dict = {ord(c): ord(c) for c in whitespace_chars}
    >>> mapping = StringTranslatePseudoMapping(preserve_dict, ord('x'))
    >>> text = "a + b\tc\nd"
    >>> text.translate(mapping)
    'x x x\tx\nx'
    """

    def __init__(self, non_defaults, default_value):
        """Wrap the dict non_defaults; default_value answers all other keys."""
        self._non_defaults = non_defaults
        self._default_value = default_value

        # The dict's bound get() and the default value are captured as
        # argument defaults, so a lookup needs no attribute access.
        def _get(key, _get=non_defaults.get, _default=default_value):
            return _get(key, _default)
        self._get = _get

    def __getitem__(self, item):
        """Return the wrapped dict's value for item, else the default value."""
        return self._get(item)

    def __len__(self):
        """Return the number of explicitly mapped keys."""
        return len(self._non_defaults)

    def __iter__(self):
        """Iterate over the explicitly mapped keys."""
        return iter(self._non_defaults)

    def get(self, key, default=None):
        """Same as __getitem__().

        The default argument is accepted for signature compatibility
        with Mapping.get() but is ignored: a missing key always yields
        the default value given at initialization.
        """
        return self._get(key)
class Parser:
@ -113,19 +144,6 @@ class Parser:
def set_str(self, s):
assert len(s) == 0 or s[-1] == '\n'
if isinstance(s, str):
# The parse functions have no idea what to do with Unicode, so
# replace all Unicode characters with "x". This is "safe"
# so long as the only characters germane to parsing the structure
# of Python are 7-bit ASCII. It's *necessary* because Unicode
# strings don't have a .translate() method that supports
# deletechars.
uniphooey = s
s = []
push = s.append
for raw in map(ord, uniphooey):
push(raw < 127 and chr(raw) or "x")
s = "".join(s)
self.str = s
self.study_level = 0
@ -197,6 +215,16 @@ class Parser:
if lo > 0:
self.str = self.str[lo:]
# Translation table handed to str.translate() in _study1().  Every
# opening bracket collapses to '(' and every closing bracket to ')',
# while quotes, backslash, newline and hash map to themselves.  Any
# other character is absent from the dict and therefore comes out as
# the mapping's default value, 'x'.
_tran = {ord(opener): ord('(') for opener in "({["}
_tran.update({ord(closer): ord(')') for closer in ")}]"})
_tran.update({ord(kept): ord(kept) for kept in "\"'\\\n#"})
_tran = StringTranslatePseudoMapping(_tran, default_value=ord('x'))
# As quickly as humanly possible <wink>, find the line numbers (0-
# based) of the non-continuation lines.
# Creates self.{goodlines, continuation}.
@ -211,7 +239,7 @@ class Parser:
# uninteresting characters. This can cut the number of chars
# by a factor of 10-40, and so greatly speed the following loop.
str = self.str
str = str.translate(_tran)
str = str.translate(self._tran)
str = str.replace('xxxxxxxx', 'x')
str = str.replace('xxxx', 'x')
str = str.replace('xx', 'x')

View File

@ -30,6 +30,7 @@ class HyperParserTest(unittest.TestCase):
"z = ((r'asdf')+('a')))\n"
'[x for x in\n'
'for = False\n'
'cliché = "this is a string with unicode, what a cliché"'
)
@classmethod
@ -93,6 +94,8 @@ class HyperParserTest(unittest.TestCase):
self.assertTrue(p.is_in_string())
p = get('4.6')
self.assertTrue(p.is_in_string())
p = get('12.54')
self.assertTrue(p.is_in_string())
def test_is_in_code(self):
get = self.get_parser
@ -180,12 +183,91 @@ class HyperParserTest(unittest.TestCase):
p = get('10.0')
self.assertEqual(p.get_expression(), '')
p = get('10.6')
self.assertEqual(p.get_expression(), '')
p = get('10.11')
self.assertEqual(p.get_expression(), '')
p = get('11.3')
self.assertEqual(p.get_expression(), '')
p = get('11.11')
self.assertEqual(p.get_expression(), 'False')
p = get('12.6')
self.assertEqual(p.get_expression(), 'cliché')
def test_eat_identifier(self):
    """Check which complete strings _eat_identifier() accepts."""
    def is_valid_id(candidate):
        # True when the whole candidate is eaten, False when nothing
        # is.  A partial match is not expected for any input used here.
        # Candidates must be non-empty: for '' the result 0 equals
        # len(candidate), so the helper would always report True.
        result = HyperParser._eat_identifier(candidate, 0, len(candidate))
        if result == len(candidate):
            return True
        elif result == 0:
            return False
        else:
            err_msg = "Unexpected result: {} (expected 0 or {})".format(
                result, len(candidate)
            )
            raise Exception(err_msg)

    # invalid first character which is valid elsewhere in an identifier
    self.assertFalse(is_valid_id('2notid'))

    # ASCII-only valid identifiers
    self.assertTrue(is_valid_id('valid_id'))
    self.assertTrue(is_valid_id('_valid_id'))
    self.assertTrue(is_valid_id('valid_id_'))
    self.assertTrue(is_valid_id('_2valid_id'))

    # keywords which should be "eaten"
    self.assertTrue(is_valid_id('True'))
    self.assertTrue(is_valid_id('False'))
    self.assertTrue(is_valid_id('None'))

    # keywords which should not be "eaten"
    self.assertFalse(is_valid_id('for'))
    self.assertFalse(is_valid_id('import'))
    self.assertFalse(is_valid_id('return'))

    # valid unicode identifiers
    self.assertTrue(is_valid_id('cliche'))
    self.assertTrue(is_valid_id('cliché'))
    # 'a' followed by ARABIC-INDIC DIGIT TWO: a non-ASCII digit is
    # valid after the first character.  Written as an escape so the
    # character cannot be lost in transit.
    self.assertTrue(is_valid_id('a\u0662'))

    # invalid unicode identifiers
    self.assertFalse(is_valid_id('2a'))
    self.assertFalse(is_valid_id('٢a'))
    # 'a' followed by SUPERSCRIPT TWO, which is not an identifier
    # character in any position.
    self.assertFalse(is_valid_id('a\u00b2'))

    # valid identifier after "punctuation"
    self.assertEqual(HyperParser._eat_identifier('+ var', 0, 5), len('var'))
    self.assertEqual(HyperParser._eat_identifier('+var', 0, 4), len('var'))
    self.assertEqual(HyperParser._eat_identifier('.var', 0, 4), len('var'))

    # invalid identifiers
    self.assertFalse(is_valid_id('+'))
    self.assertFalse(is_valid_id(' '))
    self.assertFalse(is_valid_id(':'))
    self.assertFalse(is_valid_id('?'))
    self.assertFalse(is_valid_id('^'))
    self.assertFalse(is_valid_id('\\'))
    self.assertFalse(is_valid_id('"'))
    self.assertFalse(is_valid_id('"a string"'))
def test_eat_identifier_various_lengths(self):
    """Exercise _eat_identifier() on candidates of 1 to 20 characters."""
    eat = HyperParser._eat_identifier
    for size in range(1, 21):
        tail = size - 1
        # Candidates that must be eaten in full.
        eaten_whole = (
            'a' * size,
            'é' * size,
            'a' + '2' * tail,
            'é' + '2' * tail,
            'é' + 'a' * tail,
            'é' * tail + 'a',
        )
        for candidate in eaten_whole:
            self.assertEqual(eat(candidate, 0, size), size)
        # Candidates of which nothing may be eaten.
        not_eaten = (
            '+' * size,
            '2' + 'a' * tail,
            '2' + 'é' * tail,
        )
        for candidate in not_eaten:
            self.assertEqual(eat(candidate, 0, size), 0)
if __name__ == '__main__':
unittest.main(verbosity=2)

View File

@ -264,6 +264,8 @@ Library
- Issue #21455: Add a default backlog to socket.listen().
- Issue #21525: Most Tkinter methods which accepted tuples now accept lists too.
- Issue #21765: Add support for non-ascii identifiers to HyperParser.
- Issue #10744: Fix PEP 3118 format strings on ctypes objects with a nontrivial
shape.