bpo-29995: re.escape() now escapes only special characters. (#1007)

This commit is contained in:
Serhiy Storchaka 2017-04-13 21:06:43 +03:00 committed by GitHub
parent a6e395dffa
commit 5908300e4b
6 changed files with 40 additions and 51 deletions

View File

@ -786,7 +786,7 @@ form.
.. function:: escape(pattern) .. function:: escape(pattern)
Escape all the characters in *pattern* except ASCII letters, numbers and ``'_'``. Escape special characters in *pattern*.
This is useful if you want to match an arbitrary literal string that may This is useful if you want to match an arbitrary literal string that may
have regular expression metacharacters in it. For example:: have regular expression metacharacters in it. For example::
@ -795,15 +795,19 @@ form.
>>> legal_chars = string.ascii_lowercase + string.digits + "!#$%&'*+-.^_`|~:" >>> legal_chars = string.ascii_lowercase + string.digits + "!#$%&'*+-.^_`|~:"
>>> print('[%s]+' % re.escape(legal_chars)) >>> print('[%s]+' % re.escape(legal_chars))
[abcdefghijklmnopqrstuvwxyz0123456789\!\#\$\%\&\'\*\+\-\.\^_\`\|\~\:]+ [abcdefghijklmnopqrstuvwxyz0123456789!\#\$%&'\*\+\-\.\^_`\|~:]+
>>> operators = ['+', '-', '*', '/', '**'] >>> operators = ['+', '-', '*', '/', '**']
>>> print('|'.join(map(re.escape, sorted(operators, reverse=True)))) >>> print('|'.join(map(re.escape, sorted(operators, reverse=True))))
\/|\-|\+|\*\*|\* /|\-|\+|\*\*|\*
.. versionchanged:: 3.3 .. versionchanged:: 3.3
The ``'_'`` character is no longer escaped. The ``'_'`` character is no longer escaped.
.. versionchanged:: 3.7
Only characters that can have special meaning in a regular expression
are escaped.
.. function:: purge() .. function:: purge()

View File

@ -303,7 +303,7 @@ whatsnew/3.2,,:gz,">>> with tarfile.open(name='myarchive.tar.gz', mode='w:gz') a
whatsnew/3.2,,:location,zope9-location = ${zope9:location} whatsnew/3.2,,:location,zope9-location = ${zope9:location}
whatsnew/3.2,,:prefix,zope-conf = ${custom:prefix}/etc/zope.conf whatsnew/3.2,,:prefix,zope-conf = ${custom:prefix}/etc/zope.conf
library/re,,`,!#$%&'*+-.^_`|~: library/re,,`,!#$%&'*+-.^_`|~:
library/re,,`,\!\#\$\%\&\'\*\+\-\.\^_\`\|\~\: library/re,,`,!\#\$%&'\*\+\-\.\^_`\|~:
library/tarfile,,:xz,'x:xz' library/tarfile,,:xz,'x:xz'
library/xml.etree.elementtree,,:sometag,prefix:sometag library/xml.etree.elementtree,,:sometag,prefix:sometag
library/xml.etree.elementtree,,:fictional,"<actors xmlns:fictional=""http://characters.example.com""" library/xml.etree.elementtree,,:fictional,"<actors xmlns:fictional=""http://characters.example.com"""

1 c-api/arg :ref PyArg_ParseTuple(args, "O|O:ref", &object, &callback)
303 whatsnew/3.2 :location zope9-location = ${zope9:location}
304 whatsnew/3.2 :prefix zope-conf = ${custom:prefix}/etc/zope.conf
305 library/re ` !#$%&'*+-.^_`|~:
306 library/re ` \!\#\$\%\&\'\*\+\-\.\^_\`\|\~\: !\#\$%&'\*\+\-\.\^_`\|~:
307 library/tarfile :xz 'x:xz'
308 library/xml.etree.elementtree :sometag prefix:sometag
309 library/xml.etree.elementtree :fictional <actors xmlns:fictional="http://characters.example.com"

View File

@ -221,8 +221,8 @@ class ReplaceDialogTest(unittest.TestCase):
self.assertIn('Invalid Replace Expression', showerror.message) self.assertIn('Invalid Replace Expression', showerror.message)
# test access method # test access method
self.engine.setcookedpat("\'") self.engine.setcookedpat("?")
equal(pv.get(), "\\'") equal(pv.get(), "\\?")
def test_replace_backwards(self): def test_replace_backwards(self):
equal = self.assertEqual equal = self.assertEqual

View File

@ -241,39 +241,21 @@ def template(pattern, flags=0):
"Compile a template pattern, returning a pattern object" "Compile a template pattern, returning a pattern object"
return _compile(pattern, flags|T) return _compile(pattern, flags|T)
_alphanum_str = frozenset( # SPECIAL_CHARS
"_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890") # closing ')', '}' and ']'
_alphanum_bytes = frozenset( # '-' (a range in character set)
b"_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890") # '#' (comment) and WHITESPACE (ignored) in verbose mode
_special_chars_map = {i: '\\' + chr(i) for i in b'()[]{}?*+-|^$\\.# \t\n\r\v\f'}
def escape(pattern): def escape(pattern):
""" """
Escape all the characters in pattern except ASCII letters, numbers and '_'. Escape special characters in a string.
""" """
if isinstance(pattern, str): if isinstance(pattern, str):
alphanum = _alphanum_str return pattern.translate(_special_chars_map)
s = list(pattern)
for i, c in enumerate(pattern):
if c not in alphanum:
if c == "\000":
s[i] = "\\000"
else: else:
s[i] = "\\" + c pattern = str(pattern, 'latin1')
return "".join(s) return pattern.translate(_special_chars_map).encode('latin1')
else:
alphanum = _alphanum_bytes
s = []
esc = ord(b"\\")
for c in pattern:
if c in alphanum:
s.append(c)
else:
if c == 0:
s.extend(b"\\000")
else:
s.append(esc)
s.append(c)
return bytes(s)
# -------------------------------------------------------------------- # --------------------------------------------------------------------
# internals # internals

View File

@ -904,7 +904,7 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.search(r"a\s", "a ").group(0), "a ") self.assertEqual(re.search(r"a\s", "a ").group(0), "a ")
def assertMatch(self, pattern, text, match=None, span=None, def assertMatch(self, pattern, text, match=None, span=None,
matcher=re.match): matcher=re.fullmatch):
if match is None and span is None: if match is None and span is None:
# the pattern matches the whole text # the pattern matches the whole text
match = text match = text
@ -917,37 +917,38 @@ class ReTests(unittest.TestCase):
self.assertEqual(m.group(), match) self.assertEqual(m.group(), match)
self.assertEqual(m.span(), span) self.assertEqual(m.span(), span)
LITERAL_CHARS = string.ascii_letters + string.digits + '!"%&\',/:;<=>@_`~'
def test_re_escape(self): def test_re_escape(self):
alnum_chars = string.ascii_letters + string.digits + '_'
p = ''.join(chr(i) for i in range(256)) p = ''.join(chr(i) for i in range(256))
for c in p: for c in p:
if c in alnum_chars:
self.assertEqual(re.escape(c), c)
elif c == '\x00':
self.assertEqual(re.escape(c), '\\000')
else:
self.assertEqual(re.escape(c), '\\' + c)
self.assertMatch(re.escape(c), c) self.assertMatch(re.escape(c), c)
self.assertMatch('[' + re.escape(c) + ']', c)
self.assertMatch('(?x)' + re.escape(c), c)
self.assertMatch(re.escape(p), p) self.assertMatch(re.escape(p), p)
for c in '-.]{}':
self.assertEqual(re.escape(c)[:1], '\\')
literal_chars = self.LITERAL_CHARS
self.assertEqual(re.escape(literal_chars), literal_chars)
def test_re_escape_byte(self): def test_re_escape_bytes(self):
alnum_chars = (string.ascii_letters + string.digits + '_').encode('ascii')
p = bytes(range(256)) p = bytes(range(256))
for i in p: for i in p:
b = bytes([i]) b = bytes([i])
if b in alnum_chars:
self.assertEqual(re.escape(b), b)
elif i == 0:
self.assertEqual(re.escape(b), b'\\000')
else:
self.assertEqual(re.escape(b), b'\\' + b)
self.assertMatch(re.escape(b), b) self.assertMatch(re.escape(b), b)
self.assertMatch(b'[' + re.escape(b) + b']', b)
self.assertMatch(b'(?x)' + re.escape(b), b)
self.assertMatch(re.escape(p), p) self.assertMatch(re.escape(p), p)
for i in b'-.]{}':
b = bytes([i])
self.assertEqual(re.escape(b)[:1], b'\\')
literal_chars = self.LITERAL_CHARS.encode('ascii')
self.assertEqual(re.escape(literal_chars), literal_chars)
def test_re_escape_non_ascii(self): def test_re_escape_non_ascii(self):
s = 'xxx\u2620\u2620\u2620xxx' s = 'xxx\u2620\u2620\u2620xxx'
s_escaped = re.escape(s) s_escaped = re.escape(s)
self.assertEqual(s_escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx') self.assertEqual(s_escaped, s)
self.assertMatch(s_escaped, s) self.assertMatch(s_escaped, s)
self.assertMatch('.%s+.' % re.escape('\u2620'), s, self.assertMatch('.%s+.' % re.escape('\u2620'), s,
'x\u2620\u2620\u2620x', (2, 7), re.search) 'x\u2620\u2620\u2620x', (2, 7), re.search)
@ -955,7 +956,7 @@ class ReTests(unittest.TestCase):
def test_re_escape_non_ascii_bytes(self): def test_re_escape_non_ascii_bytes(self):
b = 'y\u2620y\u2620y'.encode('utf-8') b = 'y\u2620y\u2620y'.encode('utf-8')
b_escaped = re.escape(b) b_escaped = re.escape(b)
self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y') self.assertEqual(b_escaped, b)
self.assertMatch(b_escaped, b) self.assertMatch(b_escaped, b)
res = re.findall(re.escape('\u2620'.encode('utf-8')), b) res = re.findall(re.escape('\u2620'.encode('utf-8')), b)
self.assertEqual(len(res), 2) self.assertEqual(len(res), 2)

View File

@ -320,6 +320,8 @@ Library
- bpo-29998: Pickling and copying ImportError now preserves name and path - bpo-29998: Pickling and copying ImportError now preserves name and path
attributes. attributes.
- bpo-29995: re.escape() now escapes only regex special characters.
- bpo-29962: Add math.remainder operation, implementing remainder - bpo-29962: Add math.remainder operation, implementing remainder
as specified in IEEE 754. as specified in IEEE 754.