mirror of https://github.com/python/cpython
Issue #3665: \u and \U escapes are now supported in Unicode regular expressions.
Patch by Serhiy Storchaka.
This commit is contained in:
parent
c9aa8425c4
commit
463badf06c
|
@ -414,17 +414,24 @@ Most of the standard escapes supported by Python string literals are also
|
||||||
accepted by the regular expression parser::
|
accepted by the regular expression parser::
|
||||||
|
|
||||||
\a \b \f \n
|
\a \b \f \n
|
||||||
\r \t \v \x
|
\r \t \u \U
|
||||||
\\
|
\v \x \\
|
||||||
|
|
||||||
(Note that ``\b`` is used to represent word boundaries, and means "backspace"
|
(Note that ``\b`` is used to represent word boundaries, and means "backspace"
|
||||||
only inside character classes.)
|
only inside character classes.)
|
||||||
|
|
||||||
|
``'\u'`` and ``'\U'`` escape sequences are only recognized in Unicode
|
||||||
|
patterns. In bytes patterns they are not treated specially.
|
||||||
|
|
||||||
Octal escapes are included in a limited form. If the first digit is a 0, or if
|
Octal escapes are included in a limited form. If the first digit is a 0, or if
|
||||||
there are three octal digits, it is considered an octal escape. Otherwise, it is
|
there are three octal digits, it is considered an octal escape. Otherwise, it is
|
||||||
a group reference. As for string literals, octal escapes are always at most
|
a group reference. As for string literals, octal escapes are always at most
|
||||||
three digits in length.
|
three digits in length.
|
||||||
|
|
||||||
|
.. versionchanged:: 3.3
|
||||||
|
The ``'\u'`` and ``'\U'`` escape sequences have been added.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
.. _contents-of-module-re:
|
.. _contents-of-module-re:
|
||||||
|
|
||||||
|
|
|
@ -177,6 +177,7 @@ class SubPattern:
|
||||||
|
|
||||||
class Tokenizer:
|
class Tokenizer:
|
||||||
def __init__(self, string):
|
def __init__(self, string):
|
||||||
|
self.istext = isinstance(string, str)
|
||||||
self.string = string
|
self.string = string
|
||||||
self.index = 0
|
self.index = 0
|
||||||
self.__next()
|
self.__next()
|
||||||
|
@ -187,14 +188,14 @@ class Tokenizer:
|
||||||
char = self.string[self.index:self.index+1]
|
char = self.string[self.index:self.index+1]
|
||||||
# Special case for the str8, since indexing returns an integer
|
# Special case for the str8, since indexing returns an integer
|
||||||
# XXX This is only needed for test_bug_926075 in test_re.py
|
# XXX This is only needed for test_bug_926075 in test_re.py
|
||||||
if char and isinstance(char, bytes):
|
if char and not self.istext:
|
||||||
char = chr(char[0])
|
char = chr(char[0])
|
||||||
if char == "\\":
|
if char == "\\":
|
||||||
try:
|
try:
|
||||||
c = self.string[self.index + 1]
|
c = self.string[self.index + 1]
|
||||||
except IndexError:
|
except IndexError:
|
||||||
raise error("bogus escape (end of line)")
|
raise error("bogus escape (end of line)")
|
||||||
if isinstance(self.string, bytes):
|
if not self.istext:
|
||||||
c = chr(c)
|
c = chr(c)
|
||||||
char = char + c
|
char = char + c
|
||||||
self.index = self.index + len(char)
|
self.index = self.index + len(char)
|
||||||
|
@ -209,6 +210,15 @@ class Tokenizer:
|
||||||
this = self.next
|
this = self.next
|
||||||
self.__next()
|
self.__next()
|
||||||
return this
|
return this
|
||||||
|
def getwhile(self, n, charset):
|
||||||
|
result = ''
|
||||||
|
for _ in range(n):
|
||||||
|
c = self.next
|
||||||
|
if c not in charset:
|
||||||
|
break
|
||||||
|
result += c
|
||||||
|
self.__next()
|
||||||
|
return result
|
||||||
def tell(self):
|
def tell(self):
|
||||||
return self.index, self.next
|
return self.index, self.next
|
||||||
def seek(self, index):
|
def seek(self, index):
|
||||||
|
@ -241,20 +251,30 @@ def _class_escape(source, escape):
|
||||||
c = escape[1:2]
|
c = escape[1:2]
|
||||||
if c == "x":
|
if c == "x":
|
||||||
# hexadecimal escape (exactly two digits)
|
# hexadecimal escape (exactly two digits)
|
||||||
while source.next in HEXDIGITS and len(escape) < 4:
|
escape += source.getwhile(2, HEXDIGITS)
|
||||||
escape = escape + source.get()
|
if len(escape) != 4:
|
||||||
escape = escape[2:]
|
raise ValueError
|
||||||
if len(escape) != 2:
|
return LITERAL, int(escape[2:], 16) & 0xff
|
||||||
raise error("bogus escape: %s" % repr("\\" + escape))
|
elif c == "u" and source.istext:
|
||||||
return LITERAL, int(escape, 16) & 0xff
|
# unicode escape (exactly four digits)
|
||||||
|
escape += source.getwhile(4, HEXDIGITS)
|
||||||
|
if len(escape) != 6:
|
||||||
|
raise ValueError
|
||||||
|
return LITERAL, int(escape[2:], 16)
|
||||||
|
elif c == "U" and source.istext:
|
||||||
|
# unicode escape (exactly eight digits)
|
||||||
|
escape += source.getwhile(8, HEXDIGITS)
|
||||||
|
if len(escape) != 10:
|
||||||
|
raise ValueError
|
||||||
|
c = int(escape[2:], 16)
|
||||||
|
chr(c) # raise ValueError for invalid code
|
||||||
|
return LITERAL, c
|
||||||
elif c in OCTDIGITS:
|
elif c in OCTDIGITS:
|
||||||
# octal escape (up to three digits)
|
# octal escape (up to three digits)
|
||||||
while source.next in OCTDIGITS and len(escape) < 4:
|
escape += source.getwhile(2, OCTDIGITS)
|
||||||
escape = escape + source.get()
|
return LITERAL, int(escape[1:], 8) & 0xff
|
||||||
escape = escape[1:]
|
|
||||||
return LITERAL, int(escape, 8) & 0xff
|
|
||||||
elif c in DIGITS:
|
elif c in DIGITS:
|
||||||
raise error("bogus escape: %s" % repr(escape))
|
raise ValueError
|
||||||
if len(escape) == 2:
|
if len(escape) == 2:
|
||||||
return LITERAL, ord(escape[1])
|
return LITERAL, ord(escape[1])
|
||||||
except ValueError:
|
except ValueError:
|
||||||
|
@ -273,15 +293,27 @@ def _escape(source, escape, state):
|
||||||
c = escape[1:2]
|
c = escape[1:2]
|
||||||
if c == "x":
|
if c == "x":
|
||||||
# hexadecimal escape
|
# hexadecimal escape
|
||||||
while source.next in HEXDIGITS and len(escape) < 4:
|
escape += source.getwhile(2, HEXDIGITS)
|
||||||
escape = escape + source.get()
|
|
||||||
if len(escape) != 4:
|
if len(escape) != 4:
|
||||||
raise ValueError
|
raise ValueError
|
||||||
return LITERAL, int(escape[2:], 16) & 0xff
|
return LITERAL, int(escape[2:], 16) & 0xff
|
||||||
|
elif c == "u" and source.istext:
|
||||||
|
# unicode escape (exactly four digits)
|
||||||
|
escape += source.getwhile(4, HEXDIGITS)
|
||||||
|
if len(escape) != 6:
|
||||||
|
raise ValueError
|
||||||
|
return LITERAL, int(escape[2:], 16)
|
||||||
|
elif c == "U" and source.istext:
|
||||||
|
# unicode escape (exactly eight digits)
|
||||||
|
escape += source.getwhile(8, HEXDIGITS)
|
||||||
|
if len(escape) != 10:
|
||||||
|
raise ValueError
|
||||||
|
c = int(escape[2:], 16)
|
||||||
|
chr(c) # raise ValueError for invalid code
|
||||||
|
return LITERAL, c
|
||||||
elif c == "0":
|
elif c == "0":
|
||||||
# octal escape
|
# octal escape
|
||||||
while source.next in OCTDIGITS and len(escape) < 4:
|
escape += source.getwhile(2, OCTDIGITS)
|
||||||
escape = escape + source.get()
|
|
||||||
return LITERAL, int(escape[1:], 8) & 0xff
|
return LITERAL, int(escape[1:], 8) & 0xff
|
||||||
elif c in DIGITS:
|
elif c in DIGITS:
|
||||||
# octal escape *or* decimal group reference (sigh)
|
# octal escape *or* decimal group reference (sigh)
|
||||||
|
|
|
@ -526,24 +526,92 @@ class ReTests(unittest.TestCase):
|
||||||
self.assertNotEqual(re.compile('^pattern$', flag), None)
|
self.assertNotEqual(re.compile('^pattern$', flag), None)
|
||||||
|
|
||||||
def test_sre_character_literals(self):
|
def test_sre_character_literals(self):
|
||||||
for i in [0, 8, 16, 32, 64, 127, 128, 255]:
|
for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
|
||||||
self.assertNotEqual(re.match(r"\%03o" % i, chr(i)), None)
|
if i < 256:
|
||||||
self.assertNotEqual(re.match(r"\%03o0" % i, chr(i)+"0"), None)
|
self.assertIsNotNone(re.match(r"\%03o" % i, chr(i)))
|
||||||
self.assertNotEqual(re.match(r"\%03o8" % i, chr(i)+"8"), None)
|
self.assertIsNotNone(re.match(r"\%03o0" % i, chr(i)+"0"))
|
||||||
self.assertNotEqual(re.match(r"\x%02x" % i, chr(i)), None)
|
self.assertIsNotNone(re.match(r"\%03o8" % i, chr(i)+"8"))
|
||||||
self.assertNotEqual(re.match(r"\x%02x0" % i, chr(i)+"0"), None)
|
self.assertIsNotNone(re.match(r"\x%02x" % i, chr(i)))
|
||||||
self.assertNotEqual(re.match(r"\x%02xz" % i, chr(i)+"z"), None)
|
self.assertIsNotNone(re.match(r"\x%02x0" % i, chr(i)+"0"))
|
||||||
self.assertRaises(re.error, re.match, "\911", "")
|
self.assertIsNotNone(re.match(r"\x%02xz" % i, chr(i)+"z"))
|
||||||
|
if i < 0x10000:
|
||||||
|
self.assertIsNotNone(re.match(r"\u%04x" % i, chr(i)))
|
||||||
|
self.assertIsNotNone(re.match(r"\u%04x0" % i, chr(i)+"0"))
|
||||||
|
self.assertIsNotNone(re.match(r"\u%04xz" % i, chr(i)+"z"))
|
||||||
|
self.assertIsNotNone(re.match(r"\U%08x" % i, chr(i)))
|
||||||
|
self.assertIsNotNone(re.match(r"\U%08x0" % i, chr(i)+"0"))
|
||||||
|
self.assertIsNotNone(re.match(r"\U%08xz" % i, chr(i)+"z"))
|
||||||
|
self.assertIsNotNone(re.match(r"\0", "\000"))
|
||||||
|
self.assertIsNotNone(re.match(r"\08", "\0008"))
|
||||||
|
self.assertIsNotNone(re.match(r"\01", "\001"))
|
||||||
|
self.assertIsNotNone(re.match(r"\018", "\0018"))
|
||||||
|
self.assertIsNotNone(re.match(r"\567", chr(0o167)))
|
||||||
|
self.assertRaises(re.error, re.match, r"\911", "")
|
||||||
|
self.assertRaises(re.error, re.match, r"\x1", "")
|
||||||
|
self.assertRaises(re.error, re.match, r"\x1z", "")
|
||||||
|
self.assertRaises(re.error, re.match, r"\u123", "")
|
||||||
|
self.assertRaises(re.error, re.match, r"\u123z", "")
|
||||||
|
self.assertRaises(re.error, re.match, r"\U0001234", "")
|
||||||
|
self.assertRaises(re.error, re.match, r"\U0001234z", "")
|
||||||
|
self.assertRaises(re.error, re.match, r"\U00110000", "")
|
||||||
|
|
||||||
def test_sre_character_class_literals(self):
|
def test_sre_character_class_literals(self):
|
||||||
|
for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
|
||||||
|
if i < 256:
|
||||||
|
self.assertIsNotNone(re.match(r"[\%o]" % i, chr(i)))
|
||||||
|
self.assertIsNotNone(re.match(r"[\%o8]" % i, chr(i)))
|
||||||
|
self.assertIsNotNone(re.match(r"[\%03o]" % i, chr(i)))
|
||||||
|
self.assertIsNotNone(re.match(r"[\%03o0]" % i, chr(i)))
|
||||||
|
self.assertIsNotNone(re.match(r"[\%03o8]" % i, chr(i)))
|
||||||
|
self.assertIsNotNone(re.match(r"[\x%02x]" % i, chr(i)))
|
||||||
|
self.assertIsNotNone(re.match(r"[\x%02x0]" % i, chr(i)))
|
||||||
|
self.assertIsNotNone(re.match(r"[\x%02xz]" % i, chr(i)))
|
||||||
|
if i < 0x10000:
|
||||||
|
self.assertIsNotNone(re.match(r"[\u%04x]" % i, chr(i)))
|
||||||
|
self.assertIsNotNone(re.match(r"[\u%04x0]" % i, chr(i)))
|
||||||
|
self.assertIsNotNone(re.match(r"[\u%04xz]" % i, chr(i)))
|
||||||
|
self.assertIsNotNone(re.match(r"[\U%08x]" % i, chr(i)))
|
||||||
|
self.assertIsNotNone(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
|
||||||
|
self.assertIsNotNone(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
|
||||||
|
self.assertRaises(re.error, re.match, r"[\911]", "")
|
||||||
|
self.assertRaises(re.error, re.match, r"[\x1z]", "")
|
||||||
|
self.assertRaises(re.error, re.match, r"[\u123z]", "")
|
||||||
|
self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
|
||||||
|
self.assertRaises(re.error, re.match, r"[\U00110000]", "")
|
||||||
|
|
||||||
|
def test_sre_byte_literals(self):
|
||||||
for i in [0, 8, 16, 32, 64, 127, 128, 255]:
|
for i in [0, 8, 16, 32, 64, 127, 128, 255]:
|
||||||
self.assertNotEqual(re.match(r"[\%03o]" % i, chr(i)), None)
|
self.assertIsNotNone(re.match((r"\%03o" % i).encode(), bytes([i])))
|
||||||
self.assertNotEqual(re.match(r"[\%03o0]" % i, chr(i)), None)
|
self.assertIsNotNone(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
|
||||||
self.assertNotEqual(re.match(r"[\%03o8]" % i, chr(i)), None)
|
self.assertIsNotNone(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
|
||||||
self.assertNotEqual(re.match(r"[\x%02x]" % i, chr(i)), None)
|
self.assertIsNotNone(re.match((r"\x%02x" % i).encode(), bytes([i])))
|
||||||
self.assertNotEqual(re.match(r"[\x%02x0]" % i, chr(i)), None)
|
self.assertIsNotNone(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
|
||||||
self.assertNotEqual(re.match(r"[\x%02xz]" % i, chr(i)), None)
|
self.assertIsNotNone(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
|
||||||
self.assertRaises(re.error, re.match, "[\911]", "")
|
self.assertIsNotNone(re.match(br"\u", b'u'))
|
||||||
|
self.assertIsNotNone(re.match(br"\U", b'U'))
|
||||||
|
self.assertIsNotNone(re.match(br"\0", b"\000"))
|
||||||
|
self.assertIsNotNone(re.match(br"\08", b"\0008"))
|
||||||
|
self.assertIsNotNone(re.match(br"\01", b"\001"))
|
||||||
|
self.assertIsNotNone(re.match(br"\018", b"\0018"))
|
||||||
|
self.assertIsNotNone(re.match(br"\567", bytes([0o167])))
|
||||||
|
self.assertRaises(re.error, re.match, br"\911", b"")
|
||||||
|
self.assertRaises(re.error, re.match, br"\x1", b"")
|
||||||
|
self.assertRaises(re.error, re.match, br"\x1z", b"")
|
||||||
|
|
||||||
|
def test_sre_byte_class_literals(self):
|
||||||
|
for i in [0, 8, 16, 32, 64, 127, 128, 255]:
|
||||||
|
self.assertIsNotNone(re.match((r"[\%o]" % i).encode(), bytes([i])))
|
||||||
|
self.assertIsNotNone(re.match((r"[\%o8]" % i).encode(), bytes([i])))
|
||||||
|
self.assertIsNotNone(re.match((r"[\%03o]" % i).encode(), bytes([i])))
|
||||||
|
self.assertIsNotNone(re.match((r"[\%03o0]" % i).encode(), bytes([i])))
|
||||||
|
self.assertIsNotNone(re.match((r"[\%03o8]" % i).encode(), bytes([i])))
|
||||||
|
self.assertIsNotNone(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
|
||||||
|
self.assertIsNotNone(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
|
||||||
|
self.assertIsNotNone(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
|
||||||
|
self.assertIsNotNone(re.match(br"[\u]", b'u'))
|
||||||
|
self.assertIsNotNone(re.match(br"[\U]", b'U'))
|
||||||
|
self.assertRaises(re.error, re.match, br"[\911]", "")
|
||||||
|
self.assertRaises(re.error, re.match, br"[\x1z]", "")
|
||||||
|
|
||||||
def test_bug_113254(self):
|
def test_bug_113254(self):
|
||||||
self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
|
self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
|
||||||
|
|
|
@ -40,6 +40,9 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #3665: \u and \U escapes are now supported in Unicode regular
|
||||||
|
expressions. Patch by Serhiy Storchaka.
|
||||||
|
|
||||||
- Issue #15153: Added inspect.getgeneratorlocals to simplify white box
|
- Issue #15153: Added inspect.getgeneratorlocals to simplify white box
|
||||||
testing of generator state updates
|
testing of generator state updates
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue