From 463badf06ce33d9de88dfea645c1253f44588aad Mon Sep 17 00:00:00 2001
From: Antoine Pitrou
Date: Sat, 23 Jun 2012 13:29:19 +0200
Subject: [PATCH] Issue #3665: \u and \U escapes are now supported in unicode
 regular expressions. Patch by Serhiy Storchaka.

---
 Doc/library/re.rst  | 11 ++++-
 Lib/sre_parse.py    | 66 ++++++++++++++++++++++--------
 Lib/test/test_re.py | 98 ++++++++++++++++++++++++++++++++++++++-------
 Misc/NEWS           |  3 ++
 4 files changed, 144 insertions(+), 34 deletions(-)

diff --git a/Doc/library/re.rst b/Doc/library/re.rst
index d8ead6822d2..07623c9582a 100644
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -414,17 +414,24 @@
 Most of the standard escapes supported by Python string literals are also
 accepted by the regular expression parser::

    \a      \b      \f      \n
-   \r      \t      \v      \x
-   \\
+   \r      \t      \u      \U
+   \v      \x      \\

 (Note that ``\b`` is used to represent word boundaries, and means "backspace"
 only inside character classes.)

+``'\u'`` and ``'\U'`` escape sequences are only recognized in Unicode
+patterns. In bytes patterns they are not treated specially.
+
 Octal escapes are included in a limited form. If the first digit is a 0, or if
 there are three octal digits, it is considered an octal escape. Otherwise, it
 is a group reference. As for string literals, octal escapes are always at most
 three digits in length.

+.. versionchanged:: 3.3
+   The ``'\u'`` and ``'\U'`` escape sequences have been added.
+
+

 .. _contents-of-module-re:
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index ae63c31ebc0..d358646a39a 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -177,6 +177,7 @@ class SubPattern:

 class Tokenizer:
     def __init__(self, string):
+        self.istext = isinstance(string, str)
         self.string = string
         self.index = 0
         self.__next()
@@ -187,14 +188,14 @@ class Tokenizer:
         char = self.string[self.index:self.index+1]
         # Special case for the str8, since indexing returns a integer
         # XXX This is only needed for test_bug_926075 in test_re.py
-        if char and isinstance(char, bytes):
+        if char and not self.istext:
             char = chr(char[0])
         if char == "\\":
             try:
                 c = self.string[self.index + 1]
             except IndexError:
                 raise error("bogus escape (end of line)")
-            if isinstance(self.string, bytes):
+            if not self.istext:
                 c = chr(c)
             char = char + c
         self.index = self.index + len(char)
@@ -209,6 +210,15 @@ class Tokenizer:
         this = self.next
         self.__next()
         return this
+    def getwhile(self, n, charset):
+        result = ''
+        for _ in range(n):
+            c = self.next
+            if c not in charset:
+                break
+            result += c
+            self.__next()
+        return result
     def tell(self):
         return self.index, self.next
     def seek(self, index):
@@ -241,20 +251,30 @@ def _class_escape(source, escape):
         c = escape[1:2]
         if c == "x":
             # hexadecimal escape (exactly two digits)
-            while source.next in HEXDIGITS and len(escape) < 4:
-                escape = escape + source.get()
-            escape = escape[2:]
-            if len(escape) != 2:
-                raise error("bogus escape: %s" % repr("\\" + escape))
-            return LITERAL, int(escape, 16) & 0xff
+            escape += source.getwhile(2, HEXDIGITS)
+            if len(escape) != 4:
+                raise ValueError
+            return LITERAL, int(escape[2:], 16) & 0xff
+        elif c == "u" and source.istext:
+            # unicode escape (exactly four digits)
+            escape += source.getwhile(4, HEXDIGITS)
+            if len(escape) != 6:
+                raise ValueError
+            return LITERAL, int(escape[2:], 16)
+        elif c == "U" and source.istext:
+            # unicode escape (exactly eight digits)
+            escape += source.getwhile(8, HEXDIGITS)
+            if len(escape) != 10:
+                raise ValueError
+            c = int(escape[2:], 16)
+            chr(c) # raise ValueError for invalid code
+            return LITERAL, c
         elif c in OCTDIGITS:
             # octal escape (up to three digits)
-            while source.next in OCTDIGITS and len(escape) < 4:
-                escape = escape + source.get()
-            escape = escape[1:]
-            return LITERAL, int(escape, 8) & 0xff
+            escape += source.getwhile(2, OCTDIGITS)
+            return LITERAL, int(escape[1:], 8) & 0xff
         elif c in DIGITS:
-            raise error("bogus escape: %s" % repr(escape))
+            raise ValueError
         if len(escape) == 2:
             return LITERAL, ord(escape[1])
     except ValueError:
@@ -273,15 +293,27 @@ def _escape(source, escape, state):
         c = escape[1:2]
         if c == "x":
             # hexadecimal escape
-            while source.next in HEXDIGITS and len(escape) < 4:
-                escape = escape + source.get()
+            escape += source.getwhile(2, HEXDIGITS)
             if len(escape) != 4:
                 raise ValueError
             return LITERAL, int(escape[2:], 16) & 0xff
+        elif c == "u" and source.istext:
+            # unicode escape (exactly four digits)
+            escape += source.getwhile(4, HEXDIGITS)
+            if len(escape) != 6:
+                raise ValueError
+            return LITERAL, int(escape[2:], 16)
+        elif c == "U" and source.istext:
+            # unicode escape (exactly eight digits)
+            escape += source.getwhile(8, HEXDIGITS)
+            if len(escape) != 10:
+                raise ValueError
+            c = int(escape[2:], 16)
+            chr(c) # raise ValueError for invalid code
+            return LITERAL, c
         elif c == "0":
             # octal escape
-            while source.next in OCTDIGITS and len(escape) < 4:
-                escape = escape + source.get()
+            escape += source.getwhile(2, OCTDIGITS)
             return LITERAL, int(escape[1:], 8) & 0xff
         elif c in DIGITS:
             # octal escape *or* decimal group reference (sigh)
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 74a7b71f10e..9b0aa75c130 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -526,24 +526,92 @@ class ReTests(unittest.TestCase):
             self.assertNotEqual(re.compile('^pattern$', flag), None)

     def test_sre_character_literals(self):
-        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
-            self.assertNotEqual(re.match(r"\%03o" % i, chr(i)), None)
-            self.assertNotEqual(re.match(r"\%03o0" % i, chr(i)+"0"), None)
-            self.assertNotEqual(re.match(r"\%03o8" % i, chr(i)+"8"), None)
-            self.assertNotEqual(re.match(r"\x%02x" % i, chr(i)), None)
-            self.assertNotEqual(re.match(r"\x%02x0" % i, chr(i)+"0"), None)
-            self.assertNotEqual(re.match(r"\x%02xz" % i, chr(i)+"z"), None)
-        self.assertRaises(re.error, re.match, "\911", "")
+        for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
+            if i < 256:
+                self.assertIsNotNone(re.match(r"\%03o" % i, chr(i)))
+                self.assertIsNotNone(re.match(r"\%03o0" % i, chr(i)+"0"))
+                self.assertIsNotNone(re.match(r"\%03o8" % i, chr(i)+"8"))
+                self.assertIsNotNone(re.match(r"\x%02x" % i, chr(i)))
+                self.assertIsNotNone(re.match(r"\x%02x0" % i, chr(i)+"0"))
+                self.assertIsNotNone(re.match(r"\x%02xz" % i, chr(i)+"z"))
+            if i < 0x10000:
+                self.assertIsNotNone(re.match(r"\u%04x" % i, chr(i)))
+                self.assertIsNotNone(re.match(r"\u%04x0" % i, chr(i)+"0"))
+                self.assertIsNotNone(re.match(r"\u%04xz" % i, chr(i)+"z"))
+            self.assertIsNotNone(re.match(r"\U%08x" % i, chr(i)))
+            self.assertIsNotNone(re.match(r"\U%08x0" % i, chr(i)+"0"))
+            self.assertIsNotNone(re.match(r"\U%08xz" % i, chr(i)+"z"))
+        self.assertIsNotNone(re.match(r"\0", "\000"))
+        self.assertIsNotNone(re.match(r"\08", "\0008"))
+        self.assertIsNotNone(re.match(r"\01", "\001"))
+        self.assertIsNotNone(re.match(r"\018", "\0018"))
+        self.assertIsNotNone(re.match(r"\567", chr(0o167)))
+        self.assertRaises(re.error, re.match, r"\911", "")
+        self.assertRaises(re.error, re.match, r"\x1", "")
+        self.assertRaises(re.error, re.match, r"\x1z", "")
+        self.assertRaises(re.error, re.match, r"\u123", "")
+        self.assertRaises(re.error, re.match, r"\u123z", "")
+        self.assertRaises(re.error, re.match, r"\U0001234", "")
+        self.assertRaises(re.error, re.match, r"\U0001234z", "")
+        self.assertRaises(re.error, re.match, r"\U00110000", "")

     def test_sre_character_class_literals(self):
+        for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
+            if i < 256:
+                self.assertIsNotNone(re.match(r"[\%o]" % i, chr(i)))
+                self.assertIsNotNone(re.match(r"[\%o8]" % i, chr(i)))
+                self.assertIsNotNone(re.match(r"[\%03o]" % i, chr(i)))
+                self.assertIsNotNone(re.match(r"[\%03o0]" % i, chr(i)))
+                self.assertIsNotNone(re.match(r"[\%03o8]" % i, chr(i)))
+                self.assertIsNotNone(re.match(r"[\x%02x]" % i, chr(i)))
+                self.assertIsNotNone(re.match(r"[\x%02x0]" % i, chr(i)))
+                self.assertIsNotNone(re.match(r"[\x%02xz]" % i, chr(i)))
+            if i < 0x10000:
+                self.assertIsNotNone(re.match(r"[\u%04x]" % i, chr(i)))
+                self.assertIsNotNone(re.match(r"[\u%04x0]" % i, chr(i)))
+                self.assertIsNotNone(re.match(r"[\u%04xz]" % i, chr(i)))
+            self.assertIsNotNone(re.match(r"[\U%08x]" % i, chr(i)))
+            self.assertIsNotNone(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
+            self.assertIsNotNone(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
+        self.assertRaises(re.error, re.match, r"[\911]", "")
+        self.assertRaises(re.error, re.match, r"[\x1z]", "")
+        self.assertRaises(re.error, re.match, r"[\u123z]", "")
+        self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
+        self.assertRaises(re.error, re.match, r"[\U00110000]", "")
+
+    def test_sre_byte_literals(self):
         for i in [0, 8, 16, 32, 64, 127, 128, 255]:
-            self.assertNotEqual(re.match(r"[\%03o]" % i, chr(i)), None)
-            self.assertNotEqual(re.match(r"[\%03o0]" % i, chr(i)), None)
-            self.assertNotEqual(re.match(r"[\%03o8]" % i, chr(i)), None)
-            self.assertNotEqual(re.match(r"[\x%02x]" % i, chr(i)), None)
-            self.assertNotEqual(re.match(r"[\x%02x0]" % i, chr(i)), None)
-            self.assertNotEqual(re.match(r"[\x%02xz]" % i, chr(i)), None)
-        self.assertRaises(re.error, re.match, "[\911]", "")
+            self.assertIsNotNone(re.match((r"\%03o" % i).encode(), bytes([i])))
+            self.assertIsNotNone(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
+            self.assertIsNotNone(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
+            self.assertIsNotNone(re.match((r"\x%02x" % i).encode(), bytes([i])))
+            self.assertIsNotNone(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
+            self.assertIsNotNone(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
+        self.assertIsNotNone(re.match(br"\u", b'u'))
+        self.assertIsNotNone(re.match(br"\U", b'U'))
+        self.assertIsNotNone(re.match(br"\0", b"\000"))
+        self.assertIsNotNone(re.match(br"\08", b"\0008"))
+        self.assertIsNotNone(re.match(br"\01", b"\001"))
+        self.assertIsNotNone(re.match(br"\018", b"\0018"))
+        self.assertIsNotNone(re.match(br"\567", bytes([0o167])))
+        self.assertRaises(re.error, re.match, br"\911", b"")
+        self.assertRaises(re.error, re.match, br"\x1", b"")
+        self.assertRaises(re.error, re.match, br"\x1z", b"")
+
+    def test_sre_byte_class_literals(self):
+        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
+            self.assertIsNotNone(re.match((r"[\%o]" % i).encode(), bytes([i])))
+            self.assertIsNotNone(re.match((r"[\%o8]" % i).encode(), bytes([i])))
+            self.assertIsNotNone(re.match((r"[\%03o]" % i).encode(), bytes([i])))
+            self.assertIsNotNone(re.match((r"[\%03o0]" % i).encode(), bytes([i])))
+            self.assertIsNotNone(re.match((r"[\%03o8]" % i).encode(), bytes([i])))
+            self.assertIsNotNone(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
+            self.assertIsNotNone(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
+            self.assertIsNotNone(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
+        self.assertIsNotNone(re.match(br"[\u]", b'u'))
+        self.assertIsNotNone(re.match(br"[\U]", b'U'))
+        self.assertRaises(re.error, re.match, br"[\911]", "")
+        self.assertRaises(re.error, re.match, br"[\x1z]", "")

     def test_bug_113254(self):
         self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
diff --git a/Misc/NEWS b/Misc/NEWS
index 37a0dee073f..f768602fd28 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -40,6 +40,9 @@ Core and Builtins
 Library
 -------

+- Issue #3665: \u and \U escapes are now supported in unicode regular
+  expressions. Patch by Serhiy Storchaka.
+
 - Issue #15153: Added inspect.getgeneratorlocals to simplify white box
   testing of generator state updates
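
A quick way to exercise the new behaviour (a minimal sketch, not part of the patch, assuming a Python 3.3 interpreter built with this change applied): \u and \U act as code-point escapes in str patterns, stay ordinary literals in bytes patterns, and malformed or out-of-range forms are rejected at compile time, mirroring the tests above.

    import re

    # Unicode (str) patterns: \uXXXX and \UXXXXXXXX denote code points,
    # just as in Python string literals.
    assert re.match(r"\u00e9", "\u00e9")          # U+00E9
    assert re.match(r"\U0001d11e", "\U0001d11e")  # U+1D11E, outside the BMP
    assert re.match(r"[\u00e8\u00e9]", "\u00e8")  # also recognized inside character classes

    # Bytes patterns: the escapes are not treated specially, so br"\u"
    # simply matches a literal "u".
    assert re.match(br"\u", b"u")

    # Malformed or out-of-range escapes raise re.error when the pattern is compiled.
    for bad in (r"\u123", r"\U0001234", r"\U00110000"):
        try:
            re.compile(bad)
        except re.error:
            pass
        else:
            raise AssertionError("expected re.error for %r" % bad)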