bpo-30688: Support \N{name} escapes in re patterns. (GH-5588)
Co-authored-by: Jonathan Eunice <jonathan.eunice@gmail.com>
This commit is contained in:
parent
2411292ba8
commit
a445feb729
|
@ -468,13 +468,13 @@ Most of the standard escapes supported by Python string literals are also
|
|||
accepted by the regular expression parser::
|
||||
|
||||
\a \b \f \n
|
||||
\r \t \u \U
|
||||
\v \x \\
|
||||
\N \r \t \u
|
||||
\U \v \x \\
|
||||
|
||||
(Note that ``\b`` is used to represent word boundaries, and means "backspace"
|
||||
only inside character classes.)
|
||||
|
||||
``'\u'`` and ``'\U'`` escape sequences are only recognized in Unicode
|
||||
``'\u'``, ``'\U'``, and ``'\N'`` escape sequences are only recognized in Unicode
|
||||
patterns. In bytes patterns they are errors.
|
||||
|
||||
Octal escapes are included in a limited form. If the first digit is a 0, or if
|
||||
|
@ -488,6 +488,9 @@ three digits in length.
|
|||
.. versionchanged:: 3.6
|
||||
Unknown escapes consisting of ``'\'`` and an ASCII letter are now errors.
|
||||
|
||||
.. versionchanged:: 3.8
|
||||
The ``'\N{name}'`` escape sequence has been added. As in string literals,
|
||||
it expands to the named Unicode character (e.g. ``'\N{EM DASH}'``).
|
||||
|
||||
.. seealso::
|
||||
|
||||
|
|
|
@ -75,6 +75,8 @@ New Features
|
|||
Other Language Changes
|
||||
======================
|
||||
|
||||
* Added support for ``\N{name}`` escapes in :mod:`regular expressions <re>`.
|
||||
(Contributed by Jonathan Eunice and Serhiy Storchaka in :issue:`30688`.)
|
||||
|
||||
|
||||
New Modules
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
# XXX: show string offset and offending character for all errors
|
||||
|
||||
from sre_constants import *
|
||||
import unicodedata
|
||||
|
||||
SPECIAL_CHARS = ".\\[{()*+?^$|"
|
||||
REPEAT_CHARS = "*+?{"
|
||||
|
@ -264,19 +265,19 @@ class Tokenizer:
|
|||
result += c
|
||||
self.__next()
|
||||
return result
|
||||
def getuntil(self, terminator):
|
||||
def getuntil(self, terminator, name):
|
||||
result = ''
|
||||
while True:
|
||||
c = self.next
|
||||
self.__next()
|
||||
if c is None:
|
||||
if not result:
|
||||
raise self.error("missing group name")
|
||||
raise self.error("missing " + name)
|
||||
raise self.error("missing %s, unterminated name" % terminator,
|
||||
len(result))
|
||||
if c == terminator:
|
||||
if not result:
|
||||
raise self.error("missing group name", 1)
|
||||
raise self.error("missing " + name, 1)
|
||||
break
|
||||
result += c
|
||||
return result
|
||||
|
@ -322,6 +323,17 @@ def _class_escape(source, escape):
|
|||
c = int(escape[2:], 16)
|
||||
chr(c) # raise ValueError for invalid code
|
||||
return LITERAL, c
|
||||
elif c == "N" and source.istext:
|
||||
# named unicode escape e.g. \N{EM DASH}
|
||||
if not source.match('{'):
|
||||
raise source.error("missing {")
|
||||
charname = source.getuntil('}', 'character name')
|
||||
try:
|
||||
c = ord(unicodedata.lookup(charname))
|
||||
except KeyError:
|
||||
raise source.error("undefined character name %r" % charname,
|
||||
len(charname) + len(r'\N{}'))
|
||||
return LITERAL, c
|
||||
elif c in OCTDIGITS:
|
||||
# octal escape (up to three digits)
|
||||
escape += source.getwhile(2, OCTDIGITS)
|
||||
|
@ -370,6 +382,17 @@ def _escape(source, escape, state):
|
|||
c = int(escape[2:], 16)
|
||||
chr(c) # raise ValueError for invalid code
|
||||
return LITERAL, c
|
||||
elif c == "N" and source.istext:
|
||||
# named unicode escape e.g. \N{EM DASH}
|
||||
if not source.match('{'):
|
||||
raise source.error("missing {")
|
||||
charname = source.getuntil('}', 'character name')
|
||||
try:
|
||||
c = ord(unicodedata.lookup(charname))
|
||||
except KeyError:
|
||||
raise source.error("undefined character name %r" % charname,
|
||||
len(charname) + len(r'\N{}'))
|
||||
return LITERAL, c
|
||||
elif c == "0":
|
||||
# octal escape
|
||||
escape += source.getwhile(2, OCTDIGITS)
|
||||
|
@ -679,13 +702,13 @@ def _parse(source, state, verbose, nested, first=False):
|
|||
# python extensions
|
||||
if sourcematch("<"):
|
||||
# named group: skip forward to end of name
|
||||
name = source.getuntil(">")
|
||||
name = source.getuntil(">", "group name")
|
||||
if not name.isidentifier():
|
||||
msg = "bad character in group name %r" % name
|
||||
raise source.error(msg, len(name) + 1)
|
||||
elif sourcematch("="):
|
||||
# named backreference
|
||||
name = source.getuntil(")")
|
||||
name = source.getuntil(")", "group name")
|
||||
if not name.isidentifier():
|
||||
msg = "bad character in group name %r" % name
|
||||
raise source.error(msg, len(name) + 1)
|
||||
|
@ -748,7 +771,7 @@ def _parse(source, state, verbose, nested, first=False):
|
|||
|
||||
elif char == "(":
|
||||
# conditional backreference group
|
||||
condname = source.getuntil(")")
|
||||
condname = source.getuntil(")", "group name")
|
||||
if condname.isidentifier():
|
||||
condgroup = state.groupdict.get(condname)
|
||||
if condgroup is None:
|
||||
|
@ -977,7 +1000,7 @@ def parse_template(source, pattern):
|
|||
name = ""
|
||||
if not s.match("<"):
|
||||
raise s.error("missing <")
|
||||
name = s.getuntil(">")
|
||||
name = s.getuntil(">", "group name")
|
||||
if name.isidentifier():
|
||||
try:
|
||||
index = groupindex[name]
|
||||
|
|
|
@ -694,6 +694,42 @@ class ReTests(unittest.TestCase):
|
|||
with self.subTest(c):
|
||||
self.assertRaises(re.error, re.compile, '[\\%c]' % c)
|
||||
|
||||
def test_named_unicode_escapes(self):
|
||||
# test individual Unicode named escapes
|
||||
self.assertTrue(re.match(r'\N{LESS-THAN SIGN}', '<'))
|
||||
self.assertTrue(re.match(r'\N{less-than sign}', '<'))
|
||||
self.assertIsNone(re.match(r'\N{LESS-THAN SIGN}', '>'))
|
||||
self.assertTrue(re.match(r'\N{SNAKE}', '\U0001f40d'))
|
||||
self.assertTrue(re.match(r'\N{ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH '
|
||||
r'HAMZA ABOVE WITH ALEF MAKSURA ISOLATED FORM}',
|
||||
'\ufbf9'))
|
||||
self.assertTrue(re.match(r'[\N{LESS-THAN SIGN}-\N{GREATER-THAN SIGN}]',
|
||||
'='))
|
||||
self.assertIsNone(re.match(r'[\N{LESS-THAN SIGN}-\N{GREATER-THAN SIGN}]',
|
||||
';'))
|
||||
|
||||
# test errors in \N{name} handling - only valid names should pass
|
||||
self.checkPatternError(r'\N', 'missing {', 2)
|
||||
self.checkPatternError(r'[\N]', 'missing {', 3)
|
||||
self.checkPatternError(r'\N{', 'missing character name', 3)
|
||||
self.checkPatternError(r'[\N{', 'missing character name', 4)
|
||||
self.checkPatternError(r'\N{}', 'missing character name', 3)
|
||||
self.checkPatternError(r'[\N{}]', 'missing character name', 4)
|
||||
self.checkPatternError(r'\NSNAKE}', 'missing {', 2)
|
||||
self.checkPatternError(r'[\NSNAKE}]', 'missing {', 3)
|
||||
self.checkPatternError(r'\N{SNAKE',
|
||||
'missing }, unterminated name', 3)
|
||||
self.checkPatternError(r'[\N{SNAKE]',
|
||||
'missing }, unterminated name', 4)
|
||||
self.checkPatternError(r'[\N{SNAKE]}',
|
||||
"undefined character name 'SNAKE]'", 1)
|
||||
self.checkPatternError(r'\N{SPAM}',
|
||||
"undefined character name 'SPAM'", 0)
|
||||
self.checkPatternError(r'[\N{SPAM}]',
|
||||
"undefined character name 'SPAM'", 1)
|
||||
self.checkPatternError(br'\N{LESS-THAN SIGN}', r'bad escape \N', 0)
|
||||
self.checkPatternError(br'[\N{LESS-THAN SIGN}]', r'bad escape \N', 1)
|
||||
|
||||
def test_string_boundaries(self):
|
||||
# See http://bugs.python.org/issue10713
|
||||
self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
|
||||
|
|
|
@ -441,6 +441,7 @@ Andy Eskilsson
|
|||
André Espaze
|
||||
Stefan Esser
|
||||
Nicolas Estibals
|
||||
Jonathan Eunice
|
||||
Carey Evans
|
||||
Stephen D Evans
|
||||
Tim Everett
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
Added support for ``\N{name}`` escapes in regular expressions. Based on
|
||||
a patch by Jonathan Eunice.
|
Loading…
Reference in New Issue