Issues #814253, #9179: Warnings are now raised when group references and

conditional group references are used in lookbehind assertions in regular
expressions.
This commit is contained in:
Serhiy Storchaka 2015-02-21 12:08:52 +02:00
parent a1543cdcd6
commit a3369a524c
4 changed files with 61 additions and 3 deletions

View File

@ -281,7 +281,9 @@ The special characters are:
assertion`. ``(?<=abc)def`` will find a match in ``abcdef``, since the assertion`. ``(?<=abc)def`` will find a match in ``abcdef``, since the
lookbehind will back up 3 characters and check if the contained pattern matches. lookbehind will back up 3 characters and check if the contained pattern matches.
The contained pattern must only match strings of some fixed length, meaning that The contained pattern must only match strings of some fixed length, meaning that
``abc`` or ``a|b`` are allowed, but ``a*`` and ``a{3,4}`` are not. Note that ``abc`` or ``a|b`` are allowed, but ``a*`` and ``a{3,4}`` are not. Group
references are not supported even if they match strings of some fixed length.
Note that
patterns which start with positive lookbehind assertions will not match at the patterns which start with positive lookbehind assertions will not match at the
beginning of the string being searched; you will most likely want to use the beginning of the string being searched; you will most likely want to use the
:func:`search` function rather than the :func:`match` function: :func:`search` function rather than the :func:`match` function:
@ -301,7 +303,8 @@ The special characters are:
Matches if the current position in the string is not preceded by a match for Matches if the current position in the string is not preceded by a match for
``...``. This is called a :dfn:`negative lookbehind assertion`. Similar to ``...``. This is called a :dfn:`negative lookbehind assertion`. Similar to
positive lookbehind assertions, the contained pattern must only match strings of positive lookbehind assertions, the contained pattern must only match strings of
some fixed length. Patterns which start with negative lookbehind assertions may some fixed length and shouldn't contain group references.
Patterns which start with negative lookbehind assertions may
match at the beginning of the string being searched. match at the beginning of the string being searched.
``(?(id/name)yes-pattern|no-pattern)`` ``(?(id/name)yes-pattern|no-pattern)``

View File

@ -69,6 +69,8 @@ class Pattern:
self.open = [] self.open = []
self.groups = 1 self.groups = 1
self.groupdict = {} self.groupdict = {}
self.lookbehind = 0
def opengroup(self, name=None): def opengroup(self, name=None):
gid = self.groups gid = self.groups
self.groups = gid + 1 self.groups = gid + 1
@ -352,6 +354,11 @@ def _escape(source, escape, state):
if group < state.groups: if group < state.groups:
if not state.checkgroup(group): if not state.checkgroup(group):
raise error("cannot refer to open group") raise error("cannot refer to open group")
if state.lookbehind:
import warnings
warnings.warn('group references in lookbehind '
'assertions are not supported',
RuntimeWarning)
return GROUPREF, group return GROUPREF, group
raise ValueError raise ValueError
if len(escape) == 2: if len(escape) == 2:
@ -630,6 +637,11 @@ def _parse(source, state):
if gid is None: if gid is None:
msg = "unknown group name: {0!r}".format(name) msg = "unknown group name: {0!r}".format(name)
raise error(msg) raise error(msg)
if state.lookbehind:
import warnings
warnings.warn('group references in lookbehind '
'assertions are not supported',
RuntimeWarning)
subpatternappend((GROUPREF, gid)) subpatternappend((GROUPREF, gid))
continue continue
else: else:
@ -658,7 +670,10 @@ def _parse(source, state):
raise error("syntax error") raise error("syntax error")
dir = -1 # lookbehind dir = -1 # lookbehind
char = sourceget() char = sourceget()
state.lookbehind += 1
p = _parse_sub(source, state) p = _parse_sub(source, state)
if dir < 0:
state.lookbehind -= 1
if not sourcematch(")"): if not sourcematch(")"):
raise error("unbalanced parenthesis") raise error("unbalanced parenthesis")
if char == "=": if char == "=":
@ -689,6 +704,11 @@ def _parse(source, state):
condgroup = int(condname) condgroup = int(condname)
except ValueError: except ValueError:
raise error("bad character in group name") raise error("bad character in group name")
if state.lookbehind:
import warnings
warnings.warn('group references in lookbehind '
'assertions are not supported',
RuntimeWarning)
else: else:
# flags # flags
if not source.next in FLAGS: if not source.next in FLAGS:

View File

@ -557,7 +557,7 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0), self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
"a\n\nb") "a\n\nb")
def test_non_consuming(self): def test_lookahead(self):
self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a") self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a") self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a") self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
@ -571,6 +571,37 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a") self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a") self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
# Group reference.
self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
# Named group reference.
self.assertTrue(re.match(r'(?P<g>a)b(?=(?P=g))a', 'aba'))
self.assertIsNone(re.match(r'(?P<g>a)b(?=(?P=g))c', 'abac'))
# Conditional group reference.
self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
# Group used before defined.
self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))
def test_lookbehind(self):
# Checks positive (?<=...) and negative (?<!...) lookbehind assertions, and
# that compiling a lookbehind containing any kind of group reference emits
# a RuntimeWarning.
#
# Positive lookbehind: after 'ab' the preceding character is 'b', so
# (?<=b) lets the match proceed and (?<=c) rejects it.
self.assertTrue(re.match(r'ab(?<=b)c', 'abc'))
self.assertIsNone(re.match(r'ab(?<=c)c', 'abc'))
# Negative lookbehind: the same position, with the outcomes inverted.
self.assertIsNone(re.match(r'ab(?<!b)c', 'abc'))
self.assertTrue(re.match(r'ab(?<!c)c', 'abc'))
# The remaining cases only call re.compile and check for the warning;
# they do not assert anything about what the patterns match.
# Numbered group reference (\1) inside a lookbehind.
self.assertWarns(RuntimeWarning, re.compile, r'(a)a(?<=\1)c')
# Named group reference ((?P=g)) inside a lookbehind.
self.assertWarns(RuntimeWarning, re.compile, r'(?P<g>a)a(?<=(?P=g))c')
# Conditional group reference ((?(1)...|...)) inside a lookbehind.
self.assertWarns(RuntimeWarning, re.compile, r'(a)b(?<=(?(1)b|x))c')
# Conditional reference to group 2, which is only opened after the
# lookbehind in the pattern.
self.assertWarns(RuntimeWarning, re.compile, r'(a)b(?<=(?(2)b|x))(c)')
def test_ignore_case(self): def test_ignore_case(self):
self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC") self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")

View File

@ -13,6 +13,10 @@ Core and Builtins
Library Library
------- -------
- Issues #814253, #9179: Warnings are now raised when group references and
conditional group references are used in lookbehind assertions in regular
expressions.
- Issue #23215: Multibyte codecs with custom error handlers that ignore errors - Issue #23215: Multibyte codecs with custom error handlers that ignore errors
consumed too much memory and raised SystemError or MemoryError. consumed too much memory and raised SystemError or MemoryError.
Original patch by Aleksi Torhamo. Original patch by Aleksi Torhamo.