diff --git a/Doc/library/re.rst b/Doc/library/re.rst
index ac6455a2207..b51283089c8 100644
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -371,6 +371,8 @@ The special characters are:
 ``(?#...)``
    A comment; the contents of the parentheses are simply ignored.
 
+.. index:: single: (?=; in regular expressions
+
 ``(?=...)``
    Matches if ``...`` matches next, but doesn't consume any of the string. This is
    called a :dfn:`lookahead assertion`. For example, ``Isaac (?=Asimov)`` will match
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index ab1d985d59f..797d85d0629 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -2067,6 +2067,40 @@ ELSE
         self.assertEqual(m.group(), b'xyz')
         self.assertEqual(m2.group(), b'')
 
+    def test_bug_34294(self):
+        # Issue 34294: wrong capturing groups
+
+        # exists since Python 2
+        s = "a\tx"
+        p = r"\b(?=(\t)|(x))x"
+        self.assertEqual(re.search(p, s).groups(), (None, 'x'))
+
+        # introduced in Python 3.7.0
+        s = "ab"
+        p = r"(?=(.)(.)?)"
+        self.assertEqual(re.findall(p, s),
+                         [('a', 'b'), ('b', '')])
+        self.assertEqual([m.groups() for m in re.finditer(p, s)],
+                         [('a', 'b'), ('b', None)])
+
+        # test-cases provided by issue34294, introduced in Python 3.7.0
+        p = r"(?=<(?P<tag>\w+)/?>(?:(?P<text>.+?)</(?P=tag)>)?)"
+        s = "<test><foo2/></test>"
+        self.assertEqual(re.findall(p, s),
+                         [('test', '<foo2/>'), ('foo2', '')])
+        self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
+                         [{'tag': 'test', 'text': '<foo2/>'},
+                          {'tag': 'foo2', 'text': None}])
+        s = "<test>Hello</test><foo/>"
+        self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
+                         [{'tag': 'test', 'text': 'Hello'},
+                          {'tag': 'foo', 'text': None}])
+        s = "<test>Hello</test><foo/><foo/>"
+        self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
+                         [{'tag': 'test', 'text': 'Hello'},
+                          {'tag': 'foo', 'text': None},
+                          {'tag': 'foo', 'text': None}])
+
 
 class PatternReprTests(unittest.TestCase):
     def check(self, pattern, expected):
diff --git a/Misc/NEWS.d/next/Library/2019-01-14-11-53-10.bpo-34294.3JFdg2.rst b/Misc/NEWS.d/next/Library/2019-01-14-11-53-10.bpo-34294.3JFdg2.rst
new file mode 100644
index 00000000000..e1ae2ea6a33
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2019-01-14-11-53-10.bpo-34294.3JFdg2.rst
@@ -0,0 +1,4 @@
+re module, fix wrong capturing groups in rare cases. :func:`re.search`,
+:func:`re.findall`, :func:`re.sub` and other functions that scan through
+string looking for a match, should reset capturing groups between two match
+attempts. Patch by Ma Lin.
\ No newline at end of file
diff --git a/Modules/_sre.c b/Modules/_sre.c
index 75f030cfaa8..21c41b5580b 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -340,7 +340,7 @@ _sre_unicode_tolower_impl(PyObject *module, int character)
 LOCAL(void)
 state_reset(SRE_STATE* state)
 {
-    /* FIXME: dynamic! */
+    /* state->mark will be set to 0 in SRE_OP_MARK dynamically. */
     /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
 
     state->lastmark = -1;
diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h
index 44948e21ad9..437ab43f434 100644
--- a/Modules/sre_lib.h
+++ b/Modules/sre_lib.h
@@ -1363,6 +1363,10 @@ exit:
     return ret; /* should never get here */
 }
 
+/* need to reset capturing groups between two SRE(match) callings in loops */
+#define RESET_CAPTURE_GROUP() \
+    do { state->lastmark = state->lastindex = -1; } while (0)
+
 LOCAL(Py_ssize_t)
 SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
 {
@@ -1440,6 +1444,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
             if (status != 0)
                 return status;
             ++ptr;
+            RESET_CAPTURE_GROUP();
         }
         return 0;
     }
@@ -1487,6 +1492,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
                     /* close but no cigar -- try again */
                     if (++ptr >= end)
                         return 0;
+                    RESET_CAPTURE_GROUP();
                 }
                 i = overlap[i];
             } while (i != 0);
@@ -1510,6 +1516,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
             if (status != 0)
                 break;
             ptr++;
+            RESET_CAPTURE_GROUP();
         }
     } else {
         /* general case */
@@ -1520,6 +1527,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
         state->must_advance = 0;
         while (status == 0 && ptr < end) {
             ptr++;
+            RESET_CAPTURE_GROUP();
             TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
             state->start = state->ptr = ptr;
             status = SRE(match)(state, pattern, 0);