From 9baa5b2de2e1bd4d56791de8144f737f65b89c74 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 29 Sep 2014 22:49:23 +0300 Subject: [PATCH] Issue #22437: Number of capturing groups in regular expression is no longer limited by 100. --- Doc/whatsnew/3.5.rst | 6 ++++++ Lib/sre_compile.py | 6 ------ Lib/sre_constants.py | 2 +- Lib/sre_parse.py | 10 +++++++++ Lib/test/test_re.py | 18 ++++++++++++++-- Misc/NEWS | 3 +++ Modules/_sre.c | 51 ++++++++++++++++++++++++++++++++------------ Modules/sre.h | 7 +++--- 8 files changed, 76 insertions(+), 27 deletions(-) diff --git a/Doc/whatsnew/3.5.rst b/Doc/whatsnew/3.5.rst index 5c2be4778a6..7fb0fd59267 100644 --- a/Doc/whatsnew/3.5.rst +++ b/Doc/whatsnew/3.5.rst @@ -217,6 +217,12 @@ os * :class:`os.stat_result` now has a :attr:`~os.stat_result.st_file_attributes` attribute on Windows (contributed by Ben Hoyt in :issue:`21719`). +re +-- + +* Number of capturing groups in regular expression is no longer limited by 100. + (Contributed by Serhiy Storchaka in :issue:`22437`.) + shutil ------ diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index c6860b5bda5..d4d129b6746 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -470,12 +470,6 @@ def compile(p, flags=0): # print code - # XXX: get rid of this limitation! - if p.pattern.groups > 100: - raise AssertionError( - "sorry, but this version only supports 100 named groups" - ) - # map in either direction groupindex = p.pattern.groupdict indexgroup = [None] * p.pattern.groups diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index 23e3516006c..8815d1d4874 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -15,7 +15,7 @@ MAGIC = 20031017 -from _sre import MAXREPEAT +from _sre import MAXREPEAT, MAXGROUPS # SRE standard exception (access as sre.error) # should this really be here? 
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 7fd145b6233..b9a1852823d 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -72,6 +72,8 @@ class Pattern: def opengroup(self, name=None): gid = self.groups self.groups = gid + 1 + if self.groups > MAXGROUPS: + raise error("groups number is too large") if name is not None: ogid = self.groupdict.get(name, None) if ogid is not None: @@ -695,8 +697,14 @@ def _parse(source, state): else: try: condgroup = int(condname) + if condgroup < 0: + raise ValueError except ValueError: raise error("bad character in group name") + if not condgroup: + raise error("bad group number") + if condgroup >= MAXGROUPS: + raise error("the group number is too large") else: # flags if not source.next in FLAGS: @@ -822,6 +830,8 @@ def parse_template(source, pattern): index = int(name) if index < 0: raise error("negative group number") + if index >= MAXGROUPS: + raise error("the group number is too large") except ValueError: if not name.isidentifier(): raise error("bad character in group name") diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index d85b767f12c..e5ad6cb6bc1 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -193,6 +193,7 @@ class ReTests(unittest.TestCase): def test_symbolic_groups(self): re.compile('(?P<a>x)(?P=a)(?(a)y)') re.compile('(?P<a1>x)(?P=a1)(?(a1)y)') + re.compile('(?P<a1>x)\1(?(1)y)') self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)') self.assertRaises(re.error, re.compile, '(?Px)') self.assertRaises(re.error, re.compile, '(?P=)') @@ -212,6 +213,10 @@ class ReTests(unittest.TestCase): re.compile('(?P<µ>x)(?P=µ)(?(µ)y)') re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)') self.assertRaises(re.error, re.compile, '(?P<©>x)') + # Support > 100 groups. 
+ pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1)) + pat = '(?:%s)(?(200)z|t)' % pat + self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5)) def test_symbolic_refs(self): self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx') @@ -228,6 +233,9 @@ class ReTests(unittest.TestCase): self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx') self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx') self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<©>', 'xx') + # Support > 100 groups. + pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1)) + self.assertEqual(re.sub(pat, '\g<200>', 'xc8yzxc8y'), 'c8zc8') def test_re_subn(self): self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2)) @@ -404,6 +412,10 @@ class ReTests(unittest.TestCase): self.assertIsNone(p.match('abd')) self.assertIsNone(p.match('ac')) + # Support > 100 groups. + pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1)) + pat = '(?:%s)(?(200)z)' % pat + self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5)) def test_re_groupref(self): self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(), @@ -1070,8 +1082,10 @@ class ReTests(unittest.TestCase): # a RuntimeError is raised instead of OverflowError. long_overflow = 2**128 self.assertRaises(TypeError, re.finditer, "a", {}) - self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow]) - self.assertRaises(TypeError, _sre.compile, {}, 0, []) + with self.assertRaises(OverflowError): + _sre.compile("abc", 0, [long_overflow], 0, [], []) + with self.assertRaises(TypeError): + _sre.compile({}, 0, [], 0, [], []) def test_search_dot_unicode(self): self.assertTrue(re.search("123.*-", '123abc-')) diff --git a/Misc/NEWS b/Misc/NEWS index 63942a920cf..77a27251d94 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -145,6 +145,9 @@ Core and Builtins Library ------- +- Issue #22437: Number of capturing groups in regular expression is no longer + limited by 100. 
+ - Issue #17442: InteractiveInterpreter now displays the full chained traceback in its showtraceback method, to match the built in interactive interpreter. diff --git a/Modules/_sre.c b/Modules/_sre.c index 13479ba5d7f..5c3d1058909 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -357,6 +357,11 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, memset(state, 0, sizeof(SRE_STATE)); + state->mark = PyMem_New(void *, pattern->groups * 2); + if (!state->mark) { + PyErr_NoMemory(); + goto err; + } state->lastmark = -1; state->lastindex = -1; @@ -409,6 +414,8 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, return string; err: + PyMem_Del(state->mark); + state->mark = NULL; if (state->buffer.buf) PyBuffer_Release(&state->buffer); return NULL; @@ -421,6 +428,8 @@ state_fini(SRE_STATE* state) PyBuffer_Release(&state->buffer); Py_XDECREF(state->string); data_stack_dealloc(state); + PyMem_Del(state->mark); + state->mark = NULL; } /* calculate offset from start of string */ @@ -560,6 +569,7 @@ pattern_match(PatternObject *self, PyObject *args, PyObject *kwargs) PyObject *pattern = NULL; SRE_STATE state; Py_ssize_t status; + PyObject *match; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|Onn$O:match", _keywords, @@ -579,12 +589,14 @@ pattern_match(PatternObject *self, PyObject *args, PyObject *kwargs) status = sre_match(&state, PatternObject_GetCode(self), 0); TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); - if (PyErr_Occurred()) + if (PyErr_Occurred()) { + state_fini(&state); return NULL; + } + match = pattern_new_match(self, &state, status); state_fini(&state); - - return (PyObject *)pattern_new_match(self, &state, status); + return match; } static PyObject* @@ -592,6 +604,7 @@ pattern_fullmatch(PatternObject* self, PyObject* args, PyObject* kw) { SRE_STATE state; Py_ssize_t status; + PyObject *match; PyObject *string = NULL, *string2 = NULL; Py_ssize_t start = 0; @@ -616,12 +629,14 @@ 
pattern_fullmatch(PatternObject* self, PyObject* args, PyObject* kw) status = sre_match(&state, PatternObject_GetCode(self), 1); TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); - if (PyErr_Occurred()) + if (PyErr_Occurred()) { + state_fini(&state); return NULL; + } + match = pattern_new_match(self, &state, status); state_fini(&state); - - return pattern_new_match(self, &state, status); + return match; } static PyObject* @@ -629,6 +644,7 @@ pattern_search(PatternObject* self, PyObject* args, PyObject* kw) { SRE_STATE state; Py_ssize_t status; + PyObject *match; PyObject *string = NULL, *string2 = NULL; Py_ssize_t start = 0; @@ -652,12 +668,14 @@ pattern_search(PatternObject* self, PyObject* args, PyObject* kw) TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); - state_fini(&state); - - if (PyErr_Occurred()) + if (PyErr_Occurred()) { + state_fini(&state); return NULL; + } - return pattern_new_match(self, &state, status); + match = pattern_new_match(self, &state, status); + state_fini(&state); + return match; } static PyObject* @@ -1417,7 +1435,7 @@ _compile(PyObject* self_, PyObject* args) PyObject* groupindex = NULL; PyObject* indexgroup = NULL; - if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags, + if (!PyArg_ParseTuple(args, "OiO!nOO", &pattern, &flags, &PyList_Type, &code, &groups, &groupindex, &indexgroup)) return NULL; @@ -1933,10 +1951,9 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) static int _validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) { - if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS) + if (groups < 0 || (size_t)groups > SRE_MAXGROUPS || + code >= end || end[-1] != SRE_OP_SUCCESS) FAIL; - if (groups == 0) /* fix for simplejson */ - groups = 100; /* 100 groups should always be safe */ return _validate_inner(code, end-1, groups); } @@ -2747,6 +2764,12 @@ PyMODINIT_FUNC PyInit__sre(void) Py_DECREF(x); } + x = PyLong_FromUnsignedLong(SRE_MAXGROUPS); + 
if (x) { + PyDict_SetItemString(d, "MAXGROUPS", x); + Py_DECREF(x); + } + x = PyUnicode_FromString(copyright); if (x) { PyDict_SetItemString(d, "copyright", x); diff --git a/Modules/sre.h b/Modules/sre.h index 42fe28d554c..35d198feadc 100644 --- a/Modules/sre.h +++ b/Modules/sre.h @@ -18,8 +18,10 @@ #define SRE_CODE Py_UCS4 #if SIZEOF_SIZE_T > 4 # define SRE_MAXREPEAT (~(SRE_CODE)0) +# define SRE_MAXGROUPS ((~(SRE_CODE)0) / 2) #else # define SRE_MAXREPEAT ((SRE_CODE)PY_SSIZE_T_MAX) +# define SRE_MAXGROUPS ((SRE_CODE)PY_SSIZE_T_MAX / SIZEOF_SIZE_T / 2) #endif typedef struct { @@ -52,9 +54,6 @@ typedef struct { typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch); -/* FIXME: shouldn't be a constant, really... */ -#define SRE_MARK_SIZE 200 - typedef struct SRE_REPEAT_T { Py_ssize_t count; SRE_CODE* pattern; /* points to REPEAT operator arguments */ @@ -76,7 +75,7 @@ typedef struct { /* registers */ Py_ssize_t lastindex; Py_ssize_t lastmark; - void* mark[SRE_MARK_SIZE]; + void** mark; /* dynamically allocated stuff */ char* data_stack; size_t data_stack_size;