From 4300019e1a6b20f6e2e780a36d96d795c9e71a6f Mon Sep 17 00:00:00 2001 From: Georg Brandl Date: Sun, 13 Oct 2013 09:18:45 +0200 Subject: [PATCH] Add re.fullmatch() function and regex.fullmatch() method, which anchor the pattern at both ends of the string to match. Patch by Matthew Barnett. Closes #16203. --- Doc/library/re.rst | 28 +++++++++++ Lib/re.py | 28 +++++++---- Lib/test/test_re.py | 24 +++++++++ Modules/_sre.c | 120 +++++++++++++++++++++++++++++++------------- Modules/sre.h | 1 + 5 files changed, 156 insertions(+), 45 deletions(-) diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 762ca496bc3..9ea99a94635 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -584,6 +584,16 @@ form. instead (see also :ref:`search-vs-match`). +.. function:: fullmatch(pattern, string, flags=0) + + If the whole *string* matches the regular expression *pattern*, return a + corresponding :ref:`match object <match-objects>`. Return ``None`` if the + string does not match the pattern; note that this is different from a + zero-length match. + + .. versionadded:: 3.4 + + .. function:: split(pattern, string, maxsplit=0, flags=0) Split *string* by the occurrences of *pattern*. If capturing parentheses are @@ -778,6 +788,24 @@ attributes: :meth:`~regex.search` instead (see also :ref:`search-vs-match`). +.. method:: regex.fullmatch(string[, pos[, endpos]]) + + If the whole *string* matches this regular expression, return a corresponding + :ref:`match object <match-objects>`. Return ``None`` if the string does not + match the pattern; note that this is different from a zero-length match. + + The optional *pos* and *endpos* parameters have the same meaning as for the + :meth:`~regex.search` method. + + >>> pattern = re.compile("o[gh]") + >>> pattern.fullmatch("dog") # No match as "o" is not at the start of "dog". + >>> pattern.fullmatch("ogre") # No match as not the full string matches. + >>> pattern.fullmatch("doggie", 1, 3) # Matches within given limits. + <_sre.SRE_Match object at ...> + + .. 
versionadded:: 3.4 + + .. method:: regex.split(string, maxsplit=0) Identical to the :func:`split` function, using the compiled pattern. diff --git a/Lib/re.py b/Lib/re.py index b41aab0bafd..77f5e3fadba 100644 --- a/Lib/re.py +++ b/Lib/re.py @@ -85,16 +85,17 @@ resulting RE will match the second character. \\ Matches a literal backslash. This module exports the following functions: - match Match a regular expression pattern to the beginning of a string. - search Search a string for the presence of a pattern. - sub Substitute occurrences of a pattern found in a string. - subn Same as sub, but also return the number of substitutions made. - split Split a string by the occurrences of a pattern. - findall Find all occurrences of a pattern in a string. - finditer Return an iterator yielding a match object for each match. - compile Compile a pattern into a RegexObject. - purge Clear the regular expression cache. - escape Backslash all non-alphanumerics in a string. + match Match a regular expression pattern to the beginning of a string. + fullmatch Match a regular expression pattern to all of a string. + search Search a string for the presence of a pattern. + sub Substitute occurrences of a pattern found in a string. + subn Same as sub, but also return the number of substitutions made. + split Split a string by the occurrences of a pattern. + findall Find all occurrences of a pattern in a string. + finditer Return an iterator yielding a match object for each match. + compile Compile a pattern into a RegexObject. + purge Clear the regular expression cache. + escape Backslash all non-alphanumerics in a string. 
Some of the functions in this module takes flags as optional parameters: A ASCII For string patterns, make \w, \W, \b, \B, \d, \D @@ -123,7 +124,7 @@ import sre_compile import sre_parse # public symbols -__all__ = [ "match", "search", "sub", "subn", "split", "findall", +__all__ = [ "match", "fullmatch", "search", "sub", "subn", "split", "findall", "compile", "purge", "template", "escape", "A", "I", "L", "M", "S", "X", "U", "ASCII", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE", "UNICODE", "error" ] @@ -154,6 +155,11 @@ def match(pattern, string, flags=0): a match object, or None if no match was found.""" return _compile(pattern, flags).match(string) +def fullmatch(pattern, string, flags=0): + """Try to apply the pattern to all of the string, returning + a match object, or None if no match was found.""" + return _compile(pattern, flags).fullmatch(string) + def search(pattern, string, flags=0): """Scan through string looking for a match to the pattern, returning a match object, or None if no match was found.""" diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 2104437408d..ea57d1f845f 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1061,6 +1061,30 @@ class ReTests(unittest.TestCase): self.assertEqual(m.group(1), "") self.assertEqual(m.group(2), "y") + def test_fullmatch(self): + # Issue 16203: Proposal: add re.fullmatch() method. 
+ self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1)) + self.assertEqual(re.fullmatch(r"a|ab", "ab").span(), (0, 2)) + self.assertEqual(re.fullmatch(r".*?$", "abc").span(), (0, 3)) + self.assertEqual(re.fullmatch(r".*?", "abc").span(), (0, 3)) + self.assertEqual(re.fullmatch(r"a.*?b", "ab").span(), (0, 2)) + self.assertEqual(re.fullmatch(r"a.*?b", "abb").span(), (0, 3)) + self.assertEqual(re.fullmatch(r"a.*?b", "axxb").span(), (0, 4)) + self.assertEqual(re.fullmatch(r"abc$", "abc\n"), None) + self.assertEqual(re.fullmatch(r"abc\Z", "abc\n"), None) + self.assertEqual(re.fullmatch(r"(?m)abc$", "abc\n"), None) + self.assertEqual(re.fullmatch(r"ab(?=c)cd", "abcd").span(), (0, 4)) + self.assertEqual(re.fullmatch(r"ab(?<=b)cd", "abcd").span(), (0, 4)) + self.assertEqual(re.fullmatch(r"(?=a|ab)ab", "ab").span(), (0, 2)) + + self.assertEqual( + re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3)) + self.assertEqual( + re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3)) + self.assertEqual( + re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3)) + + def run_re_tests(): from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR if verbose: diff --git a/Modules/_sre.c b/Modules/_sre.c index 99c3cd5c05f..8d9cb982ec3 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -4,24 +4,25 @@ * regular expression matching engine * * partial history: - * 1999-10-24 fl created (based on existing template matcher code) - * 2000-03-06 fl first alpha, sort of - * 2000-08-01 fl fixes for 1.6b1 - * 2000-08-07 fl use PyOS_CheckStack() if available - * 2000-09-20 fl added expand method - * 2001-03-20 fl lots of fixes for 2.1b2 - * 2001-04-15 fl export copyright as Python attribute, not global - * 2001-04-28 fl added __copy__ methods (work in progress) - * 2001-05-14 fl fixes for 1.5.2 compatibility - * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis) - * 2001-10-18 fl fixed group reset issue (from Matthew Mueller) - * 
2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1 - * 2001-10-21 fl added sub/subn primitive - * 2001-10-24 fl added finditer primitive (for 2.2 only) - * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum) - * 2002-11-09 fl fixed empty sub/subn return type - * 2003-04-18 mvl fully support 4-byte codes - * 2003-10-17 gn implemented non recursive scheme + * 1999-10-24 fl created (based on existing template matcher code) + * 2000-03-06 fl first alpha, sort of + * 2000-08-01 fl fixes for 1.6b1 + * 2000-08-07 fl use PyOS_CheckStack() if available + * 2000-09-20 fl added expand method + * 2001-03-20 fl lots of fixes for 2.1b2 + * 2001-04-15 fl export copyright as Python attribute, not global + * 2001-04-28 fl added __copy__ methods (work in progress) + * 2001-05-14 fl fixes for 1.5.2 compatibility + * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis) + * 2001-10-18 fl fixed group reset issue (from Matthew Mueller) + * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1 + * 2001-10-21 fl added sub/subn primitive + * 2001-10-24 fl added finditer primitive (for 2.2 only) + * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum) + * 2002-11-09 fl fixed empty sub/subn return type + * 2003-04-18 mvl fully support 4-byte codes + * 2003-10-17 gn implemented non recursive scheme + * 2013-02-04 mrab added fullmatch primitive * * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. 
* @@ -746,11 +747,12 @@ do { \ #define JUMP_ASSERT 12 #define JUMP_ASSERT_NOT 13 -#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \ +#define DO_JUMP(jumpvalue, jumplabel, nextpattern, matchall) \ DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \ nextctx->last_ctx_pos = ctx_pos; \ nextctx->jump = jumpvalue; \ nextctx->pattern = nextpattern; \ + nextctx->match_all = matchall; \ ctx_pos = alloc_pos; \ ctx = nextctx; \ goto entrance; \ @@ -769,6 +771,7 @@ typedef struct { SRE_CODE chr; SRE_REPEAT* rep; } u; + int match_all; } SRE_MATCH_CONTEXT; /* check if string matches the given pattern. returns <0 for @@ -791,6 +794,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ctx->last_ctx_pos = -1; ctx->jump = JUMP_NONE; ctx->pattern = pattern; + ctx->match_all = state->match_all; ctx_pos = alloc_pos; entrance: @@ -864,6 +868,8 @@ entrance: case SRE_OP_SUCCESS: /* end of pattern */ TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr)); + if (ctx->match_all && ctx->ptr != state->end) + RETURN_FAILURE; state->ptr = ctx->ptr; RETURN_SUCCESS; @@ -972,7 +978,7 @@ entrance: !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0)))) continue; state->ptr = ctx->ptr; - DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1); + DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1, ctx->match_all); if (ret) { if (ctx->u.rep) MARK_POP_DISCARD(ctx->lastmark); @@ -1019,7 +1025,8 @@ entrance: if (ctx->count < (Py_ssize_t) ctx->pattern[1]) RETURN_FAILURE; - if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) { + if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS && + (!ctx->match_all || ctx->ptr == state->end)) { /* tail is empty. 
we're finished */ state->ptr = ctx->ptr; RETURN_SUCCESS; @@ -1042,7 +1049,7 @@ entrance: break; state->ptr = ctx->ptr; DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1, - ctx->pattern+ctx->pattern[0]); + ctx->pattern+ctx->pattern[0], ctx->match_all); if (ret) { RETURN_ON_ERROR(ret); RETURN_SUCCESS; @@ -1059,7 +1066,7 @@ entrance: while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) { state->ptr = ctx->ptr; DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2, - ctx->pattern+ctx->pattern[0]); + ctx->pattern+ctx->pattern[0], ctx->match_all); if (ret) { RETURN_ON_ERROR(ret); RETURN_SUCCESS; @@ -1104,7 +1111,8 @@ entrance: ctx->ptr += state->charsize * ctx->count; } - if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) { + if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS && + (!ctx->match_all || ctx->ptr == state->end)) { /* tail is empty. we're finished */ state->ptr = ctx->ptr; RETURN_SUCCESS; @@ -1116,7 +1124,7 @@ entrance: || ctx->count <= (Py_ssize_t)ctx->pattern[2]) { state->ptr = ctx->ptr; DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one, - ctx->pattern+ctx->pattern[0]); + ctx->pattern+ctx->pattern[0], ctx->match_all); if (ret) { RETURN_ON_ERROR(ret); RETURN_SUCCESS; @@ -1155,7 +1163,7 @@ entrance: state->repeat = ctx->u.rep; state->ptr = ctx->ptr; - DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]); + DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0], ctx->match_all); state->repeat = ctx->u.rep->prev; PyObject_FREE(ctx->u.rep); @@ -1187,7 +1195,7 @@ entrance: /* not enough matches */ ctx->u.rep->count = ctx->count; DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1, - ctx->u.rep->pattern+3); + ctx->u.rep->pattern+3, ctx->match_all); if (ret) { RETURN_ON_ERROR(ret); RETURN_SUCCESS; @@ -1209,7 +1217,7 @@ entrance: DATA_PUSH(&ctx->u.rep->last_ptr); ctx->u.rep->last_ptr = state->ptr; DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2, - ctx->u.rep->pattern+3); + ctx->u.rep->pattern+3, ctx->match_all); DATA_POP(&ctx->u.rep->last_ptr); if (ret) { 
MARK_POP_DISCARD(ctx->lastmark); @@ -1225,7 +1233,7 @@ entrance: /* cannot match more repeated items here. make sure the tail matches */ state->repeat = ctx->u.rep->prev; - DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern); + DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern, ctx->match_all); RETURN_ON_SUCCESS(ret); state->repeat = ctx->u.rep; state->ptr = ctx->ptr; @@ -1250,7 +1258,7 @@ entrance: /* not enough matches */ ctx->u.rep->count = ctx->count; DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1, - ctx->u.rep->pattern+3); + ctx->u.rep->pattern+3, ctx->match_all); if (ret) { RETURN_ON_ERROR(ret); RETURN_SUCCESS; @@ -1264,7 +1272,7 @@ entrance: /* see if the tail matches */ state->repeat = ctx->u.rep->prev; - DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern); + DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern, ctx->match_all); if (ret) { RETURN_ON_ERROR(ret); RETURN_SUCCESS; @@ -1285,7 +1293,7 @@ entrance: DATA_PUSH(&ctx->u.rep->last_ptr); ctx->u.rep->last_ptr = state->ptr; DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3, - ctx->u.rep->pattern+3); + ctx->u.rep->pattern+3, ctx->match_all); DATA_POP(&ctx->u.rep->last_ptr); if (ret) { RETURN_ON_ERROR(ret); @@ -1378,7 +1386,7 @@ entrance: state->ptr = ctx->ptr - state->charsize * ctx->pattern[1]; if (state->ptr < state->beginning) RETURN_FAILURE; - DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2); + DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2, 0); RETURN_ON_FAILURE(ret); ctx->pattern += ctx->pattern[0]; break; @@ -1390,7 +1398,7 @@ entrance: ctx->ptr, ctx->pattern[1])); state->ptr = ctx->ptr - state->charsize * ctx->pattern[1]; if (state->ptr >= state->beginning) { - DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2); + DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2, 0); if (ret) { RETURN_ON_ERROR(ret); RETURN_FAILURE; @@ -1909,6 +1917,51 @@ pattern_match(PatternObject* self, PyObject* args, PyObject* kw) return pattern_new_match(self, &state, status); } +/* fullmatch(string[, pos[, endpos]]): run the matcher from state.start with + state.match_all set, so SRE_OP_SUCCESS is only accepted at state->end. + Returns the result of pattern_new_match(), or NULL on failure. */ +static PyObject* 
+pattern_fullmatch(PatternObject* self, PyObject* args, PyObject* kw) +{ + SRE_STATE state; + Py_ssize_t status; + + PyObject* string; + Py_ssize_t start = 0; + Py_ssize_t end = PY_SSIZE_T_MAX; + /* NOTE(review): first keyword is "pattern" but the docs name it string -- confirm */ + static char* kwlist[] = { "pattern", "pos", "endpos", NULL }; + if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:fullmatch", kwlist, + &string, &start, &end)) + return NULL; + + string = state_init(&state, self, string, start, end); + if (!string) + return NULL; + + state.match_all = 1; + state.ptr = state.start; + + TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr)); + + if (state.logical_charsize == 1) { + status = sre_match(&state, PatternObject_GetCode(self)); + } else { + status = sre_umatch(&state, PatternObject_GetCode(self)); + } + + TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); + if (PyErr_Occurred()) { + /* state_init() succeeded, so release the state on this path too */ + state_fini(&state); + return NULL; + } + + state_fini(&state); + + return pattern_new_match(self, &state, status); +} + static PyObject* pattern_search(PatternObject* self, PyObject* args, PyObject* kw) { @@ -2530,6 +2583,10 @@ PyDoc_STRVAR(pattern_match_doc, "match(string[, pos[, endpos]]) -> match object or None.\n\ Matches zero or more characters at the beginning of the string"); +PyDoc_STRVAR(pattern_fullmatch_doc, +"fullmatch(string[, pos[, endpos]]) -> match object or None.\n\ + Matches against all of the string"); + PyDoc_STRVAR(pattern_search_doc, "search(string[, pos[, endpos]]) -> match object or None.\n\ Scan through string looking for a match, and return a corresponding\n\ @@ -2565,6 +2622,8 @@ PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects"); static PyMethodDef pattern_methods[] = { {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS, pattern_match_doc}, + {"fullmatch", (PyCFunction) pattern_fullmatch, METH_VARARGS|METH_KEYWORDS, + pattern_fullmatch_doc}, {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS, pattern_search_doc}, {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS, diff 
--git a/Modules/sre.h b/Modules/sre.h index 0a8f0cfe3d9..1b64a6d67d6 100644 --- a/Modules/sre.h +++ b/Modules/sre.h @@ -89,6 +89,7 @@ typedef struct { SRE_REPEAT *repeat; /* hooks */ SRE_TOLOWER_HOOK lower; + int match_all; } SRE_STATE; typedef struct {