From 770617b23e286f1147f9480b5f625e88e7badd50 Mon Sep 17 00:00:00 2001 From: Fredrik Lundh Date: Sun, 14 Jan 2001 15:06:11 +0000 Subject: [PATCH] SRE fixes for 2.1 alpha: -- added some more docstrings -- fixed typo in scanner class (#125531) -- the multiline flag (?m) shouldn't affect the \Z operator (#127259) -- fixed non-greedy backtracking bug (#123769, #127259) -- added sre.DEBUG flag (currently dumps the parsed pattern structure) -- fixed a couple of glitches in groupdict (the #126587 memory leak had already been fixed by AMK) --- Lib/sre.py | 51 ++++++++++++++++++++++++++----------- Lib/sre_compile.py | 6 ++--- Lib/sre_constants.py | 11 +++++--- Lib/sre_parse.py | 18 ++++++------- Lib/test/test_sre.py | 16 ++++++------ Modules/_sre.c | 56 ++++++++++++++++++++++++----------------- Modules/sre_constants.h | 12 +++++---- 7 files changed, 104 insertions(+), 66 deletions(-) diff --git a/Lib/sre.py b/Lib/sre.py index 6dea5c40456..8d03e921a91 100644 --- a/Lib/sre.py +++ b/Lib/sre.py @@ -3,7 +3,7 @@ # # re-compatible interface for the sre matching engine # -# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved. +# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. # # This version of the SRE library can be redistributed under CNRI's # Python 1.6 license. For any other use, please contact Secret Labs @@ -14,23 +14,22 @@ # other compatibility work. 
# -# FIXME: change all FIXME's to XXX ;-) - import sre_compile import sre_parse import string # flags -I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE -L = LOCALE = sre_compile.SRE_FLAG_LOCALE -M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE -S = DOTALL = sre_compile.SRE_FLAG_DOTALL -X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE +I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE # ignore case +L = LOCALE = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale +U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode locale +M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline +S = DOTALL = sre_compile.SRE_FLAG_DOTALL # make dot match newline +X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments -# sre extensions (may or may not be in 1.6/2.0 final) -T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE -U = UNICODE = sre_compile.SRE_FLAG_UNICODE +# sre extensions (experimental, don't rely on these) +T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE # disable backtracking +DEBUG = sre_compile.SRE_FLAG_DEBUG # dump pattern after compilation # sre exception error = sre_compile.error @@ -38,36 +37,60 @@ error = sre_compile.error # -------------------------------------------------------------------- # public interface -# FIXME: add docstrings - def match(pattern, string, flags=0): + """Try to apply the pattern at the start of the string, returning + a match object, or None if no match was found.""" return _compile(pattern, flags).match(string) def search(pattern, string, flags=0): + """Scan through string looking for a match to the pattern, returning + a match object, or None if no match was found.""" return _compile(pattern, flags).search(string) def sub(pattern, repl, string, count=0): + """Return the string obtained by replacing the leftmost + non-overlapping occurrences of the pattern in string by the + replacement repl""" return _compile(pattern, 0).sub(repl, string, count) def subn(pattern, repl, string, count=0): + """Return 
a 2-tuple containing (new_string, number). + new_string is the string obtained by replacing the leftmost + non-overlapping occurrences of the pattern in the source + string by the replacement repl. number is the number of + substitutions that were made.""" return _compile(pattern, 0).subn(repl, string, count) def split(pattern, string, maxsplit=0): + """Split the source string by the occurrences of the pattern, + returning a list containing the resulting substrings.""" return _compile(pattern, 0).split(string, maxsplit) def findall(pattern, string, maxsplit=0): + """Return a list of all non-overlapping matches in the string. + + If one or more groups are present in the pattern, return a + list of groups; this will be a list of tuples if the pattern + has more than one group. + + Empty matches are included in the result.""" return _compile(pattern, 0).findall(string, maxsplit) def compile(pattern, flags=0): + "Compile a regular expression pattern, returning a pattern object." return _compile(pattern, flags) def purge(): + "Clear the regular expression cache" _cache.clear() def template(pattern, flags=0): + "Compile a template pattern, returning a pattern object" + return _compile(pattern, flags|T) def escape(pattern): + "Escape all non-alphanumeric characters in pattern." s = list(pattern) for i in range(len(pattern)): c = pattern[i] @@ -204,7 +227,7 @@ class Scanner: break action = self.lexicon[m.lastindex][1] if callable(action): - self.match = match + self.match = m action = action(self, m.group()) if action is not None: append(action) diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index dc508e57cdc..adab767230b 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -3,7 +3,7 @@ # # convert template to internal format # -# Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. +# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. # # See the sre.py file for information on usage and redistribution. 
# @@ -176,7 +176,7 @@ def _optimize_charset(charset, fixup): for i in range(fixup(av[0]), fixup(av[1])+1): charmap[i] = 1 elif op is CATEGORY: - # FIXME: could append to charmap tail + # XXX: could append to charmap tail return charset # cannot compress except IndexError: # character set contains unicode characters @@ -364,7 +364,7 @@ def compile(p, flags=0): # print code - # FIXME: get rid of this limitation! + # XXX: get rid of this limitation! assert p.pattern.groups <= 100,\ "sorry, but this version only supports 100 named groups" diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index ea649c04829..a5e4bb8c1e3 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -4,7 +4,7 @@ # various symbols used by the regular expression engine. # run this script to update the _sre include files! # -# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved. +# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. # # See the sre.py file for information on usage and redistribution. 
# @@ -54,10 +54,12 @@ SUBPATTERN = "subpattern" # positions AT_BEGINNING = "at_beginning" AT_BEGINNING_LINE = "at_beginning_line" +AT_BEGINNING_STRING = "at_beginning_string" AT_BOUNDARY = "at_boundary" AT_NON_BOUNDARY = "at_non_boundary" AT_END = "at_end" AT_END_LINE = "at_end_line" +AT_END_STRING = "at_end_string" # categories CATEGORY_DIGIT = "category_digit" @@ -109,8 +111,8 @@ OPCODES = [ ] ATCODES = [ - AT_BEGINNING, AT_BEGINNING_LINE, AT_BOUNDARY, - AT_NON_BOUNDARY, AT_END, AT_END_LINE + AT_BEGINNING, AT_BEGINNING_LINE, AT_BEGINNING_STRING, AT_BOUNDARY, + AT_NON_BOUNDARY, AT_END, AT_END_LINE, AT_END_STRING ] CHCODES = [ @@ -178,6 +180,7 @@ SRE_FLAG_MULTILINE = 8 # treat target as multiline string SRE_FLAG_DOTALL = 16 # treat target as a single string SRE_FLAG_UNICODE = 32 # use unicode locale SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments +SRE_FLAG_DEBUG = 128 # debugging # flags for INFO primitive SRE_INFO_PREFIX = 1 # has prefix @@ -201,7 +204,7 @@ if __name__ == "__main__": * NOTE: This file is generated by sre_constants.py. If you need * to change anything in here, edit sre_constants.py and run it. * - * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. + * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. * * See the _sre.c file for information on usage and redistribution. */ diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 5334e0661aa..a21fd61dc9b 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -3,7 +3,7 @@ # # convert re-style regular expression to sre pattern # -# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved. +# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. # # See the sre.py file for information on usage and redistribution. 
# @@ -34,7 +34,7 @@ ESCAPES = { } CATEGORIES = { - r"\A": (AT, AT_BEGINNING), # start of string + r"\A": (AT, AT_BEGINNING_STRING), # start of string r"\b": (AT, AT_BOUNDARY), r"\B": (AT, AT_NON_BOUNDARY), r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), @@ -43,7 +43,7 @@ CATEGORIES = { r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), - r"\Z": (AT, AT_END), # end of string + r"\Z": (AT, AT_END_STRING), # end of string } FLAGS = { @@ -421,13 +421,13 @@ def _parse(source, state): code1 = code1[1][0] set.append(code1) - # FIXME: move set optimization to compiler! + # XXX: should move set optimization to compiler! if len(set)==1 and set[0][0] is LITERAL: subpattern.append(set[0]) # optimization elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: subpattern.append((NOT_LITERAL, set[1][1])) # optimization else: - # FIXME: add charmap optimization + # XXX: should add charmap optimization here subpattern.append((IN, set)) elif this and this[0] in REPEAT_CHARS: @@ -457,7 +457,7 @@ def _parse(source, state): min = int(lo) if hi: max = int(hi) - # FIXME: check that hi >= lo! + # XXX: check that hi >= lo ??? else: raise error, "not supported" # figure out which item to repeat @@ -601,7 +601,8 @@ def parse(str, flags=0, pattern=None): elif tail: raise error, "bogus characters at end of regular expression" - # p.dump() + if flags & SRE_FLAG_DEBUG: + p.dump() if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE: # the VERBOSE flag was switched on inside the pattern. to be @@ -672,8 +673,7 @@ def parse_template(source, pattern): return p def expand_template(template, match): - # FIXME: this is sooooo slow. drop in the slicelist - # code instead + # XXX: this is sooooo slow. 
drop in the slicelist code instead p = [] a = p.append sep = match.string[:0] diff --git a/Lib/test/test_sre.py b/Lib/test/test_sre.py index 9c01c666fff..b9692a1f343 100644 --- a/Lib/test/test_sre.py +++ b/Lib/test/test_sre.py @@ -47,12 +47,12 @@ if verbose: print 'Running tests on character literals' for i in [0, 8, 16, 32, 64, 127, 128, 255]: - test(r"""sre.match(r"\%03o" % i, chr(i)) is not None""", 1) - test(r"""sre.match(r"\%03o0" % i, chr(i)+"0") is not None""", 1) - test(r"""sre.match(r"\%03o8" % i, chr(i)+"8") is not None""", 1) - test(r"""sre.match(r"\x%02x" % i, chr(i)) is not None""", 1) - test(r"""sre.match(r"\x%02x0" % i, chr(i)+"0") is not None""", 1) - test(r"""sre.match(r"\x%02xz" % i, chr(i)+"z") is not None""", 1) + test(r"""sre.match(r"\%03o" % i, chr(i)) != None""", 1) + test(r"""sre.match(r"\%03o0" % i, chr(i)+"0") != None""", 1) + test(r"""sre.match(r"\%03o8" % i, chr(i)+"8") != None""", 1) + test(r"""sre.match(r"\x%02x" % i, chr(i)) != None""", 1) + test(r"""sre.match(r"\x%02x0" % i, chr(i)+"0") != None""", 1) + test(r"""sre.match(r"\x%02xz" % i, chr(i)+"z") != None""", 1) test(r"""sre.match("\911", "")""", None, sre.error) # @@ -197,11 +197,11 @@ if verbose: p = "" for i in range(0, 256): p = p + chr(i) - test(r"""sre.match(sre.escape(chr(i)), chr(i)) is not None""", 1) + test(r"""sre.match(sre.escape(chr(i)), chr(i)) != None""", 1) test(r"""sre.match(sre.escape(chr(i)), chr(i)).span()""", (0,1)) pat = sre.compile(sre.escape(p)) -test(r"""pat.match(p) is not None""", 1) +test(r"""pat.match(p) != None""", 1) test(r"""pat.match(p).span()""", (0,256)) if verbose: diff --git a/Modules/_sre.c b/Modules/_sre.c index ccbd7b2857a..28ec61c2100 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -22,8 +22,10 @@ * 2000-09-21 fl don't use the buffer interface for unicode strings * 2000-10-03 fl fixed assert_not primitive; support keyword arguments * 2000-10-24 fl really fixed assert_not; reset groups in findall + * 2000-12-21 fl fixed memory leak in 
groupdict + * 2001-01-02 fl properly reset pointer after failed assertion in MIN_UNTIL * - * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. + * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. * * This version of the SRE library can be redistributed under CNRI's * Python 1.6 license. For any other use, please contact Secret Labs @@ -355,6 +357,7 @@ SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at) switch (at) { case SRE_AT_BEGINNING: + case SRE_AT_BEGINNING_STRING: return ((void*) ptr == state->beginning); case SRE_AT_BEGINNING_LINE: @@ -370,6 +373,9 @@ SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at) return ((void*) ptr == state->end || SRE_IS_LINEBREAK((int) ptr[0])); + case SRE_AT_END_STRING: + return ((void*) ptr == state->end); + case SRE_AT_BOUNDARY: if (state->beginning == state->end) return 0; @@ -826,7 +832,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) /* this operator only works if the repeated item is exactly one character wide, and we're not already collecting backtracking points. for other cases, - use the MAX_REPEAT operator instead */ + use the MAX_REPEAT operator */ /* <1=min> <2=max> item tail */ @@ -900,7 +906,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) case SRE_OP_REPEAT: /* create repeat context. 
all the hard work is done - by the UNTIL operator */ + by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ /* <1=min> <2=max> item tail */ TRACE(("|%p|%p|REPEAT %d %d\n", pattern, ptr, pattern[1], pattern[2])); @@ -974,6 +980,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) if (i) return i; state->repeat = rp; + state->ptr = ptr; return 0; case SRE_OP_MIN_UNTIL: @@ -986,7 +993,8 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) count = rp->count + 1; - TRACE(("|%p|%p|MIN_UNTIL %d\n", pattern, ptr, count)); + TRACE(("|%p|%p|MIN_UNTIL %d %p\n", pattern, ptr, count, + rp->pattern)); state->ptr = ptr; @@ -1009,6 +1017,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) /* free(rp); */ return i; } + state->ptr = ptr; state->repeat = rp; if (count >= rp->pattern[2] && rp->pattern[2] != 65535) @@ -1020,6 +1029,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) if (i) return i; rp->count = count - 1; + state->ptr = ptr; return 0; default: @@ -1965,7 +1975,7 @@ match_groupdict(MatchObject* self, PyObject* args, PyObject* kw) PyObject* def = Py_None; static char* kwlist[] = { "default", NULL }; - if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def)) + if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def)) return NULL; result = PyDict_New(); @@ -1973,35 +1983,35 @@ match_groupdict(MatchObject* self, PyObject* args, PyObject* kw) return result; keys = PyMapping_Keys(self->pattern->groupindex); - if (!keys) { - Py_DECREF(result); - return NULL; - } + if (!keys) + goto failed; for (index = 0; index < PyList_GET_SIZE(keys); index++) { + int status; PyObject* key; - PyObject* item; + PyObject* value; key = PyList_GET_ITEM(keys, index); - if (!key) { - Py_DECREF(keys); - Py_DECREF(result); - return NULL; - } - item = match_getslice(self, key, def); - if (!item) { + if (!key) + goto failed; + value = match_getslice(self, key, def); + if (!value) { Py_DECREF(key); - Py_DECREF(keys); - Py_DECREF(result); - 
return NULL; + goto failed; } - /* FIXME: this can fail, right? */ - PyDict_SetItem(result, key, item); - Py_DECREF(item); + status = PyDict_SetItem(result, key, value); + Py_DECREF(value); + if (status < 0) + goto failed; } Py_DECREF(keys); return result; + +failed: + Py_DECREF(keys); + Py_DECREF(result); + return NULL; } static PyObject* diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h index 5c55c3dbd91..6cad0899c63 100644 --- a/Modules/sre_constants.h +++ b/Modules/sre_constants.h @@ -6,7 +6,7 @@ * NOTE: This file is generated by sre_constants.py. If you need * to change anything in here, edit sre_constants.py and run it. * - * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. + * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. * * See the _sre.c file for information on usage and redistribution. */ @@ -42,10 +42,12 @@ #define SRE_OP_SUBPATTERN 28 #define SRE_AT_BEGINNING 0 #define SRE_AT_BEGINNING_LINE 1 -#define SRE_AT_BOUNDARY 2 -#define SRE_AT_NON_BOUNDARY 3 -#define SRE_AT_END 4 -#define SRE_AT_END_LINE 5 +#define SRE_AT_BEGINNING_STRING 2 +#define SRE_AT_BOUNDARY 3 +#define SRE_AT_NON_BOUNDARY 4 +#define SRE_AT_END 5 +#define SRE_AT_END_LINE 6 +#define SRE_AT_END_STRING 7 #define SRE_CATEGORY_DIGIT 0 #define SRE_CATEGORY_NOT_DIGIT 1 #define SRE_CATEGORY_SPACE 2