diff --git a/Lib/sre.py b/Lib/sre.py index 97a5140e916..5e6aeeb8533 100644 --- a/Lib/sre.py +++ b/Lib/sre.py @@ -89,6 +89,10 @@ def _compile(pattern, flags=0): _cache[key] = p return p +def purge(): + # clear pattern cache + _cache.clear() + def _sub(pattern, template, string, count=0): # internal: pattern.sub implementation hook return _subn(pattern, template, string, count)[0] @@ -142,3 +146,12 @@ def _split(pattern, string, maxsplit=0): n = n + 1 append(string[i:]) return s + +# register myself for pickling + +import copy_reg + +def _pickle(p): + return _compile, (p.pattern, p.flags) + +copy_reg.pickle(type(_compile("")), _pickle, _compile) diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 0829c00e279..e48a7eb9901 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -31,15 +31,15 @@ def _compile(code, pattern, flags): emit(OPCODES[OP_IGNORE[op]]) else: emit(OPCODES[op]) - emit(ord(av)) + emit(av) elif op is IN: if flags & SRE_FLAG_IGNORECASE: emit(OPCODES[OP_IGNORE[op]]) def fixup(literal, flags=flags): - return _sre.getlower(ord(literal), flags) + return _sre.getlower(literal, flags) else: emit(OPCODES[op]) - fixup = ord + fixup = lambda x: x skip = len(code); emit(0) for op, av in av: emit(OPCODES[op]) @@ -165,7 +165,7 @@ def _compile_info(code, pattern, flags): if not (flags & SRE_FLAG_IGNORECASE): for op, av in pattern.data: if op is LITERAL: - prefix.append(ord(av)) + prefix.append(av) else: break # add an info block diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index d3dbe00041e..fb954e994c6 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -19,6 +19,9 @@ from sre_constants import * # FIXME: should be 65535, but the arraymodule is still broken MAXREPEAT = 32767 +# FIXME: same here +CHARMASK = 0x7fff + SPECIAL_CHARS = ".\\[{()*+?^$|" REPEAT_CHARS = "*+?{" @@ -30,14 +33,14 @@ HEXDIGITS = tuple("0123456789abcdefABCDEF") WHITESPACE = string.whitespace ESCAPES = { - r"\a": (LITERAL, chr(7)), - r"\b": (LITERAL, chr(8)), - r"\f": (LITERAL, chr(12)), - r"\n": (LITERAL, chr(10)), - r"\r": (LITERAL, chr(13)), - r"\t": (LITERAL, chr(9)), - r"\v": (LITERAL, chr(11)), - r"\\": (LITERAL, "\\") + r"\a": (LITERAL, 7), + r"\b": (LITERAL, 8), + r"\f": (LITERAL, 12), + r"\n": (LITERAL, 10), + r"\r": (LITERAL, 13), + r"\t": (LITERAL, 9), + r"\v": (LITERAL, 11), + r"\\": (LITERAL, ord("\\")) } CATEGORIES = { @@ -176,9 +179,6 @@ def isdigit(char): def isname(name): # check that group name is a valid string - # FIXME: this code is really lame. should use a regular - # expression instead, but I seem to have certain bootstrapping - # problems here ;-) if not isident(name[0]): return 0 for char in name: @@ -209,16 +209,14 @@ def _class_escape(source, escape): while source.next in HEXDIGITS: escape = escape + source.get() escape = escape[2:] - # FIXME: support unicode characters! - return LITERAL, chr(int(escape[-4:], 16) & 0xff) + return LITERAL, int(escape[-4:], 16) & CHARMASK elif str(escape[1:2]) in OCTDIGITS: while source.next in OCTDIGITS: escape = escape + source.get() escape = escape[1:] - # FIXME: support unicode characters! - return LITERAL, chr(int(escape[-6:], 8) & 0xff) + return LITERAL, int(escape[-6:], 8) & CHARMASK if len(escape) == 2: - return LITERAL, escape[1] + return LITERAL, ord(escape[1]) except ValueError: pass raise error, "bogus escape: %s" % repr(escape) @@ -236,8 +234,7 @@ def _escape(source, escape, state): while source.next in HEXDIGITS: escape = escape + source.get() escape = escape[2:] - # FIXME: support unicode characters! - return LITERAL, chr(int(escape[-4:], 16) & 0xff) + return LITERAL, int(escape[-4:], 16) & CHARMASK elif escape[1:2] in DIGITS: while 1: group = _group(escape, state.groups) @@ -251,17 +248,14 @@ def _escape(source, escape, state): else: break escape = escape[1:] - # FIXME: support unicode characters! - return LITERAL, chr(int(escape[-6:], 8) & 0xff) + return LITERAL, int(escape[-6:], 8) & CHARMASK if len(escape) == 2: - return LITERAL, escape[1] + return LITERAL, ord(escape[1]) except ValueError: pass raise error, "bogus escape: %s" % repr(escape) - def _branch(pattern, items): - # form a branch operator from a set of items subpattern = SubPattern(pattern) @@ -327,7 +321,7 @@ def _parse(source, state, flags=0): continue if this and this[0] not in SPECIAL_CHARS: - subpattern.append((LITERAL, this)) + subpattern.append((LITERAL, ord(this))) elif this == "[": # character set @@ -345,7 +339,7 @@ def _parse(source, state, flags=0): elif this and this[0] == "\\": code1 = _class_escape(source, this) elif this: - code1 = LITERAL, this + code1 = LITERAL, ord(this) else: raise error, "unexpected end of regular expression" if source.match("-"): @@ -353,17 +347,15 @@ def _parse(source, state, flags=0): this = source.get() if this == "]": set.append(code1) - set.append((LITERAL, "-")) + set.append((LITERAL, ord("-"))) break else: if this[0] == "\\": code2 = _class_escape(source, this) else: - code2 = LITERAL, this + code2 = LITERAL, ord(this) if code1[0] != LITERAL or code2[0] != LITERAL: raise error, "illegal range" - if len(code1[1]) != 1 or len(code2[1]) != 1: - raise error, "illegal range" set.append((RANGE, (code1[1], code2[1]))) else: if code1[0] is IN: @@ -605,17 +597,16 @@ def parse_template(source, pattern): break if not code: this = this[1:] - # FIXME: support unicode characters! - code = LITERAL, chr(int(this[-6:], 8) & 0xff) + code = LITERAL, int(this[-6:], 8) & CHARMASK a(code) else: try: a(ESCAPES[this]) except KeyError: for c in this: - a((LITERAL, c)) + a((LITERAL, ord(c))) else: - a((LITERAL, this)) + a((LITERAL, ord(this))) return p def expand_template(template, match): @@ -623,12 +614,17 @@ def expand_template(template, match): # code instead p = [] a = p.append + sep = match.string[:0] + if type(sep) is type(""): + char = chr + else: + char = unichr for c, s in template: if c is LITERAL: - a(s) + a(char(s)) elif c is MARK: s = match.group(s) if s is None: raise error, "empty group" a(s) - return match.string[:0].join(p) + return sep.join(p) diff --git a/Lib/test/output/test_sre b/Lib/test/output/test_sre index d3732b52149..10de93dd205 100644 --- a/Lib/test/output/test_sre +++ b/Lib/test/output/test_sre @@ -1,6 +1,5 @@ test_sre -test_support -- test failed re module pickle -test_support -- test failed re module cPickle +=== Failed incorrectly ('\\x00ffffffffffffff', '\377', 0, 'found', '\377') === Failed incorrectly ('^(.+)?B', 'AB', 0, 'g1', 'A') === Failed incorrectly ('(a+)+\\1', 'aa', 0, 'found+"-"+g1', 'aa-a') === grouping error ('([^/]*/)*sub1/', 'd:msgs/tdir/sub1/trial/away.cpp', 0, 'found+"-"+g1', 'd:msgs/tdir/sub1/-tdir/') 'd:msgs/tdir/sub1/-trial/' should be 'd:msgs/tdir/sub1/-tdir/' diff --git a/Modules/_sre.c b/Modules/_sre.c index 22b6c7347c5..268c5dd82b9 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -20,7 +20,7 @@ * 00-06-28 fl fixed findall (0.9.1) * 00-06-29 fl fixed split, added more scanner features (0.9.2) * 00-06-30 fl tuning, fast search (0.9.3) - * 00-06-30 fl added assert (lookahead) primitives (0.9.4) + * 00-06-30 fl added assert (lookahead) primitives, etc (0.9.4) * * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. * @@ -339,7 +339,7 @@ SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at) } LOCAL(int) -SRE_MEMBER(SRE_CODE* set, SRE_CHAR ch) +SRE_MEMBER(SRE_CODE* set, SRE_CODE ch) { /* check if character is a member of the given set */ @@ -356,13 +356,13 @@ SRE_MEMBER(SRE_CODE* set, SRE_CHAR ch) return !ok; case SRE_OP_LITERAL: - if (ch == (SRE_CHAR) set[0]) + if (ch == set[0]) return ok; set++; break; case SRE_OP_RANGE: - if ((SRE_CHAR) set[0] <= ch && ch <= (SRE_CHAR) set[1]) + if (set[0] <= ch && ch <= set[1]) return ok; set += 2; break; @@ -455,8 +455,8 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) case SRE_OP_LITERAL: /* match literal string */ /* args: */ - TRACE(("%8d: literal %c\n", PTR(ptr), (SRE_CHAR) pattern[0])); - if (ptr >= end || *ptr != (SRE_CHAR) pattern[0]) + TRACE(("%8d: literal %c\n", PTR(ptr), pattern[0])); + if (ptr >= end || (SRE_CODE) ptr[0] != pattern[0]) goto failure; pattern++; ptr++; @@ -465,8 +465,8 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) case SRE_OP_NOT_LITERAL: /* match anything that is not literal character */ /* args: */ - TRACE(("%8d: literal not %c\n", PTR(ptr), (SRE_CHAR) pattern[0])); - if (ptr >= end || *ptr == (SRE_CHAR) pattern[0]) + TRACE(("%8d: literal not %c\n", PTR(ptr), pattern[0])); + if (ptr >= end || (SRE_CODE) ptr[0] == pattern[0]) goto failure; pattern++; ptr++; @@ -528,7 +528,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) break; case SRE_OP_LITERAL_IGNORE: - TRACE(("%8d: literal lower(%c)\n", PTR(ptr), (SRE_CHAR) *pattern)); + TRACE(("%8d: literal lower(%c)\n", PTR(ptr), pattern[0])); if (ptr >= end || state->lower(*ptr) != state->lower(*pattern)) goto failure; @@ -537,8 +537,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) break; case SRE_OP_NOT_LITERAL_IGNORE: - TRACE(("%8d: literal not lower(%c)\n", PTR(ptr), - (SRE_CHAR) *pattern)); + TRACE(("%8d: literal not lower(%c)\n", PTR(ptr), pattern[0])); if (ptr >= end || state->lower(*ptr) == state->lower(*pattern)) goto failure; @@ -549,7 +548,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) case SRE_OP_IN_IGNORE: TRACE(("%8d: set lower(%c)\n", PTR(ptr), *ptr)); if (ptr >= end - || !SRE_MEMBER(pattern+1, (SRE_CHAR) state->lower(*ptr))) + || !SRE_MEMBER(pattern+1, (SRE_CODE) state->lower(*ptr))) goto failure; pattern += pattern[0]; ptr++; @@ -631,9 +630,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) } else if (pattern[3] == SRE_OP_LITERAL) { /* repeated literal */ - SRE_CHAR chr = (SRE_CHAR) pattern[4]; + SRE_CODE chr = pattern[4]; while (count < (int) pattern[2]) { - if (ptr >= end || *ptr != chr) + if (ptr >= end || (SRE_CODE) ptr[0] != chr) break; ptr++; count++; @@ -641,9 +640,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) } else if (pattern[3] == SRE_OP_LITERAL_IGNORE) { /* repeated literal */ - SRE_CHAR chr = (SRE_CHAR) pattern[4]; + SRE_CODE chr = pattern[4]; while (count < (int) pattern[2]) { - if (ptr >= end || (SRE_CHAR) state->lower(*ptr) != chr) + if (ptr >= end || (SRE_CODE) state->lower(*ptr) != chr) break; ptr++; count++; @@ -651,9 +650,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) } else if (pattern[3] == SRE_OP_NOT_LITERAL) { /* repeated non-literal */ - SRE_CHAR chr = (SRE_CHAR) pattern[4]; + SRE_CODE chr = pattern[4]; while (count < (int) pattern[2]) { - if (ptr >= end || *ptr == chr) + if (ptr >= end || (SRE_CODE) ptr[0] == chr) break; ptr++; count++; @@ -661,9 +660,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) } else if (pattern[3] == SRE_OP_NOT_LITERAL_IGNORE) { /* repeated non-literal */ - SRE_CHAR chr = (SRE_CHAR) pattern[4]; + SRE_CODE chr = pattern[4]; while (count < (int) pattern[2]) { - if (ptr >= end || (SRE_CHAR) state->lower(*ptr) == chr) + if (ptr >= end || (SRE_CODE) state->lower(ptr[0]) == chr) break; ptr++; count++; @@ -712,7 +711,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) } else if (pattern[pattern[0]] == SRE_OP_LITERAL) { /* tail starts with a literal. skip positions where the rest of the pattern cannot possibly match */ - SRE_CHAR chr = (SRE_CHAR) pattern[pattern[0]+1]; + SRE_CODE chr = pattern[pattern[0]+1]; TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr)); for (;;) { TRACE(("%8d: scan for tail match\n", PTR(ptr))); @@ -868,7 +867,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) TRACE(("%8d: branch\n", PTR(ptr))); while (*pattern) { if (pattern[1] != SRE_OP_LITERAL || - (ptr < end && *ptr == (SRE_CHAR) pattern[2])) { + (ptr < end && (SRE_CODE) ptr[0] == pattern[2])) { TRACE(("%8d: branch check\n", PTR(ptr))); state->ptr = ptr; i = SRE_MATCH(state, pattern + 1); @@ -976,7 +975,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) end = state->end; while (ptr < end) { for (;;) { - if (*ptr != (SRE_CHAR) prefix[i]) { + if ((SRE_CODE) ptr[0] != prefix[i]) { if (!i) break; else @@ -1008,9 +1007,9 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) if (pattern[0] == SRE_OP_LITERAL) { /* pattern starts with a literal character. this is used for short prefixes, and if fast search is disabled*/ - SRE_CHAR chr = (SRE_CHAR) pattern[1]; + SRE_CODE chr = pattern[1]; for (;;) { - while (ptr < end && *ptr != chr) + while (ptr < end && (SRE_CODE) ptr[0] != chr) ptr++; if (ptr == end) return 0;