From 01016fe972a90eb57bafeb1f4a73f334c201c3c2 Mon Sep 17 00:00:00 2001 From: Fredrik Lundh Date: Fri, 30 Jun 2000 00:27:46 +0000 Subject: [PATCH] - fixed split behaviour on empty matches - fixed compiler problems when using locale/unicode flags - fixed group/octal code parsing in sub/subn templates --- Lib/sre.py | 15 ++++------ Lib/sre_compile.py | 8 +++--- Lib/sre_parse.py | 68 ++++++++++++++++++++++++++++------------------ Modules/_sre.c | 6 ++-- 4 files changed, 55 insertions(+), 42 deletions(-) diff --git a/Lib/sre.py b/Lib/sre.py index 49e3140bd4f..d5bb462e7bc 100644 --- a/Lib/sre.py +++ b/Lib/sre.py @@ -109,16 +109,13 @@ def _subn(pattern, template, string, count=0): m = c.search() if not m: break - j = m.start() - if j > i: - append(string[i:j]) + b, e = m.span() + if i < b: + append(string[i:b]) append(filter(m)) - i = m.end() - if i <= j: - break + i = e n = n + 1 - if i < len(string): - append(string[i:]) + append(string[i:]) return string[:0].join(s), n def _split(pattern, string, maxsplit=0): @@ -128,7 +125,7 @@ def _split(pattern, string, maxsplit=0): append = s.append extend = s.extend c = pattern.scanner(string) - g = c.groups + g = pattern.groups while not maxsplit or n < maxsplit: m = c.search() if not m: diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 344dc29113f..ea5f5bca3dd 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -61,9 +61,9 @@ def _compile(code, pattern, flags): elif op is CATEGORY: emit(OPCODES[op]) if flags & SRE_FLAG_LOCALE: - emit(CH_LOCALE[CHCODES[av]]) + emit(CHCODES[CH_LOCALE[av]]) elif flags & SRE_FLAG_UNICODE: - emit(CH_UNICODE[CHCODES[av]]) + emit(CHCODES[CH_UNICODE[av]]) else: emit(CHCODES[av]) elif op is GROUP: @@ -92,9 +92,9 @@ def _compile(code, pattern, flags): emit(fixup(av[1])) elif op is CATEGORY: if flags & SRE_FLAG_LOCALE: - emit(CH_LOCALE[CHCODES[av]]) + emit(CHCODES[CH_LOCALE[av]]) elif flags & SRE_FLAG_UNICODE: - emit(CH_UNICODE[CHCODES[av]]) + emit(CHCODES[CH_UNICODE[av]]) else: emit(CHCODES[av]) else: diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 93a7b5dc997..ec934fe6b49 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -30,26 +30,27 @@ HEXDIGITS = tuple("0123456789abcdefABCDEF") WHITESPACE = string.whitespace ESCAPES = { - "\\a": (LITERAL, chr(7)), - "\\b": (LITERAL, chr(8)), - "\\f": (LITERAL, chr(12)), - "\\n": (LITERAL, chr(10)), - "\\r": (LITERAL, chr(13)), - "\\t": (LITERAL, chr(9)), - "\\v": (LITERAL, chr(11)) + r"\a": (LITERAL, chr(7)), + r"\b": (LITERAL, chr(8)), + r"\f": (LITERAL, chr(12)), + r"\n": (LITERAL, chr(10)), + r"\r": (LITERAL, chr(13)), + r"\t": (LITERAL, chr(9)), + r"\v": (LITERAL, chr(11)), + r"\\": (LITERAL, "\\") } CATEGORIES = { - "\\A": (AT, AT_BEGINNING), # start of string - "\\b": (AT, AT_BOUNDARY), - "\\B": (AT, AT_NON_BOUNDARY), - "\\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), - "\\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), - "\\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), - "\\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), - "\\w": (IN, [(CATEGORY, CATEGORY_WORD)]), - "\\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), - "\\Z": (AT, AT_END), # end of string + r"\A": (AT, AT_BEGINNING), # start of string + r"\b": (AT, AT_BOUNDARY), + r"\B": (AT, AT_NON_BOUNDARY), + r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), + r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), + r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), + r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), + r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), + r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), + r"\Z": (AT, AT_END), # end of string } FLAGS = { @@ -185,11 +186,11 @@ def isname(name): return 0 return 1 -def _group(escape, state): +def _group(escape, groups): # check if the escape string represents a valid group try: group = int(escape[1:]) - if group and group < state.groups: + if group and group < groups: return group except ValueError: pass @@ -239,10 +240,10 @@ def _escape(source, escape, state): return LITERAL, chr(int(escape[-4:], 16) & 0xff) elif escape[1:2] in DIGITS: while 1: - group = _group(escape, state) + group = _group(escape, state.groups) if group: if (not source.next or - not _group(escape + source.next, state)): + not _group(escape + source.next, state.groups)): return GROUP, group escape = escape + source.get() elif source.next in OCTDIGITS: @@ -534,6 +535,7 @@ def parse_template(source, pattern): if this is None: break # end of replacement string if this and this[0] == "\\": + # group if this == "\\g": name = "" if s.match("<"): @@ -557,15 +559,29 @@ def parse_template(source, pattern): raise IndexError, "unknown group name" a((MARK, index)) elif len(this) > 1 and this[1] in DIGITS: - while s.next in DIGITS: - this = this + s.get() - a((MARK, int(this[1:]))) + code = None + while 1: + group = _group(this, pattern.groups+1) + if group: + if (not s.next or + not _group(this + s.next, pattern.groups+1)): + code = MARK, int(group) + break + elif s.next in OCTDIGITS: + this = this + s.get() + else: + break + if not code: + this = this[1:] + # FIXME: support unicode characters! + code = LITERAL, chr(int(this[-6:], 8) & 0xff) + a(code) else: try: a(ESCAPES[this]) except KeyError: - for char in this: - a((LITERAL, char)) + for c in this: + a((LITERAL, c)) else: a((LITERAL, this)) return p diff --git a/Modules/_sre.c b/Modules/_sre.c index 6b0fa61a708..7b1adbd177f 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -1534,6 +1534,9 @@ pattern_getattr(PatternObject* self, char* name) if (!strcmp(name, "flags")) return Py_BuildValue("i", self->flags); + if (!strcmp(name, "groups")) + return Py_BuildValue("i", self->groups); + if (!strcmp(name, "groupindex") && self->groupindex) { Py_INCREF(self->groupindex); return self->groupindex; @@ -1939,9 +1942,6 @@ scanner_getattr(ScannerObject* self, char* name) return self->pattern; } - if (!strcmp(name, "groups")) - return Py_BuildValue("i", ((PatternObject*) self->pattern)->groups); - PyErr_SetString(PyExc_AttributeError, name); return NULL; }