-- SRE 0.9.6 sync. this includes:

+ added "regs" attribute
+ fixed "pos" and "endpos" attributes
+ reset "lastindex" and "lastgroup" in scanner methods
+ removed (?P#id) syntax; the "lastindex" and "lastgroup" attributes are now always set
+ removed string module dependencies in sre_parse
+ better debugging support in sre_parse
+ various tweaks to build under 1.5.2
2000-07-23 21:46:17 +00:00 · 2000-07-23 21:46:17 +00:00 · 8a3ebf8ca8
parent 4f1b2081e9
commit 8a3ebf8ca8
7 changed files with 1265 additions and 1138 deletions
--- a/Lib/sre.py
+++ b/Lib/sre.py
@ -10,9 +10,13 @@
 # other compatibility work.
 #

+# FIXME: change all FIXME's to XXX ;-)
+
 import sre_compile
 import sre_parse

+import string
+
 # flags
 I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE
 L = LOCALE = sre_compile.SRE_FLAG_LOCALE
@ -53,6 +57,9 @@ def findall(pattern, string, maxsplit=0):
 def compile(pattern, flags=0):
    return _compile(pattern, flags)

+def purge():
+    _cache.clear()
+
 def template(pattern, flags=0):
    return _compile(pattern, flags|T)

@ -65,7 +72,7 @@ def escape(pattern):
                s[i] = "\\000"
            else:
                s[i] = "\\" + c
-    return pattern[:0].join(s)
+    return _join(s, pattern)

 # --------------------------------------------------------------------
 # internals
@ -73,10 +80,14 @@ def escape(pattern):
 _cache = {}
 _MAXCACHE = 100

+def _join(seq, sep):
+    # internal: join into string having the same type as sep
+    return string.join(seq, sep[:0])
+
 def _compile(pattern, flags=0):
    # internal: compile pattern
    tp = type(pattern)
-    if tp not in (type(""), type(u"")):
+    if tp not in sre_compile.STRING_TYPES:
        return pattern
    key = (tp, pattern, flags)
    try:
@ -89,10 +100,6 @@ def _compile(pattern, flags=0):
    _cache[key] = p
    return p

-def purge():
-    # clear pattern cache
-    _cache.clear()
-
 def _sub(pattern, template, string, count=0):
    # internal: pattern.sub implementation hook
    return _subn(pattern, template, string, count)[0]
@ -120,7 +127,7 @@ def _subn(pattern, template, string, count=0):
        i = e
        n = n + 1
    append(string[i:])
-    return string[:0].join(s), n
+    return _join(s, string[:0]), n

 def _split(pattern, string, maxsplit=0):
    # internal: pattern.split implementation hook
@ -161,11 +168,19 @@ copy_reg.pickle(type(_compile("")), _pickle, _compile)

 class Scanner:
    def __init__(self, lexicon):
+        from sre_constants import BRANCH, SUBPATTERN, INDEX
        self.lexicon = lexicon
+        # combine phrases into a compound pattern
        p = []
+        s = sre_parse.Pattern()
        for phrase, action in lexicon:
-            p.append("(?:%s)(?P#%d)" % (phrase, len(p)))
-        self.scanner = _compile("|".join(p))
+            p.append(sre_parse.SubPattern(s, [
+                (SUBPATTERN, (None, sre_parse.parse(phrase))),
+                (INDEX, len(p))
+                ]))
+        p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
+        s.groups = len(p)
+        self.scanner = sre_compile.compile(p)
    def scan(self, string):
        result = []
        append = result.append
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@ -197,10 +197,11 @@ def _compile(code, pattern, flags):
            else:
                emit(ATCODES[av])
        elif op is BRANCH:
-            emit(OPCODES[op])
            tail = []
            for av in av[1]:
+                emit(OPCODES[op])
                skip = len(code); emit(0)
+                emit(MAXCODE) # save mark
                _compile(code, av, flags)
                emit(OPCODES[JUMP])
                tail.append(len(code)); emit(0)
@ -286,11 +287,18 @@ def _compile_info(code, pattern, flags):
        emit(OPCODES[FAILURE])
    code[skip] = len(code) - skip

+STRING_TYPES = [type("")]
+
+try:
+    STRING_TYPES.append(type(unicode("")))
+except NameError:
+    pass
+
 def compile(p, flags=0):
    # internal: convert pattern list to internal format

    # compile, as necessary
-    if type(p) in (type(""), type(u"")):
+    if type(p) in STRING_TYPES:
        import sre_parse
        pattern = p
        p = sre_parse.parse(p, flags)
@ -308,6 +316,8 @@ def compile(p, flags=0):

    code.append(OPCODES[SUCCESS])

+    # print code
+
    # FIXME: <fl> get rid of this limitation!
    assert p.pattern.groups <= 100,\
           "sorry, but this version only supports 100 named groups"
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@ -172,7 +172,7 @@ CH_UNICODE = {
 # flags
 SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking)
 SRE_FLAG_IGNORECASE = 2 # case insensitive
-SRE_FLAG_LOCALE = 4 # honor system locale
+SRE_FLAG_LOCALE = 4 # honour system locale
 SRE_FLAG_MULTILINE = 8 # treat target as multiline string
 SRE_FLAG_DOTALL = 16 # treat target as a single string
 SRE_FLAG_UNICODE = 32 # use unicode locale
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@ -25,12 +25,12 @@ CHARMASK = 0xff
 SPECIAL_CHARS = ".\\[{()*+?^$|"
 REPEAT_CHARS  = "*+?{"

-DIGITS = tuple(string.digits)
+DIGITS = tuple("0123456789")  # all ten decimal digits, same set as string.digits

 OCTDIGITS = tuple("01234567")
 HEXDIGITS = tuple("0123456789abcdefABCDEF")

-WHITESPACE = tuple(string.whitespace)
+WHITESPACE = tuple(" \t\n\r\v\f")

 ESCAPES = {
    r"\a": (LITERAL, 7),
@ -68,7 +68,8 @@ FLAGS = {
    "u": SRE_FLAG_UNICODE,
 }

-class State:
+class Pattern:
+    # master pattern object.  keeps track of global attributes
    def __init__(self):
        self.flags = 0
        self.groups = 1
@ -88,6 +89,33 @@ class SubPattern:
            data = []
        self.data = data
        self.width = None
+    def dump(self, level=0):
+        nl = 1
+        for op, av in self.data:
+            print level*"  " + op,; nl = 0
+            if op == "in":
+                # member sublanguage
+                print; nl = 1
+                for op, a in av:
+                    print (level+1)*"  " + op, a
+            elif op == "branch":
+                print; nl = 1
+                i = 0
+                for a in av[1]:
+                    if i > 0:
+                        print level*"  " + "or"
+                    a.dump(level+1); nl = 1
+                    i = i + 1
+            elif type(av) in (type(()), type([])):
+                for a in av:
+                    if isinstance(a, SubPattern):
+                        if not nl: print
+                        a.dump(level+1); nl = 1
+                    else:
+                        print a, ; nl = 0
+            else:
+                print av, ; nl = 0
+            if not nl: print
    def __repr__(self):
        return repr(self.data)
    def __len__(self):
@ -255,10 +283,25 @@ def _escape(source, escape, state):
        pass
    raise error, "bogus escape: %s" % repr(escape)

-def _branch(pattern, items):
-    # form a branch operator from a set of items
+def _parse_sub(source, state, nested=1):
+    # parse an alternation: a|b|c

-    subpattern = SubPattern(pattern)
+    items = []
+    while 1:
+        items.append(_parse(source, state))
+        if source.match("|"):
+            continue
+        if not nested:
+            break
+        if not source.next or source.match(")"):
+            break
+        else:
+            raise error, "pattern not properly closed"
+
+    if len(items) == 1:
+        return items[0]
+
+    subpattern = SubPattern(state)

    # check if all items share a common prefix
    while 1:
@ -285,7 +328,7 @@ def _branch(pattern, items):
            break
    else:
        # we can store this as a character set instead of a
-        # branch (FIXME: use a range if possible)
+        # branch (the compiler may optimize this even more)
        set = []
        for item in items:
            set.append(item[0])
@ -296,8 +339,7 @@ def _branch(pattern, items):
    return subpattern

 def _parse(source, state):
-
-    # parse regular expression pattern into an operator list.
+    # parse a simple pattern

    subpattern = SubPattern(state)

@ -451,22 +493,6 @@ def _parse(source, state):
                        if gid is None:
                            raise error, "unknown group name"
                        subpattern.append((GROUPREF, gid))
-                    elif source.match("#"):
-                        index = ""
-                        while 1:
-                            char = source.get()
-                            if char is None:
-                                raise error, "unterminated index"
-                            if char == ")":
-                                break
-                            index = index + char
-                        try:
-                            index = int(index)
-                            if index < 0 or index > MAXREPEAT:
-                                raise ValueError
-                        except ValueError:
-                            raise error, "illegal index"
-                        subpattern.append((INDEX, index))
                        continue
                    else:
                        char = source.get()
@ -491,48 +517,27 @@ def _parse(source, state):
                            raise error, "syntax error"
                        dir = -1 # lookbehind
                        char = source.get()
-                    b = []
-                    while 1:
-                        p = _parse(source, state)
-                        if source.next == ")":
-                            if b:
-                                b.append(p)
-                                p = _branch(state, b)
-                            if char == "=":
-                                subpattern.append((ASSERT, (dir, p)))
-                            else:
-                                subpattern.append((ASSERT_NOT, (dir, p)))
-                            break
-                        elif source.match("|"):
-                            b.append(p)
-                        else:
-                            raise error, "pattern not properly closed"
+                    p = _parse_sub(source, state)
+                    if char == "=":
+                        subpattern.append((ASSERT, (dir, p)))
+                    else:
+                        subpattern.append((ASSERT_NOT, (dir, p)))
+                    continue
                else:
                    # flags
                    while FLAGS.has_key(source.next):
                        state.flags = state.flags | FLAGS[source.get()]
            if group:
                # parse group contents
-                b = []
                if group == 2:
                    # anonymous group
                    group = None
                else:
                    group = state.getgroup(name)
-                while 1:
-                    p = _parse(source, state)
-                    if group is not None:
-                        p.append((INDEX, group))
-                    if source.match(")"):
-                        if b:
-                            b.append(p)
-                            p = _branch(state, b)
-                        subpattern.append((SUBPATTERN, (group, p)))
-                        break
-                    elif source.match("|"):
-                        b.append(p)
-                    else:
-                        raise error, "group not properly closed"
+                p = _parse_sub(source, state)
+                subpattern.append((SUBPATTERN, (group, p)))
+                if group is not None:
+                    p.append((INDEX, group))
            else:
                while 1:
                    char = source.get()
@ -555,26 +560,24 @@ def _parse(source, state):

    return subpattern

-def parse(pattern, flags=0):
+def parse(str, flags=0):
    # parse 're' pattern into list of (opcode, argument) tuples
-    source = Tokenizer(pattern)
-    state = State()
-    state.flags = flags
-    b = []
-    while 1:
-        p = _parse(source, state)
-        tail = source.get()
-        if tail == "|":
-            b.append(p)
-        elif tail == ")":
-            raise error, "unbalanced parenthesis"
-        elif tail is None:
-            if b:
-                b.append(p)
-                p = _branch(state, b)
-            break
-        else:
-            raise error, "bogus characters at end of regular expression"
+
+    source = Tokenizer(str)
+
+    pattern = Pattern()
+    pattern.flags = flags
+
+    p = _parse_sub(source, pattern, 0)
+
+    tail = source.get()
+    if tail == ")":
+        raise error, "unbalanced parenthesis"
+    elif tail:
+        raise error, "bogus characters at end of regular expression"
+
+    # p.dump()
+
    return p

 def parse_template(source, pattern):
@ -656,4 +659,4 @@ def expand_template(template, match):
            if s is None:
                raise error, "empty group"
            a(s)
-    return sep.join(p)
+    return string.join(p, sep)
--- a/Lib/test/output/test_sre
+++ b/Lib/test/output/test_sre
@ -1,4 +1,6 @@
 test_sre
 === Failed incorrectly ('^(.+)?B', 'AB', 0, 'g1', 'A')
 === Failed incorrectly ('(a+)+\\1', 'aa', 0, 'found+"-"+g1', 'aa-a')
+=== grouping error ('(a)(b)c|ab', 'ab', 0, 'found+"-"+g1+"-"+g2', 'ab-None-None') 'ab-None-b' should be 'ab-None-None'
+=== grouping error ('(a)+b|aac', 'aac', 0, 'found+"-"+g1', 'aac-None') 'aac-a' should be 'aac-None'
 === Failed incorrectly ('^(.+)?B', 'AB', 0, 'g1', 'A')
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
--- a/Modules/sre.h
+++ b/Modules/sre.h
@ -1,4 +1,5 @@
 /*
+ *
 * Secret Labs' Regular Expression Engine
 *
 * regular expression matching engine
@ -33,6 +34,7 @@ typedef struct {
 typedef struct {
    PyObject_VAR_HEAD
    PyObject* string; /* link to the target string */
+    PyObject* regs; /* cached list of matching spans */
    PatternObject* pattern; /* link to the regex (pattern) object */
    int pos, endpos; /* current target slice */
    int lastindex; /* last index marker seen by the engine (-1 if none) */
@ -60,6 +62,9 @@ typedef struct {
    void* beginning; /* start of original string */
    void* start; /* start of current slice */
    void* end; /* end of original string */
+    /* attributes for the match object */
+    PyObject* string;
+    int pos, endpos;
    /* character size */
    int charsize;
    /* registers */
@ -78,7 +83,6 @@ typedef struct {
    /* scanner (internal helper object) */
    PyObject_HEAD
    PyObject* pattern;
-    PyObject* string;
    SRE_STATE state;
 } ScannerObject;