Fredrik Lundh: here's the 96.6% version of SRE

This commit is contained in:
Jeremy Hylton 2000-06-01 17:39:12 +00:00
parent 0292d78e91
commit b1aa19515f
6 changed files with 745 additions and 305 deletions

View File

@ -1,4 +1,3 @@
# -*- Mode: Python; tab-width: 4 -*-
#
# Secret Labs' Regular Expression Engine
# $Id$
@ -7,39 +6,127 @@
#
# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
#
# This code can only be used for 1.6 alpha testing. All other use
# require explicit permission from Secret Labs AB.
#
# Portions of this engine have been developed in cooperation with
# CNRI. Hewlett-Packard provided funding for 1.6 integration and
# other compatibility work.
#
"""
this is a long string
"""
import sre_compile
# flags
I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE
L = LOCALE = sre_compile.SRE_FLAG_LOCALE
M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE
S = DOTALL = sre_compile.SRE_FLAG_DOTALL
X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE
# --------------------------------------------------------------------
# public interface
def compile(pattern, flags=0):
return sre_compile.compile(pattern, _fixflags(flags))
# FIXME: add docstrings
def match(pattern, string, flags=0):
return compile(pattern, _fixflags(flags)).match(string)
return _compile(pattern, flags).match(string)
def search(pattern, string, flags=0):
return compile(pattern, _fixflags(flags)).search(string)
return _compile(pattern, flags).search(string)
# FIXME: etc
def sub(pattern, repl, string, count=0):
return _compile(pattern).sub(repl, string, count)
def subn(pattern, repl, string, count=0):
return _compile(pattern).subn(repl, string, count)
def split(pattern, string, maxsplit=0):
return _compile(pattern).split(string, maxsplit)
def findall(pattern, string, maxsplit=0):
return _compile(pattern).findall(string, maxsplit)
def compile(pattern, flags=0):
return _compile(pattern, flags)
def escape(pattern):
s = list(pattern)
for i in range(len(pattern)):
c = pattern[i]
if not ("a" <= c <= "z" or "A" <= c <= "Z" or "0" <= c <= "9"):
if c == "\000":
s[i] = "\\000"
else:
s[i] = "\\" + c
return pattern[:0].join(s)
# --------------------------------------------------------------------
# helpers
# internals
def _fixflags(flags):
# convert flag bitmask to sequence
assert not flags
return ()
_cache = {}
_MAXCACHE = 100
def _compile(pattern, flags=0):
# internal: compile pattern
tp = type(pattern)
if tp not in (type(""), type(u"")):
return pattern
key = (tp, pattern, flags)
try:
return _cache[key]
except KeyError:
pass
p = sre_compile.compile(pattern, flags)
if len(_cache) >= _MAXCACHE:
_cache.clear()
_cache[key] = p
return p
def _sub(pattern, template, string, count=0):
# internal: pattern.sub implementation hook
return _subn(pattern, template, string, count)[0]
def _expand(match, template):
# internal: expand template
return template # FIXME
def _subn(pattern, template, string, count=0):
# internal: pattern.subn implementation hook
if callable(template):
filter = callable
else:
# FIXME: prepare template
def filter(match, template=template):
return _expand(match, template)
n = i = 0
s = []
append = s.append
c = pattern.cursor(string)
while not count or n < count:
m = c.search()
if not m:
break
j = m.start()
if j > i:
append(string[i:j])
append(filter(m))
i = m.end()
n = n + 1
if i < len(string):
append(string[i:])
return string[:0].join(s), n
def _split(pattern, string, maxsplit=0):
# internal: pattern.split implementation hook
n = i = 0
s = []
append = s.append
c = pattern.cursor(string)
while not maxsplit or n < maxsplit:
m = c.search()
if not m:
break
j = m.start()
append(string[i:j])
i = m.end()
n = n + 1
if i < len(string):
append(string[i:])
return s

View File

@ -14,9 +14,6 @@
# other compatibility work.
#
# FIXME: <fl> formalize (objectify?) and document the compiler code
# format, so that other frontends can use this compiler
import array, string, sys
import _sre
@ -45,64 +42,70 @@ class Code:
self.data.append(code)
def todata(self):
# print self.data
try:
return array.array(WORDSIZE, self.data).tostring()
except OverflowError:
print self.data
raise
def _lower(literal):
# return _sre._lower(literal) # FIXME
return string.lower(literal)
def _compile(code, pattern, flags):
def _compile(code, pattern, flags, level=0):
append = code.append
for op, av in pattern:
if op is ANY:
if "s" in flags:
append(CODES[op]) # any character at all!
if flags & SRE_FLAG_DOTALL:
append(OPCODES[op]) # any character at all!
else:
append(CODES[NOT_LITERAL])
append(10)
append(OPCODES[CATEGORY])
append(CHCODES[CATEGORY_NOT_LINEBREAK])
elif op in (SUCCESS, FAILURE):
append(CODES[op])
append(OPCODES[op])
elif op is AT:
append(CODES[op])
append(POSITIONS[av])
append(OPCODES[op])
if flags & SRE_FLAG_MULTILINE:
append(ATCODES[AT_MULTILINE[av]])
else:
append(ATCODES[av])
elif op is BRANCH:
append(CODES[op])
append(OPCODES[op])
tail = []
for av in av[1]:
skip = len(code); append(0)
_compile(code, av, flags)
append(CODES[JUMP])
_compile(code, av, flags, level)
append(OPCODES[JUMP])
tail.append(len(code)); append(0)
code[skip] = len(code) - skip
append(0) # end of branch
for tail in tail:
code[tail] = len(code) - tail
elif op is CALL:
append(CODES[op])
append(OPCODES[op])
skip = len(code); append(0)
_compile(code, av, flags)
append(CODES[SUCCESS])
_compile(code, av, flags, level+1)
append(OPCODES[SUCCESS])
code[skip] = len(code) - skip
elif op is CATEGORY: # not used by current parser
append(CODES[op])
append(CATEGORIES[av])
append(OPCODES[op])
if flags & SRE_FLAG_LOCALE:
append(CH_LOCALE[CHCODES[av]])
else:
append(CHCODES[av])
elif op is GROUP:
if "i" in flags:
append(CODES[MAP_IGNORE[op]])
if flags & SRE_FLAG_IGNORECASE:
append(OPCODES[OP_IGNORE[op]])
else:
append(CODES[op])
append(av)
append(OPCODES[op])
append(av-1)
elif op is IN:
if "i" in flags:
append(CODES[MAP_IGNORE[op]])
if flags & SRE_FLAG_IGNORECASE:
append(OPCODES[OP_IGNORE[op]])
def fixup(literal):
return ord(_lower(literal))
return ord(literal.lower())
else:
append(CODES[op])
append(OPCODES[op])
fixup = ord
skip = len(code); append(0)
for op, av in av:
append(CODES[op])
append(OPCODES[op])
if op is NEGATE:
pass
elif op is LITERAL:
@ -111,58 +114,60 @@ def _compile(code, pattern, flags):
append(fixup(av[0]))
append(fixup(av[1]))
elif op is CATEGORY:
append(CATEGORIES[av])
if flags & SRE_FLAG_LOCALE:
append(CH_LOCALE[CHCODES[av]])
else:
append(CHCODES[av])
else:
raise ValueError, "unsupported set operator"
append(CODES[FAILURE])
append(OPCODES[FAILURE])
code[skip] = len(code) - skip
elif op in (LITERAL, NOT_LITERAL):
if "i" in flags:
append(CODES[MAP_IGNORE[op]])
append(ord(_lower(av)))
if flags & SRE_FLAG_IGNORECASE:
append(OPCODES[OP_IGNORE[op]])
append(ord(av.lower()))
else:
append(CODES[op])
append(OPCODES[op])
append(ord(av))
elif op is MARK:
append(CODES[op])
append(OPCODES[op])
append(av)
elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
lo, hi = av[2].getwidth()
if lo == 0:
raise SyntaxError, "cannot repeat zero-width items"
if lo == hi == 1 and op is MAX_REPEAT:
append(CODES[MAX_REPEAT_ONE])
append(OPCODES[MAX_REPEAT_ONE])
skip = len(code); append(0)
append(av[0])
append(av[1])
_compile(code, av[2], flags)
append(CODES[SUCCESS])
_compile(code, av[2], flags, level+1)
append(OPCODES[SUCCESS])
code[skip] = len(code) - skip
else:
append(CODES[op])
append(OPCODES[op])
skip = len(code); append(0)
append(av[0])
append(av[1])
_compile(code, av[2], flags)
_compile(code, av[2], flags, level+1)
if op is MIN_REPEAT:
append(CODES[MIN_UNTIL])
append(OPCODES[MIN_UNTIL])
else:
# FIXME: MAX_REPEAT PROBABLY DOESN'T WORK (?)
append(CODES[MAX_UNTIL])
append(OPCODES[MAX_UNTIL])
code[skip] = len(code) - skip
elif op is SUBPATTERN:
## group = av[0]
## if group:
## append(CODES[MARK])
## append((group-1)*2)
_compile(code, av[1], flags)
## if group:
## append(CODES[MARK])
## append((group-1)*2+1)
group = av[0]
if group:
append(OPCODES[MARK])
append((group-1)*2)
_compile(code, av[1], flags, level+1)
if group:
append(OPCODES[MARK])
append((group-1)*2+1)
else:
raise ValueError, ("unsupported operand type", op)
def compile(p, flags=()):
def compile(p, flags=0):
# convert pattern list to internal format
if type(p) in (type(""), type(u"")):
import sre_parse
@ -170,12 +175,10 @@ def compile(p, flags=()):
p = sre_parse.parse(p)
else:
pattern = None
# print p.getwidth()
# print p
flags = p.pattern.flags | flags
code = Code()
_compile(code, p.data, p.pattern.flags)
code.append(CODES[SUCCESS])
# print list(code.data)
_compile(code, p.data, flags)
code.append(OPCODES[SUCCESS])
data = code.todata()
if 0: # debugging
print
@ -183,5 +186,8 @@ def compile(p, flags=()):
import sre_disasm
sre_disasm.disasm(data)
print "-" * 68
# print len(data), p.pattern.groups, len(p.pattern.groupdict)
return _sre.compile(pattern, data, p.pattern.groups-1, p.pattern.groupdict)
return _sre.compile(
pattern, flags,
data,
p.pattern.groups-1, p.pattern.groupdict
)

View File

@ -48,20 +48,31 @@ SUBPATTERN = "subpattern"
# positions
AT_BEGINNING = "at_beginning"
AT_BEGINNING_LINE = "at_beginning_line"
AT_BOUNDARY = "at_boundary"
AT_NON_BOUNDARY = "at_non_boundary"
AT_END = "at_end"
AT_END_LINE = "at_end_line"
# categories
CATEGORY_DIGIT = "category_digit"
CATEGORY_NOT_DIGIT = "category_not_digit"
CATEGORY_SPACE = "category_space"
CATEGORY_NOT_SPACE = "category_not_space"
CATEGORY_WORD = "category_word"
CATEGORY_NOT_WORD = "category_not_word"
CATEGORY_LINEBREAK = "category_linebreak"
CATEGORY_NOT_LINEBREAK = "category_not_linebreak"
CATEGORY_LOC_DIGIT = "category_loc_digit"
CATEGORY_LOC_NOT_DIGIT = "category_loc_not_digit"
CATEGORY_LOC_SPACE = "category_loc_space"
CATEGORY_LOC_NOT_SPACE = "category_loc_not_space"
CATEGORY_LOC_WORD = "category_loc_word"
CATEGORY_LOC_NOT_WORD = "category_loc_not_word"
CATEGORY_LOC_LINEBREAK = "category_loc_linebreak"
CATEGORY_LOC_NOT_LINEBREAK = "category_loc_not_linebreak"
CODES = [
OPCODES = [
# failure=0 success=1 (just because it looks better that way :-)
FAILURE, SUCCESS,
@ -87,45 +98,75 @@ CODES = [
]
# convert to dictionary
c = {}
ATCODES = [
AT_BEGINNING, AT_BEGINNING_LINE, AT_BOUNDARY,
AT_NON_BOUNDARY, AT_END, AT_END_LINE
]
CHCODES = [
CATEGORY_DIGIT, CATEGORY_NOT_DIGIT, CATEGORY_SPACE,
CATEGORY_NOT_SPACE, CATEGORY_WORD, CATEGORY_NOT_WORD,
CATEGORY_LINEBREAK, CATEGORY_NOT_LINEBREAK, CATEGORY_LOC_DIGIT,
CATEGORY_LOC_NOT_DIGIT, CATEGORY_LOC_SPACE,
CATEGORY_LOC_NOT_SPACE, CATEGORY_LOC_WORD, CATEGORY_LOC_NOT_WORD,
CATEGORY_LOC_LINEBREAK, CATEGORY_LOC_NOT_LINEBREAK
]
def makedict(list):
d = {}
i = 0
for code in CODES:
c[code] = i
for item in list:
d[item] = i
i = i + 1
CODES = c
return d
OPCODES = makedict(OPCODES)
ATCODES = makedict(ATCODES)
CHCODES = makedict(CHCODES)
# replacement operations for "ignore case" mode
MAP_IGNORE = {
OP_IGNORE = {
GROUP: GROUP_IGNORE,
IN: IN_IGNORE,
LITERAL: LITERAL_IGNORE,
NOT_LITERAL: NOT_LITERAL_IGNORE
}
POSITIONS = {
AT_BEGINNING: ord("a"),
AT_BOUNDARY: ord("b"),
AT_NON_BOUNDARY: ord("B"),
AT_END: ord("z"),
AT_MULTILINE = {
AT_BEGINNING: AT_BEGINNING_LINE,
AT_END: AT_END_LINE
}
CATEGORIES = {
CATEGORY_DIGIT: ord("d"),
CATEGORY_NOT_DIGIT: ord("D"),
CATEGORY_SPACE: ord("s"),
CATEGORY_NOT_SPACE: ord("S"),
CATEGORY_WORD: ord("w"),
CATEGORY_NOT_WORD: ord("W"),
CH_LOCALE = {
CATEGORY_DIGIT: CATEGORY_LOC_DIGIT,
CATEGORY_NOT_DIGIT: CATEGORY_LOC_NOT_DIGIT,
CATEGORY_SPACE: CATEGORY_LOC_SPACE,
CATEGORY_NOT_SPACE: CATEGORY_LOC_NOT_SPACE,
CATEGORY_WORD: CATEGORY_LOC_WORD,
CATEGORY_NOT_WORD: CATEGORY_LOC_NOT_WORD,
CATEGORY_LINEBREAK: CATEGORY_LOC_LINEBREAK,
CATEGORY_NOT_LINEBREAK: CATEGORY_LOC_NOT_LINEBREAK
}
# flags
SRE_FLAG_TEMPLATE = 1 # NYI
SRE_FLAG_IGNORECASE = 2
SRE_FLAG_LOCALE = 4
SRE_FLAG_MULTILINE = 8
SRE_FLAG_DOTALL = 16
SRE_FLAG_VERBOSE = 32
if __name__ == "__main__":
import string
items = CODES.items()
def dump(f, d, prefix):
items = d.items()
items.sort(lambda a, b: cmp(a[1], b[1]))
f = open("sre_constants.h", "w")
f.write("/* generated by sre_constants.py */\n")
for k, v in items:
f.write("#define SRE_OP_" + string.upper(k) + " " + str(v) + "\n")
f.write("#define %s_%s %s\n" % (prefix, string.upper(k), v))
f = open("sre_constants.h", "w")
f.write("/* generated from sre_constants.py */\n")
dump(f, OPCODES, "SRE_OP")
dump(f, ATCODES, "SRE")
dump(f, CHCODES, "SRE")
f.close()
print "done"

File diff suppressed because it is too large Load Diff

View File

@ -14,17 +14,18 @@
#include "sre_constants.h"
/* Python objects */
typedef struct {
PyObject_HEAD
PyObject* code; /* link to the code string object */
PyObject* pattern; /* link to the pattern source (or None) */
int groups;
PyObject* groupindex;
/* compatibility */
PyObject* pattern; /* pattern source (or None) */
int flags; /* flags used when compiling pattern source */
} PatternObject;
#define PatternObject_GetCode(o) ((void*) PyString_AS_STRING((o)->code))
#define PatternObject_GetCode(o)\
((void*) PyString_AS_STRING(((PatternObject*)(o))->code))
typedef struct {
PyObject_HEAD
@ -34,5 +35,28 @@ typedef struct {
int mark[2];
} MatchObject;
#endif
typedef struct {
/* string pointers */
void* ptr; /* current position (also end of current slice) */
void* beginning; /* start of original string */
void* start; /* start of current slice */
void* end; /* end of original string */
/* character size */
int charsize;
/* registers */
int marks;
void* mark[64]; /* FIXME: <fl> should be dynamically allocated! */
/* backtracking stack */
void** stack;
int stacksize;
int stackbase;
} SRE_STATE;
typedef struct {
PyObject_HEAD
PyObject* pattern;
PyObject* string;
SRE_STATE state;
} CursorObject;
#endif

View File

@ -1,4 +1,4 @@
/* generated by sre_constants.py */
/* generated from sre_constants.py */
#define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2
@ -25,3 +25,25 @@
#define SRE_OP_NEGATE 23
#define SRE_OP_RANGE 24
#define SRE_OP_REPEAT 25
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BOUNDARY 2
#define SRE_AT_NON_BOUNDARY 3
#define SRE_AT_END 4
#define SRE_AT_END_LINE 5
#define SRE_CATEGORY_DIGIT 0
#define SRE_CATEGORY_NOT_DIGIT 1
#define SRE_CATEGORY_SPACE 2
#define SRE_CATEGORY_NOT_SPACE 3
#define SRE_CATEGORY_WORD 4
#define SRE_CATEGORY_NOT_WORD 5
#define SRE_CATEGORY_LINEBREAK 6
#define SRE_CATEGORY_NOT_LINEBREAK 7
#define SRE_CATEGORY_LOC_DIGIT 8
#define SRE_CATEGORY_LOC_NOT_DIGIT 9
#define SRE_CATEGORY_LOC_SPACE 10
#define SRE_CATEGORY_LOC_NOT_SPACE 11
#define SRE_CATEGORY_LOC_WORD 12
#define SRE_CATEGORY_LOC_NOT_WORD 13
#define SRE_CATEGORY_LOC_LINEBREAK 14
#define SRE_CATEGORY_LOC_NOT_LINEBREAK 15