- fixed split
(test_sre still complains about split, but that's caused by the group reset bug, not split itself) - added more mark slots (should be dynamically allocated, but 100 is better than 32. and checking for the upper limit is better than overwriting the memory ;-) - internal: renamed the cursor helper class - internal: removed some bloat from sre_compile
This commit is contained in:
parent
69218178ec
commit
be2211e940
20
Lib/sre.py
20
Lib/sre.py
|
@ -26,7 +26,7 @@ T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE
|
|||
U = UNICODE = sre_compile.SRE_FLAG_UNICODE
|
||||
|
||||
# sre exception
|
||||
error = sre_parse.error
|
||||
error = sre_compile.error
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
# public interface
|
||||
|
@ -105,7 +105,7 @@ def _subn(pattern, template, string, count=0):
|
|||
n = i = 0
|
||||
s = []
|
||||
append = s.append
|
||||
c = pattern.cursor(string)
|
||||
c = pattern.scanner(string)
|
||||
while not count or n < count:
|
||||
m = c.search()
|
||||
if not m:
|
||||
|
@ -127,16 +127,20 @@ def _split(pattern, string, maxsplit=0):
|
|||
n = i = 0
|
||||
s = []
|
||||
append = s.append
|
||||
c = pattern.cursor(string)
|
||||
extend = s.extend
|
||||
c = pattern.scanner(string)
|
||||
g = c.groups
|
||||
while not maxsplit or n < maxsplit:
|
||||
m = c.search()
|
||||
if not m:
|
||||
break
|
||||
j = m.start()
|
||||
append(string[i:j])
|
||||
i = m.end()
|
||||
if i <= j:
|
||||
break
|
||||
b, e = m.span()
|
||||
if e == i:
|
||||
continue
|
||||
append(string[i:b])
|
||||
if g and b != e:
|
||||
extend(m.groups())
|
||||
i = e
|
||||
n = n + 1
|
||||
if i < len(string):
|
||||
append(string[i:])
|
||||
|
|
|
@ -11,8 +11,7 @@
|
|||
# other compatibility work.
|
||||
#
|
||||
|
||||
import array, string, sys
|
||||
|
||||
import array
|
||||
import _sre
|
||||
|
||||
from sre_constants import *
|
||||
|
@ -24,123 +23,101 @@ for WORDSIZE in "BHil":
|
|||
else:
|
||||
raise RuntimeError, "cannot find a useable array type"
|
||||
|
||||
# FIXME: <fl> should move some optimizations from the parser to here!
|
||||
|
||||
class Code:
|
||||
def __init__(self):
|
||||
self.data = []
|
||||
def __len__(self):
|
||||
return len(self.data)
|
||||
def __getitem__(self, index):
|
||||
return self.data[index]
|
||||
def __setitem__(self, index, code):
|
||||
self.data[index] = code
|
||||
def append(self, code):
|
||||
self.data.append(code)
|
||||
def todata(self):
|
||||
# print self.data
|
||||
try:
|
||||
return array.array(WORDSIZE, self.data).tostring()
|
||||
except OverflowError:
|
||||
print self.data
|
||||
raise
|
||||
|
||||
def _compile(code, pattern, flags):
|
||||
append = code.append
|
||||
emit = code.append
|
||||
for op, av in pattern:
|
||||
if op is ANY:
|
||||
if flags & SRE_FLAG_DOTALL:
|
||||
append(OPCODES[op]) # any character at all!
|
||||
emit(OPCODES[op])
|
||||
else:
|
||||
append(OPCODES[CATEGORY])
|
||||
append(CHCODES[CATEGORY_NOT_LINEBREAK])
|
||||
emit(OPCODES[CATEGORY])
|
||||
emit(CHCODES[CATEGORY_NOT_LINEBREAK])
|
||||
elif op in (SUCCESS, FAILURE):
|
||||
append(OPCODES[op])
|
||||
emit(OPCODES[op])
|
||||
elif op is AT:
|
||||
append(OPCODES[op])
|
||||
emit(OPCODES[op])
|
||||
if flags & SRE_FLAG_MULTILINE:
|
||||
append(ATCODES[AT_MULTILINE[av]])
|
||||
emit(ATCODES[AT_MULTILINE[av]])
|
||||
else:
|
||||
append(ATCODES[av])
|
||||
emit(ATCODES[av])
|
||||
elif op is BRANCH:
|
||||
append(OPCODES[op])
|
||||
emit(OPCODES[op])
|
||||
tail = []
|
||||
for av in av[1]:
|
||||
skip = len(code); append(0)
|
||||
skip = len(code); emit(0)
|
||||
_compile(code, av, flags)
|
||||
## append(OPCODES[SUCCESS])
|
||||
append(OPCODES[JUMP])
|
||||
tail.append(len(code)); append(0)
|
||||
emit(OPCODES[JUMP])
|
||||
tail.append(len(code)); emit(0)
|
||||
code[skip] = len(code) - skip
|
||||
append(0) # end of branch
|
||||
emit(0) # end of branch
|
||||
for tail in tail:
|
||||
code[tail] = len(code) - tail
|
||||
elif op is CALL:
|
||||
append(OPCODES[op])
|
||||
skip = len(code); append(0)
|
||||
emit(OPCODES[op])
|
||||
skip = len(code); emit(0)
|
||||
_compile(code, av, flags)
|
||||
append(OPCODES[SUCCESS])
|
||||
emit(OPCODES[SUCCESS])
|
||||
code[skip] = len(code) - skip
|
||||
elif op is CATEGORY:
|
||||
append(OPCODES[op])
|
||||
emit(OPCODES[op])
|
||||
if flags & SRE_FLAG_LOCALE:
|
||||
append(CH_LOCALE[CHCODES[av]])
|
||||
emit(CH_LOCALE[CHCODES[av]])
|
||||
elif flags & SRE_FLAG_UNICODE:
|
||||
append(CH_UNICODE[CHCODES[av]])
|
||||
emit(CH_UNICODE[CHCODES[av]])
|
||||
else:
|
||||
append(CHCODES[av])
|
||||
emit(CHCODES[av])
|
||||
elif op is GROUP:
|
||||
if flags & SRE_FLAG_IGNORECASE:
|
||||
append(OPCODES[OP_IGNORE[op]])
|
||||
emit(OPCODES[OP_IGNORE[op]])
|
||||
else:
|
||||
append(OPCODES[op])
|
||||
append(av-1)
|
||||
emit(OPCODES[op])
|
||||
emit(av-1)
|
||||
elif op is IN:
|
||||
if flags & SRE_FLAG_IGNORECASE:
|
||||
append(OPCODES[OP_IGNORE[op]])
|
||||
emit(OPCODES[OP_IGNORE[op]])
|
||||
def fixup(literal, flags=flags):
|
||||
return _sre.getlower(ord(literal), flags)
|
||||
else:
|
||||
append(OPCODES[op])
|
||||
emit(OPCODES[op])
|
||||
fixup = ord
|
||||
skip = len(code); append(0)
|
||||
skip = len(code); emit(0)
|
||||
for op, av in av:
|
||||
append(OPCODES[op])
|
||||
emit(OPCODES[op])
|
||||
if op is NEGATE:
|
||||
pass
|
||||
elif op is LITERAL:
|
||||
append(fixup(av))
|
||||
emit(fixup(av))
|
||||
elif op is RANGE:
|
||||
append(fixup(av[0]))
|
||||
append(fixup(av[1]))
|
||||
emit(fixup(av[0]))
|
||||
emit(fixup(av[1]))
|
||||
elif op is CATEGORY:
|
||||
if flags & SRE_FLAG_LOCALE:
|
||||
append(CH_LOCALE[CHCODES[av]])
|
||||
emit(CH_LOCALE[CHCODES[av]])
|
||||
elif flags & SRE_FLAG_UNICODE:
|
||||
append(CH_UNICODE[CHCODES[av]])
|
||||
emit(CH_UNICODE[CHCODES[av]])
|
||||
else:
|
||||
append(CHCODES[av])
|
||||
emit(CHCODES[av])
|
||||
else:
|
||||
raise ValueError, "unsupported set operator"
|
||||
append(OPCODES[FAILURE])
|
||||
raise error, "internal: unsupported set operator"
|
||||
emit(OPCODES[FAILURE])
|
||||
code[skip] = len(code) - skip
|
||||
elif op in (LITERAL, NOT_LITERAL):
|
||||
if flags & SRE_FLAG_IGNORECASE:
|
||||
append(OPCODES[OP_IGNORE[op]])
|
||||
emit(OPCODES[OP_IGNORE[op]])
|
||||
else:
|
||||
append(OPCODES[op])
|
||||
append(ord(av))
|
||||
emit(OPCODES[op])
|
||||
emit(ord(av))
|
||||
elif op is MARK:
|
||||
append(OPCODES[op])
|
||||
append(av)
|
||||
emit(OPCODES[op])
|
||||
emit(av)
|
||||
elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
|
||||
if flags & SRE_FLAG_TEMPLATE:
|
||||
append(OPCODES[REPEAT])
|
||||
skip = len(code); append(0)
|
||||
append(av[0])
|
||||
append(av[1])
|
||||
emit(OPCODES[REPEAT])
|
||||
skip = len(code); emit(0)
|
||||
emit(av[0])
|
||||
emit(av[1])
|
||||
_compile(code, av[2], flags)
|
||||
append(OPCODES[SUCCESS])
|
||||
emit(OPCODES[SUCCESS])
|
||||
code[skip] = len(code) - skip
|
||||
else:
|
||||
lo, hi = av[2].getwidth()
|
||||
|
@ -149,35 +126,35 @@ def _compile(code, pattern, flags):
|
|||
if 0 and lo == hi == 1 and op is MAX_REPEAT:
|
||||
# FIXME: <fl> need a better way to figure out when
|
||||
# it's safe to use this one (in the parser, probably)
|
||||
append(OPCODES[MAX_REPEAT_ONE])
|
||||
skip = len(code); append(0)
|
||||
append(av[0])
|
||||
append(av[1])
|
||||
emit(OPCODES[MAX_REPEAT_ONE])
|
||||
skip = len(code); emit(0)
|
||||
emit(av[0])
|
||||
emit(av[1])
|
||||
_compile(code, av[2], flags)
|
||||
append(OPCODES[SUCCESS])
|
||||
emit(OPCODES[SUCCESS])
|
||||
code[skip] = len(code) - skip
|
||||
else:
|
||||
append(OPCODES[op])
|
||||
skip = len(code); append(0)
|
||||
append(av[0])
|
||||
append(av[1])
|
||||
emit(OPCODES[op])
|
||||
skip = len(code); emit(0)
|
||||
emit(av[0])
|
||||
emit(av[1])
|
||||
_compile(code, av[2], flags)
|
||||
append(OPCODES[SUCCESS])
|
||||
emit(OPCODES[SUCCESS])
|
||||
code[skip] = len(code) - skip
|
||||
elif op is SUBPATTERN:
|
||||
group = av[0]
|
||||
if group:
|
||||
append(OPCODES[MARK])
|
||||
append((group-1)*2)
|
||||
emit(OPCODES[MARK])
|
||||
emit((group-1)*2)
|
||||
_compile(code, av[1], flags)
|
||||
if group:
|
||||
append(OPCODES[MARK])
|
||||
append((group-1)*2+1)
|
||||
emit(OPCODES[MARK])
|
||||
emit((group-1)*2+1)
|
||||
else:
|
||||
raise ValueError, ("unsupported operand type", op)
|
||||
|
||||
def compile(p, flags=0):
|
||||
# convert pattern list to internal format
|
||||
# internal: convert pattern list to internal format
|
||||
if type(p) in (type(""), type(u"")):
|
||||
import sre_parse
|
||||
pattern = p
|
||||
|
@ -185,18 +162,14 @@ def compile(p, flags=0):
|
|||
else:
|
||||
pattern = None
|
||||
flags = p.pattern.flags | flags
|
||||
code = Code()
|
||||
code = []
|
||||
_compile(code, p.data, flags)
|
||||
code.append(OPCODES[SUCCESS])
|
||||
data = code.todata()
|
||||
if 0: # debugging
|
||||
print
|
||||
print "-" * 68
|
||||
import sre_disasm
|
||||
sre_disasm.disasm(data)
|
||||
print "-" * 68
|
||||
# FIXME: <fl> get rid of this limitation
|
||||
assert p.pattern.groups <= 100,\
|
||||
"sorry, but this version only supports 100 named groups"
|
||||
return _sre.compile(
|
||||
pattern, flags,
|
||||
data,
|
||||
array.array(WORDSIZE, code).tostring(),
|
||||
p.pattern.groups-1, p.pattern.groupdict
|
||||
)
|
||||
|
|
|
@ -14,11 +14,12 @@
|
|||
* 00-03-06 fl first alpha, sort of (0.5)
|
||||
* 00-03-14 fl removed most compatibility stuff (0.6)
|
||||
* 00-05-10 fl towards third alpha (0.8.2)
|
||||
* 00-05-13 fl added experimental cursor stuff (0.8.3)
|
||||
* 00-05-13 fl added experimental scanner stuff (0.8.3)
|
||||
* 00-05-27 fl final bug hunt (0.8.4)
|
||||
* 00-06-21 fl less bugs, more taste (0.8.5)
|
||||
* 00-06-25 fl major changes to better deal with nested repeats (0.9)
|
||||
* 00-06-28 fl fixed findall (0.9.1)
|
||||
* 00-06-29 fl fixed split, added more scanner features (0.9.2)
|
||||
*
|
||||
* Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
|
||||
*
|
||||
|
@ -384,7 +385,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
|
|||
int i, count;
|
||||
|
||||
/* FIXME: this is a hack! */
|
||||
void* mark_copy[64];
|
||||
void* mark_copy[SRE_MARK_SIZE];
|
||||
void* mark = NULL;
|
||||
|
||||
TRACE(("%8d: enter\n", PTR(ptr)));
|
||||
|
@ -954,7 +955,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
|
|||
|
||||
staticforward PyTypeObject Pattern_Type;
|
||||
staticforward PyTypeObject Match_Type;
|
||||
staticforward PyTypeObject Cursor_Type;
|
||||
staticforward PyTypeObject Scanner_Type;
|
||||
|
||||
static PyObject *
|
||||
_compile(PyObject* self_, PyObject* args)
|
||||
|
@ -1074,7 +1075,7 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* args)
|
|||
state->lastmark = 0;
|
||||
|
||||
/* FIXME: dynamic! */
|
||||
for (i = 0; i < 64; i++)
|
||||
for (i = 0; i < SRE_MARK_SIZE; i++)
|
||||
state->mark[i] = NULL;
|
||||
|
||||
state->stack = NULL;
|
||||
|
@ -1176,15 +1177,15 @@ pattern_new_match(PatternObject* pattern, SRE_STATE* state,
|
|||
}
|
||||
|
||||
static PyObject*
|
||||
pattern_cursor(PatternObject* pattern, PyObject* args)
|
||||
pattern_scanner(PatternObject* pattern, PyObject* args)
|
||||
{
|
||||
/* create search state object */
|
||||
|
||||
CursorObject* self;
|
||||
ScannerObject* self;
|
||||
PyObject* string;
|
||||
|
||||
/* create match object (with room for extra group marks) */
|
||||
self = PyObject_NEW(CursorObject, &Cursor_Type);
|
||||
self = PyObject_NEW(ScannerObject, &Scanner_Type);
|
||||
if (self == NULL)
|
||||
return NULL;
|
||||
|
||||
|
@ -1431,7 +1432,7 @@ static PyMethodDef pattern_methods[] = {
|
|||
{"split", (PyCFunction) pattern_split, 1},
|
||||
{"findall", (PyCFunction) pattern_findall, 1},
|
||||
/* experimental */
|
||||
{"cursor", (PyCFunction) pattern_cursor, 1},
|
||||
{"scanner", (PyCFunction) pattern_scanner, 1},
|
||||
{NULL, NULL}
|
||||
};
|
||||
|
||||
|
@ -1467,7 +1468,7 @@ pattern_getattr(PatternObject* self, char* name)
|
|||
|
||||
statichere PyTypeObject Pattern_Type = {
|
||||
PyObject_HEAD_INIT(NULL)
|
||||
0, "Pattern", sizeof(PatternObject), 0,
|
||||
0, "SRE_Pattern", sizeof(PatternObject), 0,
|
||||
(destructor)pattern_dealloc, /*tp_dealloc*/
|
||||
0, /*tp_print*/
|
||||
(getattrfunc)pattern_getattr, /*tp_getattr*/
|
||||
|
@ -1761,7 +1762,7 @@ match_getattr(MatchObject* self, char* name)
|
|||
|
||||
statichere PyTypeObject Match_Type = {
|
||||
PyObject_HEAD_INIT(NULL)
|
||||
0, "Match",
|
||||
0, "SRE_Match",
|
||||
sizeof(MatchObject), /* size of basic object */
|
||||
sizeof(int), /* space for group item */
|
||||
(destructor)match_dealloc, /*tp_dealloc*/
|
||||
|
@ -1770,10 +1771,10 @@ statichere PyTypeObject Match_Type = {
|
|||
};
|
||||
|
||||
/* -------------------------------------------------------------------- */
|
||||
/* cursor methods (experimental) */
|
||||
/* scanner methods (experimental) */
|
||||
|
||||
static void
|
||||
cursor_dealloc(CursorObject* self)
|
||||
scanner_dealloc(ScannerObject* self)
|
||||
{
|
||||
state_fini(&self->state);
|
||||
Py_DECREF(self->string);
|
||||
|
@ -1782,7 +1783,7 @@ cursor_dealloc(CursorObject* self)
|
|||
}
|
||||
|
||||
static PyObject*
|
||||
cursor_match(CursorObject* self, PyObject* args)
|
||||
scanner_match(ScannerObject* self, PyObject* args)
|
||||
{
|
||||
SRE_STATE* state = &self->state;
|
||||
PyObject* match;
|
||||
|
@ -1811,7 +1812,7 @@ cursor_match(CursorObject* self, PyObject* args)
|
|||
|
||||
|
||||
static PyObject*
|
||||
cursor_search(CursorObject* self, PyObject* args)
|
||||
scanner_search(ScannerObject* self, PyObject* args)
|
||||
{
|
||||
SRE_STATE* state = &self->state;
|
||||
PyObject* match;
|
||||
|
@ -1830,24 +1831,26 @@ cursor_search(CursorObject* self, PyObject* args)
|
|||
match = pattern_new_match((PatternObject*) self->pattern,
|
||||
state, self->string, status);
|
||||
|
||||
if (status >= 0)
|
||||
if (status == 0 || state->ptr == state->start)
|
||||
state->start = (void*) ((char*) state->ptr + state->charsize);
|
||||
else
|
||||
state->start = state->ptr;
|
||||
|
||||
return match;
|
||||
}
|
||||
|
||||
static PyMethodDef cursor_methods[] = {
|
||||
{"match", (PyCFunction) cursor_match, 0},
|
||||
{"search", (PyCFunction) cursor_search, 0},
|
||||
static PyMethodDef scanner_methods[] = {
|
||||
{"match", (PyCFunction) scanner_match, 0},
|
||||
{"search", (PyCFunction) scanner_search, 0},
|
||||
{NULL, NULL}
|
||||
};
|
||||
|
||||
static PyObject*
|
||||
cursor_getattr(CursorObject* self, char* name)
|
||||
scanner_getattr(ScannerObject* self, char* name)
|
||||
{
|
||||
PyObject* res;
|
||||
|
||||
res = Py_FindMethod(cursor_methods, (PyObject*) self, name);
|
||||
res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
|
||||
if (res)
|
||||
return res;
|
||||
|
||||
|
@ -1859,18 +1862,21 @@ cursor_getattr(CursorObject* self, char* name)
|
|||
return self->pattern;
|
||||
}
|
||||
|
||||
if (!strcmp(name, "groups"))
|
||||
return Py_BuildValue("i", ((PatternObject*) self->pattern)->groups);
|
||||
|
||||
PyErr_SetString(PyExc_AttributeError, name);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
statichere PyTypeObject Cursor_Type = {
|
||||
statichere PyTypeObject Scanner_Type = {
|
||||
PyObject_HEAD_INIT(NULL)
|
||||
0, "Cursor",
|
||||
sizeof(CursorObject), /* size of basic object */
|
||||
0, "SRE_Scanner",
|
||||
sizeof(ScannerObject), /* size of basic object */
|
||||
0,
|
||||
(destructor)cursor_dealloc, /*tp_dealloc*/
|
||||
(destructor)scanner_dealloc, /*tp_dealloc*/
|
||||
0, /*tp_print*/
|
||||
(getattrfunc)cursor_getattr, /*tp_getattr*/
|
||||
(getattrfunc)scanner_getattr, /*tp_getattr*/
|
||||
};
|
||||
|
||||
static PyMethodDef _functions[] = {
|
||||
|
@ -1888,7 +1894,7 @@ init_sre()
|
|||
{
|
||||
/* Patch object types */
|
||||
Pattern_Type.ob_type = Match_Type.ob_type =
|
||||
Cursor_Type.ob_type = &PyType_Type;
|
||||
Scanner_Type.ob_type = &PyType_Type;
|
||||
|
||||
Py_InitModule("_" MODULE, _functions);
|
||||
}
|
||||
|
|
|
@ -46,6 +46,9 @@ typedef struct {
|
|||
void* ptr;
|
||||
} SRE_STACK;
|
||||
|
||||
/* FIXME: <fl> shouldn't be a constant, really... */
|
||||
#define SRE_MARK_SIZE 200
|
||||
|
||||
typedef struct {
|
||||
/* string pointers */
|
||||
void* ptr; /* current position (also end of current slice) */
|
||||
|
@ -56,7 +59,7 @@ typedef struct {
|
|||
int charsize;
|
||||
/* registers */
|
||||
int lastmark;
|
||||
void* mark[64]; /* FIXME: <fl> should be dynamically allocated! */
|
||||
void* mark[SRE_MARK_SIZE];
|
||||
/* backtracking stack */
|
||||
SRE_STACK* stack;
|
||||
int stacksize;
|
||||
|
@ -66,11 +69,11 @@ typedef struct {
|
|||
} SRE_STATE;
|
||||
|
||||
typedef struct {
|
||||
/* search helper */
|
||||
/* scanner (internal helper object) */
|
||||
PyObject_HEAD
|
||||
PyObject* pattern;
|
||||
PyObject* string;
|
||||
SRE_STATE state;
|
||||
} CursorObject;
|
||||
} ScannerObject;
|
||||
|
||||
#endif
|
||||
|
|
Loading…
Reference in New Issue