- added lookbehind support (?<=pattern), (?<!pattern).

the pattern must have a fixed width.

- got rid of array-module dependencies; the match pro-
  gram is now stored inside the pattern object, rather
  than in an extra string buffer.

- cleaned up a variety of potential leaks, api abuses,
  and other minor issues in the engine module.

- use mal's new isalnum macro, rather than my own work-
  around.

- untabified test_sre.py.  seems like I removed a couple
  of trailing spaces in the process...
This commit is contained in:
Fredrik Lundh 2000-07-03 18:44:21 +00:00
parent 40c48685a2
commit 6f01398236
5 changed files with 138 additions and 104 deletions

View File

@ -10,18 +10,10 @@
# other compatibility work.
#
import array
import _sre
from sre_constants import *
# find an array type code that matches the engine's code size
for WORDSIZE in "Hil":
if len(array.array(WORDSIZE, [0]).tostring()) == _sre.getcodesize():
break
else:
raise RuntimeError, "cannot find a useable array type"
MAXCODE = 65535
def _charset(charset, fixup):
@ -170,7 +162,20 @@ def _compile(code, pattern, flags):
emit((group-1)*2+1)
elif op in (SUCCESS, FAILURE):
emit(OPCODES[op])
elif op in (ASSERT, ASSERT_NOT, CALL):
elif op in (ASSERT, ASSERT_NOT):
emit(OPCODES[op])
skip = len(code); emit(0)
if av[0] >= 0:
emit(0) # look ahead
else:
lo, hi = av[1].getwidth()
if lo != hi:
raise error, "look-behind requires fixed-width pattern"
emit(lo) # look behind
_compile(code, av[1], flags)
emit(OPCODES[SUCCESS])
code[skip] = len(code) - skip
elif op is CALL:
emit(OPCODES[op])
skip = len(code); emit(0)
_compile(code, av, flags)
@ -305,7 +310,7 @@ def compile(p, flags=0):
indexgroup[i] = k
return _sre.compile(
pattern, flags,
array.array(WORDSIZE, code).tostring(),
p.pattern.groups-1, groupindex, indexgroup
pattern, flags, code,
p.pattern.groups-1,
groupindex, indexgroup
)

View File

@ -482,9 +482,15 @@ def _parse(source, state):
if source.next is None or source.next == ")":
break
source.get()
elif source.next in ("=", "!"):
elif source.next in ("=", "!", "<"):
# lookahead assertions
char = source.get()
dir = 1
if char == "<":
if source.next not in ("=", "!"):
raise error, "syntax error"
dir = -1 # lookbehind
char = source.get()
b = []
while 1:
p = _parse(source, state)
@ -493,9 +499,9 @@ def _parse(source, state):
b.append(p)
p = _branch(state, b)
if char == "=":
subpattern.append((ASSERT, p))
subpattern.append((ASSERT, (dir, p)))
else:
subpattern.append((ASSERT_NOT, p))
subpattern.append((ASSERT_NOT, (dir, p)))
break
elif source.match("|"):
b.append(p)

View File

@ -35,20 +35,20 @@ if verbose:
try:
assert sre.sub("(?i)b+", "x", "bbbb BBBB") == 'x x'
def bump_num(matchobj):
int_value = int(matchobj.group(0))
return str(int_value + 1)
assert sre.sub(r'\d+', bump_num, '08.2 -2 23x99y') == '9.3 -3 24x100y'
assert sre.sub(r'\d+', bump_num, '08.2 -2 23x99y', 3) == '9.3 -3 23x99y'
assert sre.sub('.', lambda m: r"\n", 'x') == '\\n'
assert sre.sub('.', r"\n", 'x') == '\n'
s = r"\1\1"
assert sre.sub('(.)', s, 'x') == 'xx'
assert sre.sub('(.)', sre.escape(s), 'x') == s
assert sre.sub('(.)', sre.escape(s), 'x') == s
assert sre.sub('(.)', lambda m: s, 'x') == s
assert sre.sub('(?P<a>x)', '\g<a>\g<a>', 'xx') == 'xxxx'
@ -144,7 +144,7 @@ except AssertionError:
if verbose:
print 'Running tests on sre.split'
try:
assert sre.split(":", ":a:b::c") == ['', 'a', 'b', '', 'c']
assert sre.split(":*", ":a:b::c") == ['', 'a', 'b', 'c']
@ -164,7 +164,7 @@ try:
assert sre.split(':', 'a:b:c:d', 2) == ['a', 'b', 'c:d']
assert sre.split("(:)", ":a:b::c", 2) == ['', ':', 'a', ':', 'b::c']
assert sre.split("(:*)", ":a:b::c", 2) == ['', ':', 'a', ':', 'b::c']
assert sre.split("(:*)", ":a:b::c", 2) == ['', ':', 'a', ':', 'b::c']
except AssertionError:
raise TestFailed, "qualified sre.split"
@ -186,29 +186,29 @@ if verbose:
try:
# No groups at all
m = sre.match('a', 'a') ; assert m.groups() == ()
m = sre.match('a', 'a') ; assert m.groups() == ()
# A single group
m = sre.match('(a)', 'a') ; assert m.groups() == ('a',)
m = sre.match('(a)', 'a') ; assert m.groups() == ('a',)
pat = sre.compile('((a)|(b))(c)?')
assert pat.match('a').groups() == ('a', 'a', None, None)
assert pat.match('b').groups() == ('b', None, 'b', None)
assert pat.match('ac').groups() == ('a', 'a', None, 'c')
assert pat.match('bc').groups() == ('b', None, 'b', 'c')
assert pat.match('bc').groups("") == ('b', "", 'b', 'c')
assert pat.match('a').groups() == ('a', 'a', None, None)
assert pat.match('b').groups() == ('b', None, 'b', None)
assert pat.match('ac').groups() == ('a', 'a', None, 'c')
assert pat.match('bc').groups() == ('b', None, 'b', 'c')
assert pat.match('bc').groups("") == ('b', "", 'b', 'c')
except AssertionError:
raise TestFailed, "match .groups() method"
try:
# A single group
m = sre.match('(a)', 'a')
assert m.group(0) == 'a' ; assert m.group(0) == 'a'
m = sre.match('(a)', 'a')
assert m.group(0) == 'a' ; assert m.group(0) == 'a'
assert m.group(1) == 'a' ; assert m.group(1, 1) == ('a', 'a')
pat = sre.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
assert pat.match('a').group(1, 2, 3) == ('a', None, None)
assert pat.match('b').group('a1', 'b2', 'c3') == (None, 'b', None)
assert pat.match('ac').group(1, 'b2', 3) == ('a', None, 'c')
assert pat.match('a').group(1, 2, 3) == ('a', None, None)
assert pat.match('b').group('a1', 'b2', 'c3') == (None, 'b', None)
assert pat.match('ac').group(1, 'b2', 3) == ('a', None, 'c')
except AssertionError:
raise TestFailed, "match .group() method"
@ -252,10 +252,10 @@ try:
assert sre.I == sre.IGNORECASE
assert sre.L == sre.LOCALE
assert sre.M == sre.MULTILINE
assert sre.S == sre.DOTALL
assert sre.X == sre.VERBOSE
assert sre.T == sre.TEMPLATE
assert sre.U == sre.UNICODE
assert sre.S == sre.DOTALL
assert sre.X == sre.VERBOSE
assert sre.T == sre.TEMPLATE
assert sre.U == sre.UNICODE
except AssertionError:
raise TestFailed, 're module constants'
@ -272,7 +272,7 @@ if verbose:
else:
# To save time, only run the first and last 10 tests
#tests = tests[:10] + tests[-10:]
pass
pass
for t in tests:
sys.stdout.flush()
@ -280,7 +280,7 @@ for t in tests:
if len(t)==5:
pattern, s, outcome, repl, expected = t
elif len(t)==3:
pattern, s, outcome = t
pattern, s, outcome = t
else:
raise ValueError, ('Test tuples should have 3 or 5 fields',t)
@ -288,7 +288,7 @@ for t in tests:
obj=sre.compile(pattern)
except sre.error:
if outcome==SYNTAX_ERROR: pass # Expected a syntax error
else:
else:
print '=== Syntax error:', t
except KeyboardInterrupt: raise KeyboardInterrupt
except:
@ -356,7 +356,7 @@ for t in tests:
# of the match and see if it still succeeds. \B will
# break (because it won't match at the end or start of a
# string), so we'll ignore patterns that feature it.
if pattern[:2]!='\\B' and pattern[-2:]!='\\B':
obj=sre.compile(pattern)
result=obj.search(s, result.start(0), result.end(0)+1)

View File

@ -22,6 +22,7 @@
* 00-06-30 fl added fast search optimization (0.9.3)
* 00-06-30 fl added assert (lookahead) primitives, etc (0.9.4)
* 00-07-02 fl added charset optimizations, etc (0.9.5)
* 00-07-03 fl store code in pattern object, lookbehind, etc
*
* Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
*
@ -144,14 +145,6 @@ static unsigned int sre_lower_unicode(unsigned int ch)
{
return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
}
#if !defined(Py_UNICODE_ISALNUM)
/* FIXME: workaround. should be fixed in unicodectype.c */
#define Py_UNICODE_ISALNUM(ch)\
(Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISUPPER(ch) ||\
Py_UNICODE_ISTITLE(ch) || Py_UNICODE_ISDIGIT(ch))
#endif
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
@ -592,7 +585,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* set index */
/* args: <index> */
TRACE(("%8d: set index %d\n", PTR(ptr), pattern[0]));
state->index = pattern[0];
state->lastindex = pattern[0];
pattern++;
break;
@ -606,10 +599,12 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
case SRE_OP_ASSERT:
/* assert subpattern */
/* args: <skip> <pattern> */
TRACE(("%8d: assert subpattern\n", PTR(ptr)));
state->ptr = ptr;
i = SRE_MATCH(state, pattern + 1);
/* args: <skip> <back> <pattern> */
TRACE(("%8d: assert subpattern %d\n", PTR(ptr), pattern[1]));
state->ptr = ptr - pattern[1];
if (state->ptr < state->beginning)
goto failure;
i = SRE_MATCH(state, pattern + 2);
if (i < 0)
return i;
if (!i)
@ -620,9 +615,11 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
case SRE_OP_ASSERT_NOT:
/* assert not subpattern */
/* args: <skip> <back> <pattern> */
TRACE(("%8d: assert not subpattern\n", PTR(ptr)));
state->ptr = ptr;
i = SRE_MATCH(state, pattern + 1);
TRACE(("%8d: assert not subpattern %d\n", PTR(ptr), pattern[1]));
state->ptr = ptr - pattern[1];
if (state->ptr < state->beginning)
goto failure;
i = SRE_MATCH(state, pattern + 2);
if (i < 0)
return i;
if (i)
@ -1098,6 +1095,7 @@ _compile(PyObject* self_, PyObject* args)
/* "compile" pattern descriptor to pattern object */
PatternObject* self;
int i, n;
PyObject* pattern;
int flags = 0;
@ -1105,24 +1103,37 @@ _compile(PyObject* self_, PyObject* args)
int groups = 0;
PyObject* groupindex = NULL;
PyObject* indexgroup = NULL;
if (!PyArg_ParseTuple(args, "OiO!|iOO", &pattern, &flags,
&PyString_Type, &code,
if (!PyArg_ParseTuple(args, "OiO|iOO", &pattern, &flags, &code,
&groups, &groupindex, &indexgroup))
return NULL;
self = PyObject_NEW(PatternObject, &Pattern_Type);
if (self == NULL)
code = PySequence_Fast(code, "code argument must be a sequence");
if (!code)
return NULL;
n = PySequence_Length(code);
self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, 100*n);
if (!self) {
Py_DECREF(code);
return NULL;
}
for (i = 0; i < n; i++) {
PyObject *o = PySequence_Fast_GET_ITEM(code, i);
self->code[i] = (SRE_CODE) PyInt_AsLong(o);
}
Py_DECREF(code);
if (PyErr_Occurred())
return NULL;
Py_INCREF(pattern);
self->pattern = pattern;
self->flags = flags;
Py_INCREF(code);
self->code = code;
self->groups = groups;
Py_XINCREF(groupindex);
@ -1217,7 +1228,7 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* args)
for (i = 0; i < SRE_MARK_SIZE; i++)
state->mark[i] = NULL;
state->index = -1;
state->lastindex = -1;
state->stack = NULL;
state->stackbase = 0;
@ -1274,8 +1285,9 @@ pattern_new_match(PatternObject* pattern, SRE_STATE* state,
if (status > 0) {
/* create match object (with room for extra group marks) */
match = PyObject_NEW_VAR(MatchObject, &Match_Type, 2*pattern->groups);
if (match == NULL)
match = PyObject_NEW_VAR(MatchObject, &Match_Type,
2*(pattern->groups+1));
if (!match)
return NULL;
Py_INCREF(pattern);
@ -1301,7 +1313,10 @@ pattern_new_match(PatternObject* pattern, SRE_STATE* state,
} else
match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
match->index = state->index;
match->lastindex = state->lastindex;
match->pos = ((char*) state->start - base) / n;
match->endpos = ((char*) state->end - base) / n;
return (PyObject*) match;
@ -1329,12 +1344,12 @@ pattern_scanner(PatternObject* pattern, PyObject* args)
/* create scanner object */
self = PyObject_NEW(ScannerObject, &Scanner_Type);
if (self == NULL)
if (!self)
return NULL;
string = state_init(&self->state, pattern, args);
if (!string) {
PyObject_DEL(self);
PyObject_Del(self);
return NULL;
}
@ -1350,10 +1365,9 @@ pattern_scanner(PatternObject* pattern, PyObject* args)
static void
pattern_dealloc(PatternObject* self)
{
Py_XDECREF(self->code);
Py_XDECREF(self->pattern);
Py_XDECREF(self->groupindex);
PyMem_DEL(self);
PyObject_DEL(self);
}
static PyObject*
@ -1614,10 +1628,11 @@ pattern_getattr(PatternObject* self, char* name)
statichere PyTypeObject Pattern_Type = {
PyObject_HEAD_INIT(NULL)
0, "SRE_Pattern", sizeof(PatternObject), 0,
0, "SRE_Pattern",
sizeof(PatternObject), sizeof(SRE_CODE),
(destructor)pattern_dealloc, /*tp_dealloc*/
0, /*tp_print*/
(getattrfunc)pattern_getattr, /*tp_getattr*/
(getattrfunc)pattern_getattr /*tp_getattr*/
};
/* -------------------------------------------------------------------- */
@ -1628,7 +1643,7 @@ match_dealloc(MatchObject* self)
{
Py_XDECREF(self->string);
Py_DECREF(self->pattern);
PyMem_DEL(self);
PyObject_DEL(self);
}
static PyObject*
@ -1643,31 +1658,40 @@ match_getslice_by_index(MatchObject* self, int index, PyObject* def)
return NULL;
}
if (self->string == Py_None || self->mark[index+index] < 0) {
index *= 2;
if (self->string == Py_None || self->mark[index] < 0) {
/* return default value if the string or group is undefined */
Py_INCREF(def);
return def;
}
return PySequence_GetSlice(
self->string, self->mark[index+index], self->mark[index+index+1]
self->string, self->mark[index], self->mark[index+1]
);
}
static int
match_getindex(MatchObject* self, PyObject* index)
{
if (!PyInt_Check(index) && self->pattern->groupindex != NULL) {
/* FIXME: resource leak? */
index = PyObject_GetItem(self->pattern->groupindex, index);
if (!index)
return -1;
}
int i;
if (PyInt_Check(index))
return (int) PyInt_AS_LONG(index);
return -1;
i = -1;
if (self->pattern->groupindex) {
index = PyObject_GetItem(self->pattern->groupindex, index);
if (index) {
if (PyInt_Check(index))
i = (int) PyInt_AS_LONG(index);
Py_DECREF(index);
} else
PyErr_Clear();
}
return i;
}
static PyObject*
@ -1889,17 +1913,17 @@ match_getattr(MatchObject* self, char* name)
if (!strcmp(name, "lastindex")) {
/* experimental */
if (self->index >= 0)
return Py_BuildValue("i", self->index);
if (self->lastindex >= 0)
return Py_BuildValue("i", self->lastindex);
Py_INCREF(Py_None);
return Py_None;
}
if (!strcmp(name, "lastgroup")) {
/* experimental */
if (self->pattern->indexgroup) {
if (self->pattern->indexgroup && self->lastindex >= 0) {
PyObject* result = PySequence_GetItem(
self->pattern->indexgroup, self->index
self->pattern->indexgroup, self->lastindex
);
if (result)
return result;
@ -1920,10 +1944,10 @@ match_getattr(MatchObject* self, char* name)
}
if (!strcmp(name, "pos"))
return Py_BuildValue("i", 0); /* FIXME */
return Py_BuildValue("i", self->pos);
if (!strcmp(name, "endpos"))
return Py_BuildValue("i", 0); /* FIXME */
return Py_BuildValue("i", self->endpos);
PyErr_SetString(PyExc_AttributeError, name);
return NULL;
@ -1935,11 +1959,10 @@ match_getattr(MatchObject* self, char* name)
statichere PyTypeObject Match_Type = {
PyObject_HEAD_INIT(NULL)
0, "SRE_Match",
sizeof(MatchObject), /* size of basic object */
sizeof(int), /* space for group item */
sizeof(MatchObject), sizeof(int),
(destructor)match_dealloc, /*tp_dealloc*/
0, /*tp_print*/
(getattrfunc)match_getattr, /*tp_getattr*/
(getattrfunc)match_getattr /*tp_getattr*/
};
/* -------------------------------------------------------------------- */
@ -1951,7 +1974,7 @@ scanner_dealloc(ScannerObject* self)
state_fini(&self->state);
Py_DECREF(self->string);
Py_DECREF(self->pattern);
PyMem_DEL(self);
PyObject_DEL(self);
}
static PyObject*
@ -2041,8 +2064,7 @@ scanner_getattr(ScannerObject* self, char* name)
statichere PyTypeObject Scanner_Type = {
PyObject_HEAD_INIT(NULL)
0, "SRE_Scanner",
sizeof(ScannerObject), /* size of basic object */
0,
sizeof(ScannerObject), 0,
(destructor)scanner_dealloc, /*tp_dealloc*/
0, /*tp_print*/
(getattrfunc)scanner_getattr, /*tp_getattr*/

View File

@ -17,26 +17,27 @@
#define SRE_CODE unsigned short
typedef struct {
PyObject_HEAD
PyObject* code; /* link to the code string object */
PyObject_VAR_HEAD
int groups;
PyObject* groupindex;
PyObject* indexgroup;
/* compatibility */
PyObject* pattern; /* pattern source (or None) */
int flags; /* flags used when compiling pattern source */
/* pattern code */
SRE_CODE code[1];
} PatternObject;
#define PatternObject_GetCode(o)\
((void*) PyString_AS_STRING(((PatternObject*)(o))->code))
#define PatternObject_GetCode(o) (((PatternObject*)(o))->code)
typedef struct {
PyObject_HEAD
PyObject_VAR_HEAD
PyObject* string; /* link to the target string */
PatternObject* pattern; /* link to the regex (pattern) object */
int index; /* last index marker seen by the engine (-1 if none) */
int pos, endpos; /* current target slice */
int lastindex; /* last index marker seen by the engine (-1 if none) */
int groups; /* number of groups (start/end marks) */
int mark[2];
int mark[1];
} MatchObject;
typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch);
@ -59,7 +60,7 @@ typedef struct {
/* character size */
int charsize;
/* registers */
int index;
int lastindex;
int lastmark;
void* mark[SRE_MARK_SIZE];
/* backtracking stack */