- actually enabled charset anchors in the engine (still not used by the code generator)
- changed max repeat value in engine (to match earlier array fix)
- added experimental "which part matched?" mechanism to sre; see http://hem.passagen.se/eff/2000_07_01_bot-archive.htm#416954 or python-dev for details.
This commit is contained in:
parent
b19948b7fb
commit
7cafe4d7e4
31
Lib/sre.py
31
Lib/sre.py
|
@ -155,3 +155,34 @@ def _pickle(p):
|
||||||
return _compile, (p.pattern, p.flags)
|
return _compile, (p.pattern, p.flags)
|
||||||
|
|
||||||
copy_reg.pickle(type(_compile("")), _pickle, _compile)
|
copy_reg.pickle(type(_compile("")), _pickle, _compile)
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------
|
||||||
|
# experimental stuff (see python-dev discussions for details)
|
||||||
|
|
||||||
|
class Scanner:
    """Experimental tokenizer driven by a lexicon of (phrase, action) pairs.

    Every phrase is a regular expression source string.  The phrases are
    joined into a single alternation in which each branch ends with an
    index marker, "(?P#n)", where n is the position of the phrase in the
    lexicon.  scan() reads that position back from the match object's
    "index" attribute to find the action belonging to the phrase that
    matched.
    """

    def __init__(self, lexicon):
        """Build the combined pattern for *lexicon*.

        lexicon -- sequence of (phrase, action) pairs.  It is stored as
        self.lexicon and indexed by position in scan().
        """
        self.lexicon = lexicon
        p = []
        for phrase, action in lexicon:
            # wrap the phrase in a non-capturing group and tag it with
            # its position in the lexicon
            p.append("(?:%s)(?P#%d)" % (phrase, len(p)))
        self.scanner = sre.compile("|".join(p))

    def scan(self, string):
        """Tokenize *string* from the start; return (result, remainder).

        The combined pattern is matched repeatedly, each time at the
        position where the previous match ended.  For every match the
        action of the matching phrase is looked up:

        - a callable action is called as action(scanner, text), and its
          return value is used in place of the action; while it runs,
          scanner.match is the current match object
        - any resulting value other than None is appended to result

        Scanning stops at the first position where nothing matches or
        where the match is empty.  remainder is the unscanned tail of
        *string* ("" if everything was consumed).
        """
        result = []
        append = result.append
        match = self.scanner.match
        i = 0
        while 1:
            m = match(string, i)
            if not m:
                break
            j = m.end()
            if i == j:
                # an empty match would never advance; stop here
                break
            action = self.lexicon[m.index][1]
            if callable(action):
                # expose the current match object (not the pattern's
                # match method) so the action can inspect it
                self.match = m
                action = action(self, m.group())
            if action is not None:
                append(action)
            i = j
        return result, string[i:]
|
||||||
|
|
|
@ -208,7 +208,7 @@ def _compile(code, pattern, flags):
|
||||||
else:
|
else:
|
||||||
emit(OPCODES[op])
|
emit(OPCODES[op])
|
||||||
emit(av-1)
|
emit(av-1)
|
||||||
elif op is MARK:
|
elif op in (MARK, INDEX):
|
||||||
emit(OPCODES[op])
|
emit(OPCODES[op])
|
||||||
emit(av)
|
emit(av)
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -33,6 +33,7 @@ GROUP = "group"
|
||||||
GROUP_IGNORE = "group_ignore"
|
GROUP_IGNORE = "group_ignore"
|
||||||
IN = "in"
|
IN = "in"
|
||||||
IN_IGNORE = "in_ignore"
|
IN_IGNORE = "in_ignore"
|
||||||
|
INDEX = "index"
|
||||||
INFO = "info"
|
INFO = "info"
|
||||||
JUMP = "jump"
|
JUMP = "jump"
|
||||||
LITERAL = "literal"
|
LITERAL = "literal"
|
||||||
|
@ -90,6 +91,7 @@ OPCODES = [
|
||||||
CATEGORY,
|
CATEGORY,
|
||||||
CHARSET,
|
CHARSET,
|
||||||
GROUP, GROUP_IGNORE,
|
GROUP, GROUP_IGNORE,
|
||||||
|
INDEX,
|
||||||
IN, IN_IGNORE,
|
IN, IN_IGNORE,
|
||||||
INFO,
|
INFO,
|
||||||
JUMP,
|
JUMP,
|
||||||
|
|
|
@ -451,6 +451,23 @@ def _parse(source, state):
|
||||||
if gid is None:
|
if gid is None:
|
||||||
raise error, "unknown group name"
|
raise error, "unknown group name"
|
||||||
subpattern.append((GROUP, gid))
|
subpattern.append((GROUP, gid))
|
||||||
|
elif source.match("#"):
|
||||||
|
index = ""
|
||||||
|
while 1:
|
||||||
|
char = source.get()
|
||||||
|
if char is None:
|
||||||
|
raise error, "unterminated index"
|
||||||
|
if char == ")":
|
||||||
|
break
|
||||||
|
index = index + char
|
||||||
|
try:
|
||||||
|
index = int(index)
|
||||||
|
if index < 0 or index > MAXREPEAT:
|
||||||
|
raise ValueError
|
||||||
|
except ValueError:
|
||||||
|
raise error, "illegal index"
|
||||||
|
subpattern.append((INDEX, index))
|
||||||
|
continue
|
||||||
else:
|
else:
|
||||||
char = source.get()
|
char = source.get()
|
||||||
if char is None:
|
if char is None:
|
||||||
|
|
|
@ -21,6 +21,7 @@
|
||||||
* 00-06-29 fl fixed split, added more scanner features (0.9.2)
|
* 00-06-29 fl fixed split, added more scanner features (0.9.2)
|
||||||
* 00-06-30 fl added fast search optimization (0.9.3)
|
* 00-06-30 fl added fast search optimization (0.9.3)
|
||||||
* 00-06-30 fl added assert (lookahead) primitives, etc (0.9.4)
|
* 00-06-30 fl added assert (lookahead) primitives, etc (0.9.4)
|
||||||
|
* 00-07-02 fl added charset optimizations, etc (0.9.5)
|
||||||
*
|
*
|
||||||
* Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
|
* Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
|
||||||
*
|
*
|
||||||
|
@ -31,7 +32,7 @@
|
||||||
|
|
||||||
#ifndef SRE_RECURSIVE
|
#ifndef SRE_RECURSIVE
|
||||||
|
|
||||||
char copyright[] = " SRE 0.9.4 Copyright (c) 1997-2000 by Secret Labs AB ";
|
char copyright[] = " SRE 0.9.5 Copyright (c) 1997-2000 by Secret Labs AB ";
|
||||||
|
|
||||||
#include "Python.h"
|
#include "Python.h"
|
||||||
|
|
||||||
|
@ -587,6 +588,14 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
|
||||||
pattern++;
|
pattern++;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case SRE_OP_INDEX:
|
||||||
|
/* set index */
|
||||||
|
/* args: <index> */
|
||||||
|
TRACE(("%8d: set index %d\n", PTR(ptr), pattern[0]));
|
||||||
|
state->index = pattern[0];
|
||||||
|
pattern++;
|
||||||
|
break;
|
||||||
|
|
||||||
case SRE_OP_JUMP:
|
case SRE_OP_JUMP:
|
||||||
case SRE_OP_INFO:
|
case SRE_OP_INFO:
|
||||||
/* jump forward */
|
/* jump forward */
|
||||||
|
@ -810,7 +819,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
|
||||||
/* match maximum number of items, pushing alternate end
|
/* match maximum number of items, pushing alternate end
|
||||||
points to the stack */
|
points to the stack */
|
||||||
|
|
||||||
while (pattern[2] == 32767 || count < (int) pattern[2]) {
|
while (pattern[2] == 65535 || count < (int) pattern[2]) {
|
||||||
state->stackbase = stack;
|
state->stackbase = stack;
|
||||||
i = SRE_MATCH(state, pattern + 3);
|
i = SRE_MATCH(state, pattern + 3);
|
||||||
state->stackbase = stackbase; /* rewind */
|
state->stackbase = stackbase; /* rewind */
|
||||||
|
@ -980,10 +989,12 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
|
||||||
}
|
}
|
||||||
|
|
||||||
if (flags & SRE_INFO_PREFIX) {
|
if (flags & SRE_INFO_PREFIX) {
|
||||||
|
/* pattern starts with a known prefix */
|
||||||
prefix_len = pattern[5];
|
prefix_len = pattern[5];
|
||||||
prefix = pattern + 6;
|
prefix = pattern + 6;
|
||||||
overlap = prefix + prefix_len - 1;
|
overlap = prefix + prefix_len - 1;
|
||||||
} else if (flags & SRE_INFO_CHARSET)
|
} else if (flags & SRE_INFO_CHARSET)
|
||||||
|
/* pattern starts with a character from a known set */
|
||||||
charset = pattern + 5;
|
charset = pattern + 5;
|
||||||
|
|
||||||
pattern += 1 + pattern[1];
|
pattern += 1 + pattern[1];
|
||||||
|
@ -1042,7 +1053,6 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
|
||||||
if (status != 0)
|
if (status != 0)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
#if 0
|
|
||||||
} else if (charset) {
|
} else if (charset) {
|
||||||
/* pattern starts with a character from a known set */
|
/* pattern starts with a character from a known set */
|
||||||
for (;;) {
|
for (;;) {
|
||||||
|
@ -1057,7 +1067,6 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
|
||||||
if (status != 0)
|
if (status != 0)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
} else
|
} else
|
||||||
/* general case */
|
/* general case */
|
||||||
while (ptr <= end) {
|
while (ptr <= end) {
|
||||||
|
@ -1204,6 +1213,8 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* args)
|
||||||
for (i = 0; i < SRE_MARK_SIZE; i++)
|
for (i = 0; i < SRE_MARK_SIZE; i++)
|
||||||
state->mark[i] = NULL;
|
state->mark[i] = NULL;
|
||||||
|
|
||||||
|
state->index = -1;
|
||||||
|
|
||||||
state->stack = NULL;
|
state->stack = NULL;
|
||||||
state->stackbase = 0;
|
state->stackbase = 0;
|
||||||
state->stacksize = 0;
|
state->stacksize = 0;
|
||||||
|
@ -1286,6 +1297,8 @@ pattern_new_match(PatternObject* pattern, SRE_STATE* state,
|
||||||
} else
|
} else
|
||||||
match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
|
match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
|
||||||
|
|
||||||
|
match->index = state->index;
|
||||||
|
|
||||||
return (PyObject*) match;
|
return (PyObject*) match;
|
||||||
|
|
||||||
} else if (status < 0) {
|
} else if (status < 0) {
|
||||||
|
@ -1887,6 +1900,15 @@ match_getattr(MatchObject* self, char* name)
|
||||||
if (!strcmp(name, "endpos"))
|
if (!strcmp(name, "endpos"))
|
||||||
return Py_BuildValue("i", 0); /* FIXME */
|
return Py_BuildValue("i", 0); /* FIXME */
|
||||||
|
|
||||||
|
if (!strcmp(name, "index")) {
|
||||||
|
/* experimental */
|
||||||
|
if (self->index < 0) {
|
||||||
|
Py_INCREF(Py_None);
|
||||||
|
return Py_None;
|
||||||
|
} else
|
||||||
|
return Py_BuildValue("i", self->index);
|
||||||
|
}
|
||||||
|
|
||||||
PyErr_SetString(PyExc_AttributeError, name);
|
PyErr_SetString(PyExc_AttributeError, name);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
|
@ -33,6 +33,7 @@ typedef struct {
|
||||||
PyObject_HEAD
|
PyObject_HEAD
|
||||||
PyObject* string; /* link to the target string */
|
PyObject* string; /* link to the target string */
|
||||||
PatternObject* pattern; /* link to the regex (pattern) object */
|
PatternObject* pattern; /* link to the regex (pattern) object */
|
||||||
|
int index; /* last index marker seen by the engine (-1 if none) */
|
||||||
int groups; /* number of groups (start/end marks) */
|
int groups; /* number of groups (start/end marks) */
|
||||||
int mark[2];
|
int mark[2];
|
||||||
} MatchObject;
|
} MatchObject;
|
||||||
|
@ -57,6 +58,7 @@ typedef struct {
|
||||||
/* character size */
|
/* character size */
|
||||||
int charsize;
|
int charsize;
|
||||||
/* registers */
|
/* registers */
|
||||||
|
int index;
|
||||||
int lastmark;
|
int lastmark;
|
||||||
void* mark[SRE_MARK_SIZE];
|
void* mark[SRE_MARK_SIZE];
|
||||||
/* backtracking stack */
|
/* backtracking stack */
|
||||||
|
|
|
@ -23,21 +23,22 @@
|
||||||
#define SRE_OP_CHARSET 9
|
#define SRE_OP_CHARSET 9
|
||||||
#define SRE_OP_GROUP 10
|
#define SRE_OP_GROUP 10
|
||||||
#define SRE_OP_GROUP_IGNORE 11
|
#define SRE_OP_GROUP_IGNORE 11
|
||||||
#define SRE_OP_IN 12
|
#define SRE_OP_INDEX 12
|
||||||
#define SRE_OP_IN_IGNORE 13
|
#define SRE_OP_IN 13
|
||||||
#define SRE_OP_INFO 14
|
#define SRE_OP_IN_IGNORE 14
|
||||||
#define SRE_OP_JUMP 15
|
#define SRE_OP_INFO 15
|
||||||
#define SRE_OP_LITERAL 16
|
#define SRE_OP_JUMP 16
|
||||||
#define SRE_OP_LITERAL_IGNORE 17
|
#define SRE_OP_LITERAL 17
|
||||||
#define SRE_OP_MARK 18
|
#define SRE_OP_LITERAL_IGNORE 18
|
||||||
#define SRE_OP_MAX_REPEAT 19
|
#define SRE_OP_MARK 19
|
||||||
#define SRE_OP_MAX_REPEAT_ONE 20
|
#define SRE_OP_MAX_REPEAT 20
|
||||||
#define SRE_OP_MIN_REPEAT 21
|
#define SRE_OP_MAX_REPEAT_ONE 21
|
||||||
#define SRE_OP_NOT_LITERAL 22
|
#define SRE_OP_MIN_REPEAT 22
|
||||||
#define SRE_OP_NOT_LITERAL_IGNORE 23
|
#define SRE_OP_NOT_LITERAL 23
|
||||||
#define SRE_OP_NEGATE 24
|
#define SRE_OP_NOT_LITERAL_IGNORE 24
|
||||||
#define SRE_OP_RANGE 25
|
#define SRE_OP_NEGATE 25
|
||||||
#define SRE_OP_REPEAT 26
|
#define SRE_OP_RANGE 26
|
||||||
|
#define SRE_OP_REPEAT 27
|
||||||
#define SRE_AT_BEGINNING 0
|
#define SRE_AT_BEGINNING 0
|
||||||
#define SRE_AT_BEGINNING_LINE 1
|
#define SRE_AT_BEGINNING_LINE 1
|
||||||
#define SRE_AT_BOUNDARY 2
|
#define SRE_AT_BOUNDARY 2
|
||||||
|
|
Loading…
Reference in New Issue