Implemented non-recursive SRE matching.

This commit is contained in:
Gustavo Niemeyer 2003-10-17 22:13:16 +00:00
parent 41e2809feb
commit ad3fc44ccb
9 changed files with 844 additions and 474 deletions

View File

@ -297,6 +297,15 @@ assertions, the contained pattern must only match strings of some
fixed length. Patterns which start with negative lookbehind fixed length. Patterns which start with negative lookbehind
assertions may match at the beginning of the string being searched. assertions may match at the beginning of the string being searched.
\item[\code{(?(\var{id/name})yes-pattern|no-pattern)}] Will try to match
with \regexp{yes-pattern} if the group with the given \var{id} or
\var{name} exists, and with \regexp{no-pattern} if it doesn't.
The \regexp{|no-pattern} part is optional. For example,
\regexp{(<)?(\e w+@\e w+(?:\e .\e w+)+)(?(1)>)} is a poor email matching
pattern, which will match with \code{'<user@host.com>'} as well as
\code{'user@host.com'}, but not with \code{'<user@host.com'}.
\versionadded{2.4}
\end{list} \end{list}
The special sequences consist of \character{\e} and a character from the The special sequences consist of \character{\e} and a character from the

View File

@ -145,6 +145,19 @@ def _compile(code, pattern, flags):
else: else:
emit(OPCODES[op]) emit(OPCODES[op])
emit(av-1) emit(av-1)
elif op is GROUPREF_EXISTS:
emit(OPCODES[op])
emit((av[0]-1)*2)
skipyes = len(code); emit(0)
_compile(code, av[1], flags)
if av[2]:
emit(OPCODES[JUMP])
skipno = len(code); emit(0)
code[skipyes] = len(code) - skipyes + 1
_compile(code, av[2], flags)
code[skipno] = len(code) - skipno
else:
code[skipyes] = len(code) - skipyes + 1
else: else:
raise ValueError, ("unsupported operand type", op) raise ValueError, ("unsupported operand type", op)

View File

@ -13,7 +13,7 @@
# update when constants are added or removed # update when constants are added or removed
MAGIC = 20030419 MAGIC = 20031017
# max code word in this release # max code word in this release
@ -42,6 +42,7 @@ CATEGORY = "category"
CHARSET = "charset" CHARSET = "charset"
GROUPREF = "groupref" GROUPREF = "groupref"
GROUPREF_IGNORE = "groupref_ignore" GROUPREF_IGNORE = "groupref_ignore"
GROUPREF_EXISTS = "groupref_exists"
IN = "in" IN = "in"
IN_IGNORE = "in_ignore" IN_IGNORE = "in_ignore"
INFO = "info" INFO = "info"
@ -108,7 +109,7 @@ OPCODES = [
CALL, CALL,
CATEGORY, CATEGORY,
CHARSET, BIGCHARSET, CHARSET, BIGCHARSET,
GROUPREF, GROUPREF_IGNORE, GROUPREF, GROUPREF_EXISTS, GROUPREF_IGNORE,
IN, IN_IGNORE, IN, IN_IGNORE,
INFO, INFO,
JUMP, JUMP,

View File

@ -364,6 +364,20 @@ def _parse_sub(source, state, nested=1):
subpattern.append((BRANCH, (None, items))) subpattern.append((BRANCH, (None, items)))
return subpattern return subpattern
def _parse_sub_cond(source, state, condgroup):
    # parse the body of a conditional backreference group: a "yes"
    # branch, optionally followed by "|" and a "no" branch.  returns a
    # SubPattern holding one GROUPREF_EXISTS item whose argument is
    # (condgroup, yes-branch, no-branch); the no-branch is None when
    # the pattern does not supply one.
    yes_branch = _parse(source, state)
    no_branch = None
    if source.match("|"):
        no_branch = _parse(source, state)
        # a second "|" would introduce a third alternative
        if source.match("|"):
            raise error("conditional backref with more than two branches")
    if source.next:
        # NOTE(review): the extra 0 argument presumably makes this a
        # non-consuming peek, since the caller tests for ")" itself
        # afterwards -- confirm against the tokenizer's match().
        if not source.match(")", 0):
            raise error("pattern not properly closed")
    result = SubPattern(state)
    result.append((GROUPREF_EXISTS, (condgroup, yes_branch, no_branch)))
    return result
def _parse(source, state): def _parse(source, state):
# parse a simple pattern # parse a simple pattern
@ -499,6 +513,7 @@ def _parse(source, state):
elif this == "(": elif this == "(":
group = 1 group = 1
name = None name = None
condgroup = None
if source.match("?"): if source.match("?"):
group = 0 group = 0
# options # options
@ -568,6 +583,26 @@ def _parse(source, state):
else: else:
subpattern.append((ASSERT_NOT, (dir, p))) subpattern.append((ASSERT_NOT, (dir, p)))
continue continue
elif source.match("("):
# conditional backreference group
condname = ""
while 1:
char = source.get()
if char is None:
raise error, "unterminated name"
if char == ")":
break
condname = condname + char
group = 2
if isname(condname):
condgroup = state.groupdict.get(condname)
if condgroup is None:
raise error, "unknown group name"
else:
try:
condgroup = atoi(condname)
except ValueError:
raise error, "bad character in group name"
else: else:
# flags # flags
if not source.next in FLAGS: if not source.next in FLAGS:
@ -581,7 +616,10 @@ def _parse(source, state):
group = None group = None
else: else:
group = state.opengroup(name) group = state.opengroup(name)
p = _parse_sub(source, state) if condgroup:
p = _parse_sub_cond(source, state, condgroup)
else:
p = _parse_sub(source, state)
if not source.match(")"): if not source.match(")"):
raise error, "unbalanced parenthesis" raise error, "unbalanced parenthesis"
if group is not None: if group is not None:

View File

@ -169,7 +169,6 @@ class ReTests(unittest.TestCase):
self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c')) self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
def test_re_groupref_exists(self): def test_re_groupref_exists(self):
return # not yet
self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(), self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
('(', 'a')) ('(', 'a'))
self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(), self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
@ -405,19 +404,20 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0), self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
20003) 20003)
self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001) self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
# non-simple '*?' still recurses and hits the recursion limit # non-simple '*?' still used to hit the recursion limit, before the
self.assertRaises(RuntimeError, re.search, '(a|b)*?c', 10000*'ab'+'cd') # non-recursive scheme was implemented.
self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
def test_bug_612074(self): def test_bug_612074(self):
pat=u"["+re.escape(u"\u2039")+u"]" pat=u"["+re.escape(u"\u2039")+u"]"
self.assertEqual(re.compile(pat) and 1, 1) self.assertEqual(re.compile(pat) and 1, 1)
def test_stack_overflow(self): def test_stack_overflow(self):
# nasty case that overflows the straightforward recursive # nasty cases that used to overflow the straightforward recursive
# implementation of repeated groups. # implementation of repeated groups.
self.assertRaises(RuntimeError, re.match, '(x)*', 50000*'x') self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
self.assertRaises(RuntimeError, re.match, '(x)*y', 50000*'x'+'y') self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
self.assertRaises(RuntimeError, re.match, '(x)*?y', 50000*'x'+'y') self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
def test_scanner(self): def test_scanner(self):
def s_ident(scanner, token): return token def s_ident(scanner, token): return token

View File

@ -61,6 +61,10 @@ Extension modules
- Bug #814613: INET_ADDRSTRLEN fix needed for all compilers on SGI - Bug #814613: INET_ADDRSTRLEN fix needed for all compilers on SGI
- Implemented non-recursive SRE matching scheme (#757624).
- Implemented (?(id/name)yes|no) support in SRE (#572936).
Library Library
------- -------

File diff suppressed because it is too large Load Diff

View File

@ -55,6 +55,7 @@ typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch);
typedef struct SRE_REPEAT_T { typedef struct SRE_REPEAT_T {
int count; int count;
SRE_CODE* pattern; /* points to REPEAT operator arguments */ SRE_CODE* pattern; /* points to REPEAT operator arguments */
void* last_ptr; /* helper to check for infinite loops */
struct SRE_REPEAT_T *prev; /* points to previous repeat context */ struct SRE_REPEAT_T *prev; /* points to previous repeat context */
} SRE_REPEAT; } SRE_REPEAT;
@ -74,10 +75,11 @@ typedef struct {
int lastmark; int lastmark;
void* mark[SRE_MARK_SIZE]; void* mark[SRE_MARK_SIZE];
/* dynamically allocated stuff */ /* dynamically allocated stuff */
void** mark_stack; char* data_stack;
int mark_stack_size; int data_stack_size;
int mark_stack_base; int data_stack_base;
SRE_REPEAT *repeat; /* current repeat context */ /* current repeat context */
SRE_REPEAT *repeat;
/* hooks */ /* hooks */
SRE_TOLOWER_HOOK lower; SRE_TOLOWER_HOOK lower;
} SRE_STATE; } SRE_STATE;

View File

@ -11,7 +11,7 @@
* See the _sre.c file for information on usage and redistribution. * See the _sre.c file for information on usage and redistribution.
*/ */
#define SRE_MAGIC 20030419 #define SRE_MAGIC 20031017
#define SRE_OP_FAILURE 0 #define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1 #define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2 #define SRE_OP_ANY 2
@ -25,24 +25,25 @@
#define SRE_OP_CHARSET 10 #define SRE_OP_CHARSET 10
#define SRE_OP_BIGCHARSET 11 #define SRE_OP_BIGCHARSET 11
#define SRE_OP_GROUPREF 12 #define SRE_OP_GROUPREF 12
#define SRE_OP_GROUPREF_IGNORE 13 #define SRE_OP_GROUPREF_EXISTS 13
#define SRE_OP_IN 14 #define SRE_OP_GROUPREF_IGNORE 14
#define SRE_OP_IN_IGNORE 15 #define SRE_OP_IN 15
#define SRE_OP_INFO 16 #define SRE_OP_IN_IGNORE 16
#define SRE_OP_JUMP 17 #define SRE_OP_INFO 17
#define SRE_OP_LITERAL 18 #define SRE_OP_JUMP 18
#define SRE_OP_LITERAL_IGNORE 19 #define SRE_OP_LITERAL 19
#define SRE_OP_MARK 20 #define SRE_OP_LITERAL_IGNORE 20
#define SRE_OP_MAX_UNTIL 21 #define SRE_OP_MARK 21
#define SRE_OP_MIN_UNTIL 22 #define SRE_OP_MAX_UNTIL 22
#define SRE_OP_NOT_LITERAL 23 #define SRE_OP_MIN_UNTIL 23
#define SRE_OP_NOT_LITERAL_IGNORE 24 #define SRE_OP_NOT_LITERAL 24
#define SRE_OP_NEGATE 25 #define SRE_OP_NOT_LITERAL_IGNORE 25
#define SRE_OP_RANGE 26 #define SRE_OP_NEGATE 26
#define SRE_OP_REPEAT 27 #define SRE_OP_RANGE 27
#define SRE_OP_REPEAT_ONE 28 #define SRE_OP_REPEAT 28
#define SRE_OP_SUBPATTERN 29 #define SRE_OP_REPEAT_ONE 29
#define SRE_OP_MIN_REPEAT_ONE 30 #define SRE_OP_SUBPATTERN 30
#define SRE_OP_MIN_REPEAT_ONE 31
#define SRE_AT_BEGINNING 0 #define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1 #define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BEGINNING_STRING 2 #define SRE_AT_BEGINNING_STRING 2