Implemented non-recursive SRE matching.

This commit is contained in:
Gustavo Niemeyer 2003-10-17 22:13:16 +00:00
parent 41e2809feb
commit ad3fc44ccb
9 changed files with 844 additions and 474 deletions

View File

@ -297,6 +297,15 @@ assertions, the contained pattern must only match strings of some
fixed length. Patterns which start with negative lookbehind
assertions may match at the beginning of the string being searched.
\item[\code{(?(\var{id/name})yes-pattern|no-pattern)}] Will try to match
with \regexp{yes-pattern} if the group with the given \var{id} or
\var{name} exists, and with \regexp{no-pattern} if it doesn't.
\regexp{|no-pattern} is optional and can be omitted. For example,
\regexp{(<)?(\e w+@\e w+(?:\e .\e w+)+)(?(1)>)} is a poor email matching
pattern, which will match \code{'<user@host.com>'} as well as
\code{'user@host.com'}, but not \code{'<user@host.com'}.
\versionadded{2.4}
\end{list}
The special sequences consist of \character{\e} and a character from the

View File

@ -145,6 +145,19 @@ def _compile(code, pattern, flags):
else:
emit(OPCODES[op])
emit(av-1)
elif op is GROUPREF_EXISTS:
emit(OPCODES[op])
emit((av[0]-1)*2)
skipyes = len(code); emit(0)
_compile(code, av[1], flags)
if av[2]:
emit(OPCODES[JUMP])
skipno = len(code); emit(0)
code[skipyes] = len(code) - skipyes + 1
_compile(code, av[2], flags)
code[skipno] = len(code) - skipno
else:
code[skipyes] = len(code) - skipyes + 1
else:
raise ValueError, ("unsupported operand type", op)

View File

@ -13,7 +13,7 @@
# update when constants are added or removed
MAGIC = 20030419
MAGIC = 20031017
# max code word in this release
@ -42,6 +42,7 @@ CATEGORY = "category"
CHARSET = "charset"
GROUPREF = "groupref"
GROUPREF_IGNORE = "groupref_ignore"
GROUPREF_EXISTS = "groupref_exists"
IN = "in"
IN_IGNORE = "in_ignore"
INFO = "info"
@ -108,7 +109,7 @@ OPCODES = [
CALL,
CATEGORY,
CHARSET, BIGCHARSET,
GROUPREF, GROUPREF_IGNORE,
GROUPREF, GROUPREF_EXISTS, GROUPREF_IGNORE,
IN, IN_IGNORE,
INFO,
JUMP,

View File

@ -364,6 +364,20 @@ def _parse_sub(source, state, nested=1):
subpattern.append((BRANCH, (None, items)))
return subpattern
def _parse_sub_cond(source, state, condgroup):
    """Parse the branches of a conditional backreference group.

    Parses the "yes" branch with _parse() and, if a "|" follows, the
    "no" branch as well; without a "|" the "no" branch is None.

    Returns a SubPattern holding a single
    (GROUPREF_EXISTS, (condgroup, yes_branch, no_branch)) item.

    Raises error if a second "|" follows the "no" branch, or if input
    remains and the check for ")" fails.
    """
    yes_branch = _parse(source, state)
    no_branch = None
    if source.match("|"):
        no_branch = _parse(source, state)
        # Only "yes|no" is allowed; a further "|" is rejected.
        if source.match("|"):
            raise error("conditional backref with more than two branches")
    if source.next:
        # NOTE(review): the second argument 0 presumably makes match() test
        # for ")" without consuming it -- confirm against the tokenizer.
        if not source.match(")", 0):
            raise error("pattern not properly closed")
    result = SubPattern(state)
    result.append((GROUPREF_EXISTS, (condgroup, yes_branch, no_branch)))
    return result
def _parse(source, state):
# parse a simple pattern
@ -499,6 +513,7 @@ def _parse(source, state):
elif this == "(":
group = 1
name = None
condgroup = None
if source.match("?"):
group = 0
# options
@ -568,6 +583,26 @@ def _parse(source, state):
else:
subpattern.append((ASSERT_NOT, (dir, p)))
continue
elif source.match("("):
# conditional backreference group
condname = ""
while 1:
char = source.get()
if char is None:
raise error, "unterminated name"
if char == ")":
break
condname = condname + char
group = 2
if isname(condname):
condgroup = state.groupdict.get(condname)
if condgroup is None:
raise error, "unknown group name"
else:
try:
condgroup = atoi(condname)
except ValueError:
raise error, "bad character in group name"
else:
# flags
if not source.next in FLAGS:
@ -581,6 +616,9 @@ def _parse(source, state):
group = None
else:
group = state.opengroup(name)
if condgroup:
p = _parse_sub_cond(source, state, condgroup)
else:
p = _parse_sub(source, state)
if not source.match(")"):
raise error, "unbalanced parenthesis"

View File

@ -169,7 +169,6 @@ class ReTests(unittest.TestCase):
self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
def test_re_groupref_exists(self):
return # not yet
self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
('(', 'a'))
self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
@ -405,19 +404,20 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
20003)
self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
# non-simple '*?' still recurses and hits the recursion limit
self.assertRaises(RuntimeError, re.search, '(a|b)*?c', 10000*'ab'+'cd')
# non-simple '*?' used to hit the recursion limit, before the
# non-recursive scheme was implemented.
self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
def test_bug_612074(self):
# Regression check (bug number taken from the test name): a character
# class wrapping the escaped character u"\u2039" must compile.
pat=u"["+re.escape(u"\u2039")+u"]"
# "and 1" collapses a truthy compiled pattern object to 1 for comparison.
self.assertEqual(re.compile(pat) and 1, 1)
def test_stack_overflow(self):
# nasty case that overflows the straightforward recursive
# nasty cases that used to overflow the straightforward recursive
# implementation of repeated groups.
self.assertRaises(RuntimeError, re.match, '(x)*', 50000*'x')
self.assertRaises(RuntimeError, re.match, '(x)*y', 50000*'x'+'y')
self.assertRaises(RuntimeError, re.match, '(x)*?y', 50000*'x'+'y')
self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
def test_scanner(self):
def s_ident(scanner, token): return token

View File

@ -61,6 +61,10 @@ Extension modules
- Bug #814613: INET_ADDRSTRLEN fix needed for all compilers on SGI
- Implemented non-recursive SRE matching scheme (#757624).
- Implemented (?(id/name)yes|no) support in SRE (#572936).
Library
-------

File diff suppressed because it is too large Load Diff

View File

@ -55,6 +55,7 @@ typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch);
/* One repeat context; contexts are linked to their predecessor via prev. */
typedef struct SRE_REPEAT_T {
int count; /* NOTE(review): presumably the iteration counter for this repeat -- confirm in _sre.c */
SRE_CODE* pattern; /* points to REPEAT operator arguments */
void* last_ptr; /* helper to check for infinite loops */
struct SRE_REPEAT_T *prev; /* points to previous repeat context */
} SRE_REPEAT;
@ -74,10 +75,11 @@ typedef struct {
int lastmark;
void* mark[SRE_MARK_SIZE];
/* dynamically allocated stuff */
void** mark_stack;
int mark_stack_size;
int mark_stack_base;
SRE_REPEAT *repeat; /* current repeat context */
char* data_stack;
int data_stack_size;
int data_stack_base;
/* current repeat context */
SRE_REPEAT *repeat;
/* hooks */
SRE_TOLOWER_HOOK lower;
} SRE_STATE;

View File

@ -11,7 +11,7 @@
* See the _sre.c file for information on usage and redistribution.
*/
#define SRE_MAGIC 20030419
#define SRE_MAGIC 20031017
#define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2
@ -25,24 +25,25 @@
#define SRE_OP_CHARSET 10
#define SRE_OP_BIGCHARSET 11
#define SRE_OP_GROUPREF 12
#define SRE_OP_GROUPREF_IGNORE 13
#define SRE_OP_IN 14
#define SRE_OP_IN_IGNORE 15
#define SRE_OP_INFO 16
#define SRE_OP_JUMP 17
#define SRE_OP_LITERAL 18
#define SRE_OP_LITERAL_IGNORE 19
#define SRE_OP_MARK 20
#define SRE_OP_MAX_UNTIL 21
#define SRE_OP_MIN_UNTIL 22
#define SRE_OP_NOT_LITERAL 23
#define SRE_OP_NOT_LITERAL_IGNORE 24
#define SRE_OP_NEGATE 25
#define SRE_OP_RANGE 26
#define SRE_OP_REPEAT 27
#define SRE_OP_REPEAT_ONE 28
#define SRE_OP_SUBPATTERN 29
#define SRE_OP_MIN_REPEAT_ONE 30
#define SRE_OP_GROUPREF_EXISTS 13
#define SRE_OP_GROUPREF_IGNORE 14
#define SRE_OP_IN 15
#define SRE_OP_IN_IGNORE 16
#define SRE_OP_INFO 17
#define SRE_OP_JUMP 18
#define SRE_OP_LITERAL 19
#define SRE_OP_LITERAL_IGNORE 20
#define SRE_OP_MARK 21
#define SRE_OP_MAX_UNTIL 22
#define SRE_OP_MIN_UNTIL 23
#define SRE_OP_NOT_LITERAL 24
#define SRE_OP_NOT_LITERAL_IGNORE 25
#define SRE_OP_NEGATE 26
#define SRE_OP_RANGE 27
#define SRE_OP_REPEAT 28
#define SRE_OP_REPEAT_ONE 29
#define SRE_OP_SUBPATTERN 30
#define SRE_OP_MIN_REPEAT_ONE 31
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BEGINNING_STRING 2