Implemented non-recursive SRE matching.
This commit is contained in:
parent
41e2809feb
commit
ad3fc44ccb
|
@ -297,6 +297,15 @@ assertions, the contained pattern must only match strings of some
|
||||||
fixed length. Patterns which start with negative lookbehind
|
fixed length. Patterns which start with negative lookbehind
|
||||||
assertions may match at the beginning of the string being searched.
|
assertions may match at the beginning of the string being searched.
|
||||||
|
|
||||||
|
\item[\code{(?(\var{id/name})yes-pattern|no-pattern)}] Will try to match
|
||||||
|
with \regexp{yes-pattern} if the group with given \var{id} or \var{name}
|
||||||
|
exists, and with \regexp{no-pattern} if it doesn't. \regexp{|no-pattern}
|
||||||
|
is optional and can be omitted. For example,
|
||||||
|
\regexp{(<)?(\e w+@\e w+(?:\e .\e w+)+)(?(1)>)} is a poor email matching
|
||||||
|
pattern, which will match with \code{'<user@host.com>'} as well as
|
||||||
|
\code{'user@host.com'}, but not with \code{'<user@host.com'}.
|
||||||
|
\versionadded{2.4}
|
||||||
|
|
||||||
\end{list}
|
\end{list}
|
||||||
|
|
||||||
The special sequences consist of \character{\e} and a character from the
|
The special sequences consist of \character{\e} and a character from the
|
||||||
|
|
|
@ -145,6 +145,19 @@ def _compile(code, pattern, flags):
|
||||||
else:
|
else:
|
||||||
emit(OPCODES[op])
|
emit(OPCODES[op])
|
||||||
emit(av-1)
|
emit(av-1)
|
||||||
|
elif op is GROUPREF_EXISTS:
|
||||||
|
emit(OPCODES[op])
|
||||||
|
emit((av[0]-1)*2)
|
||||||
|
skipyes = len(code); emit(0)
|
||||||
|
_compile(code, av[1], flags)
|
||||||
|
if av[2]:
|
||||||
|
emit(OPCODES[JUMP])
|
||||||
|
skipno = len(code); emit(0)
|
||||||
|
code[skipyes] = len(code) - skipyes + 1
|
||||||
|
_compile(code, av[2], flags)
|
||||||
|
code[skipno] = len(code) - skipno
|
||||||
|
else:
|
||||||
|
code[skipyes] = len(code) - skipyes + 1
|
||||||
else:
|
else:
|
||||||
raise ValueError, ("unsupported operand type", op)
|
raise ValueError, ("unsupported operand type", op)
|
||||||
|
|
||||||
|
|
|
@ -13,7 +13,7 @@
|
||||||
|
|
||||||
# update when constants are added or removed
|
# update when constants are added or removed
|
||||||
|
|
||||||
MAGIC = 20030419
|
MAGIC = 20031017
|
||||||
|
|
||||||
# max code word in this release
|
# max code word in this release
|
||||||
|
|
||||||
|
@ -42,6 +42,7 @@ CATEGORY = "category"
|
||||||
CHARSET = "charset"
|
CHARSET = "charset"
|
||||||
GROUPREF = "groupref"
|
GROUPREF = "groupref"
|
||||||
GROUPREF_IGNORE = "groupref_ignore"
|
GROUPREF_IGNORE = "groupref_ignore"
|
||||||
|
GROUPREF_EXISTS = "groupref_exists"
|
||||||
IN = "in"
|
IN = "in"
|
||||||
IN_IGNORE = "in_ignore"
|
IN_IGNORE = "in_ignore"
|
||||||
INFO = "info"
|
INFO = "info"
|
||||||
|
@ -108,7 +109,7 @@ OPCODES = [
|
||||||
CALL,
|
CALL,
|
||||||
CATEGORY,
|
CATEGORY,
|
||||||
CHARSET, BIGCHARSET,
|
CHARSET, BIGCHARSET,
|
||||||
GROUPREF, GROUPREF_IGNORE,
|
GROUPREF, GROUPREF_EXISTS, GROUPREF_IGNORE,
|
||||||
IN, IN_IGNORE,
|
IN, IN_IGNORE,
|
||||||
INFO,
|
INFO,
|
||||||
JUMP,
|
JUMP,
|
||||||
|
|
|
@ -364,6 +364,20 @@ def _parse_sub(source, state, nested=1):
|
||||||
subpattern.append((BRANCH, (None, items)))
|
subpattern.append((BRANCH, (None, items)))
|
||||||
return subpattern
|
return subpattern
|
||||||
|
|
||||||
|
def _parse_sub_cond(source, state, condgroup):
    """Parse the branches of a conditional backreference group.

    Called after the ``(?(id/name)`` header has been consumed; reads the
    "yes" pattern and, if a "|" follows, the "no" pattern.

    Arguments:
        source: the tokenizer the pattern is being read from.
        state: the parser state, handed on to ``_parse`` and ``SubPattern``.
        condgroup: the group the condition tests, stored as given in the
            emitted item.

    Returns a new SubPattern holding a single
    ``(GROUPREF_EXISTS, (condgroup, yes, no))`` item, where ``no`` is None
    when the pattern has no "no" branch.

    Raises ``error`` if a third branch is present, or if input remains and
    the next token is not ")".
    """
    yes_branch = _parse(source, state)
    no_branch = None  # stays None unless a "|" introduces a second branch
    if source.match("|"):
        no_branch = _parse(source, state)
        if source.match("|"):
            raise error("conditional backref with more than two branches")
    # NOTE(review): the second argument 0 presumably makes match() test for
    # ")" without consuming it, leaving it for the caller -- confirm against
    # the tokenizer's match() definition.
    if source.next and not source.match(")", 0):
        raise error("pattern not properly closed")
    result = SubPattern(state)
    result.append((GROUPREF_EXISTS, (condgroup, yes_branch, no_branch)))
    return result
|
||||||
|
|
||||||
def _parse(source, state):
|
def _parse(source, state):
|
||||||
# parse a simple pattern
|
# parse a simple pattern
|
||||||
|
|
||||||
|
@ -499,6 +513,7 @@ def _parse(source, state):
|
||||||
elif this == "(":
|
elif this == "(":
|
||||||
group = 1
|
group = 1
|
||||||
name = None
|
name = None
|
||||||
|
condgroup = None
|
||||||
if source.match("?"):
|
if source.match("?"):
|
||||||
group = 0
|
group = 0
|
||||||
# options
|
# options
|
||||||
|
@ -568,6 +583,26 @@ def _parse(source, state):
|
||||||
else:
|
else:
|
||||||
subpattern.append((ASSERT_NOT, (dir, p)))
|
subpattern.append((ASSERT_NOT, (dir, p)))
|
||||||
continue
|
continue
|
||||||
|
elif source.match("("):
|
||||||
|
# conditional backreference group
|
||||||
|
condname = ""
|
||||||
|
while 1:
|
||||||
|
char = source.get()
|
||||||
|
if char is None:
|
||||||
|
raise error, "unterminated name"
|
||||||
|
if char == ")":
|
||||||
|
break
|
||||||
|
condname = condname + char
|
||||||
|
group = 2
|
||||||
|
if isname(condname):
|
||||||
|
condgroup = state.groupdict.get(condname)
|
||||||
|
if condgroup is None:
|
||||||
|
raise error, "unknown group name"
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
condgroup = atoi(condname)
|
||||||
|
except ValueError:
|
||||||
|
raise error, "bad character in group name"
|
||||||
else:
|
else:
|
||||||
# flags
|
# flags
|
||||||
if not source.next in FLAGS:
|
if not source.next in FLAGS:
|
||||||
|
@ -581,7 +616,10 @@ def _parse(source, state):
|
||||||
group = None
|
group = None
|
||||||
else:
|
else:
|
||||||
group = state.opengroup(name)
|
group = state.opengroup(name)
|
||||||
p = _parse_sub(source, state)
|
if condgroup:
|
||||||
|
p = _parse_sub_cond(source, state, condgroup)
|
||||||
|
else:
|
||||||
|
p = _parse_sub(source, state)
|
||||||
if not source.match(")"):
|
if not source.match(")"):
|
||||||
raise error, "unbalanced parenthesis"
|
raise error, "unbalanced parenthesis"
|
||||||
if group is not None:
|
if group is not None:
|
||||||
|
|
|
@ -169,7 +169,6 @@ class ReTests(unittest.TestCase):
|
||||||
self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
|
self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
|
||||||
|
|
||||||
def test_re_groupref_exists(self):
|
def test_re_groupref_exists(self):
|
||||||
return # not yet
|
|
||||||
self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
|
self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
|
||||||
('(', 'a'))
|
('(', 'a'))
|
||||||
self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
|
self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
|
||||||
|
@ -405,19 +404,20 @@ class ReTests(unittest.TestCase):
|
||||||
self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
|
self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
|
||||||
20003)
|
20003)
|
||||||
self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
|
self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
|
||||||
# non-simple '*?' still recurses and hits the recursion limit
|
# non-simple '*?' used to hit the recursion limit, before the
|
||||||
self.assertRaises(RuntimeError, re.search, '(a|b)*?c', 10000*'ab'+'cd')
|
# non-recursive scheme was implemented.
|
||||||
|
self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
|
||||||
|
|
||||||
def test_bug_612074(self):
|
def test_bug_612074(self):
|
||||||
pat=u"["+re.escape(u"\u2039")+u"]"
|
pat=u"["+re.escape(u"\u2039")+u"]"
|
||||||
self.assertEqual(re.compile(pat) and 1, 1)
|
self.assertEqual(re.compile(pat) and 1, 1)
|
||||||
|
|
||||||
def test_stack_overflow(self):
|
def test_stack_overflow(self):
|
||||||
# nasty case that overflows the straightforward recursive
|
# nasty cases that used to overflow the straightforward recursive
|
||||||
# implementation of repeated groups.
|
# implementation of repeated groups.
|
||||||
self.assertRaises(RuntimeError, re.match, '(x)*', 50000*'x')
|
self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
|
||||||
self.assertRaises(RuntimeError, re.match, '(x)*y', 50000*'x'+'y')
|
self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
|
||||||
self.assertRaises(RuntimeError, re.match, '(x)*?y', 50000*'x'+'y')
|
self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
|
||||||
|
|
||||||
def test_scanner(self):
|
def test_scanner(self):
|
||||||
def s_ident(scanner, token): return token
|
def s_ident(scanner, token): return token
|
||||||
|
|
|
@ -61,6 +61,10 @@ Extension modules
|
||||||
|
|
||||||
- Bug #814613: INET_ADDRSTRLEN fix needed for all compilers on SGI
|
- Bug #814613: INET_ADDRSTRLEN fix needed for all compilers on SGI
|
||||||
|
|
||||||
|
- Implemented non-recursive SRE matching scheme (#757624).
|
||||||
|
|
||||||
|
- Implemented (?(id/name)yes|no) support in SRE (#572936).
|
||||||
|
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
|
1184
Modules/_sre.c
1184
Modules/_sre.c
File diff suppressed because it is too large
Load Diff
|
@ -55,6 +55,7 @@ typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch);
|
||||||
typedef struct SRE_REPEAT_T {
|
typedef struct SRE_REPEAT_T {
|
||||||
int count;
|
int count;
|
||||||
SRE_CODE* pattern; /* points to REPEAT operator arguments */
|
SRE_CODE* pattern; /* points to REPEAT operator arguments */
|
||||||
|
void* last_ptr; /* helper to check for infinite loops */
|
||||||
struct SRE_REPEAT_T *prev; /* points to previous repeat context */
|
struct SRE_REPEAT_T *prev; /* points to previous repeat context */
|
||||||
} SRE_REPEAT;
|
} SRE_REPEAT;
|
||||||
|
|
||||||
|
@ -74,10 +75,11 @@ typedef struct {
|
||||||
int lastmark;
|
int lastmark;
|
||||||
void* mark[SRE_MARK_SIZE];
|
void* mark[SRE_MARK_SIZE];
|
||||||
/* dynamically allocated stuff */
|
/* dynamically allocated stuff */
|
||||||
void** mark_stack;
|
char* data_stack;
|
||||||
int mark_stack_size;
|
int data_stack_size;
|
||||||
int mark_stack_base;
|
int data_stack_base;
|
||||||
SRE_REPEAT *repeat; /* current repeat context */
|
/* current repeat context */
|
||||||
|
SRE_REPEAT *repeat;
|
||||||
/* hooks */
|
/* hooks */
|
||||||
SRE_TOLOWER_HOOK lower;
|
SRE_TOLOWER_HOOK lower;
|
||||||
} SRE_STATE;
|
} SRE_STATE;
|
||||||
|
|
|
@ -11,7 +11,7 @@
|
||||||
* See the _sre.c file for information on usage and redistribution.
|
* See the _sre.c file for information on usage and redistribution.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#define SRE_MAGIC 20030419
|
#define SRE_MAGIC 20031017
|
||||||
#define SRE_OP_FAILURE 0
|
#define SRE_OP_FAILURE 0
|
||||||
#define SRE_OP_SUCCESS 1
|
#define SRE_OP_SUCCESS 1
|
||||||
#define SRE_OP_ANY 2
|
#define SRE_OP_ANY 2
|
||||||
|
@ -25,24 +25,25 @@
|
||||||
#define SRE_OP_CHARSET 10
|
#define SRE_OP_CHARSET 10
|
||||||
#define SRE_OP_BIGCHARSET 11
|
#define SRE_OP_BIGCHARSET 11
|
||||||
#define SRE_OP_GROUPREF 12
|
#define SRE_OP_GROUPREF 12
|
||||||
#define SRE_OP_GROUPREF_IGNORE 13
|
#define SRE_OP_GROUPREF_EXISTS 13
|
||||||
#define SRE_OP_IN 14
|
#define SRE_OP_GROUPREF_IGNORE 14
|
||||||
#define SRE_OP_IN_IGNORE 15
|
#define SRE_OP_IN 15
|
||||||
#define SRE_OP_INFO 16
|
#define SRE_OP_IN_IGNORE 16
|
||||||
#define SRE_OP_JUMP 17
|
#define SRE_OP_INFO 17
|
||||||
#define SRE_OP_LITERAL 18
|
#define SRE_OP_JUMP 18
|
||||||
#define SRE_OP_LITERAL_IGNORE 19
|
#define SRE_OP_LITERAL 19
|
||||||
#define SRE_OP_MARK 20
|
#define SRE_OP_LITERAL_IGNORE 20
|
||||||
#define SRE_OP_MAX_UNTIL 21
|
#define SRE_OP_MARK 21
|
||||||
#define SRE_OP_MIN_UNTIL 22
|
#define SRE_OP_MAX_UNTIL 22
|
||||||
#define SRE_OP_NOT_LITERAL 23
|
#define SRE_OP_MIN_UNTIL 23
|
||||||
#define SRE_OP_NOT_LITERAL_IGNORE 24
|
#define SRE_OP_NOT_LITERAL 24
|
||||||
#define SRE_OP_NEGATE 25
|
#define SRE_OP_NOT_LITERAL_IGNORE 25
|
||||||
#define SRE_OP_RANGE 26
|
#define SRE_OP_NEGATE 26
|
||||||
#define SRE_OP_REPEAT 27
|
#define SRE_OP_RANGE 27
|
||||||
#define SRE_OP_REPEAT_ONE 28
|
#define SRE_OP_REPEAT 28
|
||||||
#define SRE_OP_SUBPATTERN 29
|
#define SRE_OP_REPEAT_ONE 29
|
||||||
#define SRE_OP_MIN_REPEAT_ONE 30
|
#define SRE_OP_SUBPATTERN 30
|
||||||
|
#define SRE_OP_MIN_REPEAT_ONE 31
|
||||||
#define SRE_AT_BEGINNING 0
|
#define SRE_AT_BEGINNING 0
|
||||||
#define SRE_AT_BEGINNING_LINE 1
|
#define SRE_AT_BEGINNING_LINE 1
|
||||||
#define SRE_AT_BEGINNING_STRING 2
|
#define SRE_AT_BEGINNING_STRING 2
|
||||||
|
|
Loading…
Reference in New Issue