mirror of https://github.com/python/cpython
Implemented non-recursive SRE matching.
This commit is contained in:
parent
41e2809feb
commit
ad3fc44ccb
|
@ -297,6 +297,15 @@ assertions, the contained pattern must only match strings of some
|
|||
fixed length. Patterns which start with negative lookbehind
|
||||
assertions may match at the beginning of the string being searched.
|
||||
|
||||
\item[\code{(?(\var{id/name})yes-pattern|no-pattern)}] Will try to match
|
||||
with \regexp{yes-pattern} if the group with the given \var{id} or \var{name}
|
||||
exists, and with \regexp{no-pattern} if it doesn't. \regexp{|no-pattern}
|
||||
is optional and can be omitted. For example,
|
||||
\regexp{(<)?(\e w+@\e w+(?:\e .\e w+)+)(?(1)>)} is a poor email matching
|
||||
pattern, which will match with \code{'<user@host.com>'} as well as
|
||||
\code{'user@host.com'}, but not with \code{'<user@host.com'}.
|
||||
\versionadded{2.4}
|
||||
|
||||
\end{list}
|
||||
|
||||
The special sequences consist of \character{\e} and a character from the
|
||||
|
|
|
@ -145,6 +145,19 @@ def _compile(code, pattern, flags):
|
|||
else:
|
||||
emit(OPCODES[op])
|
||||
emit(av-1)
|
||||
elif op is GROUPREF_EXISTS:
|
||||
emit(OPCODES[op])
|
||||
emit((av[0]-1)*2)
|
||||
skipyes = len(code); emit(0)
|
||||
_compile(code, av[1], flags)
|
||||
if av[2]:
|
||||
emit(OPCODES[JUMP])
|
||||
skipno = len(code); emit(0)
|
||||
code[skipyes] = len(code) - skipyes + 1
|
||||
_compile(code, av[2], flags)
|
||||
code[skipno] = len(code) - skipno
|
||||
else:
|
||||
code[skipyes] = len(code) - skipyes + 1
|
||||
else:
|
||||
raise ValueError, ("unsupported operand type", op)
|
||||
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
|
||||
# update when constants are added or removed
|
||||
|
||||
MAGIC = 20030419
|
||||
MAGIC = 20031017
|
||||
|
||||
# max code word in this release
|
||||
|
||||
|
@ -42,6 +42,7 @@ CATEGORY = "category"
|
|||
CHARSET = "charset"
|
||||
GROUPREF = "groupref"
|
||||
GROUPREF_IGNORE = "groupref_ignore"
|
||||
GROUPREF_EXISTS = "groupref_exists"
|
||||
IN = "in"
|
||||
IN_IGNORE = "in_ignore"
|
||||
INFO = "info"
|
||||
|
@ -108,7 +109,7 @@ OPCODES = [
|
|||
CALL,
|
||||
CATEGORY,
|
||||
CHARSET, BIGCHARSET,
|
||||
GROUPREF, GROUPREF_IGNORE,
|
||||
GROUPREF, GROUPREF_EXISTS, GROUPREF_IGNORE,
|
||||
IN, IN_IGNORE,
|
||||
INFO,
|
||||
JUMP,
|
||||
|
|
|
@ -364,6 +364,20 @@ def _parse_sub(source, state, nested=1):
|
|||
subpattern.append((BRANCH, (None, items)))
|
||||
return subpattern
|
||||
|
||||
def _parse_sub_cond(source, state, condgroup):
|
||||
item_yes = _parse(source, state)
|
||||
if source.match("|"):
|
||||
item_no = _parse(source, state)
|
||||
if source.match("|"):
|
||||
raise error, "conditional backref with more than two branches"
|
||||
else:
|
||||
item_no = None
|
||||
if source.next and not source.match(")", 0):
|
||||
raise error, "pattern not properly closed"
|
||||
subpattern = SubPattern(state)
|
||||
subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
|
||||
return subpattern
|
||||
|
||||
def _parse(source, state):
|
||||
# parse a simple pattern
|
||||
|
||||
|
@ -499,6 +513,7 @@ def _parse(source, state):
|
|||
elif this == "(":
|
||||
group = 1
|
||||
name = None
|
||||
condgroup = None
|
||||
if source.match("?"):
|
||||
group = 0
|
||||
# options
|
||||
|
@ -568,6 +583,26 @@ def _parse(source, state):
|
|||
else:
|
||||
subpattern.append((ASSERT_NOT, (dir, p)))
|
||||
continue
|
||||
elif source.match("("):
|
||||
# conditional backreference group
|
||||
condname = ""
|
||||
while 1:
|
||||
char = source.get()
|
||||
if char is None:
|
||||
raise error, "unterminated name"
|
||||
if char == ")":
|
||||
break
|
||||
condname = condname + char
|
||||
group = 2
|
||||
if isname(condname):
|
||||
condgroup = state.groupdict.get(condname)
|
||||
if condgroup is None:
|
||||
raise error, "unknown group name"
|
||||
else:
|
||||
try:
|
||||
condgroup = atoi(condname)
|
||||
except ValueError:
|
||||
raise error, "bad character in group name"
|
||||
else:
|
||||
# flags
|
||||
if not source.next in FLAGS:
|
||||
|
@ -581,7 +616,10 @@ def _parse(source, state):
|
|||
group = None
|
||||
else:
|
||||
group = state.opengroup(name)
|
||||
p = _parse_sub(source, state)
|
||||
if condgroup:
|
||||
p = _parse_sub_cond(source, state, condgroup)
|
||||
else:
|
||||
p = _parse_sub(source, state)
|
||||
if not source.match(")"):
|
||||
raise error, "unbalanced parenthesis"
|
||||
if group is not None:
|
||||
|
|
|
@ -169,7 +169,6 @@ class ReTests(unittest.TestCase):
|
|||
self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
|
||||
|
||||
def test_re_groupref_exists(self):
|
||||
return # not yet
|
||||
self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
|
||||
('(', 'a'))
|
||||
self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
|
||||
|
@ -405,19 +404,20 @@ class ReTests(unittest.TestCase):
|
|||
self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
|
||||
20003)
|
||||
self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
|
||||
# non-simple '*?' still recurses and hits the recursion limit
|
||||
self.assertRaises(RuntimeError, re.search, '(a|b)*?c', 10000*'ab'+'cd')
|
||||
# non-simple '*?' used to hit the recursion limit, before the
|
||||
# non-recursive scheme was implemented.
|
||||
self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
|
||||
|
||||
def test_bug_612074(self):
|
||||
pat=u"["+re.escape(u"\u2039")+u"]"
|
||||
self.assertEqual(re.compile(pat) and 1, 1)
|
||||
|
||||
def test_stack_overflow(self):
|
||||
# nasty case that overflows the straightforward recursive
|
||||
# nasty cases that used to overflow the straightforward recursive
|
||||
# implementation of repeated groups.
|
||||
self.assertRaises(RuntimeError, re.match, '(x)*', 50000*'x')
|
||||
self.assertRaises(RuntimeError, re.match, '(x)*y', 50000*'x'+'y')
|
||||
self.assertRaises(RuntimeError, re.match, '(x)*?y', 50000*'x'+'y')
|
||||
self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
|
||||
self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
|
||||
self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
|
||||
|
||||
def test_scanner(self):
|
||||
def s_ident(scanner, token): return token
|
||||
|
|
|
@ -61,6 +61,10 @@ Extension modules
|
|||
|
||||
- Bug #814613: INET_ADDRSTRLEN fix needed for all compilers on SGI
|
||||
|
||||
- Implemented non-recursive SRE matching scheme (#757624).
|
||||
|
||||
- Implemented (?(id/name)yes|no) support in SRE (#572936).
|
||||
|
||||
Library
|
||||
-------
|
||||
|
||||
|
|
1184
Modules/_sre.c
1184
Modules/_sre.c
File diff suppressed because it is too large
Load Diff
|
@ -55,6 +55,7 @@ typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch);
|
|||
typedef struct SRE_REPEAT_T {
|
||||
int count;
|
||||
SRE_CODE* pattern; /* points to REPEAT operator arguments */
|
||||
void* last_ptr; /* helper to check for infinite loops */
|
||||
struct SRE_REPEAT_T *prev; /* points to previous repeat context */
|
||||
} SRE_REPEAT;
|
||||
|
||||
|
@ -74,10 +75,11 @@ typedef struct {
|
|||
int lastmark;
|
||||
void* mark[SRE_MARK_SIZE];
|
||||
/* dynamically allocated stuff */
|
||||
void** mark_stack;
|
||||
int mark_stack_size;
|
||||
int mark_stack_base;
|
||||
SRE_REPEAT *repeat; /* current repeat context */
|
||||
char* data_stack;
|
||||
int data_stack_size;
|
||||
int data_stack_base;
|
||||
/* current repeat context */
|
||||
SRE_REPEAT *repeat;
|
||||
/* hooks */
|
||||
SRE_TOLOWER_HOOK lower;
|
||||
} SRE_STATE;
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
* See the _sre.c file for information on usage and redistribution.
|
||||
*/
|
||||
|
||||
#define SRE_MAGIC 20030419
|
||||
#define SRE_MAGIC 20031017
|
||||
#define SRE_OP_FAILURE 0
|
||||
#define SRE_OP_SUCCESS 1
|
||||
#define SRE_OP_ANY 2
|
||||
|
@ -25,24 +25,25 @@
|
|||
#define SRE_OP_CHARSET 10
|
||||
#define SRE_OP_BIGCHARSET 11
|
||||
#define SRE_OP_GROUPREF 12
|
||||
#define SRE_OP_GROUPREF_IGNORE 13
|
||||
#define SRE_OP_IN 14
|
||||
#define SRE_OP_IN_IGNORE 15
|
||||
#define SRE_OP_INFO 16
|
||||
#define SRE_OP_JUMP 17
|
||||
#define SRE_OP_LITERAL 18
|
||||
#define SRE_OP_LITERAL_IGNORE 19
|
||||
#define SRE_OP_MARK 20
|
||||
#define SRE_OP_MAX_UNTIL 21
|
||||
#define SRE_OP_MIN_UNTIL 22
|
||||
#define SRE_OP_NOT_LITERAL 23
|
||||
#define SRE_OP_NOT_LITERAL_IGNORE 24
|
||||
#define SRE_OP_NEGATE 25
|
||||
#define SRE_OP_RANGE 26
|
||||
#define SRE_OP_REPEAT 27
|
||||
#define SRE_OP_REPEAT_ONE 28
|
||||
#define SRE_OP_SUBPATTERN 29
|
||||
#define SRE_OP_MIN_REPEAT_ONE 30
|
||||
#define SRE_OP_GROUPREF_EXISTS 13
|
||||
#define SRE_OP_GROUPREF_IGNORE 14
|
||||
#define SRE_OP_IN 15
|
||||
#define SRE_OP_IN_IGNORE 16
|
||||
#define SRE_OP_INFO 17
|
||||
#define SRE_OP_JUMP 18
|
||||
#define SRE_OP_LITERAL 19
|
||||
#define SRE_OP_LITERAL_IGNORE 20
|
||||
#define SRE_OP_MARK 21
|
||||
#define SRE_OP_MAX_UNTIL 22
|
||||
#define SRE_OP_MIN_UNTIL 23
|
||||
#define SRE_OP_NOT_LITERAL 24
|
||||
#define SRE_OP_NOT_LITERAL_IGNORE 25
|
||||
#define SRE_OP_NEGATE 26
|
||||
#define SRE_OP_RANGE 27
|
||||
#define SRE_OP_REPEAT 28
|
||||
#define SRE_OP_REPEAT_ONE 29
|
||||
#define SRE_OP_SUBPATTERN 30
|
||||
#define SRE_OP_MIN_REPEAT_ONE 31
|
||||
#define SRE_AT_BEGINNING 0
|
||||
#define SRE_AT_BEGINNING_LINE 1
|
||||
#define SRE_AT_BEGINNING_STRING 2
|
||||
|
|
Loading…
Reference in New Issue