Issue #22364: Improved some re error messages using regex for hints.

This commit is contained in:
Serhiy Storchaka 2015-03-25 21:03:47 +02:00
parent 7c316a181a
commit 632a77e6a3
6 changed files with 299 additions and 195 deletions

View File

@ -286,7 +286,7 @@ def _compile(pattern, flags):
if isinstance(pattern, _pattern_type):
if flags:
raise ValueError(
"Cannot process flags argument with a compiled pattern")
"cannot process flags argument with a compiled pattern")
return pattern
if not sre_compile.isstring(pattern):
raise TypeError("first argument must be string or compiled pattern")

View File

@ -113,7 +113,7 @@ def _compile(code, pattern, flags):
emit(ANY)
elif op in REPEATING_CODES:
if flags & SRE_FLAG_TEMPLATE:
raise error("internal: unsupported template operator")
raise error("internal: unsupported template operator %r" % (op,))
elif _simple(av) and op is not REPEAT:
if op is MAX_REPEAT:
emit(REPEAT_ONE)
@ -216,7 +216,7 @@ def _compile(code, pattern, flags):
else:
code[skipyes] = _len(code) - skipyes + 1
else:
raise ValueError("unsupported operand type", op)
raise error("internal: unsupported operand type %r" % (op,))
def _compile_charset(charset, flags, code, fixup=None, fixes=None):
# compile charset subprogram
@ -242,7 +242,7 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None):
else:
emit(av)
else:
raise error("internal: unsupported set operator")
raise error("internal: unsupported set operator %r" % (op,))
emit(FAILURE)
def _optimize_charset(charset, fixup, fixes):

View File

@ -79,7 +79,7 @@ class Pattern:
gid = self.groups
self.subpatterns.append(None)
if self.groups > MAXGROUPS:
raise error("groups number is too large")
raise error("too many groups")
if name is not None:
ogid = self.groupdict.get(name, None)
if ogid is not None:
@ -235,7 +235,7 @@ class Tokenizer:
try:
char += self.decoded_string[index]
except IndexError:
raise error("bogus escape (end of line)",
raise error("bad escape (end of pattern)",
self.string, len(self.string) - 1) from None
self.index = index + 1
self.next = char
@ -263,8 +263,13 @@ class Tokenizer:
c = self.next
self.__next()
if c is None:
raise self.error("unterminated name")
if not result:
raise self.error("missing group name")
raise self.error("missing %s, unterminated name" % terminator,
len(result))
if c == terminator:
if not result:
raise self.error("missing group name", 1)
break
result += c
return result
@ -318,19 +323,19 @@ def _class_escape(source, escape):
# hexadecimal escape (exactly two digits)
escape += source.getwhile(2, HEXDIGITS)
if len(escape) != 4:
raise ValueError
raise source.error("incomplete escape %s" % escape, len(escape))
return LITERAL, int(escape[2:], 16)
elif c == "u" and source.istext:
# unicode escape (exactly four digits)
escape += source.getwhile(4, HEXDIGITS)
if len(escape) != 6:
raise ValueError
raise source.error("incomplete escape %s" % escape, len(escape))
return LITERAL, int(escape[2:], 16)
elif c == "U" and source.istext:
# unicode escape (exactly eight digits)
escape += source.getwhile(8, HEXDIGITS)
if len(escape) != 10:
raise ValueError
raise source.error("incomplete escape %s" % escape, len(escape))
c = int(escape[2:], 16)
chr(c) # raise ValueError for invalid code
return LITERAL, c
@ -339,7 +344,7 @@ def _class_escape(source, escape):
escape += source.getwhile(2, OCTDIGITS)
c = int(escape[1:], 8)
if c > 0o377:
raise source.error('octal escape value %r outside of '
raise source.error('octal escape value %s outside of '
'range 0-0o377' % escape, len(escape))
return LITERAL, c
elif c in DIGITS:
@ -352,7 +357,7 @@ def _class_escape(source, escape):
return LITERAL, ord(escape[1])
except ValueError:
pass
raise source.error("bogus escape: %r" % escape, len(escape))
raise source.error("bad escape %s" % escape, len(escape))
def _escape(source, escape, state):
# handle escape code in expression
@ -368,19 +373,19 @@ def _escape(source, escape, state):
# hexadecimal escape
escape += source.getwhile(2, HEXDIGITS)
if len(escape) != 4:
raise ValueError
raise source.error("incomplete escape %s" % escape, len(escape))
return LITERAL, int(escape[2:], 16)
elif c == "u" and source.istext:
# unicode escape (exactly four digits)
escape += source.getwhile(4, HEXDIGITS)
if len(escape) != 6:
raise ValueError
raise source.error("incomplete escape %s" % escape, len(escape))
return LITERAL, int(escape[2:], 16)
elif c == "U" and source.istext:
# unicode escape (exactly eight digits)
escape += source.getwhile(8, HEXDIGITS)
if len(escape) != 10:
raise ValueError
raise source.error("incomplete escape %s" % escape, len(escape))
c = int(escape[2:], 16)
chr(c) # raise ValueError for invalid code
return LITERAL, c
@ -398,7 +403,7 @@ def _escape(source, escape, state):
escape += source.get()
c = int(escape[1:], 8)
if c > 0o377:
raise source.error('octal escape value %r outside of '
raise source.error('octal escape value %s outside of '
'range 0-0o377' % escape,
len(escape))
return LITERAL, c
@ -406,11 +411,11 @@ def _escape(source, escape, state):
group = int(escape[1:])
if group < state.groups:
if not state.checkgroup(group):
raise source.error("cannot refer to open group",
raise source.error("cannot refer to an open group",
len(escape))
state.checklookbehindgroup(group, source)
return GROUPREF, group
raise ValueError
raise source.error("invalid group reference", len(escape))
if len(escape) == 2:
if c in ASCIILETTERS:
import warnings
@ -419,7 +424,7 @@ def _escape(source, escape, state):
return LITERAL, ord(escape[1])
except ValueError:
pass
raise source.error("bogus escape: %r" % escape, len(escape))
raise source.error("bad escape %s" % escape, len(escape))
def _parse_sub(source, state, nested=True):
# parse an alternation: a|b|c
@ -427,12 +432,11 @@ def _parse_sub(source, state, nested=True):
items = []
itemsappend = items.append
sourcematch = source.match
start = source.tell()
while True:
itemsappend(_parse(source, state))
if not sourcematch("|"):
break
if nested and source.next is not None and source.next != ")":
raise source.error("pattern not properly closed")
if len(items) == 1:
return items[0]
@ -480,8 +484,6 @@ def _parse_sub_cond(source, state, condgroup):
raise source.error("conditional backref with more than two branches")
else:
item_no = None
if source.next is not None and source.next != ")":
raise source.error("pattern not properly closed")
subpattern = SubPattern(state)
subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
return subpattern
@ -526,6 +528,7 @@ def _parse(source, state):
subpatternappend((LITERAL, _ord(this)))
elif this == "[":
here = source.tell() - 1
# character set
set = []
setappend = set.append
@ -538,7 +541,8 @@ def _parse(source, state):
while True:
this = sourceget()
if this is None:
raise source.error("unexpected end of regular expression")
raise source.error("unterminated character set",
source.tell() - here)
if this == "]" and set != start:
break
elif this[0] == "\\":
@ -547,25 +551,28 @@ def _parse(source, state):
code1 = LITERAL, _ord(this)
if sourcematch("-"):
# potential range
this = sourceget()
if this is None:
raise source.error("unexpected end of regular expression")
if this == "]":
that = sourceget()
if that is None:
raise source.error("unterminated character set",
source.tell() - here)
if that == "]":
if code1[0] is IN:
code1 = code1[1][0]
setappend(code1)
setappend((LITERAL, _ord("-")))
break
if this[0] == "\\":
code2 = _class_escape(source, this)
if that[0] == "\\":
code2 = _class_escape(source, that)
else:
code2 = LITERAL, _ord(this)
code2 = LITERAL, _ord(that)
if code1[0] != LITERAL or code2[0] != LITERAL:
raise source.error("bad character range", len(this))
msg = "bad character range %s-%s" % (this, that)
raise source.error(msg, len(this) + 1 + len(that))
lo = code1[1]
hi = code2[1]
if hi < lo:
raise source.error("bad character range", len(this))
msg = "bad character range %s-%s" % (this, that)
raise source.error(msg, len(this) + 1 + len(that))
setappend((RANGE, (lo, hi)))
else:
if code1[0] is IN:
@ -617,10 +624,10 @@ def _parse(source, state):
if max >= MAXREPEAT:
raise OverflowError("the repetition number is too large")
if max < min:
raise source.error("bad repeat interval",
raise source.error("min repeat greater than max repeat",
source.tell() - here)
else:
raise source.error("not supported", len(this))
raise AssertionError("unsupported quantifier %r" % (char,))
# figure out which item to repeat
if subpattern:
item = subpattern[-1:]
@ -641,39 +648,32 @@ def _parse(source, state):
subpatternappend((ANY, None))
elif this == "(":
group = 1
start = source.tell() - 1
group = True
name = None
condgroup = None
if sourcematch("?"):
group = 0
# options
char = sourceget()
if char is None:
raise self.error("unexpected end of pattern")
raise source.error("unexpected end of pattern")
if char == "P":
# python extensions
if sourcematch("<"):
# named group: skip forward to end of name
name = source.getuntil(">")
group = 1
if not name:
raise source.error("missing group name", 1)
if not name.isidentifier():
raise source.error("bad character in group name "
"%r" % name,
len(name) + 1)
msg = "bad character in group name %r" % name
raise source.error(msg, len(name) + 1)
elif sourcematch("="):
# named backreference
name = source.getuntil(")")
if not name:
raise source.error("missing group name", 1)
if not name.isidentifier():
raise source.error("bad character in backref "
"group name %r" % name,
len(name) + 1)
msg = "bad character in group name %r" % name
raise source.error(msg, len(name) + 1)
gid = state.groupdict.get(name)
if gid is None:
msg = "unknown group name: {0!r}".format(name)
msg = "unknown group name %r" % name
raise source.error(msg, len(name) + 1)
state.checklookbehindgroup(gid, source)
subpatternappend((GROUPREF, gid))
@ -682,16 +682,17 @@ def _parse(source, state):
char = sourceget()
if char is None:
raise source.error("unexpected end of pattern")
raise source.error("unknown specifier: ?P%s" % char,
len(char))
raise source.error("unknown extension ?P" + char,
len(char) + 2)
elif char == ":":
# non-capturing group
group = 2
group = None
elif char == "#":
# comment
while True:
if source.next is None:
raise source.error("unbalanced parenthesis")
raise source.error("missing ), unterminated comment",
source.tell() - start)
if sourceget() == ")":
break
continue
@ -700,8 +701,11 @@ def _parse(source, state):
dir = 1
if char == "<":
char = sourceget()
if char is None or char not in "=!":
raise source.error("syntax error")
if char is None:
raise source.error("unexpected end of pattern")
if char not in "=!":
raise source.error("unknown extension ?<" + char,
len(char) + 2)
dir = -1 # lookbehind
lookbehindgroups = state.lookbehindgroups
if lookbehindgroups is None:
@ -711,7 +715,8 @@ def _parse(source, state):
if lookbehindgroups is None:
state.lookbehindgroups = None
if not sourcematch(")"):
raise source.error("unbalanced parenthesis")
raise source.error("missing ), unterminated subpattern",
source.tell() - start)
if char == "=":
subpatternappend((ASSERT, (dir, p)))
else:
@ -720,13 +725,11 @@ def _parse(source, state):
elif char == "(":
# conditional backreference group
condname = source.getuntil(")")
group = 2
if not condname:
raise source.error("missing group name", 1)
group = None
if condname.isidentifier():
condgroup = state.groupdict.get(condname)
if condgroup is None:
msg = "unknown group name: {0!r}".format(condname)
msg = "unknown group name %r" % condname
raise source.error(msg, len(condname) + 1)
else:
try:
@ -734,50 +737,48 @@ def _parse(source, state):
if condgroup < 0:
raise ValueError
except ValueError:
raise source.error("bad character in group name",
len(condname) + 1)
msg = "bad character in group name %r" % condname
raise source.error(msg, len(condname) + 1) from None
if not condgroup:
raise source.error("bad group number",
len(condname) + 1)
if condgroup >= MAXGROUPS:
raise source.error("the group number is too large",
raise source.error("invalid group reference",
len(condname) + 1)
state.checklookbehindgroup(condgroup, source)
elif char in FLAGS:
# flags
state.flags |= FLAGS[char]
while source.next in FLAGS:
state.flags |= FLAGS[sourceget()]
while True:
state.flags |= FLAGS[char]
char = sourceget()
if char is None:
raise source.error("missing )")
if char == ")":
break
if char not in FLAGS:
raise source.error("unknown flag", len(char))
verbose = state.flags & SRE_FLAG_VERBOSE
continue
else:
raise source.error("unexpected end of pattern")
if group:
# parse group contents
if group == 2:
# anonymous group
group = None
else:
try:
group = state.opengroup(name)
except error as err:
raise source.error(err.msg, len(name) + 1)
if condgroup:
p = _parse_sub_cond(source, state, condgroup)
else:
p = _parse_sub(source, state)
if not sourcematch(")"):
raise source.error("unbalanced parenthesis")
if group is not None:
state.closegroup(group, p)
subpatternappend((SUBPATTERN, (group, p)))
raise source.error("unknown extension ?" + char,
len(char) + 1)
# parse group contents
if group is not None:
try:
group = state.opengroup(name)
except error as err:
raise source.error(err.msg, len(name) + 1) from None
if condgroup:
p = _parse_sub_cond(source, state, condgroup)
else:
while True:
char = sourceget()
if char is None:
raise source.error("unexpected end of pattern")
if char == ")":
break
raise source.error("unknown extension", len(char))
p = _parse_sub(source, state)
if not source.match(")"):
raise source.error("missing ), unterminated subpattern",
source.tell() - start)
if group is not None:
state.closegroup(group, p)
subpatternappend((SUBPATTERN, (group, p)))
elif this == "^":
subpatternappend((AT, AT_BEGINNING))
@ -786,7 +787,7 @@ def _parse(source, state):
subpattern.append((AT, AT_END))
else:
raise source.error("parser error", len(this))
raise AssertionError("unsupported special character %r" % (char,))
return subpattern
@ -804,7 +805,7 @@ def fix_flags(src, flags):
raise ValueError("ASCII and UNICODE flags are incompatible")
else:
if flags & SRE_FLAG_UNICODE:
raise ValueError("can't use UNICODE flag with a bytes pattern")
raise ValueError("cannot use UNICODE flag with a bytes pattern")
if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII:
import warnings
warnings.warn("ASCII and LOCALE flags are incompatible. "
@ -826,11 +827,8 @@ def parse(str, flags=0, pattern=None):
p.pattern.flags = fix_flags(str, p.pattern.flags)
if source.next is not None:
if source.next == ")":
raise source.error("unbalanced parenthesis")
else:
raise source.error("bogus characters at end of regular expression",
len(tail))
assert source.next == ")"
raise source.error("unbalanced parenthesis")
if flags & SRE_FLAG_DEBUG:
p.dump()
@ -866,26 +864,25 @@ def parse_template(source, pattern):
c = this[1]
if c == "g":
name = ""
if s.match("<"):
name = s.getuntil(">")
if not name:
raise s.error("missing group name", 1)
try:
index = int(name)
if index < 0:
raise s.error("negative group number", len(name) + 1)
if index >= MAXGROUPS:
raise s.error("the group number is too large",
len(name) + 1)
except ValueError:
if not name.isidentifier():
raise s.error("bad character in group name",
len(name) + 1)
if not s.match("<"):
raise s.error("missing <")
name = s.getuntil(">")
if name.isidentifier():
try:
index = pattern.groupindex[name]
except KeyError:
msg = "unknown group name: {0!r}".format(name)
raise IndexError(msg)
raise IndexError("unknown group name %r" % name)
else:
try:
index = int(name)
if index < 0:
raise ValueError
except ValueError:
raise s.error("bad character in group name %r" % name,
len(name) + 1) from None
if index >= MAXGROUPS:
raise s.error("invalid group reference",
len(name) + 1)
addgroup(index)
elif c == "0":
if s.next in OCTDIGITS:
@ -903,7 +900,7 @@ def parse_template(source, pattern):
isoctal = True
c = int(this[1:], 8)
if c > 0o377:
raise s.error('octal escape value %r outside of '
raise s.error('octal escape value %s outside of '
'range 0-0o377' % this, len(this))
lappend(chr(c))
if not isoctal:

View File

@ -38,6 +38,24 @@ class ReTests(unittest.TestCase):
self.assertIs(type(actual), type(expect), msg)
recurse(actual, expect)
def checkPatternError(self, pattern, errmsg, pos=None):
# Helper: assert that compiling `pattern` raises re.error whose `msg`
# attribute equals `errmsg`.  When `pos` is given, the error's `pos`
# attribute must equal it as well; pos=None skips the position check.
with self.assertRaises(re.error) as cm:
re.compile(pattern)
# The detail checks run inside a subTest parameterised by the pattern, so
# the pattern under test is attached to any failure they report.
with self.subTest(pattern=pattern):
err = cm.exception
self.assertEqual(err.msg, errmsg)
if pos is not None:
self.assertEqual(err.pos, pos)
def checkTemplateError(self, pattern, repl, string, errmsg, pos=None):
# Helper: assert that re.sub(pattern, repl, string) raises re.error whose
# `msg` attribute equals `errmsg`.  When `pos` is given, the error's `pos`
# attribute must equal it as well; pos=None skips the position check.
with self.assertRaises(re.error) as cm:
re.sub(pattern, repl, string)
# The detail checks run inside a subTest parameterised by the pattern and
# the replacement template, so both are attached to any failure report.
with self.subTest(pattern=pattern, repl=repl):
err = cm.exception
self.assertEqual(err.msg, errmsg)
if pos is not None:
self.assertEqual(err.pos, pos)
def test_keep_buffer(self):
# See bug 14212
b = bytearray(b'x')
@ -148,6 +166,7 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
self.assertEqual(re.sub('x', r'\377', 'x'), '\377')
self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
@ -158,21 +177,25 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
self.assertRaises(re.error, re.sub, 'x', r'\400', 'x')
self.assertRaises(re.error, re.sub, 'x', r'\777', 'x')
self.checkTemplateError('x', r'\400', 'x',
r'octal escape value \400 outside of '
r'range 0-0o377', 0)
self.checkTemplateError('x', r'\777', 'x',
r'octal escape value \777 outside of '
r'range 0-0o377', 0)
self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'
self.checkTemplateError('x', r'\1', 'x', 'invalid group reference')
self.checkTemplateError('x', r'\8', 'x', 'invalid group reference')
self.checkTemplateError('x', r'\9', 'x', 'invalid group reference')
self.checkTemplateError('x', r'\11', 'x', 'invalid group reference')
self.checkTemplateError('x', r'\18', 'x', 'invalid group reference')
self.checkTemplateError('x', r'\1a', 'x', 'invalid group reference')
self.checkTemplateError('x', r'\90', 'x', 'invalid group reference')
self.checkTemplateError('x', r'\99', 'x', 'invalid group reference')
self.checkTemplateError('x', r'\118', 'x', 'invalid group reference') # r'\11' + '8'
self.checkTemplateError('x', r'\11a', 'x', 'invalid group reference')
self.checkTemplateError('x', r'\181', 'x', 'invalid group reference') # r'\18' + '1'
self.checkTemplateError('x', r'\800', 'x', 'invalid group reference') # r'\80' + '0'
# in python2.3 (etc), these loop endlessly in sre_parse.py
self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
@ -198,47 +221,65 @@ class ReTests(unittest.TestCase):
re.compile('(?P<a>x)(?P=a)(?(a)y)')
re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
re.compile('(?P<a1>x)\1(?(1)y)')
self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
self.assertRaises(re.error, re.compile, '(?Px)')
self.assertRaises(re.error, re.compile, '(?P=)')
self.assertRaises(re.error, re.compile, '(?P=1)')
self.assertRaises(re.error, re.compile, '(?P=a)')
self.assertRaises(re.error, re.compile, '(?P=a1)')
self.assertRaises(re.error, re.compile, '(?P=a.)')
self.assertRaises(re.error, re.compile, '(?P<)')
self.assertRaises(re.error, re.compile, '(?P<>)')
self.assertRaises(re.error, re.compile, '(?P<1>)')
self.assertRaises(re.error, re.compile, '(?P<a.>)')
self.assertRaises(re.error, re.compile, '(?())')
self.assertRaises(re.error, re.compile, '(?(a))')
self.assertRaises(re.error, re.compile, '(?(1a))')
self.assertRaises(re.error, re.compile, '(?(a.))')
self.checkPatternError('(?P<a>)(?P<a>)',
"redefinition of group name 'a' as group 2; "
"was group 1")
self.checkPatternError('(?Pxy)', 'unknown extension ?Px')
self.checkPatternError('(?P<a>)(?P=a', 'missing ), unterminated name', 11)
self.checkPatternError('(?P=', 'missing group name', 4)
self.checkPatternError('(?P=)', 'missing group name', 4)
self.checkPatternError('(?P=1)', "bad character in group name '1'", 4)
self.checkPatternError('(?P=a)', "unknown group name 'a'")
self.checkPatternError('(?P=a1)', "unknown group name 'a1'")
self.checkPatternError('(?P=a.)', "bad character in group name 'a.'", 4)
self.checkPatternError('(?P<)', 'missing >, unterminated name', 4)
self.checkPatternError('(?P<a', 'missing >, unterminated name', 4)
self.checkPatternError('(?P<', 'missing group name', 4)
self.checkPatternError('(?P<>)', 'missing group name', 4)
self.checkPatternError(r'(?P<1>)', "bad character in group name '1'", 4)
self.checkPatternError(r'(?P<a.>)', "bad character in group name 'a.'", 4)
self.checkPatternError(r'(?(', 'missing group name', 3)
self.checkPatternError(r'(?())', 'missing group name', 3)
self.checkPatternError(r'(?(a))', "unknown group name 'a'", 3)
self.checkPatternError(r'(?(-1))', "bad character in group name '-1'", 3)
self.checkPatternError(r'(?(1a))', "bad character in group name '1a'", 3)
self.checkPatternError(r'(?(a.))', "bad character in group name 'a.'", 3)
# New valid/invalid identifiers in Python 3
re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
self.assertRaises(re.error, re.compile, '(?P<©>x)')
self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
# Support > 100 groups.
pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
pat = '(?:%s)(?(200)z|t)' % pat
self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
def test_symbolic_refs(self):
self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx')
self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<', 'xx')
self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g', 'xx')
self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a a>', 'xx')
self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<>', 'xx')
self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<1a1>', 'xx')
self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<2>', 'xx')
self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\2', 'xx')
self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx')
self.checkTemplateError('(?P<a>x)', '\g<a', 'xx',
'missing >, unterminated name', 3)
self.checkTemplateError('(?P<a>x)', '\g<', 'xx',
'missing group name', 3)
self.checkTemplateError('(?P<a>x)', '\g', 'xx', 'missing <', 2)
self.checkTemplateError('(?P<a>x)', '\g<a a>', 'xx',
"bad character in group name 'a a'", 3)
self.checkTemplateError('(?P<a>x)', '\g<>', 'xx',
'missing group name', 3)
self.checkTemplateError('(?P<a>x)', '\g<1a1>', 'xx',
"bad character in group name '1a1'", 3)
self.checkTemplateError('(?P<a>x)', r'\g<2>', 'xx',
'invalid group reference')
self.checkTemplateError('(?P<a>x)', r'\2', 'xx',
'invalid group reference')
with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"):
re.sub('(?P<a>x)', '\g<ab>', 'xx')
self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '')
self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx')
self.checkTemplateError('(?P<a>x)', '\g<-1>', 'xx',
"bad character in group name '-1'", 3)
# New valid/invalid identifiers in Python 3
self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<©>', 'xx')
self.checkTemplateError('(?P<a>x)', '\g<©>', 'xx',
"bad character in group name '©'", 3)
# Support > 100 groups.
pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
self.assertEqual(re.sub(pat, '\g<200>', 'xc8yzxc8y'), 'c8zc8')
@ -444,6 +485,19 @@ class ReTests(unittest.TestCase):
pat = '(?:%s)(?(200)z)' % pat
self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
self.checkPatternError(r'(?P<a>)(?(0))', 'bad group number', 10)
self.checkPatternError(r'()(?(1)a|b',
'missing ), unterminated subpattern', 2)
self.checkPatternError(r'()(?(1)a|b|c)',
'conditional backref with more than '
'two branches', 10)
def test_re_groupref_overflow(self):
# A group number equal to sre_constants.MAXGROUPS must be reported as
# 'invalid group reference', both in a replacement template (\g<N>) and
# in a conditional group (?(N)...) inside a pattern.
self.checkTemplateError('()', '\g<%s>' % sre_constants.MAXGROUPS, 'xx',
'invalid group reference', 3)
self.checkPatternError(r'(?P<a>)(?(%d))' % sre_constants.MAXGROUPS,
'invalid group reference', 10)
def test_re_groupref(self):
self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
('|', 'a'))
@ -456,6 +510,8 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
(None, None))
self.checkPatternError(r'(abc\1)', 'cannot refer to an open group', 4)
def test_groupdict(self):
self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
'first second').groupdict(),
@ -493,6 +549,7 @@ class ReTests(unittest.TestCase):
self.assertTrue(re.match("^x{3}$", "xxx"))
self.assertTrue(re.match("^x{1,3}$", "xxx"))
self.assertTrue(re.match("^x{3,3}$", "xxx"))
self.assertTrue(re.match("^x{1,4}$", "xxx"))
self.assertTrue(re.match("^x{3,4}?$", "xxx"))
self.assertTrue(re.match("^x{3}?$", "xxx"))
@ -503,6 +560,9 @@ class ReTests(unittest.TestCase):
self.assertIsNone(re.match("^x{}$", "xxx"))
self.assertTrue(re.match("^x{}$", "x{}"))
self.checkPatternError(r'x{2,1}',
'min repeat greater than max repeat', 2)
def test_getattr(self):
self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
@ -550,7 +610,7 @@ class ReTests(unittest.TestCase):
b"1aa! a", re.LOCALE).group(0), b"1aa! a")
def test_other_escapes(self):
self.assertRaises(re.error, re.compile, "\\")
self.checkPatternError("\\", 'bad escape (end of pattern)', 0)
self.assertEqual(re.match(r"\(", '(').group(), '(')
self.assertIsNone(re.match(r"\(", ')'))
self.assertEqual(re.match(r"\\", '\\').group(), '\\')
@ -875,15 +935,17 @@ class ReTests(unittest.TestCase):
self.assertTrue(re.match(r"\08", "\0008"))
self.assertTrue(re.match(r"\01", "\001"))
self.assertTrue(re.match(r"\018", "\0018"))
self.assertRaises(re.error, re.match, r"\567", "")
self.assertRaises(re.error, re.match, r"\911", "")
self.assertRaises(re.error, re.match, r"\x1", "")
self.assertRaises(re.error, re.match, r"\x1z", "")
self.assertRaises(re.error, re.match, r"\u123", "")
self.assertRaises(re.error, re.match, r"\u123z", "")
self.assertRaises(re.error, re.match, r"\U0001234", "")
self.assertRaises(re.error, re.match, r"\U0001234z", "")
self.assertRaises(re.error, re.match, r"\U00110000", "")
self.checkPatternError(r"\567",
r'octal escape value \567 outside of '
r'range 0-0o377', 0)
self.checkPatternError(r"\911", 'invalid group reference', 0)
self.checkPatternError(r"\x1", r'incomplete escape \x1', 0)
self.checkPatternError(r"\x1z", r'incomplete escape \x1', 0)
self.checkPatternError(r"\u123", r'incomplete escape \u123', 0)
self.checkPatternError(r"\u123z", r'incomplete escape \u123', 0)
self.checkPatternError(r"\U0001234", r'incomplete escape \U0001234', 0)
self.checkPatternError(r"\U0001234z", r'incomplete escape \U0001234', 0)
self.checkPatternError(r"\U00110000", r'bad escape \U00110000', 0)
def test_sre_character_class_literals(self):
for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
@ -903,12 +965,14 @@ class ReTests(unittest.TestCase):
self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
self.assertRaises(re.error, re.match, r"[\567]", "")
self.assertRaises(re.error, re.match, r"[\911]", "")
self.assertRaises(re.error, re.match, r"[\x1z]", "")
self.assertRaises(re.error, re.match, r"[\u123z]", "")
self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
self.assertRaises(re.error, re.match, r"[\U00110000]", "")
self.checkPatternError(r"[\567]",
r'octal escape value \567 outside of '
r'range 0-0o377', 1)
self.checkPatternError(r"[\911]", r'bad escape \9', 1)
self.checkPatternError(r"[\x1z]", r'incomplete escape \x1', 1)
self.checkPatternError(r"[\u123z]", r'incomplete escape \u123', 1)
self.checkPatternError(r"[\U0001234z]", r'incomplete escape \U0001234', 1)
self.checkPatternError(r"[\U00110000]", r'bad escape \U00110000', 1)
self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
def test_sre_byte_literals(self):
@ -927,10 +991,12 @@ class ReTests(unittest.TestCase):
self.assertTrue(re.match(br"\08", b"\0008"))
self.assertTrue(re.match(br"\01", b"\001"))
self.assertTrue(re.match(br"\018", b"\0018"))
self.assertRaises(re.error, re.match, br"\567", b"")
self.assertRaises(re.error, re.match, br"\911", b"")
self.assertRaises(re.error, re.match, br"\x1", b"")
self.assertRaises(re.error, re.match, br"\x1z", b"")
self.checkPatternError(br"\567",
r'octal escape value \567 outside of '
r'range 0-0o377', 0)
self.checkPatternError(br"\911", 'invalid group reference', 0)
self.checkPatternError(br"\x1", r'incomplete escape \x1', 0)
self.checkPatternError(br"\x1z", r'incomplete escape \x1', 0)
def test_sre_byte_class_literals(self):
for i in [0, 8, 16, 32, 64, 127, 128, 255]:
@ -946,9 +1012,22 @@ class ReTests(unittest.TestCase):
self.assertTrue(re.match(br"[\u1234]", b'u'))
with self.assertWarns(DeprecationWarning):
self.assertTrue(re.match(br"[\U00012345]", b'U'))
self.assertRaises(re.error, re.match, br"[\567]", b"")
self.assertRaises(re.error, re.match, br"[\911]", b"")
self.assertRaises(re.error, re.match, br"[\x1z]", b"")
self.checkPatternError(br"[\567]",
r'octal escape value \567 outside of '
r'range 0-0o377', 1)
self.checkPatternError(br"[\911]", r'bad escape \9', 1)
self.checkPatternError(br"[\x1z]", r'incomplete escape \x1', 1)
def test_character_set_errors(self):
# Malformed character sets: each case pins the exact error message and
# the position reported for it.
# Sets that are opened but never closed with ']'.
self.checkPatternError(r'[', 'unterminated character set', 0)
self.checkPatternError(r'[^', 'unterminated character set', 0)
self.checkPatternError(r'[a', 'unterminated character set', 0)
# bug 545855 -- This pattern failed to cause a compile error as it
# should, instead provoking a TypeError.
self.checkPatternError(r"[a-", 'unterminated character set', 0)
# Ranges with a non-literal endpoint (\w) or with reversed endpoints; the
# message quotes the offending range text.
self.checkPatternError(r"[\w-b]", r'bad character range \w-b', 1)
self.checkPatternError(r"[a-\w]", r'bad character range a-\w', 1)
self.checkPatternError(r"[b-a]", 'bad character range b-a', 1)
def test_bug_113254(self):
self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
@ -963,11 +1042,6 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
self.assertEqual(re.match("((a))", "a").lastindex, 1)
def test_bug_545855(self):
# bug 545855 -- This pattern failed to cause a compile error as it
# should, instead provoking a TypeError.
self.assertRaises(re.error, re.compile, 'foo[a-')
def test_bug_418626(self):
# bugs 418626 et al. -- Testing Greg Chapman's addition of op code
# SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
@ -991,6 +1065,24 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
def test_nothing_to_repeat(self):
# Every quantifier, with and without a trailing '?' modifier, must fail
# with 'nothing to repeat' when there is nothing before it: at the very
# start of the pattern (position 0) and right after '(?:' (position 3).
for reps in '*', '+', '?', '{1,2}':
for mod in '', '?':
self.checkPatternError('%s%s' % (reps, mod),
'nothing to repeat', 0)
self.checkPatternError('(?:%s%s)' % (reps, mod),
'nothing to repeat', 3)
def test_multiple_repeat(self):
# A quantifier applied directly to another quantifier must fail with
# 'multiple repeat'.  The pattern is 'x' + inner_op + outer_op, so the
# expected position 1 + len(inner_op) is where outer_op starts.
# NOTE(review): '?' is absent from the outer quantifiers, presumably
# because a '?' after a quantifier is a valid modifier -- confirm.
for outer_reps in '*', '+', '{1,2}':
for outer_mod in '', '?':
outer_op = outer_reps + outer_mod
for inner_reps in '*', '+', '?', '{1,2}':
for inner_mod in '', '?':
inner_op = inner_reps + inner_mod
self.checkPatternError(r'x%s%s' % (inner_op, outer_op),
'multiple repeat', 1 + len(inner_op))
def test_unlimited_zero_width_repeat(self):
# Issue #9669
self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
@ -1381,13 +1473,13 @@ class ReTests(unittest.TestCase):
def test_backref_group_name_in_exception(self):
# Issue 17341: Poor error message when compiling invalid regex
with self.assertRaisesRegex(sre_constants.error, '<foo>'):
re.compile('(?P=<foo>)')
self.checkPatternError('(?P=<foo>)',
"bad character in group name '<foo>'", 4)
def test_group_name_in_exception(self):
# Issue 17341: Poor error message when compiling invalid regex
with self.assertRaisesRegex(sre_constants.error, '\?foo'):
re.compile('(?P<?foo>)')
self.checkPatternError('(?P<?foo>)',
"bad character in group name '?foo'", 4)
def test_issue17998(self):
for reps in '*', '+', '?', '{1}':
@ -1556,6 +1648,19 @@ SUBPATTERN None
self.assertIn(' at position 77', str(err))
self.assertIn('(line 5, column 17)', str(err))
def test_misc_errors(self):
# Assorted malformed group and extension syntax.  Each case pins both the
# error message and the position reported for it.
# Unclosed and over-closed parentheses.
self.checkPatternError(r'(', 'missing ), unterminated subpattern', 0)
self.checkPatternError(r'((a|b)', 'missing ), unterminated subpattern', 0)
self.checkPatternError(r'(a|b))', 'unbalanced parenthesis', 5)
# Truncated or unknown '(?...' extensions, inline flags and comments.
self.checkPatternError(r'(?P', 'unexpected end of pattern', 3)
self.checkPatternError(r'(?z)', 'unknown extension ?z', 1)
self.checkPatternError(r'(?iz)', 'unknown flag', 3)
self.checkPatternError(r'(?i', 'missing )', 3)
self.checkPatternError(r'(?#abc', 'missing ), unterminated comment', 0)
self.checkPatternError(r'(?<', 'unexpected end of pattern', 3)
self.checkPatternError(r'(?<>)', 'unknown extension ?<>', 1)
self.checkPatternError(r'(?', 'unexpected end of pattern', 2)
class PatternReprTests(unittest.TestCase):
def check(self, pattern, expected):

View File

@ -30,6 +30,8 @@ Core and Builtins
Library
-------
- Issue #22364: Improved some re error messages, using the messages of the
  third-party regex module as hints.
- Issue #23742: ntpath.expandvars() no longer loses unbalanced single quotes.
- Issue #21717: The zipfile.ZipFile.open function now supports 'x' (exclusive

View File

@ -315,7 +315,7 @@ getstring(PyObject* string, Py_ssize_t* p_length,
/* get pointer to byte string buffer */
if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) {
PyErr_SetString(PyExc_TypeError, "expected string or buffer");
PyErr_SetString(PyExc_TypeError, "expected string or bytes-like object");
return NULL;
}
@ -359,12 +359,12 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
if (isbytes && pattern->isbytes == 0) {
PyErr_SetString(PyExc_TypeError,
"can't use a string pattern on a bytes-like object");
"cannot use a string pattern on a bytes-like object");
goto err;
}
if (!isbytes && pattern->isbytes > 0) {
PyErr_SetString(PyExc_TypeError,
"can't use a bytes pattern on a string-like object");
"cannot use a bytes pattern on a string-like object");
goto err;
}