Issue #22364: Improved some re error messages using regex for hints.

This commit is contained in:
Serhiy Storchaka 2015-03-25 21:03:47 +02:00
parent 7c316a181a
commit 632a77e6a3
6 changed files with 299 additions and 195 deletions

View File

@ -286,7 +286,7 @@ def _compile(pattern, flags):
if isinstance(pattern, _pattern_type): if isinstance(pattern, _pattern_type):
if flags: if flags:
raise ValueError( raise ValueError(
"Cannot process flags argument with a compiled pattern") "cannot process flags argument with a compiled pattern")
return pattern return pattern
if not sre_compile.isstring(pattern): if not sre_compile.isstring(pattern):
raise TypeError("first argument must be string or compiled pattern") raise TypeError("first argument must be string or compiled pattern")

View File

@ -113,7 +113,7 @@ def _compile(code, pattern, flags):
emit(ANY) emit(ANY)
elif op in REPEATING_CODES: elif op in REPEATING_CODES:
if flags & SRE_FLAG_TEMPLATE: if flags & SRE_FLAG_TEMPLATE:
raise error("internal: unsupported template operator") raise error("internal: unsupported template operator %r" % (op,))
elif _simple(av) and op is not REPEAT: elif _simple(av) and op is not REPEAT:
if op is MAX_REPEAT: if op is MAX_REPEAT:
emit(REPEAT_ONE) emit(REPEAT_ONE)
@ -216,7 +216,7 @@ def _compile(code, pattern, flags):
else: else:
code[skipyes] = _len(code) - skipyes + 1 code[skipyes] = _len(code) - skipyes + 1
else: else:
raise ValueError("unsupported operand type", op) raise error("internal: unsupported operand type %r" % (op,))
def _compile_charset(charset, flags, code, fixup=None, fixes=None): def _compile_charset(charset, flags, code, fixup=None, fixes=None):
# compile charset subprogram # compile charset subprogram
@ -242,7 +242,7 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None):
else: else:
emit(av) emit(av)
else: else:
raise error("internal: unsupported set operator") raise error("internal: unsupported set operator %r" % (op,))
emit(FAILURE) emit(FAILURE)
def _optimize_charset(charset, fixup, fixes): def _optimize_charset(charset, fixup, fixes):

View File

@ -79,7 +79,7 @@ class Pattern:
gid = self.groups gid = self.groups
self.subpatterns.append(None) self.subpatterns.append(None)
if self.groups > MAXGROUPS: if self.groups > MAXGROUPS:
raise error("groups number is too large") raise error("too many groups")
if name is not None: if name is not None:
ogid = self.groupdict.get(name, None) ogid = self.groupdict.get(name, None)
if ogid is not None: if ogid is not None:
@ -235,7 +235,7 @@ class Tokenizer:
try: try:
char += self.decoded_string[index] char += self.decoded_string[index]
except IndexError: except IndexError:
raise error("bogus escape (end of line)", raise error("bad escape (end of pattern)",
self.string, len(self.string) - 1) from None self.string, len(self.string) - 1) from None
self.index = index + 1 self.index = index + 1
self.next = char self.next = char
@ -263,8 +263,13 @@ class Tokenizer:
c = self.next c = self.next
self.__next() self.__next()
if c is None: if c is None:
raise self.error("unterminated name") if not result:
raise self.error("missing group name")
raise self.error("missing %s, unterminated name" % terminator,
len(result))
if c == terminator: if c == terminator:
if not result:
raise self.error("missing group name", 1)
break break
result += c result += c
return result return result
@ -318,19 +323,19 @@ def _class_escape(source, escape):
# hexadecimal escape (exactly two digits) # hexadecimal escape (exactly two digits)
escape += source.getwhile(2, HEXDIGITS) escape += source.getwhile(2, HEXDIGITS)
if len(escape) != 4: if len(escape) != 4:
raise ValueError raise source.error("incomplete escape %s" % escape, len(escape))
return LITERAL, int(escape[2:], 16) return LITERAL, int(escape[2:], 16)
elif c == "u" and source.istext: elif c == "u" and source.istext:
# unicode escape (exactly four digits) # unicode escape (exactly four digits)
escape += source.getwhile(4, HEXDIGITS) escape += source.getwhile(4, HEXDIGITS)
if len(escape) != 6: if len(escape) != 6:
raise ValueError raise source.error("incomplete escape %s" % escape, len(escape))
return LITERAL, int(escape[2:], 16) return LITERAL, int(escape[2:], 16)
elif c == "U" and source.istext: elif c == "U" and source.istext:
# unicode escape (exactly eight digits) # unicode escape (exactly eight digits)
escape += source.getwhile(8, HEXDIGITS) escape += source.getwhile(8, HEXDIGITS)
if len(escape) != 10: if len(escape) != 10:
raise ValueError raise source.error("incomplete escape %s" % escape, len(escape))
c = int(escape[2:], 16) c = int(escape[2:], 16)
chr(c) # raise ValueError for invalid code chr(c) # raise ValueError for invalid code
return LITERAL, c return LITERAL, c
@ -339,7 +344,7 @@ def _class_escape(source, escape):
escape += source.getwhile(2, OCTDIGITS) escape += source.getwhile(2, OCTDIGITS)
c = int(escape[1:], 8) c = int(escape[1:], 8)
if c > 0o377: if c > 0o377:
raise source.error('octal escape value %r outside of ' raise source.error('octal escape value %s outside of '
'range 0-0o377' % escape, len(escape)) 'range 0-0o377' % escape, len(escape))
return LITERAL, c return LITERAL, c
elif c in DIGITS: elif c in DIGITS:
@ -352,7 +357,7 @@ def _class_escape(source, escape):
return LITERAL, ord(escape[1]) return LITERAL, ord(escape[1])
except ValueError: except ValueError:
pass pass
raise source.error("bogus escape: %r" % escape, len(escape)) raise source.error("bad escape %s" % escape, len(escape))
def _escape(source, escape, state): def _escape(source, escape, state):
# handle escape code in expression # handle escape code in expression
@ -368,19 +373,19 @@ def _escape(source, escape, state):
# hexadecimal escape # hexadecimal escape
escape += source.getwhile(2, HEXDIGITS) escape += source.getwhile(2, HEXDIGITS)
if len(escape) != 4: if len(escape) != 4:
raise ValueError raise source.error("incomplete escape %s" % escape, len(escape))
return LITERAL, int(escape[2:], 16) return LITERAL, int(escape[2:], 16)
elif c == "u" and source.istext: elif c == "u" and source.istext:
# unicode escape (exactly four digits) # unicode escape (exactly four digits)
escape += source.getwhile(4, HEXDIGITS) escape += source.getwhile(4, HEXDIGITS)
if len(escape) != 6: if len(escape) != 6:
raise ValueError raise source.error("incomplete escape %s" % escape, len(escape))
return LITERAL, int(escape[2:], 16) return LITERAL, int(escape[2:], 16)
elif c == "U" and source.istext: elif c == "U" and source.istext:
# unicode escape (exactly eight digits) # unicode escape (exactly eight digits)
escape += source.getwhile(8, HEXDIGITS) escape += source.getwhile(8, HEXDIGITS)
if len(escape) != 10: if len(escape) != 10:
raise ValueError raise source.error("incomplete escape %s" % escape, len(escape))
c = int(escape[2:], 16) c = int(escape[2:], 16)
chr(c) # raise ValueError for invalid code chr(c) # raise ValueError for invalid code
return LITERAL, c return LITERAL, c
@ -398,7 +403,7 @@ def _escape(source, escape, state):
escape += source.get() escape += source.get()
c = int(escape[1:], 8) c = int(escape[1:], 8)
if c > 0o377: if c > 0o377:
raise source.error('octal escape value %r outside of ' raise source.error('octal escape value %s outside of '
'range 0-0o377' % escape, 'range 0-0o377' % escape,
len(escape)) len(escape))
return LITERAL, c return LITERAL, c
@ -406,11 +411,11 @@ def _escape(source, escape, state):
group = int(escape[1:]) group = int(escape[1:])
if group < state.groups: if group < state.groups:
if not state.checkgroup(group): if not state.checkgroup(group):
raise source.error("cannot refer to open group", raise source.error("cannot refer to an open group",
len(escape)) len(escape))
state.checklookbehindgroup(group, source) state.checklookbehindgroup(group, source)
return GROUPREF, group return GROUPREF, group
raise ValueError raise source.error("invalid group reference", len(escape))
if len(escape) == 2: if len(escape) == 2:
if c in ASCIILETTERS: if c in ASCIILETTERS:
import warnings import warnings
@ -419,7 +424,7 @@ def _escape(source, escape, state):
return LITERAL, ord(escape[1]) return LITERAL, ord(escape[1])
except ValueError: except ValueError:
pass pass
raise source.error("bogus escape: %r" % escape, len(escape)) raise source.error("bad escape %s" % escape, len(escape))
def _parse_sub(source, state, nested=True): def _parse_sub(source, state, nested=True):
# parse an alternation: a|b|c # parse an alternation: a|b|c
@ -427,12 +432,11 @@ def _parse_sub(source, state, nested=True):
items = [] items = []
itemsappend = items.append itemsappend = items.append
sourcematch = source.match sourcematch = source.match
start = source.tell()
while True: while True:
itemsappend(_parse(source, state)) itemsappend(_parse(source, state))
if not sourcematch("|"): if not sourcematch("|"):
break break
if nested and source.next is not None and source.next != ")":
raise source.error("pattern not properly closed")
if len(items) == 1: if len(items) == 1:
return items[0] return items[0]
@ -480,8 +484,6 @@ def _parse_sub_cond(source, state, condgroup):
raise source.error("conditional backref with more than two branches") raise source.error("conditional backref with more than two branches")
else: else:
item_no = None item_no = None
if source.next is not None and source.next != ")":
raise source.error("pattern not properly closed")
subpattern = SubPattern(state) subpattern = SubPattern(state)
subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
return subpattern return subpattern
@ -526,6 +528,7 @@ def _parse(source, state):
subpatternappend((LITERAL, _ord(this))) subpatternappend((LITERAL, _ord(this)))
elif this == "[": elif this == "[":
here = source.tell() - 1
# character set # character set
set = [] set = []
setappend = set.append setappend = set.append
@ -538,7 +541,8 @@ def _parse(source, state):
while True: while True:
this = sourceget() this = sourceget()
if this is None: if this is None:
raise source.error("unexpected end of regular expression") raise source.error("unterminated character set",
source.tell() - here)
if this == "]" and set != start: if this == "]" and set != start:
break break
elif this[0] == "\\": elif this[0] == "\\":
@ -547,25 +551,28 @@ def _parse(source, state):
code1 = LITERAL, _ord(this) code1 = LITERAL, _ord(this)
if sourcematch("-"): if sourcematch("-"):
# potential range # potential range
this = sourceget() that = sourceget()
if this is None: if that is None:
raise source.error("unexpected end of regular expression") raise source.error("unterminated character set",
if this == "]": source.tell() - here)
if that == "]":
if code1[0] is IN: if code1[0] is IN:
code1 = code1[1][0] code1 = code1[1][0]
setappend(code1) setappend(code1)
setappend((LITERAL, _ord("-"))) setappend((LITERAL, _ord("-")))
break break
if this[0] == "\\": if that[0] == "\\":
code2 = _class_escape(source, this) code2 = _class_escape(source, that)
else: else:
code2 = LITERAL, _ord(this) code2 = LITERAL, _ord(that)
if code1[0] != LITERAL or code2[0] != LITERAL: if code1[0] != LITERAL or code2[0] != LITERAL:
raise source.error("bad character range", len(this)) msg = "bad character range %s-%s" % (this, that)
raise source.error(msg, len(this) + 1 + len(that))
lo = code1[1] lo = code1[1]
hi = code2[1] hi = code2[1]
if hi < lo: if hi < lo:
raise source.error("bad character range", len(this)) msg = "bad character range %s-%s" % (this, that)
raise source.error(msg, len(this) + 1 + len(that))
setappend((RANGE, (lo, hi))) setappend((RANGE, (lo, hi)))
else: else:
if code1[0] is IN: if code1[0] is IN:
@ -617,10 +624,10 @@ def _parse(source, state):
if max >= MAXREPEAT: if max >= MAXREPEAT:
raise OverflowError("the repetition number is too large") raise OverflowError("the repetition number is too large")
if max < min: if max < min:
raise source.error("bad repeat interval", raise source.error("min repeat greater than max repeat",
source.tell() - here) source.tell() - here)
else: else:
raise source.error("not supported", len(this)) raise AssertionError("unsupported quantifier %r" % (char,))
# figure out which item to repeat # figure out which item to repeat
if subpattern: if subpattern:
item = subpattern[-1:] item = subpattern[-1:]
@ -641,39 +648,32 @@ def _parse(source, state):
subpatternappend((ANY, None)) subpatternappend((ANY, None))
elif this == "(": elif this == "(":
group = 1 start = source.tell() - 1
group = True
name = None name = None
condgroup = None condgroup = None
if sourcematch("?"): if sourcematch("?"):
group = 0
# options # options
char = sourceget() char = sourceget()
if char is None: if char is None:
raise self.error("unexpected end of pattern") raise source.error("unexpected end of pattern")
if char == "P": if char == "P":
# python extensions # python extensions
if sourcematch("<"): if sourcematch("<"):
# named group: skip forward to end of name # named group: skip forward to end of name
name = source.getuntil(">") name = source.getuntil(">")
group = 1
if not name:
raise source.error("missing group name", 1)
if not name.isidentifier(): if not name.isidentifier():
raise source.error("bad character in group name " msg = "bad character in group name %r" % name
"%r" % name, raise source.error(msg, len(name) + 1)
len(name) + 1)
elif sourcematch("="): elif sourcematch("="):
# named backreference # named backreference
name = source.getuntil(")") name = source.getuntil(")")
if not name:
raise source.error("missing group name", 1)
if not name.isidentifier(): if not name.isidentifier():
raise source.error("bad character in backref " msg = "bad character in group name %r" % name
"group name %r" % name, raise source.error(msg, len(name) + 1)
len(name) + 1)
gid = state.groupdict.get(name) gid = state.groupdict.get(name)
if gid is None: if gid is None:
msg = "unknown group name: {0!r}".format(name) msg = "unknown group name %r" % name
raise source.error(msg, len(name) + 1) raise source.error(msg, len(name) + 1)
state.checklookbehindgroup(gid, source) state.checklookbehindgroup(gid, source)
subpatternappend((GROUPREF, gid)) subpatternappend((GROUPREF, gid))
@ -682,16 +682,17 @@ def _parse(source, state):
char = sourceget() char = sourceget()
if char is None: if char is None:
raise source.error("unexpected end of pattern") raise source.error("unexpected end of pattern")
raise source.error("unknown specifier: ?P%s" % char, raise source.error("unknown extension ?P" + char,
len(char)) len(char) + 2)
elif char == ":": elif char == ":":
# non-capturing group # non-capturing group
group = 2 group = None
elif char == "#": elif char == "#":
# comment # comment
while True: while True:
if source.next is None: if source.next is None:
raise source.error("unbalanced parenthesis") raise source.error("missing ), unterminated comment",
source.tell() - start)
if sourceget() == ")": if sourceget() == ")":
break break
continue continue
@ -700,8 +701,11 @@ def _parse(source, state):
dir = 1 dir = 1
if char == "<": if char == "<":
char = sourceget() char = sourceget()
if char is None or char not in "=!": if char is None:
raise source.error("syntax error") raise source.error("unexpected end of pattern")
if char not in "=!":
raise source.error("unknown extension ?<" + char,
len(char) + 2)
dir = -1 # lookbehind dir = -1 # lookbehind
lookbehindgroups = state.lookbehindgroups lookbehindgroups = state.lookbehindgroups
if lookbehindgroups is None: if lookbehindgroups is None:
@ -711,7 +715,8 @@ def _parse(source, state):
if lookbehindgroups is None: if lookbehindgroups is None:
state.lookbehindgroups = None state.lookbehindgroups = None
if not sourcematch(")"): if not sourcematch(")"):
raise source.error("unbalanced parenthesis") raise source.error("missing ), unterminated subpattern",
source.tell() - start)
if char == "=": if char == "=":
subpatternappend((ASSERT, (dir, p))) subpatternappend((ASSERT, (dir, p)))
else: else:
@ -720,13 +725,11 @@ def _parse(source, state):
elif char == "(": elif char == "(":
# conditional backreference group # conditional backreference group
condname = source.getuntil(")") condname = source.getuntil(")")
group = 2 group = None
if not condname:
raise source.error("missing group name", 1)
if condname.isidentifier(): if condname.isidentifier():
condgroup = state.groupdict.get(condname) condgroup = state.groupdict.get(condname)
if condgroup is None: if condgroup is None:
msg = "unknown group name: {0!r}".format(condname) msg = "unknown group name %r" % condname
raise source.error(msg, len(condname) + 1) raise source.error(msg, len(condname) + 1)
else: else:
try: try:
@ -734,50 +737,48 @@ def _parse(source, state):
if condgroup < 0: if condgroup < 0:
raise ValueError raise ValueError
except ValueError: except ValueError:
raise source.error("bad character in group name", msg = "bad character in group name %r" % condname
len(condname) + 1) raise source.error(msg, len(condname) + 1) from None
if not condgroup: if not condgroup:
raise source.error("bad group number", raise source.error("bad group number",
len(condname) + 1) len(condname) + 1)
if condgroup >= MAXGROUPS: if condgroup >= MAXGROUPS:
raise source.error("the group number is too large", raise source.error("invalid group reference",
len(condname) + 1) len(condname) + 1)
state.checklookbehindgroup(condgroup, source) state.checklookbehindgroup(condgroup, source)
elif char in FLAGS: elif char in FLAGS:
# flags # flags
state.flags |= FLAGS[char] while True:
while source.next in FLAGS: state.flags |= FLAGS[char]
state.flags |= FLAGS[sourceget()] char = sourceget()
if char is None:
raise source.error("missing )")
if char == ")":
break
if char not in FLAGS:
raise source.error("unknown flag", len(char))
verbose = state.flags & SRE_FLAG_VERBOSE verbose = state.flags & SRE_FLAG_VERBOSE
continue
else: else:
raise source.error("unexpected end of pattern") raise source.error("unknown extension ?" + char,
if group: len(char) + 1)
# parse group contents
if group == 2: # parse group contents
# anonymous group if group is not None:
group = None try:
else: group = state.opengroup(name)
try: except error as err:
group = state.opengroup(name) raise source.error(err.msg, len(name) + 1) from None
except error as err: if condgroup:
raise source.error(err.msg, len(name) + 1) p = _parse_sub_cond(source, state, condgroup)
if condgroup:
p = _parse_sub_cond(source, state, condgroup)
else:
p = _parse_sub(source, state)
if not sourcematch(")"):
raise source.error("unbalanced parenthesis")
if group is not None:
state.closegroup(group, p)
subpatternappend((SUBPATTERN, (group, p)))
else: else:
while True: p = _parse_sub(source, state)
char = sourceget() if not source.match(")"):
if char is None: raise source.error("missing ), unterminated subpattern",
raise source.error("unexpected end of pattern") source.tell() - start)
if char == ")": if group is not None:
break state.closegroup(group, p)
raise source.error("unknown extension", len(char)) subpatternappend((SUBPATTERN, (group, p)))
elif this == "^": elif this == "^":
subpatternappend((AT, AT_BEGINNING)) subpatternappend((AT, AT_BEGINNING))
@ -786,7 +787,7 @@ def _parse(source, state):
subpattern.append((AT, AT_END)) subpattern.append((AT, AT_END))
else: else:
raise source.error("parser error", len(this)) raise AssertionError("unsupported special character %r" % (char,))
return subpattern return subpattern
@ -804,7 +805,7 @@ def fix_flags(src, flags):
raise ValueError("ASCII and UNICODE flags are incompatible") raise ValueError("ASCII and UNICODE flags are incompatible")
else: else:
if flags & SRE_FLAG_UNICODE: if flags & SRE_FLAG_UNICODE:
raise ValueError("can't use UNICODE flag with a bytes pattern") raise ValueError("cannot use UNICODE flag with a bytes pattern")
if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII: if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII:
import warnings import warnings
warnings.warn("ASCII and LOCALE flags are incompatible. " warnings.warn("ASCII and LOCALE flags are incompatible. "
@ -826,11 +827,8 @@ def parse(str, flags=0, pattern=None):
p.pattern.flags = fix_flags(str, p.pattern.flags) p.pattern.flags = fix_flags(str, p.pattern.flags)
if source.next is not None: if source.next is not None:
if source.next == ")": assert source.next == ")"
raise source.error("unbalanced parenthesis") raise source.error("unbalanced parenthesis")
else:
raise source.error("bogus characters at end of regular expression",
len(tail))
if flags & SRE_FLAG_DEBUG: if flags & SRE_FLAG_DEBUG:
p.dump() p.dump()
@ -866,26 +864,25 @@ def parse_template(source, pattern):
c = this[1] c = this[1]
if c == "g": if c == "g":
name = "" name = ""
if s.match("<"): if not s.match("<"):
name = s.getuntil(">") raise s.error("missing <")
if not name: name = s.getuntil(">")
raise s.error("missing group name", 1) if name.isidentifier():
try:
index = int(name)
if index < 0:
raise s.error("negative group number", len(name) + 1)
if index >= MAXGROUPS:
raise s.error("the group number is too large",
len(name) + 1)
except ValueError:
if not name.isidentifier():
raise s.error("bad character in group name",
len(name) + 1)
try: try:
index = pattern.groupindex[name] index = pattern.groupindex[name]
except KeyError: except KeyError:
msg = "unknown group name: {0!r}".format(name) raise IndexError("unknown group name %r" % name)
raise IndexError(msg) else:
try:
index = int(name)
if index < 0:
raise ValueError
except ValueError:
raise s.error("bad character in group name %r" % name,
len(name) + 1) from None
if index >= MAXGROUPS:
raise s.error("invalid group reference",
len(name) + 1)
addgroup(index) addgroup(index)
elif c == "0": elif c == "0":
if s.next in OCTDIGITS: if s.next in OCTDIGITS:
@ -903,7 +900,7 @@ def parse_template(source, pattern):
isoctal = True isoctal = True
c = int(this[1:], 8) c = int(this[1:], 8)
if c > 0o377: if c > 0o377:
raise s.error('octal escape value %r outside of ' raise s.error('octal escape value %s outside of '
'range 0-0o377' % this, len(this)) 'range 0-0o377' % this, len(this))
lappend(chr(c)) lappend(chr(c))
if not isoctal: if not isoctal:

View File

@ -38,6 +38,24 @@ class ReTests(unittest.TestCase):
self.assertIs(type(actual), type(expect), msg) self.assertIs(type(actual), type(expect), msg)
recurse(actual, expect) recurse(actual, expect)
def checkPatternError(self, pattern, errmsg, pos=None):
    # Assert that re.compile(pattern) raises re.error whose ``msg`` equals
    # *errmsg* and, when *pos* is given, whose ``pos`` equals *pos*.
    #
    # The subtest is opened before the compilation attempt so that every
    # failure -- including "re.error not raised" for a pattern that
    # unexpectedly compiles -- is reported together with the pattern.
    with self.subTest(pattern=pattern):
        with self.assertRaises(re.error) as cm:
            re.compile(pattern)
        err = cm.exception
        self.assertEqual(err.msg, errmsg)
        if pos is not None:
            self.assertEqual(err.pos, pos)
def checkTemplateError(self, pattern, repl, string, errmsg, pos=None):
    # Assert that re.sub(pattern, repl, string) raises re.error whose
    # ``msg`` equals *errmsg* and, when *pos* is given, whose ``pos``
    # equals *pos*.
    #
    # The subtest is opened before the substitution attempt so that every
    # failure -- including "re.error not raised" when the template is
    # unexpectedly accepted -- is reported with the pattern and template.
    with self.subTest(pattern=pattern, repl=repl):
        with self.assertRaises(re.error) as cm:
            re.sub(pattern, repl, string)
        err = cm.exception
        self.assertEqual(err.msg, errmsg)
        if pos is not None:
            self.assertEqual(err.pos, pos)
def test_keep_buffer(self): def test_keep_buffer(self):
# See bug 14212 # See bug 14212
b = bytearray(b'x') b = bytearray(b'x')
@ -148,6 +166,7 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9') self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
self.assertEqual(re.sub('x', r'\111', 'x'), '\111') self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
self.assertEqual(re.sub('x', r'\117', 'x'), '\117') self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
self.assertEqual(re.sub('x', r'\377', 'x'), '\377')
self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111') self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1') self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
@ -158,21 +177,25 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9') self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a') self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
self.assertRaises(re.error, re.sub, 'x', r'\400', 'x') self.checkTemplateError('x', r'\400', 'x',
self.assertRaises(re.error, re.sub, 'x', r'\777', 'x') r'octal escape value \400 outside of '
r'range 0-0o377', 0)
self.checkTemplateError('x', r'\777', 'x',
r'octal escape value \777 outside of '
r'range 0-0o377', 0)
self.assertRaises(re.error, re.sub, 'x', r'\1', 'x') self.checkTemplateError('x', r'\1', 'x', 'invalid group reference')
self.assertRaises(re.error, re.sub, 'x', r'\8', 'x') self.checkTemplateError('x', r'\8', 'x', 'invalid group reference')
self.assertRaises(re.error, re.sub, 'x', r'\9', 'x') self.checkTemplateError('x', r'\9', 'x', 'invalid group reference')
self.assertRaises(re.error, re.sub, 'x', r'\11', 'x') self.checkTemplateError('x', r'\11', 'x', 'invalid group reference')
self.assertRaises(re.error, re.sub, 'x', r'\18', 'x') self.checkTemplateError('x', r'\18', 'x', 'invalid group reference')
self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x') self.checkTemplateError('x', r'\1a', 'x', 'invalid group reference')
self.assertRaises(re.error, re.sub, 'x', r'\90', 'x') self.checkTemplateError('x', r'\90', 'x', 'invalid group reference')
self.assertRaises(re.error, re.sub, 'x', r'\99', 'x') self.checkTemplateError('x', r'\99', 'x', 'invalid group reference')
self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8' self.checkTemplateError('x', r'\118', 'x', 'invalid group reference') # r'\11' + '8'
self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x') self.checkTemplateError('x', r'\11a', 'x', 'invalid group reference')
self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1' self.checkTemplateError('x', r'\181', 'x', 'invalid group reference') # r'\18' + '1'
self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0' self.checkTemplateError('x', r'\800', 'x', 'invalid group reference') # r'\80' + '0'
# in python2.3 (etc), these loop endlessly in sre_parser.py # in python2.3 (etc), these loop endlessly in sre_parser.py
self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x') self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
@ -198,47 +221,65 @@ class ReTests(unittest.TestCase):
re.compile('(?P<a>x)(?P=a)(?(a)y)') re.compile('(?P<a>x)(?P=a)(?(a)y)')
re.compile('(?P<a1>x)(?P=a1)(?(a1)y)') re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
re.compile('(?P<a1>x)\1(?(1)y)') re.compile('(?P<a1>x)\1(?(1)y)')
self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)') self.checkPatternError('(?P<a>)(?P<a>)',
self.assertRaises(re.error, re.compile, '(?Px)') "redefinition of group name 'a' as group 2; "
self.assertRaises(re.error, re.compile, '(?P=)') "was group 1")
self.assertRaises(re.error, re.compile, '(?P=1)') self.checkPatternError('(?Pxy)', 'unknown extension ?Px')
self.assertRaises(re.error, re.compile, '(?P=a)') self.checkPatternError('(?P<a>)(?P=a', 'missing ), unterminated name', 11)
self.assertRaises(re.error, re.compile, '(?P=a1)') self.checkPatternError('(?P=', 'missing group name', 4)
self.assertRaises(re.error, re.compile, '(?P=a.)') self.checkPatternError('(?P=)', 'missing group name', 4)
self.assertRaises(re.error, re.compile, '(?P<)') self.checkPatternError('(?P=1)', "bad character in group name '1'", 4)
self.assertRaises(re.error, re.compile, '(?P<>)') self.checkPatternError('(?P=a)', "unknown group name 'a'")
self.assertRaises(re.error, re.compile, '(?P<1>)') self.checkPatternError('(?P=a1)', "unknown group name 'a1'")
self.assertRaises(re.error, re.compile, '(?P<a.>)') self.checkPatternError('(?P=a.)', "bad character in group name 'a.'", 4)
self.assertRaises(re.error, re.compile, '(?())') self.checkPatternError('(?P<)', 'missing >, unterminated name', 4)
self.assertRaises(re.error, re.compile, '(?(a))') self.checkPatternError('(?P<a', 'missing >, unterminated name', 4)
self.assertRaises(re.error, re.compile, '(?(1a))') self.checkPatternError('(?P<', 'missing group name', 4)
self.assertRaises(re.error, re.compile, '(?(a.))') self.checkPatternError('(?P<>)', 'missing group name', 4)
self.checkPatternError(r'(?P<1>)', "bad character in group name '1'", 4)
self.checkPatternError(r'(?P<a.>)', "bad character in group name 'a.'", 4)
self.checkPatternError(r'(?(', 'missing group name', 3)
self.checkPatternError(r'(?())', 'missing group name', 3)
self.checkPatternError(r'(?(a))', "unknown group name 'a'", 3)
self.checkPatternError(r'(?(-1))', "bad character in group name '-1'", 3)
self.checkPatternError(r'(?(1a))', "bad character in group name '1a'", 3)
self.checkPatternError(r'(?(a.))', "bad character in group name 'a.'", 3)
# New valid/invalid identifiers in Python 3 # New valid/invalid identifiers in Python 3
re.compile('(?P<µ>x)(?P=µ)(?(µ)y)') re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)') re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
self.assertRaises(re.error, re.compile, '(?P<©>x)') self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
# Support > 100 groups. # Support > 100 groups.
pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1)) pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
pat = '(?:%s)(?(200)z|t)' % pat pat = '(?:%s)(?(200)z|t)' % pat
self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5)) self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
def test_symbolic_refs(self): def test_symbolic_refs(self):
self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx') self.checkTemplateError('(?P<a>x)', '\g<a', 'xx',
self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<', 'xx') 'missing >, unterminated name', 3)
self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g', 'xx') self.checkTemplateError('(?P<a>x)', '\g<', 'xx',
self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a a>', 'xx') 'missing group name', 3)
self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<>', 'xx') self.checkTemplateError('(?P<a>x)', '\g', 'xx', 'missing <', 2)
self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<1a1>', 'xx') self.checkTemplateError('(?P<a>x)', '\g<a a>', 'xx',
self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<2>', 'xx') "bad character in group name 'a a'", 3)
self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\2', 'xx') self.checkTemplateError('(?P<a>x)', '\g<>', 'xx',
self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx') 'missing group name', 3)
self.checkTemplateError('(?P<a>x)', '\g<1a1>', 'xx',
"bad character in group name '1a1'", 3)
self.checkTemplateError('(?P<a>x)', r'\g<2>', 'xx',
'invalid group reference')
self.checkTemplateError('(?P<a>x)', r'\2', 'xx',
'invalid group reference')
with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"):
re.sub('(?P<a>x)', '\g<ab>', 'xx')
self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '') self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '') self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '')
self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx') self.checkTemplateError('(?P<a>x)', '\g<-1>', 'xx',
"bad character in group name '-1'", 3)
# New valid/invalid identifiers in Python 3 # New valid/invalid identifiers in Python 3
self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx') self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx') self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<©>', 'xx') self.checkTemplateError('(?P<a>x)', '\g<©>', 'xx',
"bad character in group name '©'", 3)
# Support > 100 groups. # Support > 100 groups.
pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1)) pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
self.assertEqual(re.sub(pat, '\g<200>', 'xc8yzxc8y'), 'c8zc8') self.assertEqual(re.sub(pat, '\g<200>', 'xc8yzxc8y'), 'c8zc8')
@ -444,6 +485,19 @@ class ReTests(unittest.TestCase):
pat = '(?:%s)(?(200)z)' % pat pat = '(?:%s)(?(200)z)' % pat
self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5)) self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
self.checkPatternError(r'(?P<a>)(?(0))', 'bad group number', 10)
self.checkPatternError(r'()(?(1)a|b',
'missing ), unterminated subpattern', 2)
self.checkPatternError(r'()(?(1)a|b|c)',
'conditional backref with more than '
'two branches', 10)
def test_re_groupref_overflow(self):
self.checkTemplateError('()', '\g<%s>' % sre_constants.MAXGROUPS, 'xx',
'invalid group reference', 3)
self.checkPatternError(r'(?P<a>)(?(%d))' % sre_constants.MAXGROUPS,
'invalid group reference', 10)
def test_re_groupref(self): def test_re_groupref(self):
self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(), self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
('|', 'a')) ('|', 'a'))
@ -456,6 +510,8 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(), self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
(None, None)) (None, None))
self.checkPatternError(r'(abc\1)', 'cannot refer to an open group', 4)
def test_groupdict(self): def test_groupdict(self):
self.assertEqual(re.match('(?P<first>first) (?P<second>second)', self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
'first second').groupdict(), 'first second').groupdict(),
@ -493,6 +549,7 @@ class ReTests(unittest.TestCase):
self.assertTrue(re.match("^x{3}$", "xxx")) self.assertTrue(re.match("^x{3}$", "xxx"))
self.assertTrue(re.match("^x{1,3}$", "xxx")) self.assertTrue(re.match("^x{1,3}$", "xxx"))
self.assertTrue(re.match("^x{3,3}$", "xxx"))
self.assertTrue(re.match("^x{1,4}$", "xxx")) self.assertTrue(re.match("^x{1,4}$", "xxx"))
self.assertTrue(re.match("^x{3,4}?$", "xxx")) self.assertTrue(re.match("^x{3,4}?$", "xxx"))
self.assertTrue(re.match("^x{3}?$", "xxx")) self.assertTrue(re.match("^x{3}?$", "xxx"))
@ -503,6 +560,9 @@ class ReTests(unittest.TestCase):
self.assertIsNone(re.match("^x{}$", "xxx")) self.assertIsNone(re.match("^x{}$", "xxx"))
self.assertTrue(re.match("^x{}$", "x{}")) self.assertTrue(re.match("^x{}$", "x{}"))
self.checkPatternError(r'x{2,1}',
'min repeat greater than max repeat', 2)
def test_getattr(self): def test_getattr(self):
self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)") self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U) self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
@ -550,7 +610,7 @@ class ReTests(unittest.TestCase):
b"1aa! a", re.LOCALE).group(0), b"1aa! a") b"1aa! a", re.LOCALE).group(0), b"1aa! a")
def test_other_escapes(self): def test_other_escapes(self):
self.assertRaises(re.error, re.compile, "\\") self.checkPatternError("\\", 'bad escape (end of pattern)', 0)
self.assertEqual(re.match(r"\(", '(').group(), '(') self.assertEqual(re.match(r"\(", '(').group(), '(')
self.assertIsNone(re.match(r"\(", ')')) self.assertIsNone(re.match(r"\(", ')'))
self.assertEqual(re.match(r"\\", '\\').group(), '\\') self.assertEqual(re.match(r"\\", '\\').group(), '\\')
@ -875,15 +935,17 @@ class ReTests(unittest.TestCase):
self.assertTrue(re.match(r"\08", "\0008")) self.assertTrue(re.match(r"\08", "\0008"))
self.assertTrue(re.match(r"\01", "\001")) self.assertTrue(re.match(r"\01", "\001"))
self.assertTrue(re.match(r"\018", "\0018")) self.assertTrue(re.match(r"\018", "\0018"))
self.assertRaises(re.error, re.match, r"\567", "") self.checkPatternError(r"\567",
self.assertRaises(re.error, re.match, r"\911", "") r'octal escape value \567 outside of '
self.assertRaises(re.error, re.match, r"\x1", "") r'range 0-0o377', 0)
self.assertRaises(re.error, re.match, r"\x1z", "") self.checkPatternError(r"\911", 'invalid group reference', 0)
self.assertRaises(re.error, re.match, r"\u123", "") self.checkPatternError(r"\x1", r'incomplete escape \x1', 0)
self.assertRaises(re.error, re.match, r"\u123z", "") self.checkPatternError(r"\x1z", r'incomplete escape \x1', 0)
self.assertRaises(re.error, re.match, r"\U0001234", "") self.checkPatternError(r"\u123", r'incomplete escape \u123', 0)
self.assertRaises(re.error, re.match, r"\U0001234z", "") self.checkPatternError(r"\u123z", r'incomplete escape \u123', 0)
self.assertRaises(re.error, re.match, r"\U00110000", "") self.checkPatternError(r"\U0001234", r'incomplete escape \U0001234', 0)
self.checkPatternError(r"\U0001234z", r'incomplete escape \U0001234', 0)
self.checkPatternError(r"\U00110000", r'bad escape \U00110000', 0)
def test_sre_character_class_literals(self): def test_sre_character_class_literals(self):
for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]: for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
@ -903,12 +965,14 @@ class ReTests(unittest.TestCase):
self.assertTrue(re.match(r"[\U%08x]" % i, chr(i))) self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0")) self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z")) self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
self.assertRaises(re.error, re.match, r"[\567]", "") self.checkPatternError(r"[\567]",
self.assertRaises(re.error, re.match, r"[\911]", "") r'octal escape value \567 outside of '
self.assertRaises(re.error, re.match, r"[\x1z]", "") r'range 0-0o377', 1)
self.assertRaises(re.error, re.match, r"[\u123z]", "") self.checkPatternError(r"[\911]", r'bad escape \9', 1)
self.assertRaises(re.error, re.match, r"[\U0001234z]", "") self.checkPatternError(r"[\x1z]", r'incomplete escape \x1', 1)
self.assertRaises(re.error, re.match, r"[\U00110000]", "") self.checkPatternError(r"[\u123z]", r'incomplete escape \u123', 1)
self.checkPatternError(r"[\U0001234z]", r'incomplete escape \U0001234', 1)
self.checkPatternError(r"[\U00110000]", r'bad escape \U00110000', 1)
self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e")) self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
def test_sre_byte_literals(self): def test_sre_byte_literals(self):
@ -927,10 +991,12 @@ class ReTests(unittest.TestCase):
self.assertTrue(re.match(br"\08", b"\0008")) self.assertTrue(re.match(br"\08", b"\0008"))
self.assertTrue(re.match(br"\01", b"\001")) self.assertTrue(re.match(br"\01", b"\001"))
self.assertTrue(re.match(br"\018", b"\0018")) self.assertTrue(re.match(br"\018", b"\0018"))
self.assertRaises(re.error, re.match, br"\567", b"") self.checkPatternError(br"\567",
self.assertRaises(re.error, re.match, br"\911", b"") r'octal escape value \567 outside of '
self.assertRaises(re.error, re.match, br"\x1", b"") r'range 0-0o377', 0)
self.assertRaises(re.error, re.match, br"\x1z", b"") self.checkPatternError(br"\911", 'invalid group reference', 0)
self.checkPatternError(br"\x1", r'incomplete escape \x1', 0)
self.checkPatternError(br"\x1z", r'incomplete escape \x1', 0)
def test_sre_byte_class_literals(self): def test_sre_byte_class_literals(self):
for i in [0, 8, 16, 32, 64, 127, 128, 255]: for i in [0, 8, 16, 32, 64, 127, 128, 255]:
@ -946,9 +1012,22 @@ class ReTests(unittest.TestCase):
self.assertTrue(re.match(br"[\u1234]", b'u')) self.assertTrue(re.match(br"[\u1234]", b'u'))
with self.assertWarns(DeprecationWarning): with self.assertWarns(DeprecationWarning):
self.assertTrue(re.match(br"[\U00012345]", b'U')) self.assertTrue(re.match(br"[\U00012345]", b'U'))
self.assertRaises(re.error, re.match, br"[\567]", b"") self.checkPatternError(br"[\567]",
self.assertRaises(re.error, re.match, br"[\911]", b"") r'octal escape value \567 outside of '
self.assertRaises(re.error, re.match, br"[\x1z]", b"") r'range 0-0o377', 1)
self.checkPatternError(br"[\911]", r'bad escape \9', 1)
self.checkPatternError(br"[\x1z]", r'incomplete escape \x1', 1)
def test_character_set_errors(self):
self.checkPatternError(r'[', 'unterminated character set', 0)
self.checkPatternError(r'[^', 'unterminated character set', 0)
self.checkPatternError(r'[a', 'unterminated character set', 0)
# bug 545855 -- This pattern failed to cause a compile error as it
# should, instead provoking a TypeError.
self.checkPatternError(r"[a-", 'unterminated character set', 0)
self.checkPatternError(r"[\w-b]", r'bad character range \w-b', 1)
self.checkPatternError(r"[a-\w]", r'bad character range a-\w', 1)
self.checkPatternError(r"[b-a]", 'bad character range b-a', 1)
def test_bug_113254(self): def test_bug_113254(self):
self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1) self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
@ -963,11 +1042,6 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a') self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
self.assertEqual(re.match("((a))", "a").lastindex, 1) self.assertEqual(re.match("((a))", "a").lastindex, 1)
def test_bug_545855(self):
# bug 545855 -- This pattern failed to cause a compile error as it
# should, instead provoking a TypeError.
self.assertRaises(re.error, re.compile, 'foo[a-')
def test_bug_418626(self): def test_bug_418626(self):
# bugs 418626 et al. -- Testing Greg Chapman's addition of op code # bugs 418626 et al. -- Testing Greg Chapman's addition of op code
# SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
@ -991,6 +1065,24 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x') self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x') self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
def test_nothing_to_repeat(self):
for reps in '*', '+', '?', '{1,2}':
for mod in '', '?':
self.checkPatternError('%s%s' % (reps, mod),
'nothing to repeat', 0)
self.checkPatternError('(?:%s%s)' % (reps, mod),
'nothing to repeat', 3)
def test_multiple_repeat(self):
for outer_reps in '*', '+', '{1,2}':
for outer_mod in '', '?':
outer_op = outer_reps + outer_mod
for inner_reps in '*', '+', '?', '{1,2}':
for inner_mod in '', '?':
inner_op = inner_reps + inner_mod
self.checkPatternError(r'x%s%s' % (inner_op, outer_op),
'multiple repeat', 1 + len(inner_op))
def test_unlimited_zero_width_repeat(self): def test_unlimited_zero_width_repeat(self):
# Issue #9669 # Issue #9669
self.assertIsNone(re.match(r'(?:a?)*y', 'z')) self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
@ -1381,13 +1473,13 @@ class ReTests(unittest.TestCase):
def test_backref_group_name_in_exception(self): def test_backref_group_name_in_exception(self):
# Issue 17341: Poor error message when compiling invalid regex # Issue 17341: Poor error message when compiling invalid regex
with self.assertRaisesRegex(sre_constants.error, '<foo>'): self.checkPatternError('(?P=<foo>)',
re.compile('(?P=<foo>)') "bad character in group name '<foo>'", 4)
def test_group_name_in_exception(self): def test_group_name_in_exception(self):
# Issue 17341: Poor error message when compiling invalid regex # Issue 17341: Poor error message when compiling invalid regex
with self.assertRaisesRegex(sre_constants.error, '\?foo'): self.checkPatternError('(?P<?foo>)',
re.compile('(?P<?foo>)') "bad character in group name '?foo'", 4)
def test_issue17998(self): def test_issue17998(self):
for reps in '*', '+', '?', '{1}': for reps in '*', '+', '?', '{1}':
@ -1556,6 +1648,19 @@ SUBPATTERN None
self.assertIn(' at position 77', str(err)) self.assertIn(' at position 77', str(err))
self.assertIn('(line 5, column 17)', str(err)) self.assertIn('(line 5, column 17)', str(err))
def test_misc_errors(self):
self.checkPatternError(r'(', 'missing ), unterminated subpattern', 0)
self.checkPatternError(r'((a|b)', 'missing ), unterminated subpattern', 0)
self.checkPatternError(r'(a|b))', 'unbalanced parenthesis', 5)
self.checkPatternError(r'(?P', 'unexpected end of pattern', 3)
self.checkPatternError(r'(?z)', 'unknown extension ?z', 1)
self.checkPatternError(r'(?iz)', 'unknown flag', 3)
self.checkPatternError(r'(?i', 'missing )', 3)
self.checkPatternError(r'(?#abc', 'missing ), unterminated comment', 0)
self.checkPatternError(r'(?<', 'unexpected end of pattern', 3)
self.checkPatternError(r'(?<>)', 'unknown extension ?<>', 1)
self.checkPatternError(r'(?', 'unexpected end of pattern', 2)
class PatternReprTests(unittest.TestCase): class PatternReprTests(unittest.TestCase):
def check(self, pattern, expected): def check(self, pattern, expected):

View File

@ -30,6 +30,8 @@ Core and Builtins
Library Library
------- -------
- Issue #22364: Improved some re error messages using regex for hints.
- Issue #23742: ntpath.expandvars() no longer loses unbalanced single quotes. - Issue #23742: ntpath.expandvars() no longer loses unbalanced single quotes.
- Issue #21717: The zipfile.ZipFile.open function now supports 'x' (exclusive - Issue #21717: The zipfile.ZipFile.open function now supports 'x' (exclusive

View File

@ -315,7 +315,7 @@ getstring(PyObject* string, Py_ssize_t* p_length,
/* get pointer to byte string buffer */ /* get pointer to byte string buffer */
if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) { if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) {
PyErr_SetString(PyExc_TypeError, "expected string or buffer"); PyErr_SetString(PyExc_TypeError, "expected string or bytes-like object");
return NULL; return NULL;
} }
@ -359,12 +359,12 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
if (isbytes && pattern->isbytes == 0) { if (isbytes && pattern->isbytes == 0) {
PyErr_SetString(PyExc_TypeError, PyErr_SetString(PyExc_TypeError,
"can't use a string pattern on a bytes-like object"); "cannot use a string pattern on a bytes-like object");
goto err; goto err;
} }
if (!isbytes && pattern->isbytes > 0) { if (!isbytes && pattern->isbytes > 0) {
PyErr_SetString(PyExc_TypeError, PyErr_SetString(PyExc_TypeError,
"can't use a bytes pattern on a string-like object"); "cannot use a bytes pattern on a string-like object");
goto err; goto err;
} }