Issue #22578: Added attributes to the re.error class.

This commit is contained in:
Serhiy Storchaka 2014-11-10 13:49:00 +02:00
parent eb99e51574
commit ad446d57a9
5 changed files with 174 additions and 62 deletions

View File

@ -733,13 +733,36 @@ form.
Clear the regular expression cache.
.. exception:: error
.. exception:: error(msg, pattern=None, pos=None)
Exception raised when a string passed to one of the functions here is not a
valid regular expression (for example, it might contain unmatched parentheses)
or when some other error occurs during compilation or matching. It is never an
error if a string contains no match for a pattern.
error if a string contains no match for a pattern. The error instance has
the following additional attributes:
.. attribute:: msg
The unformatted error message.
.. attribute:: pattern
The regular expression pattern being compiled (may be ``None``).
.. attribute:: pos
The index in *pattern* where compilation failed (may be ``None``).
.. attribute:: lineno
The line number corresponding to *pos*, counted from 1 (``None`` if *pattern* or *pos* is ``None``).
.. attribute:: colno
The column number corresponding to *pos*, counted from 1 (``None`` if *pattern* or *pos* is ``None``).
.. versionchanged:: 3.5
Added additional attributes.
.. _re-objects:

View File

@ -21,7 +21,35 @@ from _sre import MAXREPEAT, MAXGROUPS
# should this really be here?
class error(Exception):
    """Exception raised for an invalid regular expression.

    Attributes:
        msg: the unformatted error message.
        pattern: the pattern being processed, ``str`` or bytes-like
            (``None`` if not supplied).
        pos: index into *pattern* where the problem was detected
            (``None`` if not supplied).
        lineno: 1-based line of *pos*, or ``None`` when *pattern* or *pos*
            is ``None``.
        colno: 1-based column of *pos*, or ``None`` when *pattern* or *pos*
            is ``None``.
    """

    def __init__(self, msg, pattern=None, pos=None):
        self.msg = msg
        self.pattern = pattern
        self.pos = pos

        # Without both a pattern and a position there is nothing to locate:
        # the exception text is just the bare message.
        if pattern is None or pos is None:
            self.lineno = self.colno = None
            super().__init__(msg)
            return

        text = '%s at position %d' % (msg, pos)
        # Use a separator of the same type as the pattern so that
        # count()/rfind() work for both text and bytes patterns.
        newline = '\n' if isinstance(pattern, str) else b'\n'
        self.lineno = pattern.count(newline, 0, pos) + 1
        # rfind() yields -1 when no newline precedes pos, which makes the
        # column pos + 1 on the first line.
        self.colno = pos - pattern.rfind(newline, 0, pos)
        # Line/column details are only worth showing for multi-line patterns.
        if newline in pattern:
            text = '%s (line %d, column %d)' % (text, self.lineno,
                                                self.colno)
        super().__init__(text)
def linecol(doc, pos):
    """Return the 1-based ``(lineno, colno)`` pair for index *pos* in *doc*.

    *doc* is either a ``str`` or a bytes-like object; the newline separator
    used for counting is chosen to match its type.

    Args:
        doc: the text (or bytes) being indexed.
        pos: zero-based index into *doc*.

    Returns:
        A ``(lineno, colno)`` tuple, both counted from 1.
    """
    # Match the separator type to the document so count()/rindex() accept it.
    newline = '\n' if isinstance(doc, str) else b'\n'
    lineno = doc.count(newline, 0, pos) + 1
    if lineno == 1:
        # No newline precedes pos: the column is simply the 1-based index.
        colno = pos + 1
    else:
        # Distance from the last newline before pos; rindex() cannot fail
        # here because at least one newline was counted above.
        colno = pos - doc.rindex(newline, 0, pos)
    return lineno, colno
class _NamedIntConstant(int):

View File

@ -81,8 +81,8 @@ class Pattern:
if name is not None:
ogid = self.groupdict.get(name, None)
if ogid is not None:
raise error("redefinition of group name %s as group %d; "
"was group %d" % (repr(name), gid, ogid))
raise error("redefinition of group name %r as group %d; "
"was group %d" % (name, gid, ogid))
self.groupdict[name] = gid
return gid
def closegroup(self, gid, p):
@ -206,24 +206,25 @@ class SubPattern:
class Tokenizer:
def __init__(self, string):
self.istext = isinstance(string, str)
self.string = string
if not self.istext:
string = str(string, 'latin1')
self.string = string
self.decoded_string = string
self.index = 0
self.__next()
def __next(self):
index = self.index
try:
char = self.string[index]
char = self.decoded_string[index]
except IndexError:
self.next = None
return
if char == "\\":
index += 1
try:
char += self.string[index]
char += self.decoded_string[index]
except IndexError:
raise error("bogus escape (end of line)")
raise self.error("bogus escape (end of line)") from None
self.index = index + 1
self.next = char
def match(self, char):
@ -250,15 +251,19 @@ class Tokenizer:
c = self.next
self.__next()
if c is None:
raise error("unterminated name")
raise self.error("unterminated name")
if c == terminator:
break
result += c
return result
def tell(self):
return self.index, self.next
return self.index - len(self.next or '')
def seek(self, index):
self.index, self.next = index
self.index = index
self.__next()
def error(self, msg, offset=0):
return error(msg, self.string, self.tell() - offset)
# The following three functions are not used in this module anymore, but we keep
# them here (with DeprecationWarnings) for backwards compatibility.
@ -322,8 +327,8 @@ def _class_escape(source, escape):
escape += source.getwhile(2, OCTDIGITS)
c = int(escape[1:], 8)
if c > 0o377:
raise error('octal escape value %r outside of '
'range 0-0o377' % escape)
raise source.error('octal escape value %r outside of '
'range 0-0o377' % escape, len(escape))
return LITERAL, c
elif c in DIGITS:
raise ValueError
@ -331,7 +336,7 @@ def _class_escape(source, escape):
return LITERAL, ord(escape[1])
except ValueError:
pass
raise error("bogus escape: %s" % repr(escape))
raise source.error("bogus escape: %r" % escape, len(escape))
def _escape(source, escape, state):
# handle escape code in expression
@ -377,21 +382,23 @@ def _escape(source, escape, state):
escape += source.get()
c = int(escape[1:], 8)
if c > 0o377:
raise error('octal escape value %r outside of '
'range 0-0o377' % escape)
raise source.error('octal escape value %r outside of '
'range 0-0o377' % escape,
len(escape))
return LITERAL, c
# not an octal escape, so this is a group reference
group = int(escape[1:])
if group < state.groups:
if not state.checkgroup(group):
raise error("cannot refer to open group")
raise source.error("cannot refer to open group",
len(escape))
return GROUPREF, group
raise ValueError
if len(escape) == 2:
return LITERAL, ord(escape[1])
except ValueError:
pass
raise error("bogus escape: %s" % repr(escape))
raise source.error("bogus escape: %r" % escape, len(escape))
def _parse_sub(source, state, nested=True):
# parse an alternation: a|b|c
@ -404,7 +411,7 @@ def _parse_sub(source, state, nested=True):
if not sourcematch("|"):
break
if nested and source.next is not None and source.next != ")":
raise error("pattern not properly closed")
raise source.error("pattern not properly closed")
if len(items) == 1:
return items[0]
@ -449,11 +456,11 @@ def _parse_sub_cond(source, state, condgroup):
if source.match("|"):
item_no = _parse(source, state)
if source.next == "|":
raise error("conditional backref with more than two branches")
raise source.error("conditional backref with more than two branches")
else:
item_no = None
if source.next is not None and source.next != ")":
raise error("pattern not properly closed")
raise source.error("pattern not properly closed")
subpattern = SubPattern(state)
subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
return subpattern
@ -510,7 +517,7 @@ def _parse(source, state):
while True:
this = sourceget()
if this is None:
raise error("unexpected end of regular expression")
raise source.error("unexpected end of regular expression")
if this == "]" and set != start:
break
elif this[0] == "\\":
@ -521,7 +528,7 @@ def _parse(source, state):
# potential range
this = sourceget()
if this is None:
raise error("unexpected end of regular expression")
raise source.error("unexpected end of regular expression")
if this == "]":
if code1[0] is IN:
code1 = code1[1][0]
@ -533,11 +540,11 @@ def _parse(source, state):
else:
code2 = LITERAL, _ord(this)
if code1[0] != LITERAL or code2[0] != LITERAL:
raise error("bad character range")
raise source.error("bad character range", len(this))
lo = code1[1]
hi = code2[1]
if hi < lo:
raise error("bad character range")
raise source.error("bad character range", len(this))
setappend((RANGE, (lo, hi)))
else:
if code1[0] is IN:
@ -555,6 +562,7 @@ def _parse(source, state):
elif this in REPEAT_CHARS:
# repeat previous item
here = source.tell()
if this == "?":
min, max = 0, 1
elif this == "*":
@ -566,7 +574,6 @@ def _parse(source, state):
if source.next == "}":
subpatternappend((LITERAL, _ord(this)))
continue
here = source.tell()
min, max = 0, MAXREPEAT
lo = hi = ""
while source.next in DIGITS:
@ -589,18 +596,21 @@ def _parse(source, state):
if max >= MAXREPEAT:
raise OverflowError("the repetition number is too large")
if max < min:
raise error("bad repeat interval")
raise source.error("bad repeat interval",
source.tell() - here)
else:
raise error("not supported")
raise source.error("not supported", len(this))
# figure out which item to repeat
if subpattern:
item = subpattern[-1:]
else:
item = None
if not item or (_len(item) == 1 and item[0][0] == AT):
raise error("nothing to repeat")
raise source.error("nothing to repeat",
source.tell() - here + len(this))
if item[0][0] in _REPEATCODES:
raise error("multiple repeat")
raise source.error("multiple repeat",
source.tell() - here + len(this))
if sourcematch("?"):
subpattern[-1] = (MIN_REPEAT, (min, max, item))
else:
@ -618,7 +628,7 @@ def _parse(source, state):
# options
char = sourceget()
if char is None:
raise error("unexpected end of pattern")
raise self.error("unexpected end of pattern")
if char == "P":
# python extensions
if sourcematch("<"):
@ -626,28 +636,32 @@ def _parse(source, state):
name = source.getuntil(">")
group = 1
if not name:
raise error("missing group name")
raise source.error("missing group name", 1)
if not name.isidentifier():
raise error("bad character in group name %r" % name)
raise source.error("bad character in group name "
"%r" % name,
len(name) + 1)
elif sourcematch("="):
# named backreference
name = source.getuntil(")")
if not name:
raise error("missing group name")
raise source.error("missing group name", 1)
if not name.isidentifier():
raise error("bad character in backref group name "
"%r" % name)
raise source.error("bad character in backref "
"group name %r" % name,
len(name) + 1)
gid = state.groupdict.get(name)
if gid is None:
msg = "unknown group name: {0!r}".format(name)
raise error(msg)
raise source.error(msg, len(name) + 1)
subpatternappend((GROUPREF, gid))
continue
else:
char = sourceget()
if char is None:
raise error("unexpected end of pattern")
raise error("unknown specifier: ?P%s" % char)
raise source.error("unexpected end of pattern")
raise source.error("unknown specifier: ?P%s" % char,
len(char))
elif char == ":":
# non-capturing group
group = 2
@ -655,7 +669,7 @@ def _parse(source, state):
# comment
while True:
if source.next is None:
raise error("unbalanced parenthesis")
raise source.error("unbalanced parenthesis")
if sourceget() == ")":
break
continue
@ -665,11 +679,11 @@ def _parse(source, state):
if char == "<":
char = sourceget()
if char is None or char not in "=!":
raise error("syntax error")
raise source.error("syntax error")
dir = -1 # lookbehind
p = _parse_sub(source, state)
if not sourcematch(")"):
raise error("unbalanced parenthesis")
raise source.error("unbalanced parenthesis")
if char == "=":
subpatternappend((ASSERT, (dir, p)))
else:
@ -680,23 +694,26 @@ def _parse(source, state):
condname = source.getuntil(")")
group = 2
if not condname:
raise error("missing group name")
raise source.error("missing group name", 1)
if condname.isidentifier():
condgroup = state.groupdict.get(condname)
if condgroup is None:
msg = "unknown group name: {0!r}".format(condname)
raise error(msg)
raise source.error(msg, len(condname) + 1)
else:
try:
condgroup = int(condname)
if condgroup < 0:
raise ValueError
except ValueError:
raise error("bad character in group name")
raise source.error("bad character in group name",
len(condname) + 1)
if not condgroup:
raise error("bad group number")
raise source.error("bad group number",
len(condname) + 1)
if condgroup >= MAXGROUPS:
raise error("the group number is too large")
raise source.error("the group number is too large",
len(condname) + 1)
elif char in FLAGS:
# flags
state.flags |= FLAGS[char]
@ -704,20 +721,23 @@ def _parse(source, state):
state.flags |= FLAGS[sourceget()]
verbose = state.flags & SRE_FLAG_VERBOSE
else:
raise error("unexpected end of pattern " + char)
raise source.error("unexpected end of pattern")
if group:
# parse group contents
if group == 2:
# anonymous group
group = None
else:
group = state.opengroup(name)
try:
group = state.opengroup(name)
except error as err:
raise source.error(err.msg, len(name) + 1)
if condgroup:
p = _parse_sub_cond(source, state, condgroup)
else:
p = _parse_sub(source, state)
if not sourcematch(")"):
raise error("unbalanced parenthesis")
raise source.error("unbalanced parenthesis")
if group is not None:
state.closegroup(group, p)
subpatternappend((SUBPATTERN, (group, p)))
@ -725,10 +745,10 @@ def _parse(source, state):
while True:
char = sourceget()
if char is None:
raise error("unexpected end of pattern")
raise source.error("unexpected end of pattern")
if char == ")":
break
raise error("unknown extension")
raise source.error("unknown extension", len(char))
elif this == "^":
subpatternappend((AT, AT_BEGINNING))
@ -737,7 +757,7 @@ def _parse(source, state):
subpattern.append((AT, AT_END))
else:
raise error("parser error")
raise source.error("parser error", len(this))
return subpattern
@ -768,9 +788,10 @@ def parse(str, flags=0, pattern=None):
if source.next is not None:
if source.next == ")":
raise error("unbalanced parenthesis")
raise source.error("unbalanced parenthesis")
else:
raise error("bogus characters at end of regular expression")
raise source.error("bogus characters at end of regular expression",
len(tail))
if flags & SRE_FLAG_DEBUG:
p.dump()
@ -809,16 +830,18 @@ def parse_template(source, pattern):
if s.match("<"):
name = s.getuntil(">")
if not name:
raise error("missing group name")
raise s.error("missing group name", 1)
try:
index = int(name)
if index < 0:
raise error("negative group number")
raise s.error("negative group number", len(name) + 1)
if index >= MAXGROUPS:
raise error("the group number is too large")
raise s.error("the group number is too large",
len(name) + 1)
except ValueError:
if not name.isidentifier():
raise error("bad character in group name")
raise s.error("bad character in group name",
len(name) + 1)
try:
index = pattern.groupindex[name]
except KeyError:
@ -841,8 +864,8 @@ def parse_template(source, pattern):
isoctal = True
c = int(this[1:], 8)
if c > 0o377:
raise error('octal escape value %r outside of '
'range 0-0o377' % this)
raise s.error('octal escape value %r outside of '
'range 0-0o377' % this, len(this))
lappend(chr(c))
if not isoctal:
addgroup(int(this[1:]))

View File

@ -1419,6 +1419,42 @@ SUBPATTERN None
self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
def test_error(self):
with self.assertRaises(re.error) as cm:
re.compile('(\u20ac))')
err = cm.exception
self.assertIsInstance(err.pattern, str)
self.assertEqual(err.pattern, '(\u20ac))')
self.assertEqual(err.pos, 3)
self.assertEqual(err.lineno, 1)
self.assertEqual(err.colno, 4)
self.assertIn(err.msg, str(err))
self.assertIn(' at position 3', str(err))
self.assertNotIn(' at position 3', err.msg)
# Bytes pattern
with self.assertRaises(re.error) as cm:
re.compile(b'(\xa4))')
err = cm.exception
self.assertIsInstance(err.pattern, bytes)
self.assertEqual(err.pattern, b'(\xa4))')
self.assertEqual(err.pos, 3)
# Multiline pattern
with self.assertRaises(re.error) as cm:
re.compile("""
(
abc
)
)
(
""", re.VERBOSE)
err = cm.exception
self.assertEqual(err.pos, 77)
self.assertEqual(err.lineno, 5)
self.assertEqual(err.colno, 17)
self.assertIn(err.msg, str(err))
self.assertIn(' at position 77', str(err))
self.assertIn('(line 5, column 17)', str(err))
class PatternReprTests(unittest.TestCase):
def check(self, pattern, expected):

View File

@ -183,6 +183,8 @@ Core and Builtins
Library
-------
- Issue #22578: Added attributes to the re.error class.
- Issue #12728: Different Unicode characters having the same uppercase but
different lowercase are now matched in case-insensitive regular expressions.