mirror of https://github.com/python/cpython
Use unicodedata instead of eval.
This commit is contained in:
parent
c0a268962a
commit
2272cec13b
|
@ -13,7 +13,7 @@
|
||||||
# XXX: show string offset and offending character for all errors
|
# XXX: show string offset and offending character for all errors
|
||||||
|
|
||||||
from sre_constants import *
|
from sre_constants import *
|
||||||
from ast import literal_eval
|
import unicodedata
|
||||||
|
|
||||||
SPECIAL_CHARS = ".\\[{()*+?^$|"
|
SPECIAL_CHARS = ".\\[{()*+?^$|"
|
||||||
REPEAT_CHARS = "*+?{"
|
REPEAT_CHARS = "*+?{"
|
||||||
|
@ -26,10 +26,6 @@ ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
|
||||||
|
|
||||||
WHITESPACE = frozenset(" \t\n\r\v\f")
|
WHITESPACE = frozenset(" \t\n\r\v\f")
|
||||||
|
|
||||||
UNICODE_NAME = ASCIILETTERS | DIGITS | frozenset(' -')
|
|
||||||
CLOSING_BRACE = frozenset("}")
|
|
||||||
OPENING_BRACE = frozenset("{")
|
|
||||||
|
|
||||||
|
|
||||||
_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
|
_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
|
||||||
_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
|
_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
|
||||||
|
@ -270,19 +266,19 @@ class Tokenizer:
|
||||||
result += c
|
result += c
|
||||||
self.__next()
|
self.__next()
|
||||||
return result
|
return result
|
||||||
def getuntil(self, terminator):
|
def getuntil(self, terminator, name):
|
||||||
result = ''
|
result = ''
|
||||||
while True:
|
while True:
|
||||||
c = self.next
|
c = self.next
|
||||||
self.__next()
|
self.__next()
|
||||||
if c is None:
|
if c is None:
|
||||||
if not result:
|
if not result:
|
||||||
raise self.error("missing group name")
|
raise self.error("missing " + name)
|
||||||
raise self.error("missing %s, unterminated name" % terminator,
|
raise self.error("missing %s, unterminated name" % terminator,
|
||||||
len(result))
|
len(result))
|
||||||
if c == terminator:
|
if c == terminator:
|
||||||
if not result:
|
if not result:
|
||||||
raise self.error("missing group name", 1)
|
raise self.error("missing " + name, 1)
|
||||||
break
|
break
|
||||||
result += c
|
result += c
|
||||||
return result
|
return result
|
||||||
|
@ -330,14 +326,14 @@ def _class_escape(source, escape):
|
||||||
return LITERAL, c
|
return LITERAL, c
|
||||||
elif c == "N" and source.istext:
|
elif c == "N" and source.istext:
|
||||||
# named unicode escape e.g. \N{EM DASH}
|
# named unicode escape e.g. \N{EM DASH}
|
||||||
escape += source.getwhile(1, OPENING_BRACE)
|
if not source.match('{'):
|
||||||
escape += source.getwhile(100, UNICODE_NAME)
|
raise source.error("missing {")
|
||||||
escape += source.getwhile(1, CLOSING_BRACE)
|
charname = source.getuntil('}', 'character name')
|
||||||
try:
|
try:
|
||||||
c = ord(literal_eval('"%s"' % escape))
|
c = ord(unicodedata.lookup(charname))
|
||||||
except SyntaxError:
|
except KeyError:
|
||||||
charname = escape[2:].strip('{}')
|
raise source.error("undefined character name %r" % charname,
|
||||||
raise source.error("unknown Unicode character name %s" % charname, len(escape))
|
len(charname) + len(r'\N{}'))
|
||||||
return LITERAL, c
|
return LITERAL, c
|
||||||
elif c in OCTDIGITS:
|
elif c in OCTDIGITS:
|
||||||
# octal escape (up to three digits)
|
# octal escape (up to three digits)
|
||||||
|
@ -389,14 +385,14 @@ def _escape(source, escape, state):
|
||||||
return LITERAL, c
|
return LITERAL, c
|
||||||
elif c == "N" and source.istext:
|
elif c == "N" and source.istext:
|
||||||
# named unicode escape e.g. \N{EM DASH}
|
# named unicode escape e.g. \N{EM DASH}
|
||||||
escape += source.getwhile(1, OPENING_BRACE)
|
if not source.match('{'):
|
||||||
escape += source.getwhile(100, UNICODE_NAME)
|
raise source.error("missing {")
|
||||||
escape += source.getwhile(1, CLOSING_BRACE)
|
charname = source.getuntil('}', 'character name')
|
||||||
try:
|
try:
|
||||||
c = ord(literal_eval('"%s"' % escape))
|
c = ord(unicodedata.lookup(charname))
|
||||||
except SyntaxError:
|
except KeyError:
|
||||||
charname = escape[2:].strip('{}')
|
raise source.error("undefined character name %r" % charname,
|
||||||
raise source.error("unknown Unicode character name %s" % charname, len(escape))
|
len(charname) + len(r'\N{}'))
|
||||||
return LITERAL, c
|
return LITERAL, c
|
||||||
elif c == "0":
|
elif c == "0":
|
||||||
# octal escape
|
# octal escape
|
||||||
|
@ -707,13 +703,13 @@ def _parse(source, state, verbose, nested, first=False):
|
||||||
# python extensions
|
# python extensions
|
||||||
if sourcematch("<"):
|
if sourcematch("<"):
|
||||||
# named group: skip forward to end of name
|
# named group: skip forward to end of name
|
||||||
name = source.getuntil(">")
|
name = source.getuntil(">", "group name")
|
||||||
if not name.isidentifier():
|
if not name.isidentifier():
|
||||||
msg = "bad character in group name %r" % name
|
msg = "bad character in group name %r" % name
|
||||||
raise source.error(msg, len(name) + 1)
|
raise source.error(msg, len(name) + 1)
|
||||||
elif sourcematch("="):
|
elif sourcematch("="):
|
||||||
# named backreference
|
# named backreference
|
||||||
name = source.getuntil(")")
|
name = source.getuntil(")", "group name")
|
||||||
if not name.isidentifier():
|
if not name.isidentifier():
|
||||||
msg = "bad character in group name %r" % name
|
msg = "bad character in group name %r" % name
|
||||||
raise source.error(msg, len(name) + 1)
|
raise source.error(msg, len(name) + 1)
|
||||||
|
@ -776,7 +772,7 @@ def _parse(source, state, verbose, nested, first=False):
|
||||||
|
|
||||||
elif char == "(":
|
elif char == "(":
|
||||||
# conditional backreference group
|
# conditional backreference group
|
||||||
condname = source.getuntil(")")
|
condname = source.getuntil(")", "group name")
|
||||||
if condname.isidentifier():
|
if condname.isidentifier():
|
||||||
condgroup = state.groupdict.get(condname)
|
condgroup = state.groupdict.get(condname)
|
||||||
if condgroup is None:
|
if condgroup is None:
|
||||||
|
@ -1005,7 +1001,7 @@ def parse_template(source, pattern):
|
||||||
name = ""
|
name = ""
|
||||||
if not s.match("<"):
|
if not s.match("<"):
|
||||||
raise s.error("missing <")
|
raise s.error("missing <")
|
||||||
name = s.getuntil(">")
|
name = s.getuntil(">", "group name")
|
||||||
if name.isidentifier():
|
if name.isidentifier():
|
||||||
try:
|
try:
|
||||||
index = groupindex[name]
|
index = groupindex[name]
|
||||||
|
|
Loading…
Reference in New Issue