Use unicodedata instead of eval.

This commit is contained in:
Serhiy Storchaka 2018-02-08 18:51:45 +02:00
parent c0a268962a
commit 2272cec13b
1 changed file with 22 additions and 26 deletions

View File

@ -13,7 +13,7 @@
# XXX: show string offset and offending character for all errors # XXX: show string offset and offending character for all errors
from sre_constants import * from sre_constants import *
from ast import literal_eval import unicodedata
SPECIAL_CHARS = ".\\[{()*+?^$|" SPECIAL_CHARS = ".\\[{()*+?^$|"
REPEAT_CHARS = "*+?{" REPEAT_CHARS = "*+?{"
@ -26,10 +26,6 @@ ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
WHITESPACE = frozenset(" \t\n\r\v\f") WHITESPACE = frozenset(" \t\n\r\v\f")
UNICODE_NAME = ASCIILETTERS | DIGITS | frozenset(' -')
CLOSING_BRACE = frozenset("}")
OPENING_BRACE = frozenset("{")
_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT}) _REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY}) _UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
@ -270,19 +266,19 @@ class Tokenizer:
result += c result += c
self.__next() self.__next()
return result return result
def getuntil(self, terminator): def getuntil(self, terminator, name):
result = '' result = ''
while True: while True:
c = self.next c = self.next
self.__next() self.__next()
if c is None: if c is None:
if not result: if not result:
raise self.error("missing group name") raise self.error("missing " + name)
raise self.error("missing %s, unterminated name" % terminator, raise self.error("missing %s, unterminated name" % terminator,
len(result)) len(result))
if c == terminator: if c == terminator:
if not result: if not result:
raise self.error("missing group name", 1) raise self.error("missing " + name, 1)
break break
result += c result += c
return result return result
@ -330,14 +326,14 @@ def _class_escape(source, escape):
return LITERAL, c return LITERAL, c
elif c == "N" and source.istext: elif c == "N" and source.istext:
# named unicode escape e.g. \N{EM DASH} # named unicode escape e.g. \N{EM DASH}
escape += source.getwhile(1, OPENING_BRACE) if not source.match('{'):
escape += source.getwhile(100, UNICODE_NAME) raise source.error("missing {")
escape += source.getwhile(1, CLOSING_BRACE) charname = source.getuntil('}', 'character name')
try: try:
c = ord(literal_eval('"%s"' % escape)) c = ord(unicodedata.lookup(charname))
except SyntaxError: except KeyError:
charname = escape[2:].strip('{}') raise source.error("undefined character name %r" % charname,
raise source.error("unknown Unicode character name %s" % charname, len(escape)) len(charname) + len(r'\N{}'))
return LITERAL, c return LITERAL, c
elif c in OCTDIGITS: elif c in OCTDIGITS:
# octal escape (up to three digits) # octal escape (up to three digits)
@ -389,14 +385,14 @@ def _escape(source, escape, state):
return LITERAL, c return LITERAL, c
elif c == "N" and source.istext: elif c == "N" and source.istext:
# named unicode escape e.g. \N{EM DASH} # named unicode escape e.g. \N{EM DASH}
escape += source.getwhile(1, OPENING_BRACE) if not source.match('{'):
escape += source.getwhile(100, UNICODE_NAME) raise source.error("missing {")
escape += source.getwhile(1, CLOSING_BRACE) charname = source.getuntil('}', 'character name')
try: try:
c = ord(literal_eval('"%s"' % escape)) c = ord(unicodedata.lookup(charname))
except SyntaxError: except KeyError:
charname = escape[2:].strip('{}') raise source.error("undefined character name %r" % charname,
raise source.error("unknown Unicode character name %s" % charname, len(escape)) len(charname) + len(r'\N{}'))
return LITERAL, c return LITERAL, c
elif c == "0": elif c == "0":
# octal escape # octal escape
@ -707,13 +703,13 @@ def _parse(source, state, verbose, nested, first=False):
# python extensions # python extensions
if sourcematch("<"): if sourcematch("<"):
# named group: skip forward to end of name # named group: skip forward to end of name
name = source.getuntil(">") name = source.getuntil(">", "group name")
if not name.isidentifier(): if not name.isidentifier():
msg = "bad character in group name %r" % name msg = "bad character in group name %r" % name
raise source.error(msg, len(name) + 1) raise source.error(msg, len(name) + 1)
elif sourcematch("="): elif sourcematch("="):
# named backreference # named backreference
name = source.getuntil(")") name = source.getuntil(")", "group name")
if not name.isidentifier(): if not name.isidentifier():
msg = "bad character in group name %r" % name msg = "bad character in group name %r" % name
raise source.error(msg, len(name) + 1) raise source.error(msg, len(name) + 1)
@ -776,7 +772,7 @@ def _parse(source, state, verbose, nested, first=False):
elif char == "(": elif char == "(":
# conditional backreference group # conditional backreference group
condname = source.getuntil(")") condname = source.getuntil(")", "group name")
if condname.isidentifier(): if condname.isidentifier():
condgroup = state.groupdict.get(condname) condgroup = state.groupdict.get(condname)
if condgroup is None: if condgroup is None:
@ -1005,7 +1001,7 @@ def parse_template(source, pattern):
name = "" name = ""
if not s.match("<"): if not s.match("<"):
raise s.error("missing <") raise s.error("missing <")
name = s.getuntil(">") name = s.getuntil(">", "group name")
if name.isidentifier(): if name.isidentifier():
try: try:
index = groupindex[name] index = groupindex[name]