Merged revisions 65544 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk ........ r65544 | guido.van.rossum | 2008-08-04 20:39:21 -0700 (Mon, 04 Aug 2008) | 28 lines Tracker issue 3487: sre "bytecode" verifier. This is a verifier for the binary code used by the _sre module (this is often called bytecode, though to distinguish it from Python bytecode I put it in quotes). I wrote this for Google App Engine, and am making the patch available as open source under the Apache 2 license. Below are the copyright statement and license, for completeness. # Copyright 2008 Google Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. It's not necessary to include these copyrights and bytecode in the source file. Google has signed a contributor's agreement with the PSF already. ........
This commit is contained in:
parent
11e18b0c2e
commit
10faf6a0a3
|
@ -465,7 +465,7 @@ class UnquotingTests(unittest.TestCase):
|
||||||
|
|
||||||
def test_unquote_with_unicode(self):
|
def test_unquote_with_unicode(self):
|
||||||
r = urllib.parse.unquote('br%C3%BCckner_sapporo_20050930.doc')
|
r = urllib.parse.unquote('br%C3%BCckner_sapporo_20050930.doc')
|
||||||
self.assertEqual(r, 'br\xc3\xbcckner_sapporo_20050930.doc')
|
self.assertEqual(r, 'br\u00FCckner_sapporo_20050930.doc')
|
||||||
|
|
||||||
class urlencode_Tests(unittest.TestCase):
|
class urlencode_Tests(unittest.TestCase):
|
||||||
"""Tests for urlencode()"""
|
"""Tests for urlencode()"""
|
||||||
|
|
|
@ -261,84 +261,74 @@ def urldefrag(url):
|
||||||
return url, ''
|
return url, ''
|
||||||
|
|
||||||
|
|
||||||
_hextochr = dict(('%02x' % i, chr(i)) for i in range(256))
|
def unquote_as_string (s, plus=False, charset=None):
|
||||||
_hextochr.update(('%02X' % i, chr(i)) for i in range(256))
|
if charset is None:
|
||||||
|
charset = "UTF-8"
|
||||||
|
return str(unquote_as_bytes(s, plus=plus), charset, 'strict')
|
||||||
|
|
||||||
|
def unquote_as_bytes (s, plus=False):
|
||||||
|
"""unquote('abc%20def') -> 'abc def'."""
|
||||||
|
if plus:
|
||||||
|
s = s.replace('+', ' ')
|
||||||
|
res = s.split('%')
|
||||||
|
res[0] = res[0].encode('ASCII', 'strict')
|
||||||
|
for i in range(1, len(res)):
|
||||||
|
res[i] = (bytes.fromhex(res[i][:2]) +
|
||||||
|
res[i][2:].encode('ASCII', 'strict'))
|
||||||
|
return b''.join(res)
|
||||||
|
|
||||||
|
_always_safe = (b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
||||||
|
b'abcdefghijklmnopqrstuvwxyz'
|
||||||
|
b'0123456789'
|
||||||
|
b'_.-')
|
||||||
|
|
||||||
|
_percent_code = ord('%')
|
||||||
|
|
||||||
|
_hextable = b'0123456789ABCDEF'
|
||||||
|
|
||||||
|
def quote_as_bytes(s, safe = '/', plus=False):
|
||||||
|
"""quote(b'abc@def') -> 'abc%40def'"""
|
||||||
|
|
||||||
|
if isinstance(s, str):
|
||||||
|
s = s.encode("UTF-8", "strict")
|
||||||
|
if not (isinstance(s, bytes) or isinstance(s, bytearray)):
|
||||||
|
raise ValueError("Argument to quote must be either bytes "
|
||||||
|
"or bytearray; string arguments will be "
|
||||||
|
"converted to UTF-8 bytes")
|
||||||
|
|
||||||
|
safeset = _always_safe + safe.encode('ASCII', 'strict')
|
||||||
|
if plus:
|
||||||
|
safeset += b' '
|
||||||
|
|
||||||
|
result = bytearray()
|
||||||
|
for i in s:
|
||||||
|
if i not in safeset:
|
||||||
|
result.append(_percent_code)
|
||||||
|
result.append(_hextable[(i >> 4) & 0xF])
|
||||||
|
result.append(_hextable[i & 0xF])
|
||||||
|
else:
|
||||||
|
result.append(i)
|
||||||
|
if plus:
|
||||||
|
result = result.replace(b' ', b'+')
|
||||||
|
return result
|
||||||
|
|
||||||
|
def quote_as_string(s, safe = '/', plus=False):
|
||||||
|
return str(quote_as_bytes(s, safe=safe, plus=plus), 'ASCII', 'strict')
|
||||||
|
|
||||||
|
# finally, define defaults for 'quote' and 'unquote'
|
||||||
|
|
||||||
|
def quote(s, safe='/'):
|
||||||
|
return quote_as_string(s, safe=safe)
|
||||||
|
|
||||||
|
def quote_plus(s, safe=''):
|
||||||
|
return quote_as_string(s, safe=safe, plus=True)
|
||||||
|
|
||||||
def unquote(s):
|
def unquote(s):
|
||||||
"""unquote('abc%20def') -> 'abc def'."""
|
return unquote_as_string(s)
|
||||||
res = s.split('%')
|
|
||||||
for i in range(1, len(res)):
|
|
||||||
item = res[i]
|
|
||||||
try:
|
|
||||||
res[i] = _hextochr[item[:2]] + item[2:]
|
|
||||||
except KeyError:
|
|
||||||
res[i] = '%' + item
|
|
||||||
except UnicodeDecodeError:
|
|
||||||
res[i] = chr(int(item[:2], 16)) + item[2:]
|
|
||||||
return "".join(res)
|
|
||||||
|
|
||||||
def unquote_plus(s):
|
def unquote_plus(s):
|
||||||
"""unquote('%7e/abc+def') -> '~/abc def'"""
|
return unquote_as_string(s, plus=True)
|
||||||
s = s.replace('+', ' ')
|
|
||||||
return unquote(s)
|
|
||||||
|
|
||||||
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
|
||||||
'abcdefghijklmnopqrstuvwxyz'
|
|
||||||
'0123456789' '_.-')
|
|
||||||
_safe_quoters= {}
|
|
||||||
|
|
||||||
class Quoter:
|
|
||||||
def __init__(self, safe):
|
|
||||||
self.cache = {}
|
|
||||||
self.safe = safe + always_safe
|
|
||||||
|
|
||||||
def __call__(self, c):
|
|
||||||
try:
|
|
||||||
return self.cache[c]
|
|
||||||
except KeyError:
|
|
||||||
if ord(c) < 256:
|
|
||||||
res = (c in self.safe) and c or ('%%%02X' % ord(c))
|
|
||||||
self.cache[c] = res
|
|
||||||
return res
|
|
||||||
else:
|
|
||||||
return "".join(['%%%02X' % i for i in c.encode("utf-8")])
|
|
||||||
|
|
||||||
def quote(s, safe = '/'):
|
|
||||||
"""quote('abc def') -> 'abc%20def'
|
|
||||||
|
|
||||||
Each part of a URL, e.g. the path info, the query, etc., has a
|
|
||||||
different set of reserved characters that must be quoted.
|
|
||||||
|
|
||||||
RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
|
|
||||||
the following reserved characters.
|
|
||||||
|
|
||||||
reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
|
|
||||||
"$" | ","
|
|
||||||
|
|
||||||
Each of these characters is reserved in some component of a URL,
|
|
||||||
but not necessarily in all of them.
|
|
||||||
|
|
||||||
By default, the quote function is intended for quoting the path
|
|
||||||
section of a URL. Thus, it will not encode '/'. This character
|
|
||||||
is reserved, but in typical usage the quote function is being
|
|
||||||
called on a path where the existing slash characters are used as
|
|
||||||
reserved characters.
|
|
||||||
"""
|
|
||||||
cachekey = (safe, always_safe)
|
|
||||||
try:
|
|
||||||
quoter = _safe_quoters[cachekey]
|
|
||||||
except KeyError:
|
|
||||||
quoter = Quoter(safe)
|
|
||||||
_safe_quoters[cachekey] = quoter
|
|
||||||
res = map(quoter, s)
|
|
||||||
return ''.join(res)
|
|
||||||
|
|
||||||
def quote_plus(s, safe = ''):
|
|
||||||
"""Quote the query fragment of a URL; replacing ' ' with '+'"""
|
|
||||||
if ' ' in s:
|
|
||||||
s = quote(s, safe + ' ')
|
|
||||||
return s.replace(' ', '+')
|
|
||||||
return quote(s, safe)
|
|
||||||
|
|
||||||
def urlencode(query,doseq=0):
|
def urlencode(query,doseq=0):
|
||||||
"""Encode a sequence of two-element tuples or dictionary into a URL query string.
|
"""Encode a sequence of two-element tuples or dictionary into a URL query string.
|
||||||
|
@ -387,7 +377,7 @@ def urlencode(query,doseq=0):
|
||||||
# is there a reasonable way to convert to ASCII?
|
# is there a reasonable way to convert to ASCII?
|
||||||
# encode generates a string, but "replace" or "ignore"
|
# encode generates a string, but "replace" or "ignore"
|
||||||
# lose information and "strict" can raise UnicodeError
|
# lose information and "strict" can raise UnicodeError
|
||||||
v = quote_plus(v.encode("ASCII","replace"))
|
v = quote_plus(v)
|
||||||
l.append(k + '=' + v)
|
l.append(k + '=' + v)
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
|
@ -474,7 +464,8 @@ def splituser(host):
|
||||||
_userprog = re.compile('^(.*)@(.*)$')
|
_userprog = re.compile('^(.*)@(.*)$')
|
||||||
|
|
||||||
match = _userprog.match(host)
|
match = _userprog.match(host)
|
||||||
if match: return map(unquote, match.group(1, 2))
|
if match:
|
||||||
|
return map(unquote, match.group(1, 2))
|
||||||
return None, host
|
return None, host
|
||||||
|
|
||||||
_passwdprog = None
|
_passwdprog = None
|
||||||
|
|
474
Modules/_sre.c
474
Modules/_sre.c
|
@ -2637,6 +2637,8 @@ static PyTypeObject Pattern_Type = {
|
||||||
pattern_members, /* tp_members */
|
pattern_members, /* tp_members */
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static int _validate(PatternObject *self); /* Forward */
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
_compile(PyObject* self_, PyObject* args)
|
_compile(PyObject* self_, PyObject* args)
|
||||||
{
|
{
|
||||||
|
@ -2695,9 +2697,481 @@ _compile(PyObject* self_, PyObject* args)
|
||||||
|
|
||||||
self->weakreflist = NULL;
|
self->weakreflist = NULL;
|
||||||
|
|
||||||
|
if (!_validate(self)) {
|
||||||
|
Py_DECREF(self);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
return (PyObject*) self;
|
return (PyObject*) self;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* -------------------------------------------------------------------- */
|
||||||
|
/* Code validation */
|
||||||
|
|
||||||
|
/* To learn more about this code, have a look at the _compile() function in
|
||||||
|
Lib/sre_compile.py. The validation functions below checks the code array
|
||||||
|
for conformance with the code patterns generated there.
|
||||||
|
|
||||||
|
The nice thing about the generated code is that it is position-independent:
|
||||||
|
all jumps are relative jumps forward. Also, jumps don't cross each other:
|
||||||
|
the target of a later jump is always earlier than the target of an earlier
|
||||||
|
jump. IOW, this is okay:
|
||||||
|
|
||||||
|
J---------J-------T--------T
|
||||||
|
\ \_____/ /
|
||||||
|
\______________________/
|
||||||
|
|
||||||
|
but this is not:
|
||||||
|
|
||||||
|
J---------J-------T--------T
|
||||||
|
\_________\_____/ /
|
||||||
|
\____________/
|
||||||
|
|
||||||
|
It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
|
||||||
|
bytes wide (the latter if Python is compiled for "wide" unicode support).
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Defining this one enables tracing of the validator */
|
||||||
|
#undef VVERBOSE
|
||||||
|
|
||||||
|
/* Trace macro for the validator */
|
||||||
|
#if defined(VVERBOSE)
|
||||||
|
#define VTRACE(v) printf v
|
||||||
|
#else
|
||||||
|
#define VTRACE(v)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Report failure */
|
||||||
|
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
|
||||||
|
|
||||||
|
/* Extract opcode, argument, or skip count from code array */
|
||||||
|
#define GET_OP \
|
||||||
|
do { \
|
||||||
|
VTRACE(("%p: ", code)); \
|
||||||
|
if (code >= end) FAIL; \
|
||||||
|
op = *code++; \
|
||||||
|
VTRACE(("%lu (op)\n", (unsigned long)op)); \
|
||||||
|
} while (0)
|
||||||
|
#define GET_ARG \
|
||||||
|
do { \
|
||||||
|
VTRACE(("%p= ", code)); \
|
||||||
|
if (code >= end) FAIL; \
|
||||||
|
arg = *code++; \
|
||||||
|
VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
|
||||||
|
} while (0)
|
||||||
|
#define GET_SKIP \
|
||||||
|
do { \
|
||||||
|
VTRACE(("%p= ", code)); \
|
||||||
|
if (code >= end) FAIL; \
|
||||||
|
skip = *code; \
|
||||||
|
VTRACE(("%lu (skip to %p)\n", \
|
||||||
|
(unsigned long)skip, code+skip)); \
|
||||||
|
if (code+skip < code || code+skip > end) \
|
||||||
|
FAIL; \
|
||||||
|
code++; \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
static int
|
||||||
|
_validate_charset(SRE_CODE *code, SRE_CODE *end)
|
||||||
|
{
|
||||||
|
/* Some variables are manipulated by the macros above */
|
||||||
|
SRE_CODE op;
|
||||||
|
SRE_CODE arg;
|
||||||
|
SRE_CODE offset;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
while (code < end) {
|
||||||
|
GET_OP;
|
||||||
|
switch (op) {
|
||||||
|
|
||||||
|
case SRE_OP_NEGATE:
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SRE_OP_LITERAL:
|
||||||
|
GET_ARG;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SRE_OP_RANGE:
|
||||||
|
GET_ARG;
|
||||||
|
GET_ARG;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SRE_OP_CHARSET:
|
||||||
|
offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
|
||||||
|
if (code+offset < code || code+offset > end)
|
||||||
|
FAIL;
|
||||||
|
code += offset;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SRE_OP_BIGCHARSET:
|
||||||
|
GET_ARG; /* Number of blocks */
|
||||||
|
offset = 256/sizeof(SRE_CODE); /* 256-byte table */
|
||||||
|
if (code+offset < code || code+offset > end)
|
||||||
|
FAIL;
|
||||||
|
/* Make sure that each byte points to a valid block */
|
||||||
|
for (i = 0; i < 256; i++) {
|
||||||
|
if (((unsigned char *)code)[i] >= arg)
|
||||||
|
FAIL;
|
||||||
|
}
|
||||||
|
code += offset;
|
||||||
|
offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
|
||||||
|
if (code+offset < code || code+offset > end)
|
||||||
|
FAIL;
|
||||||
|
code += offset;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SRE_OP_CATEGORY:
|
||||||
|
GET_ARG;
|
||||||
|
switch (arg) {
|
||||||
|
case SRE_CATEGORY_DIGIT:
|
||||||
|
case SRE_CATEGORY_NOT_DIGIT:
|
||||||
|
case SRE_CATEGORY_SPACE:
|
||||||
|
case SRE_CATEGORY_NOT_SPACE:
|
||||||
|
case SRE_CATEGORY_WORD:
|
||||||
|
case SRE_CATEGORY_NOT_WORD:
|
||||||
|
case SRE_CATEGORY_LINEBREAK:
|
||||||
|
case SRE_CATEGORY_NOT_LINEBREAK:
|
||||||
|
case SRE_CATEGORY_LOC_WORD:
|
||||||
|
case SRE_CATEGORY_LOC_NOT_WORD:
|
||||||
|
case SRE_CATEGORY_UNI_DIGIT:
|
||||||
|
case SRE_CATEGORY_UNI_NOT_DIGIT:
|
||||||
|
case SRE_CATEGORY_UNI_SPACE:
|
||||||
|
case SRE_CATEGORY_UNI_NOT_SPACE:
|
||||||
|
case SRE_CATEGORY_UNI_WORD:
|
||||||
|
case SRE_CATEGORY_UNI_NOT_WORD:
|
||||||
|
case SRE_CATEGORY_UNI_LINEBREAK:
|
||||||
|
case SRE_CATEGORY_UNI_NOT_LINEBREAK:
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
FAIL;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
FAIL;
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
|
||||||
|
{
|
||||||
|
/* Some variables are manipulated by the macros above */
|
||||||
|
SRE_CODE op;
|
||||||
|
SRE_CODE arg;
|
||||||
|
SRE_CODE skip;
|
||||||
|
|
||||||
|
VTRACE(("code=%p, end=%p\n", code, end));
|
||||||
|
|
||||||
|
if (code > end)
|
||||||
|
FAIL;
|
||||||
|
|
||||||
|
while (code < end) {
|
||||||
|
GET_OP;
|
||||||
|
switch (op) {
|
||||||
|
|
||||||
|
case SRE_OP_MARK:
|
||||||
|
/* We don't check whether marks are properly nested; the
|
||||||
|
sre_match() code is robust even if they don't, and the worst
|
||||||
|
you can get is nonsensical match results. */
|
||||||
|
GET_ARG;
|
||||||
|
if (arg > 2*groups+1) {
|
||||||
|
VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
|
||||||
|
FAIL;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SRE_OP_LITERAL:
|
||||||
|
case SRE_OP_NOT_LITERAL:
|
||||||
|
case SRE_OP_LITERAL_IGNORE:
|
||||||
|
case SRE_OP_NOT_LITERAL_IGNORE:
|
||||||
|
GET_ARG;
|
||||||
|
/* The arg is just a character, nothing to check */
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SRE_OP_SUCCESS:
|
||||||
|
case SRE_OP_FAILURE:
|
||||||
|
/* Nothing to check; these normally end the matching process */
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SRE_OP_AT:
|
||||||
|
GET_ARG;
|
||||||
|
switch (arg) {
|
||||||
|
case SRE_AT_BEGINNING:
|
||||||
|
case SRE_AT_BEGINNING_STRING:
|
||||||
|
case SRE_AT_BEGINNING_LINE:
|
||||||
|
case SRE_AT_END:
|
||||||
|
case SRE_AT_END_LINE:
|
||||||
|
case SRE_AT_END_STRING:
|
||||||
|
case SRE_AT_BOUNDARY:
|
||||||
|
case SRE_AT_NON_BOUNDARY:
|
||||||
|
case SRE_AT_LOC_BOUNDARY:
|
||||||
|
case SRE_AT_LOC_NON_BOUNDARY:
|
||||||
|
case SRE_AT_UNI_BOUNDARY:
|
||||||
|
case SRE_AT_UNI_NON_BOUNDARY:
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
FAIL;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SRE_OP_ANY:
|
||||||
|
case SRE_OP_ANY_ALL:
|
||||||
|
/* These have no operands */
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SRE_OP_IN:
|
||||||
|
case SRE_OP_IN_IGNORE:
|
||||||
|
GET_SKIP;
|
||||||
|
/* Stop 1 before the end; we check the FAILURE below */
|
||||||
|
if (!_validate_charset(code, code+skip-2))
|
||||||
|
FAIL;
|
||||||
|
if (code[skip-2] != SRE_OP_FAILURE)
|
||||||
|
FAIL;
|
||||||
|
code += skip-1;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SRE_OP_INFO:
|
||||||
|
{
|
||||||
|
/* A minimal info field is
|
||||||
|
<INFO> <1=skip> <2=flags> <3=min> <4=max>;
|
||||||
|
If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
|
||||||
|
more follows. */
|
||||||
|
SRE_CODE flags, min, max, i;
|
||||||
|
SRE_CODE *newcode;
|
||||||
|
GET_SKIP;
|
||||||
|
newcode = code+skip-1;
|
||||||
|
GET_ARG; flags = arg;
|
||||||
|
GET_ARG; min = arg;
|
||||||
|
GET_ARG; max = arg;
|
||||||
|
/* Check that only valid flags are present */
|
||||||
|
if ((flags & ~(SRE_INFO_PREFIX |
|
||||||
|
SRE_INFO_LITERAL |
|
||||||
|
SRE_INFO_CHARSET)) != 0)
|
||||||
|
FAIL;
|
||||||
|
/* PREFIX and CHARSET are mutually exclusive */
|
||||||
|
if ((flags & SRE_INFO_PREFIX) &&
|
||||||
|
(flags & SRE_INFO_CHARSET))
|
||||||
|
FAIL;
|
||||||
|
/* LITERAL implies PREFIX */
|
||||||
|
if ((flags & SRE_INFO_LITERAL) &&
|
||||||
|
!(flags & SRE_INFO_PREFIX))
|
||||||
|
FAIL;
|
||||||
|
/* Validate the prefix */
|
||||||
|
if (flags & SRE_INFO_PREFIX) {
|
||||||
|
SRE_CODE prefix_len, prefix_skip;
|
||||||
|
GET_ARG; prefix_len = arg;
|
||||||
|
GET_ARG; prefix_skip = arg;
|
||||||
|
/* Here comes the prefix string */
|
||||||
|
if (code+prefix_len < code || code+prefix_len > newcode)
|
||||||
|
FAIL;
|
||||||
|
code += prefix_len;
|
||||||
|
/* And here comes the overlap table */
|
||||||
|
if (code+prefix_len < code || code+prefix_len > newcode)
|
||||||
|
FAIL;
|
||||||
|
/* Each overlap value should be < prefix_len */
|
||||||
|
for (i = 0; i < prefix_len; i++) {
|
||||||
|
if (code[i] >= prefix_len)
|
||||||
|
FAIL;
|
||||||
|
}
|
||||||
|
code += prefix_len;
|
||||||
|
}
|
||||||
|
/* Validate the charset */
|
||||||
|
if (flags & SRE_INFO_CHARSET) {
|
||||||
|
if (!_validate_charset(code, newcode-1))
|
||||||
|
FAIL;
|
||||||
|
if (newcode[-1] != SRE_OP_FAILURE)
|
||||||
|
FAIL;
|
||||||
|
code = newcode;
|
||||||
|
}
|
||||||
|
else if (code != newcode) {
|
||||||
|
VTRACE(("code=%p, newcode=%p\n", code, newcode));
|
||||||
|
FAIL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SRE_OP_BRANCH:
|
||||||
|
{
|
||||||
|
SRE_CODE *target = NULL;
|
||||||
|
for (;;) {
|
||||||
|
GET_SKIP;
|
||||||
|
if (skip == 0)
|
||||||
|
break;
|
||||||
|
/* Stop 2 before the end; we check the JUMP below */
|
||||||
|
if (!_validate_inner(code, code+skip-3, groups))
|
||||||
|
FAIL;
|
||||||
|
code += skip-3;
|
||||||
|
/* Check that it ends with a JUMP, and that each JUMP
|
||||||
|
has the same target */
|
||||||
|
GET_OP;
|
||||||
|
if (op != SRE_OP_JUMP)
|
||||||
|
FAIL;
|
||||||
|
GET_SKIP;
|
||||||
|
if (target == NULL)
|
||||||
|
target = code+skip-1;
|
||||||
|
else if (code+skip-1 != target)
|
||||||
|
FAIL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SRE_OP_REPEAT_ONE:
|
||||||
|
case SRE_OP_MIN_REPEAT_ONE:
|
||||||
|
{
|
||||||
|
SRE_CODE min, max;
|
||||||
|
GET_SKIP;
|
||||||
|
GET_ARG; min = arg;
|
||||||
|
GET_ARG; max = arg;
|
||||||
|
if (min > max)
|
||||||
|
FAIL;
|
||||||
|
#ifdef Py_UNICODE_WIDE
|
||||||
|
if (max > 65535)
|
||||||
|
FAIL;
|
||||||
|
#endif
|
||||||
|
if (!_validate_inner(code, code+skip-4, groups))
|
||||||
|
FAIL;
|
||||||
|
code += skip-4;
|
||||||
|
GET_OP;
|
||||||
|
if (op != SRE_OP_SUCCESS)
|
||||||
|
FAIL;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SRE_OP_REPEAT:
|
||||||
|
{
|
||||||
|
SRE_CODE min, max;
|
||||||
|
GET_SKIP;
|
||||||
|
GET_ARG; min = arg;
|
||||||
|
GET_ARG; max = arg;
|
||||||
|
if (min > max)
|
||||||
|
FAIL;
|
||||||
|
#ifdef Py_UNICODE_WIDE
|
||||||
|
if (max > 65535)
|
||||||
|
FAIL;
|
||||||
|
#endif
|
||||||
|
if (!_validate_inner(code, code+skip-3, groups))
|
||||||
|
FAIL;
|
||||||
|
code += skip-3;
|
||||||
|
GET_OP;
|
||||||
|
if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
|
||||||
|
FAIL;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SRE_OP_GROUPREF:
|
||||||
|
case SRE_OP_GROUPREF_IGNORE:
|
||||||
|
GET_ARG;
|
||||||
|
if (arg >= groups)
|
||||||
|
FAIL;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SRE_OP_GROUPREF_EXISTS:
|
||||||
|
/* The regex syntax for this is: '(?(group)then|else)', where
|
||||||
|
'group' is either an integer group number or a group name,
|
||||||
|
'then' and 'else' are sub-regexes, and 'else' is optional. */
|
||||||
|
GET_ARG;
|
||||||
|
if (arg >= groups)
|
||||||
|
FAIL;
|
||||||
|
GET_SKIP;
|
||||||
|
code--; /* The skip is relative to the first arg! */
|
||||||
|
/* There are two possibilities here: if there is both a 'then'
|
||||||
|
part and an 'else' part, the generated code looks like:
|
||||||
|
|
||||||
|
GROUPREF_EXISTS
|
||||||
|
<group>
|
||||||
|
<skipyes>
|
||||||
|
...then part...
|
||||||
|
JUMP
|
||||||
|
<skipno>
|
||||||
|
(<skipyes> jumps here)
|
||||||
|
...else part...
|
||||||
|
(<skipno> jumps here)
|
||||||
|
|
||||||
|
If there is only a 'then' part, it looks like:
|
||||||
|
|
||||||
|
GROUPREF_EXISTS
|
||||||
|
<group>
|
||||||
|
<skip>
|
||||||
|
...then part...
|
||||||
|
(<skip> jumps here)
|
||||||
|
|
||||||
|
There is no direct way to decide which it is, and we don't want
|
||||||
|
to allow arbitrary jumps anywhere in the code; so we just look
|
||||||
|
for a JUMP opcode preceding our skip target.
|
||||||
|
*/
|
||||||
|
if (skip >= 3 && code+skip-3 >= code &&
|
||||||
|
code[skip-3] == SRE_OP_JUMP)
|
||||||
|
{
|
||||||
|
VTRACE(("both then and else parts present\n"));
|
||||||
|
if (!_validate_inner(code+1, code+skip-3, groups))
|
||||||
|
FAIL;
|
||||||
|
code += skip-2; /* Position after JUMP, at <skipno> */
|
||||||
|
GET_SKIP;
|
||||||
|
if (!_validate_inner(code, code+skip-1, groups))
|
||||||
|
FAIL;
|
||||||
|
code += skip-1;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
VTRACE(("only a then part present\n"));
|
||||||
|
if (!_validate_inner(code+1, code+skip-1, groups))
|
||||||
|
FAIL;
|
||||||
|
code += skip-1;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SRE_OP_ASSERT:
|
||||||
|
case SRE_OP_ASSERT_NOT:
|
||||||
|
GET_SKIP;
|
||||||
|
GET_ARG; /* 0 for lookahead, width for lookbehind */
|
||||||
|
code--; /* Back up over arg to simplify math below */
|
||||||
|
if (arg & 0x80000000)
|
||||||
|
FAIL; /* Width too large */
|
||||||
|
/* Stop 1 before the end; we check the SUCCESS below */
|
||||||
|
if (!_validate_inner(code+1, code+skip-2, groups))
|
||||||
|
FAIL;
|
||||||
|
code += skip-2;
|
||||||
|
GET_OP;
|
||||||
|
if (op != SRE_OP_SUCCESS)
|
||||||
|
FAIL;
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
FAIL;
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
VTRACE(("okay\n"));
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
|
||||||
|
{
|
||||||
|
if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
|
||||||
|
FAIL;
|
||||||
|
if (groups == 0) /* fix for simplejson */
|
||||||
|
groups = 100; /* 100 groups should always be safe */
|
||||||
|
return _validate_inner(code, end-1, groups);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
_validate(PatternObject *self)
|
||||||
|
{
|
||||||
|
if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
|
||||||
|
{
|
||||||
|
PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
VTRACE(("Success!\n"));
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
/* -------------------------------------------------------------------- */
|
/* -------------------------------------------------------------------- */
|
||||||
/* match methods */
|
/* match methods */
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue