From 10faf6a0a3a4909bf7e6e8158d42d1ffe2345f89 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 6 Aug 2008 19:29:14 +0000 Subject: [PATCH] Merged revisions 65544 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r65544 | guido.van.rossum | 2008-08-04 20:39:21 -0700 (Mon, 04 Aug 2008) | 28 lines Tracker issue 3487: sre "bytecode" verifier. This is a verifier for the binary code used by the _sre module (this is often called bytecode, though to distinguish it from Python bytecode I put it in quotes). I wrote this for Google App Engine, and am making the patch available as open source under the Apache 2 license. Below are the copyright statement and license, for completeness. # Copyright 2008 Google Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. It's not necessary to include this copyright and license text in the source file. Google has signed a contributor's agreement with the PSF already. ........ 
--- Lib/test/test_urllib.py | 2 +- Lib/urllib/parse.py | 141 ++++++------ Modules/_sre.c | 474 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 541 insertions(+), 76 deletions(-) diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index f5a9d5d20ee..d4630a8510a 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -465,7 +465,7 @@ class UnquotingTests(unittest.TestCase): def test_unquote_with_unicode(self): r = urllib.parse.unquote('br%C3%BCckner_sapporo_20050930.doc') - self.assertEqual(r, 'br\xc3\xbcckner_sapporo_20050930.doc') + self.assertEqual(r, 'br\u00FCckner_sapporo_20050930.doc') class urlencode_Tests(unittest.TestCase): """Tests for urlencode()""" diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index fe02db5e000..f924a3a4a1c 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -261,84 +261,74 @@ def urldefrag(url): return url, '' -_hextochr = dict(('%02x' % i, chr(i)) for i in range(256)) -_hextochr.update(('%02X' % i, chr(i)) for i in range(256)) +def unquote_as_string (s, plus=False, charset=None): + if charset is None: + charset = "UTF-8" + return str(unquote_as_bytes(s, plus=plus), charset, 'strict') + +def unquote_as_bytes (s, plus=False): + """unquote_as_bytes('abc%20def') -> b'abc def'.""" + if plus: + s = s.replace('+', ' ') + res = s.split('%') + res[0] = res[0].encode('ASCII', 'strict') + for i in range(1, len(res)): + res[i] = (bytes.fromhex(res[i][:2]) + + res[i][2:].encode('ASCII', 'strict')) + return b''.join(res) + +_always_safe = (b'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + b'abcdefghijklmnopqrstuvwxyz' + b'0123456789' + b'_.-') + +_percent_code = ord('%') + +_hextable = b'0123456789ABCDEF' + +def quote_as_bytes(s, safe = '/', plus=False): + """quote_as_bytes(b'abc@def') -> bytearray(b'abc%40def')""" + + if isinstance(s, str): + s = s.encode("UTF-8", "strict") + if not (isinstance(s, bytes) or isinstance(s, bytearray)): + raise ValueError("Argument to quote must be either bytes " + "or bytearray; string arguments will be " + 
"converted to UTF-8 bytes") + + safeset = _always_safe + safe.encode('ASCII', 'strict') + if plus: + safeset += b' ' + + result = bytearray() + for i in s: + if i not in safeset: + result.append(_percent_code) + result.append(_hextable[(i >> 4) & 0xF]) + result.append(_hextable[i & 0xF]) + else: + result.append(i) + if plus: + result = result.replace(b' ', b'+') + return result + +def quote_as_string(s, safe = '/', plus=False): + return str(quote_as_bytes(s, safe=safe, plus=plus), 'ASCII', 'strict') + +# finally, define defaults for 'quote' and 'unquote' + +def quote(s, safe='/'): + return quote_as_string(s, safe=safe) + +def quote_plus(s, safe=''): + return quote_as_string(s, safe=safe, plus=True) def unquote(s): - """unquote('abc%20def') -> 'abc def'.""" - res = s.split('%') - for i in range(1, len(res)): - item = res[i] - try: - res[i] = _hextochr[item[:2]] + item[2:] - except KeyError: - res[i] = '%' + item - except UnicodeDecodeError: - res[i] = chr(int(item[:2], 16)) + item[2:] - return "".join(res) + return unquote_as_string(s) def unquote_plus(s): - """unquote('%7e/abc+def') -> '~/abc def'""" - s = s.replace('+', ' ') - return unquote(s) + return unquote_as_string(s, plus=True) -always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' - 'abcdefghijklmnopqrstuvwxyz' - '0123456789' '_.-') -_safe_quoters= {} - -class Quoter: - def __init__(self, safe): - self.cache = {} - self.safe = safe + always_safe - - def __call__(self, c): - try: - return self.cache[c] - except KeyError: - if ord(c) < 256: - res = (c in self.safe) and c or ('%%%02X' % ord(c)) - self.cache[c] = res - return res - else: - return "".join(['%%%02X' % i for i in c.encode("utf-8")]) - -def quote(s, safe = '/'): - """quote('abc def') -> 'abc%20def' - - Each part of a URL, e.g. the path info, the query, etc., has a - different set of reserved characters that must be quoted. - - RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists - the following reserved characters. 
- - reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | - "$" | "," - - Each of these characters is reserved in some component of a URL, - but not necessarily in all of them. - - By default, the quote function is intended for quoting the path - section of a URL. Thus, it will not encode '/'. This character - is reserved, but in typical usage the quote function is being - called on a path where the existing slash characters are used as - reserved characters. - """ - cachekey = (safe, always_safe) - try: - quoter = _safe_quoters[cachekey] - except KeyError: - quoter = Quoter(safe) - _safe_quoters[cachekey] = quoter - res = map(quoter, s) - return ''.join(res) - -def quote_plus(s, safe = ''): - """Quote the query fragment of a URL; replacing ' ' with '+'""" - if ' ' in s: - s = quote(s, safe + ' ') - return s.replace(' ', '+') - return quote(s, safe) def urlencode(query,doseq=0): """Encode a sequence of two-element tuples or dictionary into a URL query string. @@ -387,7 +377,7 @@ def urlencode(query,doseq=0): # is there a reasonable way to convert to ASCII? 
# encode generates a string, but "replace" or "ignore" # lose information and "strict" can raise UnicodeError - v = quote_plus(v.encode("ASCII","replace")) + v = quote_plus(v) l.append(k + '=' + v) else: try: @@ -474,7 +464,8 @@ def splituser(host): _userprog = re.compile('^(.*)@(.*)$') match = _userprog.match(host) - if match: return map(unquote, match.group(1, 2)) + if match: + return map(unquote, match.group(1, 2)) return None, host _passwdprog = None diff --git a/Modules/_sre.c b/Modules/_sre.c index a0e974a570a..4511c1b802f 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -2637,6 +2637,8 @@ static PyTypeObject Pattern_Type = { pattern_members, /* tp_members */ }; +static int _validate(PatternObject *self); /* Forward */ + static PyObject * _compile(PyObject* self_, PyObject* args) { @@ -2695,9 +2697,481 @@ _compile(PyObject* self_, PyObject* args) self->weakreflist = NULL; + if (!_validate(self)) { + Py_DECREF(self); + return NULL; + } + return (PyObject*) self; } +/* -------------------------------------------------------------------- */ +/* Code validation */ + +/* To learn more about this code, have a look at the _compile() function in + Lib/sre_compile.py. The validation functions below check the code array + for conformance with the code patterns generated there. + + The nice thing about the generated code is that it is position-independent: + all jumps are relative jumps forward. Also, jumps don't cross each other: + the target of a later jump is always earlier than the target of an earlier + jump. IOW, this is okay: + + J---------J-------T--------T + \ \_____/ / + \______________________/ + + but this is not: + + J---------J-------T--------T + \_________\_____/ / + \____________/ + + It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4 + bytes wide (the latter if Python is compiled for "wide" unicode support). 
+*/ + +/* Defining this one enables tracing of the validator */ +#undef VVERBOSE + +/* Trace macro for the validator */ +#if defined(VVERBOSE) +#define VTRACE(v) printf v +#else +#define VTRACE(v) +#endif + +/* Report failure */ +#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0) + +/* Extract opcode, argument, or skip count from code array */ +#define GET_OP \ + do { \ + VTRACE(("%p: ", code)); \ + if (code >= end) FAIL; \ + op = *code++; \ + VTRACE(("%lu (op)\n", (unsigned long)op)); \ + } while (0) +#define GET_ARG \ + do { \ + VTRACE(("%p= ", code)); \ + if (code >= end) FAIL; \ + arg = *code++; \ + VTRACE(("%lu (arg)\n", (unsigned long)arg)); \ + } while (0) +#define GET_SKIP \ + do { \ + VTRACE(("%p= ", code)); \ + if (code >= end) FAIL; \ + skip = *code; \ + VTRACE(("%lu (skip to %p)\n", \ + (unsigned long)skip, code+skip)); \ + if (code+skip < code || code+skip > end) \ + FAIL; \ + code++; \ + } while (0) + +static int +_validate_charset(SRE_CODE *code, SRE_CODE *end) +{ + /* Some variables are manipulated by the macros above */ + SRE_CODE op; + SRE_CODE arg; + SRE_CODE offset; + int i; + + while (code < end) { + GET_OP; + switch (op) { + + case SRE_OP_NEGATE: + break; + + case SRE_OP_LITERAL: + GET_ARG; + break; + + case SRE_OP_RANGE: + GET_ARG; + GET_ARG; + break; + + case SRE_OP_CHARSET: + offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */ + if (code+offset < code || code+offset > end) + FAIL; + code += offset; + break; + + case SRE_OP_BIGCHARSET: + GET_ARG; /* Number of blocks */ + offset = 256/sizeof(SRE_CODE); /* 256-byte table */ + if (code+offset < code || code+offset > end) + FAIL; + /* Make sure that each byte points to a valid block */ + for (i = 0; i < 256; i++) { + if (((unsigned char *)code)[i] >= arg) + FAIL; + } + code += offset; + offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */ + if (code+offset < code || code+offset > end) + FAIL; + code += offset; + break; + + case SRE_OP_CATEGORY: + GET_ARG; + 
switch (arg) { + case SRE_CATEGORY_DIGIT: + case SRE_CATEGORY_NOT_DIGIT: + case SRE_CATEGORY_SPACE: + case SRE_CATEGORY_NOT_SPACE: + case SRE_CATEGORY_WORD: + case SRE_CATEGORY_NOT_WORD: + case SRE_CATEGORY_LINEBREAK: + case SRE_CATEGORY_NOT_LINEBREAK: + case SRE_CATEGORY_LOC_WORD: + case SRE_CATEGORY_LOC_NOT_WORD: + case SRE_CATEGORY_UNI_DIGIT: + case SRE_CATEGORY_UNI_NOT_DIGIT: + case SRE_CATEGORY_UNI_SPACE: + case SRE_CATEGORY_UNI_NOT_SPACE: + case SRE_CATEGORY_UNI_WORD: + case SRE_CATEGORY_UNI_NOT_WORD: + case SRE_CATEGORY_UNI_LINEBREAK: + case SRE_CATEGORY_UNI_NOT_LINEBREAK: + break; + default: + FAIL; + } + break; + + default: + FAIL; + + } + } + + return 1; +} + +static int +_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) +{ + /* Some variables are manipulated by the macros above */ + SRE_CODE op; + SRE_CODE arg; + SRE_CODE skip; + + VTRACE(("code=%p, end=%p\n", code, end)); + + if (code > end) + FAIL; + + while (code < end) { + GET_OP; + switch (op) { + + case SRE_OP_MARK: + /* We don't check whether marks are properly nested; the + sre_match() code is robust even if they aren't, and the worst + you can get is nonsensical match results. 
*/ + GET_ARG; + if (arg > 2*groups+1) { + VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups)); + FAIL; + } + break; + + case SRE_OP_LITERAL: + case SRE_OP_NOT_LITERAL: + case SRE_OP_LITERAL_IGNORE: + case SRE_OP_NOT_LITERAL_IGNORE: + GET_ARG; + /* The arg is just a character, nothing to check */ + break; + + case SRE_OP_SUCCESS: + case SRE_OP_FAILURE: + /* Nothing to check; these normally end the matching process */ + break; + + case SRE_OP_AT: + GET_ARG; + switch (arg) { + case SRE_AT_BEGINNING: + case SRE_AT_BEGINNING_STRING: + case SRE_AT_BEGINNING_LINE: + case SRE_AT_END: + case SRE_AT_END_LINE: + case SRE_AT_END_STRING: + case SRE_AT_BOUNDARY: + case SRE_AT_NON_BOUNDARY: + case SRE_AT_LOC_BOUNDARY: + case SRE_AT_LOC_NON_BOUNDARY: + case SRE_AT_UNI_BOUNDARY: + case SRE_AT_UNI_NON_BOUNDARY: + break; + default: + FAIL; + } + break; + + case SRE_OP_ANY: + case SRE_OP_ANY_ALL: + /* These have no operands */ + break; + + case SRE_OP_IN: + case SRE_OP_IN_IGNORE: + GET_SKIP; + /* Stop 1 before the end; we check the FAILURE below */ + if (!_validate_charset(code, code+skip-2)) + FAIL; + if (code[skip-2] != SRE_OP_FAILURE) + FAIL; + code += skip-1; + break; + + case SRE_OP_INFO: + { + /* A minimal info field is + <INFO> <1=skip> <2=flags> <3=min> <4=max>; + If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags, + more follows. 
*/ + SRE_CODE flags, min, max, i; + SRE_CODE *newcode; + GET_SKIP; + newcode = code+skip-1; + GET_ARG; flags = arg; + GET_ARG; min = arg; + GET_ARG; max = arg; + /* Check that only valid flags are present */ + if ((flags & ~(SRE_INFO_PREFIX | + SRE_INFO_LITERAL | + SRE_INFO_CHARSET)) != 0) + FAIL; + /* PREFIX and CHARSET are mutually exclusive */ + if ((flags & SRE_INFO_PREFIX) && + (flags & SRE_INFO_CHARSET)) + FAIL; + /* LITERAL implies PREFIX */ + if ((flags & SRE_INFO_LITERAL) && + !(flags & SRE_INFO_PREFIX)) + FAIL; + /* Validate the prefix */ + if (flags & SRE_INFO_PREFIX) { + SRE_CODE prefix_len, prefix_skip; + GET_ARG; prefix_len = arg; + GET_ARG; prefix_skip = arg; + /* Here comes the prefix string */ + if (code+prefix_len < code || code+prefix_len > newcode) + FAIL; + code += prefix_len; + /* And here comes the overlap table */ + if (code+prefix_len < code || code+prefix_len > newcode) + FAIL; + /* Each overlap value should be < prefix_len */ + for (i = 0; i < prefix_len; i++) { + if (code[i] >= prefix_len) + FAIL; + } + code += prefix_len; + } + /* Validate the charset */ + if (flags & SRE_INFO_CHARSET) { + if (!_validate_charset(code, newcode-1)) + FAIL; + if (newcode[-1] != SRE_OP_FAILURE) + FAIL; + code = newcode; + } + else if (code != newcode) { + VTRACE(("code=%p, newcode=%p\n", code, newcode)); + FAIL; + } + } + break; + + case SRE_OP_BRANCH: + { + SRE_CODE *target = NULL; + for (;;) { + GET_SKIP; + if (skip == 0) + break; + /* Stop 2 before the end; we check the JUMP below */ + if (!_validate_inner(code, code+skip-3, groups)) + FAIL; + code += skip-3; + /* Check that it ends with a JUMP, and that each JUMP + has the same target */ + GET_OP; + if (op != SRE_OP_JUMP) + FAIL; + GET_SKIP; + if (target == NULL) + target = code+skip-1; + else if (code+skip-1 != target) + FAIL; + } + } + break; + + case SRE_OP_REPEAT_ONE: + case SRE_OP_MIN_REPEAT_ONE: + { + SRE_CODE min, max; + GET_SKIP; + GET_ARG; min = arg; + GET_ARG; max = arg; + if (min > max) + 
FAIL; +#ifdef Py_UNICODE_WIDE + if (max > 65535) + FAIL; +#endif + if (!_validate_inner(code, code+skip-4, groups)) + FAIL; + code += skip-4; + GET_OP; + if (op != SRE_OP_SUCCESS) + FAIL; + } + break; + + case SRE_OP_REPEAT: + { + SRE_CODE min, max; + GET_SKIP; + GET_ARG; min = arg; + GET_ARG; max = arg; + if (min > max) + FAIL; +#ifdef Py_UNICODE_WIDE + if (max > 65535) + FAIL; +#endif + if (!_validate_inner(code, code+skip-3, groups)) + FAIL; + code += skip-3; + GET_OP; + if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL) + FAIL; + } + break; + + case SRE_OP_GROUPREF: + case SRE_OP_GROUPREF_IGNORE: + GET_ARG; + if (arg >= groups) + FAIL; + break; + + case SRE_OP_GROUPREF_EXISTS: + /* The regex syntax for this is: '(?(group)then|else)', where + 'group' is either an integer group number or a group name, + 'then' and 'else' are sub-regexes, and 'else' is optional. */ + GET_ARG; + if (arg >= groups) + FAIL; + GET_SKIP; + code--; /* The skip is relative to the first arg! */ + /* There are two possibilities here: if there is both a 'then' + part and an 'else' part, the generated code looks like: + + GROUPREF_EXISTS + <group> + <skipyes> + ...then part... + JUMP + <skipno> + (<skipyes> jumps here) + ...else part... + (<skipno> jumps here) + + If there is only a 'then' part, it looks like: + + GROUPREF_EXISTS + <group> + <skipyes> + ...then part... + (<skipyes> jumps here) + + There is no direct way to decide which it is, and we don't want + to allow arbitrary jumps anywhere in the code; so we just look + for a JUMP opcode preceding our skip target. 
+ */ + if (skip >= 3 && code+skip-3 >= code && + code[skip-3] == SRE_OP_JUMP) + { + VTRACE(("both then and else parts present\n")); + if (!_validate_inner(code+1, code+skip-3, groups)) + FAIL; + code += skip-2; /* Position after JUMP, at <skipno> */ + GET_SKIP; + if (!_validate_inner(code, code+skip-1, groups)) + FAIL; + code += skip-1; + } + else { + VTRACE(("only a then part present\n")); + if (!_validate_inner(code+1, code+skip-1, groups)) + FAIL; + code += skip-1; + } + break; + + case SRE_OP_ASSERT: + case SRE_OP_ASSERT_NOT: + GET_SKIP; + GET_ARG; /* 0 for lookahead, width for lookbehind */ + code--; /* Back up over arg to simplify math below */ + if (arg & 0x80000000) + FAIL; /* Width too large */ + /* Stop 1 before the end; we check the SUCCESS below */ + if (!_validate_inner(code+1, code+skip-2, groups)) + FAIL; + code += skip-2; + GET_OP; + if (op != SRE_OP_SUCCESS) + FAIL; + break; + + default: + FAIL; + + } + } + + VTRACE(("okay\n")); + return 1; +} + +static int +_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) +{ + if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS) + FAIL; + if (groups == 0) /* fix for simplejson */ + groups = 100; /* 100 groups should always be safe */ + return _validate_inner(code, end-1, groups); +} + +static int +_validate(PatternObject *self) +{ + if (!_validate_outer(self->code, self->code+self->codesize, self->groups)) + { + PyErr_SetString(PyExc_RuntimeError, "invalid SRE code"); + return 0; + } + else + VTRACE(("Success!\n")); + return 1; +} + /* -------------------------------------------------------------------- */ /* match methods */