From c8e58126a2acfb5d3c6bbbaf326e69785830bedb Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka
Date: Tue, 29 Jan 2013 10:20:34 +0200
Subject: [PATCH] Issue #16979: Fix error handling bugs in the unicode-escape-decode decoder.

---
 Lib/test/test_codeccallbacks.py |  4 +-
 Lib/test/test_codecs.py         | 84 +++++++++++++++++++++++++++++++++
 Misc/NEWS                       |  2 +
 Objects/unicodeobject.c         | 79 +++++++++++--------------------
 4 files changed, 116 insertions(+), 53 deletions(-)

diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
index 150a0d2a809..61c2df20c43 100644
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -262,12 +262,12 @@ class CodecCallbackTest(unittest.TestCase):
 
         self.assertEqual(
             "\\u3042\u3xxx".decode("unicode-escape", "test.handler1"),
-            u"\u3042[<92><117><51><120>]xx"
+            u"\u3042[<92><117><51>]xxx"
         )
 
         self.assertEqual(
             "\\u3042\u3xx".decode("unicode-escape", "test.handler1"),
-            u"\u3042[<92><117><51><120><120>]"
+            u"\u3042[<92><117><51>]xx"
         )
 
         self.assertEqual(
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index eb96471f2df..2359917769c 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -4,6 +4,11 @@ import codecs
 import locale
 import sys, StringIO, _testcapi
 
+def coding_checker(self, coder):
+    def check(input, expect):
+        self.assertEqual(coder(input), (expect, len(input)))
+    return check
+
 class Queue(object):
     """
     queue: write bytes at one end, read bytes from the other end
@@ -1786,6 +1791,84 @@ class WithStmtTest(unittest.TestCase):
             self.assertEqual(srw.read(), u"\xfc")
 
 
+class UnicodeEscapeTest(unittest.TestCase):
+    def test_empty(self):
+        self.assertEqual(codecs.unicode_escape_encode(u""), ("", 0))
+        self.assertEqual(codecs.unicode_escape_decode(""), (u"", 0))
+
+    def test_raw_encode(self):
+        encode = codecs.unicode_escape_encode
+        for b in range(32, 127):
+            if b != ord('\\'):
+                self.assertEqual(encode(unichr(b)), (chr(b), 1))
+
+    def test_raw_decode(self):
+        decode = codecs.unicode_escape_decode
+        for b in range(256):
+            if b != ord('\\'):
+                self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))
+
+    def test_escape_encode(self):
+        encode = codecs.unicode_escape_encode
+        check = coding_checker(self, encode)
+        check(u'\t', r'\t')
+        check(u'\n', r'\n')
+        check(u'\r', r'\r')
+        check(u'\\', r'\\')
+        for b in range(32):
+            if chr(b) not in '\t\n\r':
+                check(unichr(b), '\\x%02x' % b)
+        for b in range(127, 256):
+            check(unichr(b), '\\x%02x' % b)
+        check(u'\u20ac', r'\u20ac')
+        check(u'\U0001d120', r'\U0001d120')
+
+    def test_escape_decode(self):
+        decode = codecs.unicode_escape_decode
+        check = coding_checker(self, decode)
+        check("[\\\n]", u"[]")
+        check(r'[\"]', u'["]')
+        check(r"[\']", u"[']")
+        check(r"[\\]", ur"[\]")
+        check(r"[\a]", u"[\x07]")
+        check(r"[\b]", u"[\x08]")
+        check(r"[\t]", u"[\x09]")
+        check(r"[\n]", u"[\x0a]")
+        check(r"[\v]", u"[\x0b]")
+        check(r"[\f]", u"[\x0c]")
+        check(r"[\r]", u"[\x0d]")
+        check(r"[\7]", u"[\x07]")
+        check(r"[\8]", ur"[\8]")
+        check(r"[\78]", u"[\x078]")
+        check(r"[\41]", u"[!]")
+        check(r"[\418]", u"[!8]")
+        check(r"[\101]", u"[A]")
+        check(r"[\1010]", u"[A0]")
+        check(r"[\x41]", u"[A]")
+        check(r"[\x410]", u"[A0]")
+        check(r"\u20ac", u"\u20ac")
+        check(r"\U0001d120", u"\U0001d120")
+        for b in range(256):
+            if chr(b) not in '\n"\'\\abtnvfr01234567xuUN':
+                check('\\' + chr(b), u'\\' + unichr(b))
+
+    def test_decode_errors(self):
+        decode = codecs.unicode_escape_decode
+        for c, d in ('x', 2), ('u', 4), ('U', 4):
+            for i in range(d):
+                self.assertRaises(UnicodeDecodeError, decode,
+                                  "\\" + c + "0"*i)
+                self.assertRaises(UnicodeDecodeError, decode,
+                                  "[\\" + c + "0"*i + "]")
+                data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
+                self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
+                self.assertEqual(decode(data, "replace"),
+                                 (u"[\ufffd]\ufffd", len(data)))
+        self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
+        self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
+        self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))
+
+
 class BomTest(unittest.TestCase):
     def test_seek0(self):
         data = u"1234567890"
@@ -1871,6 +1954,7 @@ def test_main():
         BasicStrTest,
         CharmapTest,
         WithStmtTest,
+        UnicodeEscapeTest,
         BomTest,
     )
 
diff --git a/Misc/NEWS b/Misc/NEWS
index 31c4a8a6c82..c66cffbec12 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -200,6 +200,8 @@ Core and Builtins
 Library
 -------
 
+- Issue #16979: Fix error handling bugs in the unicode-escape-decode decoder.
+
 - Issue #17051: Fix a memory leak in os.path.isdir() on Windows. Patch by
   Robert Xiao.
 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index d44a298557e..5fbd24d5fc8 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2738,7 +2738,6 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
     Py_ssize_t startinpos;
     Py_ssize_t endinpos;
     Py_ssize_t outpos;
-    int i;
     PyUnicodeObject *v;
     Py_UNICODE *p;
     const char *end;
@@ -2824,29 +2823,19 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
             message = "truncated \\UXXXXXXXX escape";
         hexescape:
             chr = 0;
-            outpos = p-PyUnicode_AS_UNICODE(v);
-            if (s+digits>end) {
-                endinpos = size;
-                if (unicode_decode_call_errorhandler(
-                        errors, &errorHandler,
-                        "unicodeescape", "end of string in escape sequence",
-                        starts, size, &startinpos, &endinpos, &exc, &s,
-                        &v, &outpos, &p))
-                    goto onError;
-                goto nextByte;
-            }
-            for (i = 0; i < digits; ++i) {
-                c = (unsigned char) s[i];
-                if (!isxdigit(c)) {
-                    endinpos = (s+i+1)-starts;
-                    if (unicode_decode_call_errorhandler(
-                            errors, &errorHandler,
-                            "unicodeescape", message,
-                            starts, size, &startinpos, &endinpos, &exc, &s,
-                            &v, &outpos, &p))
-                        goto onError;
-                    goto nextByte;
+            if (end - s < digits) {
+                /* count only hex digits */
+                for (; s < end; ++s) {
+                    c = (unsigned char)*s;
+                    if (!Py_ISXDIGIT(c))
+                        goto error;
                 }
+                goto error;
+            }
+            for (; digits--; ++s) {
+                c = (unsigned char)*s;
+                if (!Py_ISXDIGIT(c))
+                    goto error;
                 chr = (chr<<4) & ~0xF;
                 if (c >= '0' && c <= '9')
                     chr += c - '0';
@@ -2855,7 +2844,6 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
                 else
                     chr += 10 + c - 'A';
             }
-            s += i;
             if (chr == 0xffffffff && PyErr_Occurred())
                 /* _decoding_error will have already written into the
                    target buffer. */
@@ -2876,14 +2864,8 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
 #endif
             } else {
-                endinpos = s-starts;
-                outpos = p-PyUnicode_AS_UNICODE(v);
-                if (unicode_decode_call_errorhandler(
-                        errors, &errorHandler,
-                        "unicodeescape", "illegal Unicode character",
-                        starts, size, &startinpos, &endinpos, &exc, &s,
-                        &v, &outpos, &p))
-                    goto onError;
+                message = "illegal Unicode character";
+                goto error;
             }
             break;
 
@@ -2910,28 +2892,13 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
                         goto store;
                 }
             }
-            endinpos = s-starts;
-            outpos = p-PyUnicode_AS_UNICODE(v);
-            if (unicode_decode_call_errorhandler(
-                    errors, &errorHandler,
-                    "unicodeescape", message,
-                    starts, size, &startinpos, &endinpos, &exc, &s,
-                    &v, &outpos, &p))
-                goto onError;
-            break;
+            goto error;
 
         default:
             if (s > end) {
                 message = "\\ at end of string";
                 s--;
-                endinpos = s-starts;
-                outpos = p-PyUnicode_AS_UNICODE(v);
-                if (unicode_decode_call_errorhandler(
-                        errors, &errorHandler,
-                        "unicodeescape", message,
-                        starts, size, &startinpos, &endinpos, &exc, &s,
-                        &v, &outpos, &p))
-                    goto onError;
+                goto error;
             }
             else {
                 *p++ = '\\';
@@ -2939,8 +2906,18 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
             }
             break;
         }
-      nextByte:
-        ;
+        continue;
+
+      error:
+        endinpos = s-starts;
+        outpos = p-PyUnicode_AS_UNICODE(v);
+        if (unicode_decode_call_errorhandler(
+                errors, &errorHandler,
+                "unicodeescape", message,
+                starts, size, &startinpos, &endinpos, &exc, &s,
+                &v, &outpos, &p))
+            goto onError;
+        continue;
     }
     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
         goto onError;