From 35804e4c63ae0a61adb71ced8ea6ddcf68908d41 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 19 Oct 2013 20:38:19 +0300 Subject: [PATCH] Issue #19279: UTF-7 decoder no longer produces illegal strings. --- Lib/test/test_codecs.py | 30 ++++++++++++++++++++++++++++++ Misc/NEWS | 2 ++ Objects/unicodeobject.c | 2 ++ 3 files changed, 34 insertions(+) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index c68088ed751..80ec541abe9 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -820,6 +820,36 @@ class UTF7Test(ReadTest, unittest.TestCase): ] ) + def test_errors(self): + tests = [ + (b'a\xffb', 'a\ufffdb'), + (b'a+IK', 'a\ufffd'), + (b'a+IK-b', 'a\ufffdb'), + (b'a+IK,b', 'a\ufffdb'), + (b'a+IKx', 'a\u20ac\ufffd'), + (b'a+IKx-b', 'a\u20ac\ufffdb'), + (b'a+IKwgr', 'a\u20ac\ufffd'), + (b'a+IKwgr-b', 'a\u20ac\ufffdb'), + (b'a+IKwgr,', 'a\u20ac\ufffd'), + (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'), + (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'), + (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'), + (b'a+/,+IKw-b', 'a\ufffd\u20acb'), + (b'a+//,+IKw-b', 'a\ufffd\u20acb'), + (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'), + (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'), + ] + for raw, expected in tests: + with self.subTest(raw=raw): + self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode, + raw, 'strict', True) + self.assertEqual(raw.decode('utf-7', 'replace'), expected) + + def test_nonbmp(self): + self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-') + self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-') + self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0') + class UTF16ExTest(unittest.TestCase): def test_errors(self): diff --git a/Misc/NEWS b/Misc/NEWS index 629cf81165a..c2f584ffc02 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -12,6 +12,8 @@ What's New in Python 3.3.3 release candidate 1? Core and Builtins ----------------- +- Issue #19279: UTF-7 decoder no longer produces illegal strings. 
+ - Fix macro expansion of _PyErr_OCCURRED(), and make sure to use it in at least one place so as to avoid regressions. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 440d35ad0cb..a149177a09f 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -4359,6 +4359,7 @@ PyUnicode_DecodeUTF7Stateful(const char *s, Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16)); base64bits -= 16; base64buffer &= (1 << base64bits) - 1; /* clear high bits */ + assert(outCh <= 0xffff); if (surrogate) { /* expecting a second surrogate */ if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) { @@ -4426,6 +4427,7 @@ PyUnicode_DecodeUTF7Stateful(const char *s, inShift = 1; shiftOutStart = outpos; base64bits = 0; + base64buffer = 0; } } else if (DECODE_DIRECT(ch)) { /* character decodes as itself */