From 5418ee0b9a36886064937159f9c0641ae2c4f618 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 15 Nov 2011 01:42:21 +0100 Subject: [PATCH] Issue #13333: The UTF-7 decoder now accepts lone surrogates (the encoder already accepts them). --- Lib/test/test_unicode.py | 14 +++++++++++--- Misc/NEWS | 3 +++ Objects/unicodeobject.c | 14 +++++--------- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 86185e9db6e..591a297756b 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -1091,10 +1091,18 @@ class UnicodeTest(string_tests.CommonTest, for (x, y) in utfTests: self.assertEqual(x.encode('utf-7'), y) - # Unpaired surrogates not supported - self.assertRaises(UnicodeError, str, b'+3ADYAA-', 'utf-7') + # Unpaired surrogates are passed through + self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-') + self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x') + self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-') + self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x') + self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801') + self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x') + self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01') + self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x') - self.assertEqual(str(b'+3ADYAA-', 'utf-7', 'replace'), '\ufffd\ufffd') + self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-') + self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde') # Issue #2242: crash on some Windows/MSVC versions self.assertEqual(b'+\xc1'.decode('utf-7'), '\xc1') diff --git a/Misc/NEWS b/Misc/NEWS index ca8d4cb7bb5..4fb9ff6305a 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,9 @@ What's New in Python 3.2.3? Core and Builtins ----------------- +- Issue #13333: The UTF-7 decoder now accepts lone surrogates (the encoder + already accepts them). + - Issue #13342: input() used to ignore sys.stdin's and sys.stdout's unicode error handler in interactive mode (when calling into PyOS_Readline()). diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 7316abfc9c3..8680726275e 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2282,21 +2282,17 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, *p++ = outCh; #endif surrogate = 0; + continue; } else { + *p++ = surrogate; surrogate = 0; - errmsg = "second surrogate missing"; - goto utf7Error; } } - else if (outCh >= 0xD800 && outCh <= 0xDBFF) { + if (outCh >= 0xD800 && outCh <= 0xDBFF) { /* first surrogate */ surrogate = outCh; } - else if (outCh >= 0xDC00 && outCh <= 0xDFFF) { - errmsg = "unexpected second surrogate"; - goto utf7Error; - } else { *p++ = outCh; } @@ -2306,8 +2302,8 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, inShift = 0; s++; if (surrogate) { - errmsg = "second surrogate missing at end of shift sequence"; - goto utf7Error; + *p++ = surrogate; + surrogate = 0; } if (base64bits > 0) { /* left-over bits */ if (base64bits >= 6) {