Issue #24848: Fixed bugs in UTF-7 decoding of malformed data:

1. Non-ASCII bytes were accepted after a shift sequence.
2. A low surrogate could be emitted in case of an error in the high surrogate.
This commit is contained in:
Serhiy Storchaka 2015-10-02 13:14:53 +03:00
parent a87633e596
commit e12f632186
4 changed files with 71 additions and 7 deletions

View File

@ -642,6 +642,32 @@ class UTF8Test(ReadTest):
class UTF7Test(ReadTest): class UTF7Test(ReadTest):
encoding = "utf-7" encoding = "utf-7"
def test_ascii(self):
# Exercise the codec named by self.encoding over the 7-bit ASCII range,
# one character class at a time.
# Set D (directly encoded characters): must pass through unchanged in
# both directions.
set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
'abcdefghijklmnopqrstuvwxyz'
'0123456789'
'\'(),-./:?')
self.assertEqual(set_d.encode(self.encoding), set_d)
self.assertEqual(set_d.decode(self.encoding), set_d)
# Set O (optional direct characters): also expected to pass through
# unchanged in both directions.
set_o = ' !"#$%&*;<=>@[]^_`{|}'
self.assertEqual(set_o.encode(self.encoding), set_o)
self.assertEqual(set_o.decode(self.encoding), set_o)
# A literal '+' is written as '+-' when encoding and restored to '+'
# when decoding.
self.assertEqual(u'a+b'.encode(self.encoding), 'a+-b')
self.assertEqual('a+-b'.decode(self.encoding), u'a+b')
# White spaces: unchanged in both directions.
ws = ' \t\n\r'
self.assertEqual(ws.encode(self.encoding), ws)
self.assertEqual(ws.decode(self.encoding), ws)
# Other ASCII characters: every code point below 0x80 that is not in
# set D, set O, '+' or the whitespace set, taken in sorted order.
other_ascii = ''.join(sorted(set(chr(i) for i in range(0x80)) -
set(set_d + set_o + '+' + ws)))
# Only the encoding direction is checked here; the expected output is a
# single '+...-' shift sequence covering all of these characters.
self.assertEqual(other_ascii.encode(self.encoding),
'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')
def test_partial(self): def test_partial(self):
self.check_partial( self.check_partial(
u"a+-b", u"a+-b",
@ -656,7 +682,9 @@ class UTF7Test(ReadTest):
def test_errors(self): def test_errors(self):
tests = [ tests = [
('\xffb', u'\ufffdb'),
('a\xffb', u'a\ufffdb'), ('a\xffb', u'a\ufffdb'),
('a\xff\xffb', u'a\ufffd\ufffdb'),
('a+IK', u'a\ufffd'), ('a+IK', u'a\ufffd'),
('a+IK-b', u'a\ufffdb'), ('a+IK-b', u'a\ufffdb'),
('a+IK,b', u'a\ufffdb'), ('a+IK,b', u'a\ufffdb'),
@ -672,6 +700,8 @@ class UTF7Test(ReadTest):
('a+//,+IKw-b', u'a\ufffd\u20acb'), ('a+//,+IKw-b', u'a\ufffd\u20acb'),
('a+///,+IKw-b', u'a\uffff\ufffd\u20acb'), ('a+///,+IKw-b', u'a\uffff\ufffd\u20acb'),
('a+////,+IKw-b', u'a\uffff\ufffd\u20acb'), ('a+////,+IKw-b', u'a\uffff\ufffd\u20acb'),
('a+IKw-b\xff', u'a\u20acb\ufffd'),
('a+IKw\xffb', u'a\u20ac\ufffdb'),
] ]
for raw, expected in tests: for raw, expected in tests:
self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode, self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
@ -682,6 +712,35 @@ class UTF7Test(ReadTest):
self.assertEqual(u'\U000104A0'.encode(self.encoding), '+2AHcoA-') self.assertEqual(u'\U000104A0'.encode(self.encoding), '+2AHcoA-')
self.assertEqual(u'\ud801\udca0'.encode(self.encoding), '+2AHcoA-') self.assertEqual(u'\ud801\udca0'.encode(self.encoding), '+2AHcoA-')
self.assertEqual('+2AHcoA-'.decode(self.encoding), u'\U000104A0') self.assertEqual('+2AHcoA-'.decode(self.encoding), u'\U000104A0')
self.assertEqual('+2AHcoA'.decode(self.encoding), u'\U000104A0')
self.assertEqual(u'\u20ac\U000104A0'.encode(self.encoding), '+IKzYAdyg-')
self.assertEqual('+IKzYAdyg-'.decode(self.encoding), u'\u20ac\U000104A0')
self.assertEqual('+IKzYAdyg'.decode(self.encoding), u'\u20ac\U000104A0')
self.assertEqual(u'\u20ac\u20ac\U000104A0'.encode(self.encoding),
'+IKwgrNgB3KA-')
self.assertEqual('+IKwgrNgB3KA-'.decode(self.encoding),
u'\u20ac\u20ac\U000104A0')
self.assertEqual('+IKwgrNgB3KA'.decode(self.encoding),
u'\u20ac\u20ac\U000104A0')
def test_lone_surrogates(self):
# Each entry is (raw UTF-7 byte string, expected unicode result when
# decoded with the 'replace' error handler).
# Expected values containing u'\ud801' keep the lone surrogate as-is;
# every other entry expects U+FFFD (the replacement character) in its
# place.
tests = [
# Shift sequence holding only a surrogate.
('a+2AE-b', u'a\ud801b'),
('a+2AE\xffb', u'a\ufffdb'),
('a+2AE', u'a\ufffd'),
('a+2AEA-b', u'a\ufffdb'),
('a+2AH-b', u'a\ufffdb'),
# Same patterns preceded by one U+20AC in the same shift sequence.
('a+IKzYAQ-b', u'a\u20ac\ud801b'),
('a+IKzYAQ\xffb', u'a\u20ac\ufffdb'),
('a+IKzYAQA-b', u'a\u20ac\ufffdb'),
('a+IKzYAd-b', u'a\u20ac\ufffdb'),
# Same patterns preceded by two U+20AC in the same shift sequence.
('a+IKwgrNgB-b', u'a\u20ac\u20ac\ud801b'),
('a+IKwgrNgB\xffb', u'a\u20ac\u20ac\ufffdb'),
('a+IKwgrNgB', u'a\u20ac\u20ac\ufffd'),
('a+IKwgrNgBA-b', u'a\u20ac\u20ac\ufffdb'),
]
for raw, expected in tests:
# The codec name is hard-coded as 'utf-7' here rather than read from
# self.encoding.
self.assertEqual(raw.decode('utf-7', 'replace'), expected)
class UTF16ExTest(unittest.TestCase): class UTF16ExTest(unittest.TestCase):

View File

@ -1036,6 +1036,7 @@ class UnicodeTest(
self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii','strict') self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii','strict')
self.assertEqual(unicode('Andr\202 x','ascii','ignore'), u"Andr x") self.assertEqual(unicode('Andr\202 x','ascii','ignore'), u"Andr x")
self.assertEqual(unicode('Andr\202 x','ascii','replace'), u'Andr\uFFFD x') self.assertEqual(unicode('Andr\202 x','ascii','replace'), u'Andr\uFFFD x')
self.assertEqual(unicode('\202 x', 'ascii', 'replace'), u'\uFFFD x')
self.assertEqual(u'abcde'.decode('ascii', 'ignore'), self.assertEqual(u'abcde'.decode('ascii', 'ignore'),
u'abcde'.decode('ascii', errors='ignore')) u'abcde'.decode('ascii', errors='ignore'))
self.assertEqual(u'abcde'.decode('ascii', 'replace'), self.assertEqual(u'abcde'.decode('ascii', 'replace'),

View File

@ -10,6 +10,8 @@ What's New in Python 2.7.11?
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #24848: Fixed a number of bugs in UTF-7 decoding of malformed data.
- Issue #25003: os.urandom() doesn't use getentropy() on Solaris because - Issue #25003: os.urandom() doesn't use getentropy() on Solaris because
getentropy() is blocking, whereas os.urandom() should not block. getentropy() getentropy() is blocking, whereas os.urandom() should not block. getentropy()
is supported since Solaris 11.3. is supported since Solaris 11.3.

View File

@ -1716,29 +1716,29 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
} }
else { /* now leaving a base-64 section */ else { /* now leaving a base-64 section */
inShift = 0; inShift = 0;
s++;
if (surrogate) {
*p++ = surrogate;
surrogate = 0;
}
if (base64bits > 0) { /* left-over bits */ if (base64bits > 0) { /* left-over bits */
if (base64bits >= 6) { if (base64bits >= 6) {
/* We've seen at least one base-64 character */ /* We've seen at least one base-64 character */
s++;
errmsg = "partial character in shift sequence"; errmsg = "partial character in shift sequence";
goto utf7Error; goto utf7Error;
} }
else { else {
/* Some bits remain; they should be zero */ /* Some bits remain; they should be zero */
if (base64buffer != 0) { if (base64buffer != 0) {
s++;
errmsg = "non-zero padding bits in shift sequence"; errmsg = "non-zero padding bits in shift sequence";
goto utf7Error; goto utf7Error;
} }
} }
} }
if (ch != '-') { if (surrogate && DECODE_DIRECT(ch))
*p++ = surrogate;
surrogate = 0;
if (ch == '-') {
/* '-' is absorbed; other terminating /* '-' is absorbed; other terminating
characters are preserved */ characters are preserved */
*p++ = ch; s++;
} }
} }
} }
@ -1751,6 +1751,7 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
} }
else { /* begin base64-encoded section */ else { /* begin base64-encoded section */
inShift = 1; inShift = 1;
surrogate = 0;
shiftOutStart = p; shiftOutStart = p;
base64bits = 0; base64bits = 0;
base64buffer = 0; base64buffer = 0;
@ -1782,6 +1783,7 @@ utf7Error:
if (inShift && !consumed) { /* in shift sequence, no more to follow */ if (inShift && !consumed) { /* in shift sequence, no more to follow */
/* if we're in an inconsistent state, that's an error */ /* if we're in an inconsistent state, that's an error */
inShift = 0;
if (surrogate || if (surrogate ||
(base64bits >= 6) || (base64bits >= 6) ||
(base64bits > 0 && base64buffer != 0)) { (base64bits > 0 && base64buffer != 0)) {