Issue #24848: Fixed bugs in UTF-7 decoding of malformed data:

1. Non-ASCII bytes were accepted after a shift sequence.
2. A low surrogate could be emitted in case of an error in the high surrogate.
This commit is contained in:
Serhiy Storchaka 2015-10-02 13:07:28 +03:00
parent 223349cfb8
commit 28b21e50c8
4 changed files with 75 additions and 11 deletions

View File

@ -898,6 +898,32 @@ class CP65001Test(ReadTest, unittest.TestCase):
class UTF7Test(ReadTest, unittest.TestCase): class UTF7Test(ReadTest, unittest.TestCase):
encoding = "utf-7" encoding = "utf-7"
def test_ascii(self):
    """Check how the codec under test treats every ASCII character.

    Directly encoded characters (Set D), optional direct characters
    (Set O) and white space must map to the identical ASCII bytes in
    both directions; '+' must be written as '+-'; every other ASCII
    character must be encoded inside a single '+...-' shift sequence.
    """
    codec = self.encoding

    def check_unchanged(text):
        # *text* must encode to its plain ASCII bytes, and those bytes
        # must decode back to *text*.
        self.assertEqual(text.encode(codec), text.encode('ascii'))
        self.assertEqual(text.encode('ascii').decode(codec), text)

    # Set D (directly encoded characters)
    direct_chars = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                    'abcdefghijklmnopqrstuvwxyz'
                    '0123456789'
                    '\'(),-./:?')
    check_unchanged(direct_chars)

    # Set O (optional direct characters)
    optional_chars = ' !"#$%&*;<=>@[]^_`{|}'
    check_unchanged(optional_chars)

    # The shift character itself is escaped as '+-'.
    self.assertEqual('a+b'.encode(codec), b'a+-b')
    self.assertEqual(b'a+-b'.decode(codec), 'a+b')

    # White spaces
    blanks = ' \t\n\r'
    check_unchanged(blanks)

    # Whatever ASCII is left over after removing the groups above.
    already_checked = set(direct_chars + optional_chars + '+' + blanks)
    every_ascii_char = set(bytes(range(0x80)).decode())
    remaining = ''.join(sorted(every_ascii_char - already_checked))
    expected_shifted = (
        b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
        b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')
    self.assertEqual(remaining.encode(codec), expected_shifted)
def test_partial(self): def test_partial(self):
self.check_partial( self.check_partial(
'a+-b\x00c\x80d\u0100e\U00010000f', 'a+-b\x00c\x80d\u0100e\U00010000f',
@ -939,7 +965,9 @@ class UTF7Test(ReadTest, unittest.TestCase):
def test_errors(self): def test_errors(self):
tests = [ tests = [
(b'\xffb', '\ufffdb'),
(b'a\xffb', 'a\ufffdb'), (b'a\xffb', 'a\ufffdb'),
(b'a\xff\xffb', 'a\ufffd\ufffdb'),
(b'a+IK', 'a\ufffd'), (b'a+IK', 'a\ufffd'),
(b'a+IK-b', 'a\ufffdb'), (b'a+IK-b', 'a\ufffdb'),
(b'a+IK,b', 'a\ufffdb'), (b'a+IK,b', 'a\ufffdb'),
@ -955,6 +983,8 @@ class UTF7Test(ReadTest, unittest.TestCase):
(b'a+//,+IKw-b', 'a\ufffd\u20acb'), (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
(b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'), (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
(b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'), (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
(b'a+IKw-b\xff', 'a\u20acb\ufffd'),
(b'a+IKw\xffb', 'a\u20ac\ufffdb'),
] ]
for raw, expected in tests: for raw, expected in tests:
with self.subTest(raw=raw): with self.subTest(raw=raw):
@ -966,8 +996,36 @@ class UTF7Test(ReadTest, unittest.TestCase):
self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-') self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')
self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-') self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0') self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0')
self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-')
self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0')
self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0')
self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding),
b'+IKwgrNgB3KA-')
self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding),
'\u20ac\u20ac\U000104A0')
self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding),
'\u20ac\u20ac\U000104A0')
test_lone_surrogates = None def test_lone_surrogates(self):
# NOTE(review): the line above appears to fuse the removed left-column
# statement `test_lone_surrogates = None` with the added right-column `def`
# header (side-by-side diff rendering) -- confirm against the real file.
#
# Decodes UTF-7 input whose shift sequence contains the unpaired high
# surrogate U+D801, using the 'replace' error handler, and compares the
# result with the expected text.  Each entry is (raw bytes, expected str).
tests = [
# Group 1: the shift sequence holds only the surrogate.
# Base64 '2AE' is the 16 bits of U+D801 followed by two zero padding bits.
# Closed by '-': the lone surrogate is kept in the output.
(b'a+2AE-b', 'a\ud801b'),
# Followed by the invalid byte 0xFF: a single U+FFFD, no surrogate.
(b'a+2AE\xffb', 'a\ufffdb'),
# Input ends inside the shift sequence: U+FFFD instead of the surrogate.
(b'a+2AE', 'a\ufffd'),
# One extra base64 character leaves a partial code unit: U+FFFD.
(b'a+2AEA-b', 'a\ufffdb'),
# 'H' instead of 'E' makes the two padding bits non-zero: U+FFFD.
(b'a+2AH-b', 'a\ufffdb'),
# Group 2: U+20AC precedes the surrogate ('IKzYAQ' = U+20AC, U+D801 and
# four zero padding bits).  The euro sign is always kept in the output.
(b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
(b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
(b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
# 'd' instead of 'Q' makes the padding bits non-zero.
(b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
# Group 3: two U+20AC precede the surrogate ('IKwgrNgB' is exactly three
# 16-bit units, so there are no padding bits to corrupt).
(b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
(b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
(b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
(b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
]
# One subTest per input, so a failure reports the offending raw bytes.
for raw, expected in tests:
with self.subTest(raw=raw):
# The codec name is spelled out here instead of using self.encoding.
self.assertEqual(raw.decode('utf-7', 'replace'), expected)
class UTF16ExTest(unittest.TestCase): class UTF16ExTest(unittest.TestCase):

View File

@ -1524,7 +1524,7 @@ class UnicodeTest(string_tests.CommonTest,
self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde') self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
# Issue #2242: crash on some Windows/MSVC versions # Issue #2242: crash on some Windows/MSVC versions
self.assertEqual(b'+\xc1'.decode('utf-7'), '\xc1') self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '')
# Direct encoded characters # Direct encoded characters
set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?" set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
@ -1966,6 +1966,7 @@ class UnicodeTest(string_tests.CommonTest,
self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict') self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict')
self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x") self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x")
self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x') self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x')
self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x')
# Error handling (unknown character names) # Error handling (unknown character names)
self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx") self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")

View File

@ -10,6 +10,8 @@ Release date: tba
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #24848: Fixed a number of bugs in UTF-7 decoding of malformed data.
- Issue #25280: Import trace messages emitted in verbose (-v) mode are no - Issue #25280: Import trace messages emitted in verbose (-v) mode are no
longer formatted twice. longer formatted twice.

View File

@ -4381,31 +4381,31 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
} }
else { /* now leaving a base-64 section */ else { /* now leaving a base-64 section */
inShift = 0; inShift = 0;
s++;
if (surrogate) {
if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
goto onError;
surrogate = 0;
}
if (base64bits > 0) { /* left-over bits */ if (base64bits > 0) { /* left-over bits */
if (base64bits >= 6) { if (base64bits >= 6) {
/* We've seen at least one base-64 character */ /* We've seen at least one base-64 character */
s++;
errmsg = "partial character in shift sequence"; errmsg = "partial character in shift sequence";
goto utf7Error; goto utf7Error;
} }
else { else {
/* Some bits remain; they should be zero */ /* Some bits remain; they should be zero */
if (base64buffer != 0) { if (base64buffer != 0) {
s++;
errmsg = "non-zero padding bits in shift sequence"; errmsg = "non-zero padding bits in shift sequence";
goto utf7Error; goto utf7Error;
} }
} }
} }
if (ch != '-') { if (surrogate && DECODE_DIRECT(ch)) {
if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
goto onError;
}
surrogate = 0;
if (ch == '-') {
/* '-' is absorbed; other terminating /* '-' is absorbed; other terminating
characters are preserved */ characters are preserved */
if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) s++;
goto onError;
} }
} }
} }
@ -4419,6 +4419,7 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
} }
else { /* begin base64-encoded section */ else { /* begin base64-encoded section */
inShift = 1; inShift = 1;
surrogate = 0;
shiftOutStart = writer.pos; shiftOutStart = writer.pos;
base64bits = 0; base64bits = 0;
base64buffer = 0; base64buffer = 0;
@ -4450,6 +4451,7 @@ utf7Error:
if (inShift && !consumed) { /* in shift sequence, no more to follow */ if (inShift && !consumed) { /* in shift sequence, no more to follow */
/* if we're in an inconsistent state, that's an error */ /* if we're in an inconsistent state, that's an error */
inShift = 0;
if (surrogate || if (surrogate ||
(base64bits >= 6) || (base64bits >= 6) ||
(base64bits > 0 && base64buffer != 0)) { (base64bits > 0 && base64buffer != 0)) {
@ -13337,6 +13339,7 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
if (maxchar > writer->maxchar || writer->readonly) { if (maxchar > writer->maxchar || writer->readonly) {
/* resize + widen */ /* resize + widen */
maxchar = Py_MAX(maxchar, writer->maxchar);
newbuffer = PyUnicode_New(newlen, maxchar); newbuffer = PyUnicode_New(newlen, maxchar);
if (newbuffer == NULL) if (newbuffer == NULL)
return -1; return -1;