Issue #24848: Fixed bugs in UTF-7 decoding of malformed data:
1. Non-ASCII bytes were accepted after a shift sequence. 2. A low surrogate could be emitted in case of an error in the high surrogate.
This commit is contained in:
parent
a87633e596
commit
e12f632186
|
@ -642,6 +642,32 @@ class UTF8Test(ReadTest):
|
||||||
class UTF7Test(ReadTest):
|
class UTF7Test(ReadTest):
|
||||||
encoding = "utf-7"
|
encoding = "utf-7"
|
||||||
|
|
||||||
|
def test_ascii(self):
|
||||||
|
# Set D (directly encoded characters)
|
||||||
|
set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
||||||
|
'abcdefghijklmnopqrstuvwxyz'
|
||||||
|
'0123456789'
|
||||||
|
'\'(),-./:?')
|
||||||
|
self.assertEqual(set_d.encode(self.encoding), set_d)
|
||||||
|
self.assertEqual(set_d.decode(self.encoding), set_d)
|
||||||
|
# Set O (optional direct characters)
|
||||||
|
set_o = ' !"#$%&*;<=>@[]^_`{|}'
|
||||||
|
self.assertEqual(set_o.encode(self.encoding), set_o)
|
||||||
|
self.assertEqual(set_o.decode(self.encoding), set_o)
|
||||||
|
# +
|
||||||
|
self.assertEqual(u'a+b'.encode(self.encoding), 'a+-b')
|
||||||
|
self.assertEqual('a+-b'.decode(self.encoding), u'a+b')
|
||||||
|
# White spaces
|
||||||
|
ws = ' \t\n\r'
|
||||||
|
self.assertEqual(ws.encode(self.encoding), ws)
|
||||||
|
self.assertEqual(ws.decode(self.encoding), ws)
|
||||||
|
# Other ASCII characters
|
||||||
|
other_ascii = ''.join(sorted(set(chr(i) for i in range(0x80)) -
|
||||||
|
set(set_d + set_o + '+' + ws)))
|
||||||
|
self.assertEqual(other_ascii.encode(self.encoding),
|
||||||
|
'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
|
||||||
|
'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')
|
||||||
|
|
||||||
def test_partial(self):
|
def test_partial(self):
|
||||||
self.check_partial(
|
self.check_partial(
|
||||||
u"a+-b",
|
u"a+-b",
|
||||||
|
@ -656,7 +682,9 @@ class UTF7Test(ReadTest):
|
||||||
|
|
||||||
def test_errors(self):
|
def test_errors(self):
|
||||||
tests = [
|
tests = [
|
||||||
|
('\xffb', u'\ufffdb'),
|
||||||
('a\xffb', u'a\ufffdb'),
|
('a\xffb', u'a\ufffdb'),
|
||||||
|
('a\xff\xffb', u'a\ufffd\ufffdb'),
|
||||||
('a+IK', u'a\ufffd'),
|
('a+IK', u'a\ufffd'),
|
||||||
('a+IK-b', u'a\ufffdb'),
|
('a+IK-b', u'a\ufffdb'),
|
||||||
('a+IK,b', u'a\ufffdb'),
|
('a+IK,b', u'a\ufffdb'),
|
||||||
|
@ -672,6 +700,8 @@ class UTF7Test(ReadTest):
|
||||||
('a+//,+IKw-b', u'a\ufffd\u20acb'),
|
('a+//,+IKw-b', u'a\ufffd\u20acb'),
|
||||||
('a+///,+IKw-b', u'a\uffff\ufffd\u20acb'),
|
('a+///,+IKw-b', u'a\uffff\ufffd\u20acb'),
|
||||||
('a+////,+IKw-b', u'a\uffff\ufffd\u20acb'),
|
('a+////,+IKw-b', u'a\uffff\ufffd\u20acb'),
|
||||||
|
('a+IKw-b\xff', u'a\u20acb\ufffd'),
|
||||||
|
('a+IKw\xffb', u'a\u20ac\ufffdb'),
|
||||||
]
|
]
|
||||||
for raw, expected in tests:
|
for raw, expected in tests:
|
||||||
self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
|
self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
|
||||||
|
@ -682,6 +712,35 @@ class UTF7Test(ReadTest):
|
||||||
self.assertEqual(u'\U000104A0'.encode(self.encoding), '+2AHcoA-')
|
self.assertEqual(u'\U000104A0'.encode(self.encoding), '+2AHcoA-')
|
||||||
self.assertEqual(u'\ud801\udca0'.encode(self.encoding), '+2AHcoA-')
|
self.assertEqual(u'\ud801\udca0'.encode(self.encoding), '+2AHcoA-')
|
||||||
self.assertEqual('+2AHcoA-'.decode(self.encoding), u'\U000104A0')
|
self.assertEqual('+2AHcoA-'.decode(self.encoding), u'\U000104A0')
|
||||||
|
self.assertEqual('+2AHcoA'.decode(self.encoding), u'\U000104A0')
|
||||||
|
self.assertEqual(u'\u20ac\U000104A0'.encode(self.encoding), '+IKzYAdyg-')
|
||||||
|
self.assertEqual('+IKzYAdyg-'.decode(self.encoding), u'\u20ac\U000104A0')
|
||||||
|
self.assertEqual('+IKzYAdyg'.decode(self.encoding), u'\u20ac\U000104A0')
|
||||||
|
self.assertEqual(u'\u20ac\u20ac\U000104A0'.encode(self.encoding),
|
||||||
|
'+IKwgrNgB3KA-')
|
||||||
|
self.assertEqual('+IKwgrNgB3KA-'.decode(self.encoding),
|
||||||
|
u'\u20ac\u20ac\U000104A0')
|
||||||
|
self.assertEqual('+IKwgrNgB3KA'.decode(self.encoding),
|
||||||
|
u'\u20ac\u20ac\U000104A0')
|
||||||
|
|
||||||
|
def test_lone_surrogates(self):
|
||||||
|
tests = [
|
||||||
|
('a+2AE-b', u'a\ud801b'),
|
||||||
|
('a+2AE\xffb', u'a\ufffdb'),
|
||||||
|
('a+2AE', u'a\ufffd'),
|
||||||
|
('a+2AEA-b', u'a\ufffdb'),
|
||||||
|
('a+2AH-b', u'a\ufffdb'),
|
||||||
|
('a+IKzYAQ-b', u'a\u20ac\ud801b'),
|
||||||
|
('a+IKzYAQ\xffb', u'a\u20ac\ufffdb'),
|
||||||
|
('a+IKzYAQA-b', u'a\u20ac\ufffdb'),
|
||||||
|
('a+IKzYAd-b', u'a\u20ac\ufffdb'),
|
||||||
|
('a+IKwgrNgB-b', u'a\u20ac\u20ac\ud801b'),
|
||||||
|
('a+IKwgrNgB\xffb', u'a\u20ac\u20ac\ufffdb'),
|
||||||
|
('a+IKwgrNgB', u'a\u20ac\u20ac\ufffd'),
|
||||||
|
('a+IKwgrNgBA-b', u'a\u20ac\u20ac\ufffdb'),
|
||||||
|
]
|
||||||
|
for raw, expected in tests:
|
||||||
|
self.assertEqual(raw.decode('utf-7', 'replace'), expected)
|
||||||
|
|
||||||
class UTF16ExTest(unittest.TestCase):
|
class UTF16ExTest(unittest.TestCase):
|
||||||
|
|
||||||
|
|
|
@ -1036,6 +1036,7 @@ class UnicodeTest(
|
||||||
self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii','strict')
|
self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii','strict')
|
||||||
self.assertEqual(unicode('Andr\202 x','ascii','ignore'), u"Andr x")
|
self.assertEqual(unicode('Andr\202 x','ascii','ignore'), u"Andr x")
|
||||||
self.assertEqual(unicode('Andr\202 x','ascii','replace'), u'Andr\uFFFD x')
|
self.assertEqual(unicode('Andr\202 x','ascii','replace'), u'Andr\uFFFD x')
|
||||||
|
self.assertEqual(unicode('\202 x', 'ascii', 'replace'), u'\uFFFD x')
|
||||||
self.assertEqual(u'abcde'.decode('ascii', 'ignore'),
|
self.assertEqual(u'abcde'.decode('ascii', 'ignore'),
|
||||||
u'abcde'.decode('ascii', errors='ignore'))
|
u'abcde'.decode('ascii', errors='ignore'))
|
||||||
self.assertEqual(u'abcde'.decode('ascii', 'replace'),
|
self.assertEqual(u'abcde'.decode('ascii', 'replace'),
|
||||||
|
|
|
@ -10,6 +10,8 @@ What's New in Python 2.7.11?
|
||||||
Core and Builtins
|
Core and Builtins
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
|
- Issue #24848: Fixed a number of bugs in UTF-7 decoding of malformed data.
|
||||||
|
|
||||||
- Issue #25003: os.urandom() doesn't use getentropy() on Solaris because
|
- Issue #25003: os.urandom() doesn't use getentropy() on Solaris because
|
||||||
getentropy() is blocking, whereas os.urandom() should not block. getentropy()
|
getentropy() is blocking, whereas os.urandom() should not block. getentropy()
|
||||||
is supported since Solaris 11.3.
|
is supported since Solaris 11.3.
|
||||||
|
|
|
@ -1716,29 +1716,29 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
|
||||||
}
|
}
|
||||||
else { /* now leaving a base-64 section */
|
else { /* now leaving a base-64 section */
|
||||||
inShift = 0;
|
inShift = 0;
|
||||||
s++;
|
|
||||||
if (surrogate) {
|
|
||||||
*p++ = surrogate;
|
|
||||||
surrogate = 0;
|
|
||||||
}
|
|
||||||
if (base64bits > 0) { /* left-over bits */
|
if (base64bits > 0) { /* left-over bits */
|
||||||
if (base64bits >= 6) {
|
if (base64bits >= 6) {
|
||||||
/* We've seen at least one base-64 character */
|
/* We've seen at least one base-64 character */
|
||||||
|
s++;
|
||||||
errmsg = "partial character in shift sequence";
|
errmsg = "partial character in shift sequence";
|
||||||
goto utf7Error;
|
goto utf7Error;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
/* Some bits remain; they should be zero */
|
/* Some bits remain; they should be zero */
|
||||||
if (base64buffer != 0) {
|
if (base64buffer != 0) {
|
||||||
|
s++;
|
||||||
errmsg = "non-zero padding bits in shift sequence";
|
errmsg = "non-zero padding bits in shift sequence";
|
||||||
goto utf7Error;
|
goto utf7Error;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (ch != '-') {
|
if (surrogate && DECODE_DIRECT(ch))
|
||||||
|
*p++ = surrogate;
|
||||||
|
surrogate = 0;
|
||||||
|
if (ch == '-') {
|
||||||
/* '-' is absorbed; other terminating
|
/* '-' is absorbed; other terminating
|
||||||
characters are preserved */
|
characters are preserved */
|
||||||
*p++ = ch;
|
s++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1751,6 +1751,7 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
|
||||||
}
|
}
|
||||||
else { /* begin base64-encoded section */
|
else { /* begin base64-encoded section */
|
||||||
inShift = 1;
|
inShift = 1;
|
||||||
|
surrogate = 0;
|
||||||
shiftOutStart = p;
|
shiftOutStart = p;
|
||||||
base64bits = 0;
|
base64bits = 0;
|
||||||
base64buffer = 0;
|
base64buffer = 0;
|
||||||
|
@ -1782,6 +1783,7 @@ utf7Error:
|
||||||
|
|
||||||
if (inShift && !consumed) { /* in shift sequence, no more to follow */
|
if (inShift && !consumed) { /* in shift sequence, no more to follow */
|
||||||
/* if we're in an inconsistent state, that's an error */
|
/* if we're in an inconsistent state, that's an error */
|
||||||
|
inShift = 0;
|
||||||
if (surrogate ||
|
if (surrogate ||
|
||||||
(base64bits >= 6) ||
|
(base64bits >= 6) ||
|
||||||
(base64bits > 0 && base64buffer != 0)) {
|
(base64bits > 0 && base64buffer != 0)) {
|
||||||
|
|
Loading…
Reference in New Issue