Issue #4426: The UTF-7 decoder was too strict and didn't accept some legal sequences.
Patch by Nick Barnes and Victor Stinner.
This commit is contained in:
parent
2827709d6d
commit
653dece278
|
@ -740,10 +740,8 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
|
||||||
PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
|
PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
|
||||||
const Py_UNICODE *data, /* Unicode char buffer */
|
const Py_UNICODE *data, /* Unicode char buffer */
|
||||||
Py_ssize_t length, /* number of Py_UNICODE chars to encode */
|
Py_ssize_t length, /* number of Py_UNICODE chars to encode */
|
||||||
int encodeSetO, /* force the encoder to encode characters in
|
int base64SetO, /* Encode RFC2152 Set O characters in base64 */
|
||||||
Set O, as described in RFC2152 */
|
int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
|
||||||
int encodeWhiteSpace, /* force the encoder to encode space, tab,
|
|
||||||
carriage return and linefeed characters */
|
|
||||||
const char *errors /* error handling */
|
const char *errors /* error handling */
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
|
@ -521,19 +521,28 @@ class UnicodeTest(
|
||||||
(u'+?', '+-?'),
|
(u'+?', '+-?'),
|
||||||
(ur'\\?', '+AFwAXA?'),
|
(ur'\\?', '+AFwAXA?'),
|
||||||
(ur'\\\?', '+AFwAXABc?'),
|
(ur'\\\?', '+AFwAXABc?'),
|
||||||
(ur'++--', '+-+---')
|
(ur'++--', '+-+---'),
|
||||||
|
(u'\U000abcde', '+2m/c3g-'), # surrogate pairs
|
||||||
|
(u'/', '/'),
|
||||||
]
|
]
|
||||||
|
|
||||||
for (x, y) in utfTests:
|
for (x, y) in utfTests:
|
||||||
self.assertEqual(x.encode('utf-7'), y)
|
self.assertEqual(x.encode('utf-7'), y)
|
||||||
|
|
||||||
# surrogates not supported
|
# Unpaired surrogates not supported
|
||||||
self.assertRaises(UnicodeError, unicode, '+3ADYAA-', 'utf-7')
|
self.assertRaises(UnicodeError, unicode, '+3ADYAA-', 'utf-7')
|
||||||
|
|
||||||
self.assertEqual(unicode('+3ADYAA-', 'utf-7', 'replace'), u'\ufffd')
|
self.assertEqual(unicode('+3ADYAA-', 'utf-7', 'replace'), u'\ufffd\ufffd')
|
||||||
|
|
||||||
# Issue #2242: crash on some Windows/MSVC versions
|
# Directly encoded characters
|
||||||
self.assertRaises(UnicodeDecodeError, '+\xc1'.decode, 'utf-7')
|
set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
|
||||||
|
# Optional direct characters
|
||||||
|
set_o = '!"#$%&*;<=>@[]^_`{|}'
|
||||||
|
for c in set_d:
|
||||||
|
self.assertEqual(c.encode('utf7'), c.encode('ascii'))
|
||||||
|
self.assertEqual(c.encode('ascii').decode('utf7'), c)
|
||||||
|
for c in set_o:
|
||||||
|
self.assertEqual(c.encode('ascii').decode('utf7'), c)
|
||||||
|
|
||||||
def test_codecs_utf8(self):
|
def test_codecs_utf8(self):
|
||||||
self.assertEqual(u''.encode('utf-8'), '')
|
self.assertEqual(u''.encode('utf-8'), '')
|
||||||
|
|
|
@ -12,6 +12,9 @@ What's New in Python 2.7 alpha 1
|
||||||
Core and Builtins
|
Core and Builtins
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
|
- Issue #4426: The UTF-7 decoder was too strict and didn't accept some legal
|
||||||
|
sequences. Patch by Nick Barnes and Victor Stinner.
|
||||||
|
|
||||||
- Issue #1588: Add complex.__format__. For example,
|
- Issue #1588: Add complex.__format__. For example,
|
||||||
format(complex(1, 2./3), '.5') now produces a sensible result.
|
format(complex(1, 2./3), '.5') now produces a sensible result.
|
||||||
|
|
||||||
|
|
|
@ -1468,69 +1468,81 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler
|
||||||
|
|
||||||
/* --- UTF-7 Codec -------------------------------------------------------- */
|
/* --- UTF-7 Codec -------------------------------------------------------- */
|
||||||
|
|
||||||
/* see RFC2152 for details */
|
/* See RFC2152 for details. We encode conservatively and decode liberally. */
|
||||||
|
|
||||||
|
/* Three simple macros defining base-64. */
|
||||||
|
|
||||||
|
/* Is c a base-64 character? */
|
||||||
|
|
||||||
|
#define IS_BASE64(c) \
|
||||||
|
(isalnum(c) || (c) == '+' || (c) == '/')
|
||||||
|
|
||||||
|
/* Given that c is a base-64 character, what is its base-64 value? */
|
||||||
|
|
||||||
|
#define FROM_BASE64(c) \
|
||||||
|
(((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
|
||||||
|
((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
|
||||||
|
((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
|
||||||
|
(c) == '+' ? 62 : 63)
|
||||||
|
|
||||||
|
/* What is the base-64 character of the bottom 6 bits of n? */
|
||||||
|
|
||||||
|
#define TO_BASE64(n) \
|
||||||
|
("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
|
||||||
|
|
||||||
|
/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
|
||||||
|
* decoded as itself. We are permissive on decoding; the only ASCII
|
||||||
|
* byte not decoding to itself is the + which begins a base64
|
||||||
|
* string. */
|
||||||
|
|
||||||
|
#define DECODE_DIRECT(c) \
|
||||||
|
((c) <= 127 && (c) != '+')
|
||||||
|
|
||||||
|
/* The UTF-7 encoder treats ASCII characters differently according to
|
||||||
|
* whether they are Set D, Set O, Whitespace, or special (i.e. none of
|
||||||
|
* the above). See RFC2152. This array identifies these different
|
||||||
|
* sets:
|
||||||
|
* 0 : "Set D"
|
||||||
|
* alphanumeric and '(),-./:?
|
||||||
|
* 1 : "Set O"
|
||||||
|
* !"#$%&*;<=>@[]^_`{|}
|
||||||
|
* 2 : "whitespace"
|
||||||
|
* ht nl cr sp
|
||||||
|
* 3 : special (must be base64 encoded)
|
||||||
|
* everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
|
||||||
|
*/
|
||||||
|
|
||||||
static
|
static
|
||||||
char utf7_special[128] = {
|
char utf7_category[128] = {
|
||||||
/* indicate whether a UTF-7 character is special i.e. cannot be directly
|
/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
|
||||||
encoded:
|
3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
|
||||||
0 - not special
|
/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
|
||||||
1 - special
|
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
||||||
2 - whitespace (optional)
|
/* sp ! " # $ % & ' ( ) * + , - . / */
|
||||||
3 - RFC2152 Set O (optional) */
|
2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
|
||||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
|
/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
|
||||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
|
||||||
2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
|
/* @ A B C D E F G H I J K L M N O */
|
||||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
|
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
/* P Q R S T U V W X Y Z [ \ ] ^ _ */
|
||||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
|
||||||
3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
/* ` a b c d e f g h i j k l m n o */
|
||||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
|
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
/* p q r s t u v w x y z { | } ~ del */
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
|
/* ENCODE_DIRECT: this character should be encoded as itself. The
|
||||||
warnings about the comparison always being false; since
|
* answer depends on whether we are encoding set O as itself, and also
|
||||||
utf7_special[0] is 1, we can safely make that one comparison
|
* on whether we are encoding whitespace as itself. RFC2152 makes it
|
||||||
true */
|
* clear that the answers to these questions vary between
|
||||||
|
* applications, so this code needs to be flexible. */
|
||||||
|
|
||||||
#define SPECIAL(c, encodeO, encodeWS) \
|
#define ENCODE_DIRECT(c, directO, directWS) \
|
||||||
((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
|
((c) < 128 && (c) > 0 && \
|
||||||
(encodeWS && (utf7_special[(c)] == 2)) || \
|
((utf7_category[(c)] == 0) || \
|
||||||
(encodeO && (utf7_special[(c)] == 3)))
|
(directWS && (utf7_category[(c)] == 2)) || \
|
||||||
|
(directO && (utf7_category[(c)] == 1))))
|
||||||
#define B64(n) \
|
|
||||||
("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
|
|
||||||
#define B64CHAR(c) \
|
|
||||||
(isalnum(c) || (c) == '+' || (c) == '/')
|
|
||||||
#define UB64(c) \
|
|
||||||
((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
|
|
||||||
(c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
|
|
||||||
|
|
||||||
#define ENCODE(out, ch, bits) \
|
|
||||||
while (bits >= 6) { \
|
|
||||||
*out++ = B64(ch >> (bits-6)); \
|
|
||||||
bits -= 6; \
|
|
||||||
}
|
|
||||||
|
|
||||||
#define DECODE(out, ch, bits, surrogate) \
|
|
||||||
while (bits >= 16) { \
|
|
||||||
Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
|
|
||||||
bits -= 16; \
|
|
||||||
if (surrogate) { \
|
|
||||||
/* We have already generated an error for the high surrogate \
|
|
||||||
so let's not bother seeing if the low surrogate is correct or not */ \
|
|
||||||
surrogate = 0; \
|
|
||||||
} else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
|
|
||||||
/* This is a surrogate pair. Unfortunately we can't represent \
|
|
||||||
it in a 16-bit character */ \
|
|
||||||
surrogate = 1; \
|
|
||||||
errmsg = "code pairs are not supported"; \
|
|
||||||
goto utf7Error; \
|
|
||||||
} else { \
|
|
||||||
*out++ = outCh; \
|
|
||||||
} \
|
|
||||||
}
|
|
||||||
|
|
||||||
PyObject *PyUnicode_DecodeUTF7(const char *s,
|
PyObject *PyUnicode_DecodeUTF7(const char *s,
|
||||||
Py_ssize_t size,
|
Py_ssize_t size,
|
||||||
|
@ -1539,6 +1551,13 @@ PyObject *PyUnicode_DecodeUTF7(const char *s,
|
||||||
return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
|
return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* The decoder. The only state we preserve is our read position,
|
||||||
|
* i.e. how many characters we have consumed. So if we end in the
|
||||||
|
* middle of a shift sequence we have to back off the read position
|
||||||
|
* and the output to the beginning of the sequence, otherwise we lose
|
||||||
|
* all the shift state (seen bits, number of bits seen, high
|
||||||
|
* surrogate). */
|
||||||
|
|
||||||
PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
|
PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
|
||||||
Py_ssize_t size,
|
Py_ssize_t size,
|
||||||
const char *errors,
|
const char *errors,
|
||||||
|
@ -1553,9 +1572,10 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
|
||||||
Py_UNICODE *p;
|
Py_UNICODE *p;
|
||||||
const char *errmsg = "";
|
const char *errmsg = "";
|
||||||
int inShift = 0;
|
int inShift = 0;
|
||||||
unsigned int bitsleft = 0;
|
Py_UNICODE *shiftOutStart;
|
||||||
unsigned long charsleft = 0;
|
unsigned int base64bits = 0;
|
||||||
int surrogate = 0;
|
unsigned long base64buffer = 0;
|
||||||
|
Py_UNICODE surrogate = 0;
|
||||||
PyObject *errorHandler = NULL;
|
PyObject *errorHandler = NULL;
|
||||||
PyObject *exc = NULL;
|
PyObject *exc = NULL;
|
||||||
|
|
||||||
|
@ -1569,79 +1589,107 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
|
||||||
}
|
}
|
||||||
|
|
||||||
p = unicode->str;
|
p = unicode->str;
|
||||||
|
shiftOutStart = p;
|
||||||
e = s + size;
|
e = s + size;
|
||||||
|
|
||||||
while (s < e) {
|
while (s < e) {
|
||||||
Py_UNICODE ch;
|
Py_UNICODE ch = (unsigned char) *s;
|
||||||
restart:
|
|
||||||
ch = (unsigned char) *s;
|
|
||||||
|
|
||||||
if (inShift) {
|
if (inShift) { /* in a base-64 section */
|
||||||
if ((ch == '-') || !B64CHAR(ch)) {
|
if (IS_BASE64(ch)) { /* consume a base-64 character */
|
||||||
|
base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
|
||||||
|
base64bits += 6;
|
||||||
|
s++;
|
||||||
|
if (base64bits >= 16) {
|
||||||
|
/* we have enough bits for a UTF-16 value */
|
||||||
|
Py_UNICODE outCh = (Py_UNICODE)
|
||||||
|
(base64buffer >> (base64bits-16));
|
||||||
|
base64bits -= 16;
|
||||||
|
base64buffer &= (1 << base64bits) - 1; /* clear high bits */
|
||||||
|
if (surrogate) {
|
||||||
|
/* expecting a second surrogate */
|
||||||
|
if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
|
||||||
|
#ifdef Py_UNICODE_WIDE
|
||||||
|
*p++ = (((surrogate & 0x3FF)<<10)
|
||||||
|
| (outCh & 0x3FF)) + 0x10000;
|
||||||
|
#else
|
||||||
|
*p++ = surrogate;
|
||||||
|
*p++ = outCh;
|
||||||
|
#endif
|
||||||
|
surrogate = 0;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
surrogate = 0;
|
||||||
|
errmsg = "second surrogate missing";
|
||||||
|
goto utf7Error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
|
||||||
|
/* first surrogate */
|
||||||
|
surrogate = outCh;
|
||||||
|
}
|
||||||
|
else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
|
||||||
|
errmsg = "unexpected second surrogate";
|
||||||
|
goto utf7Error;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
*p++ = outCh;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else { /* now leaving a base-64 section */
|
||||||
inShift = 0;
|
inShift = 0;
|
||||||
s++;
|
s++;
|
||||||
|
if (surrogate) {
|
||||||
/* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
|
errmsg = "second surrogate missing at end of shift sequence";
|
||||||
if (bitsleft >= 6) {
|
|
||||||
/* The shift sequence has a partial character in it. If
|
|
||||||
bitsleft < 6 then we could just classify it as padding
|
|
||||||
but that is not the case here */
|
|
||||||
|
|
||||||
errmsg = "partial character in shift sequence";
|
|
||||||
goto utf7Error;
|
goto utf7Error;
|
||||||
}
|
}
|
||||||
/* According to RFC2152 the remaining bits should be zero. We
|
if (base64bits > 0) { /* left-over bits */
|
||||||
choose to signal an error/insert a replacement character
|
if (base64bits >= 6) {
|
||||||
here to indicate the potential of a misencoded character. */
|
/* We've seen at least one base-64 character */
|
||||||
|
errmsg = "partial character in shift sequence";
|
||||||
/* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
|
goto utf7Error;
|
||||||
if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
|
|
||||||
errmsg = "non-zero padding bits in shift sequence";
|
|
||||||
goto utf7Error;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ch == '-') {
|
|
||||||
if ((s < e) && (*(s) == '-')) {
|
|
||||||
*p++ = '-';
|
|
||||||
inShift = 1;
|
|
||||||
}
|
}
|
||||||
} else if (SPECIAL(ch,0,0)) {
|
else {
|
||||||
errmsg = "unexpected special character";
|
/* Some bits remain; they should be zero */
|
||||||
goto utf7Error;
|
if (base64buffer != 0) {
|
||||||
} else {
|
errmsg = "non-zero padding bits in shift sequence";
|
||||||
|
goto utf7Error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (ch != '-') {
|
||||||
|
/* '-' is absorbed; other terminating
|
||||||
|
characters are preserved */
|
||||||
*p++ = ch;
|
*p++ = ch;
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
charsleft = (charsleft << 6) | UB64(ch);
|
|
||||||
bitsleft += 6;
|
|
||||||
s++;
|
|
||||||
/* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if ( ch == '+' ) {
|
else if ( ch == '+' ) {
|
||||||
startinpos = s-starts;
|
startinpos = s-starts;
|
||||||
s++;
|
s++; /* consume '+' */
|
||||||
if (s < e && *s == '-') {
|
if (s < e && *s == '-') { /* '+-' encodes '+' */
|
||||||
s++;
|
s++;
|
||||||
*p++ = '+';
|
*p++ = '+';
|
||||||
} else
|
}
|
||||||
{
|
else { /* begin base64-encoded section */
|
||||||
inShift = 1;
|
inShift = 1;
|
||||||
bitsleft = 0;
|
shiftOutStart = p;
|
||||||
|
base64bits = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (SPECIAL(ch,0,0)) {
|
else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
|
||||||
startinpos = s-starts;
|
|
||||||
errmsg = "unexpected special character";
|
|
||||||
s++;
|
|
||||||
goto utf7Error;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
*p++ = ch;
|
*p++ = ch;
|
||||||
s++;
|
s++;
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
startinpos = s-starts;
|
||||||
|
s++;
|
||||||
|
errmsg = "unexpected special character";
|
||||||
|
goto utf7Error;
|
||||||
|
}
|
||||||
continue;
|
continue;
|
||||||
utf7Error:
|
utf7Error:
|
||||||
outpos = p-PyUnicode_AS_UNICODE(unicode);
|
outpos = p-PyUnicode_AS_UNICODE(unicode);
|
||||||
endinpos = s-starts;
|
endinpos = s-starts;
|
||||||
if (unicode_decode_call_errorhandler(
|
if (unicode_decode_call_errorhandler(
|
||||||
|
@ -1652,23 +1700,33 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
|
||||||
goto onError;
|
goto onError;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (inShift && !consumed) {
|
/* end of string */
|
||||||
outpos = p-PyUnicode_AS_UNICODE(unicode);
|
|
||||||
endinpos = size;
|
if (inShift && !consumed) { /* in shift sequence, no more to follow */
|
||||||
if (unicode_decode_call_errorhandler(
|
/* if we're in an inconsistent state, that's an error */
|
||||||
errors, &errorHandler,
|
if (surrogate ||
|
||||||
"utf7", "unterminated shift sequence",
|
(base64bits >= 6) ||
|
||||||
starts, size, &startinpos, &endinpos, &exc, &s,
|
(base64bits > 0 && base64buffer != 0)) {
|
||||||
&unicode, &outpos, &p))
|
outpos = p-PyUnicode_AS_UNICODE(unicode);
|
||||||
goto onError;
|
endinpos = size;
|
||||||
if (s < e)
|
if (unicode_decode_call_errorhandler(
|
||||||
goto restart;
|
errors, &errorHandler,
|
||||||
|
"utf7", "unterminated shift sequence",
|
||||||
|
starts, size, &startinpos, &endinpos, &exc, &s,
|
||||||
|
&unicode, &outpos, &p))
|
||||||
|
goto onError;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* return state */
|
||||||
if (consumed) {
|
if (consumed) {
|
||||||
if(inShift)
|
if (inShift) {
|
||||||
|
p = shiftOutStart; /* back off output */
|
||||||
*consumed = startinpos;
|
*consumed = startinpos;
|
||||||
else
|
}
|
||||||
|
else {
|
||||||
*consumed = s-starts;
|
*consumed = s-starts;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
|
if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
|
||||||
|
@ -1688,27 +1746,27 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
|
||||||
|
|
||||||
PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
|
PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
|
||||||
Py_ssize_t size,
|
Py_ssize_t size,
|
||||||
int encodeSetO,
|
int base64SetO,
|
||||||
int encodeWhiteSpace,
|
int base64WhiteSpace,
|
||||||
const char *errors)
|
const char *errors)
|
||||||
{
|
{
|
||||||
PyObject *v;
|
PyObject *v;
|
||||||
/* It might be possible to tighten this worst case */
|
/* It might be possible to tighten this worst case */
|
||||||
Py_ssize_t cbAllocated = 5 * size;
|
Py_ssize_t allocated = 5 * size;
|
||||||
int inShift = 0;
|
int inShift = 0;
|
||||||
Py_ssize_t i = 0;
|
Py_ssize_t i = 0;
|
||||||
unsigned int bitsleft = 0;
|
unsigned int base64bits = 0;
|
||||||
unsigned long charsleft = 0;
|
unsigned long base64buffer = 0;
|
||||||
char * out;
|
char * out;
|
||||||
char * start;
|
char * start;
|
||||||
|
|
||||||
if (cbAllocated / 5 != size)
|
if (allocated / 5 != size)
|
||||||
return PyErr_NoMemory();
|
return PyErr_NoMemory();
|
||||||
|
|
||||||
if (size == 0)
|
if (size == 0)
|
||||||
return PyString_FromStringAndSize(NULL, 0);
|
return PyString_FromStringAndSize(NULL, 0);
|
||||||
|
|
||||||
v = PyString_FromStringAndSize(NULL, cbAllocated);
|
v = PyString_FromStringAndSize(NULL, allocated);
|
||||||
if (v == NULL)
|
if (v == NULL)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
|
@ -1716,78 +1774,76 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
|
||||||
for (;i < size; ++i) {
|
for (;i < size; ++i) {
|
||||||
Py_UNICODE ch = s[i];
|
Py_UNICODE ch = s[i];
|
||||||
|
|
||||||
if (!inShift) {
|
if (inShift) {
|
||||||
if (ch == '+') {
|
if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
|
||||||
*out++ = '+';
|
/* shifting out */
|
||||||
*out++ = '-';
|
if (base64bits) { /* output remaining bits */
|
||||||
} else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
|
*out++ = TO_BASE64(base64buffer << (6-base64bits));
|
||||||
charsleft = ch;
|
base64buffer = 0;
|
||||||
bitsleft = 16;
|
base64bits = 0;
|
||||||
*out++ = '+';
|
|
||||||
/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
|
|
||||||
inShift = bitsleft > 0;
|
|
||||||
} else {
|
|
||||||
*out++ = (char) ch;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
|
|
||||||
*out++ = B64(charsleft << (6-bitsleft));
|
|
||||||
charsleft = 0;
|
|
||||||
bitsleft = 0;
|
|
||||||
/* Characters not in the BASE64 set implicitly unshift the sequence
|
|
||||||
so no '-' is required, except if the character is itself a '-' */
|
|
||||||
if (B64CHAR(ch) || ch == '-') {
|
|
||||||
*out++ = '-';
|
|
||||||
}
|
}
|
||||||
inShift = 0;
|
inShift = 0;
|
||||||
*out++ = (char) ch;
|
/* Characters not in the BASE64 set implicitly unshift the sequence
|
||||||
} else {
|
so no '-' is required, except if the character is itself a '-' */
|
||||||
bitsleft += 16;
|
if (IS_BASE64(ch) || ch == '-') {
|
||||||
charsleft = (charsleft << 16) | ch;
|
*out++ = '-';
|
||||||
/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
|
|
||||||
|
|
||||||
/* If the next character is special then we don't need to terminate
|
|
||||||
the shift sequence. If the next character is not a BASE64 character
|
|
||||||
or '-' then the shift sequence will be terminated implicitly and we
|
|
||||||
don't have to insert a '-'. */
|
|
||||||
|
|
||||||
if (bitsleft == 0) {
|
|
||||||
if (i + 1 < size) {
|
|
||||||
Py_UNICODE ch2 = s[i+1];
|
|
||||||
|
|
||||||
if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
|
|
||||||
|
|
||||||
} else if (B64CHAR(ch2) || ch2 == '-') {
|
|
||||||
*out++ = '-';
|
|
||||||
inShift = 0;
|
|
||||||
} else {
|
|
||||||
inShift = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
*out++ = '-';
|
|
||||||
inShift = 0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
*out++ = (char) ch;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
goto encode_char;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else { /* not in a shift sequence */
|
||||||
|
if (ch == '+') {
|
||||||
|
*out++ = '+';
|
||||||
|
*out++ = '-';
|
||||||
|
}
|
||||||
|
else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
|
||||||
|
*out++ = (char) ch;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
*out++ = '+';
|
||||||
|
inShift = 1;
|
||||||
|
goto encode_char;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
encode_char:
|
||||||
|
#ifdef Py_UNICODE_WIDE
|
||||||
|
if (ch >= 0x10000) {
|
||||||
|
/* code first surrogate */
|
||||||
|
base64bits += 16;
|
||||||
|
base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
|
||||||
|
while (base64bits >= 6) {
|
||||||
|
*out++ = TO_BASE64(base64buffer >> (base64bits-6));
|
||||||
|
base64bits -= 6;
|
||||||
|
}
|
||||||
|
/* prepare second surrogate */
|
||||||
|
ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
base64bits += 16;
|
||||||
|
base64buffer = (base64buffer << 16) | ch;
|
||||||
|
while (base64bits >= 6) {
|
||||||
|
*out++ = TO_BASE64(base64buffer >> (base64bits-6));
|
||||||
|
base64bits -= 6;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (bitsleft) {
|
if (base64bits)
|
||||||
*out++= B64(charsleft << (6-bitsleft) );
|
*out++= TO_BASE64(base64buffer << (6-base64bits) );
|
||||||
|
if (inShift)
|
||||||
*out++ = '-';
|
*out++ = '-';
|
||||||
}
|
|
||||||
|
|
||||||
_PyString_Resize(&v, out - start);
|
_PyString_Resize(&v, out - start);
|
||||||
return v;
|
return v;
|
||||||
}
|
}
|
||||||
|
|
||||||
#undef SPECIAL
|
#undef IS_BASE64
|
||||||
#undef B64
|
#undef FROM_BASE64
|
||||||
#undef B64CHAR
|
#undef TO_BASE64
|
||||||
#undef UB64
|
#undef DECODE_DIRECT
|
||||||
#undef ENCODE
|
#undef ENCODE_DIRECT
|
||||||
#undef DECODE
|
|
||||||
|
|
||||||
/* --- UTF-8 Codec -------------------------------------------------------- */
|
/* --- UTF-8 Codec -------------------------------------------------------- */
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue