Patch #435971: UTF-7 codec by Brian Quinlan.

This commit is contained in:
Marc-André Lemburg 2001-09-20 10:35:46 +00:00
parent 26e3b681b2
commit c60e6f7771
5 changed files with 392 additions and 1 deletions

View File

@ -607,6 +607,24 @@ extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString(
const char *errors /* error handling */
);
/* --- UTF-7 Codecs ------------------------------------------------------- */
extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF7(
const char *string, /* UTF-7 encoded string */
int length, /* size of string */
const char *errors /* error handling */
);
extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF7(
const Py_UNICODE *data, /* Unicode char buffer */
int length, /* number of Py_UNICODE chars to encode */
int encodeSetO, /* force the encoder to encode characters in
Set O, as described in RFC2152 */
int encodeWhiteSpace, /* force the encoder to encode space, tab,
carriage return and linefeed characters */
const char *errors /* error handling */
);
/* --- UTF-8 Codecs ------------------------------------------------------- */
extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8(

View File

@ -14,6 +14,10 @@ aliases = {
'latin': 'latin_1',
'latin1': 'latin_1',
# UTF-7
'utf7': 'utf_7',
'u7': 'utf_7',
# UTF-8
'utf': 'utf_8',
'utf8': 'utf_8',

View File

@ -377,6 +377,32 @@ print 'done.'
# Test builtin codecs
print 'Testing builtin codecs...',
# UTF-7 specific encoding tests:
utfTests = [(u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
(u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
(u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
(u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
(u'+', '+-'),
(u'+-', '+--'),
(u'+?', '+-?'),
(u'\?', '+AFw?'),
(u'+?', '+-?'),
(ur'\\?', '+AFwAXA?'),
(ur'\\\?', '+AFwAXABc?'),
(ur'++--', '+-+---')]
for x,y in utfTests:
verify( x.encode('utf-7') == y )
try:
unicode('+3ADYAA-', 'utf-7') # surrogates not supported
except UnicodeError:
pass
else:
raise TestFailed, "unicode('+3ADYAA-', 'utf-7') failed to raise an exception"
verify(unicode('+3ADYAA-', 'utf-7', 'replace') == u'\ufffd')
# UTF-8 specific encoding tests:
verify(u'\u20ac'.encode('utf-8') == \
''.join((chr(0xe2), chr(0x82), chr(0xac))) )
@ -439,6 +465,7 @@ verify(unicode('Andr\202 x','ascii','ignore') == u"Andr x")
verify(unicode('Andr\202 x','ascii','replace') == u'Andr\uFFFD x')
verify(u'hello'.encode('ascii') == 'hello')
verify(u'hello'.encode('utf-7') == 'hello')
verify(u'hello'.encode('utf-8') == 'hello')
verify(u'hello'.encode('utf8') == 'hello')
verify(u'hello'.encode('utf-16-le') == 'h\000e\000l\000l\000o\000')
@ -447,7 +474,7 @@ verify(u'hello'.encode('latin-1') == 'hello')
# Roundtrip safety for BMP (just the first 1024 chars)
u = u''.join(map(unichr, range(1024)))
for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
verify(unicode(u.encode(encoding),encoding) == u)

View File

@ -123,6 +123,22 @@ unicode_internal_decode(PyObject *self,
}
}
static PyObject *
utf_7_decode(PyObject *self,
PyObject *args)
{
const char *data;
int size;
const char *errors = NULL;
if (!PyArg_ParseTuple(args, "t#|z:utf_7_decode",
&data, &size, &errors))
return NULL;
return codec_tuple(PyUnicode_DecodeUTF7(data, size, errors),
size);
}
static PyObject *
utf_8_decode(PyObject *self,
PyObject *args)
@ -381,6 +397,30 @@ unicode_internal_encode(PyObject *self,
}
}
static PyObject *
utf_7_encode(PyObject *self,
PyObject *args)
{
PyObject *str, *v;
const char *errors = NULL;
if (!PyArg_ParseTuple(args, "O|z:utf_7_encode",
&str, &errors))
return NULL;
str = PyUnicode_FromObject(str);
if (str == NULL)
return NULL;
v = codec_tuple(PyUnicode_EncodeUTF7(PyUnicode_AS_UNICODE(str),
PyUnicode_GET_SIZE(str),
0,
0,
errors),
PyUnicode_GET_SIZE(str));
Py_DECREF(str);
return v;
}
static PyObject *
utf_8_encode(PyObject *self,
PyObject *args)
@ -632,6 +672,8 @@ static PyMethodDef _codecs_functions[] = {
#ifdef Py_USING_UNICODE
{"utf_8_encode", utf_8_encode, 1},
{"utf_8_decode", utf_8_decode, 1},
{"utf_7_encode", utf_7_encode, 1},
{"utf_7_decode", utf_7_decode, 1},
{"utf_16_encode", utf_16_encode, 1},
{"utf_16_le_encode", utf_16_le_encode, 1},
{"utf_16_be_encode", utf_16_be_encode, 1},

View File

@ -635,6 +635,306 @@ int PyUnicode_SetDefaultEncoding(const char *encoding)
return -1;
}
/* --- UTF-7 Codec -------------------------------------------------------- */
/* see RFC2152 for details */
static
char utf7_special[128] = {
/* indicate whether a UTF-7 character is special i.e. cannot be directly
encoded:
0 - not special
1 - special
2 - whitespace (optional)
3 - RFC2152 Set O (optional) */
1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
#define SPECIAL(c, encodeO, encodeWS) \
(((c)>127 || utf7_special[(c)] == 1) || \
(encodeWS && (utf7_special[(c)] == 2)) || \
(encodeO && (utf7_special[(c)] == 3)))
#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
#define B64CHAR(c) (isalnum(c) || (c) == '+' || (c) == '/')
#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
(c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)
#define ENCODE(out, ch, bits) \
while (bits >= 6) { \
*out++ = B64(ch >> (bits-6)); \
bits -= 6; \
}
#define DECODE(out, ch, bits, surrogate) \
while (bits >= 16) { \
Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
bits -= 16; \
if (surrogate) { \
/* We have already generated an error for the high surrogate
so let's not bother seeing if the low surrogate is correct or not */\
surrogate = 0; \
} else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
/* This is a surrogate pair. Unfortunately we can't represent \
it in a 16-bit character */ \
surrogate = 1; \
errmsg = "code pairs are not supported"; \
goto utf7Error; \
} else { \
*out++ = outCh; \
} \
} \
static
int utf7_decoding_error(Py_UNICODE **dest,
const char *errors,
const char *details)
{
if ((errors == NULL) ||
(strcmp(errors,"strict") == 0)) {
PyErr_Format(PyExc_UnicodeError,
"UTF-7 decoding error: %.400s",
details);
return -1;
}
else if (strcmp(errors,"ignore") == 0) {
return 0;
}
else if (strcmp(errors,"replace") == 0) {
if (dest != NULL) {
**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
(*dest)++;
}
return 0;
}
else {
PyErr_Format(PyExc_ValueError,
"UTF-7 decoding error; unknown error handling code: %.400s",
errors);
return -1;
}
}
PyObject *PyUnicode_DecodeUTF7(const char *s,
int size,
const char *errors)
{
const char *e;
PyUnicodeObject *unicode;
Py_UNICODE *p;
const char *errmsg = "";
int inShift = 0;
unsigned int bitsleft = 0;
unsigned long charsleft = 0;
int surrogate = 0;
unicode = _PyUnicode_New(size);
if (!unicode)
return NULL;
if (size == 0)
return (PyObject *)unicode;
p = unicode->str;
e = s + size;
while (s < e) {
Py_UNICODE ch = *s;
if (inShift) {
if ((ch == '-') || !B64CHAR(ch)) {
inShift = 0;
s++;
/* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
if (bitsleft >= 6) {
/* The shift sequence has a partial character in it. If
bitsleft < 6 then we could just classify it as padding
but that is not the case here */
errmsg = "partial character in shift sequence";
goto utf7Error;
}
/* According to RFC2152 the remaining bits should be zero. We
choose to signal an error/insert a replacement character
here so indicate the potential of a misencoded character. */
/* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
errmsg = "non-zero padding bits in shift sequence";
goto utf7Error;
}
if (ch == '-') {
if ((s < e) && (*(s) == '-')) {
*p++ = '-';
inShift = 1;
}
} else if (SPECIAL(ch,0,0)) {
errmsg = "unexpected special character";
goto utf7Error;
} else {
*p++ = ch;
}
} else {
charsleft = (charsleft << 6) | UB64(ch);
bitsleft += 6;
s++;
/* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
}
}
else if ( ch == '+' ) {
s++;
if (s < e && *s == '-') {
s++;
*p++ = '+';
} else
{
inShift = 1;
bitsleft = 0;
}
}
else if (SPECIAL(ch,0,0)) {
errmsg = "unexpected special character";
s++;
goto utf7Error;
}
else {
*p++ = ch;
s++;
}
continue;
utf7Error:
if (utf7_decoding_error(&p, errors, errmsg))
goto onError;
}
if (inShift) {
if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
goto onError;
}
if (_PyUnicode_Resize(&unicode, p - unicode->str))
goto onError;
return (PyObject *)unicode;
onError:
Py_DECREF(unicode);
return NULL;
}
PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
int size,
int encodeSetO,
int encodeWhiteSpace,
const char *errors)
{
PyObject *v;
/* It might be possible to tighten this worst case */
unsigned int cbAllocated = 5 * size;
int inShift = 0;
int i = 0;
unsigned int bitsleft = 0;
unsigned long charsleft = 0;
char * out;
char * start;
if (size == 0)
return PyString_FromStringAndSize(NULL, 0);
v = PyString_FromStringAndSize(NULL, cbAllocated);
if (v == NULL)
return NULL;
start = out = PyString_AS_STRING(v);
for (;i < size; ++i) {
Py_UNICODE ch = s[i];
if (!inShift) {
if (ch == '+') {
*out++ = '+';
*out++ = '-';
} else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
charsleft = ch;
bitsleft = 16;
*out++ = '+';
/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
inShift = bitsleft > 0;
} else {
*out++ = (char) ch;
}
} else {
if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
*out++ = B64(charsleft << (6-bitsleft));
charsleft = 0;
bitsleft = 0;
/* Characters not in the BASE64 set implicitly unshift the sequence
so no '-' is required, except if the character is itself a '-' */
if (B64CHAR(ch) || ch == '-') {
*out++ = '-';
}
inShift = 0;
*out++ = (char) ch;
} else {
bitsleft += 16;
charsleft = (charsleft << 16) | ch;
/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
/* If the next character is special then we dont' need to terminate
the shift sequence. If the next character is not a BASE64 character
or '-' then the shift sequence will be terminated implicitly and we
don't have to insert a '-'. */
if (bitsleft == 0) {
if (i + 1 < size) {
Py_UNICODE ch2 = s[i+1];
if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
} else if (B64CHAR(ch2) || ch2 == '-') {
*out++ = '-';
inShift = 0;
} else {
inShift = 0;
}
}
else {
*out++ = '-';
inShift = 0;
}
}
}
}
}
if (bitsleft) {
*out++= B64(charsleft << (6-bitsleft) );
*out++ = '-';
}
if (_PyString_Resize(&v, out - start)) {
Py_DECREF(v);
return NULL;
}
return v;
}
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
/* --- UTF-8 Codec -------------------------------------------------------- */
static