SF patch #438013 Remove 2-byte Py_UCS2 assumptions

Removed all instances of Py_UCS2 from the codebase, and so also (I hope)
the last remaining reliance on the platform having an integral type
with exactly 16 bits.
PyUnicode_DecodeUTF16() and PyUnicode_EncodeUTF16() now read and write
one byte at a time.
This commit is contained in:
Tim Peters 2001-08-09 22:21:55 +00:00
parent ab9ba27dc0
commit 772747b3f1
2 changed files with 91 additions and 83 deletions

View File

@ -121,12 +121,6 @@ typedef unsigned int Py_UCS4;
typedef unsigned long Py_UCS4;
#endif
#if SIZEOF_SHORT == 2
typedef unsigned short Py_UCS2;
#else
#error Cannot find a two-byte type
#endif
typedef PY_UNICODE_TYPE Py_UNICODE;
/* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */

View File

@ -944,8 +944,7 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
/* --- UTF-16 Codec ------------------------------------------------------- */
static
int utf16_decoding_error(const Py_UCS2 **source,
Py_UNICODE **dest,
int utf16_decoding_error(Py_UNICODE **dest,
const char *errors,
const char *details)
{
@ -975,23 +974,29 @@ int utf16_decoding_error(const Py_UCS2 **source,
}
}
PyObject *PyUnicode_DecodeUTF16(const char *s,
PyObject *
PyUnicode_DecodeUTF16(const char *s,
int size,
const char *errors,
int *byteorder)
{
PyUnicodeObject *unicode;
Py_UNICODE *p;
const Py_UCS2 *q, *e;
int bo = 0;
const unsigned char *q, *e;
int bo = 0; /* assume native ordering by default */
const char *errmsg = "";
/* Offsets from q for retrieving byte pairs in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
int ihi = 1, ilo = 0;
#else
int ihi = 0, ilo = 1;
#endif
/* size should be an even number */
if (size % sizeof(Py_UCS2) != 0) {
if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
if (size & 1) {
if (utf16_decoding_error(NULL, errors, "truncated data"))
return NULL;
/* The remaining input chars are ignored if we fall through
here... */
--size; /* else ignore the oddball byte */
}
/* Note: size will always be longer than the resulting Unicode
@ -1004,8 +1009,8 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
/* Unpack UTF-16 encoded data */
p = unicode->str;
q = (Py_UCS2 *)s;
e = q + (size / sizeof(Py_UCS2));
q = (unsigned char *)s;
e = q + size;
if (byteorder)
bo = *byteorder;
@ -1015,37 +1020,43 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
mark is skipped, in all other modes, it is copied to the output
stream as-is (giving a ZWNBSP character). */
if (bo == 0) {
const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
if (*q == 0xFEFF) {
q++;
if (bom == 0xFEFF) {
q += 2;
bo = -1;
} else if (*q == 0xFFFE) {
q++;
}
else if (bom == 0xFFFE) {
q += 2;
bo = 1;
}
#else
if (*q == 0xFEFF) {
q++;
if (bom == 0xFEFF) {
q += 2;
bo = 1;
} else if (*q == 0xFFFE) {
q++;
}
else if (bom == 0xFFFE) {
q += 2;
bo = -1;
}
#endif
}
if (bo == -1) {
/* force LE */
ihi = 1;
ilo = 0;
}
else if (bo == 1) {
/* force BE */
ihi = 0;
ilo = 1;
}
while (q < e) {
register Py_UCS2 ch = *q++;
Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
q += 2;
/* Swap input bytes if needed. (This assumes
sizeof(Py_UNICODE) == 2 !) */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
if (bo == 1)
ch = (ch >> 8) | (ch << 8);
#else
if (bo == -1)
ch = (ch >> 8) | (ch << 8);
#endif
if (ch < 0xD800 || ch > 0xDFFF) {
*p++ = ch;
continue;
@ -1057,14 +1068,8 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
goto utf16Error;
}
if (0xD800 <= ch && ch <= 0xDBFF) {
Py_UCS2 ch2 = *q++;
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
if (bo == 1)
ch2 = (ch2 >> 8) | (ch2 << 8);
#else
if (bo == -1)
ch2 = (ch2 >> 8) | (ch2 << 8);
#endif
Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
q += 2;
if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
#ifndef Py_UNICODE_WIDE
*p++ = ch;
@ -1084,7 +1089,7 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
/* Fall through to report the error */
utf16Error:
if (utf16_decoding_error(&q, &p, errors, errmsg))
if (utf16_decoding_error(&p, errors, errmsg))
goto onError;
}
@ -1102,58 +1107,67 @@ onError:
return NULL;
}
#undef UTF16_ERROR
PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE *s,
int size,
const char *errors,
int byteorder)
{
PyObject *v;
Py_UCS2 *p;
char *q;
int i, pairs, doswap = 1;
unsigned char *p;
int i, pairs;
/* Offsets from p for storing byte pairs in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
int ihi = 1, ilo = 0;
#else
int ihi = 0, ilo = 1;
#endif
#define STORECHAR(CH) \
do { \
p[ihi] = ((CH) >> 8) & 0xff; \
p[ilo] = (CH) & 0xff; \
p += 2; \
} while(0)
for (i = pairs = 0; i < size; i++)
if (s[i] >= 0x10000)
pairs++;
v = PyString_FromStringAndSize(NULL,
sizeof(Py_UCS2) * (size + pairs + (byteorder == 0)));
2 * (size + pairs + (byteorder == 0)));
if (v == NULL)
return NULL;
q = PyString_AS_STRING(v);
p = (Py_UCS2 *)q;
p = (unsigned char *)PyString_AS_STRING(v);
if (byteorder == 0)
*p++ = 0xFEFF;
STORECHAR(0xFEFF);
if (size == 0)
return v;
if (byteorder == 0 ||
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
byteorder == -1
#else
byteorder == 1
#endif
)
doswap = 0;
if (byteorder == -1) {
/* force LE */
ihi = 1;
ilo = 0;
}
else if (byteorder == 1) {
/* force BE */
ihi = 0;
ilo = 1;
}
while (size-- > 0) {
Py_UNICODE ch = *s++;
Py_UNICODE ch2 = 0;
if (ch >= 0x10000) {
ch2 = 0xDC00|((ch-0x10000) & 0x3FF);
ch = 0xD800|((ch-0x10000)>>10);
ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
ch = 0xD800 | ((ch-0x10000) >> 10);
}
if (doswap){
*p++ = (ch >> 8) | (ch << 8);
STORECHAR(ch);
if (ch2)
*p++ = (ch2 >> 8) | (ch2 << 8);
}else{
*p++ = ch;
if(ch2)
*p++ = ch2;
}
STORECHAR(ch2);
}
return v;
#undef STORECHAR
}
PyObject *PyUnicode_AsUTF16String(PyObject *unicode)