SF patch #438013 Remove 2-byte Py_UCS2 assumptions

Removed all instances of Py_UCS2 from the codebase, and so also (I hope) the last remaining reliance on the platform having an integral type with exactly 16 bits. PyUnicode_DecodeUTF16() and PyUnicode_EncodeUTF16() now read and write one byte at a time.
2001-08-09 22:21:55 +00:00 · 2001-08-09 22:21:55 +00:00 · 772747b3f1
parent ab9ba27dc0
commit 772747b3f1
2 changed files with 91 additions and 83 deletions
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -121,12 +121,6 @@ typedef unsigned int Py_UCS4;
 typedef unsigned long Py_UCS4; 
 #endif

-#if SIZEOF_SHORT == 2
-typedef unsigned short Py_UCS2;
-#else
-#error Cannot find a two-byte type
-#endif 
-
 typedef PY_UNICODE_TYPE Py_UNICODE;

 /* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -944,8 +944,7 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
 /* --- UTF-16 Codec ------------------------------------------------------- */

 static
-int utf16_decoding_error(const Py_UCS2 **source,
-			 Py_UNICODE **dest,
+int utf16_decoding_error(Py_UNICODE **dest,
 			 const char *errors,
 			 const char *details) 
 {
@@ -975,23 +974,29 @@ int utf16_decoding_error(const Py_UCS2 **source,
    }
 }

-PyObject *PyUnicode_DecodeUTF16(const char *s,
+PyObject *
+PyUnicode_DecodeUTF16(const char *s,
 		      int size,
 		      const char *errors,
 		      int *byteorder)
 {
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
-    const Py_UCS2 *q, *e;
-    int bo = 0;
+    const unsigned char *q, *e;
+    int bo = 0;       /* assume native ordering by default */
    const char *errmsg = "";
+    /* Offsets from q for retrieving byte pairs in the right order. */
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+    int ihi = 1, ilo = 0;
+#else
+    int ihi = 0, ilo = 1;
+#endif

    /* size should be an even number */
-    if (size % sizeof(Py_UCS2) != 0) {
-	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
+    if (size & 1) {
+        if (utf16_decoding_error(NULL, errors, "truncated data"))
            return NULL;
-	/* The remaining input chars are ignored if we fall through
-           here... */
+        --size;  /* else ignore the oddball byte */
    }

    /* Note: size will always be longer than the resulting Unicode
@@ -1004,8 +1009,8 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,

    /* Unpack UTF-16 encoded data */
    p = unicode->str;
-    q = (Py_UCS2 *)s;
-    e = q + (size / sizeof(Py_UCS2));
+    q = (unsigned char *)s;
+    e = q + size;

    if (byteorder)
        bo = *byteorder;
@@ -1015,37 +1020,43 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0) {
+        const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
-	if (*q == 0xFEFF) {
-	    q++;
+	if (bom == 0xFEFF) {
+	    q += 2;
 	    bo = -1;
-	} else if (*q == 0xFFFE) {
-	    q++;
+	}
+        else if (bom == 0xFFFE) {
+	    q += 2;
 	    bo = 1;
 	}
 #else    
-	if (*q == 0xFEFF) {
-	    q++;
+	if (bom == 0xFEFF) {
+	    q += 2;
 	    bo = 1;
-	} else if (*q == 0xFFFE) {
-	    q++;
+	}
+        else if (bom == 0xFFFE) {
+	    q += 2;
 	    bo = -1;
 	}
 #endif
    }

+    if (bo == -1) {
+        /* force LE */
+        ihi = 1;
+        ilo = 0;
+    }
+    else if (bo == 1) {
+        /* force BE */
+        ihi = 0;
+        ilo = 1;
+    }
+
    while (q < e) {
-	register Py_UCS2 ch = *q++;
+	Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
+	q += 2;

-	/* Swap input bytes if needed. (This assumes
-	   sizeof(Py_UNICODE) == 2 !) */
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
-	if (bo == 1)
-	    ch = (ch >> 8) | (ch << 8);
-#else    
-	if (bo == -1)
-	    ch = (ch >> 8) | (ch << 8);
-#endif
 	if (ch < 0xD800 || ch > 0xDFFF) {
 	    *p++ = ch;
 	    continue;
@@ -1057,14 +1068,8 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
 	    goto utf16Error;
 	}
 	if (0xD800 <= ch && ch <= 0xDBFF) {
-	    Py_UCS2 ch2 = *q++;
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
-	    if (bo == 1)
-		    ch2 = (ch2 >> 8) | (ch2 << 8);
-#else    
-	    if (bo == -1)
-		    ch2 = (ch2 >> 8) | (ch2 << 8);
-#endif
+	    Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
+	    q += 2;
 	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
 #ifndef Py_UNICODE_WIDE
 		*p++ = ch;
@@ -1084,7 +1089,7 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
 	/* Fall through to report the error */

    utf16Error:
-	if (utf16_decoding_error(&q, &p, errors, errmsg))
+	if (utf16_decoding_error(&p, errors, errmsg))
 	    goto onError;
    }

@@ -1102,58 +1107,67 @@ onError:
    return NULL;
 }

-#undef UTF16_ERROR
-
-PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
+PyObject *
+PyUnicode_EncodeUTF16(const Py_UNICODE *s,
 		      int size,
 		      const char *errors,
 		      int byteorder)
 {
    PyObject *v;
-    Py_UCS2 *p;
-    char *q;
-    int i, pairs, doswap = 1;
+    unsigned char *p;
+    int i, pairs;
+    /* Offsets from p for storing byte pairs in the right order. */
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+    int ihi = 1, ilo = 0;
+#else
+    int ihi = 0, ilo = 1;
+#endif
+
+#define STORECHAR(CH)                   \
+    do {                                \
+        p[ihi] = ((CH) >> 8) & 0xff;    \
+        p[ilo] = (CH) & 0xff;           \
+        p += 2;                         \
+    } while(0)

    for (i = pairs = 0; i < size; i++)
 	if (s[i] >= 0x10000)
 	    pairs++;
    v = PyString_FromStringAndSize(NULL, 
-		  sizeof(Py_UCS2) * (size + pairs + (byteorder == 0)));
+		  2 * (size + pairs + (byteorder == 0)));
    if (v == NULL)
        return NULL;

-    q = PyString_AS_STRING(v);
-    p = (Py_UCS2 *)q;
+    p = (unsigned char *)PyString_AS_STRING(v);
    if (byteorder == 0)
-	*p++ = 0xFEFF;
+	STORECHAR(0xFEFF);
    if (size == 0)
        return v;
-    if (byteorder == 0 ||
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN	
-	byteorder == -1
-#else
-	byteorder == 1
-#endif
-	)
-	doswap = 0;
+
+    if (byteorder == -1) {
+        /* force LE */
+        ihi = 1;
+        ilo = 0;
+    }
+    else if (byteorder == 1) {
+        /* force BE */
+        ihi = 0;
+        ilo = 1;
+    }
+
    while (size-- > 0) {
 	Py_UNICODE ch = *s++;
 	Py_UNICODE ch2 = 0;
 	if (ch >= 0x10000) {
-	    ch2 = 0xDC00|((ch-0x10000) & 0x3FF);
-	    ch  = 0xD800|((ch-0x10000)>>10);
+	    ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
+	    ch  = 0xD800 | ((ch-0x10000) >> 10);
 	}
-	if (doswap){
-	    *p++ = (ch >> 8) | (ch << 8);
+        STORECHAR(ch);
        if (ch2)
-		*p++ = (ch2 >> 8) | (ch2 << 8);
-	}else{
-	    *p++ = ch;
-	    if(ch2)
-		*p++ = ch2;
-	}
+            STORECHAR(ch2);
    }
    return v;
+#undef STORECHAR
 }

 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)