Support using UCS-4 as the Py_UNICODE type:

Add the configure option --enable-unicode. Add the config.h macros Py_USING_UNICODE, PY_UNICODE_TYPE, Py_UNICODE_SIZE, and SIZEOF_WCHAR_T. Define Py_UCS2. For wide Unicode types, encode and decode UTF-8 characters at or above U+10000 as single Py_UNICODE values instead of surrogate pairs; likewise for UTF-16. Remove the check in _PyUnicode_Init that sizeof(Py_UNICODE) is two.
2001-06-26 22:22:37 +00:00 · 2001-06-26 22:22:37 +00:00 · 0ba70cc3c8
parent ff1cc902fe
commit 0ba70cc3c8
7 changed files with 667 additions and 473 deletions
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@ -60,16 +60,9 @@ Copyright (c) Corporation for National Research Initiatives.
 /* experimental UCS-4 support.  enable at your own risk! */
 #undef USE_UCS4_STORAGE
-
+#if Py_UNICODE_SIZE == 4
-/*
+#define USE_UCS4_STORAGE
- * Use this typedef when you need to represent a UTF-16 surrogate pair
+#endif
 * as single unsigned integer.
 */
 #if SIZEOF_INT >= 4 
 typedef unsigned int Py_UCS4; 
 #elif SIZEOF_LONG >= 4
 typedef unsigned long Py_UCS4; 
 #endif 
 /* Set these flags if the platform has "wchar.h", "wctype.h" and the
   wchar_t type is a 16-bit unsigned type */
@ -77,11 +70,16 @@ typedef unsigned long Py_UCS4;
 /* #define HAVE_USABLE_WCHAR_T */
 /* Defaults for various platforms */
-#ifndef HAVE_USABLE_WCHAR_T
+#ifndef PY_UNICODE_TYPE
 /* Windows has a usable wchar_t type (unless we're using UCS-4) */
 # if defined(MS_WIN32) && !defined(USE_UCS4_STORAGE)
 #  define HAVE_USABLE_WCHAR_T
 #  define PY_UNICODE_TYPE wchar_t
 # endif
 # if defined(USE_UCS4_STORAGE)
 #  define PY_UNICODE_TYPE Py_UCS4
 # endif
 #endif
@ -104,28 +102,23 @@ typedef unsigned long Py_UCS4;
 # include "wchar.h"
 #endif
-#ifdef HAVE_USABLE_WCHAR_T
+/*
-
+ * Use this typedef when you need to represent a UTF-16 surrogate pair
-/* If the compiler defines whcar_t as a 16-bit unsigned type we can
+ * as single unsigned integer.
-   use the compiler type directly.  Works fine with all modern Windows
+ */
-   platforms. */
+#if SIZEOF_INT >= 4 
-
+typedef unsigned int Py_UCS4; 
-typedef wchar_t Py_UNICODE;
+#elif SIZEOF_LONG >= 4
-
+typedef unsigned long Py_UCS4; 
 #else
 /* Use if you have a standard ANSI compiler, without wchar_t support.
   If a short is not 16 bits on your platform, you have to fix the
   typedef below, or the module initialization code will complain. */
 #ifdef USE_UCS4_STORAGE
 typedef Py_UCS4 Py_UNICODE;
 #else
 typedef unsigned short Py_UNICODE;
 #endif
-#endif
+#if SIZEOF_SHORT == 2
 typedef unsigned short Py_UCS2;
 #else
 #error Cannot find a two-byte type
 #endif 
 typedef PY_UNICODE_TYPE Py_UNICODE;
 /* --- Internal Unicode Operations ---------------------------------------- */
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -771,13 +771,17 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            /* validate and convert to UTF-16 */
-            if ((ch < 0x10000) ||   /* minimum value allowed for 4
+            if ((ch < 0x10000)        /* minimum value allowed for 4
                                       byte encoding */
-                (ch > 0x10ffff)) {  /* maximum value allowed for
+                || (ch > 0x10ffff))   /* maximum value allowed for
                                       UTF-16 */
 	    {
                errmsg = "illegal encoding";
 		goto utf8Error;
 	    }
 #if Py_UNICODE_SIZE == 4
 	    *p++ = (Py_UNICODE)ch;
 #else
            /*  compute and append the two surrogates: */
            /*  translate from 10000..10FFFF to 0..FFFF */
@ -788,6 +792,7 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
 #endif
            break;
        default:
@ -878,7 +883,13 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
            *p++ = 0x80 | (ch & 0x3f);
            cbWritten += 2;
        }
-        else {
+        else if (ch < 0x10000) {
 #if Py_UNICODE_SIZE == 4
 	    *p++ = 0xe0 | (ch>>12);
            *p++ = 0x80 | ((ch>>6) & 0x3f);
            *p++ = 0x80 | (ch & 0x3f);
            cbWritten += 3;
 #else
            /* Check for high surrogate */
            if (0xD800 <= ch && ch <= 0xDBFF) {
                if (i != size) {
@ -909,7 +920,14 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
            }
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *p++ = (char)(0x80 | (ch & 0x3f));
-        }
+#endif
        } else {
            *p++ = 0xf0 | (ch>>18);
            *p++ = 0x80 | ((ch>>12) & 0x3f);
            *p++ = 0x80 | ((ch>>6) & 0x3f);
            *p++ = 0x80 | (ch & 0x3f);
            cbWritten += 4;
 	}
    }
    *p = '\0';
    if (_PyString_Resize(&v, p - q))
@ -935,7 +953,7 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
 /* --- UTF-16 Codec ------------------------------------------------------- */
 static
-int utf16_decoding_error(const Py_UNICODE **source,
+int utf16_decoding_error(const Py_UCS2 **source,
 			 Py_UNICODE **dest,
 			 const char *errors,
 			 const char *details) 
@ -973,12 +991,12 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
 {
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
-    const Py_UNICODE *q, *e;
+    const Py_UCS2 *q, *e;
    int bo = 0;
    const char *errmsg = "";
    /* size should be an even number */
-    if (size % sizeof(Py_UNICODE) != 0) {
+    if (size % sizeof(Py_UCS2) != 0) {
 	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
 	    return NULL;
 	/* The remaining input chars are ignored if we fall through
@ -995,8 +1013,8 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
    /* Unpack UTF-16 encoded data */
    p = unicode->str;
-    q = (Py_UNICODE *)s;
+    q = (Py_UCS2 *)s;
-    e = q + (size / sizeof(Py_UNICODE));
+    e = q + (size / sizeof(Py_UCS2));
    if (byteorder)
 	bo = *byteorder;
@ -1026,7 +1044,7 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
    }
    while (q < e) {
-	register Py_UNICODE ch = *q++;
+	register Py_UCS2 ch = *q++;
 	/* Swap input bytes if needed. (This assumes
 	   sizeof(Py_UCS2) == 2 !) */
@ -1048,17 +1066,33 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
 	    goto utf16Error;
 	}
 	if (0xDC00 <= *q && *q <= 0xDFFF) {
-	    q++;
+	    Py_UCS2 ch2 = *q++;
-	    if (0xD800 <= *q && *q <= 0xDBFF) {
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
 	    if (bo == 1)
 		    ch = (ch >> 8) | (ch << 8);
 #else    
 	    if (bo == -1)
 		    ch = (ch >> 8) | (ch << 8);
 #endif
 	    if (0xD800 <= ch && ch <= 0xDBFF) {
 #if Py_UNICODE_SIZE == 2
 		/* This is valid data (a UTF-16 surrogate pair), but
 		   we are not able to store this information since our
 		   Py_UNICODE type only has 16 bits... this might
 		   change someday, even though it's unlikely. */
 		errmsg = "code pairs are not supported";
 		goto utf16Error;
-	    }
+#else
-	    else
+		*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
 		continue;
 #endif
 	    }
 	    else {
                errmsg = "illegal UTF-16 surrogate";
 		goto utf16Error;
 	    }
 	}
 	errmsg = "illegal encoding";
 	/* Fall through to report the error */
@ -1090,17 +1124,20 @@ PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
 				int byteorder)
 {
    PyObject *v;
-    Py_UNICODE *p;
+    Py_UCS2 *p;
    char *q;
    int i, pairs, doswap = 1;
-    /* We don't create UTF-16 pairs... */
+    for (i = pairs = 0; i < size; i++)
 	if (s[i] >= 0x10000)
 	    pairs++;
    v = PyString_FromStringAndSize(NULL, 
-			sizeof(Py_UNICODE) * (size + (byteorder == 0)));
+		  sizeof(Py_UCS2) * (size + pairs + (byteorder == 0)));
    if (v == NULL)
        return NULL;
    q = PyString_AS_STRING(v);
-    p = (Py_UNICODE *)q;
+    p = (Py_UCS2 *)q;
    if (byteorder == 0)
 	*p++ = 0xFEFF;
    if (size == 0)
@ -1112,12 +1149,24 @@ PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
 	byteorder == 1
 #endif
 	)
-	Py_UNICODE_COPY(p, s, size);
+	doswap = 0;
-    else
+    while (size-- > 0) {
-	while (size-- > 0) {
+	Py_UNICODE ch = *s++;
-	    Py_UNICODE ch = *s++;
+	Py_UNICODE ch2 = 0;
-	    *p++ = (ch >> 8) | (ch << 8);
+	if (ch >= 0x10000) {
 	    ch2 = 0xDC00|((ch-0x10000) & 0x3FF);
 	    ch  = 0xD800|((ch-0x10000)>>10);
 	}
 	if (doswap){
 	    *p++ = (ch >> 8) | (ch << 8);
 	    if (ch2)
 		*p++ = (ch2 >> 8) | (ch2 << 8);
 	}else{
 	    *p++ = ch;
 	    if(ch2)
 		*p++ = ch2;
 	}
    }
    return v;
 }
@ -1271,10 +1320,14 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
                /* UCS-2 character */
                *p++ = (Py_UNICODE) chr;
            else if (chr <= 0x10ffff) {
-                /* UCS-4 character.  store as two surrogate characters */
+                /* UCS-4 character. Either store directly, or as surrogate pair. */
 #if Py_UNICODE_SIZE == 4
                *p++ = chr;
 #else
                chr -= 0x10000L;
                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
 #endif
            } else {
                if (unicodeescape_decoding_error(
                    &s, &x, errors,
@ -1383,6 +1436,19 @@ PyObject *unicodeescape_string(const Py_UNICODE *s,
            *p++ = '\\';
            *p++ = (char) ch;
        } 
        /* Map 21-bit characters to '\U00xxxxxx' */
        else if (ch >= 0x10000) {
            *p++ = '\\';
            *p++ = 'U';
            *p++ = hexdigit[(ch >> 28) & 0xf];
            *p++ = hexdigit[(ch >> 24) & 0xf];
            *p++ = hexdigit[(ch >> 20) & 0xf];
            *p++ = hexdigit[(ch >> 16) & 0xf];
            *p++ = hexdigit[(ch >> 12) & 0xf];
            *p++ = hexdigit[(ch >> 8) & 0xf];
            *p++ = hexdigit[(ch >> 4) & 0xf];
            *p++ = hexdigit[ch & 15];
        }
        /* Map 16-bit characters to '\uxxxx' */
        else if (ch >= 256) {
            *p++ = '\\';
@ -5281,13 +5347,6 @@ void _PyUnicode_Init(void)
 {
    int i;
    /* Doublecheck the configuration... */
 #ifndef USE_UCS4_STORAGE
    if (sizeof(Py_UNICODE) != 2)
        Py_FatalError("Unicode configuration error: "
 		      "sizeof(Py_UNICODE) != 2 bytes");
 #endif
    /* Init the implementation */
    unicode_freelist = NULL;
    unicode_freelist_size = 0;
--- a/Python/bltinmodule.c
+++ b/Python/bltinmodule.c
@ -324,12 +324,16 @@ builtin_unichr(PyObject *self, PyObject *args)
 		s[0] = (Py_UNICODE) x;
 		return PyUnicode_FromUnicode(s, 1);
 	} else {
 #if Py_UNICODE_SIZE == 2
 		/* UCS-4 character.  store as two surrogate characters */
 		x -= 0x10000L;
 		s[0] = 0xD800 + (Py_UNICODE) (x >> 10);
 		s[1] = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
 		return PyUnicode_FromUnicode(s, 2);
 #endif
 	}
 	s[0] = (Py_UNICODE)x;
 	return PyUnicode_FromUnicode(s, 1);
 }
 static char unichr_doc[] =
--- a/acconfig.h
+++ b/acconfig.h
@ -104,6 +104,15 @@
 /* Define if the compiler provides a wchar.h header file. */
 #undef HAVE_WCHAR_H
 /* Define if you want to have a Unicode type. */
 #undef Py_USING_UNICODE
 /* Define as the integral type used for Unicode representation. */
 #undef PY_UNICODE_TYPE
 /* Define as the size of the unicode type. */
 #undef Py_UNICODE_SIZE
 /* Define if malloc(0) returns a NULL pointer */
 #undef MALLOC_ZERO_RETURNS_NULL
--- a/config.h.in
+++ b/config.h.in
@ -163,6 +163,15 @@
 /* Define if the compiler provides a wchar.h header file. */
 #undef HAVE_WCHAR_H
 /* Define if you want to have a Unicode type. */
 #undef Py_USING_UNICODE
 /* Define as the integral type used for Unicode representation. */
 #undef PY_UNICODE_TYPE
 /* Define as the size of the unicode type. */
 #undef Py_UNICODE_SIZE
 /* Define if malloc(0) returns a NULL pointer */
 #undef MALLOC_ZERO_RETURNS_NULL
@ -284,6 +293,9 @@
 /* The number of bytes in a void *.  */
 #undef SIZEOF_VOID_P
 /* The number of bytes in a wchar_t.  */
 #undef SIZEOF_WCHAR_T
 /* Define if you have the _getpty function.  */
 #undef HAVE__GETPTY
--- a/868
+++ b/868
--- a/configure.in
+++ b/configure.in
@ -372,8 +372,8 @@ fi
 OPT="$OPT -Dss_family=__ss_family -Dss_len=__ss_len"
 AC_MSG_CHECKING([whether to enable ipv6])
 AC_ARG_ENABLE(ipv6,
-[  --enable-ipv6		Enable ipv6 (with ipv4) support
+[  --enable-ipv6                   Enable ipv6 (with ipv4) support
-  --disable-ipv6		Disable ipv6 support],
+  --disable-ipv6                  Disable ipv6 support],
 [ case "$enableval" in
  no)
       AC_MSG_RESULT(no)
@ -1578,23 +1578,58 @@ AC_DEFINE(HAVE_WCHAR_H) wchar_h="yes",
 wchar_h="no"
 )
-# check for usable wchar_t
+# determine wchar_t size
-usable_wchar_t="unkown"
+if test "$wchar_h" = yes
-AC_MSG_CHECKING(for usable wchar_t)
+then
-AC_TRY_RUN([
+  AC_CHECK_SIZEOF(wchar_t)
-#include "wchar.h"
+fi
-#include "wctype.h"
+
-main() {
+AC_MSG_CHECKING(what type to use for unicode)
- wchar_t s;
+AC_ARG_ENABLE(unicode, 
- if (sizeof(s) == 2)
+[  --enable-unicode[=ucs2,ucs4]    Enable Unicode strings (default is yes)],,enable_unicode=yes)
-  exit(0);
+
- else
+if test $enable_unicode = yes
-  exit(1);
+then
-}
+  # Let Py_UNICODE size depend on wchar_t size
-], 
+  case  "$ac_cv_sizeof_wchar_t" in
-AC_DEFINE(HAVE_USABLE_WCHAR_T) usable_wchar_t="yes",
+  2) enable_unicode="ucs2";;
-usable_wchar_t="no")
+  4) enable_unicode="ucs4";;
-AC_MSG_RESULT($usable_wchar_t)
+  *) enable_unicode="ucs4";; # default to UCS-4
  esac
 fi
 case "$enable_unicode" in
 ucs2) unicode_size="2"
      AC_DEFINE(Py_UNICODE_SIZE,2)
      ;;
 ucs4) unicode_size="4"
      AC_DEFINE(Py_UNICODE_SIZE,4)
      ;;
 esac
 if test "$enable_unicode" = "no"
 then
  AC_MSG_RESULT(not used)
 else
  AC_DEFINE(Py_USING_UNICODE)
  if test "$unicode_size" = "$ac_cv_sizeof_wchar_t"
  then
    PY_UNICODE_TYPE="wchar_t"
    AC_DEFINE(HAVE_USABLE_WCHAR_T)
    AC_DEFINE(PY_UNICODE_TYPE,wchar_t)
  elif test "$ac_cv_sizeof_short" = "$unicode_size"
  then
       PY_UNICODE_TYPE="unsigned short"
       AC_DEFINE(PY_UNICODE_TYPE,unsigned short)
  elif test "$ac_cv_sizeof_long" = "$unicode_size"
  then
       PY_UNICODE_TYPE="unsigned long"
       AC_DEFINE(PY_UNICODE_TYPE,unsigned long)
  else
       PY_UNICODE_TYPE="no type found"
  fi
  AC_MSG_RESULT($PY_UNICODE_TYPE)
 fi
 # check for endianness
 AC_C_BIGENDIAN