Support using UCS-4 as the Py_UNICODE type:

Add configure option --enable-unicode.
Add config.h macros Py_USING_UNICODE, PY_UNICODE_TYPE, Py_UNICODE_SIZE,
                    SIZEOF_WCHAR_T.
Define Py_UCS2.
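For wide (4-byte) Py_UNICODE builds, decode four-byte UTF-8 sequences
(code points 0x10000..0x10FFFF) into single Py_UNICODE values and encode
such values back as four-byte sequences; likewise convert between UTF-16
surrogate pairs and single Py_UNICODE values.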
Remove the check in _PyUnicode_Init that sizeof(Py_UNICODE) is two.
This commit is contained in:
Martin v. Löwis 2001-06-26 22:22:37 +00:00
parent ff1cc902fe
commit 0ba70cc3c8
7 changed files with 667 additions and 473 deletions

View File

@ -60,16 +60,9 @@ Copyright (c) Corporation for National Research Initiatives.
/* experimental UCS-4 support. enable at your own risk! */ /* experimental UCS-4 support. enable at your own risk! */
#undef USE_UCS4_STORAGE #undef USE_UCS4_STORAGE
#if Py_UNICODE_SIZE == 4
/* #define USE_UCS4_STORAGE
* Use this typedef when you need to represent a UTF-16 surrogate pair #endif
* as single unsigned integer.
*/
#if SIZEOF_INT >= 4
typedef unsigned int Py_UCS4;
#elif SIZEOF_LONG >= 4
typedef unsigned long Py_UCS4;
#endif
/* Set these flags if the platform has "wchar.h", "wctype.h" and the /* Set these flags if the platform has "wchar.h", "wctype.h" and the
wchar_t type is a 16-bit unsigned type */ wchar_t type is a 16-bit unsigned type */
@ -77,11 +70,16 @@ typedef unsigned long Py_UCS4;
/* #define HAVE_USABLE_WCHAR_T */ /* #define HAVE_USABLE_WCHAR_T */
/* Defaults for various platforms */ /* Defaults for various platforms */
#ifndef HAVE_USABLE_WCHAR_T #ifndef PY_UNICODE_TYPE
/* Windows has a usable wchar_t type (unless we're using UCS-4) */ /* Windows has a usable wchar_t type (unless we're using UCS-4) */
# if defined(MS_WIN32) && !defined(USE_UCS4_STORAGE) # if defined(MS_WIN32) && !defined(USE_UCS4_STORAGE)
# define HAVE_USABLE_WCHAR_T # define HAVE_USABLE_WCHAR_T
# define PY_UNICODE_TYPE wchar_t
# endif
# if defined(USE_UCS4_STORAGE)
# define PY_UNICODE_TYPE Py_UCS4
# endif # endif
#endif #endif
@ -104,28 +102,23 @@ typedef unsigned long Py_UCS4;
# include "wchar.h" # include "wchar.h"
#endif #endif
#ifdef HAVE_USABLE_WCHAR_T /*
* Use this typedef when you need to represent a UTF-16 surrogate pair
/* If the compiler defines whcar_t as a 16-bit unsigned type we can * as single unsigned integer.
use the compiler type directly. Works fine with all modern Windows */
platforms. */ #if SIZEOF_INT >= 4
typedef unsigned int Py_UCS4;
typedef wchar_t Py_UNICODE; #elif SIZEOF_LONG >= 4
typedef unsigned long Py_UCS4;
#else
/* Use if you have a standard ANSI compiler, without wchar_t support.
If a short is not 16 bits on your platform, you have to fix the
typedef below, or the module initialization code will complain. */
#ifdef USE_UCS4_STORAGE
typedef Py_UCS4 Py_UNICODE;
#else
typedef unsigned short Py_UNICODE;
#endif #endif
#endif #if SIZEOF_SHORT == 2
typedef unsigned short Py_UCS2;
#else
#error Cannot find a two-byte type
#endif
typedef PY_UNICODE_TYPE Py_UNICODE;
/* --- Internal Unicode Operations ---------------------------------------- */ /* --- Internal Unicode Operations ---------------------------------------- */

View File

@ -771,13 +771,17 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
((s[2] & 0x3f) << 6) + (s[3] & 0x3f); ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
/* validate and convert to UTF-16 */ /* validate and convert to UTF-16 */
if ((ch < 0x10000) || /* minimum value allowed for 4 if ((ch < 0x10000) /* minimum value allowed for 4
byte encoding */ byte encoding */
(ch > 0x10ffff)) { /* maximum value allowed for || (ch > 0x10ffff)) /* maximum value allowed for
UTF-16 */ UTF-16 */
{
errmsg = "illegal encoding"; errmsg = "illegal encoding";
goto utf8Error; goto utf8Error;
} }
#if Py_UNICODE_SIZE == 4
*p++ = (Py_UNICODE)ch;
#else
/* compute and append the two surrogates: */ /* compute and append the two surrogates: */
/* translate from 10000..10FFFF to 0..FFFF */ /* translate from 10000..10FFFF to 0..FFFF */
@ -788,6 +792,7 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
/* low surrogate = bottom 10 bits added to DC00 */ /* low surrogate = bottom 10 bits added to DC00 */
*p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
break; break;
default: default:
@ -878,7 +883,13 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
*p++ = 0x80 | (ch & 0x3f); *p++ = 0x80 | (ch & 0x3f);
cbWritten += 2; cbWritten += 2;
} }
else { else if (ch < 0x10000) {
#if Py_UNICODE_SIZE == 4
*p++ = 0xe0 | (ch>>12);
*p++ = 0x80 | ((ch>>6) & 0x3f);
*p++ = 0x80 | (ch & 0x3f);
cbWritten += 3;
#else
/* Check for high surrogate */ /* Check for high surrogate */
if (0xD800 <= ch && ch <= 0xDBFF) { if (0xD800 <= ch && ch <= 0xDBFF) {
if (i != size) { if (i != size) {
@ -909,7 +920,14 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
} }
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
*p++ = (char)(0x80 | (ch & 0x3f)); *p++ = (char)(0x80 | (ch & 0x3f));
} #endif
} else {
*p++ = 0xf0 | (ch>>18);
*p++ = 0x80 | ((ch>>12) & 0x3f);
*p++ = 0x80 | ((ch>>6) & 0x3f);
*p++ = 0x80 | (ch & 0x3f);
cbWritten += 4;
}
} }
*p = '\0'; *p = '\0';
if (_PyString_Resize(&v, p - q)) if (_PyString_Resize(&v, p - q))
@ -935,7 +953,7 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
/* --- UTF-16 Codec ------------------------------------------------------- */ /* --- UTF-16 Codec ------------------------------------------------------- */
static static
int utf16_decoding_error(const Py_UNICODE **source, int utf16_decoding_error(const Py_UCS2 **source,
Py_UNICODE **dest, Py_UNICODE **dest,
const char *errors, const char *errors,
const char *details) const char *details)
@ -973,12 +991,12 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
{ {
PyUnicodeObject *unicode; PyUnicodeObject *unicode;
Py_UNICODE *p; Py_UNICODE *p;
const Py_UNICODE *q, *e; const Py_UCS2 *q, *e;
int bo = 0; int bo = 0;
const char *errmsg = ""; const char *errmsg = "";
/* size should be an even number */ /* size should be an even number */
if (size % sizeof(Py_UNICODE) != 0) { if (size % sizeof(Py_UCS2) != 0) {
if (utf16_decoding_error(NULL, NULL, errors, "truncated data")) if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
return NULL; return NULL;
/* The remaining input chars are ignored if we fall through /* The remaining input chars are ignored if we fall through
@ -995,8 +1013,8 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
/* Unpack UTF-16 encoded data */ /* Unpack UTF-16 encoded data */
p = unicode->str; p = unicode->str;
q = (Py_UNICODE *)s; q = (Py_UCS2 *)s;
e = q + (size / sizeof(Py_UNICODE)); e = q + (size / sizeof(Py_UCS2));
if (byteorder) if (byteorder)
bo = *byteorder; bo = *byteorder;
@ -1026,7 +1044,7 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
} }
while (q < e) { while (q < e) {
register Py_UNICODE ch = *q++; register Py_UCS2 ch = *q++;
/* Swap input bytes if needed. (This assumes /* Swap input bytes if needed. (This assumes
sizeof(Py_UNICODE) == 2 !) */ sizeof(Py_UNICODE) == 2 !) */
@ -1048,17 +1066,33 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
goto utf16Error; goto utf16Error;
} }
if (0xDC00 <= *q && *q <= 0xDFFF) { if (0xDC00 <= *q && *q <= 0xDFFF) {
q++; Py_UCS2 ch2 = *q++;
if (0xD800 <= *q && *q <= 0xDBFF) { #ifdef BYTEORDER_IS_LITTLE_ENDIAN
if (bo == 1)
ch = (ch >> 8) | (ch << 8);
#else
if (bo == -1)
ch = (ch >> 8) | (ch << 8);
#endif
if (0xD800 <= ch && ch <= 0xDBFF) {
#if Py_UNICODE_SIZE == 2
/* This is valid data (a UTF-16 surrogate pair), but /* This is valid data (a UTF-16 surrogate pair), but
we are not able to store this information since our we are not able to store this information since our
Py_UNICODE type only has 16 bits... this might Py_UNICODE type only has 16 bits... this might
change someday, even though it's unlikely. */ change someday, even though it's unlikely. */
errmsg = "code pairs are not supported"; errmsg = "code pairs are not supported";
goto utf16Error; goto utf16Error;
} #else
else *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
continue; continue;
#endif
}
else {
errmsg = "illegal UTF-16 surrogate";
goto utf16Error;
}
} }
errmsg = "illegal encoding"; errmsg = "illegal encoding";
/* Fall through to report the error */ /* Fall through to report the error */
@ -1090,17 +1124,20 @@ PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
int byteorder) int byteorder)
{ {
PyObject *v; PyObject *v;
Py_UNICODE *p; Py_UCS2 *p;
char *q; char *q;
int i, pairs, doswap = 1;
/* We don't create UTF-16 pairs... */ for (i = pairs = 0; i < size; i++)
if (s[i] >= 0x10000)
pairs++;
v = PyString_FromStringAndSize(NULL, v = PyString_FromStringAndSize(NULL,
sizeof(Py_UNICODE) * (size + (byteorder == 0))); sizeof(Py_UCS2) * (size + pairs + (byteorder == 0)));
if (v == NULL) if (v == NULL)
return NULL; return NULL;
q = PyString_AS_STRING(v); q = PyString_AS_STRING(v);
p = (Py_UNICODE *)q; p = (Py_UCS2 *)q;
if (byteorder == 0) if (byteorder == 0)
*p++ = 0xFEFF; *p++ = 0xFEFF;
if (size == 0) if (size == 0)
@ -1112,12 +1149,24 @@ PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
byteorder == 1 byteorder == 1
#endif #endif
) )
Py_UNICODE_COPY(p, s, size); doswap = 0;
else while (size-- > 0) {
while (size-- > 0) { Py_UNICODE ch = *s++;
Py_UNICODE ch = *s++; Py_UNICODE ch2 = 0;
*p++ = (ch >> 8) | (ch << 8); if (ch >= 0x10000) {
ch2 = 0xDC00|((ch-0x10000) & 0x3FF);
ch = 0xD800|((ch-0x10000)>>10);
} }
if (doswap){
*p++ = (ch >> 8) | (ch << 8);
if (ch2)
*p++ = (ch2 >> 8) | (ch2 << 8);
}else{
*p++ = ch;
if(ch2)
*p++ = ch2;
}
}
return v; return v;
} }
@ -1271,10 +1320,14 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
/* UCS-2 character */ /* UCS-2 character */
*p++ = (Py_UNICODE) chr; *p++ = (Py_UNICODE) chr;
else if (chr <= 0x10ffff) { else if (chr <= 0x10ffff) {
/* UCS-4 character. store as two surrogate characters */ /* UCS-4 character. Either store directly, or as surrogate pair. */
#if Py_UNICODE_SIZE == 4
*p++ = chr;
#else
chr -= 0x10000L; chr -= 0x10000L;
*p++ = 0xD800 + (Py_UNICODE) (chr >> 10); *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
*p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
#endif
} else { } else {
if (unicodeescape_decoding_error( if (unicodeescape_decoding_error(
&s, &x, errors, &s, &x, errors,
@ -1383,6 +1436,19 @@ PyObject *unicodeescape_string(const Py_UNICODE *s,
*p++ = '\\'; *p++ = '\\';
*p++ = (char) ch; *p++ = (char) ch;
} }
/* Map 21-bit characters to '\U00xxxxxx' */
else if (ch >= 0x10000) {
*p++ = '\\';
*p++ = 'U';
*p++ = hexdigit[(ch >> 28) & 0xf];
*p++ = hexdigit[(ch >> 24) & 0xf];
*p++ = hexdigit[(ch >> 20) & 0xf];
*p++ = hexdigit[(ch >> 16) & 0xf];
*p++ = hexdigit[(ch >> 12) & 0xf];
*p++ = hexdigit[(ch >> 8) & 0xf];
*p++ = hexdigit[(ch >> 4) & 0xf];
*p++ = hexdigit[ch & 15];
}
/* Map 16-bit characters to '\uxxxx' */ /* Map 16-bit characters to '\uxxxx' */
else if (ch >= 256) { else if (ch >= 256) {
*p++ = '\\'; *p++ = '\\';
@ -5281,13 +5347,6 @@ void _PyUnicode_Init(void)
{ {
int i; int i;
/* Doublecheck the configuration... */
#ifndef USE_UCS4_STORAGE
if (sizeof(Py_UNICODE) != 2)
Py_FatalError("Unicode configuration error: "
"sizeof(Py_UNICODE) != 2 bytes");
#endif
/* Init the implementation */ /* Init the implementation */
unicode_freelist = NULL; unicode_freelist = NULL;
unicode_freelist_size = 0; unicode_freelist_size = 0;

View File

@ -324,12 +324,16 @@ builtin_unichr(PyObject *self, PyObject *args)
s[0] = (Py_UNICODE) x; s[0] = (Py_UNICODE) x;
return PyUnicode_FromUnicode(s, 1); return PyUnicode_FromUnicode(s, 1);
} else { } else {
#if Py_UNICODE_SIZE == 2
/* UCS-4 character. store as two surrogate characters */ /* UCS-4 character. store as two surrogate characters */
x -= 0x10000L; x -= 0x10000L;
s[0] = 0xD800 + (Py_UNICODE) (x >> 10); s[0] = 0xD800 + (Py_UNICODE) (x >> 10);
s[1] = 0xDC00 + (Py_UNICODE) (x & 0x03FF); s[1] = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
return PyUnicode_FromUnicode(s, 2); return PyUnicode_FromUnicode(s, 2);
#endif
} }
s[0] = (Py_UNICODE)x;
return PyUnicode_FromUnicode(s, 1);
} }
static char unichr_doc[] = static char unichr_doc[] =

View File

@ -104,6 +104,15 @@
/* Define if the compiler provides a wchar.h header file. */ /* Define if the compiler provides a wchar.h header file. */
#undef HAVE_WCHAR_H #undef HAVE_WCHAR_H
/* Define if you want to have a Unicode type. */
#undef Py_USING_UNICODE
/* Define as the integral type used for Unicode representation. */
#undef PY_UNICODE_TYPE
/* Define as the size of the unicode type. */
#undef Py_UNICODE_SIZE
/* Define if malloc(0) returns a NULL pointer */ /* Define if malloc(0) returns a NULL pointer */
#undef MALLOC_ZERO_RETURNS_NULL #undef MALLOC_ZERO_RETURNS_NULL

View File

@ -163,6 +163,15 @@
/* Define if the compiler provides a wchar.h header file. */ /* Define if the compiler provides a wchar.h header file. */
#undef HAVE_WCHAR_H #undef HAVE_WCHAR_H
/* Define if you want to have a Unicode type. */
#undef Py_USING_UNICODE
/* Define as the integral type used for Unicode representation. */
#undef PY_UNICODE_TYPE
/* Define as the size of the unicode type. */
#undef Py_UNICODE_SIZE
/* Define if malloc(0) returns a NULL pointer */ /* Define if malloc(0) returns a NULL pointer */
#undef MALLOC_ZERO_RETURNS_NULL #undef MALLOC_ZERO_RETURNS_NULL
@ -284,6 +293,9 @@
/* The number of bytes in a void *. */ /* The number of bytes in a void *. */
#undef SIZEOF_VOID_P #undef SIZEOF_VOID_P
/* The number of bytes in a wchar_t. */
#undef SIZEOF_WCHAR_T
/* Define if you have the _getpty function. */ /* Define if you have the _getpty function. */
#undef HAVE__GETPTY #undef HAVE__GETPTY

868
configure vendored

File diff suppressed because it is too large Load Diff

View File

@ -372,8 +372,8 @@ fi
OPT="$OPT -Dss_family=__ss_family -Dss_len=__ss_len" OPT="$OPT -Dss_family=__ss_family -Dss_len=__ss_len"
AC_MSG_CHECKING([whether to enable ipv6]) AC_MSG_CHECKING([whether to enable ipv6])
AC_ARG_ENABLE(ipv6, AC_ARG_ENABLE(ipv6,
[ --enable-ipv6 Enable ipv6 (with ipv4) support [ --enable-ipv6 Enable ipv6 (with ipv4) support
--disable-ipv6 Disable ipv6 support], --disable-ipv6 Disable ipv6 support],
[ case "$enableval" in [ case "$enableval" in
no) no)
AC_MSG_RESULT(no) AC_MSG_RESULT(no)
@ -1578,23 +1578,58 @@ AC_DEFINE(HAVE_WCHAR_H) wchar_h="yes",
wchar_h="no" wchar_h="no"
) )
# check for usable wchar_t # determine wchar_t size
usable_wchar_t="unkown" if test "$wchar_h" = yes
AC_MSG_CHECKING(for usable wchar_t) then
AC_TRY_RUN([ AC_CHECK_SIZEOF(wchar_t)
#include "wchar.h" fi
#include "wctype.h"
main() { AC_MSG_CHECKING(what type to use for unicode)
wchar_t s; AC_ARG_ENABLE(unicode,
if (sizeof(s) == 2) [ --enable-unicode[=ucs2,ucs4] Enable Unicode strings (default is yes)],,enable_unicode=yes)
exit(0);
else if test $enable_unicode = yes
exit(1); then
} # Let Py_UNICODE size depend on wchar_t size
], case "$ac_cv_sizeof_wchar_t" in
AC_DEFINE(HAVE_USABLE_WCHAR_T) usable_wchar_t="yes", 2) enable_unicode="ucs2";;
usable_wchar_t="no") 4) enable_unicode="ucs4";;
AC_MSG_RESULT($usable_wchar_t) *) enable_unicode="ucs4";; # default to UCS-4
esac
fi
case "$enable_unicode" in
ucs2) unicode_size="2"
AC_DEFINE(Py_UNICODE_SIZE,2)
;;
ucs4) unicode_size="4"
AC_DEFINE(Py_UNICODE_SIZE,4)
;;
esac
if test "$enable_unicode" = "no"
then
AC_MSG_RESULT(not used)
else
AC_DEFINE(Py_USING_UNICODE)
if test "$unicode_size" = "$ac_cv_sizeof_wchar_t"
then
PY_UNICODE_TYPE="wchar_t"
AC_DEFINE(HAVE_USABLE_WCHAR_T)
AC_DEFINE(PY_UNICODE_TYPE,wchar_t)
elif test "$ac_cv_sizeof_short" = "$unicode_size"
then
PY_UNICODE_TYPE="unsigned short"
AC_DEFINE(PY_UNICODE_TYPE,unsigned short)
elif test "$ac_cv_sizeof_long" = "$unicode_size"
then
PY_UNICODE_TYPE="unsigned long"
AC_DEFINE(PY_UNICODE_TYPE,unsigned long)
else
PY_UNICODE_TYPE="no type found"
fi
AC_MSG_RESULT($PY_UNICODE_TYPE)
fi
# check for endianness # check for endianness
AC_C_BIGENDIAN AC_C_BIGENDIAN