diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 7ae6506ed16..cd2f165eb95 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -221,24 +221,6 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString # define _PyUnicode_Fini _PyUnicodeUCS2_Fini # define _PyUnicode_Init _PyUnicodeUCS2_Init -# define _PyUnicode_IsAlpha _PyUnicodeUCS2_IsAlpha -# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS2_IsDecimalDigit -# define _PyUnicode_IsDigit _PyUnicodeUCS2_IsDigit -# define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak -# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase -# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric -# define _PyUnicode_IsPrintable _PyUnicodeUCS2_IsPrintable -# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase -# define _PyUnicode_IsXidStart _PyUnicodeUCS2_IsXidStart -# define _PyUnicode_IsXidContinue _PyUnicodeUCS2_IsXidContinue -# define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase -# define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace -# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit -# define _PyUnicode_ToDigit _PyUnicodeUCS2_ToDigit -# define _PyUnicode_ToLowercase _PyUnicodeUCS2_ToLowercase -# define _PyUnicode_ToNumeric _PyUnicodeUCS2_ToNumeric -# define _PyUnicode_ToTitlecase _PyUnicodeUCS2_ToTitlecase -# define _PyUnicode_ToUppercase _PyUnicodeUCS2_ToUppercase #else @@ -322,24 +304,6 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString # define _PyUnicode_Fini _PyUnicodeUCS4_Fini # define _PyUnicode_Init _PyUnicodeUCS4_Init -# define _PyUnicode_IsAlpha _PyUnicodeUCS4_IsAlpha -# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS4_IsDecimalDigit -# define _PyUnicode_IsDigit _PyUnicodeUCS4_IsDigit -# define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak -# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase -# define 
_PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric -# define _PyUnicode_IsPrintable _PyUnicodeUCS4_IsPrintable -# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase -# define _PyUnicode_IsXidStart _PyUnicodeUCS4_IsXidStart -# define _PyUnicode_IsXidContinue _PyUnicodeUCS4_IsXidContinue -# define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase -# define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace -# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit -# define _PyUnicode_ToDigit _PyUnicodeUCS4_ToDigit -# define _PyUnicode_ToLowercase _PyUnicodeUCS4_ToLowercase -# define _PyUnicode_ToNumeric _PyUnicodeUCS4_ToNumeric -# define _PyUnicode_ToTitlecase _PyUnicodeUCS4_ToTitlecase -# define _PyUnicode_ToUppercase _PyUnicodeUCS4_ToUppercase #endif @@ -351,7 +315,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; configure Python using --with-wctype-functions. This reduces the interpreter's code size. */ -#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS) +#if defined(Py_UNICODE_WIDE) && defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS) #include <wctype.h> @@ -1542,75 +1506,75 @@ PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; */ PyAPI_FUNC(int) _PyUnicode_IsLowercase( - Py_UNICODE ch /* Unicode character */ + Py_UCS4 ch /* Unicode character */ ); PyAPI_FUNC(int) _PyUnicode_IsUppercase( - Py_UNICODE ch /* Unicode character */ + Py_UCS4 ch /* Unicode character */ ); PyAPI_FUNC(int) _PyUnicode_IsTitlecase( - Py_UNICODE ch /* Unicode character */ + Py_UCS4 ch /* Unicode character */ ); PyAPI_FUNC(int) _PyUnicode_IsXidStart( - Py_UNICODE ch /* Unicode character */ + Py_UCS4 ch /* Unicode character */ ); PyAPI_FUNC(int) _PyUnicode_IsXidContinue( - Py_UNICODE ch /* Unicode character */ + Py_UCS4 ch /* Unicode character */ ); PyAPI_FUNC(int) _PyUnicode_IsWhitespace( - const Py_UNICODE ch /* Unicode character */ + const Py_UCS4 ch /* Unicode character */ ); PyAPI_FUNC(int) _PyUnicode_IsLinebreak( - const Py_UNICODE ch /* Unicode character */ +
const Py_UCS4 ch /* Unicode character */ ); -PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToLowercase( - Py_UNICODE ch /* Unicode character */ +PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase( + Py_UCS4 ch /* Unicode character */ ); -PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToUppercase( - Py_UNICODE ch /* Unicode character */ +PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase( + Py_UCS4 ch /* Unicode character */ ); -PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToTitlecase( - Py_UNICODE ch /* Unicode character */ +PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase( + Py_UCS4 ch /* Unicode character */ ); PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit( - Py_UNICODE ch /* Unicode character */ + Py_UCS4 ch /* Unicode character */ ); PyAPI_FUNC(int) _PyUnicode_ToDigit( - Py_UNICODE ch /* Unicode character */ + Py_UCS4 ch /* Unicode character */ ); PyAPI_FUNC(double) _PyUnicode_ToNumeric( - Py_UNICODE ch /* Unicode character */ + Py_UCS4 ch /* Unicode character */ ); PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit( - Py_UNICODE ch /* Unicode character */ + Py_UCS4 ch /* Unicode character */ ); PyAPI_FUNC(int) _PyUnicode_IsDigit( - Py_UNICODE ch /* Unicode character */ + Py_UCS4 ch /* Unicode character */ ); PyAPI_FUNC(int) _PyUnicode_IsNumeric( - Py_UNICODE ch /* Unicode character */ + Py_UCS4 ch /* Unicode character */ ); PyAPI_FUNC(int) _PyUnicode_IsPrintable( - Py_UNICODE ch /* Unicode character */ + Py_UCS4 ch /* Unicode character */ ); PyAPI_FUNC(int) _PyUnicode_IsAlpha( - Py_UNICODE ch /* Unicode character */ + Py_UCS4 ch /* Unicode character */ ); PyAPI_FUNC(size_t) Py_UNICODE_strlen( diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index d7d30d2def3..ae5f53db697 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -1353,6 +1353,10 @@ class UnicodeTest(string_tests.CommonTest, self.assertEqual(repr(s1()), '\\n') self.assertEqual(repr(s2()), '\\n') + def test_printable_repr(self): + self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable + 
self.assertEqual(repr('\U00011000'), "'\\U00011000'") # nonprintable + def test_expandtabs_overflows_gracefully(self): # This test only affects 32-bit platforms because expandtabs can only take # an int as the max value, not a 64-bit C long. If expandtabs is changed diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index b5722616a00..bfbb0aa2d64 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -294,6 +294,12 @@ class UnicodeMiscTest(UnicodeDatabaseTest): self.assertEqual(len(lines), 1, r"\u%.4x should not be a linebreak" % i) + def test_UCS4(self): + # unicodedata should work with code points outside the BMP + # even on a narrow Unicode build + self.assertEqual(self.db.category("\U0001012A"), "No") + self.assertEqual(self.db.numeric("\U0001012A"), 9000) + def test_main(): test.support.run_unittest( UnicodeMiscTest, diff --git a/Misc/NEWS b/Misc/NEWS index 59ad9d34a39..fe2080fd5b0 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -12,6 +12,12 @@ What's New in Python 3.2 Alpha 2? Core and Builtins ----------------- +- Issue #5127: The C functions that access the Unicode Database now accept and + return characters from the full Unicode range, even on narrow unicode builds + (Py_UNICODE_TOLOWER, Py_UNICODE_ISDECIMAL, and others). A visible difference + in Python is that unicodedata.numeric() now returns the correct value for + large code points, and repr() may consider more characters as printable. + - Issue #9425: Create PyModule_GetFilenameObject() function to get the filename as a unicode object, instead of a byte string. Function needed to support unencodable filenames.
Deprecate PyModule_GetFilename() in favor on the new diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index db4f513d05a..f6e32508245 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -26,9 +26,9 @@ #define NUMERIC_MASK 0x1000 typedef struct { - const Py_UNICODE upper; - const Py_UNICODE lower; - const Py_UNICODE title; + const Py_UCS4 upper; + const Py_UCS4 lower; + const Py_UCS4 title; const unsigned char decimal; const unsigned char digit; const unsigned short flags; @@ -37,15 +37,13 @@ typedef struct { #include "unicodetype_db.h" static const _PyUnicode_TypeRecord * -gettyperecord(Py_UNICODE code) +gettyperecord(Py_UCS4 code) { int index; -#ifdef Py_UNICODE_WIDE if (code >= 0x110000) index = 0; else -#endif { index = index1[(code>>SHIFT)]; index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))]; @@ -57,7 +55,7 @@ gettyperecord(Py_UNICODE code) /* Returns the titlecase Unicode characters corresponding to ch or just ch if no titlecase mapping is known. */ -Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch) +Py_UCS4 _PyUnicode_ToTitlecase(register Py_UCS4 ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); int delta = ctype->title; @@ -74,7 +72,7 @@ Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch) /* Returns 1 for Unicode characters having the category 'Lt', 0 otherwise. */ -int _PyUnicode_IsTitlecase(Py_UNICODE ch) +int _PyUnicode_IsTitlecase(Py_UCS4 ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -84,7 +82,7 @@ int _PyUnicode_IsTitlecase(Py_UNICODE ch) /* Returns 1 for Unicode characters having the XID_Start property, 0 otherwise. */ -int _PyUnicode_IsXidStart(Py_UNICODE ch) +int _PyUnicode_IsXidStart(Py_UCS4 ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -94,7 +92,7 @@ int _PyUnicode_IsXidStart(Py_UNICODE ch) /* Returns 1 for Unicode characters having the XID_Continue property, 0 otherwise. */ -int _PyUnicode_IsXidContinue(Py_UNICODE ch) +int _PyUnicode_IsXidContinue(Py_UCS4 ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -104,14 +102,14 @@ int _PyUnicode_IsXidContinue(Py_UNICODE ch) /* Returns the integer decimal (0-9) for Unicode characters having this property, -1 otherwise.
*/ -int _PyUnicode_ToDecimalDigit(Py_UNICODE ch) +int _PyUnicode_ToDecimalDigit(Py_UCS4 ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1; } -int _PyUnicode_IsDecimalDigit(Py_UNICODE ch) +int _PyUnicode_IsDecimalDigit(Py_UCS4 ch) { if (_PyUnicode_ToDecimalDigit(ch) < 0) return 0; @@ -121,14 +119,14 @@ int _PyUnicode_IsDecimalDigit(Py_UNICODE ch) /* Returns the integer digit (0-9) for Unicode characters having this property, -1 otherwise. */ -int _PyUnicode_ToDigit(Py_UNICODE ch) +int _PyUnicode_ToDigit(Py_UCS4 ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1; } -int _PyUnicode_IsDigit(Py_UNICODE ch) +int _PyUnicode_IsDigit(Py_UCS4 ch) { if (_PyUnicode_ToDigit(ch) < 0) return 0; @@ -138,7 +136,7 @@ int _PyUnicode_IsDigit(Py_UNICODE ch) /* Returns the numeric value as double for Unicode characters having this property, -1.0 otherwise. */ -int _PyUnicode_IsNumeric(Py_UNICODE ch) +int _PyUnicode_IsNumeric(Py_UCS4 ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -158,7 +156,7 @@ int _PyUnicode_IsNumeric(Py_UNICODE ch) * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR) * Zs (Separator, Space) other than ASCII space('\x20'). */ -int _PyUnicode_IsPrintable(Py_UNICODE ch) +int _PyUnicode_IsPrintable(Py_UCS4 ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -170,7 +168,7 @@ int _PyUnicode_IsPrintable(Py_UNICODE ch) /* Returns 1 for Unicode characters having the category 'Ll', 0 otherwise. */ -int _PyUnicode_IsLowercase(Py_UNICODE ch) +int _PyUnicode_IsLowercase(Py_UCS4 ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -180,7 +178,7 @@ int _PyUnicode_IsLowercase(Py_UNICODE ch) /* Returns 1 for Unicode characters having the category 'Lu', 0 otherwise. 
*/ -int _PyUnicode_IsUppercase(Py_UNICODE ch) +int _PyUnicode_IsUppercase(Py_UCS4 ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -190,7 +188,7 @@ int _PyUnicode_IsUppercase(Py_UNICODE ch) /* Returns the uppercase Unicode characters corresponding to ch or just ch if no uppercase mapping is known. */ -Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch) +Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); int delta = ctype->upper; @@ -204,7 +202,7 @@ Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch) /* Returns the lowercase Unicode characters corresponding to ch or just ch if no lowercase mapping is known. */ -Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch) +Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); int delta = ctype->lower; @@ -218,7 +216,7 @@ Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch) /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt', 'Lo' or 'Lm', 0 otherwise. 
*/ -int _PyUnicode_IsAlpha(Py_UNICODE ch) +int _PyUnicode_IsAlpha(Py_UCS4 ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -230,27 +228,27 @@ int _PyUnicode_IsAlpha(Py_UNICODE ch) /* Export the interfaces using the wchar_t type for portability reasons: */ -int _PyUnicode_IsLowercase(Py_UNICODE ch) +int _PyUnicode_IsLowercase(Py_UCS4 ch) { return iswlower(ch); } -int _PyUnicode_IsUppercase(Py_UNICODE ch) +int _PyUnicode_IsUppercase(Py_UCS4 ch) { return iswupper(ch); } -Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch) +Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) { return towlower(ch); } -Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch) +Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch) { return towupper(ch); } -int _PyUnicode_IsAlpha(Py_UNICODE ch) +int _PyUnicode_IsAlpha(Py_UCS4 ch) { return iswalpha(ch); } diff --git a/Objects/unicodetype_db.h b/Objects/unicodetype_db.h index 424a3172193..637f6298cd5 100644 --- a/Objects/unicodetype_db.h +++ b/Objects/unicodetype_db.h @@ -1980,7 +1980,7 @@ static unsigned char index2[] = { /* Returns the numeric value as double for Unicode characters * having this property, -1.0 otherwise. 
*/ -double _PyUnicode_ToNumeric(Py_UNICODE ch) +double _PyUnicode_ToNumeric(Py_UCS4 ch) { switch (ch) { case 0x0F33: @@ -2031,7 +2031,6 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0xABF0: case 0xF9B2: case 0xFF10: -#ifdef Py_UNICODE_WIDE case 0x1018A: case 0x104A0: case 0x1D7CE: @@ -2041,7 +2040,6 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0x1D7F6: case 0x1F100: case 0x1F101: -#endif return (double) 0.0; case 0x0031: case 0x00B9: @@ -2105,7 +2103,6 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0xAA51: case 0xABF1: case 0xFF11: -#ifdef Py_UNICODE_WIDE case 0x10107: case 0x10142: case 0x10158: @@ -2135,7 +2132,6 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0x1D7F7: case 0x1F102: case 0x2092A: -#endif return (double) 1.0; case 0x2152: return (double) 1.0/10.0; @@ -2147,46 +2143,36 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0x0F2A: case 0x2CFD: case 0xA831: -#ifdef Py_UNICODE_WIDE case 0x10141: case 0x10175: case 0x10176: case 0x10E7B: -#endif return (double) 1.0/2.0; case 0x2153: -#ifdef Py_UNICODE_WIDE case 0x10E7D: case 0x1245A: case 0x1245D: -#endif return (double) 1.0/3.0; case 0x00BC: case 0x09F7: case 0x0D73: case 0xA830: -#ifdef Py_UNICODE_WIDE case 0x10140: case 0x10E7C: case 0x12460: case 0x12462: -#endif return (double) 1.0/4.0; case 0x2155: return (double) 1.0/5.0; case 0x2159: -#ifdef Py_UNICODE_WIDE case 0x12461: -#endif return (double) 1.0/6.0; case 0x2150: return (double) 1.0/7.0; case 0x09F5: case 0x215B: case 0xA834: -#ifdef Py_UNICODE_WIDE case 0x1245F: -#endif return (double) 1.0/8.0; case 0x2151: return (double) 1.0/9.0; @@ -2210,7 +2196,6 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0x62FE: case 0xF973: case 0xF9FD: -#ifdef Py_UNICODE_WIDE case 0x10110: case 0x10149: case 0x10150: @@ -2229,7 +2214,6 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0x10B7C: case 0x10E69: case 0x1D369: -#endif return (double) 10.0; case 0x0BF1: case 0x0D71: @@ -2239,7 +2223,6 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) 
case 0x4F70: case 0x767E: case 0x964C: -#ifdef Py_UNICODE_WIDE case 0x10119: case 0x1014B: case 0x10152: @@ -2251,7 +2234,6 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0x10B5E: case 0x10B7E: case 0x10E72: -#endif return (double) 100.0; case 0x0BF2: case 0x0D72: @@ -2261,7 +2243,6 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0x4EDF: case 0x5343: case 0x9621: -#ifdef Py_UNICODE_WIDE case 0x10122: case 0x1014D: case 0x10154: @@ -2270,17 +2251,14 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0x10A47: case 0x10B5F: case 0x10B7F: -#endif return (double) 1000.0; case 0x137C: case 0x2182: case 0x4E07: case 0x842C: -#ifdef Py_UNICODE_WIDE case 0x1012B: case 0x10155: case 0x1085F: -#endif return (double) 10000.0; case 0x2188: return (double) 100000.0; @@ -2414,7 +2392,6 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0xABF2: case 0xF978: case 0xFF12: -#ifdef Py_UNICODE_WIDE case 0x10108: case 0x1015B: case 0x1015C: @@ -2445,15 +2422,12 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0x1D7F8: case 0x1F103: case 0x22390: -#endif return (double) 2.0; case 0x2154: -#ifdef Py_UNICODE_WIDE case 0x10177: case 0x10E7E: case 0x1245B: case 0x1245E: -#endif return (double) 2.0/3.0; case 0x2156: return (double) 2.0/5.0; @@ -2465,7 +2439,6 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0x3039: case 0x5344: case 0x5EFF: -#ifdef Py_UNICODE_WIDE case 0x10111: case 0x103D4: case 0x1085C: @@ -2475,21 +2448,14 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0x10B7D: case 0x10E6A: case 0x1D36A: -#endif return (double) 20.0; -#ifdef Py_UNICODE_WIDE case 0x1011A: case 0x10E73: return (double) 200.0; -#endif -#ifdef Py_UNICODE_WIDE case 0x10123: return (double) 2000.0; -#endif -#ifdef Py_UNICODE_WIDE case 0x1012C: return (double) 20000.0; -#endif case 0x3251: return (double) 21.0; case 0x3252: @@ -2571,7 +2537,6 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0xABF3: case 0xF96B: case 0xFF13: -#ifdef Py_UNICODE_WIDE case 0x10109: case 0x104A3: case 0x1085A: @@ 
-2605,7 +2570,6 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0x20B19: case 0x22998: case 0x23B1B: -#endif return (double) 3.0; case 0x09F6: case 0xA835: @@ -2616,9 +2580,7 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0x09F8: case 0x0D75: case 0xA832: -#ifdef Py_UNICODE_WIDE case 0x10178: -#endif return (double) 3.0/4.0; case 0x2157: return (double) 3.0/5.0; @@ -2628,28 +2590,20 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0x303A: case 0x325A: case 0x5345: -#ifdef Py_UNICODE_WIDE case 0x10112: case 0x10165: case 0x10E6B: case 0x1D36B: case 0x20983: -#endif return (double) 30.0; -#ifdef Py_UNICODE_WIDE case 0x1011B: case 0x1016B: case 0x10E74: return (double) 300.0; -#endif -#ifdef Py_UNICODE_WIDE case 0x10124: return (double) 3000.0; -#endif -#ifdef Py_UNICODE_WIDE case 0x1012D: return (double) 30000.0; -#endif case 0x325B: return (double) 31.0; case 0x325C: @@ -2724,7 +2678,6 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0xAA54: case 0xABF4: case 0xFF14: -#ifdef Py_UNICODE_WIDE case 0x1010A: case 0x104A4: case 0x10A43: @@ -2756,34 +2709,25 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0x20064: case 0x200E2: case 0x2626D: -#endif return (double) 4.0; case 0x2158: return (double) 4.0/5.0; case 0x1375: case 0x32B5: case 0x534C: -#ifdef Py_UNICODE_WIDE case 0x10113: case 0x10E6C: case 0x1D36C: case 0x2098C: case 0x2099C: -#endif return (double) 40.0; -#ifdef Py_UNICODE_WIDE case 0x1011C: case 0x10E75: return (double) 400.0; -#endif -#ifdef Py_UNICODE_WIDE case 0x10125: return (double) 4000.0; -#endif -#ifdef Py_UNICODE_WIDE case 0x1012E: return (double) 40000.0; -#endif case 0x32B6: return (double) 41.0; case 0x32B7: @@ -2858,7 +2802,6 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0xAA55: case 0xABF5: case 0xFF15: -#ifdef Py_UNICODE_WIDE case 0x1010B: case 0x10143: case 0x10148: @@ -2887,14 +2830,11 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0x1D7FB: case 0x1F106: case 0x20121: -#endif return (double) 5.0; case 0x0F2C: return 
(double) 5.0/2.0; case 0x215A: -#ifdef Py_UNICODE_WIDE case 0x1245C: -#endif return (double) 5.0/6.0; case 0x215D: return (double) 5.0/8.0; @@ -2903,7 +2843,6 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0x217C: case 0x2186: case 0x32BF: -#ifdef Py_UNICODE_WIDE case 0x10114: case 0x10144: case 0x1014A: @@ -2917,11 +2856,9 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0x10A7E: case 0x10E6D: case 0x1D36D: -#endif return (double) 50.0; case 0x216E: case 0x217E: -#ifdef Py_UNICODE_WIDE case 0x1011D: case 0x10145: case 0x1014C: @@ -2932,22 +2869,17 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0x1016F: case 0x10170: case 0x10E76: -#endif return (double) 500.0; case 0x2181: -#ifdef Py_UNICODE_WIDE case 0x10126: case 0x10146: case 0x1014E: case 0x10172: -#endif return (double) 5000.0; case 0x2187: -#ifdef Py_UNICODE_WIDE case 0x1012F: case 0x10147: case 0x10156: -#endif return (double) 50000.0; case 0x0036: case 0x0666: @@ -3007,7 +2939,6 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0xF9D1: case 0xF9D3: case 0xFF16: -#ifdef Py_UNICODE_WIDE case 0x1010C: case 0x104A6: case 0x10E65: @@ -3026,28 +2957,19 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0x1D7FC: case 0x1F107: case 0x20AEA: -#endif return (double) 6.0; case 0x1377: -#ifdef Py_UNICODE_WIDE case 0x10115: case 0x10E6E: case 0x1D36E: -#endif return (double) 60.0; -#ifdef Py_UNICODE_WIDE case 0x1011E: case 0x10E77: return (double) 600.0; -#endif -#ifdef Py_UNICODE_WIDE case 0x10127: return (double) 6000.0; -#endif -#ifdef Py_UNICODE_WIDE case 0x10130: return (double) 60000.0; -#endif case 0x0037: case 0x0667: case 0x06F7: @@ -3104,7 +3026,6 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0xAA57: case 0xABF7: case 0xFF17: -#ifdef Py_UNICODE_WIDE case 0x1010D: case 0x104A7: case 0x10E66: @@ -3124,32 +3045,23 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0x1D7FD: case 0x1F108: case 0x20001: -#endif return (double) 7.0; case 0x0F2D: return (double) 7.0/2.0; case 0x215E: return (double) 
7.0/8.0; case 0x1378: -#ifdef Py_UNICODE_WIDE case 0x10116: case 0x10E6F: case 0x1D36F: -#endif return (double) 70.0; -#ifdef Py_UNICODE_WIDE case 0x1011F: case 0x10E78: return (double) 700.0; -#endif -#ifdef Py_UNICODE_WIDE case 0x10128: return (double) 7000.0; -#endif -#ifdef Py_UNICODE_WIDE case 0x10131: return (double) 70000.0; -#endif case 0x0038: case 0x0668: case 0x06F8: @@ -3204,7 +3116,6 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0xAA58: case 0xABF8: case 0xFF18: -#ifdef Py_UNICODE_WIDE case 0x1010E: case 0x104A8: case 0x10E67: @@ -3222,28 +3133,19 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0x1D7F4: case 0x1D7FE: case 0x1F109: -#endif return (double) 8.0; case 0x1379: -#ifdef Py_UNICODE_WIDE case 0x10117: case 0x10E70: case 0x1D370: -#endif return (double) 80.0; -#ifdef Py_UNICODE_WIDE case 0x10120: case 0x10E79: return (double) 800.0; -#endif -#ifdef Py_UNICODE_WIDE case 0x10129: return (double) 8000.0; -#endif -#ifdef Py_UNICODE_WIDE case 0x10132: return (double) 80000.0; -#endif case 0x0039: case 0x0669: case 0x06F9: @@ -3299,7 +3201,6 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0xAA59: case 0xABF9: case 0xFF19: -#ifdef Py_UNICODE_WIDE case 0x1010F: case 0x104A9: case 0x10E68: @@ -3320,32 +3221,23 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) case 0x1D7FF: case 0x1F10A: case 0x2F890: -#endif return (double) 9.0; case 0x0F2E: return (double) 9.0/2.0; case 0x137A: -#ifdef Py_UNICODE_WIDE case 0x10118: case 0x10341: case 0x10E71: case 0x1D371: -#endif return (double) 90.0; -#ifdef Py_UNICODE_WIDE case 0x10121: case 0x1034A: case 0x10E7A: return (double) 900.0; -#endif -#ifdef Py_UNICODE_WIDE case 0x1012A: return (double) 9000.0; -#endif -#ifdef Py_UNICODE_WIDE case 0x10133: return (double) 90000.0; -#endif } return -1.0; } @@ -3353,7 +3245,7 @@ double _PyUnicode_ToNumeric(Py_UNICODE ch) /* Returns 1 for Unicode characters having the bidirectional * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. 
*/ -int _PyUnicode_IsWhitespace(register const Py_UNICODE ch) +int _PyUnicode_IsWhitespace(register const Py_UCS4 ch) { #ifdef WANT_WCTYPE_FUNCTIONS return iswspace(ch); @@ -3399,7 +3291,7 @@ int _PyUnicode_IsWhitespace(register const Py_UNICODE ch) * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional * type 'B', 0 otherwise. */ -int _PyUnicode_IsLinebreak(register const Py_UNICODE ch) +int _PyUnicode_IsLinebreak(register const Py_UCS4 ch) { switch (ch) { case 0x000A: diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index f38b866d6c0..7266a91c4ec 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -28,7 +28,7 @@ import sys SCRIPT = sys.argv[0] -VERSION = "2.6" +VERSION = "3.2" # The Unicode Database UNIDATA_VERSION = "5.2.0" @@ -479,7 +479,7 @@ def makeunicodetype(unicode, trace): print('/* Returns the numeric value as double for Unicode characters', file=fp) print(' * having this property, -1.0 otherwise.', file=fp) print(' */', file=fp) - print('double _PyUnicode_ToNumeric(Py_UNICODE ch)', file=fp) + print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp) print('{', file=fp) print(' switch (ch) {', file=fp) for value, codepoints in numeric_items: @@ -488,21 +488,10 @@ def makeunicodetype(unicode, trace): parts = [repr(float(part)) for part in parts] value = '/'.join(parts) - haswide = False - hasnonewide = False codepoints.sort() for codepoint in codepoints: - if codepoint < 0x10000: - hasnonewide = True - if codepoint >= 0x10000 and not haswide: - print('#ifdef Py_UNICODE_WIDE', file=fp) - haswide = True print(' case 0x%04X:' % (codepoint,), file=fp) - if haswide and hasnonewide: - print('#endif', file=fp) print(' return (double) %s;' % (value,), file=fp) - if haswide and not hasnonewide: - print('#endif', file=fp) print(' }', file=fp) print(' return -1.0;', file=fp) print('}', file=fp) @@ -512,27 +501,16 @@ def makeunicodetype(unicode, trace): print("/* Returns 1 for Unicode characters 
having the bidirectional", file=fp) print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp) print(" */", file=fp) - print('int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)', file=fp) + print('int _PyUnicode_IsWhitespace(register const Py_UCS4 ch)', file=fp) print('{', file=fp) print('#ifdef WANT_WCTYPE_FUNCTIONS', file=fp) print(' return iswspace(ch);', file=fp) print('#else', file=fp) print(' switch (ch) {', file=fp) - haswide = False - hasnonewide = False for codepoint in sorted(spaces): - if codepoint < 0x10000: - hasnonewide = True - if codepoint >= 0x10000 and not haswide: - print('#ifdef Py_UNICODE_WIDE', file=fp) - haswide = True print(' case 0x%04X:' % (codepoint,), file=fp) - if haswide and hasnonewide: - print('#endif', file=fp) print(' return 1;', file=fp) - if haswide and not hasnonewide: - print('#endif', file=fp) print(' }', file=fp) print(' return 0;', file=fp) @@ -545,23 +523,12 @@ def makeunicodetype(unicode, trace): print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp) print(" * type 'B', 0 otherwise.", file=fp) print(" */", file=fp) - print('int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)', file=fp) + print('int _PyUnicode_IsLinebreak(register const Py_UCS4 ch)', file=fp) print('{', file=fp) print(' switch (ch) {', file=fp) - haswide = False - hasnonewide = False for codepoint in sorted(linebreaks): - if codepoint < 0x10000: - hasnonewide = True - if codepoint >= 0x10000 and not haswide: - print('#ifdef Py_UNICODE_WIDE', file=fp) - haswide = True print(' case 0x%04X:' % (codepoint,), file=fp) - if haswide and hasnonewide: - print('#endif', file=fp) print(' return 1;', file=fp) - if haswide and not hasnonewide: - print('#endif', file=fp) print(' }', file=fp) print(' return 0;', file=fp)