From d5890c8db5ed67d41719543a34b33f6a0e0a6f7f Mon Sep 17 00:00:00 2001 From: Benjamin Peterson Date: Sat, 14 Jan 2012 13:23:30 -0500 Subject: [PATCH] add str.casefold() (closes #13752) --- Doc/library/stdtypes.rst | 8 + Include/unicodeobject.h | 5 + Lib/test/test_unicode.py | 8 + Misc/NEWS | 2 + Objects/unicodectype.c | 25 +- Objects/unicodeobject.c | 35 +++ Objects/unicodetype_db.h | 497 +++++++++++++++++++++++-------- Tools/unicode/makeunicodedata.py | 50 +++- 8 files changed, 493 insertions(+), 137 deletions(-) diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst index 04fd57f92b0..8fba7b81ecb 100644 --- a/Doc/library/stdtypes.rst +++ b/Doc/library/stdtypes.rst @@ -1002,6 +1002,14 @@ functions based on regular expressions. rest lowercased. +.. method:: str.casefold() + + Return a casefolded copy of the string. Casefolded strings may be used for + caseless matching. For example, ``"MASSE".casefold() == "maße".casefold()``. + + .. versionadded:: 3.3 + + .. method:: str.center(width[, fillchar]) Return centered in a string of length *width*. Padding is done using the diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index e42631c8adc..29d927ff7d7 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -2023,6 +2023,11 @@ PyAPI_FUNC(int) _PyUnicode_ToUpperFull( Py_UCS4 *res ); +PyAPI_FUNC(int) _PyUnicode_ToFoldedFull( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res + ); + PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable( Py_UCS4 ch /* Unicode character */ ); diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index cc933b62574..33d7b359bd9 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -565,6 +565,14 @@ class UnicodeTest(string_tests.CommonTest, self.assertEqual('\U0008fffe'.lower(), '\U0008fffe') self.assertEqual('\u2177'.lower(), '\u2177') + def test_casefold(self): + self.assertEqual('hello'.casefold(), 'hello') + self.assertEqual('hELlo'.casefold(), 'hello') + self.assertEqual('ß'.casefold(), 'ss') + self.assertEqual('fi'.casefold(), 'fi') + self.assertEqual('\u03a3'.casefold(), '\u03c3') + self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3') + def test_upper(self): string_tests.CommonTest.test_upper(self) self.assertEqual('\U0001044F'.upper(), '\U00010427') diff --git a/Misc/NEWS b/Misc/NEWS index adb6e458055..1afe584bd99 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,8 @@ What's New in Python 3.3 Alpha 1? Core and Builtins ----------------- +- Issue #13752: Add a casefold() method to str. + - Issue #13761: Add a "flush" keyword argument to the print() function, used to ensure flushing the output stream. diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 05b63cc430a..0ebdedbe9f7 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -185,7 +185,7 @@ Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch) const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK) - return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFFFF]; + return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFF]; return ctype->upper ? ctype->upper : ch; } @@ -197,7 +197,7 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK) - return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFFFF]; + return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFF]; return ctype->lower ? ctype->lower : ch; } @@ -206,7 +206,7 @@ int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res) const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK) { - int index = ctype->lower & 0xFFFFFF; + int index = ctype->lower & 0xFFFF; int n = ctype->lower >> 24; int i; for (i = 0; i < n; i++) @@ -222,7 +222,7 @@ int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res) const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK) { - int index = ctype->title & 0xFFFFFF; + int index = ctype->title & 0xFFFF; int n = ctype->title >> 24; int i; for (i = 0; i < n; i++) @@ -238,7 +238,7 @@ int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res) const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK) { - int index = ctype->upper & 0xFFFFFF; + int index = ctype->upper & 0xFFFF; int n = ctype->upper >> 24; int i; for (i = 0; i < n; i++) @@ -249,6 +249,21 @@ int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res) return 1; } +int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res) +{ + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + + if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) { + int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24); + int n = (ctype->lower >> 20) & 7; + int i; + for (i = 0; i < n; i++) + res[i] = _PyUnicode_ExtendedCase[index + i]; + return n; + } + return _PyUnicode_ToLowerFull(ch, res); +} + int _PyUnicode_IsCased(Py_UCS4 ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 471d98b0f3b..6d9df182d30 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9576,6 +9576,24 @@ do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar return do_upper_or_lower(kind, data, length, res, maxchar, 1); } +static Py_ssize_t +do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) +{ + Py_ssize_t i, k = 0; + + for (i = 0; i < length; i++) { + Py_UCS4 c = PyUnicode_READ(kind, data, i); + Py_UCS4 mapped[3]; + int j, n_res = _PyUnicode_ToFoldedFull(c, mapped); + for (j = 0; j < n_res; j++) { + if (mapped[j] > *maxchar) + *maxchar = mapped[j]; + res[k++] = mapped[j]; + } + } + return k; +} + static Py_ssize_t do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { @@ -10501,6 +10519,22 @@ unicode_capitalize(PyObject *self) return case_operation(self, do_capitalize); } +PyDoc_STRVAR(casefold__doc__, + "S.casefold() -> str\n\ +\n\ +Return a version of S suitable for caseless comparisons."); + +static PyObject * +unicode_casefold(PyObject *self) +{ + if (PyUnicode_READY(self) == -1) + return NULL; + if (PyUnicode_IS_ASCII(self)) + return ascii_upper_or_lower(self, 1); + return case_operation(self, do_casefold); +} + + /* Argument converter. Coerces to a single unicode character */ static int @@ -12998,6 +13032,7 @@ static PyMethodDef unicode_methods[] = { {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, + {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__}, {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, diff --git a/Objects/unicodetype_db.h b/Objects/unicodetype_db.h index 4786e256e43..fbfc98a83ef 100644 --- a/Objects/unicodetype_db.h +++ b/Objects/unicodetype_db.h @@ -76,7 +76,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = { {0, 0, 0, 0, 0, 4096}, {0, 0, 0, 0, 2, 3076}, {0, 0, 0, 0, 3, 3076}, - {924, 181, 924, 0, 0, 9993}, + {16777218, 17825792, 16777218, 0, 0, 26377}, {0, 0, 0, 0, 0, 5632}, {0, 0, 0, 0, 1, 3076}, {0, 0, 0, 0, 0, 3072}, @@ -110,7 +110,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = { {220, 252, 220, 0, 0, 10113}, {221, 253, 221, 0, 0, 10113}, {222, 254, 222, 0, 0, 10113}, - {33554433, 16777216, 33554435, 0, 0, 26377}, + {33554438, 18874371, 33554440, 0, 0, 26377}, {192, 224, 192, 0, 0, 9993}, {193, 225, 193, 0, 0, 9993}, {194, 226, 194, 0, 0, 9993}, @@ -190,7 +190,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = { {300, 301, 300, 0, 0, 9993}, {302, 303, 302, 0, 0, 10113}, {302, 303, 302, 0, 0, 9993}, - {16777223, 33554437, 16777223, 0, 0, 26497}, + {16777228, 33554442, 16777228, 0, 0, 26497}, {73, 305, 73, 0, 0, 9993}, {306, 307, 306, 0, 0, 10113}, {306, 307, 306, 0, 0, 9993}, @@ -214,7 +214,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = { {325, 326, 325, 0, 0, 9993}, {327, 328, 327, 0, 0, 10113}, {327, 328, 327, 0, 0, 9993}, - {33554441, 16777224, 33554441, 0, 0, 26377}, + {33554448, 18874381, 33554448, 0, 0, 26377}, {330, 331, 330, 0, 0, 10113}, {330, 331, 330, 0, 0, 9993}, {332, 333, 332, 0, 0, 10113}, @@ -268,7 +268,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = { {379, 380, 379, 0, 0, 9993}, {381, 382, 381, 0, 0, 10113}, {381, 382, 381, 0, 0, 9993}, - {83, 383, 83, 0, 0, 9993}, + {16777236, 17825810, 16777236, 0, 0, 26377}, {579, 384, 579, 0, 0, 9993}, {385, 595, 385, 0, 0, 10113}, {386, 387, 386, 0, 0, 10113}, @@ -371,7 +371,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = { {492, 493, 492, 0, 0, 9993}, {494, 495, 494, 0, 0, 10113}, {494, 495, 494, 0, 0, 9993}, - {33554444, 16777227, 33554444, 0, 0, 26377}, + {33554456, 18874389, 33554456, 0, 0, 26377}, {497, 499, 498, 0, 0, 10113}, {497, 499, 498, 0, 0, 10049}, {497, 499, 498, 0, 0, 9993}, @@ -490,7 +490,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = { {439, 658, 439, 0, 0, 9993}, {0, 0, 0, 0, 0, 14089}, {0, 0, 0, 0, 0, 5889}, - {921, 837, 921, 0, 0, 13832}, + {16777244, 17825818, 16777244, 0, 0, 30216}, {880, 881, 880, 0, 0, 10113}, {880, 881, 880, 0, 0, 9993}, {882, 883, 882, 0, 0, 10113}, @@ -508,7 +508,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = { {908, 972, 908, 0, 0, 10113}, {910, 973, 910, 0, 0, 10113}, {911, 974, 911, 0, 0, 10113}, - {50331663, 16777230, 50331663, 0, 0, 26377}, + {50331681, 19922973, 50331681, 0, 0, 26377}, {913, 945, 913, 0, 0, 10113}, {914, 946, 914, 0, 0, 10113}, {915, 947, 915, 0, 0, 10113}, @@ -539,7 +539,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = { {904, 941, 904, 0, 0, 9993}, {905, 942, 905, 0, 0, 9993}, {906, 943, 906, 0, 0, 9993}, - {50331667, 16777234, 50331667, 0, 0, 26377}, + {50331688, 19922980, 50331688, 0, 0, 26377}, {913, 945, 913, 0, 0, 9993}, {914, 946, 914, 0, 0, 9993}, {915, 947, 915, 0, 0, 9993}, @@ -557,7 +557,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = { {927, 959, 927, 0, 0, 9993}, {928, 960, 928, 0, 0, 9993}, {929, 961, 929, 0, 0, 9993}, - {931, 962, 931, 0, 0, 9993}, + {16777261, 17825835, 16777261, 0, 0, 26377}, {931, 963, 931, 0, 0, 9993}, {932, 964, 932, 0, 0, 9993}, {933, 965, 933, 0, 0, 9993}, @@ -571,11 +571,11 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = { {910, 973, 910, 0, 0, 9993}, {911, 974, 911, 0, 0, 9993}, {975, 983, 975, 0, 0, 10113}, - {914, 976, 914, 0, 0, 9993}, - {920, 977, 920, 0, 0, 9993}, + {16777264, 17825838, 16777264, 0, 0, 26377}, + {16777267, 17825841, 16777267, 0, 0, 26377}, {0, 0, 0, 0, 0, 10113}, - {934, 981, 934, 0, 0, 9993}, - {928, 982, 928, 0, 0, 9993}, + {16777270, 17825844, 16777270, 0, 0, 26377}, + {16777273, 17825847, 16777273, 0, 0, 26377}, {975, 983, 975, 0, 0, 9993}, {984, 985, 984, 0, 0, 10113}, {984, 985, 984, 0, 0, 9993}, @@ -601,11 +601,11 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = { {1004, 1005, 1004, 0, 0, 9993}, {1006, 1007, 1006, 0, 0, 10113}, {1006, 1007, 1006, 0, 0, 9993}, - {922, 1008, 922, 0, 0, 9993}, - {929, 1009, 929, 0, 0, 9993}, + {16777276, 17825850, 16777276, 0, 0, 26377}, + {16777279, 17825853, 16777279, 0, 0, 26377}, {1017, 1010, 1017, 0, 0, 9993}, {1012, 952, 1012, 0, 0, 10113}, - {917, 1013, 917, 0, 0, 9993}, + {16777282, 17825856, 16777282, 0, 0, 26377}, {1015, 1016, 1015, 0, 0, 10113}, {1015, 1016, 1015, 0, 0, 9993}, {1017, 1010, 1017, 0, 0, 10113}, @@ -978,7 +978,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = { {1364, 1412, 1364, 0, 0, 9993}, {1365, 1413, 1365, 0, 0, 9993}, {1366, 1414, 1366, 0, 0, 9993}, - {33554455, 16777238, 33554457, 0, 0, 26377}, + {33554502, 18874435, 33554504, 0, 0, 26377}, {0, 0, 0, 0, 0, 1537}, {4256, 11520, 4256, 0, 0, 10113}, {4257, 11521, 4257, 0, 0, 10113}, @@ -1180,13 +1180,13 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = { {7826, 7827, 7826, 0, 0, 9993}, {7828, 7829, 7828, 0, 0, 10113}, {7828, 7829, 7828, 0, 0, 9993}, - {33554460, 16777243, 33554460, 0, 0, 26377}, - {33554463, 16777246, 33554463, 0, 0, 26377}, - {33554466, 16777249, 33554466, 0, 0, 26377}, - {33554469, 16777252, 33554469, 0, 0, 26377}, - {33554472, 16777255, 33554472, 0, 0, 26377}, - {7776, 7835, 7776, 0, 0, 9993}, - {7838, 223, 7838, 0, 0, 10113}, + {33554509, 18874442, 33554509, 0, 0, 26377}, + {33554514, 18874447, 33554514, 0, 0, 26377}, + {33554519, 18874452, 33554519, 0, 0, 26377}, + {33554524, 18874457, 33554524, 0, 0, 26377}, + {33554529, 18874462, 33554529, 0, 0, 26377}, + {16777317, 17825891, 16777317, 0, 0, 26377}, + {16777321, 18874470, 16777321, 0, 0, 26497}, {7840, 7841, 7840, 0, 0, 10113}, {7840, 7841, 7840, 0, 0, 9993}, {7842, 7843, 7842, 0, 0, 10113}, @@ -1355,13 +1355,13 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = { {8011, 8003, 8011, 0, 0, 10113}, {8012, 8004, 8012, 0, 0, 10113}, {8013, 8005, 8013, 0, 0, 10113}, - {33554475, 16777258, 33554475, 0, 0, 26377}, + {33554541, 18874474, 33554541, 0, 0, 26377}, {8025, 8017, 8025, 0, 0, 9993}, - {50331694, 16777261, 50331694, 0, 0, 26377}, + {50331763, 19923055, 50331763, 0, 0, 26377}, {8027, 8019, 8027, 0, 0, 9993}, - {50331698, 16777265, 50331698, 0, 0, 26377}, + {50331770, 19923062, 50331770, 0, 0, 26377}, {8029, 8021, 8029, 0, 0, 9993}, - {50331702, 16777269, 50331702, 0, 0, 26377}, + {50331777, 19923069, 50331777, 0, 0, 26377}, {8031, 8023, 8031, 0, 0, 9993}, {8025, 8017, 8025, 0, 0, 10113}, {8027, 8019, 8027, 0, 0, 10113}, @@ -1397,110 +1397,110 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = { {8171, 8059, 8171, 0, 0, 9993}, {8186, 8060, 8186, 0, 0, 9993}, {8187, 8061, 8187, 0, 0, 9993}, - {33554490, 16777273, 16777276, 0, 0, 26377}, - {33554494, 16777277, 16777280, 0, 0, 26377}, - {33554498, 16777281, 16777284, 0, 0, 26377}, - {33554502, 16777285, 16777288, 0, 0, 26377}, - {33554506, 16777289, 16777292, 0, 0, 26377}, - {33554510, 16777293, 16777296, 0, 0, 26377}, - {33554514, 16777297, 16777300, 0, 0, 26377}, - {33554518, 16777301, 16777304, 0, 0, 26377}, - {33554522, 16777305, 16777308, 0, 0, 26433}, - {33554526, 16777309, 16777312, 0, 0, 26433}, - {33554530, 16777313, 16777316, 0, 0, 26433}, - {33554534, 16777317, 16777320, 0, 0, 26433}, - {33554538, 16777321, 16777324, 0, 0, 26433}, - {33554542, 16777325, 16777328, 0, 0, 26433}, - {33554546, 16777329, 16777332, 0, 0, 26433}, - {33554550, 16777333, 16777336, 0, 0, 26433}, - {33554554, 16777337, 16777340, 0, 0, 26377}, - {33554558, 16777341, 16777344, 0, 0, 26377}, - {33554562, 16777345, 16777348, 0, 0, 26377}, - {33554566, 16777349, 16777352, 0, 0, 26377}, - {33554570, 16777353, 16777356, 0, 0, 26377}, - {33554574, 16777357, 16777360, 0, 0, 26377}, - {33554578, 16777361, 16777364, 0, 0, 26377}, - {33554582, 16777365, 16777368, 0, 0, 26377}, - {33554586, 16777369, 16777372, 0, 0, 26433}, - {33554590, 16777373, 16777376, 0, 0, 26433}, - {33554594, 16777377, 16777380, 0, 0, 26433}, - {33554598, 16777381, 16777384, 0, 0, 26433}, - {33554602, 16777385, 16777388, 0, 0, 26433}, - {33554606, 16777389, 16777392, 0, 0, 26433}, - {33554610, 16777393, 16777396, 0, 0, 26433}, - {33554614, 16777397, 16777400, 0, 0, 26433}, - {33554618, 16777401, 16777404, 0, 0, 26377}, - {33554622, 16777405, 16777408, 0, 0, 26377}, - {33554626, 16777409, 16777412, 0, 0, 26377}, - {33554630, 16777413, 16777416, 0, 0, 26377}, - {33554634, 16777417, 16777420, 0, 0, 26377}, - {33554638, 16777421, 16777424, 0, 0, 26377}, - {33554642, 16777425, 16777428, 0, 0, 26377}, - {33554646, 16777429, 16777432, 0, 0, 26377}, - {33554650, 16777433, 16777436, 0, 0, 26433}, - {33554654, 16777437, 16777440, 0, 0, 26433}, - {33554658, 16777441, 16777444, 0, 0, 26433}, - {33554662, 16777445, 16777448, 0, 0, 26433}, - {33554666, 16777449, 16777452, 0, 0, 26433}, - {33554670, 16777453, 16777456, 0, 0, 26433}, - {33554674, 16777457, 16777460, 0, 0, 26433}, - {33554678, 16777461, 16777464, 0, 0, 26433}, + {33554567, 18874500, 16777353, 0, 0, 26377}, + {33554573, 18874506, 16777359, 0, 0, 26377}, + {33554579, 18874512, 16777365, 0, 0, 26377}, + {33554585, 18874518, 16777371, 0, 0, 26377}, + {33554591, 18874524, 16777377, 0, 0, 26377}, + {33554597, 18874530, 16777383, 0, 0, 26377}, + {33554603, 18874536, 16777389, 0, 0, 26377}, + {33554609, 18874542, 16777395, 0, 0, 26377}, + {33554615, 18874548, 16777401, 0, 0, 26433}, + {33554621, 18874554, 16777407, 0, 0, 26433}, + {33554627, 18874560, 16777413, 0, 0, 26433}, + {33554633, 18874566, 16777419, 0, 0, 26433}, + {33554639, 18874572, 16777425, 0, 0, 26433}, + {33554645, 18874578, 16777431, 0, 0, 26433}, + {33554651, 18874584, 16777437, 0, 0, 26433}, + {33554657, 18874590, 16777443, 0, 0, 26433}, + {33554663, 18874596, 16777449, 0, 0, 26377}, + {33554669, 18874602, 16777455, 0, 0, 26377}, + {33554675, 18874608, 16777461, 0, 0, 26377}, + {33554681, 18874614, 16777467, 0, 0, 26377}, + {33554687, 18874620, 16777473, 0, 0, 26377}, + {33554693, 18874626, 16777479, 0, 0, 26377}, + {33554699, 18874632, 16777485, 0, 0, 26377}, + {33554705, 18874638, 16777491, 0, 0, 26377}, + {33554711, 18874644, 16777497, 0, 0, 26433}, + {33554717, 18874650, 16777503, 0, 0, 26433}, + {33554723, 18874656, 16777509, 0, 0, 26433}, + {33554729, 18874662, 16777515, 0, 0, 26433}, + {33554735, 18874668, 16777521, 0, 0, 26433}, + {33554741, 18874674, 16777527, 0, 0, 26433}, + {33554747, 18874680, 16777533, 0, 0, 26433}, + {33554753, 18874686, 16777539, 0, 0, 26433}, + {33554759, 18874692, 16777545, 0, 0, 26377}, + {33554765, 18874698, 16777551, 0, 0, 26377}, + {33554771, 18874704, 16777557, 0, 0, 26377}, + {33554777, 18874710, 16777563, 0, 0, 26377}, + {33554783, 18874716, 16777569, 0, 0, 26377}, + {33554789, 18874722, 16777575, 0, 0, 26377}, + {33554795, 18874728, 16777581, 0, 0, 26377}, + {33554801, 18874734, 16777587, 0, 0, 26377}, + {33554807, 18874740, 16777593, 0, 0, 26433}, + {33554813, 18874746, 16777599, 0, 0, 26433}, + {33554819, 18874752, 16777605, 0, 0, 26433}, + {33554825, 18874758, 16777611, 0, 0, 26433}, + {33554831, 18874764, 16777617, 0, 0, 26433}, + {33554837, 18874770, 16777623, 0, 0, 26433}, + {33554843, 18874776, 16777629, 0, 0, 26433}, + {33554849, 18874782, 16777635, 0, 0, 26433}, {8120, 8112, 8120, 0, 0, 9993}, {8121, 8113, 8121, 0, 0, 9993}, - {33554682, 16777465, 33554684, 0, 0, 26377}, - {33554687, 16777470, 16777473, 0, 0, 26377}, - {33554691, 16777474, 33554693, 0, 0, 26377}, - {33554696, 16777479, 33554696, 0, 0, 26377}, - {50331915, 16777482, 50331918, 0, 0, 26377}, + {33554855, 18874788, 33554857, 0, 0, 26377}, + {33554862, 18874795, 16777648, 0, 0, 26377}, + {33554868, 18874801, 33554870, 0, 0, 26377}, + {33554875, 18874808, 33554875, 0, 0, 26377}, + {50332097, 19923389, 50332100, 0, 0, 26377}, {8120, 8112, 8120, 0, 0, 10113}, {8121, 8113, 8121, 0, 0, 10113}, {8122, 8048, 8122, 0, 0, 10113}, {8123, 8049, 8123, 0, 0, 10113}, - {33554706, 16777489, 16777492, 0, 0, 26433}, - {921, 8126, 921, 0, 0, 9993}, - {33554710, 16777493, 33554712, 0, 0, 26377}, - {33554715, 16777498, 16777501, 0, 0, 26377}, - {33554719, 16777502, 33554721, 0, 0, 26377}, - {33554724, 16777507, 33554724, 0, 0, 26377}, - {50331943, 16777510, 50331946, 0, 0, 26377}, + {33554890, 18874823, 16777676, 0, 0, 26433}, + {16777679, 17826253, 16777679, 0, 0, 26377}, + {33554899, 18874832, 33554901, 0, 0, 26377}, + {33554906, 18874839, 16777692, 0, 0, 26377}, + {33554912, 18874845, 33554914, 0, 0, 26377}, + {33554919, 18874852, 33554919, 0, 0, 26377}, + {50332141, 19923433, 50332144, 0, 0, 26377}, {8136, 8050, 8136, 0, 0, 10113}, {8137, 8051, 8137, 0, 0, 10113}, {8138, 8052, 8138, 0, 0, 10113}, {8139, 8053, 8139, 0, 0, 10113}, - {33554734, 16777517, 16777520, 0, 0, 26433}, + {33554934, 18874867, 16777720, 0, 0, 26433}, {8152, 8144, 8152, 0, 0, 9993}, {8153, 8145, 8153, 0, 0, 9993}, - {50331954, 16777521, 50331954, 0, 0, 26377}, - {50331958, 16777525, 50331958, 0, 0, 26377}, - {33554746, 16777529, 33554746, 0, 0, 26377}, - {50331965, 16777532, 50331965, 0, 0, 26377}, + {50332157, 19923449, 50332157, 0, 0, 26377}, + {50332164, 19923456, 50332164, 0, 0, 26377}, + {33554954, 18874887, 33554954, 0, 0, 26377}, + {50332176, 19923468, 50332176, 0, 0, 26377}, {8152, 8144, 8152, 0, 0, 10113}, {8153, 8145, 8153, 0, 0, 10113}, {8154, 8054, 8154, 0, 0, 10113}, {8155, 8055, 8155, 0, 0, 10113}, {8168, 8160, 8168, 0, 0, 9993}, {8169, 8161, 8169, 0, 0, 9993}, - {50331969, 16777536, 50331969, 0, 0, 26377}, - {50331973, 16777540, 50331973, 0, 0, 26377}, - {33554761, 16777544, 33554761, 0, 0, 26377}, + {50332183, 19923475, 50332183, 0, 0, 26377}, + {50332190, 19923482, 50332190, 0, 0, 26377}, + {33554980, 18874913, 33554980, 0, 0, 26377}, {8172, 8165, 8172, 0, 0, 9993}, - {33554764, 16777547, 33554764, 0, 0, 26377}, - {50331983, 16777550, 50331983, 0, 0, 26377}, + {33554985, 18874918, 33554985, 0, 0, 26377}, + {50332207, 19923499, 50332207, 0, 0, 26377}, {8168, 8160, 8168, 0, 0, 10113}, {8169, 8161, 8169, 0, 0, 10113}, {8170, 8058, 8170, 0, 0, 10113}, {8171, 8059, 8171, 0, 0, 10113}, {8172, 8165, 8172, 0, 0, 10113}, - {33554771, 16777554, 33554773, 0, 0, 26377}, - {33554776, 16777559, 16777562, 0, 0, 26377}, - {33554780, 16777563, 33554782, 0, 0, 26377}, - {33554785, 16777568, 33554785, 0, 0, 26377}, - {50332004, 16777571, 50332007, 0, 0, 26377}, + {33554997, 18874930, 33554999, 0, 0, 26377}, + {33555004, 18874937, 16777790, 0, 0, 26377}, + {33555010, 18874943, 33555012, 0, 0, 26377}, + {33555017, 18874950, 33555017, 0, 0, 26377}, + {50332239, 19923531, 50332242, 0, 0, 26377}, {8184, 8056, 8184, 0, 0, 10113}, {8185, 8057, 8185, 0, 0, 10113}, {8186, 8060, 8186, 0, 0, 10113}, {8187, 8061, 8187, 0, 0, 10113}, - {33554795, 16777578, 16777581, 0, 0, 26433}, + {33555032, 18874965, 16777818, 0, 0, 26433}, {0, 0, 0, 0, 0, 3076}, {0, 0, 0, 0, 4, 3076}, {0, 0, 0, 0, 5, 3076}, @@ -2037,18 +2037,18 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = { {42918, 42919, 42918, 0, 0, 9993}, {42920, 42921, 42920, 0, 0, 10113}, {42920, 42921, 42920, 0, 0, 9993}, - {33554799, 16777582, 33554801, 0, 0, 26377}, - {33554804, 16777587, 33554806, 0, 0, 26377}, - {33554809, 16777592, 33554811, 0, 0, 26377}, - {50332030, 16777597, 50332033, 0, 0, 26377}, - {50332037, 16777604, 50332040, 0, 0, 26377}, - {33554828, 16777611, 33554830, 0, 0, 26377}, - {33554833, 16777616, 33554835, 0, 0, 26377}, - {33554838, 16777621, 33554840, 0, 0, 26377}, - {33554843, 16777626, 33554845, 0, 0, 26377}, - {33554848, 16777631, 33554850, 0, 0, 26377}, - {33554853, 16777636, 33554855, 0, 0, 26377}, - {33554858, 16777641, 33554860, 0, 0, 26377}, + {33555038, 18874971, 33555040, 0, 0, 26377}, + {33555045, 18874978, 33555047, 0, 0, 26377}, + {33555052, 18874985, 33555054, 0, 0, 26377}, + {50332276, 19923568, 50332279, 0, 0, 26377}, + {50332286, 19923578, 50332289, 0, 0, 26377}, + {33555079, 18875012, 33555081, 0, 0, 26377}, + {33555086, 18875019, 33555088, 0, 0, 26377}, + {33555093, 18875026, 33555095, 0, 0, 26377}, + {33555100, 18875033, 33555102, 0, 0, 26377}, + {33555107, 18875040, 33555109, 0, 0, 26377}, + {33555114, 18875047, 33555116, 0, 0, 26377}, + {33555121, 18875054, 33555123, 0, 0, 26377}, {0, 0, 0, 0, 0, 1025}, {65313, 65345, 65313, 0, 0, 10113}, {65314, 65346, 65314, 0, 0, 10113}, @@ -2188,7 +2188,12 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = { /* extended case mappings */ const Py_UCS4 _PyUnicode_ExtendedCase[] = { + 181, + 956, + 924, 223, + 115, + 115, 83, 83, 83, @@ -2198,263 +2203,440 @@ const Py_UCS4 _PyUnicode_ExtendedCase[] = { 304, 329, 700, + 110, + 700, 78, + 383, + 115, + 83, 496, + 106, + 780, 74, 780, + 837, + 953, + 921, 912, + 953, + 776, + 769, 921, 776, 769, 944, + 965, + 776, + 769, 933, 776, 769, + 962, + 963, + 931, + 976, + 946, + 914, + 977, + 952, + 920, + 981, + 966, + 934, + 982, + 960, + 928, + 1008, + 954, + 922, + 1009, + 961, + 929, + 1013, + 949, + 917, 1415, + 1381, + 1410, 1333, 1362, 1333, 1410, 7830, + 104, + 817, 72, 817, 7831, + 116, + 776, 84, 776, 7832, + 119, + 778, 87, 778, 7833, + 121, + 778, 89, 778, 7834, + 97, + 702, 65, 702, + 7835, + 7777, + 7776, + 223, + 115, + 115, + 7838, 8016, + 965, + 787, 933, 787, 8018, + 965, + 787, + 768, 933, 787, 768, 8020, + 965, + 787, + 769, 933, 787, 769, 8022, + 965, + 787, + 834, 933, 787, 834, 8064, + 7936, + 953, 7944, 921, 8072, 8065, + 7937, + 953, 7945, 921, 8073, 8066, + 7938, + 953, 7946, 921, 8074, 8067, + 7939, + 953, 7947, 921, 8075, 8068, + 7940, + 953, 7948, 921, 8076, 8069, + 7941, + 953, 7949, 921, 8077, 8070, + 7942, + 953, 7950, 921, 8078, 8071, + 7943, + 953, 7951, 921, 8079, 8064, + 7936, + 953, 7944, 921, 8072, 8065, + 7937, + 953, 7945, 921, 8073, 8066, + 7938, + 953, 7946, 921, 8074, 8067, + 7939, + 953, 7947, 921, 8075, 8068, + 7940, + 953, 7948, 921, 8076, 8069, + 7941, + 953, 7949, 921, 8077, 8070, + 7942, + 953, 7950, 921, 8078, 8071, + 7943, + 953, 7951, 921, 8079, 8080, + 7968, + 953, 7976, 921, 8088, 8081, + 7969, + 953, 7977, 921, 8089, 8082, + 7970, + 953, 7978, 921, 8090, 8083, + 7971, + 953, 7979, 921, 8091, 8084, + 7972, + 953, 7980, 921, 8092, 8085, + 7973, + 953, 7981, 921, 8093, 8086, + 7974, + 953, 7982, 921, 8094, 8087, + 7975, + 953, 7983, 921, 8095, 8080, + 7968, + 953, 7976, 921, 8088, 8081, + 7969, + 953, 7977, 921, 8089, 8082, + 7970, + 953, 7978, 921, 8090, 8083, + 7971, + 953, 7979, 921, 8091, 8084, + 7972, + 953, 7980, 921, 8092, 8085, + 7973, + 953, 7981, 921, 8093, 8086, + 7974, + 953, 7982, 921, 8094, 8087, + 7975, + 953, 7983, 921, 8095, 8096, + 8032, + 953, 8040, 921, 8104, 8097, + 8033, + 953, 8041, 921, 8105, 8098, + 8034, + 953, 8042, 921, 8106, 8099, + 8035, + 953, 8043, 921, 8107, 8100, + 8036, + 953, 8044, 921, 8108, 8101, + 8037, + 953, 8045, 921, 8109, 8102, + 8038, + 953, 8046, 921, 8110, 8103, + 8039, + 953, 8047, 921, 8111, 8096, + 8032, + 953, 8040, 921, 8104, 8097, + 8033, + 953, 8041, 921, 8105, 8098, + 8034, + 953, 8042, 921, 8106, 8099, + 8035, + 953, 8043, 921, 8107, 8100, + 8036, + 953, 8044, 921, 8108, 8101, + 8037, + 953, 8045, 921, 8109, 8102, + 8038, + 953, 8046, 921, 8110, 8103, + 8039, + 953, 8047, 921, 8111, 8114, + 8048, + 953, 8122, 921, 8122, 837, 8115, + 945, + 953, 913, 921, 8124, 8116, + 940, + 953, 902, 921, 902, 837, 8118, + 945, + 834, 913, 834, 8119, + 945, + 834, + 953, 913, 834, 921, @@ -2462,27 +2644,43 @@ const Py_UCS4 _PyUnicode_ExtendedCase[] = { 834, 837, 8115, + 945, + 953, 913, 921, 8124, + 8126, + 953, + 921, 8130, + 8052, + 953, 8138, 921, 8138, 837, 8131, + 951, + 953, 919, 921, 8140, 8132, + 942, + 953, 905, 921, 905, 837, 8134, + 951, + 834, 919, 834, 8135, + 951, + 834, + 953, 919, 834, 921, @@ -2490,60 +2688,97 @@ const Py_UCS4 _PyUnicode_ExtendedCase[] = { 834, 837, 8131, + 951, + 953, 919, 921, 8140, 8146, + 953, + 776, + 768, 921, 776, 768, 8147, + 953, + 776, + 769, 921, 776, 769, 8150, + 953, + 834, 921, 834, 8151, + 953, + 776, + 834, 921, 776, 834, 8162, + 965, + 776, + 768, 933, 776, 768, 8163, + 965, + 776, + 769, 933, 776, 769, 8164, + 961, + 787, 929, 787, 8166, + 965, + 834, 933, 834, 8167, + 965, + 776, + 834, 933, 776, 834, 8178, + 8060, + 953, 8186, 921, 8186, 837, 8179, + 969, + 953, 937, 921, 8188, 8180, + 974, + 953, 911, 921, 911, 837, 8182, + 969, + 834, 937, 834, 8183, + 969, + 834, + 953, 937, 834, 921, @@ -2551,25 +2786,36 @@ const Py_UCS4 _PyUnicode_ExtendedCase[] = { 834, 837, 8179, + 969, + 953, 937, 921, 8188, 64256, + 102, + 102, 70, 70, 70, 102, 64257, + 102, + 105, 70, 73, 70, 105, 64258, + 102, + 108, 70, 76, 70, 108, 64259, + 102, + 102, + 105, 70, 70, 73, @@ -2577,6 +2823,9 @@ const Py_UCS4 _PyUnicode_ExtendedCase[] = { 102, 105, 64260, + 102, + 102, + 108, 70, 70, 76, @@ -2584,36 +2833,50 @@ const Py_UCS4 _PyUnicode_ExtendedCase[] = { 102, 108, 64261, + 115, + 116, 83, 84, 83, 116, 64262, + 115, + 116, 83, 84, 83, 116, 64275, + 1396, + 1398, 1348, 1350, 1348, 1398, 64276, + 1396, + 1381, 1348, 1333, 1348, 1381, 64277, + 1396, + 1387, 1348, 1339, 1348, 1387, 64278, + 1406, + 1398, 1358, 1350, 1358, 1398, 64279, + 1396, + 1389, 1348, 1341, 1348, diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index 140fc6484fa..0795d9e72ff 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -49,6 +49,7 @@ LINE_BREAK = "LineBreak%s.txt" NAME_ALIASES = "NameAliases%s.txt" NAMED_SEQUENCES = "NamedSequences%s.txt" SPECIAL_CASING = "SpecialCasing%s.txt" +CASE_FOLDING = "CaseFolding%s.txt" # Private Use Areas -- in planes 1, 15, 16 PUA_1 = range(0xE000, 0xF900) @@ -424,28 +425,36 @@ def makeunicodetype(unicode, trace): if "Case_Ignorable" in properties: flags |= CASE_IGNORABLE_MASK sc = unicode.special_casing.get(char) + cf = unicode.case_folding.get(char, [char]) + if record[12]: + upper = int(record[12], 16) + else: + upper = char + if record[13]: + lower = int(record[13], 16) + else: + lower = char + if record[14]: + title = int(record[14], 16) + else: + title = upper + if sc is None and cf != [lower]: + sc = ([lower], [title], [upper]) if sc is None: - if record[12]: - upper = int(record[12], 16) - else: - upper = char - if record[13]: - lower = int(record[13], 16) - else: - lower = char - if record[14]: - title = int(record[14], 16) - else: - title = upper if upper == lower == title: upper = lower = title = 0 else: - # This happens when some character maps to more than one - # character in uppercase, lowercase, or titlecase. The extra - # characters are stored in a different array. + # This happens either when some character maps to more than one + # character in uppercase, lowercase, or titlecase or the + # casefolded version of the character is different from the + # lowercase. The extra characters are stored in a different + # array. flags |= EXTENDED_CASE_MASK lower = len(extra_casing) | (len(sc[0]) << 24) extra_casing.extend(sc[0]) + if cf != sc[0]: + lower |= len(cf) << 20 + extra_casing.extend(cf) upper = len(extra_casing) | (len(sc[2]) << 24) extra_casing.extend(sc[2]) # Title is probably equal to upper. @@ -1107,6 +1116,17 @@ class UnicodeData: title = [int(char, 16) for char in data[2].split()] upper = [int(char, 16) for char in data[3].split()] sc[c] = (lower, title, upper) + cf = self.case_folding = {} + if version != '3.2.0': + with open_data(CASE_FOLDING, version) as file: + for s in file: + s = s[:-1].split('#', 1)[0] + if not s: + continue + data = s.split("; ") + if data[1] in "CF": + c = int(data[0], 16) + cf[c] = [int(char, 16) for char in data[2].split()] def uselatin1(self): # restrict character range to ISO Latin 1