use full unicode mappings for upper/lower/title case (#12736)

Also broaden the category of characters that count as lowercase/uppercase.
2012-01-11 18:17:06 -05:00 · 2012-01-11 18:17:06 -05:00 · b2bf01d824
parent 9007f72db0
commit b2bf01d824
11 changed files with 4634 additions and 1757 deletions
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@ -318,16 +318,25 @@ These APIs can be used for fast direct character conversions:
   Return the character *ch* converted to lower case.
   .. deprecated:: 3.3
      This function uses simple case mappings.
 .. c:function:: Py_UNICODE Py_UNICODE_TOUPPER(Py_UNICODE ch)
   Return the character *ch* converted to upper case.
   .. deprecated:: 3.3
      This function uses simple case mappings.
 .. c:function:: Py_UNICODE Py_UNICODE_TOTITLE(Py_UNICODE ch)
   Return the character *ch* converted to title case.
   .. deprecated:: 3.3
      This function uses simple case mappings.
 .. c:function:: int Py_UNICODE_TODECIMAL(Py_UNICODE ch)
--- a/Doc/library/stdtypes.rst
+++ b/Doc/library/stdtypes.rst
@ -1360,7 +1360,8 @@ functions based on regular expressions.
 .. method:: str.swapcase()
   Return a copy of the string with uppercase characters converted to lowercase and
-   vice versa.
+   vice versa. Note that it is not necessarily true that
   ``s.swapcase().swapcase() == s``.
 .. method:: str.title()
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@ -2008,6 +2008,29 @@ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
    Py_UCS4 ch       /* Unicode character */
    );
 /* Full (possibly multi-character) case mappings.

    Each of the three functions below writes the lowercase, titlecase or
    uppercase mapping of ch into res and returns the number of code points
    written (1 when ch has only a simple mapping or maps to itself).
    res must be large enough for the longest mapping; the callers in
    unicodeobject.c pass a Py_UCS4[3] buffer. */
 PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
    Py_UCS4 ch,       /* Unicode character */
    Py_UCS4 *res
    );
 PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
    Py_UCS4 ch,       /* Unicode character */
    Py_UCS4 *res
    );
 PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
    Py_UCS4 ch,       /* Unicode character */
    Py_UCS4 *res
    );
 /* Return 1 if the type record of ch carries the Case_Ignorable flag,
    0 otherwise. */
 PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
    const Py_UCS4 ch         /* Unicode character */
    );
 /* Return 1 if the type record of ch carries the Cased flag, 0 otherwise. */
 PyAPI_FUNC(int) _PyUnicode_IsCased(
    const Py_UCS4 ch         /* Unicode character */
    );
 PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
    Py_UCS4 ch       /* Unicode character */
    );
--- a/Lib/test/string_tests.py
+++ b/Lib/test/string_tests.py
@ -669,7 +669,7 @@ class CommonTest(BaseTest):
        # check that titlecased chars are lowered correctly
        # \u1ffc is the titlecased char
-        self.checkequal('\u1ffc\u1ff3\u1ff3\u1ff3',
+        self.checkequal('\u03a9\u0399\u1ff3\u1ff3\u1ff3',
                        '\u1ff3\u1ff3\u1ffc\u1ffc', 'capitalize')
        # check with cased non-letter chars
        self.checkequal('\u24c5\u24e8\u24e3\u24d7\u24de\u24dd',
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@ -369,6 +369,8 @@ class UnicodeTest(string_tests.CommonTest,
    def test_islower(self):
        string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
        self.checkequalnofix(False, '\u1FFc', 'islower')
        self.assertFalse('\u2167'.islower())
        self.assertTrue('\u2177'.islower())
        # non-BMP, uppercase
        self.assertFalse('\U00010401'.islower())
        self.assertFalse('\U00010427'.islower())
@ -383,6 +385,8 @@ class UnicodeTest(string_tests.CommonTest,
        string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
        if not sys.platform.startswith('java'):
            self.checkequalnofix(False, '\u1FFc', 'isupper')
        self.assertTrue('\u2167'.isupper())
        self.assertFalse('\u2177'.isupper())
        # non-BMP, uppercase
        self.assertTrue('\U00010401'.isupper())
        self.assertTrue('\U00010427'.isupper())
@ -548,6 +552,18 @@ class UnicodeTest(string_tests.CommonTest,
                         '\U0001044F\U0001044F')
        self.assertEqual('X\U00010427x\U0001044F'.lower(),
                         'x\U0001044Fx\U0001044F')
        self.assertEqual('ﬁ'.lower(), 'ﬁ')
        self.assertEqual('\u0130'.lower(), '\u0069\u0307')
        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
        self.assertEqual('\u03a3'.lower(), '\u03c3')
        self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3')
        self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
        self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a')
        self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
        self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345')
        self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ')
        self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
        self.assertEqual('\u2177'.lower(), '\u2177')
    def test_upper(self):
        string_tests.CommonTest.test_upper(self)
@ -558,6 +574,13 @@ class UnicodeTest(string_tests.CommonTest,
                         '\U00010427\U00010427')
        self.assertEqual('X\U00010427x\U0001044F'.upper(),
                         'X\U00010427X\U00010427')
        self.assertEqual('ﬁ'.upper(), 'FI')
        self.assertEqual('\u0130'.upper(), '\u0130')
        self.assertEqual('\u03a3'.upper(), '\u03a3')
        self.assertEqual('ß'.upper(), 'SS')
        self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300')
        self.assertEqual('\U0008fffe'.upper(), '\U0008fffe')
        self.assertEqual('\u2177'.upper(), '\u2167')
    def test_capitalize(self):
        string_tests.CommonTest.test_capitalize(self)
@ -570,6 +593,11 @@ class UnicodeTest(string_tests.CommonTest,
                         '\U00010427\U0001044F')
        self.assertEqual('X\U00010427x\U0001044F'.capitalize(),
                         'X\U0001044Fx\U0001044F')
        self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307')
        exp = '\u0399\u0308\u0300\u0069\u0307'
        self.assertEqual('\u1fd2\u0130'.capitalize(), exp)
        self.assertEqual('ﬁnnish'.capitalize(), 'FInnish')
        self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2')
    def test_title(self):
        string_tests.MixinStrUnicodeUserStringTest.test_title(self)
@ -584,6 +612,9 @@ class UnicodeTest(string_tests.CommonTest,
                         '\U00010427\U0001044F \U00010427\U0001044F')
        self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
                         'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
        self.assertEqual('ﬁNNISH'.title(), 'Finnish')
        self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy')
        self.assertEqual('A\u03a3A'.title(), 'A\u03c3a')
    def test_swapcase(self):
        string_tests.CommonTest.test_swapcase(self)
@ -597,6 +628,19 @@ class UnicodeTest(string_tests.CommonTest,
                         '\U00010427\U0001044F')
        self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
                         'x\U0001044FX\U00010427')
        self.assertEqual('ﬁ'.swapcase(), 'FI')
        self.assertEqual('\u0130'.swapcase(), '\u0069\u0307')
        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
        self.assertEqual('\u03a3'.swapcase(), '\u03c3')
        self.assertEqual('\u0345\u03a3'.swapcase(), '\u0399\u03c3')
        self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
        self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0399\u03c3A')
        self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
        self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0399')
        self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0399 ')
        self.assertEqual('\u03a3'.swapcase(), '\u03c3')
        self.assertEqual('ß'.swapcase(), 'SS')
        self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300')
    def test_contains(self):
        # Testing Unicode contains method
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@ -21,7 +21,7 @@ errors = 'surrogatepass'
 class UnicodeMethodsTest(unittest.TestCase):
    # update this, if the database changes
-    expectedchecksum = '21b90f1aed00081b81ca7942b22196af090015a0'
+    expectedchecksum = 'df0b3ca6785a070b21f837b227dbdbdff3c2e921'
    def test_method_checksum(self):
        h = hashlib.sha1()
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -10,6 +10,8 @@ What's New in Python 3.3 Alpha 1?
 Core and Builtins
 -----------------
 - Issue #12736: Use full unicode case mappings for upper, lower, and title case.
 - Issue #12760: Add a create mode to open(). Patch by David Townshend.
 - Issue #13738: Simplify implementation of bytes.lower() and bytes.upper().
--- a/Objects/unicodectype.c
+++ b/Objects/unicodectype.c
@ -21,8 +21,10 @@
 #define XID_START_MASK 0x100
 #define XID_CONTINUE_MASK 0x200
 #define PRINTABLE_MASK 0x400
-#define NODELTA_MASK 0x800
+#define NUMERIC_MASK 0x800
-#define NUMERIC_MASK 0x1000
+#define CASE_IGNORABLE_MASK 0x1000
 #define CASED_MASK 0x2000
 #define EXTENDED_CASE_MASK 0x4000
 typedef struct {
    const Py_UCS4 upper;
@ -57,15 +59,8 @@ gettyperecord(Py_UCS4 code)
 Py_UCS4 _PyUnicode_ToTitlecase(register Py_UCS4 ch)
 {
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
    int delta = ctype->title;
-    if (ctype->flags & NODELTA_MASK)
+    return ctype->title ? ctype->title : ch;
        return delta;
    if (delta >= 32768)
            delta -= 65536;
    return ch + delta;
 }
 /* Returns 1 for Unicode characters having the category 'Lt', 0
@ -188,12 +183,10 @@ int _PyUnicode_IsUppercase(Py_UCS4 ch)
 Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
 {
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
-    int delta = ctype->upper;
+
-    if (ctype->flags & NODELTA_MASK)
+    if (ctype->flags & EXTENDED_CASE_MASK)
-        return delta;
+        return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFFFF];
-    if (delta >= 32768)
+    return ctype->upper ? ctype->upper : ch;
            delta -= 65536;
    return ch + delta;
 }
 /* Returns the lowercase Unicode characters corresponding to ch or just
@ -202,12 +195,72 @@ Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
 Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
 {
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
-    int delta = ctype->lower;
+
-    if (ctype->flags & NODELTA_MASK)
+    if (ctype->flags & EXTENDED_CASE_MASK)
-        return delta;
+        return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFFFF];
-    if (delta >= 32768)
+    return ctype->lower ? ctype->lower : ch;
-            delta -= 65536;
+}
-    return ch + delta;
+
 int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
 {
    /* Store the full lowercase mapping of ch in res and return the number
       of code points stored. */
    const _PyUnicode_TypeRecord *rec = gettyperecord(ch);
    int start, count, pos;

    if (!(rec->flags & EXTENDED_CASE_MASK)) {
        /* Simple record: a zero field means ch maps to itself. */
        res[0] = rec->lower ? rec->lower : ch;
        return 1;
    }
    /* Extended record: the low 24 bits are the offset into
       _PyUnicode_ExtendedCase, the bits above are the mapping length. */
    start = rec->lower & 0xFFFFFF;
    count = rec->lower >> 24;
    for (pos = 0; pos < count; pos++)
        res[pos] = _PyUnicode_ExtendedCase[start + pos];
    return count;
 }
 int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
 {
    /* Store the full titlecase mapping of ch in res and return the number
       of code points stored. */
    const _PyUnicode_TypeRecord *rec = gettyperecord(ch);
    int start, count, pos;

    if (!(rec->flags & EXTENDED_CASE_MASK)) {
        /* Simple record: a zero field means ch maps to itself. */
        res[0] = rec->title ? rec->title : ch;
        return 1;
    }
    /* Extended record: the low 24 bits are the offset into
       _PyUnicode_ExtendedCase, the bits above are the mapping length. */
    start = rec->title & 0xFFFFFF;
    count = rec->title >> 24;
    pos = 0;
    while (pos < count) {
        res[pos] = _PyUnicode_ExtendedCase[start + pos];
        pos++;
    }
    return count;
 }
 int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
 {
    /* Store the full uppercase mapping of ch in res and return the number
       of code points stored. */
    const _PyUnicode_TypeRecord *rec = gettyperecord(ch);
    int start, count, pos;

    if (!(rec->flags & EXTENDED_CASE_MASK)) {
        /* Simple record: a zero field means ch maps to itself. */
        res[0] = rec->upper ? rec->upper : ch;
        return 1;
    }
    /* Extended record: the low 24 bits are the offset into
       _PyUnicode_ExtendedCase, the bits above are the mapping length. */
    start = rec->upper & 0xFFFFFF;
    count = rec->upper >> 24;
    for (pos = 0; pos < count; pos++)
        res[pos] = _PyUnicode_ExtendedCase[start + pos];
    return count;
 }
 int _PyUnicode_IsCased(Py_UCS4 ch)
 {
    /* 1 when the CASED_MASK bit is set in the type record of ch, else 0. */
    return (gettyperecord(ch)->flags & CASED_MASK) ? 1 : 0;
 }
 int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch)
 {
    /* 1 when the CASE_IGNORABLE_MASK bit is set in the type record of ch,
       else 0. */
    return (gettyperecord(ch)->flags & CASE_IGNORABLE_MASK) ? 1 : 0;
 }
 /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -41,6 +41,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 #define PY_SSIZE_T_CLEAN
 #include "Python.h"
 #include "ucnhash.h"
 #include "bytes_methods.h"
 #ifdef MS_WINDOWS
 #include <windows.h>
@ -9428,188 +9429,222 @@ fixup(PyObject *self,
    return v;
 }
-static Py_UCS4
+static PyObject *
-fixupper(PyObject *self)
+ascii_upper_or_lower(PyObject *self, int lower)
 {
-    /* No need to call PyUnicode_READY(self) because this function is only
+    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
-       called as a callback from fixup() which does it already. */
+    char *resdata, *data = PyUnicode_DATA(self);
-    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
+    PyObject *res;
    const int kind = PyUnicode_KIND(self);
    void *data = PyUnicode_DATA(self);
    int touched = 0;
    Py_UCS4 maxchar = 0;
    Py_ssize_t i;
-    for (i = 0; i < len; ++i) {
+    res = PyUnicode_New(len, 127);
-        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
+    if (res == NULL)
-        const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
+        return NULL;
-        if (up != ch) {
+    resdata = PyUnicode_DATA(res);
-            if (up > maxchar)
+    if (lower)
-                maxchar = up;
+        _Py_bytes_lower(resdata, data, len);
            PyUnicode_WRITE(kind, data, i, up);
            touched = 1;
        }
        else if (ch > maxchar)
            maxchar = ch;
    }
    if (touched)
        return maxchar;
    else
-        return 0;
+        _Py_bytes_upper(resdata, data, len);
    return res;
 }
 static Py_UCS4
-fixlower(PyObject *self)
+handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
 {
-    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
+    Py_ssize_t j;
-    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
+    int final_sigma;
-    const int kind = PyUnicode_KIND(self);
+    Py_UCS4 c;
-    void *data = PyUnicode_DATA(self);
+    /* U+03A3 is in the Final_Sigma context when it is found like this:
    int touched = 0;
    Py_UCS4 maxchar = 0;
    Py_ssize_t i;
-    for(i = 0; i < len; ++i) {
+     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
-        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
+
-        const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
+    where ! is a negation and \p{xxx} is a character with property xxx.
-        if (lo != ch) {
+    */
-            if (lo > maxchar)
+    for (j = i - 1; j >= 0; j--) {
-                maxchar = lo;
+        c = PyUnicode_READ(kind, data, j);
-            PyUnicode_WRITE(kind, data, i, lo);
+        if (!_PyUnicode_IsCaseIgnorable(c))
-            touched = 1;
+            break;
        }
        else if (ch > maxchar)
            maxchar = ch;
    }
-
+    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
-    if (touched)
+    if (final_sigma) {
-        return maxchar;
+        for (j = i + 1; j < length; j++) {
-    else
+            c = PyUnicode_READ(kind, data, j);
-        return 0;
+            if (!_PyUnicode_IsCaseIgnorable(c))
                break;
        }
        final_sigma = j == length || !_PyUnicode_IsCased(c);
    }
    return (final_sigma) ? 0x3C2 : 0x3C3;
 }
-static Py_UCS4
+static int
-fixswapcase(PyObject *self)
+lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
           Py_UCS4 c, Py_UCS4 *mapped)
 {
-    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
+    /* Obscure special case. */
-    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
+    if (c == 0x3A3) {
-    const int kind = PyUnicode_KIND(self);
+        mapped[0] = handle_capital_sigma(kind, data, length, i);
-    void *data = PyUnicode_DATA(self);
+        return 1;
    int touched = 0;
    Py_UCS4 maxchar = 0;
    Py_ssize_t i;
    for(i = 0; i < len; ++i) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        Py_UCS4 nu = 0;
        if (Py_UNICODE_ISUPPER(ch))
            nu = Py_UNICODE_TOLOWER(ch);
        else if (Py_UNICODE_ISLOWER(ch))
            nu = Py_UNICODE_TOUPPER(ch);
        if (nu != 0) {
            if (nu > maxchar)
                maxchar = nu;
            PyUnicode_WRITE(kind, data, i, nu);
            touched = 1;
        }
        else if (ch > maxchar)
            maxchar = ch;
    }
-
+    return _PyUnicode_ToLowerFull(c, mapped);
    if (touched)
        return maxchar;
    else
        return 0;
 }
-static Py_UCS4
+static Py_ssize_t
-fixcapitalize(PyObject *self)
+do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
 {
-    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
+    Py_ssize_t i, k = 0;
-    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
+    int n_res, j;
-    const int kind = PyUnicode_KIND(self);
+    Py_UCS4 c, mapped[3];
    void *data = PyUnicode_DATA(self);
    int touched = 0;
    Py_UCS4 maxchar = 0;
    Py_ssize_t i = 0;
    Py_UCS4 ch;
-    if (len == 0)
+    c = PyUnicode_READ(kind, data, 0);
-        return 0;
+    n_res = _PyUnicode_ToUpperFull(c, mapped);
-
+    for (j = 0; j < n_res; j++) {
-    ch = PyUnicode_READ(kind, data, i);
+        if (mapped[j] > *maxchar)
-    if (!Py_UNICODE_ISUPPER(ch)) {
+            *maxchar = mapped[j];
-        maxchar = Py_UNICODE_TOUPPER(ch);
+        res[k++] = mapped[j];
        PyUnicode_WRITE(kind, data, i, maxchar);
        touched = 1;
    }
-    ++i;
+    for (i = 1; i < length; i++) {
-    for(; i < len; ++i) {
+        c = PyUnicode_READ(kind, data, i);
-        ch = PyUnicode_READ(kind, data, i);
+        n_res = lower_ucs4(kind, data, length, i, c, mapped);
-        if (!Py_UNICODE_ISLOWER(ch)) {
+        for (j = 0; j < n_res; j++) {
-            const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
+            if (mapped[j] > *maxchar)
-            if (lo > maxchar)
+                *maxchar = mapped[j];
-                maxchar = lo;
+            res[k++] = mapped[j];
            PyUnicode_WRITE(kind, data, i, lo);
            touched = 1;
        }
        else if (ch > maxchar)
            maxchar = ch;
    }
-
+    return k;
    if (touched)
        return maxchar;
    else
        return 0;
 }
-static Py_UCS4
+static Py_ssize_t
-fixtitle(PyObject *self)
+do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
    Py_ssize_t i, k = 0;
    for (i = 0; i < length; i++) {
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
        int n_res, j;
        if (Py_UNICODE_ISUPPER(c)) {
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
        }
        else if (Py_UNICODE_ISLOWER(c)) {
            n_res = _PyUnicode_ToUpperFull(c, mapped);
        }
        else {
            n_res = 1;
            mapped[0] = c;
        }
        for (j = 0; j < n_res; j++) {
            if (mapped[j] > *maxchar)
                *maxchar = mapped[j];
            res[k++] = mapped[j];
        }
    }
    return k;
 }
 static Py_ssize_t
 do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
                  Py_UCS4 *maxchar, int lower)
 {
-    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
+    Py_ssize_t i, k = 0;
-    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
+
-    const int kind = PyUnicode_KIND(self);
+    for (i = 0; i < length; i++) {
-    void *data = PyUnicode_DATA(self);
+        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
-    Py_UCS4 maxchar = 0;
+        int n_res, j;
-    Py_ssize_t i = 0;
+        if (lower)
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
        else
            n_res = _PyUnicode_ToUpperFull(c, mapped);
        for (j = 0; j < n_res; j++) {
            if (mapped[j] > *maxchar)
                *maxchar = mapped[j];
            res[k++] = mapped[j];
        }
    }
    return k;
 }
 /* Uppercasing callback with the signature case_operation() expects:
    forwards to do_upper_or_lower() with lower == 0 and returns the number
    of code points it wrote to res. */
 static Py_ssize_t
 do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
 {
    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
 }
 /* Lowercasing callback with the signature case_operation() expects:
    forwards to do_upper_or_lower() with lower == 1 and returns the number
    of code points it wrote to res. */
 static Py_ssize_t
 do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
 {
    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
 }
 /* Apply a case-mapping callback to self and return the result as a new
    string object.

    perform(kind, data, length, res, maxchar) must write the mapped code
    points to res, raise *maxchar to the largest code point written, and
    return the number of code points written.  Since one input character
    can map to up to three output characters, the scratch buffer holds
    3 * length code points.

    Returns a new reference, or NULL with an exception set (OverflowError
    if the scratch buffer size would not be representable, MemoryError if
    it cannot be allocated, or whatever PyUnicode_READY / PyUnicode_New
    raised). */
 static PyObject *
 case_operation(PyObject *self,
               Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
 {
    PyObject *res = NULL;
    Py_ssize_t length, newlength = 0;
    int kind, outkind;
    void *data, *outdata;
    Py_UCS4 maxchar = 0, *tmp, *tmpend;
    if (PyUnicode_READY(self) == -1)
        return NULL;
    kind = PyUnicode_KIND(self);
    data = PyUnicode_DATA(self);
    length = PyUnicode_GET_LENGTH(self);
    /* Guard the size computation below: without this check the
       multiplication could wrap around and yield a buffer that is too
       small for what perform() writes. */
    if ((size_t)length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
        PyErr_SetString(PyExc_OverflowError, "string is too long");
        return NULL;
    }
    tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
    if (tmp == NULL)
        return PyErr_NoMemory();
    newlength = perform(kind, data, length, tmp, &maxchar);
    res = PyUnicode_New(newlength, maxchar);
    if (res == NULL)
        goto leave;
    tmpend = tmp + newlength;
    outdata = PyUnicode_DATA(res);
    outkind = PyUnicode_KIND(res);
    /* Narrow the UCS4 scratch buffer to the storage width chosen by
       PyUnicode_New() for maxchar. */
    switch (outkind) {
    case PyUnicode_1BYTE_KIND:
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
        break;
    case PyUnicode_2BYTE_KIND:
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
        break;
    case PyUnicode_4BYTE_KIND:
        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
        break;
    default:
        assert(0);
        break;
    }
  leave:
    /* Reached on success and on PyUnicode_New() failure alike. */
    PyMem_FREE(tmp);
    return res;
 }
 static Py_ssize_t
 do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
 {
    Py_ssize_t i, k = 0;
    int previous_is_cased;
    /* Shortcut for single character strings */
    if (len == 1) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
        if (ti != ch) {
            PyUnicode_WRITE(kind, data, i, ti);
            return ti;
        }
        else
            return 0;
    }
    previous_is_cased = 0;
-    for(; i < len; ++i) {
+    for (i = 0; i < length; i++) {
-        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
+        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
-        Py_UCS4 nu;
+        Py_UCS4 mapped[3];
        int n_res, j;
        if (previous_is_cased)
-            nu = Py_UNICODE_TOLOWER(ch);
+            n_res = lower_ucs4(kind, data, length, i, c, mapped);
        else
-            nu = Py_UNICODE_TOTITLE(ch);
+            n_res = _PyUnicode_ToTitleFull(c, mapped);
-        if (nu > maxchar)
+        for (j = 0; j < n_res; j++) {
-            maxchar = nu;
+            if (mapped[j] > *maxchar)
-        PyUnicode_WRITE(kind, data, i, nu);
+                *maxchar = mapped[j];
            res[k++] = mapped[j];
        }
-        if (Py_UNICODE_ISLOWER(ch) ||
+        previous_is_cased = _PyUnicode_IsCased(c);
            Py_UNICODE_ISUPPER(ch) ||
            Py_UNICODE_ISTITLE(ch))
            previous_is_cased = 1;
        else
            previous_is_cased = 0;
    }
-    return maxchar;
+    return k;
 }
 PyObject *
@ -10445,7 +10480,7 @@ characters, all remaining cased characters have lower case.");
 static PyObject*
 unicode_title(PyObject *self)
 {
-    return fixup(self, fixtitle);
+    return case_operation(self, do_title);
 }
 PyDoc_STRVAR(capitalize__doc__,
@ -10457,7 +10492,11 @@ have upper case and the rest lower case.");
 static PyObject*
 unicode_capitalize(PyObject *self)
 {
-    return fixup(self, fixcapitalize);
+    if (PyUnicode_READY(self) == -1)
        return NULL;
    if (PyUnicode_GET_LENGTH(self) == 0)
        return unicode_result_unchanged(self);
    return case_operation(self, do_capitalize);
 }
 #if 0
@ -11715,7 +11754,11 @@ Return a copy of the string S converted to lowercase.");
 static PyObject*
 unicode_lower(PyObject *self)
 {
-    return fixup(self, fixlower);
+    if (PyUnicode_READY(self) == -1)
        return NULL;
    if (PyUnicode_IS_ASCII(self))
        return ascii_upper_or_lower(self, 1);
    return case_operation(self, do_lower);
 }
 #define LEFTSTRIP 0
@ -12604,7 +12647,7 @@ and vice versa.");
 static PyObject*
 unicode_swapcase(PyObject *self)
 {
-    return fixup(self, fixswapcase);
+    return case_operation(self, do_swapcase);
 }
 PyDoc_STRVAR(maketrans__doc__,
@ -12750,7 +12793,11 @@ Return a copy of S converted to uppercase.");
 static PyObject*
 unicode_upper(PyObject *self)
 {
-    return fixup(self, fixupper);
+    if (PyUnicode_READY(self) == -1)
        return NULL;
    if (PyUnicode_IS_ASCII(self))
        return ascii_upper_or_lower(self, 0);
    return case_operation(self, do_upper);
 }
 PyDoc_STRVAR(zfill__doc__,
--- a/Objects/unicodetype_db.h
+++ b/Objects/unicodetype_db.h
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@ -22,6 +22,7 @@
 # 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
 # 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
 # 2011-10-21 ezio add support for name aliases and named sequences
 # 2012-01    benjamin add full case mappings
 #
 # written by Fredrik Lundh (fredrik@pythonware.com)
 #
@ -47,6 +48,7 @@ DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
 LINE_BREAK = "LineBreak%s.txt"
 NAME_ALIASES = "NameAliases%s.txt"
 NAMED_SEQUENCES = "NamedSequences%s.txt"
 SPECIAL_CASING = "SpecialCasing%s.txt"
 # Private Use Areas -- in planes 1, 15, 16
 PUA_1 = range(0xE000, 0xF900)
@ -84,8 +86,10 @@ UPPER_MASK = 0x80
 XID_START_MASK = 0x100
 XID_CONTINUE_MASK = 0x200
 PRINTABLE_MASK = 0x400
-NODELTA_MASK = 0x800
+NUMERIC_MASK = 0x800
-NUMERIC_MASK = 0x1000
+CASE_IGNORABLE_MASK = 0x1000
 CASED_MASK = 0x2000
 EXTENDED_CASE_MASK = 0x4000
 # these ranges need to match unicodedata.c:is_unified_ideograph
 cjk_ranges = [
@ -384,6 +388,7 @@ def makeunicodetype(unicode, trace):
    numeric = {}
    spaces = []
    linebreaks = []
    extra_casing = []
    for char in unicode.chars:
        record = unicode.table[char]
@ -396,7 +401,7 @@ def makeunicodetype(unicode, trace):
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
-            if category == "Ll":
+            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
@ -406,7 +411,7 @@ def makeunicodetype(unicode, trace):
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
-            if category == "Lu":
+            if "Uppercase" in properties:
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
@ -414,35 +419,41 @@ def makeunicodetype(unicode, trace):
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
-            # use delta predictor for upper/lower/title if it fits
+            if "Cased" in properties:
-            if record[12]:
+                flags |= CASED_MASK
-                upper = int(record[12], 16)
+            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            if sc is None:
                if record[12]:
                    upper = int(record[12], 16)
                else:
                    upper = char
                if record[13]:
                    lower = int(record[13], 16)
                else:
                    lower = char
                if record[14]:
                    title = int(record[14], 16)
                else:
                    title = upper
                if upper == lower == title:
                    upper = lower = title = 0
            else:
-                upper = char
+                # This happens when some character maps to more than one
-            if record[13]:
+                # character in uppercase, lowercase, or titlecase. The extra
-                lower = int(record[13], 16)
+                # characters are stored in a different array.
-            else:
+                flags |= EXTENDED_CASE_MASK
-                lower = char
+                lower = len(extra_casing) | (len(sc[0]) << 24)
-            if record[14]:
+                extra_casing.extend(sc[0])
-                title = int(record[14], 16)
+                upper = len(extra_casing) | (len(sc[2]) << 24)
-            else:
+                extra_casing.extend(sc[2])
-                # UCD.html says that a missing title char means that
+                # Title is probably equal to upper.
-                # it defaults to the uppercase character, not to the
+                if sc[1] == sc[2]:
-                # character itself. Apparently, in the current UCD (5.x)
+                    title = upper
-                # this feature is never used
+                else:
-                title = upper
+                    title = len(extra_casing) | (len(sc[1]) << 24)
-            upper_d = upper - char
+                    extra_casing.extend(sc[1])
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
@ -469,6 +480,7 @@ def makeunicodetype(unicode, trace):
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")
    print("--- Writing", FILE, "...")
@ -482,6 +494,14 @@ def makeunicodetype(unicode, trace):
    print("};", file=fp)
    print(file=fp)
    print("/* extended case mappings */", file=fp)
    print(file=fp)
    print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
    for c in extra_casing:
        print("    %d," % c, file=fp)
    print("};", file=fp)
    print(file=fp)
    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)
@ -1070,6 +1090,23 @@ class UnicodeData:
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value
        sc = self.special_casing = {}
        with open_data(SPECIAL_CASING, version) as file:
            for s in file:
                s = s[:-1].split('#', 1)[0]
                if not s:
                    continue
                data = s.split("; ")
                if data[4]:
                    # We ignore all conditionals (since they depend on
                    # languages) except for one, which is hardcoded. See
                    # handle_capital_sigma in unicodeobject.c.
                    continue
                c = int(data[0], 16)
                lower = [int(char, 16) for char in data[1].split()]
                title = [int(char, 16) for char in data[2].split()]
                upper = [int(char, 16) for char in data[3].split()]
                sc[c] = (lower, title, upper)
    def uselatin1(self):
        # restrict character range to ISO Latin 1