use full unicode mappings for upper/lower/title case (#12736)

Also broaden the category of characters that count as lowercase/uppercase.
2012-01-11 18:17:06 -05:00 · 2012-01-11 18:17:06 -05:00 · b2bf01d824
parent 9007f72db0
commit b2bf01d824
11 changed files with 4634 additions and 1757 deletions
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@ -318,16 +318,25 @@ These APIs can be used for fast direct character conversions:

   Return the character *ch* converted to lower case.

+   .. deprecated:: 3.3
+      This function uses simple case mappings.
+

 .. c:function:: Py_UNICODE Py_UNICODE_TOUPPER(Py_UNICODE ch)

   Return the character *ch* converted to upper case.

+   .. deprecated:: 3.3
+      This function uses simple case mappings.
+

 .. c:function:: Py_UNICODE Py_UNICODE_TOTITLE(Py_UNICODE ch)

   Return the character *ch* converted to title case.

+   .. deprecated:: 3.3
+      This function uses simple case mappings.
+

 .. c:function:: int Py_UNICODE_TODECIMAL(Py_UNICODE ch)

--- a/Doc/library/stdtypes.rst
+++ b/Doc/library/stdtypes.rst
@ -1360,7 +1360,8 @@ functions based on regular expressions.
 .. method:: str.swapcase()

   Return a copy of the string with uppercase characters converted to lowercase and
-   vice versa.
+   vice versa. Note that it is not necessarily true that
+   ``s.swapcase().swapcase() == s``.


 .. method:: str.title()
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@ -2008,6 +2008,29 @@ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
    Py_UCS4 ch       /* Unicode character */
    );

+PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
+    Py_UCS4 ch,       /* Unicode character */
+    Py_UCS4 *res
+    );
+
+PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
+    Py_UCS4 ch,       /* Unicode character */
+    Py_UCS4 *res
+    );
+
+PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
+    Py_UCS4 ch,       /* Unicode character */
+    Py_UCS4 *res
+    );
+
+PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
+    const Py_UCS4 ch         /* Unicode character */
+    );
+
+PyAPI_FUNC(int) _PyUnicode_IsCased(
+    const Py_UCS4 ch         /* Unicode character */
+    );
+
 PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
    Py_UCS4 ch       /* Unicode character */
    );
--- a/Lib/test/string_tests.py
+++ b/Lib/test/string_tests.py
@ -669,7 +669,7 @@ class CommonTest(BaseTest):

        # check that titlecased chars are lowered correctly
        # \u1ffc is the titlecased char
-        self.checkequal('\u1ffc\u1ff3\u1ff3\u1ff3',
+        self.checkequal('\u03a9\u0399\u1ff3\u1ff3\u1ff3',
                        '\u1ff3\u1ff3\u1ffc\u1ffc', 'capitalize')
        # check with cased non-letter chars
        self.checkequal('\u24c5\u24e8\u24e3\u24d7\u24de\u24dd',
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@ -369,6 +369,8 @@ class UnicodeTest(string_tests.CommonTest,
    def test_islower(self):
        string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
        self.checkequalnofix(False, '\u1FFc', 'islower')
+        self.assertFalse('\u2167'.islower())
+        self.assertTrue('\u2177'.islower())
        # non-BMP, uppercase
        self.assertFalse('\U00010401'.islower())
        self.assertFalse('\U00010427'.islower())
@ -383,6 +385,8 @@ class UnicodeTest(string_tests.CommonTest,
        string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
        if not sys.platform.startswith('java'):
            self.checkequalnofix(False, '\u1FFc', 'isupper')
+        self.assertTrue('\u2167'.isupper())
+        self.assertFalse('\u2177'.isupper())
        # non-BMP, uppercase
        self.assertTrue('\U00010401'.isupper())
        self.assertTrue('\U00010427'.isupper())
@ -548,6 +552,18 @@ class UnicodeTest(string_tests.CommonTest,
                         '\U0001044F\U0001044F')
        self.assertEqual('X\U00010427x\U0001044F'.lower(),
                         'x\U0001044Fx\U0001044F')
+        self.assertEqual('ﬁ'.lower(), 'ﬁ')
+        self.assertEqual('\u0130'.lower(), '\u0069\u0307')
+        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
+        self.assertEqual('\u03a3'.lower(), '\u03c3')
+        self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3')
+        self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
+        self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a')
+        self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
+        self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345')
+        self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ')
+        self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
+        self.assertEqual('\u2177'.lower(), '\u2177')

    def test_upper(self):
        string_tests.CommonTest.test_upper(self)
@ -558,6 +574,13 @@ class UnicodeTest(string_tests.CommonTest,
                         '\U00010427\U00010427')
        self.assertEqual('X\U00010427x\U0001044F'.upper(),
                         'X\U00010427X\U00010427')
+        self.assertEqual('ﬁ'.upper(), 'FI')
+        self.assertEqual('\u0130'.upper(), '\u0130')
+        self.assertEqual('\u03a3'.upper(), '\u03a3')
+        self.assertEqual('ß'.upper(), 'SS')
+        self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300')
+        self.assertEqual('\U0008fffe'.upper(), '\U0008fffe')
+        self.assertEqual('\u2177'.upper(), '\u2167')

    def test_capitalize(self):
        string_tests.CommonTest.test_capitalize(self)
@ -570,6 +593,11 @@ class UnicodeTest(string_tests.CommonTest,
                         '\U00010427\U0001044F')
        self.assertEqual('X\U00010427x\U0001044F'.capitalize(),
                         'X\U0001044Fx\U0001044F')
+        self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307')
+        exp = '\u0399\u0308\u0300\u0069\u0307'
+        self.assertEqual('\u1fd2\u0130'.capitalize(), exp)
+        self.assertEqual('ﬁnnish'.capitalize(), 'FInnish')
+        self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2')

    def test_title(self):
        string_tests.MixinStrUnicodeUserStringTest.test_title(self)
@ -584,6 +612,9 @@ class UnicodeTest(string_tests.CommonTest,
                         '\U00010427\U0001044F \U00010427\U0001044F')
        self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
                         'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
+        self.assertEqual('ﬁNNISH'.title(), 'Finnish')
+        self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy')
+        self.assertEqual('A\u03a3A'.title(), 'A\u03c3a')

    def test_swapcase(self):
        string_tests.CommonTest.test_swapcase(self)
@ -597,6 +628,19 @@ class UnicodeTest(string_tests.CommonTest,
                         '\U00010427\U0001044F')
        self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
                         'x\U0001044FX\U00010427')
+        self.assertEqual('ﬁ'.swapcase(), 'FI')
+        self.assertEqual('\u0130'.swapcase(), '\u0069\u0307')
+        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
+        self.assertEqual('\u03a3'.swapcase(), '\u03c3')
+        self.assertEqual('\u0345\u03a3'.swapcase(), '\u0399\u03c3')
+        self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
+        self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0399\u03c3A')
+        self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
+        self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0399')
+        self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0399 ')
+        self.assertEqual('\u03a3'.swapcase(), '\u03c3')
+        self.assertEqual('ß'.swapcase(), 'SS')
+        self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300')

    def test_contains(self):
        # Testing Unicode contains method
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@ -21,7 +21,7 @@ errors = 'surrogatepass'
 class UnicodeMethodsTest(unittest.TestCase):

    # update this, if the database changes
-    expectedchecksum = '21b90f1aed00081b81ca7942b22196af090015a0'
+    expectedchecksum = 'df0b3ca6785a070b21f837b227dbdbdff3c2e921'

    def test_method_checksum(self):
        h = hashlib.sha1()
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -10,6 +10,8 @@ What's New in Python 3.3 Alpha 1?
 Core and Builtins
 -----------------

+- Issue #12736: Use full Unicode case mappings for upper, lower, and title case.
+
 - Issue #12760: Add a create mode to open(). Patch by David Townshend.

 - Issue #13738: Simplify implementation of bytes.lower() and bytes.upper().
--- a/Objects/unicodectype.c
+++ b/Objects/unicodectype.c
@ -21,8 +21,10 @@
 #define XID_START_MASK 0x100
 #define XID_CONTINUE_MASK 0x200
 #define PRINTABLE_MASK 0x400
-#define NODELTA_MASK 0x800
-#define NUMERIC_MASK 0x1000
+#define NUMERIC_MASK 0x800
+#define CASE_IGNORABLE_MASK 0x1000
+#define CASED_MASK 0x2000
+#define EXTENDED_CASE_MASK 0x4000

 typedef struct {
    const Py_UCS4 upper;
@ -57,15 +59,8 @@ gettyperecord(Py_UCS4 code)
 Py_UCS4 _PyUnicode_ToTitlecase(register Py_UCS4 ch)
 {
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
-    int delta = ctype->title;

-    if (ctype->flags & NODELTA_MASK)
-        return delta;
-
-    if (delta >= 32768)
-            delta -= 65536;
-
-    return ch + delta;
+    /* For records flagged EXTENDED_CASE_MASK the title field is not a code
+       point but a packed reference (length << 24 | index) into
+       _PyUnicode_ExtendedCase.  Return the first character of that mapping,
+       exactly as _PyUnicode_ToUppercase/_PyUnicode_ToLowercase do, instead
+       of leaking the packed value to the caller. */
+    if (ctype->flags & EXTENDED_CASE_MASK)
+        return _PyUnicode_ExtendedCase[ctype->title & 0xFFFFFF];
+    /* Simple mapping: a zero entry means ch is its own titlecase form. */
+    return ctype->title ? ctype->title : ch;
 }

 /* Returns 1 for Unicode characters having the category 'Lt', 0
@ -188,12 +183,10 @@ int _PyUnicode_IsUppercase(Py_UCS4 ch)
 Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
 {
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
-    int delta = ctype->upper;
-    if (ctype->flags & NODELTA_MASK)
-        return delta;
-    if (delta >= 32768)
-            delta -= 65536;
-    return ch + delta;
+
+    if (ctype->flags & EXTENDED_CASE_MASK)
+        return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFFFF];
+    return ctype->upper ? ctype->upper : ch;
 }

 /* Returns the lowercase Unicode characters corresponding to ch or just
@ -202,12 +195,72 @@ Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
 Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
 {
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
-    int delta = ctype->lower;
-    if (ctype->flags & NODELTA_MASK)
-        return delta;
-    if (delta >= 32768)
-            delta -= 65536;
-    return ch + delta;
+
+    if (ctype->flags & EXTENDED_CASE_MASK)
+        return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFFFF];
+    return ctype->lower ? ctype->lower : ch;
+}
+
+/* Store the full lowercase mapping of ch in res and return the number of
+   characters stored.  res must be large enough to hold the mapping. */
+int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
+{
+    const _PyUnicode_TypeRecord *record = gettyperecord(ch);
+    const Py_UCS4 *src;
+    int count, k;
+
+    if (!(record->flags & EXTENDED_CASE_MASK)) {
+        /* Simple mapping: a zero entry means ch maps to itself. */
+        res[0] = record->lower ? record->lower : ch;
+        return 1;
+    }
+    /* Extended mapping: the low 24 bits index _PyUnicode_ExtendedCase and
+       the bits above them give the number of characters to copy. */
+    src = &_PyUnicode_ExtendedCase[record->lower & 0xFFFFFF];
+    count = record->lower >> 24;
+    for (k = 0; k < count; k++)
+        res[k] = src[k];
+    return count;
+}
+
+/* Store the full titlecase mapping of ch in res and return the number of
+   characters stored.  res must be large enough to hold the mapping. */
+int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
+{
+    const _PyUnicode_TypeRecord *record = gettyperecord(ch);
+    const Py_UCS4 *src;
+    int count, k;
+
+    if (!(record->flags & EXTENDED_CASE_MASK)) {
+        /* Simple mapping: a zero entry means ch maps to itself. */
+        res[0] = record->title ? record->title : ch;
+        return 1;
+    }
+    /* Extended mapping: the low 24 bits index _PyUnicode_ExtendedCase and
+       the bits above them give the number of characters to copy. */
+    src = &_PyUnicode_ExtendedCase[record->title & 0xFFFFFF];
+    count = record->title >> 24;
+    for (k = 0; k < count; k++)
+        res[k] = src[k];
+    return count;
+}
+
+/* Store the full uppercase mapping of ch in res and return the number of
+   characters stored.  res must be large enough to hold the mapping. */
+int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
+{
+    const _PyUnicode_TypeRecord *record = gettyperecord(ch);
+    const Py_UCS4 *src;
+    int count, k;
+
+    if (!(record->flags & EXTENDED_CASE_MASK)) {
+        /* Simple mapping: a zero entry means ch maps to itself. */
+        res[0] = record->upper ? record->upper : ch;
+        return 1;
+    }
+    /* Extended mapping: the low 24 bits index _PyUnicode_ExtendedCase and
+       the bits above them give the number of characters to copy. */
+    src = &_PyUnicode_ExtendedCase[record->upper & 0xFFFFFF];
+    count = record->upper >> 24;
+    for (k = 0; k < count; k++)
+        res[k] = src[k];
+    return count;
+}
+
+/* Returns 1 if the type record of ch has CASED_MASK set, 0 otherwise. */
+int _PyUnicode_IsCased(Py_UCS4 ch)
+{
+    if (gettyperecord(ch)->flags & CASED_MASK)
+        return 1;
+    return 0;
+}
+
+/* Returns 1 if the type record of ch has CASE_IGNORABLE_MASK set, 0
+   otherwise. */
+int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch)
+{
+    if (gettyperecord(ch)->flags & CASE_IGNORABLE_MASK)
+        return 1;
+    return 0;
 }

 /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -41,6 +41,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 #define PY_SSIZE_T_CLEAN
 #include "Python.h"
 #include "ucnhash.h"
+#include "bytes_methods.h"

 #ifdef MS_WINDOWS
 #include <windows.h>
@ -9428,188 +9429,222 @@ fixup(PyObject *self,
    return v;
 }

-static Py_UCS4
-fixupper(PyObject *self)
+static PyObject *
+ascii_upper_or_lower(PyObject *self, int lower)
 {
-    /* No need to call PyUnicode_READY(self) because this function is only
-       called as a callback from fixup() which does it already. */
-    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
-    const int kind = PyUnicode_KIND(self);
-    void *data = PyUnicode_DATA(self);
-    int touched = 0;
-    Py_UCS4 maxchar = 0;
-    Py_ssize_t i;
+    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
+    char *resdata, *data = PyUnicode_DATA(self);
+    PyObject *res;

-    for (i = 0; i < len; ++i) {
-        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
-        const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
-        if (up != ch) {
-            if (up > maxchar)
-                maxchar = up;
-            PyUnicode_WRITE(kind, data, i, up);
-            touched = 1;
-        }
-        else if (ch > maxchar)
-            maxchar = ch;
-    }
-
-    if (touched)
-        return maxchar;
+    res = PyUnicode_New(len, 127);
+    if (res == NULL)
+        return NULL;
+    resdata = PyUnicode_DATA(res);
+    if (lower)
+        _Py_bytes_lower(resdata, data, len);
    else
-        return 0;
+        _Py_bytes_upper(resdata, data, len);
+    return res;
 }

 static Py_UCS4
-fixlower(PyObject *self)
+handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
 {
-    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
-    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
-    const int kind = PyUnicode_KIND(self);
-    void *data = PyUnicode_DATA(self);
-    int touched = 0;
-    Py_UCS4 maxchar = 0;
-    Py_ssize_t i;
+    Py_ssize_t j;
+    int final_sigma;
+    Py_UCS4 c;
+    /* U+03A3 is in the Final_Sigma context when it is found like this:

-    for(i = 0; i < len; ++i) {
-        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
-        const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
-        if (lo != ch) {
-            if (lo > maxchar)
-                maxchar = lo;
-            PyUnicode_WRITE(kind, data, i, lo);
-            touched = 1;
-        }
-        else if (ch > maxchar)
-            maxchar = ch;
+     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
+
+    where ! is a negation and \p{xxx} is a character with property xxx.
+    */
+    for (j = i - 1; j >= 0; j--) {
+        c = PyUnicode_READ(kind, data, j);
+        if (!_PyUnicode_IsCaseIgnorable(c))
+            break;
    }
-
-    if (touched)
-        return maxchar;
-    else
-        return 0;
+    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
+    if (final_sigma) {
+        for (j = i + 1; j < length; j++) {
+            c = PyUnicode_READ(kind, data, j);
+            if (!_PyUnicode_IsCaseIgnorable(c))
+                break;
+        }
+        final_sigma = j == length || !_PyUnicode_IsCased(c);
+    }
+    return (final_sigma) ? 0x3C2 : 0x3C3;
 }

-static Py_UCS4
-fixswapcase(PyObject *self)
+static int
+lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
+           Py_UCS4 c, Py_UCS4 *mapped)
 {
-    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
-    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
-    const int kind = PyUnicode_KIND(self);
-    void *data = PyUnicode_DATA(self);
-    int touched = 0;
-    Py_UCS4 maxchar = 0;
-    Py_ssize_t i;
-
-    for(i = 0; i < len; ++i) {
-        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
-        Py_UCS4 nu = 0;
-
-        if (Py_UNICODE_ISUPPER(ch))
-            nu = Py_UNICODE_TOLOWER(ch);
-        else if (Py_UNICODE_ISLOWER(ch))
-            nu = Py_UNICODE_TOUPPER(ch);
-
-        if (nu != 0) {
-            if (nu > maxchar)
-                maxchar = nu;
-            PyUnicode_WRITE(kind, data, i, nu);
-            touched = 1;
-        }
-        else if (ch > maxchar)
-            maxchar = ch;
+    /* Obscure special case. */
+    if (c == 0x3A3) {
+        mapped[0] = handle_capital_sigma(kind, data, length, i);
+        return 1;
    }
-
-    if (touched)
-        return maxchar;
-    else
-        return 0;
+    return _PyUnicode_ToLowerFull(c, mapped);
 }

-static Py_UCS4
-fixcapitalize(PyObject *self)
+static Py_ssize_t
+do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
 {
-    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
-    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
-    const int kind = PyUnicode_KIND(self);
-    void *data = PyUnicode_DATA(self);
-    int touched = 0;
-    Py_UCS4 maxchar = 0;
-    Py_ssize_t i = 0;
-    Py_UCS4 ch;
+    Py_ssize_t i, k = 0;
+    int n_res, j;
+    Py_UCS4 c, mapped[3];

-    if (len == 0)
-        return 0;
-
-    ch = PyUnicode_READ(kind, data, i);
-    if (!Py_UNICODE_ISUPPER(ch)) {
-        maxchar = Py_UNICODE_TOUPPER(ch);
-        PyUnicode_WRITE(kind, data, i, maxchar);
-        touched = 1;
+    c = PyUnicode_READ(kind, data, 0);
+    n_res = _PyUnicode_ToUpperFull(c, mapped);
+    for (j = 0; j < n_res; j++) {
+        if (mapped[j] > *maxchar)
+            *maxchar = mapped[j];
+        res[k++] = mapped[j];
    }
-    ++i;
-    for(; i < len; ++i) {
-        ch = PyUnicode_READ(kind, data, i);
-        if (!Py_UNICODE_ISLOWER(ch)) {
-            const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
-            if (lo > maxchar)
-                maxchar = lo;
-            PyUnicode_WRITE(kind, data, i, lo);
-            touched = 1;
+    for (i = 1; i < length; i++) {
+        c = PyUnicode_READ(kind, data, i);
+        n_res = lower_ucs4(kind, data, length, i, c, mapped);
+        for (j = 0; j < n_res; j++) {
+            if (mapped[j] > *maxchar)
+                *maxchar = mapped[j];
+            res[k++] = mapped[j];
        }
-        else if (ch > maxchar)
-            maxchar = ch;
    }
-
-    if (touched)
-        return maxchar;
-    else
-        return 0;
+    return k;
 }

-static Py_UCS4
-fixtitle(PyObject *self)
+/* Write the case-swapped form of the input into res: characters for which
+   Py_UNICODE_ISUPPER holds are lowered, those for which Py_UNICODE_ISLOWER
+   holds are uppercased, everything else is copied through.  *maxchar is
+   raised to the largest character written; the number of characters
+   written is returned. */
+static Py_ssize_t
+do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
+{
+    Py_ssize_t pos, written = 0;
+    Py_UCS4 ch, buf[3];
+    int count, m;
+
+    for (pos = 0; pos < length; pos++) {
+        ch = PyUnicode_READ(kind, data, pos);
+        if (Py_UNICODE_ISUPPER(ch))
+            count = lower_ucs4(kind, data, length, pos, ch, buf);
+        else if (Py_UNICODE_ISLOWER(ch))
+            count = _PyUnicode_ToUpperFull(ch, buf);
+        else {
+            /* Uncased character: copy it unchanged. */
+            buf[0] = ch;
+            count = 1;
+        }
+        for (m = 0; m < count; m++) {
+            res[written] = buf[m];
+            if (buf[m] > *maxchar)
+                *maxchar = buf[m];
+            written++;
+        }
+    }
+    return written;
+}
+
+/* Write the lowercased (lower != 0) or uppercased (lower == 0) form of the
+   input into res.  *maxchar is raised to the largest character written;
+   the number of characters written is returned. */
+static Py_ssize_t
+do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
+                  Py_UCS4 *maxchar, int lower)
 {
+    Py_ssize_t pos, written = 0;
+    Py_UCS4 ch, buf[3];
+    int count, m;
+
+    for (pos = 0; pos < length; pos++) {
+        ch = PyUnicode_READ(kind, data, pos);
+        count = lower ? lower_ucs4(kind, data, length, pos, ch, buf)
+                      : _PyUnicode_ToUpperFull(ch, buf);
+        for (m = 0; m < count; m++) {
+            res[written] = buf[m];
+            if (buf[m] > *maxchar)
+                *maxchar = buf[m];
+            written++;
+        }
+    }
+    return written;
+}
+
+/* Uppercasing callback for case_operation(): do_upper_or_lower() with the
+   lower flag cleared. */
+static Py_ssize_t
+do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
+{
+    const int lower = 0;
+
+    return do_upper_or_lower(kind, data, length, res, maxchar, lower);
+}
+
+/* Lowercasing callback for case_operation(): do_upper_or_lower() with the
+   lower flag set. */
+static Py_ssize_t
+do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
+{
+    const int lower = 1;
+
+    return do_upper_or_lower(kind, data, length, res, maxchar, lower);
+}
+
+/* Apply a case-mapping routine to self and return the result as a new
+   string object.
+
+   perform receives the kind, data and length of self together with a
+   scratch buffer of 3 * length Py_UCS4 slots.  It writes the mapped
+   characters into that buffer, raises *maxchar to the largest character
+   written and returns the number of characters written.
+
+   Returns NULL on failure. */
+static PyObject *
+case_operation(PyObject *self,
+               Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
+{
+    PyObject *res = NULL;
+    Py_ssize_t length, newlength = 0;
+    int kind, outkind;
+    void *data, *outdata;
+    Py_UCS4 maxchar = 0, *tmp, *tmpend;
+
+    if (PyUnicode_READY(self) == -1)
+        return NULL;
+
+    kind = PyUnicode_KIND(self);
+    data = PyUnicode_DATA(self);
+    length = PyUnicode_GET_LENGTH(self);
+    /* A single character can map to up to three, hence the 3 * length
+       scratch buffer.  Reject lengths for which that size computation
+       would overflow, otherwise perform() would write past a too-small
+       allocation. */
+    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
+        PyErr_SetString(PyExc_OverflowError, "string is too long");
+        return NULL;
+    }
+    tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
+    if (tmp == NULL)
+        return PyErr_NoMemory();
+    newlength = perform(kind, data, length, tmp, &maxchar);
+    /* Allocate the result with the narrowest kind that can hold maxchar. */
+    res = PyUnicode_New(newlength, maxchar);
+    if (res == NULL)
+        goto leave;
+    tmpend = tmp + newlength;
+    outdata = PyUnicode_DATA(res);
+    outkind = PyUnicode_KIND(res);
+    /* Copy the UCS4 scratch buffer into the result, narrowing if needed. */
+    switch (outkind) {
+    case PyUnicode_1BYTE_KIND:
+        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
+        break;
+    case PyUnicode_2BYTE_KIND:
+        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
+        break;
+    case PyUnicode_4BYTE_KIND:
+        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
+        break;
+    default:
+        assert(0);
+        break;
+    }
+  leave:
+    /* The scratch buffer is released on both the success and error paths. */
+    PyMem_FREE(tmp);
+    return res;
+}
+
+static Py_ssize_t
+do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
+{
+    Py_ssize_t i, k = 0;
    int previous_is_cased;

-    /* Shortcut for single character strings */
-    if (len == 1) {
-        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
-        const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
-        if (ti != ch) {
-            PyUnicode_WRITE(kind, data, i, ti);
-            return ti;
-        }
-        else
-            return 0;
-    }
    previous_is_cased = 0;
-    for(; i < len; ++i) {
-        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
-        Py_UCS4 nu;
+    for (i = 0; i < length; i++) {
+        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
+        Py_UCS4 mapped[3];
+        int n_res, j;

        if (previous_is_cased)
-            nu = Py_UNICODE_TOLOWER(ch);
+            n_res = lower_ucs4(kind, data, length, i, c, mapped);
        else
-            nu = Py_UNICODE_TOTITLE(ch);
+            n_res = _PyUnicode_ToTitleFull(c, mapped);

-        if (nu > maxchar)
-            maxchar = nu;
-        PyUnicode_WRITE(kind, data, i, nu);
+        for (j = 0; j < n_res; j++) {
+            if (mapped[j] > *maxchar)
+                *maxchar = mapped[j];
+            res[k++] = mapped[j];
+        }

-        if (Py_UNICODE_ISLOWER(ch) ||
-            Py_UNICODE_ISUPPER(ch) ||
-            Py_UNICODE_ISTITLE(ch))
-            previous_is_cased = 1;
-        else
-            previous_is_cased = 0;
+        previous_is_cased = _PyUnicode_IsCased(c);
    }
-    return maxchar;
+    return k;
 }

 PyObject *
@ -10445,7 +10480,7 @@ characters, all remaining cased characters have lower case.");
 static PyObject*
 unicode_title(PyObject *self)
 {
-    return fixup(self, fixtitle);
+    return case_operation(self, do_title);
 }

 PyDoc_STRVAR(capitalize__doc__,
@ -10457,7 +10492,11 @@ have upper case and the rest lower case.");
 static PyObject*
 unicode_capitalize(PyObject *self)
 {
-    return fixup(self, fixcapitalize);
+    if (PyUnicode_READY(self) == -1)
+        return NULL;
+    if (PyUnicode_GET_LENGTH(self) == 0)
+        return unicode_result_unchanged(self);
+    return case_operation(self, do_capitalize);
 }

 #if 0
@ -11715,7 +11754,11 @@ Return a copy of the string S converted to lowercase.");
 static PyObject*
 unicode_lower(PyObject *self)
 {
-    return fixup(self, fixlower);
+    if (PyUnicode_READY(self) == -1)
+        return NULL;
+    if (PyUnicode_IS_ASCII(self))
+        return ascii_upper_or_lower(self, 1);
+    return case_operation(self, do_lower);
 }

 #define LEFTSTRIP 0
@ -12604,7 +12647,7 @@ and vice versa.");
 static PyObject*
 unicode_swapcase(PyObject *self)
 {
-    return fixup(self, fixswapcase);
+    return case_operation(self, do_swapcase);
 }

 PyDoc_STRVAR(maketrans__doc__,
@ -12750,7 +12793,11 @@ Return a copy of S converted to uppercase.");
 static PyObject*
 unicode_upper(PyObject *self)
 {
-    return fixup(self, fixupper);
+    if (PyUnicode_READY(self) == -1)
+        return NULL;
+    if (PyUnicode_IS_ASCII(self))
+        return ascii_upper_or_lower(self, 0);
+    return case_operation(self, do_upper);
 }

 PyDoc_STRVAR(zfill__doc__,
--- a/Objects/unicodetype_db.h
+++ b/Objects/unicodetype_db.h
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@ -22,6 +22,7 @@
 # 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
 # 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
 # 2011-10-21 ezio add support for name aliases and named sequences
+# 2012-01    benjamin add full case mappings
 #
 # written by Fredrik Lundh (fredrik@pythonware.com)
 #
@ -47,6 +48,7 @@ DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
 LINE_BREAK = "LineBreak%s.txt"
 NAME_ALIASES = "NameAliases%s.txt"
 NAMED_SEQUENCES = "NamedSequences%s.txt"
+SPECIAL_CASING = "SpecialCasing%s.txt"

 # Private Use Areas -- in planes 1, 15, 16
 PUA_1 = range(0xE000, 0xF900)
@ -84,8 +86,10 @@ UPPER_MASK = 0x80
 XID_START_MASK = 0x100
 XID_CONTINUE_MASK = 0x200
 PRINTABLE_MASK = 0x400
-NODELTA_MASK = 0x800
-NUMERIC_MASK = 0x1000
+NUMERIC_MASK = 0x800
+CASE_IGNORABLE_MASK = 0x1000
+CASED_MASK = 0x2000
+EXTENDED_CASE_MASK = 0x4000

 # these ranges need to match unicodedata.c:is_unified_ideograph
 cjk_ranges = [
@ -384,6 +388,7 @@ def makeunicodetype(unicode, trace):
    numeric = {}
    spaces = []
    linebreaks = []
+    extra_casing = []

    for char in unicode.chars:
        record = unicode.table[char]
@ -396,7 +401,7 @@ def makeunicodetype(unicode, trace):
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
-            if category == "Ll":
+            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
@ -406,7 +411,7 @@ def makeunicodetype(unicode, trace):
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
-            if category == "Lu":
+            if "Uppercase" in properties:
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
@ -414,35 +419,41 @@ def makeunicodetype(unicode, trace):
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
-            # use delta predictor for upper/lower/title if it fits
-            if record[12]:
-                upper = int(record[12], 16)
+            if "Cased" in properties:
+                flags |= CASED_MASK
+            if "Case_Ignorable" in properties:
+                flags |= CASE_IGNORABLE_MASK
+            sc = unicode.special_casing.get(char)
+            if sc is None:
+                if record[12]:
+                    upper = int(record[12], 16)
+                else:
+                    upper = char
+                if record[13]:
+                    lower = int(record[13], 16)
+                else:
+                    lower = char
+                if record[14]:
+                    title = int(record[14], 16)
+                else:
+                    title = upper
+                if upper == lower == title:
+                    upper = lower = title = 0
            else:
-                upper = char
-            if record[13]:
-                lower = int(record[13], 16)
-            else:
-                lower = char
-            if record[14]:
-                title = int(record[14], 16)
-            else:
-                # UCD.html says that a missing title char means that
-                # it defaults to the uppercase character, not to the
-                # character itself. Apparently, in the current UCD (5.x)
-                # this feature is never used
-                title = upper
-            upper_d = upper - char
-            lower_d = lower - char
-            title_d = title - char
-            if -32768 <= upper_d <= 32767 and \
-               -32768 <= lower_d <= 32767 and \
-               -32768 <= title_d <= 32767:
-                # use deltas
-                upper = upper_d & 0xffff
-                lower = lower_d & 0xffff
-                title = title_d & 0xffff
-            else:
-                flags |= NODELTA_MASK
+                # This happens when some character maps to more than one
+                # character in uppercase, lowercase, or titlecase. The extra
+                # characters are stored in a different array.
+                flags |= EXTENDED_CASE_MASK
+                lower = len(extra_casing) | (len(sc[0]) << 24)
+                extra_casing.extend(sc[0])
+                upper = len(extra_casing) | (len(sc[2]) << 24)
+                extra_casing.extend(sc[2])
+                # Title is probably equal to upper.
+                if sc[1] == sc[2]:
+                    title = upper
+                else:
+                    title = len(extra_casing) | (len(sc[1]) << 24)
+                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
@ -469,6 +480,7 @@ def makeunicodetype(unicode, trace):
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
+    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

@ -482,6 +494,14 @@ def makeunicodetype(unicode, trace):
    print("};", file=fp)
    print(file=fp)

+    print("/* extended case mappings */", file=fp)
+    print(file=fp)
+    print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
+    for c in extra_casing:
+        print("    %d," % c, file=fp)
+    print("};", file=fp)
+    print(file=fp)
+
    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

@ -1070,6 +1090,23 @@ class UnicodeData:
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value
+        sc = self.special_casing = {}
+        with open_data(SPECIAL_CASING, version) as file:
+            for s in file:
+                s = s[:-1].split('#', 1)[0]
+                if not s:
+                    continue
+                data = s.split("; ")
+                if data[4]:
+                    # We ignore all conditionals (since they depend on
+                    # languages) except for one, which is hardcoded. See
+                    # handle_capital_sigma in unicodeobject.c.
+                    continue
+                c = int(data[0], 16)
+                lower = [int(char, 16) for char in data[1].split()]
+                title = [int(char, 16) for char in data[2].split()]
+                upper = [int(char, 16) for char in data[3].split()]
+                sc[c] = (lower, title, upper)

    def uselatin1(self):
        # restrict character range to ISO Latin 1