add str.casefold() (closes #13752)

2012-01-14 13:23:30 -05:00 · 2012-01-14 13:23:30 -05:00 · d5890c8db5
parent 94d5a7174a
commit d5890c8db5
8 changed files with 493 additions and 137 deletions
--- a/Doc/library/stdtypes.rst
+++ b/Doc/library/stdtypes.rst
@ -1002,6 +1002,14 @@ functions based on regular expressions.
   rest lowercased.


+.. method:: str.casefold()
+
+   Return a casefolded copy of the string. Casefolded strings may be used for
+   caseless matching. For example, ``"MASSE".casefold() == "maße".casefold()``.
+
+   .. versionadded:: 3.3
+
+
 .. method:: str.center(width[, fillchar])

   Return centered in a string of length *width*. Padding is done using the
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@ -2023,6 +2023,11 @@ PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
    Py_UCS4 *res
    );

+PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
+    Py_UCS4 ch,       /* Unicode character */
+    Py_UCS4 *res
+    );
+
 PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
    Py_UCS4 ch         /* Unicode character */
    );
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@ -565,6 +565,14 @@ class UnicodeTest(string_tests.CommonTest,
        self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
        self.assertEqual('\u2177'.lower(), '\u2177')

+    def test_casefold(self):  # Check str.casefold() against fixed expected results.
+        self.assertEqual('hello'.casefold(), 'hello')  # already lowercase: unchanged
+        self.assertEqual('hELlo'.casefold(), 'hello')  # mixed-case ASCII folds to lowercase
+        self.assertEqual('ß'.casefold(), 'ss')  # one character folds to two
+        self.assertEqual('ﬁ'.casefold(), 'fi')  # single ligature character folds to the two letters 'fi'
+        self.assertEqual('\u03a3'.casefold(), '\u03c3')  # U+03A3 is expected to fold to U+03C3
+        self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3')  # U+0345 is expected to fold to U+03B9 inside a longer string
+
    def test_upper(self):
        string_tests.CommonTest.test_upper(self)
        self.assertEqual('\U0001044F'.upper(), '\U00010427')
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -10,6 +10,8 @@ What's New in Python 3.3 Alpha 1?
 Core and Builtins
 -----------------

+- Issue #13752: Add a casefold() method to str.
+
 - Issue #13761: Add a "flush" keyword argument to the print() function,
  used to ensure flushing the output stream.

--- a/Objects/unicodectype.c
+++ b/Objects/unicodectype.c
@ -185,7 +185,7 @@ Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    if (ctype->flags & EXTENDED_CASE_MASK)
-        return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFFFF];
+        return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFF];
    return ctype->upper ? ctype->upper : ch;
 }

@ -197,7 +197,7 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    if (ctype->flags & EXTENDED_CASE_MASK)
-        return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFFFF];
+        return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFF];
    return ctype->lower ? ctype->lower : ch;
 }

@ -206,7 +206,7 @@ int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    if (ctype->flags & EXTENDED_CASE_MASK) {
-        int index = ctype->lower & 0xFFFFFF;
+        int index = ctype->lower & 0xFFFF;
        int n = ctype->lower >> 24;
        int i;
        for (i = 0; i < n; i++)
@ -222,7 +222,7 @@ int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    if (ctype->flags & EXTENDED_CASE_MASK) {
-        int index = ctype->title & 0xFFFFFF;
+        int index = ctype->title & 0xFFFF;
        int n = ctype->title >> 24;
        int i;
        for (i = 0; i < n; i++)
@ -238,7 +238,7 @@ int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    if (ctype->flags & EXTENDED_CASE_MASK) {
-        int index = ctype->upper & 0xFFFFFF;
+        int index = ctype->upper & 0xFFFF;
        int n = ctype->upper >> 24;
        int i;
        for (i = 0; i < n; i++)
@ -249,6 +249,21 @@ int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
    return 1;
 }

+int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res)  /* Store the case folding of ch in res; return the number of code points stored. */
+{
+    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+
+    if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) {  /* bits 20-22 of lower are non-zero only when makeunicodedata.py stored a folding that differs from the lowercase mapping */
+        int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24);  /* folding entries sit right after the lowercase entries: low 16 bits = lowercase start, bits 24 and up = lowercase length */
+        int n = (ctype->lower >> 20) & 7;  /* number of code points in the folding */
+        int i;
+        for (i = 0; i < n; i++)
+            res[i] = _PyUnicode_ExtendedCase[index + i];  /* NOTE(review): res is assumed large enough for n entries; sizing is the caller's job */
+        return n;
+    }
+    return _PyUnicode_ToLowerFull(ch, res);  /* no separate folding recorded: the lowercase mapping is used as the folding */
+}
+
 int _PyUnicode_IsCased(Py_UCS4 ch)
 {
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -9576,6 +9576,24 @@ do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar
    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
 }

+static Py_ssize_t
+do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)  /* Casefold `length` characters read from data into res; returns the number of code points written. */
+{
+    Py_ssize_t i, k = 0;  /* i: input position, k: next free slot in res */
+
+    for (i = 0; i < length; i++) {
+        Py_UCS4 c = PyUnicode_READ(kind, data, i);
+        Py_UCS4 mapped[3];  /* receives the folding of one character; NOTE(review): assumes a folding never exceeds 3 code points -- confirm against the generated tables */
+        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
+        for (j = 0; j < n_res; j++) {
+            if (mapped[j] > *maxchar)  /* *maxchar is only ever raised, so it ends up at least as large as every code point written */
+                *maxchar = mapped[j];
+            res[k++] = mapped[j];  /* NOTE(review): res must have room for every expansion; its allocation is not visible here */
+        }
+    }
+    return k;
+}
+
 static Py_ssize_t
 do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
 {
@ -10501,6 +10519,22 @@ unicode_capitalize(PyObject *self)
    return case_operation(self, do_capitalize);
 }

+PyDoc_STRVAR(casefold__doc__,
+             "S.casefold() -> str\n\
+\n\
+Return a version of S suitable for caseless comparisons.");
+
+static PyObject *
+unicode_casefold(PyObject *self)  /* Implements str.casefold(); registered in unicode_methods as a METH_NOARGS method. */
+{
+    if (PyUnicode_READY(self) == -1)  /* PyUnicode_READY reported failure: pass the error on to the caller */
+        return NULL;
+    if (PyUnicode_IS_ASCII(self))  /* ASCII-only strings skip the table-driven path */
+        return ascii_upper_or_lower(self, 1);  /* NOTE(review): the 1 presumably selects lowercasing, as do_lower passes 1 to do_upper_or_lower -- confirm */
+    return case_operation(self, do_casefold);  /* general path: run do_casefold through the shared case_operation helper */
+}
+
+
 /* Argument converter.  Coerces to a single unicode character */

 static int
@ -12998,6 +13032,7 @@ static PyMethodDef unicode_methods[] = {
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
+    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
--- a/Objects/unicodetype_db.h
+++ b/Objects/unicodetype_db.h
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@ -49,6 +49,7 @@ LINE_BREAK = "LineBreak%s.txt"
 NAME_ALIASES = "NameAliases%s.txt"
 NAMED_SEQUENCES = "NamedSequences%s.txt"
 SPECIAL_CASING = "SpecialCasing%s.txt"
+CASE_FOLDING = "CaseFolding%s.txt"

 # Private Use Areas -- in planes 1, 15, 16
 PUA_1 = range(0xE000, 0xF900)
@ -424,28 +425,36 @@ def makeunicodetype(unicode, trace):
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
+            cf = unicode.case_folding.get(char, [char])
+            if record[12]:
+                upper = int(record[12], 16)
+            else:
+                upper = char
+            if record[13]:
+                lower = int(record[13], 16)
+            else:
+                lower = char
+            if record[14]:
+                title = int(record[14], 16)
+            else:
+                title = upper
+            if sc is None and cf != [lower]:
+                sc = ([lower], [title], [upper])
            if sc is None:
-                if record[12]:
-                    upper = int(record[12], 16)
-                else:
-                    upper = char
-                if record[13]:
-                    lower = int(record[13], 16)
-                else:
-                    lower = char
-                if record[14]:
-                    title = int(record[14], 16)
-                else:
-                    title = upper
                if upper == lower == title:
                    upper = lower = title = 0
            else:
-                # This happens when some character maps to more than one
-                # character in uppercase, lowercase, or titlecase. The extra
-                # characters are stored in a different array.
+                # This happens either when some character maps to more than one
+                # character in uppercase, lowercase, or titlecase or the
+                # casefolded version of the character is different from the
+                # lowercase. The extra characters are stored in a different
+                # array.
                flags |= EXTENDED_CASE_MASK
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
+                if cf != sc[0]:
+                    lower |= len(cf) << 20
+                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
@ -1107,6 +1116,17 @@ class UnicodeData:
                title = [int(char, 16) for char in data[2].split()]
                upper = [int(char, 16) for char in data[3].split()]
                sc[c] = (lower, title, upper)
+        cf = self.case_folding = {}
+        if version != '3.2.0':
+            with open_data(CASE_FOLDING, version) as file:
+                for s in file:
+                    s = s[:-1].split('#', 1)[0]
+                    if not s:
+                        continue
+                    data = s.split("; ")
+                    if data[1] in "CF":
+                        c = int(data[0], 16)
+                        cf[c] = [int(char, 16) for char in data[2].split()]

    def uselatin1(self):
        # restrict character range to ISO Latin 1