mirror of https://github.com/python/cpython
add str.casefold() (closes #13752)
This commit is contained in:
parent
94d5a7174a
commit
d5890c8db5
|
@ -1002,6 +1002,14 @@ functions based on regular expressions.
|
||||||
rest lowercased.
|
rest lowercased.
|
||||||
|
|
||||||
|
|
||||||
|
.. method:: str.casefold()
|
||||||
|
|
||||||
|
Return a casefolded copy of the string. Casefolded strings may be used for
|
||||||
|
caseless matching. For example, ``"MASSE".casefold() == "maße".casefold()``.
|
||||||
|
|
||||||
|
.. versionadded:: 3.3
|
||||||
|
|
||||||
|
|
||||||
.. method:: str.center(width[, fillchar])
|
.. method:: str.center(width[, fillchar])
|
||||||
|
|
||||||
Return centered in a string of length *width*. Padding is done using the
|
Return centered in a string of length *width*. Padding is done using the
|
||||||
|
|
|
@ -2023,6 +2023,11 @@ PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
|
||||||
Py_UCS4 *res
|
Py_UCS4 *res
|
||||||
);
|
);
|
||||||
|
|
||||||
|
PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
|
||||||
|
Py_UCS4 ch, /* Unicode character */
|
||||||
|
Py_UCS4 *res
|
||||||
|
);
|
||||||
|
|
||||||
PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
|
PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
|
||||||
Py_UCS4 ch /* Unicode character */
|
Py_UCS4 ch /* Unicode character */
|
||||||
);
|
);
|
||||||
|
|
|
@ -565,6 +565,14 @@ class UnicodeTest(string_tests.CommonTest,
|
||||||
self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
|
self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
|
||||||
self.assertEqual('\u2177'.lower(), '\u2177')
|
self.assertEqual('\u2177'.lower(), '\u2177')
|
||||||
|
|
||||||
|
def test_casefold(self):
|
||||||
|
self.assertEqual('hello'.casefold(), 'hello')
|
||||||
|
self.assertEqual('hELlo'.casefold(), 'hello')
|
||||||
|
self.assertEqual('ß'.casefold(), 'ss')
|
||||||
|
self.assertEqual('\ufb01'.casefold(), 'fi')
|
||||||
|
self.assertEqual('\u03a3'.casefold(), '\u03c3')
|
||||||
|
self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3')
|
||||||
|
|
||||||
def test_upper(self):
|
def test_upper(self):
|
||||||
string_tests.CommonTest.test_upper(self)
|
string_tests.CommonTest.test_upper(self)
|
||||||
self.assertEqual('\U0001044F'.upper(), '\U00010427')
|
self.assertEqual('\U0001044F'.upper(), '\U00010427')
|
||||||
|
|
|
@ -10,6 +10,8 @@ What's New in Python 3.3 Alpha 1?
|
||||||
Core and Builtins
|
Core and Builtins
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
|
- Issue #13752: Add a casefold() method to str.
|
||||||
|
|
||||||
- Issue #13761: Add a "flush" keyword argument to the print() function,
|
- Issue #13761: Add a "flush" keyword argument to the print() function,
|
||||||
used to ensure flushing the output stream.
|
used to ensure flushing the output stream.
|
||||||
|
|
||||||
|
|
|
@ -185,7 +185,7 @@ Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
|
||||||
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
||||||
|
|
||||||
if (ctype->flags & EXTENDED_CASE_MASK)
|
if (ctype->flags & EXTENDED_CASE_MASK)
|
||||||
return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFFFF];
|
return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFF];
|
||||||
return ctype->upper ? ctype->upper : ch;
|
return ctype->upper ? ctype->upper : ch;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -197,7 +197,7 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
|
||||||
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
||||||
|
|
||||||
if (ctype->flags & EXTENDED_CASE_MASK)
|
if (ctype->flags & EXTENDED_CASE_MASK)
|
||||||
return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFFFF];
|
return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFF];
|
||||||
return ctype->lower ? ctype->lower : ch;
|
return ctype->lower ? ctype->lower : ch;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -206,7 +206,7 @@ int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
|
||||||
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
||||||
|
|
||||||
if (ctype->flags & EXTENDED_CASE_MASK) {
|
if (ctype->flags & EXTENDED_CASE_MASK) {
|
||||||
int index = ctype->lower & 0xFFFFFF;
|
int index = ctype->lower & 0xFFFF;
|
||||||
int n = ctype->lower >> 24;
|
int n = ctype->lower >> 24;
|
||||||
int i;
|
int i;
|
||||||
for (i = 0; i < n; i++)
|
for (i = 0; i < n; i++)
|
||||||
|
@ -222,7 +222,7 @@ int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
|
||||||
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
||||||
|
|
||||||
if (ctype->flags & EXTENDED_CASE_MASK) {
|
if (ctype->flags & EXTENDED_CASE_MASK) {
|
||||||
int index = ctype->title & 0xFFFFFF;
|
int index = ctype->title & 0xFFFF;
|
||||||
int n = ctype->title >> 24;
|
int n = ctype->title >> 24;
|
||||||
int i;
|
int i;
|
||||||
for (i = 0; i < n; i++)
|
for (i = 0; i < n; i++)
|
||||||
|
@ -238,7 +238,7 @@ int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
|
||||||
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
||||||
|
|
||||||
if (ctype->flags & EXTENDED_CASE_MASK) {
|
if (ctype->flags & EXTENDED_CASE_MASK) {
|
||||||
int index = ctype->upper & 0xFFFFFF;
|
int index = ctype->upper & 0xFFFF;
|
||||||
int n = ctype->upper >> 24;
|
int n = ctype->upper >> 24;
|
||||||
int i;
|
int i;
|
||||||
for (i = 0; i < n; i++)
|
for (i = 0; i < n; i++)
|
||||||
|
@ -249,6 +249,21 @@ int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Full case folding of a single code point.
 *
 * Writes the case-folded code points for `ch` into `res` and returns the
 * number of code points written.  If the type record has EXTENDED_CASE_MASK
 * set and a nonzero 3-bit fold length in bits 20-22 of `lower`, the folding
 * is copied out of _PyUnicode_ExtendedCase; otherwise the result is whatever
 * _PyUnicode_ToLowerFull produces for `ch`.
 *
 * NOTE(review): the length field can encode up to 7 entries while callers
 * may pass a small fixed buffer -- confirm the generated table never stores
 * a folding longer than the caller's buffer.
 */
int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res)
|
||||||
|
{
|
||||||
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
||||||
|
|
||||||
|
/* Only take this path when a dedicated folding was stored (length != 0). */
if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) {
|
||||||
|
/* Low 16 bits: start of the lowercase sequence; top byte: its length.
   The folded sequence is stored immediately after the lowercase one. */
int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24);
|
||||||
|
/* Bits 20-22: number of code points in the folded sequence. */
int n = (ctype->lower >> 20) & 7;
|
||||||
|
int i;
|
||||||
|
for (i = 0; i < n; i++)
|
||||||
|
res[i] = _PyUnicode_ExtendedCase[index + i];
|
||||||
|
return n;
|
||||||
|
}
|
||||||
|
/* No separate folding recorded: fall back to the full lowercase mapping. */
return _PyUnicode_ToLowerFull(ch, res);
|
||||||
|
}
|
||||||
|
|
||||||
int _PyUnicode_IsCased(Py_UCS4 ch)
|
int _PyUnicode_IsCased(Py_UCS4 ch)
|
||||||
{
|
{
|
||||||
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
||||||
|
|
|
@ -9576,6 +9576,24 @@ do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar
|
||||||
return do_upper_or_lower(kind, data, length, res, maxchar, 1);
|
return do_upper_or_lower(kind, data, length, res, maxchar, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Case-fold `length` code points read from `data` (storage `kind`) into `res`.
 *
 * Each input code point is expanded with _PyUnicode_ToFoldedFull and every
 * resulting code point is appended to `res`.  `*maxchar` is raised to the
 * largest code point written.  Returns the number of code points stored in
 * `res`.
 *
 * NOTE(review): `res` is written without a bounds check -- presumably the
 * caller allocates room for 3 outputs per input; confirm in case_operation.
 */
static Py_ssize_t
|
||||||
|
do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
|
||||||
|
{
|
||||||
|
/* i indexes the input, k counts code points written to res. */
Py_ssize_t i, k = 0;
|
||||||
|
|
||||||
|
for (i = 0; i < length; i++) {
|
||||||
|
Py_UCS4 c = PyUnicode_READ(kind, data, i);
|
||||||
|
/* Scratch buffer for one folding; holds at most 3 code points. */
Py_UCS4 mapped[3];
|
||||||
|
int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
|
||||||
|
for (j = 0; j < n_res; j++) {
|
||||||
|
/* Track the widest code point written so far. */
if (mapped[j] > *maxchar)
|
||||||
|
*maxchar = mapped[j];
|
||||||
|
res[k++] = mapped[j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return k;
|
||||||
|
}
|
||||||
|
|
||||||
static Py_ssize_t
|
static Py_ssize_t
|
||||||
do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
|
do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
|
||||||
{
|
{
|
||||||
|
@ -10501,6 +10519,22 @@ unicode_capitalize(PyObject *self)
|
||||||
return case_operation(self, do_capitalize);
|
return case_operation(self, do_capitalize);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PyDoc_STRVAR(casefold__doc__,
|
||||||
|
"S.casefold() -> str\n\
|
||||||
|
\n\
|
||||||
|
Return a version of S suitable for caseless comparisons.");
|
||||||
|
|
||||||
|
/* Implementation of str.casefold().
 *
 * Returns NULL when PyUnicode_READY(self) fails.  ASCII strings are handled
 * by ascii_upper_or_lower(self, 1); all other strings go through
 * case_operation() with do_casefold as the per-string worker.
 */
static PyObject *
|
||||||
|
unicode_casefold(PyObject *self)
|
||||||
|
{
|
||||||
|
if (PyUnicode_READY(self) == -1)
|
||||||
|
return NULL;
|
||||||
|
if (PyUnicode_IS_ASCII(self))
|
||||||
|
/* NOTE(review): the flag value 1 presumably selects lowercasing,
   the same as folding for pure-ASCII text -- confirm against
   ascii_upper_or_lower. */
return ascii_upper_or_lower(self, 1);
|
||||||
|
return case_operation(self, do_casefold);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/* Argument converter. Coerces to a single unicode character */
|
/* Argument converter. Coerces to a single unicode character */
|
||||||
|
|
||||||
static int
|
static int
|
||||||
|
@ -12998,6 +13032,7 @@ static PyMethodDef unicode_methods[] = {
|
||||||
{"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
|
{"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
|
||||||
{"join", (PyCFunction) unicode_join, METH_O, join__doc__},
|
{"join", (PyCFunction) unicode_join, METH_O, join__doc__},
|
||||||
{"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
|
{"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
|
||||||
|
{"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
|
||||||
{"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
|
{"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
|
||||||
{"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
|
{"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
|
||||||
{"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
|
{"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -49,6 +49,7 @@ LINE_BREAK = "LineBreak%s.txt"
|
||||||
NAME_ALIASES = "NameAliases%s.txt"
|
NAME_ALIASES = "NameAliases%s.txt"
|
||||||
NAMED_SEQUENCES = "NamedSequences%s.txt"
|
NAMED_SEQUENCES = "NamedSequences%s.txt"
|
||||||
SPECIAL_CASING = "SpecialCasing%s.txt"
|
SPECIAL_CASING = "SpecialCasing%s.txt"
|
||||||
|
CASE_FOLDING = "CaseFolding%s.txt"
|
||||||
|
|
||||||
# Private Use Areas -- in planes 1, 15, 16
|
# Private Use Areas -- in planes 1, 15, 16
|
||||||
PUA_1 = range(0xE000, 0xF900)
|
PUA_1 = range(0xE000, 0xF900)
|
||||||
|
@ -424,28 +425,36 @@ def makeunicodetype(unicode, trace):
|
||||||
if "Case_Ignorable" in properties:
|
if "Case_Ignorable" in properties:
|
||||||
flags |= CASE_IGNORABLE_MASK
|
flags |= CASE_IGNORABLE_MASK
|
||||||
sc = unicode.special_casing.get(char)
|
sc = unicode.special_casing.get(char)
|
||||||
|
cf = unicode.case_folding.get(char, [char])
|
||||||
|
if record[12]:
|
||||||
|
upper = int(record[12], 16)
|
||||||
|
else:
|
||||||
|
upper = char
|
||||||
|
if record[13]:
|
||||||
|
lower = int(record[13], 16)
|
||||||
|
else:
|
||||||
|
lower = char
|
||||||
|
if record[14]:
|
||||||
|
title = int(record[14], 16)
|
||||||
|
else:
|
||||||
|
title = upper
|
||||||
|
if sc is None and cf != [lower]:
|
||||||
|
sc = ([lower], [title], [upper])
|
||||||
if sc is None:
|
if sc is None:
|
||||||
if record[12]:
|
|
||||||
upper = int(record[12], 16)
|
|
||||||
else:
|
|
||||||
upper = char
|
|
||||||
if record[13]:
|
|
||||||
lower = int(record[13], 16)
|
|
||||||
else:
|
|
||||||
lower = char
|
|
||||||
if record[14]:
|
|
||||||
title = int(record[14], 16)
|
|
||||||
else:
|
|
||||||
title = upper
|
|
||||||
if upper == lower == title:
|
if upper == lower == title:
|
||||||
upper = lower = title = 0
|
upper = lower = title = 0
|
||||||
else:
|
else:
|
||||||
# This happens when some character maps to more than one
|
# This happens either when some character maps to more than one
|
||||||
# character in uppercase, lowercase, or titlecase. The extra
|
# character in uppercase, lowercase, or titlecase, or when the
|
||||||
# characters are stored in a different array.
|
# casefolded version of the character is different from the
|
||||||
|
# lowercase. The extra characters are stored in a different
|
||||||
|
# array.
|
||||||
flags |= EXTENDED_CASE_MASK
|
flags |= EXTENDED_CASE_MASK
|
||||||
lower = len(extra_casing) | (len(sc[0]) << 24)
|
lower = len(extra_casing) | (len(sc[0]) << 24)
|
||||||
extra_casing.extend(sc[0])
|
extra_casing.extend(sc[0])
|
||||||
|
if cf != sc[0]:
|
||||||
|
lower |= len(cf) << 20
|
||||||
|
extra_casing.extend(cf)
|
||||||
upper = len(extra_casing) | (len(sc[2]) << 24)
|
upper = len(extra_casing) | (len(sc[2]) << 24)
|
||||||
extra_casing.extend(sc[2])
|
extra_casing.extend(sc[2])
|
||||||
# Title is probably equal to upper.
|
# Title is probably equal to upper.
|
||||||
|
@ -1107,6 +1116,17 @@ class UnicodeData:
|
||||||
title = [int(char, 16) for char in data[2].split()]
|
title = [int(char, 16) for char in data[2].split()]
|
||||||
upper = [int(char, 16) for char in data[3].split()]
|
upper = [int(char, 16) for char in data[3].split()]
|
||||||
sc[c] = (lower, title, upper)
|
sc[c] = (lower, title, upper)
|
||||||
|
cf = self.case_folding = {}
|
||||||
|
if version != '3.2.0':
|
||||||
|
with open_data(CASE_FOLDING, version) as file:
|
||||||
|
for s in file:
|
||||||
|
s = s[:-1].split('#', 1)[0]
|
||||||
|
if not s:
|
||||||
|
continue
|
||||||
|
data = s.split("; ")
|
||||||
|
if data[1] in "CF":
|
||||||
|
c = int(data[0], 16)
|
||||||
|
cf[c] = [int(char, 16) for char in data[2].split()]
|
||||||
|
|
||||||
def uselatin1(self):
|
def uselatin1(self):
|
||||||
# restrict character range to ISO Latin 1
|
# restrict character range to ISO Latin 1
|
||||||
|
|
Loading…
Reference in New Issue