mirror of https://github.com/python/cpython
use full unicode mappings for upper/lower/title case (#12736)
Also broaden the category of characters that count as lowercase/uppercase.
This commit is contained in:
parent
9007f72db0
commit
b2bf01d824
|
@ -318,16 +318,25 @@ These APIs can be used for fast direct character conversions:
|
||||||
|
|
||||||
Return the character *ch* converted to lower case.
|
Return the character *ch* converted to lower case.
|
||||||
|
|
||||||
|
.. deprecated:: 3.3
|
||||||
|
This function uses simple case mappings.
|
||||||
|
|
||||||
|
|
||||||
.. c:function:: Py_UNICODE Py_UNICODE_TOUPPER(Py_UNICODE ch)
|
.. c:function:: Py_UNICODE Py_UNICODE_TOUPPER(Py_UNICODE ch)
|
||||||
|
|
||||||
Return the character *ch* converted to upper case.
|
Return the character *ch* converted to upper case.
|
||||||
|
|
||||||
|
.. deprecated:: 3.3
|
||||||
|
This function uses simple case mappings.
|
||||||
|
|
||||||
|
|
||||||
.. c:function:: Py_UNICODE Py_UNICODE_TOTITLE(Py_UNICODE ch)
|
.. c:function:: Py_UNICODE Py_UNICODE_TOTITLE(Py_UNICODE ch)
|
||||||
|
|
||||||
Return the character *ch* converted to title case.
|
Return the character *ch* converted to title case.
|
||||||
|
|
||||||
|
.. deprecated:: 3.3
|
||||||
|
This function uses simple case mappings.
|
||||||
|
|
||||||
|
|
||||||
.. c:function:: int Py_UNICODE_TODECIMAL(Py_UNICODE ch)
|
.. c:function:: int Py_UNICODE_TODECIMAL(Py_UNICODE ch)
|
||||||
|
|
||||||
|
|
|
@ -1360,7 +1360,8 @@ functions based on regular expressions.
|
||||||
.. method:: str.swapcase()
|
.. method:: str.swapcase()
|
||||||
|
|
||||||
Return a copy of the string with uppercase characters converted to lowercase and
|
Return a copy of the string with uppercase characters converted to lowercase and
|
||||||
vice versa.
|
vice versa. Note that it is not necessarily true that
|
||||||
|
``s.swapcase().swapcase() == s``.
|
||||||
|
|
||||||
|
|
||||||
.. method:: str.title()
|
.. method:: str.title()
|
||||||
|
|
|
@ -2008,6 +2008,29 @@ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
|
||||||
Py_UCS4 ch /* Unicode character */
|
Py_UCS4 ch /* Unicode character */
|
||||||
);
|
);
|
||||||
|
|
||||||
|
PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
|
||||||
|
Py_UCS4 ch, /* Unicode character */
|
||||||
|
Py_UCS4 *res
|
||||||
|
);
|
||||||
|
|
||||||
|
PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
|
||||||
|
Py_UCS4 ch, /* Unicode character */
|
||||||
|
Py_UCS4 *res
|
||||||
|
);
|
||||||
|
|
||||||
|
PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
|
||||||
|
Py_UCS4 ch, /* Unicode character */
|
||||||
|
Py_UCS4 *res
|
||||||
|
);
|
||||||
|
|
||||||
|
PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
|
||||||
|
const Py_UCS4 ch /* Unicode character */
|
||||||
|
);
|
||||||
|
|
||||||
|
PyAPI_FUNC(int) _PyUnicode_IsCased(
|
||||||
|
const Py_UCS4 ch /* Unicode character */
|
||||||
|
);
|
||||||
|
|
||||||
PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
|
PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
|
||||||
Py_UCS4 ch /* Unicode character */
|
Py_UCS4 ch /* Unicode character */
|
||||||
);
|
);
|
||||||
|
|
|
@ -669,7 +669,7 @@ class CommonTest(BaseTest):
|
||||||
|
|
||||||
# check that titlecased chars are lowered correctly
|
# check that titlecased chars are lowered correctly
|
||||||
# \u1ffc is the titlecased char
|
# \u1ffc is the titlecased char
|
||||||
self.checkequal('\u1ffc\u1ff3\u1ff3\u1ff3',
|
self.checkequal('\u03a9\u0399\u1ff3\u1ff3\u1ff3',
|
||||||
'\u1ff3\u1ff3\u1ffc\u1ffc', 'capitalize')
|
'\u1ff3\u1ff3\u1ffc\u1ffc', 'capitalize')
|
||||||
# check with cased non-letter chars
|
# check with cased non-letter chars
|
||||||
self.checkequal('\u24c5\u24e8\u24e3\u24d7\u24de\u24dd',
|
self.checkequal('\u24c5\u24e8\u24e3\u24d7\u24de\u24dd',
|
||||||
|
|
|
@ -369,6 +369,8 @@ class UnicodeTest(string_tests.CommonTest,
|
||||||
def test_islower(self):
|
def test_islower(self):
|
||||||
string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
|
string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
|
||||||
self.checkequalnofix(False, '\u1FFc', 'islower')
|
self.checkequalnofix(False, '\u1FFc', 'islower')
|
||||||
|
self.assertFalse('\u2167'.islower())
|
||||||
|
self.assertTrue('\u2177'.islower())
|
||||||
# non-BMP, uppercase
|
# non-BMP, uppercase
|
||||||
self.assertFalse('\U00010401'.islower())
|
self.assertFalse('\U00010401'.islower())
|
||||||
self.assertFalse('\U00010427'.islower())
|
self.assertFalse('\U00010427'.islower())
|
||||||
|
@ -383,6 +385,8 @@ class UnicodeTest(string_tests.CommonTest,
|
||||||
string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
|
string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
|
||||||
if not sys.platform.startswith('java'):
|
if not sys.platform.startswith('java'):
|
||||||
self.checkequalnofix(False, '\u1FFc', 'isupper')
|
self.checkequalnofix(False, '\u1FFc', 'isupper')
|
||||||
|
self.assertTrue('\u2167'.isupper())
|
||||||
|
self.assertFalse('\u2177'.isupper())
|
||||||
# non-BMP, uppercase
|
# non-BMP, uppercase
|
||||||
self.assertTrue('\U00010401'.isupper())
|
self.assertTrue('\U00010401'.isupper())
|
||||||
self.assertTrue('\U00010427'.isupper())
|
self.assertTrue('\U00010427'.isupper())
|
||||||
|
@ -548,6 +552,18 @@ class UnicodeTest(string_tests.CommonTest,
|
||||||
'\U0001044F\U0001044F')
|
'\U0001044F\U0001044F')
|
||||||
self.assertEqual('X\U00010427x\U0001044F'.lower(),
|
self.assertEqual('X\U00010427x\U0001044F'.lower(),
|
||||||
'x\U0001044Fx\U0001044F')
|
'x\U0001044Fx\U0001044F')
|
||||||
|
self.assertEqual('\ufb01'.lower(), '\ufb01')
|
||||||
|
self.assertEqual('\u0130'.lower(), '\u0069\u0307')
|
||||||
|
# Special case for GREEK CAPITAL LETTER SIGMA U+03A3
|
||||||
|
self.assertEqual('\u03a3'.lower(), '\u03c3')
|
||||||
|
self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3')
|
||||||
|
self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
|
||||||
|
self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a')
|
||||||
|
self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
|
||||||
|
self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345')
|
||||||
|
self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ')
|
||||||
|
self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
|
||||||
|
self.assertEqual('\u2177'.lower(), '\u2177')
|
||||||
|
|
||||||
def test_upper(self):
|
def test_upper(self):
|
||||||
string_tests.CommonTest.test_upper(self)
|
string_tests.CommonTest.test_upper(self)
|
||||||
|
@ -558,6 +574,13 @@ class UnicodeTest(string_tests.CommonTest,
|
||||||
'\U00010427\U00010427')
|
'\U00010427\U00010427')
|
||||||
self.assertEqual('X\U00010427x\U0001044F'.upper(),
|
self.assertEqual('X\U00010427x\U0001044F'.upper(),
|
||||||
'X\U00010427X\U00010427')
|
'X\U00010427X\U00010427')
|
||||||
|
self.assertEqual('\ufb01'.upper(), 'FI')
|
||||||
|
self.assertEqual('\u0130'.upper(), '\u0130')
|
||||||
|
self.assertEqual('\u03a3'.upper(), '\u03a3')
|
||||||
|
self.assertEqual('ß'.upper(), 'SS')
|
||||||
|
self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300')
|
||||||
|
self.assertEqual('\U0008fffe'.upper(), '\U0008fffe')
|
||||||
|
self.assertEqual('\u2177'.upper(), '\u2167')
|
||||||
|
|
||||||
def test_capitalize(self):
|
def test_capitalize(self):
|
||||||
string_tests.CommonTest.test_capitalize(self)
|
string_tests.CommonTest.test_capitalize(self)
|
||||||
|
@ -570,6 +593,11 @@ class UnicodeTest(string_tests.CommonTest,
|
||||||
'\U00010427\U0001044F')
|
'\U00010427\U0001044F')
|
||||||
self.assertEqual('X\U00010427x\U0001044F'.capitalize(),
|
self.assertEqual('X\U00010427x\U0001044F'.capitalize(),
|
||||||
'X\U0001044Fx\U0001044F')
|
'X\U0001044Fx\U0001044F')
|
||||||
|
self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307')
|
||||||
|
exp = '\u0399\u0308\u0300\u0069\u0307'
|
||||||
|
self.assertEqual('\u1fd2\u0130'.capitalize(), exp)
|
||||||
|
self.assertEqual('\ufb01nnish'.capitalize(), 'FInnish')
|
||||||
|
self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2')
|
||||||
|
|
||||||
def test_title(self):
|
def test_title(self):
|
||||||
string_tests.MixinStrUnicodeUserStringTest.test_title(self)
|
string_tests.MixinStrUnicodeUserStringTest.test_title(self)
|
||||||
|
@ -584,6 +612,9 @@ class UnicodeTest(string_tests.CommonTest,
|
||||||
'\U00010427\U0001044F \U00010427\U0001044F')
|
'\U00010427\U0001044F \U00010427\U0001044F')
|
||||||
self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
|
self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
|
||||||
'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
|
'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
|
||||||
|
self.assertEqual('\ufb01NNISH'.title(), 'Finnish')
|
||||||
|
self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy')
|
||||||
|
self.assertEqual('A\u03a3A'.title(), 'A\u03c3a')
|
||||||
|
|
||||||
def test_swapcase(self):
|
def test_swapcase(self):
|
||||||
string_tests.CommonTest.test_swapcase(self)
|
string_tests.CommonTest.test_swapcase(self)
|
||||||
|
@ -597,6 +628,19 @@ class UnicodeTest(string_tests.CommonTest,
|
||||||
'\U00010427\U0001044F')
|
'\U00010427\U0001044F')
|
||||||
self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
|
self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
|
||||||
'x\U0001044FX\U00010427')
|
'x\U0001044FX\U00010427')
|
||||||
|
self.assertEqual('\ufb01'.swapcase(), 'FI')
|
||||||
|
self.assertEqual('\u0130'.swapcase(), '\u0069\u0307')
|
||||||
|
# Special case for GREEK CAPITAL LETTER SIGMA U+03A3
|
||||||
|
self.assertEqual('\u03a3'.swapcase(), '\u03c3')
|
||||||
|
self.assertEqual('\u0345\u03a3'.swapcase(), '\u0399\u03c3')
|
||||||
|
self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
|
||||||
|
self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0399\u03c3A')
|
||||||
|
self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
|
||||||
|
self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0399')
|
||||||
|
self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0399 ')
|
||||||
|
self.assertEqual('\u03a3'.swapcase(), '\u03c3')
|
||||||
|
self.assertEqual('ß'.swapcase(), 'SS')
|
||||||
|
self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300')
|
||||||
|
|
||||||
def test_contains(self):
|
def test_contains(self):
|
||||||
# Testing Unicode contains method
|
# Testing Unicode contains method
|
||||||
|
|
|
@ -21,7 +21,7 @@ errors = 'surrogatepass'
|
||||||
class UnicodeMethodsTest(unittest.TestCase):
|
class UnicodeMethodsTest(unittest.TestCase):
|
||||||
|
|
||||||
# update this, if the database changes
|
# update this, if the database changes
|
||||||
expectedchecksum = '21b90f1aed00081b81ca7942b22196af090015a0'
|
expectedchecksum = 'df0b3ca6785a070b21f837b227dbdbdff3c2e921'
|
||||||
|
|
||||||
def test_method_checksum(self):
|
def test_method_checksum(self):
|
||||||
h = hashlib.sha1()
|
h = hashlib.sha1()
|
||||||
|
|
|
@ -10,6 +10,8 @@ What's New in Python 3.3 Alpha 1?
|
||||||
Core and Builtins
|
Core and Builtins
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
|
- Issue #12736: Use full unicode case mappings for upper, lower, and title case.
|
||||||
|
|
||||||
- Issue #12760: Add a create mode to open(). Patch by David Townshend.
|
- Issue #12760: Add a create mode to open(). Patch by David Townshend.
|
||||||
|
|
||||||
- Issue #13738: Simplify implementation of bytes.lower() and bytes.upper().
|
- Issue #13738: Simplify implementation of bytes.lower() and bytes.upper().
|
||||||
|
|
|
@ -21,8 +21,10 @@
|
||||||
#define XID_START_MASK 0x100
|
#define XID_START_MASK 0x100
|
||||||
#define XID_CONTINUE_MASK 0x200
|
#define XID_CONTINUE_MASK 0x200
|
||||||
#define PRINTABLE_MASK 0x400
|
#define PRINTABLE_MASK 0x400
|
||||||
#define NODELTA_MASK 0x800
|
#define NUMERIC_MASK 0x800
|
||||||
#define NUMERIC_MASK 0x1000
|
#define CASE_IGNORABLE_MASK 0x1000
|
||||||
|
#define CASED_MASK 0x2000
|
||||||
|
#define EXTENDED_CASE_MASK 0x4000
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
const Py_UCS4 upper;
|
const Py_UCS4 upper;
|
||||||
|
@ -57,15 +59,8 @@ gettyperecord(Py_UCS4 code)
|
||||||
Py_UCS4 _PyUnicode_ToTitlecase(register Py_UCS4 ch)
|
Py_UCS4 _PyUnicode_ToTitlecase(register Py_UCS4 ch)
|
||||||
{
|
{
|
||||||
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
||||||
int delta = ctype->title;
|
|
||||||
|
|
||||||
if (ctype->flags & NODELTA_MASK)
|
return ctype->title ? ctype->title : ch;
|
||||||
return delta;
|
|
||||||
|
|
||||||
if (delta >= 32768)
|
|
||||||
delta -= 65536;
|
|
||||||
|
|
||||||
return ch + delta;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Returns 1 for Unicode characters having the category 'Lt', 0
|
/* Returns 1 for Unicode characters having the category 'Lt', 0
|
||||||
|
@ -188,12 +183,10 @@ int _PyUnicode_IsUppercase(Py_UCS4 ch)
|
||||||
Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
|
Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
|
||||||
{
|
{
|
||||||
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
||||||
int delta = ctype->upper;
|
|
||||||
if (ctype->flags & NODELTA_MASK)
|
if (ctype->flags & EXTENDED_CASE_MASK)
|
||||||
return delta;
|
return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFFFF];
|
||||||
if (delta >= 32768)
|
return ctype->upper ? ctype->upper : ch;
|
||||||
delta -= 65536;
|
|
||||||
return ch + delta;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Returns the lowercase Unicode characters corresponding to ch or just
|
/* Returns the lowercase Unicode characters corresponding to ch or just
|
||||||
|
@ -202,12 +195,72 @@ Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
|
||||||
Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
|
Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
|
||||||
{
|
{
|
||||||
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
||||||
int delta = ctype->lower;
|
|
||||||
if (ctype->flags & NODELTA_MASK)
|
if (ctype->flags & EXTENDED_CASE_MASK)
|
||||||
return delta;
|
return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFFFF];
|
||||||
if (delta >= 32768)
|
return ctype->lower ? ctype->lower : ch;
|
||||||
delta -= 65536;
|
}
|
||||||
return ch + delta;
|
|
||||||
|
int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
|
||||||
|
{
|
||||||
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
||||||
|
|
||||||
|
if (ctype->flags & EXTENDED_CASE_MASK) {
|
||||||
|
int index = ctype->lower & 0xFFFFFF;
|
||||||
|
int n = ctype->lower >> 24;
|
||||||
|
int i;
|
||||||
|
for (i = 0; i < n; i++)
|
||||||
|
res[i] = _PyUnicode_ExtendedCase[index + i];
|
||||||
|
return n;
|
||||||
|
}
|
||||||
|
res[0] = ctype->lower ? ctype->lower : ch;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
|
||||||
|
{
|
||||||
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
||||||
|
|
||||||
|
if (ctype->flags & EXTENDED_CASE_MASK) {
|
||||||
|
int index = ctype->title & 0xFFFFFF;
|
||||||
|
int n = ctype->title >> 24;
|
||||||
|
int i;
|
||||||
|
for (i = 0; i < n; i++)
|
||||||
|
res[i] = _PyUnicode_ExtendedCase[index + i];
|
||||||
|
return n;
|
||||||
|
}
|
||||||
|
res[0] = ctype->title ? ctype->title : ch;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
|
||||||
|
{
|
||||||
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
||||||
|
|
||||||
|
if (ctype->flags & EXTENDED_CASE_MASK) {
|
||||||
|
int index = ctype->upper & 0xFFFFFF;
|
||||||
|
int n = ctype->upper >> 24;
|
||||||
|
int i;
|
||||||
|
for (i = 0; i < n; i++)
|
||||||
|
res[i] = _PyUnicode_ExtendedCase[index + i];
|
||||||
|
return n;
|
||||||
|
}
|
||||||
|
res[0] = ctype->upper ? ctype->upper : ch;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int _PyUnicode_IsCased(Py_UCS4 ch)
|
||||||
|
{
|
||||||
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
||||||
|
|
||||||
|
return (ctype->flags & CASED_MASK) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch)
|
||||||
|
{
|
||||||
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
||||||
|
|
||||||
|
return (ctype->flags & CASE_IGNORABLE_MASK) != 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
|
/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
|
||||||
|
|
|
@ -41,6 +41,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||||
#define PY_SSIZE_T_CLEAN
|
#define PY_SSIZE_T_CLEAN
|
||||||
#include "Python.h"
|
#include "Python.h"
|
||||||
#include "ucnhash.h"
|
#include "ucnhash.h"
|
||||||
|
#include "bytes_methods.h"
|
||||||
|
|
||||||
#ifdef MS_WINDOWS
|
#ifdef MS_WINDOWS
|
||||||
#include <windows.h>
|
#include <windows.h>
|
||||||
|
@ -9428,188 +9429,222 @@ fixup(PyObject *self,
|
||||||
return v;
|
return v;
|
||||||
}
|
}
|
||||||
|
|
||||||
static Py_UCS4
|
static PyObject *
|
||||||
fixupper(PyObject *self)
|
ascii_upper_or_lower(PyObject *self, int lower)
|
||||||
{
|
{
|
||||||
/* No need to call PyUnicode_READY(self) because this function is only
|
Py_ssize_t len = PyUnicode_GET_LENGTH(self);
|
||||||
called as a callback from fixup() which does it already. */
|
char *resdata, *data = PyUnicode_DATA(self);
|
||||||
const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
|
PyObject *res;
|
||||||
const int kind = PyUnicode_KIND(self);
|
|
||||||
void *data = PyUnicode_DATA(self);
|
|
||||||
int touched = 0;
|
|
||||||
Py_UCS4 maxchar = 0;
|
|
||||||
Py_ssize_t i;
|
|
||||||
|
|
||||||
for (i = 0; i < len; ++i) {
|
res = PyUnicode_New(len, 127);
|
||||||
const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
|
if (res == NULL)
|
||||||
const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
|
return NULL;
|
||||||
if (up != ch) {
|
resdata = PyUnicode_DATA(res);
|
||||||
if (up > maxchar)
|
if (lower)
|
||||||
maxchar = up;
|
_Py_bytes_lower(resdata, data, len);
|
||||||
PyUnicode_WRITE(kind, data, i, up);
|
|
||||||
touched = 1;
|
|
||||||
}
|
|
||||||
else if (ch > maxchar)
|
|
||||||
maxchar = ch;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (touched)
|
|
||||||
return maxchar;
|
|
||||||
else
|
else
|
||||||
return 0;
|
_Py_bytes_upper(resdata, data, len);
|
||||||
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
static Py_UCS4
|
static Py_UCS4
|
||||||
fixlower(PyObject *self)
|
handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
|
||||||
{
|
{
|
||||||
/* No need to call PyUnicode_READY(self) because fixup() which does it. */
|
Py_ssize_t j;
|
||||||
const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
|
int final_sigma;
|
||||||
const int kind = PyUnicode_KIND(self);
|
Py_UCS4 c;
|
||||||
void *data = PyUnicode_DATA(self);
|
/* U+03A3 is in the Final_Sigma context when it is found like this:
|
||||||
int touched = 0;
|
|
||||||
Py_UCS4 maxchar = 0;
|
|
||||||
Py_ssize_t i;
|
|
||||||
|
|
||||||
for(i = 0; i < len; ++i) {
|
\p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
|
||||||
const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
|
|
||||||
const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
|
where ! is a negation and \p{xxx} is a character with property xxx.
|
||||||
if (lo != ch) {
|
*/
|
||||||
if (lo > maxchar)
|
for (j = i - 1; j >= 0; j--) {
|
||||||
maxchar = lo;
|
c = PyUnicode_READ(kind, data, j);
|
||||||
PyUnicode_WRITE(kind, data, i, lo);
|
if (!_PyUnicode_IsCaseIgnorable(c))
|
||||||
touched = 1;
|
break;
|
||||||
}
|
|
||||||
else if (ch > maxchar)
|
|
||||||
maxchar = ch;
|
|
||||||
}
|
}
|
||||||
|
final_sigma = j >= 0 && _PyUnicode_IsCased(c);
|
||||||
if (touched)
|
if (final_sigma) {
|
||||||
return maxchar;
|
for (j = i + 1; j < length; j++) {
|
||||||
else
|
c = PyUnicode_READ(kind, data, j);
|
||||||
return 0;
|
if (!_PyUnicode_IsCaseIgnorable(c))
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
final_sigma = j == length || !_PyUnicode_IsCased(c);
|
||||||
|
}
|
||||||
|
return (final_sigma) ? 0x3C2 : 0x3C3;
|
||||||
}
|
}
|
||||||
|
|
||||||
static Py_UCS4
|
static int
|
||||||
fixswapcase(PyObject *self)
|
lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
|
||||||
|
Py_UCS4 c, Py_UCS4 *mapped)
|
||||||
{
|
{
|
||||||
/* No need to call PyUnicode_READY(self) because fixup() which does it. */
|
/* Obscure special case. */
|
||||||
const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
|
if (c == 0x3A3) {
|
||||||
const int kind = PyUnicode_KIND(self);
|
mapped[0] = handle_capital_sigma(kind, data, length, i);
|
||||||
void *data = PyUnicode_DATA(self);
|
return 1;
|
||||||
int touched = 0;
|
|
||||||
Py_UCS4 maxchar = 0;
|
|
||||||
Py_ssize_t i;
|
|
||||||
|
|
||||||
for(i = 0; i < len; ++i) {
|
|
||||||
const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
|
|
||||||
Py_UCS4 nu = 0;
|
|
||||||
|
|
||||||
if (Py_UNICODE_ISUPPER(ch))
|
|
||||||
nu = Py_UNICODE_TOLOWER(ch);
|
|
||||||
else if (Py_UNICODE_ISLOWER(ch))
|
|
||||||
nu = Py_UNICODE_TOUPPER(ch);
|
|
||||||
|
|
||||||
if (nu != 0) {
|
|
||||||
if (nu > maxchar)
|
|
||||||
maxchar = nu;
|
|
||||||
PyUnicode_WRITE(kind, data, i, nu);
|
|
||||||
touched = 1;
|
|
||||||
}
|
|
||||||
else if (ch > maxchar)
|
|
||||||
maxchar = ch;
|
|
||||||
}
|
}
|
||||||
|
return _PyUnicode_ToLowerFull(c, mapped);
|
||||||
if (touched)
|
|
||||||
return maxchar;
|
|
||||||
else
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static Py_UCS4
|
static Py_ssize_t
|
||||||
fixcapitalize(PyObject *self)
|
do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
|
||||||
{
|
{
|
||||||
/* No need to call PyUnicode_READY(self) because fixup() which does it. */
|
Py_ssize_t i, k = 0;
|
||||||
const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
|
int n_res, j;
|
||||||
const int kind = PyUnicode_KIND(self);
|
Py_UCS4 c, mapped[3];
|
||||||
void *data = PyUnicode_DATA(self);
|
|
||||||
int touched = 0;
|
|
||||||
Py_UCS4 maxchar = 0;
|
|
||||||
Py_ssize_t i = 0;
|
|
||||||
Py_UCS4 ch;
|
|
||||||
|
|
||||||
if (len == 0)
|
c = PyUnicode_READ(kind, data, 0);
|
||||||
return 0;
|
n_res = _PyUnicode_ToUpperFull(c, mapped);
|
||||||
|
for (j = 0; j < n_res; j++) {
|
||||||
ch = PyUnicode_READ(kind, data, i);
|
if (mapped[j] > *maxchar)
|
||||||
if (!Py_UNICODE_ISUPPER(ch)) {
|
*maxchar = mapped[j];
|
||||||
maxchar = Py_UNICODE_TOUPPER(ch);
|
res[k++] = mapped[j];
|
||||||
PyUnicode_WRITE(kind, data, i, maxchar);
|
|
||||||
touched = 1;
|
|
||||||
}
|
}
|
||||||
++i;
|
for (i = 1; i < length; i++) {
|
||||||
for(; i < len; ++i) {
|
c = PyUnicode_READ(kind, data, i);
|
||||||
ch = PyUnicode_READ(kind, data, i);
|
n_res = lower_ucs4(kind, data, length, i, c, mapped);
|
||||||
if (!Py_UNICODE_ISLOWER(ch)) {
|
for (j = 0; j < n_res; j++) {
|
||||||
const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
|
if (mapped[j] > *maxchar)
|
||||||
if (lo > maxchar)
|
*maxchar = mapped[j];
|
||||||
maxchar = lo;
|
res[k++] = mapped[j];
|
||||||
PyUnicode_WRITE(kind, data, i, lo);
|
|
||||||
touched = 1;
|
|
||||||
}
|
}
|
||||||
else if (ch > maxchar)
|
|
||||||
maxchar = ch;
|
|
||||||
}
|
}
|
||||||
|
return k;
|
||||||
if (touched)
|
|
||||||
return maxchar;
|
|
||||||
else
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static Py_UCS4
|
static Py_ssize_t
|
||||||
fixtitle(PyObject *self)
|
do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
|
||||||
|
Py_ssize_t i, k = 0;
|
||||||
|
|
||||||
|
for (i = 0; i < length; i++) {
|
||||||
|
Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
|
||||||
|
int n_res, j;
|
||||||
|
if (Py_UNICODE_ISUPPER(c)) {
|
||||||
|
n_res = lower_ucs4(kind, data, length, i, c, mapped);
|
||||||
|
}
|
||||||
|
else if (Py_UNICODE_ISLOWER(c)) {
|
||||||
|
n_res = _PyUnicode_ToUpperFull(c, mapped);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
n_res = 1;
|
||||||
|
mapped[0] = c;
|
||||||
|
}
|
||||||
|
for (j = 0; j < n_res; j++) {
|
||||||
|
if (mapped[j] > *maxchar)
|
||||||
|
*maxchar = mapped[j];
|
||||||
|
res[k++] = mapped[j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return k;
|
||||||
|
}
|
||||||
|
|
||||||
|
static Py_ssize_t
|
||||||
|
do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
|
||||||
|
Py_UCS4 *maxchar, int lower)
|
||||||
{
|
{
|
||||||
/* No need to call PyUnicode_READY(self) because fixup() which does it. */
|
Py_ssize_t i, k = 0;
|
||||||
const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
|
|
||||||
const int kind = PyUnicode_KIND(self);
|
for (i = 0; i < length; i++) {
|
||||||
void *data = PyUnicode_DATA(self);
|
Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
|
||||||
Py_UCS4 maxchar = 0;
|
int n_res, j;
|
||||||
Py_ssize_t i = 0;
|
if (lower)
|
||||||
|
n_res = lower_ucs4(kind, data, length, i, c, mapped);
|
||||||
|
else
|
||||||
|
n_res = _PyUnicode_ToUpperFull(c, mapped);
|
||||||
|
for (j = 0; j < n_res; j++) {
|
||||||
|
if (mapped[j] > *maxchar)
|
||||||
|
*maxchar = mapped[j];
|
||||||
|
res[k++] = mapped[j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return k;
|
||||||
|
}
|
||||||
|
|
||||||
|
static Py_ssize_t
|
||||||
|
do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
|
||||||
|
{
|
||||||
|
return do_upper_or_lower(kind, data, length, res, maxchar, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
static Py_ssize_t
|
||||||
|
do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
|
||||||
|
{
|
||||||
|
return do_upper_or_lower(kind, data, length, res, maxchar, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
case_operation(PyObject *self,
|
||||||
|
Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
|
||||||
|
{
|
||||||
|
PyObject *res = NULL;
|
||||||
|
Py_ssize_t length, newlength = 0;
|
||||||
|
int kind, outkind;
|
||||||
|
void *data, *outdata;
|
||||||
|
Py_UCS4 maxchar = 0, *tmp, *tmpend;
|
||||||
|
|
||||||
|
if (PyUnicode_READY(self) == -1)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
kind = PyUnicode_KIND(self);
|
||||||
|
data = PyUnicode_DATA(self);
|
||||||
|
length = PyUnicode_GET_LENGTH(self);
|
||||||
|
tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
|
||||||
|
if (tmp == NULL)
|
||||||
|
return PyErr_NoMemory();
|
||||||
|
newlength = perform(kind, data, length, tmp, &maxchar);
|
||||||
|
res = PyUnicode_New(newlength, maxchar);
|
||||||
|
if (res == NULL)
|
||||||
|
goto leave;
|
||||||
|
tmpend = tmp + newlength;
|
||||||
|
outdata = PyUnicode_DATA(res);
|
||||||
|
outkind = PyUnicode_KIND(res);
|
||||||
|
switch (outkind) {
|
||||||
|
case PyUnicode_1BYTE_KIND:
|
||||||
|
_PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
|
||||||
|
break;
|
||||||
|
case PyUnicode_2BYTE_KIND:
|
||||||
|
_PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
|
||||||
|
break;
|
||||||
|
case PyUnicode_4BYTE_KIND:
|
||||||
|
memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
assert(0);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
leave:
|
||||||
|
PyMem_FREE(tmp);
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
static Py_ssize_t
|
||||||
|
do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
|
||||||
|
{
|
||||||
|
Py_ssize_t i, k = 0;
|
||||||
int previous_is_cased;
|
int previous_is_cased;
|
||||||
|
|
||||||
/* Shortcut for single character strings */
|
|
||||||
if (len == 1) {
|
|
||||||
const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
|
|
||||||
const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
|
|
||||||
if (ti != ch) {
|
|
||||||
PyUnicode_WRITE(kind, data, i, ti);
|
|
||||||
return ti;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
previous_is_cased = 0;
|
previous_is_cased = 0;
|
||||||
for(; i < len; ++i) {
|
for (i = 0; i < length; i++) {
|
||||||
const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
|
const Py_UCS4 c = PyUnicode_READ(kind, data, i);
|
||||||
Py_UCS4 nu;
|
Py_UCS4 mapped[3];
|
||||||
|
int n_res, j;
|
||||||
|
|
||||||
if (previous_is_cased)
|
if (previous_is_cased)
|
||||||
nu = Py_UNICODE_TOLOWER(ch);
|
n_res = lower_ucs4(kind, data, length, i, c, mapped);
|
||||||
else
|
else
|
||||||
nu = Py_UNICODE_TOTITLE(ch);
|
n_res = _PyUnicode_ToTitleFull(c, mapped);
|
||||||
|
|
||||||
if (nu > maxchar)
|
for (j = 0; j < n_res; j++) {
|
||||||
maxchar = nu;
|
if (mapped[j] > *maxchar)
|
||||||
PyUnicode_WRITE(kind, data, i, nu);
|
*maxchar = mapped[j];
|
||||||
|
res[k++] = mapped[j];
|
||||||
|
}
|
||||||
|
|
||||||
if (Py_UNICODE_ISLOWER(ch) ||
|
previous_is_cased = _PyUnicode_IsCased(c);
|
||||||
Py_UNICODE_ISUPPER(ch) ||
|
|
||||||
Py_UNICODE_ISTITLE(ch))
|
|
||||||
previous_is_cased = 1;
|
|
||||||
else
|
|
||||||
previous_is_cased = 0;
|
|
||||||
}
|
}
|
||||||
return maxchar;
|
return k;
|
||||||
}
|
}
|
||||||
|
|
||||||
PyObject *
|
PyObject *
|
||||||
|
@ -10445,7 +10480,7 @@ characters, all remaining cased characters have lower case.");
|
||||||
static PyObject*
|
static PyObject*
|
||||||
unicode_title(PyObject *self)
|
unicode_title(PyObject *self)
|
||||||
{
|
{
|
||||||
return fixup(self, fixtitle);
|
return case_operation(self, do_title);
|
||||||
}
|
}
|
||||||
|
|
||||||
PyDoc_STRVAR(capitalize__doc__,
|
PyDoc_STRVAR(capitalize__doc__,
|
||||||
|
@ -10457,7 +10492,11 @@ have upper case and the rest lower case.");
|
||||||
static PyObject*
|
static PyObject*
|
||||||
unicode_capitalize(PyObject *self)
|
unicode_capitalize(PyObject *self)
|
||||||
{
|
{
|
||||||
return fixup(self, fixcapitalize);
|
if (PyUnicode_READY(self) == -1)
|
||||||
|
return NULL;
|
||||||
|
if (PyUnicode_GET_LENGTH(self) == 0)
|
||||||
|
return unicode_result_unchanged(self);
|
||||||
|
return case_operation(self, do_capitalize);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
|
@ -11715,7 +11754,11 @@ Return a copy of the string S converted to lowercase.");
|
||||||
static PyObject*
|
static PyObject*
|
||||||
unicode_lower(PyObject *self)
|
unicode_lower(PyObject *self)
|
||||||
{
|
{
|
||||||
return fixup(self, fixlower);
|
if (PyUnicode_READY(self) == -1)
|
||||||
|
return NULL;
|
||||||
|
if (PyUnicode_IS_ASCII(self))
|
||||||
|
return ascii_upper_or_lower(self, 1);
|
||||||
|
return case_operation(self, do_lower);
|
||||||
}
|
}
|
||||||
|
|
||||||
#define LEFTSTRIP 0
|
#define LEFTSTRIP 0
|
||||||
|
@ -12604,7 +12647,7 @@ and vice versa.");
|
||||||
static PyObject*
|
static PyObject*
|
||||||
unicode_swapcase(PyObject *self)
|
unicode_swapcase(PyObject *self)
|
||||||
{
|
{
|
||||||
return fixup(self, fixswapcase);
|
return case_operation(self, do_swapcase);
|
||||||
}
|
}
|
||||||
|
|
||||||
PyDoc_STRVAR(maketrans__doc__,
|
PyDoc_STRVAR(maketrans__doc__,
|
||||||
|
@ -12750,7 +12793,11 @@ Return a copy of S converted to uppercase.");
|
||||||
static PyObject*
|
static PyObject*
|
||||||
unicode_upper(PyObject *self)
|
unicode_upper(PyObject *self)
|
||||||
{
|
{
|
||||||
return fixup(self, fixupper);
|
if (PyUnicode_READY(self) == -1)
|
||||||
|
return NULL;
|
||||||
|
if (PyUnicode_IS_ASCII(self))
|
||||||
|
return ascii_upper_or_lower(self, 0);
|
||||||
|
return case_operation(self, do_upper);
|
||||||
}
|
}
|
||||||
|
|
||||||
PyDoc_STRVAR(zfill__doc__,
|
PyDoc_STRVAR(zfill__doc__,
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -22,6 +22,7 @@
|
||||||
# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
|
# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
|
||||||
# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
|
# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
|
||||||
# 2011-10-21 ezio add support for name aliases and named sequences
|
# 2011-10-21 ezio add support for name aliases and named sequences
|
||||||
|
# 2012-01 benjamin add full case mappings
|
||||||
#
|
#
|
||||||
# written by Fredrik Lundh (fredrik@pythonware.com)
|
# written by Fredrik Lundh (fredrik@pythonware.com)
|
||||||
#
|
#
|
||||||
|
@ -47,6 +48,7 @@ DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
|
||||||
LINE_BREAK = "LineBreak%s.txt"
|
LINE_BREAK = "LineBreak%s.txt"
|
||||||
NAME_ALIASES = "NameAliases%s.txt"
|
NAME_ALIASES = "NameAliases%s.txt"
|
||||||
NAMED_SEQUENCES = "NamedSequences%s.txt"
|
NAMED_SEQUENCES = "NamedSequences%s.txt"
|
||||||
|
SPECIAL_CASING = "SpecialCasing%s.txt"
|
||||||
|
|
||||||
# Private Use Areas -- in planes 1, 15, 16
|
# Private Use Areas -- in planes 1, 15, 16
|
||||||
PUA_1 = range(0xE000, 0xF900)
|
PUA_1 = range(0xE000, 0xF900)
|
||||||
|
@ -84,8 +86,10 @@ UPPER_MASK = 0x80
|
||||||
XID_START_MASK = 0x100
|
XID_START_MASK = 0x100
|
||||||
XID_CONTINUE_MASK = 0x200
|
XID_CONTINUE_MASK = 0x200
|
||||||
PRINTABLE_MASK = 0x400
|
PRINTABLE_MASK = 0x400
|
||||||
NODELTA_MASK = 0x800
|
NUMERIC_MASK = 0x800
|
||||||
NUMERIC_MASK = 0x1000
|
CASE_IGNORABLE_MASK = 0x1000
|
||||||
|
CASED_MASK = 0x2000
|
||||||
|
EXTENDED_CASE_MASK = 0x4000
|
||||||
|
|
||||||
# these ranges need to match unicodedata.c:is_unified_ideograph
|
# these ranges need to match unicodedata.c:is_unified_ideograph
|
||||||
cjk_ranges = [
|
cjk_ranges = [
|
||||||
|
@ -384,6 +388,7 @@ def makeunicodetype(unicode, trace):
|
||||||
numeric = {}
|
numeric = {}
|
||||||
spaces = []
|
spaces = []
|
||||||
linebreaks = []
|
linebreaks = []
|
||||||
|
extra_casing = []
|
||||||
|
|
||||||
for char in unicode.chars:
|
for char in unicode.chars:
|
||||||
record = unicode.table[char]
|
record = unicode.table[char]
|
||||||
|
@ -396,7 +401,7 @@ def makeunicodetype(unicode, trace):
|
||||||
delta = True
|
delta = True
|
||||||
if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
|
if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
|
||||||
flags |= ALPHA_MASK
|
flags |= ALPHA_MASK
|
||||||
if category == "Ll":
|
if "Lowercase" in properties:
|
||||||
flags |= LOWER_MASK
|
flags |= LOWER_MASK
|
||||||
if 'Line_Break' in properties or bidirectional == "B":
|
if 'Line_Break' in properties or bidirectional == "B":
|
||||||
flags |= LINEBREAK_MASK
|
flags |= LINEBREAK_MASK
|
||||||
|
@ -406,7 +411,7 @@ def makeunicodetype(unicode, trace):
|
||||||
spaces.append(char)
|
spaces.append(char)
|
||||||
if category == "Lt":
|
if category == "Lt":
|
||||||
flags |= TITLE_MASK
|
flags |= TITLE_MASK
|
||||||
if category == "Lu":
|
if "Uppercase" in properties:
|
||||||
flags |= UPPER_MASK
|
flags |= UPPER_MASK
|
||||||
if char == ord(" ") or category[0] not in ("C", "Z"):
|
if char == ord(" ") or category[0] not in ("C", "Z"):
|
||||||
flags |= PRINTABLE_MASK
|
flags |= PRINTABLE_MASK
|
||||||
|
@ -414,35 +419,41 @@ def makeunicodetype(unicode, trace):
|
||||||
flags |= XID_START_MASK
|
flags |= XID_START_MASK
|
||||||
if "XID_Continue" in properties:
|
if "XID_Continue" in properties:
|
||||||
flags |= XID_CONTINUE_MASK
|
flags |= XID_CONTINUE_MASK
|
||||||
# use delta predictor for upper/lower/title if it fits
|
if "Cased" in properties:
|
||||||
if record[12]:
|
flags |= CASED_MASK
|
||||||
upper = int(record[12], 16)
|
if "Case_Ignorable" in properties:
|
||||||
|
flags |= CASE_IGNORABLE_MASK
|
||||||
|
sc = unicode.special_casing.get(char)
|
||||||
|
if sc is None:
|
||||||
|
if record[12]:
|
||||||
|
upper = int(record[12], 16)
|
||||||
|
else:
|
||||||
|
upper = char
|
||||||
|
if record[13]:
|
||||||
|
lower = int(record[13], 16)
|
||||||
|
else:
|
||||||
|
lower = char
|
||||||
|
if record[14]:
|
||||||
|
title = int(record[14], 16)
|
||||||
|
else:
|
||||||
|
title = upper
|
||||||
|
if upper == lower == title:
|
||||||
|
upper = lower = title = 0
|
||||||
else:
|
else:
|
||||||
upper = char
|
# This happens when some character maps to more than one
|
||||||
if record[13]:
|
# character in uppercase, lowercase, or titlecase. The extra
|
||||||
lower = int(record[13], 16)
|
# characters are stored in a different array.
|
||||||
else:
|
flags |= EXTENDED_CASE_MASK
|
||||||
lower = char
|
lower = len(extra_casing) | (len(sc[0]) << 24)
|
||||||
if record[14]:
|
extra_casing.extend(sc[0])
|
||||||
title = int(record[14], 16)
|
upper = len(extra_casing) | (len(sc[2]) << 24)
|
||||||
else:
|
extra_casing.extend(sc[2])
|
||||||
# UCD.html says that a missing title char means that
|
# Title is probably equal to upper.
|
||||||
# it defaults to the uppercase character, not to the
|
if sc[1] == sc[2]:
|
||||||
# character itself. Apparently, in the current UCD (5.x)
|
title = upper
|
||||||
# this feature is never used
|
else:
|
||||||
title = upper
|
title = len(extra_casing) | (len(sc[1]) << 24)
|
||||||
upper_d = upper - char
|
extra_casing.extend(sc[1])
|
||||||
lower_d = lower - char
|
|
||||||
title_d = title - char
|
|
||||||
if -32768 <= upper_d <= 32767 and \
|
|
||||||
-32768 <= lower_d <= 32767 and \
|
|
||||||
-32768 <= title_d <= 32767:
|
|
||||||
# use deltas
|
|
||||||
upper = upper_d & 0xffff
|
|
||||||
lower = lower_d & 0xffff
|
|
||||||
title = title_d & 0xffff
|
|
||||||
else:
|
|
||||||
flags |= NODELTA_MASK
|
|
||||||
# decimal digit, integer digit
|
# decimal digit, integer digit
|
||||||
decimal = 0
|
decimal = 0
|
||||||
if record[6]:
|
if record[6]:
|
||||||
|
@ -469,6 +480,7 @@ def makeunicodetype(unicode, trace):
|
||||||
print(sum(map(len, numeric.values())), "numeric code points")
|
print(sum(map(len, numeric.values())), "numeric code points")
|
||||||
print(len(spaces), "whitespace code points")
|
print(len(spaces), "whitespace code points")
|
||||||
print(len(linebreaks), "linebreak code points")
|
print(len(linebreaks), "linebreak code points")
|
||||||
|
print(len(extra_casing), "extended case array")
|
||||||
|
|
||||||
print("--- Writing", FILE, "...")
|
print("--- Writing", FILE, "...")
|
||||||
|
|
||||||
|
@ -482,6 +494,14 @@ def makeunicodetype(unicode, trace):
|
||||||
print("};", file=fp)
|
print("};", file=fp)
|
||||||
print(file=fp)
|
print(file=fp)
|
||||||
|
|
||||||
|
print("/* extended case mappings */", file=fp)
|
||||||
|
print(file=fp)
|
||||||
|
print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
|
||||||
|
for c in extra_casing:
|
||||||
|
print(" %d," % c, file=fp)
|
||||||
|
print("};", file=fp)
|
||||||
|
print(file=fp)
|
||||||
|
|
||||||
# split decomposition index table
|
# split decomposition index table
|
||||||
index1, index2, shift = splitbins(index, trace)
|
index1, index2, shift = splitbins(index, trace)
|
||||||
|
|
||||||
|
@ -1070,6 +1090,23 @@ class UnicodeData:
|
||||||
# Patch the numeric field
|
# Patch the numeric field
|
||||||
if table[i] is not None:
|
if table[i] is not None:
|
||||||
table[i][8] = value
|
table[i][8] = value
|
||||||
|
sc = self.special_casing = {}
|
||||||
|
with open_data(SPECIAL_CASING, version) as file:
|
||||||
|
for s in file:
|
||||||
|
s = s[:-1].split('#', 1)[0]
|
||||||
|
if not s:
|
||||||
|
continue
|
||||||
|
data = s.split("; ")
|
||||||
|
if data[4]:
|
||||||
|
# We ignore all conditionals (since they depend on
|
||||||
|
# languages) except for one, which is hardcoded. See
|
||||||
|
# handle_capital_sigma in unicodeobject.c.
|
||||||
|
continue
|
||||||
|
c = int(data[0], 16)
|
||||||
|
lower = [int(char, 16) for char in data[1].split()]
|
||||||
|
title = [int(char, 16) for char in data[2].split()]
|
||||||
|
upper = [int(char, 16) for char in data[3].split()]
|
||||||
|
sc[c] = (lower, title, upper)
|
||||||
|
|
||||||
def uselatin1(self):
|
def uselatin1(self):
|
||||||
# restrict character range to ISO Latin 1
|
# restrict character range to ISO Latin 1
|
||||||
|
|
Loading…
Reference in New Issue