Bill Tutt:

Make unicode_compare a true UTF-16 compare function (includes
support for surrogates).
This commit is contained in:
Marc-André Lemburg 2000-07-04 09:51:07 +00:00
parent 4b0200e322
commit 1e7205a62a
1 changed files with 29 additions and 6 deletions

View File

@ -3045,10 +3045,23 @@ unicode_center(PyUnicodeObject *self, PyObject *args)
return (PyObject*) pad(self, left, marg - left, ' '); return (PyObject*) pad(self, left, marg - left, ' ');
} }
/* speedy UTF-16 code point order comparison */
/* gleaned from: */
/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
/*
 * Per-block adjustment that remaps UTF-16 code units so that plain
 * numeric comparison of the adjusted values follows code point order.
 *
 * The table is indexed by the top five bits of a code unit (c >> 11),
 * giving 32 blocks of 0x800 code units each.  The selected entry is
 * added to the code unit using 16-bit wrap-around arithmetic (the
 * caller keeps the value in an unsigned short, which its own comments
 * describe as 16 bits):
 *
 *   blocks  0..26 (0x0000-0xD7FF): + 0       -> unchanged
 *   block   27    (0xD800-0xDFFF): + 0x2000  -> 0xF800-0xFFFF
 *   blocks 28..31 (0xE000-0xFFFF): + 0xf800  -> 0xD800-0xF7FF
 *
 * The surrogate block therefore moves above everything else in the
 * 16-bit range, and the block above it slides down to fill the gap.
 *
 * The table is read-only, hence const.
 */
static const unsigned short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, 0xf800, 0xf800, 0xf800, 0xf800
};
static int static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{ {
int len1, len2; int len1, len2;
Py_UNICODE *s1 = str1->str; Py_UNICODE *s1 = str1->str;
Py_UNICODE *s2 = str2->str; Py_UNICODE *s2 = str2->str;
@ -3056,11 +3069,21 @@ unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
len2 = str2->length; len2 = str2->length;
while (len1 > 0 && len2 > 0) { while (len1 > 0 && len2 > 0) {
int cmp = (*s1++) - (*s2++); unsigned short c1, c2; /* 16 bits */
if (cmp) int diff; /* 32 bits */
/* This should make Christian happy! */
return (cmp < 0) ? -1 : (cmp != 0); c1 = *s1++;
len1--, len2--; c2 = *s2++;
if (c1 > (1<<11) * 26)
c1 += utf16Fixup[c1>>11];
if (c2 > (1<<11) * 26)
c2 += utf16Fixup[c2>>11];
/* now c1 and c2 are in UTF-32-compatible order */
diff = (int)c1 - (int)c2;
if (diff)
return (diff < 0) ? -1 : (diff != 0);
len1--; len2--;
} }
return (len1 < len2) ? -1 : (len1 != len2); return (len1 < len2) ? -1 : (len1 != len2);