Merged revisions 72054 via svnmerge from

svn+ssh://pythondev@svn.python.org/python/trunk

........
  r72054 | antoine.pitrou | 2009-04-27 23:53:26 +0200 (lun., 27 avril 2009) | 5 lines

  Issue #1734234: Massively speed up `unicodedata.normalize()` when the
  string is already in normalized form, by performing a quick check beforehand.
  Original patch by Rauli Ruohonen.
........
This commit is contained in:
Antoine Pitrou 2009-04-27 22:31:40 +00:00
parent 57f3d93552
commit 7a0fedfd1d
5 changed files with 2041 additions and 1731 deletions

View File

@ -616,6 +616,7 @@ Craig Rowland
Paul Rubin
Sam Ruby
Audun S. Runde
Rauli Ruohonen
Jeff Rush
Sam Rushing
Mark Russell

View File

@ -92,6 +92,10 @@ Installation
Library
-------
- Issue #1734234: Massively speed up ``unicodedata.normalize()`` when the
string is already in normalized form, by performing a quick check beforehand.
Original patch by Rauli Ruohonen.
- Issue #5853: calling a function of the mimetypes module from several threads
at once could hit the recursion limit if the mimetypes database hadn't been
initialized before.

View File

@ -27,6 +27,7 @@ typedef struct {
const unsigned char mirrored; /* true if mirrored in bidir mode */
const unsigned char east_asian_width; /* index into
_PyUnicode_EastAsianWidth */
const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;
typedef struct change_record {
@ -723,6 +724,38 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
return result;
}
/* Quick pre-check used before running a full normalization pass.

   self  - module or UCD object the call was made on; may be NULL
   input - unicode object to inspect
   nfc   - non-zero to test a composed form (NFC/NFKC), zero for NFD/NFKD
   k     - non-zero to test a compatibility form (NFKC/NFKD)

   Returns 1 only when the input is certainly already in the requested form;
   returns 0 whenever it might still need to be normalized. */
static int
is_normalized(PyObject *self, PyObject *input, int nfc, int k)
{
    Py_UNICODE *cursor, *stop;
    unsigned char last_ccc = 0;
    unsigned char qc_mask;
    int shift = 0;

    /* When an older version of the database is requested the quick-check
       data must not be used, so report "not known to be normalized". */
    if (self && UCD_Check(self))
        return 0;

    /* Each form owns a two-bit field in normalization_quick_check:
       NFD at bit 0, NFKD at bit 2, NFC at bit 4, NFKC at bit 6.
       Field values are 0=Yes, 1=Maybe, 2=No, as described in
       http://unicode.org/reports/tr15/#Annex8. */
    if (nfc)
        shift += 4;
    if (k)
        shift += 2;
    qc_mask = 3 << shift;

    cursor = PyUnicode_AS_UNICODE(input);
    stop = cursor + PyUnicode_GET_SIZE(input);
    for (; cursor < stop; cursor++) {
        const _PyUnicode_DatabaseRecord *rec = _getrecord_ex(*cursor);
        unsigned char ccc = rec->combining;

        /* Any non-zero field value (Maybe or No) means this string might
           need normalization. */
        if (rec->normalization_quick_check & qc_mask)
            return 0;
        /* A non-zero combining class lower than the previous one is a
           non-canonical sort order, hence not normalized. */
        if (ccc != 0 && ccc < last_ccc)
            return 0;
        last_ccc = ccc;
    }
    /* Every character passed both checks: certainly normalized. */
    return 1;
}
PyDoc_STRVAR(unicodedata_normalize__doc__,
"normalize(form, unistr)\n\
\n\
@ -746,14 +779,34 @@ unicodedata_normalize(PyObject *self, PyObject *args)
return input;
}
if (strcmp(form, "NFC") == 0)
if (strcmp(form, "NFC") == 0) {
if (is_normalized(self, input, 1, 0)) {
Py_INCREF(input);
return input;
}
return nfc_nfkc(self, input, 0);
if (strcmp(form, "NFKC") == 0)
}
if (strcmp(form, "NFKC") == 0) {
if (is_normalized(self, input, 1, 1)) {
Py_INCREF(input);
return input;
}
return nfc_nfkc(self, input, 1);
if (strcmp(form, "NFD") == 0)
}
if (strcmp(form, "NFD") == 0) {
if (is_normalized(self, input, 0, 0)) {
Py_INCREF(input);
return input;
}
return nfd_nfkd(self, input, 0);
if (strcmp(form, "NFKD") == 0)
}
if (strcmp(form, "NFKD") == 0) {
if (is_normalized(self, input, 0, 1)) {
Py_INCREF(input);
return input;
}
return nfd_nfkd(self, input, 1);
}
PyErr_SetString(PyExc_ValueError, "invalid normalization form");
return NULL;
}

File diff suppressed because it is too large Load Diff

View File

@ -36,6 +36,7 @@ UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
old_versions = ["3.2.0"]
@ -72,7 +73,8 @@ def maketables(trace=0):
unicode = UnicodeData(UNICODE_DATA % version,
COMPOSITION_EXCLUSIONS % version,
EASTASIAN_WIDTH % version,
DERIVED_CORE_PROPERTIES % version)
DERIVED_CORE_PROPERTIES % version,
DERIVEDNORMALIZATION_PROPS % version)
print(len(list(filter(None, unicode.table))), "characters")
@ -94,7 +96,7 @@ def maketables(trace=0):
def makeunicodedata(unicode, trace):
dummy = (0, 0, 0, 0, 0)
dummy = (0, 0, 0, 0, 0, 0)
table = [dummy]
cache = {0: dummy}
index = [0] * len(unicode.chars)
@ -114,8 +116,10 @@ def makeunicodedata(unicode, trace):
bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
mirrored = record[9] == "Y"
eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
normalizationquickcheck = record[17]
item = (
category, combining, bidirectional, mirrored, eastasianwidth
category, combining, bidirectional, mirrored, eastasianwidth,
normalizationquickcheck
)
# add entry to index and item tables
i = cache.get(item)
@ -227,7 +231,7 @@ def makeunicodedata(unicode, trace):
print("/* a list of unique database records */", file=fp)
print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
for item in table:
print(" {%d, %d, %d, %d, %d}," % item, file=fp)
print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
print("};", file=fp)
print(file=fp)
@ -717,7 +721,7 @@ class UnicodeData:
# derived-props] (17)
def __init__(self, filename, exclusions, eastasianwidth,
derivedprops, expand=1):
derivedprops, derivednormalizationprops=None, expand=1):
self.changed = []
file = open(filename)
table = [None] * 0x110000
@ -803,6 +807,29 @@ class UnicodeData:
# apply to unassigned code points; ignore them
table[char][-1].add(p)
if derivednormalizationprops:
quickchecks = [0] * 0x110000 # default is Yes
qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
for s in open(derivednormalizationprops):
if '#' in s:
s = s[:s.index('#')]
s = [i.strip() for i in s.split(';')]
if len(s) < 2 or s[1] not in qc_order:
continue
quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
quickcheck_shift = qc_order.index(s[1])*2
quickcheck <<= quickcheck_shift
if '..' not in s[0]:
first = last = int(s[0], 16)
else:
first, last = [int(c, 16) for c in s[0].split('..')]
for char in range(first, last+1):
assert not (quickchecks[char]>>quickcheck_shift)&3
quickchecks[char] |= quickcheck
for i in range(0, 0x110000):
if table[i] is not None:
table[i].append(quickchecks[i])
def uselatin1(self):
# restrict character range to ISO Latin 1
self.chars = list(range(256))