Issue #1734234: Massively speed up `unicodedata.normalize()` when the
string is already in normalized form, by performing a quick check beforehand.
Original patch by Rauli Ruohonen.
2009-04-27 21:53:26 +00:00 · 2009-04-27 21:53:26 +00:00 · e988e286b2
parent 8b8f8cc1b0
commit e988e286b2
5 changed files with 2041 additions and 1731 deletions
--- a/Misc/ACKS
+++ b/Misc/ACKS
@ -612,6 +612,7 @@ Craig Rowland
 Paul Rubin
 Sam Ruby
 Audun S. Runde
+Rauli Ruohonen
 Jeff Rush
 Sam Rushing
 Mark Russell
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -255,6 +255,10 @@ Core and Builtins
 Library
 -------

+- Issue #1734234: Massively speedup ``unicodedata.normalize()`` when the
+  string is already in normalized form, by performing a quick check beforehand.
+  Original patch by Rauli Ruohonen.
+
 - Issue #5853: calling a function of the mimetypes module from several threads
  at once could hit the recursion limit if the mimetypes database hadn't been
  initialized before.
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@ -27,6 +27,7 @@ typedef struct {
    const unsigned char mirrored;	/* true if mirrored in bidir mode */
    const unsigned char east_asian_width;	/* index into
 						   _PyUnicode_EastAsianWidth */
+    const unsigned char normalization_quick_check; /* see is_normalized() */
 } _PyUnicode_DatabaseRecord;

 typedef struct change_record {
@ -720,7 +721,39 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
        PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
    return result;
 }
-		
+
+/* Quick check: return 1 if input is certainly normalized, 0 if it might not be. */
+static int
+is_normalized(PyObject *self, PyObject *input, int nfc, int k)
+{
+    Py_UNICODE *i, *end;
+    unsigned char prev_combining = 0, quickcheck_mask;
+
+    /* A non-NULL self means an older version of the database is requested;
+       quickchecks must be disabled, so always answer "might not be". */
+    if (self != NULL)
+        return 0;
+
+    /* Two bits per form at shift 0=NFD, 2=NFKD, 4=NFC, 6=NFKC; 0=Yes, 1=Maybe,
+       2=No, as described in http://unicode.org/reports/tr15/#Annex8. */
+    quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
+
+    i = PyUnicode_AS_UNICODE(input);
+    end = i + PyUnicode_GET_SIZE(input);
+    while (i < end) {
+        const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
+        unsigned char combining = record->combining;
+        unsigned char quickcheck = record->normalization_quick_check;
+
+        if (quickcheck & quickcheck_mask)
+            return 0; /* Maybe or No: this string might need normalization */
+        if (combining && prev_combining > combining)
+            return 0; /* combining class decreased: non-canonical order */
+        prev_combining = combining;
+    }
+    return 1; /* every character was Yes and in order: certainly normalized */
+}
+
 PyDoc_STRVAR(unicodedata_normalize__doc__,
 "normalize(form, unistr)\n\
 \n\
@ -744,14 +777,34 @@ unicodedata_normalize(PyObject *self, PyObject *args)
        return input;
    }

-    if (strcmp(form, "NFC") == 0)
+    if (strcmp(form, "NFC") == 0) {
+        if (is_normalized(self, input, 1, 0)) {
+            Py_INCREF(input);
+            return input;
+        }
        return nfc_nfkc(self, input, 0);
-    if (strcmp(form, "NFKC") == 0)
+    }
+    if (strcmp(form, "NFKC") == 0) {
+        if (is_normalized(self, input, 1, 1)) {
+            Py_INCREF(input);
+            return input;
+        }
        return nfc_nfkc(self, input, 1);
-    if (strcmp(form, "NFD") == 0)
+    }
+    if (strcmp(form, "NFD") == 0) {
+        if (is_normalized(self, input, 0, 0)) {
+            Py_INCREF(input);
+            return input;
+        }
        return nfd_nfkd(self, input, 0);
-    if (strcmp(form, "NFKD") == 0)
+    }
+    if (strcmp(form, "NFKD") == 0) {
+        if (is_normalized(self, input, 0, 1)) {
+            Py_INCREF(input);
+            return input;
+        }
        return nfd_nfkd(self, input, 1);
+    }
    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
    return NULL;
 }
--- a/Modules/unicodedata_db.h
+++ b/Modules/unicodedata_db.h
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@ -34,6 +34,7 @@ UNIDATA_VERSION = "5.1.0"
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
+DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"

 old_versions = ["3.2.0"]

@ -66,7 +67,8 @@ def maketables(trace=0):
    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
-                          EASTASIAN_WIDTH % version)
+                          EASTASIAN_WIDTH % version,
+                          DERIVEDNORMALIZATION_PROPS % version)

    print len(filter(None, unicode.table)), "characters"

@ -87,7 +89,7 @@ def maketables(trace=0):

 def makeunicodedata(unicode, trace):

-    dummy = (0, 0, 0, 0, 0)
+    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
@ -107,8 +109,10 @@ def makeunicodedata(unicode, trace):
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
+            normalizationquickcheck = record[16]
            item = (
-                category, combining, bidirectional, mirrored, eastasianwidth
+                category, combining, bidirectional, mirrored, eastasianwidth,
+                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
@ -222,7 +226,7 @@ def makeunicodedata(unicode, trace):
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
-        print >>fp, "    {%d, %d, %d, %d, %d}," % item
+        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

@ -698,7 +702,8 @@ import sys

 class UnicodeData:

-    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
+    def __init__(self, filename, exclusions, eastasianwidth,
+                 derivednormalizationprops=None, expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
@ -761,6 +766,28 @@ class UnicodeData:
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])
+        if derivednormalizationprops:
+            quickchecks = [0] * 0x110000 # default is Yes
+            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
+            for s in open(derivednormalizationprops):
+                if '#' in s:
+                    s = s[:s.index('#')]
+                s = [i.strip() for i in s.split(';')]
+                if len(s) < 2 or s[1] not in qc_order:
+                    continue
+                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
+                quickcheck_shift = qc_order.index(s[1])*2
+                quickcheck <<= quickcheck_shift
+                if '..' not in s[0]:
+                    first = last = int(s[0], 16)
+                else:
+                    first, last = [int(c, 16) for c in s[0].split('..')]
+                for char in range(first, last+1):
+                    assert not (quickchecks[char]>>quickcheck_shift)&3
+                    quickchecks[char] |= quickcheck
+            for i in range(0, 0x110000):
+                if table[i] is not None:
+                    table[i].append(quickchecks[i])

    def uselatin1(self):
        # restrict character range to ISO Latin 1