Add XID_Start and XID_Continue properties to unicodectype.

2007-08-14 22:37:03 +00:00 · 2007-08-14 22:37:03 +00:00 · 13c3e380d1
parent ff398c6f95
commit 13c3e380d1
4 changed files with 1076 additions and 955 deletions
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@ -205,6 +205,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
 # define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
 # define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
 # define _PyUnicode_IsXidStart _PyUnicodeUCS2_IsXidStart
 # define _PyUnicode_IsXidContinue _PyUnicodeUCS2_IsXidContinue
 # define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
 # define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
 # define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit
@ -289,6 +291,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
 # define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
 # define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
 # define _PyUnicode_IsXidStart _PyUnicodeUCS4_IsXidStart
 # define _PyUnicode_IsXidContinue _PyUnicodeUCS4_IsXidContinue
 # define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
 # define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
 # define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit
@ -1274,6 +1278,14 @@ PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
    Py_UNICODE ch 	/* Unicode character */
    );
 PyAPI_FUNC(int) _PyUnicode_IsXidStart(
    Py_UNICODE ch 	/* Unicode character */
    );
 PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
    Py_UNICODE ch 	/* Unicode character */
    );
 PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
    const Py_UNICODE ch 	/* Unicode character */
    );
--- a/Objects/unicodectype.c
+++ b/Objects/unicodectype.c
@ -19,6 +19,8 @@
 #define SPACE_MASK 0x20
 #define TITLE_MASK 0x40
 #define UPPER_MASK 0x80
 #define XID_START_MASK 0x100
 #define XID_CONTINUE_MASK 0x200
 typedef struct {
    const Py_UNICODE upper;
@ -98,6 +100,26 @@ int _PyUnicode_IsTitlecase(Py_UNICODE ch)
    return (ctype->flags & TITLE_MASK) != 0;
 }
 /* Test whether ch carries the XID_Start property: fetch the type
    record for ch and report 1 when XID_START_MASK is set in its
    flags, 0 when it is not. */
 int _PyUnicode_IsXidStart(Py_UNICODE ch)
 {
    const _PyUnicode_TypeRecord *record = gettyperecord(ch);
    if (record->flags & XID_START_MASK)
        return 1;
    return 0;
 }
 /* Test whether ch carries the XID_Continue property: fetch the type
    record for ch and report 1 when XID_CONTINUE_MASK is set in its
    flags, 0 when it is not. */
 int _PyUnicode_IsXidContinue(Py_UNICODE ch)
 {
    const _PyUnicode_TypeRecord *record = gettyperecord(ch);
    if (record->flags & XID_CONTINUE_MASK)
        return 1;
    return 0;
 }
 /* Returns the integer decimal (0-9) for Unicode characters having
   this property, -1 otherwise. */
--- a/Objects/unicodetype_db.h
+++ b/Objects/unicodetype_db.h
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@ -34,6 +34,7 @@ UNIDATA_VERSION = "4.1.0"
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
 DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
 old_versions = ["3.2.0"]
@ -57,6 +58,8 @@ LINEBREAK_MASK = 0x10
 SPACE_MASK = 0x20
 TITLE_MASK = 0x40
 UPPER_MASK = 0x80
 XID_START_MASK = 0x100
 XID_CONTINUE_MASK = 0x200
 def maketables(trace=0):
@ -65,16 +68,18 @@ def maketables(trace=0):
    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
-                          EASTASIAN_WIDTH % version)
+                          EASTASIAN_WIDTH % version,
                          DERIVED_CORE_PROPERTIES % version)
-    print(len(filter(None, unicode.table)), "characters")
+    print(len(list(filter(None, unicode.table))), "characters")
    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
-                                  EASTASIAN_WIDTH % ("-"+version))
+                                  EASTASIAN_WIDTH % ("-"+version),
-        print(len(filter(None, old_unicode.table)), "characters")
+                                  DERIVED_CORE_PROPERTIES % ("-"+version))
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)
    makeunicodename(unicode, trace)
@ -148,7 +153,7 @@ def makeunicodedata(unicode, trace):
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] +\
-                         map(lambda s: int(s, 16), decomp)
+                         list(map(lambda s: int(s, 16), decomp))
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
@ -353,6 +358,7 @@ def makeunicodetype(unicode, trace):
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
@ -366,6 +372,10 @@ def makeunicodetype(unicode, trace):
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            # use delta predictor for upper/lower/title
            if record[12]:
                upper = int(record[12], 16) - char
@ -447,7 +457,7 @@ def makeunicodename(unicode, trace):
            if name and name[0] != "<":
                names[char] = name + chr(0)
-    print(len(filter(lambda n: n is not None, names)), "distinct names")
+    print(len(list(filter(lambda n: n is not None, names))), "distinct names")
    # collect unique words from names (note that we differ between
    # words inside a sentence, and words ending a sentence.  the
@ -470,10 +480,12 @@ def makeunicodename(unicode, trace):
    print(n, "words in text;", b, "bytes")
-    wordlist = words.items()
+    wordlist = list(words.items())
    # sort on falling frequency, then by name
-    def cmpwords((aword, alist),(bword, blist)):
+    def cmpwords(a,b):
        aword, alist = a
        bword, blist = b
        r = -cmp(len(alist),len(blist))
        if r:
            return r
@ -526,7 +538,7 @@ def makeunicodename(unicode, trace):
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)
-    lexicon = map(ord, lexicon)
+    lexicon = list(map(ord, lexicon))
    # generate phrasebook from names and lexicon
    phrasebook = [0]
@ -660,11 +672,14 @@ def merge_old_version(version, new, old):
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference, (hex(i), k, old.table[i], new.table[i])
-    new.changed.append((version, zip(bidir_changes, category_changes,
+    new.changed.append((version, list(zip(bidir_changes, category_changes,
-                                     decimal_changes, numeric_changes),
+                                     decimal_changes, numeric_changes)),
                        normalization_changes))
@ -677,8 +692,14 @@ def merge_old_version(version, new, old):
 import sys
 class UnicodeData:
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)
-    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
+    def __init__(self, filename, exclusions, eastasianwidth,
                 derivedprops, expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
@ -742,6 +763,28 @@ class UnicodeData:
            if table[i] is not None:
                table[i].append(widths[i])
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())
        for s in open(derivedprops):
            s = s.split('#', 1)[0].strip()
            if not s:
                continue
            r, p = s.split(";")
            r = r.strip()
            p = p.strip()
            if ".." in r:
                first, last = [int(c, 16) for c in r.split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(r, 16)]
            for char in chars:
                if table[char]:
                    # Some properties (e.g. Default_Ignorable_Code_Point)
                    # apply to unassigned code points; ignore them
                    table[char][-1].add(p)
    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)