#12753: Add support for Unicode name aliases and named sequences.

2011-10-21 21:57:36 +03:00 · 2011-10-21 21:57:36 +03:00 · 931b8aac80
parent 3764a964ca
commit 931b8aac80
10 changed files with 18125 additions and 17194 deletions
--- a/Doc/library/unicodedata.rst
+++ b/Doc/library/unicodedata.rst
@ -29,6 +29,9 @@ following functions:
   Look up character by name.  If a character with the given name is found, return
   the corresponding character.  If not found, :exc:`KeyError` is raised.

+   .. versionchanged:: 3.3
+      Support for name aliases [#]_ and named sequences [#]_ has been added.
+

 .. function:: name(chr[, default])

@ -160,3 +163,9 @@ Examples:
   >>> unicodedata.bidirectional('\u0660') # 'A'rabic, 'N'umber
   'AN'

+
+.. rubric:: Footnotes
+
+.. [#] http://www.unicode.org/Public/6.0.0/ucd/NameAliases.txt
+
+.. [#] http://www.unicode.org/Public/6.0.0/ucd/NamedSequences.txt
--- a/Doc/reference/lexical_analysis.rst
+++ b/Doc/reference/lexical_analysis.rst
@ -492,13 +492,13 @@ Escape sequences only recognized in string literals are:
 +-----------------+---------------------------------+-------+
 | Escape Sequence | Meaning                         | Notes |
 +=================+=================================+=======+
-| ``\N{name}``    | Character named *name* in the   |       |
+| ``\N{name}``    | Character named *name* in the   | \(4)  |
 |                 | Unicode database                |       |
 +-----------------+---------------------------------+-------+
-| ``\uxxxx``      | Character with 16-bit hex value | \(4)  |
+| ``\uxxxx``      | Character with 16-bit hex value | \(5)  |
 |                 | *xxxx*                          |       |
 +-----------------+---------------------------------+-------+
-| ``\Uxxxxxxxx``  | Character with 32-bit hex value | \(5)  |
+| ``\Uxxxxxxxx``  | Character with 32-bit hex value | \(6)  |
 |                 | *xxxxxxxx*                      |       |
 +-----------------+---------------------------------+-------+

@ -516,10 +516,14 @@ Notes:
   with the given value.

 (4)
+   .. versionchanged:: 3.3
+      Support for name aliases [#]_ has been added.
+
+(5)
   Individual code units which form parts of a surrogate pair can be encoded using
   this escape sequence.  Exactly four hex digits are required.

-(5)
+(6)
   Any Unicode character can be encoded this way, but characters outside the Basic
   Multilingual Plane (BMP) will be encoded using a surrogate pair if Python is
   compiled to use 16-bit code units (the default).  Exactly eight hex digits
@ -706,3 +710,8 @@ The following printing ASCII characters are not used in Python.  Their
 occurrence outside string literals and comments is an unconditional error::

   $       ?       `
+
+
+.. rubric:: Footnotes
+
+.. [#] http://www.unicode.org/Public/6.0.0/ucd/NameAliases.txt
--- a/Doc/whatsnew/3.3.rst
+++ b/Doc/whatsnew/3.3.rst
@ -179,6 +179,12 @@ Some smaller changes made to the core Python language are:

 * Stub

+Added support for Unicode name aliases and named sequences.
+Both :func:`unicodedata.lookup()` and ``'\N{...}'`` now resolve name aliases,
+and :func:`unicodedata.lookup()` resolves named sequences too.
+
+(Contributed by Ezio Melotti in :issue:`12753`)
+

 New, Improved, and Deprecated Modules
 =====================================
--- a/Include/ucnhash.h
+++ b/Include/ucnhash.h
@ -19,11 +19,13 @@ typedef struct {
       success, zero if not.  Does not set Python exceptions. 
       If self is NULL, data come from the default version of the database.
       If it is not NULL, it should be a unicodedata.ucd_X_Y_Z object */
-    int (*getname)(PyObject *self, Py_UCS4 code, char* buffer, int buflen);
+    int (*getname)(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
+                   int with_alias_and_seq);

    /* Get character code for a given name.  Same error handling
       as for getname. */
-    int (*getcode)(PyObject *self, const char* name, int namelen, Py_UCS4* code);
+    int (*getcode)(PyObject *self, const char* name, int namelen, Py_UCS4* code,
+                   int with_named_seq);

 } _PyUnicode_Name_CAPI;

--- a/Lib/test/test_ucn.py
+++ b/Lib/test/test_ucn.py
@ -8,8 +8,11 @@ Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
 """#"

 import unittest
+import unicodedata

 from test import support
+from http.client import HTTPException
+from test.test_normalization import check_version

 class UnicodeNamesTest(unittest.TestCase):

@ -59,8 +62,6 @@ class UnicodeNamesTest(unittest.TestCase):
        )

    def test_ascii_letters(self):
-        import unicodedata
-
        for char in "".join(map(chr, range(ord("a"), ord("z")))):
            name = "LATIN SMALL LETTER %s" % char.upper()
            code = unicodedata.lookup(name)
@ -81,7 +82,6 @@ class UnicodeNamesTest(unittest.TestCase):
        self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
        self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")

-        import unicodedata
        self.assertRaises(ValueError, unicodedata.name, "\ud7a4")

    def test_cjk_unified_ideographs(self):
@ -97,14 +97,11 @@ class UnicodeNamesTest(unittest.TestCase):
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")

    def test_bmp_characters(self):
-        import unicodedata
-        count = 0
        for code in range(0x10000):
            char = chr(code)
            name = unicodedata.name(char, None)
            if name is not None:
                self.assertEqual(unicodedata.lookup(name), char)
-                count += 1

    def test_misc_symbols(self):
        self.checkletter("PILCROW SIGN", "\u00b6")
@ -112,8 +109,85 @@ class UnicodeNamesTest(unittest.TestCase):
        self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
        self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")

+    def test_aliases(self):
+        # Check that the aliases defined in the NameAliases.txt file work.
+        # This should be updated when new aliases are added or the file
+        # should be downloaded and parsed instead.  See #12753.
+        aliases = [
+            ('LATIN CAPITAL LETTER GHA', 0x01A2),
+            ('LATIN SMALL LETTER GHA', 0x01A3),
+            ('KANNADA LETTER LLLA', 0x0CDE),
+            ('LAO LETTER FO FON', 0x0E9D),
+            ('LAO LETTER FO FAY', 0x0E9F),
+            ('LAO LETTER RO', 0x0EA3),
+            ('LAO LETTER LO', 0x0EA5),
+            ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
+            ('YI SYLLABLE ITERATION MARK', 0xA015),
+            ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
+            ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5)
+        ]
+        for alias, codepoint in aliases:
+            self.checkletter(alias, chr(codepoint))
+            name = unicodedata.name(chr(codepoint))
+            self.assertNotEqual(name, alias)
+            self.assertEqual(unicodedata.lookup(alias),
+                             unicodedata.lookup(name))
+            with self.assertRaises(KeyError):
+                unicodedata.ucd_3_2_0.lookup(alias)
+
+    def test_aliases_names_in_pua_range(self):
+        # We are storing aliases in the PUA 15, but their names shouldn't leak
+        for cp in range(0xf0000, 0xf0100):
+            with self.assertRaises(ValueError) as cm:
+                unicodedata.name(chr(cp))
+            self.assertEqual(str(cm.exception), 'no such name')
+
+    def test_named_sequences_names_in_pua_range(self):
+        # We are storing named seq in the PUA 15, but their names shouldn't leak
+        for cp in range(0xf0100, 0xf0fff):
+            with self.assertRaises(ValueError) as cm:
+                unicodedata.name(chr(cp))
+            self.assertEqual(str(cm.exception), 'no such name')
+
+    def test_named_sequences_sample(self):
+        # Check a few named sequences.  See #12753.
+        sequences = [
+            ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
+            ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
+            ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
+            ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
+            ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
+        ]
+        for seqname, codepoints in sequences:
+            self.assertEqual(unicodedata.lookup(seqname), codepoints)
+            with self.assertRaises(SyntaxError):
+                self.checkletter(seqname, None)
+            with self.assertRaises(KeyError):
+                unicodedata.ucd_3_2_0.lookup(seqname)
+
+    def test_named_sequences_full(self):
+        # Check all the named sequences
+        url = ("http://www.unicode.org/Public/%s/ucd/NamedSequences.txt" %
+               unicodedata.unidata_version)
+        try:
+            testdata = support.open_urlresource(url, encoding="utf-8",
+                                                check=check_version)
+        except (IOError, HTTPException):
+            self.skipTest("Could not retrieve " + url)
+        self.addCleanup(testdata.close)
+        for line in testdata:
+            line = line.strip()
+            if not line or line.startswith('#'):
+                continue
+            seqname, codepoints = line.split(';')
+            codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split())
+            self.assertEqual(unicodedata.lookup(seqname), codepoints)
+            with self.assertRaises(SyntaxError):
+                self.checkletter(seqname, None)
+            with self.assertRaises(KeyError):
+                unicodedata.ucd_3_2_0.lookup(seqname)
+
    def test_errors(self):
-        import unicodedata
        self.assertRaises(TypeError, unicodedata.name)
        self.assertRaises(TypeError, unicodedata.name, 'xx')
        self.assertRaises(TypeError, unicodedata.lookup)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -10,6 +10,10 @@ What's New in Python 3.3 Alpha 1?
 Core and Builtins
 -----------------

+- Issue #12753: Add support for Unicode name aliases and named sequences.
+  Both :func:`unicodedata.lookup()` and '\N{...}' now resolve aliases,
+  and :func:`unicodedata.lookup()` resolves named sequences too.
+
 - Issue #12170: The count(), find(), rfind(), index() and rindex() methods
  of bytes and bytearray objects now accept an integer between 0 and 255
  as their first argument.  Patch by Petri Lehtinen.
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@ -926,9 +926,19 @@ is_unified_ideograph(Py_UCS4 code)
        (0x2B740 <= code && code <= 0x2B81D);   /* CJK Ideograph Extension D */
 }

+/* macros used to determine if the given codepoint is in the PUA range that
+ * we are using to store aliases and named sequences */
+#define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
+#define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \
+                          (cp < named_sequences_end))
+
 static int
-_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
+_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
+           int with_alias_and_seq)
 {
+    /* Find the name associated with the given codepoint.
+     * If with_alias_and_seq is 1, check for names in the Private Use Area 15
+     * that we are using for aliases and named sequences. */
    int offset;
    int i;
    int word;
@ -937,7 +947,14 @@ _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
    if (code >= 0x110000)
        return 0;

+    /* XXX should we just skip all the codepoints in the PUAs here? */
+    if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
+        return 0;
+
    if (self && UCD_Check(self)) {
+        /* in 3.2.0 there are no aliases and named sequences */
+        if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
+            return 0;
        const change_record *old = get_old_record(self, code);
        if (old->category_changed == 0) {
            /* unassigned */
@ -1022,7 +1039,7 @@ _cmpname(PyObject *self, int code, const char* name, int namelen)
    /* check if code corresponds to the given name */
    int i;
    char buffer[NAME_MAXLEN];
-    if (!_getucname(self, code, buffer, sizeof(buffer)))
+    if (!_getucname(self, code, buffer, sizeof(buffer), 1))
        return 0;
    for (i = 0; i < namelen; i++) {
        if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
@ -1052,8 +1069,28 @@ find_syllable(const char *str, int *len, int *pos, int count, int column)
 }

 static int
-_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
+_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
 {
+    /* check if named sequences are allowed */
+    if (!with_named_seq && IS_NAMED_SEQ(cp))
+        return 0;
+    /* if the codepoint is in the PUA range that we use for aliases,
+     * convert it to obtain the right codepoint */
+    if (IS_ALIAS(cp))
+        *code = name_aliases[cp-aliases_start];
+    else
+        *code = cp;
+    return 1;
+}
+
+static int
+_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
+         int with_named_seq)
+{
+    /* Return the codepoint associated with the given name.
+     * Named aliases are resolved too (unless self != NULL (i.e. we are using
+     * 3.2.0)).  If with_named_seq is 1, returns the PUA codepoint that we are
+     * using for the named sequence, and the caller must then convert it. */
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;
@ -1109,10 +1146,8 @@ _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
    v = code_hash[i];
    if (!v)
        return 0;
-    if (_cmpname(self, v, name, namelen)) {
-        *code = v;
-        return 1;
-    }
+    if (_cmpname(self, v, name, namelen))
+        return _check_alias_and_seq(v, code, with_named_seq);
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
@ -1121,10 +1156,8 @@ _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
        v = code_hash[i];
        if (!v)
            return 0;
-        if (_cmpname(self, v, name, namelen)) {
-            *code = v;
-            return 1;
-        }
+        if (_cmpname(self, v, name, namelen))
+            return _check_alias_and_seq(v, code, with_named_seq);
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
@ -1162,7 +1195,7 @@ unicodedata_name(PyObject* self, PyObject* args)
    if (c == (Py_UCS4)-1)
        return NULL;

-    if (!_getucname(self, c, name, sizeof(name))) {
+    if (!_getucname(self, c, name, sizeof(name), 0)) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "no such name");
            return NULL;
@ -1190,15 +1223,22 @@ unicodedata_lookup(PyObject* self, PyObject* args)

    char* name;
    int namelen;
+    unsigned int index;
    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
        return NULL;

-    if (!_getcode(self, name, namelen, &code)) {
-        PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
-                     name);
+    if (!_getcode(self, name, namelen, &code, 1)) {
+        PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
        return NULL;
    }
-
+    /* check if code is in the PUA range that we use for named sequences
+     * and convert it */
+    if (IS_NAMED_SEQ(code)) {
+        index = code-named_sequences_start;
+        return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
+                                         named_sequences[index].seq,
+                                         named_sequences[index].seqlen);
+    }
    return PyUnicode_FromOrdinal(code);
 }

--- a/Modules/unicodename_db.h
+++ b/Modules/unicodename_db.h
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -5857,7 +5857,7 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
                    message = "unknown Unicode character name";
                    s++;
                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
-                                              &chr))
+                                              &chr, 0))
                        goto store;
                }
            }
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@ -21,11 +21,17 @@
 # 2004-05-29 perky add east asian width information
 # 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
 # 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
+# 2011-10-21 ezio add support for name aliases and named sequences
 #
 # written by Fredrik Lundh (fredrik@pythonware.com)
 #

-import sys, os, zipfile
+import os
+import sys
+import zipfile
+
+from textwrap import dedent
+from operator import itemgetter

 SCRIPT = sys.argv[0]
 VERSION = "3.2"
@ -39,6 +45,17 @@ UNIHAN = "Unihan%s.zip"
 DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
 DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
 LINE_BREAK = "LineBreak%s.txt"
+NAME_ALIASES = "NameAliases%s.txt"
+NAMED_SEQUENCES = "NamedSequences%s.txt"
+
+# Private Use Areas -- in planes 0 (the BMP), 15, 16
+PUA_1 = range(0xE000, 0xF900)
+PUA_15 = range(0xF0000, 0xFFFFE)
+PUA_16 = range(0x100000, 0x10FFFE)
+
+# we use these ranges of PUA_15 to store name aliases and named sequences
+NAME_ALIASES_START = 0xF0000
+NAMED_SEQUENCES_START = 0xF0100

 old_versions = ["3.2.0"]

@ -692,6 +709,39 @@ def makeunicodename(unicode, trace):
    print("/* name->code dictionary */", file=fp)
    codehash.dump(fp, trace)

+    print(file=fp)
+    print('static const unsigned int aliases_start = %#x;' %
+          NAME_ALIASES_START, file=fp)
+    print('static const unsigned int aliases_end = %#x;' %
+          (NAME_ALIASES_START + len(unicode.aliases)), file=fp)
+
+    print('static const unsigned int name_aliases[] = {', file=fp)
+    for name, codepoint in unicode.aliases:
+        print('    0x%04X,' % codepoint, file=fp)
+    print('};', file=fp)
+
+    # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
+    # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
+    # sequences or sequences with non-BMP chars are added.
+    # unicodedata_lookup should be adapted too.
+    print(dedent("""
+        typedef struct NamedSequence {
+            int seqlen;
+            Py_UCS2 seq[4];
+        } named_sequence;
+        """), file=fp)
+
+    print('static const unsigned int named_sequences_start = %#x;' %
+          NAMED_SEQUENCES_START, file=fp)
+    print('static const unsigned int named_sequences_end = %#x;' %
+          (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)
+
+    print('static const named_sequence named_sequences[] = {', file=fp)
+    for name, sequence in unicode.named_sequences:
+        seq_str = ', '.join('0x%04X' % cp for cp in sequence)
+        print('    {%d, {%s}},' % (len(sequence), seq_str), file=fp)
+    print('};', file=fp)
+
    fp.close()


@ -726,7 +776,11 @@ def merge_old_version(version, new, old):
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
-                    if k == 2:
+                    if k == 1 and i in PUA_15:
+                        # the name is not set in the old.table, but in the
+                        # new.table we are using it for aliases and named seq
+                        assert value == ''
+                    elif k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
@ -855,6 +909,50 @@ class UnicodeData:
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

+        # check for name aliases and named sequences, see #12753
+        # aliases and named sequences are not in 3.2.0
+        if version != '3.2.0':
+            self.aliases = []
+            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
+            # in order to take advantage of the compression and lookup
+            # algorithms used for the other characters
+            pua_index = NAME_ALIASES_START
+            with open_data(NAME_ALIASES, version) as file:
+                for s in file:
+                    s = s.strip()
+                    if not s or s.startswith('#'):
+                        continue
+                    char, name = s.split(';')
+                    char = int(char, 16)
+                    self.aliases.append((name, char))
+                    # also store the name in the PUA 15
+                    self.table[pua_index][1] = name
+                    pua_index += 1
+            assert pua_index - NAME_ALIASES_START == len(self.aliases)
+
+            self.named_sequences = []
+            # store named sequences in the PUA 15, in range U+F0100..,
+            # in order to take advantage of the compression and lookup
+            # algorithms used for the other characters.
+
+            pua_index = NAMED_SEQUENCES_START
+            with open_data(NAMED_SEQUENCES, version) as file:
+                for s in file:
+                    s = s.strip()
+                    if not s or s.startswith('#'):
+                        continue
+                    name, chars = s.split(';')
+                    chars = tuple(int(char, 16) for char in chars.split())
+                    # check that the structure defined in makeunicodename is OK
+                    assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
+                    assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
+                        "the NamedSequence struct and in unicodedata_lookup")
+                    self.named_sequences.append((name, chars))
+                    # also store these in the PUA 15
+                    self.table[pua_index][1] = name
+                    pua_index += 1
+            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
+
        self.exclusions = {}
        with open_data(COMPOSITION_EXCLUSIONS, version) as file:
            for s in file: