#12753: Add support for Unicode name aliases and named sequences.

This commit is contained in:
Ezio Melotti 2011-10-21 21:57:36 +03:00
parent 3764a964ca
commit 931b8aac80
10 changed files with 18125 additions and 17194 deletions

View File

@ -29,6 +29,9 @@ following functions:
Look up character by name. If a character with the given name is found, return
the corresponding character. If not found, :exc:`KeyError` is raised.
.. versionchanged:: 3.3
Support for name aliases [#]_ and named sequences [#]_ has been added.
.. function:: name(chr[, default])
@ -160,3 +163,9 @@ Examples:
>>> unicodedata.bidirectional('\u0660') # 'A'rabic, 'N'umber
'AN'
.. rubric:: Footnotes
.. [#] http://www.unicode.org/Public/6.0.0/ucd/NameAliases.txt
.. [#] http://www.unicode.org/Public/6.0.0/ucd/NamedSequences.txt

View File

@ -492,13 +492,13 @@ Escape sequences only recognized in string literals are:
+-----------------+---------------------------------+-------+
| Escape Sequence | Meaning | Notes |
+=================+=================================+=======+
| ``\N{name}`` | Character named *name* in the | |
| ``\N{name}`` | Character named *name* in the | \(4) |
| | Unicode database | |
+-----------------+---------------------------------+-------+
| ``\uxxxx`` | Character with 16-bit hex value | \(4) |
| ``\uxxxx`` | Character with 16-bit hex value | \(5) |
| | *xxxx* | |
+-----------------+---------------------------------+-------+
| ``\Uxxxxxxxx`` | Character with 32-bit hex value | \(5) |
| ``\Uxxxxxxxx`` | Character with 32-bit hex value | \(6) |
| | *xxxxxxxx* | |
+-----------------+---------------------------------+-------+
@ -516,10 +516,14 @@ Notes:
with the given value.
(4)
.. versionchanged:: 3.3
Support for name aliases [#]_ has been added.
(5)
Individual code units which form parts of a surrogate pair can be encoded using
this escape sequence. Exactly four hex digits are required.
(5)
(6)
Any Unicode character can be encoded this way, but characters outside the Basic
Multilingual Plane (BMP) will be encoded using a surrogate pair if Python is
compiled to use 16-bit code units (the default). Exactly eight hex digits
@ -706,3 +710,8 @@ The following printing ASCII characters are not used in Python. Their
occurrence outside string literals and comments is an unconditional error::
$ ? `
.. rubric:: Footnotes
.. [#] http://www.unicode.org/Public/6.0.0/ucd/NameAliases.txt

View File

@ -179,6 +179,12 @@ Some smaller changes made to the core Python language are:
* Stub
Added support for Unicode name aliases and named sequences.
Both :func:`unicodedata.lookup()` and ``'\N{...}'`` now resolve name aliases,
and :func:`unicodedata.lookup()` resolves named sequences too.
(Contributed by Ezio Melotti in :issue:`12753`.)
New, Improved, and Deprecated Modules
=====================================

View File

@ -19,11 +19,13 @@ typedef struct {
success, zero if not. Does not set Python exceptions.
If self is NULL, data come from the default version of the database.
If it is not NULL, it should be a unicodedata.ucd_X_Y_Z object */
int (*getname)(PyObject *self, Py_UCS4 code, char* buffer, int buflen);
int (*getname)(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
int with_alias_and_seq);
/* Get character code for a given name. Same error handling
as for getname. */
int (*getcode)(PyObject *self, const char* name, int namelen, Py_UCS4* code);
int (*getcode)(PyObject *self, const char* name, int namelen, Py_UCS4* code,
int with_named_seq);
} _PyUnicode_Name_CAPI;

View File

@ -8,8 +8,11 @@ Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
"""#"
import unittest
import unicodedata
from test import support
from http.client import HTTPException
from test.test_normalization import check_version
class UnicodeNamesTest(unittest.TestCase):
@ -59,8 +62,6 @@ class UnicodeNamesTest(unittest.TestCase):
)
def test_ascii_letters(self):
import unicodedata
for char in "".join(map(chr, range(ord("a"), ord("z")))):
name = "LATIN SMALL LETTER %s" % char.upper()
code = unicodedata.lookup(name)
@ -81,7 +82,6 @@ class UnicodeNamesTest(unittest.TestCase):
self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")
import unicodedata
self.assertRaises(ValueError, unicodedata.name, "\ud7a4")
def test_cjk_unified_ideographs(self):
@ -97,14 +97,11 @@ class UnicodeNamesTest(unittest.TestCase):
self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")
def test_bmp_characters(self):
import unicodedata
count = 0
for code in range(0x10000):
char = chr(code)
name = unicodedata.name(char, None)
if name is not None:
self.assertEqual(unicodedata.lookup(name), char)
count += 1
def test_misc_symbols(self):
self.checkletter("PILCROW SIGN", "\u00b6")
@ -112,8 +109,85 @@ class UnicodeNamesTest(unittest.TestCase):
self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")
def test_aliases(self):
    # Each entry pairs an alias taken from the NameAliases.txt file with
    # the code point it designates.  The table is maintained by hand:
    # extend it when new aliases are added, or replace it with code that
    # downloads and parses the file instead.  See #12753.
    known_aliases = [
        ('LATIN CAPITAL LETTER GHA', 0x01A2),
        ('LATIN SMALL LETTER GHA', 0x01A3),
        ('KANNADA LETTER LLLA', 0x0CDE),
        ('LAO LETTER FO FON', 0x0E9D),
        ('LAO LETTER FO FAY', 0x0E9F),
        ('LAO LETTER RO', 0x0EA3),
        ('LAO LETTER LO', 0x0EA5),
        ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
        ('YI SYLLABLE ITERATION MARK', 0xA015),
        ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
        ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5),
    ]
    for alias_name, cp in known_aliases:
        char = chr(cp)
        self.checkletter(alias_name, char)
        # The alias must differ from the name reported for the character,
        # yet both must look up to the same result.
        primary_name = unicodedata.name(char)
        self.assertNotEqual(primary_name, alias_name)
        self.assertEqual(unicodedata.lookup(alias_name),
                         unicodedata.lookup(primary_name))
        # The 3.2.0 database must reject the alias.
        with self.assertRaises(KeyError):
            unicodedata.ucd_3_2_0.lookup(alias_name)
def test_aliases_names_in_pua_range(self):
# We are storing aliases in the PUA 15, but their names shouldn't leak
for cp in range(0xf0000, 0xf0100):
with self.assertRaises(ValueError) as cm:
unicodedata.name(chr(cp))
self.assertEqual(str(cm.exception), 'no such name')
def test_named_sequences_names_in_pua_range(self):
# We are storing named seq in the PUA 15, but their names shouldn't leak
for cp in range(0xf0100, 0xf0fff):
with self.assertRaises(ValueError) as cm:
unicodedata.name(chr(cp))
self.assertEqual(str(cm.exception), 'no such name')
def test_named_sequences_sample(self):
    # Spot-check a handful of named sequences.  See #12753.
    samples = [
        ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
        ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
        ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
        ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
        ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
    ]
    for sequence_name, expected in samples:
        # lookup() returns the whole sequence as a string
        self.assertEqual(unicodedata.lookup(sequence_name), expected)
        # checkletter() is expected to fail with a SyntaxError for a
        # named sequence
        with self.assertRaises(SyntaxError):
            self.checkletter(sequence_name, None)
        # the 3.2.0 database must reject the sequence name
        with self.assertRaises(KeyError):
            unicodedata.ucd_3_2_0.lookup(sequence_name)
def test_named_sequences_full(self):
    # Check every named sequence listed in the NamedSequences.txt file
    # that matches the version of the Unicode database in use.  The test
    # is skipped when the file cannot be retrieved.
    url = ("http://www.unicode.org/Public/%s/ucd/NamedSequences.txt" %
           unicodedata.unidata_version)
    try:
        testdata = support.open_urlresource(url, encoding="utf-8",
                                            check=check_version)
    except (IOError, HTTPException):
        self.skipTest("Could not retrieve " + url)
    self.addCleanup(testdata.close)
    for raw_line in testdata:
        entry = raw_line.strip()
        # ignore blank lines and comments
        if not entry or entry.startswith('#'):
            continue
        # each entry is "NAME;XXXX XXXX ..." with hexadecimal code points
        sequence_name, hex_fields = entry.split(';')
        expected = ''.join(chr(int(field, 16))
                           for field in hex_fields.split())
        self.assertEqual(unicodedata.lookup(sequence_name), expected)
        with self.assertRaises(SyntaxError):
            self.checkletter(sequence_name, None)
        with self.assertRaises(KeyError):
            unicodedata.ucd_3_2_0.lookup(sequence_name)
def test_errors(self):
import unicodedata
self.assertRaises(TypeError, unicodedata.name)
self.assertRaises(TypeError, unicodedata.name, 'xx')
self.assertRaises(TypeError, unicodedata.lookup)

View File

@ -10,6 +10,10 @@ What's New in Python 3.3 Alpha 1?
Core and Builtins
-----------------
- Issue #12753: Add support for Unicode name aliases and named sequences.
Both :func:`unicodedata.lookup()` and '\N{...}' now resolve aliases,
and :func:`unicodedata.lookup()` resolves named sequences too.
- Issue #12170: The count(), find(), rfind(), index() and rindex() methods
of bytes and bytearray objects now accept an integer between 0 and 255
as their first argument. Patch by Petri Lehtinen.

View File

@ -926,9 +926,19 @@ is_unified_ideograph(Py_UCS4 code)
(0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
}
/* Macros used to determine if the given codepoint is in the PUA range that
 * we are using to store aliases and named sequences.
 * The argument is parenthesized so that an arbitrary expression can be
 * passed safely.  It is evaluated twice, so it must not have side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
static int
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
int with_alias_and_seq)
{
/* Find the name associated with the given codepoint.
* If with_alias_and_seq is 1, check for names in the Private Use Area 15
* that we are using for aliases and named sequences. */
int offset;
int i;
int word;
@ -937,7 +947,14 @@ _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
if (code >= 0x110000)
return 0;
/* XXX should we just skip all the codepoints in the PUAs here? */
if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
return 0;
if (self && UCD_Check(self)) {
/* in 3.2.0 there are no aliases and named sequences */
if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
return 0;
const change_record *old = get_old_record(self, code);
if (old->category_changed == 0) {
/* unassigned */
@ -1022,7 +1039,7 @@ _cmpname(PyObject *self, int code, const char* name, int namelen)
/* check if code corresponds to the given name */
int i;
char buffer[NAME_MAXLEN];
if (!_getucname(self, code, buffer, sizeof(buffer)))
if (!_getucname(self, code, buffer, sizeof(buffer), 1))
return 0;
for (i = 0; i < namelen; i++) {
if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
@ -1052,8 +1069,28 @@ find_syllable(const char *str, int *len, int *pos, int count, int column)
}
static int
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
{
    /* Store in *code the codepoint that cp stands for and return 1, or
     * return 0 (leaving *code untouched) when cp designates a named
     * sequence and the caller did not ask for named sequences. */
    if (IS_NAMED_SEQ(cp) && !with_named_seq)
        return 0;

    /* A codepoint in the PUA range that we use for aliases is only a
     * placeholder: map it to the codepoint recorded for that alias.
     * Any other codepoint is passed through unchanged. */
    *code = IS_ALIAS(cp) ? name_aliases[cp - aliases_start] : cp;
    return 1;
}
static int
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
int with_named_seq)
{
/* Return the codepoint associated with the given name.
* Named aliases are resolved too (unless self != NULL (i.e. we are using
* 3.2.0)). If with_named_seq is 1, returns the PUA codepoint that we are
* using for the named sequence, and the caller must then convert it. */
unsigned int h, v;
unsigned int mask = code_size-1;
unsigned int i, incr;
@ -1109,10 +1146,8 @@ _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
v = code_hash[i];
if (!v)
return 0;
if (_cmpname(self, v, name, namelen)) {
*code = v;
return 1;
}
if (_cmpname(self, v, name, namelen))
return _check_alias_and_seq(v, code, with_named_seq);
incr = (h ^ (h >> 3)) & mask;
if (!incr)
incr = mask;
@ -1121,10 +1156,8 @@ _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
v = code_hash[i];
if (!v)
return 0;
if (_cmpname(self, v, name, namelen)) {
*code = v;
return 1;
}
if (_cmpname(self, v, name, namelen))
return _check_alias_and_seq(v, code, with_named_seq);
incr = incr << 1;
if (incr > mask)
incr = incr ^ code_poly;
@ -1162,7 +1195,7 @@ unicodedata_name(PyObject* self, PyObject* args)
if (c == (Py_UCS4)-1)
return NULL;
if (!_getucname(self, c, name, sizeof(name))) {
if (!_getucname(self, c, name, sizeof(name), 0)) {
if (defobj == NULL) {
PyErr_SetString(PyExc_ValueError, "no such name");
return NULL;
@ -1190,15 +1223,22 @@ unicodedata_lookup(PyObject* self, PyObject* args)
char* name;
int namelen;
unsigned int index;
if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
return NULL;
if (!_getcode(self, name, namelen, &code)) {
PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
name);
if (!_getcode(self, name, namelen, &code, 1)) {
PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
return NULL;
}
/* check if code is in the PUA range that we use for named sequences
   and convert it */
if (IS_NAMED_SEQ(code)) {
index = code-named_sequences_start;
return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
named_sequences[index].seq,
named_sequences[index].seqlen);
}
return PyUnicode_FromOrdinal(code);
}

File diff suppressed because it is too large Load Diff

View File

@ -5857,7 +5857,7 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
message = "unknown Unicode character name";
s++;
if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
&chr))
&chr, 0))
goto store;
}
}

View File

@ -21,11 +21,17 @@
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
import sys, os, zipfile
import os
import sys
import zipfile
from textwrap import dedent
from operator import itemgetter
SCRIPT = sys.argv[0]
VERSION = "3.2"
@ -39,6 +45,17 @@ UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)
# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0100
old_versions = ["3.2.0"]
@ -692,6 +709,39 @@ def makeunicodename(unicode, trace):
print("/* name->code dictionary */", file=fp)
codehash.dump(fp, trace)
print(file=fp)
print('static const unsigned int aliases_start = %#x;' %
NAME_ALIASES_START, file=fp)
print('static const unsigned int aliases_end = %#x;' %
(NAME_ALIASES_START + len(unicode.aliases)), file=fp)
print('static const unsigned int name_aliases[] = {', file=fp)
for name, codepoint in unicode.aliases:
print(' 0x%04X,' % codepoint, file=fp)
print('};', file=fp)
# In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
# so we are using Py_UCS2 seq[4]. This needs to be updated if longer
# sequences or sequences with non-BMP chars are added.
# unicodedata_lookup should be adapted too.
print(dedent("""
typedef struct NamedSequence {
int seqlen;
Py_UCS2 seq[4];
} named_sequence;
"""), file=fp)
print('static const unsigned int named_sequences_start = %#x;' %
NAMED_SEQUENCES_START, file=fp)
print('static const unsigned int named_sequences_end = %#x;' %
(NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)
print('static const named_sequence named_sequences[] = {', file=fp)
for name, sequence in unicode.named_sequences:
seq_str = ', '.join('0x%04X' % cp for cp in sequence)
print(' {%d, {%s}},' % (len(sequence), seq_str), file=fp)
print('};', file=fp)
fp.close()
@ -726,7 +776,11 @@ def merge_old_version(version, new, old):
for k in range(len(old.table[i])):
if old.table[i][k] != new.table[i][k]:
value = old.table[i][k]
if k == 2:
if k == 1 and i in PUA_15:
# the name is not set in the old.table, but in the
# new.table we are using it for aliases and named seq
assert value == ''
elif k == 2:
#print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
category_changes[i] = CATEGORY_NAMES.index(value)
elif k == 4:
@ -855,6 +909,50 @@ class UnicodeData:
self.table = table
self.chars = list(range(0x110000)) # unicode 3.2
# check for name aliases and named sequences, see #12753
# aliases and named sequences are not in 3.2.0
if version != '3.2.0':
self.aliases = []
# store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
# in order to take advantage of the compression and lookup
# algorithms used for the other characters
pua_index = NAME_ALIASES_START
with open_data(NAME_ALIASES, version) as file:
for s in file:
s = s.strip()
if not s or s.startswith('#'):
continue
char, name = s.split(';')
char = int(char, 16)
self.aliases.append((name, char))
# also store the name in the PUA 15
self.table[pua_index][1] = name
pua_index += 1
assert pua_index - NAME_ALIASES_START == len(self.aliases)
self.named_sequences = []
# store named sequences in the PUA 15, in range U+F0100..,
# in order to take advantage of the compression and lookup
# algorithms used for the other characters.
pua_index = NAMED_SEQUENCES_START
with open_data(NAMED_SEQUENCES, version) as file:
for s in file:
s = s.strip()
if not s or s.startswith('#'):
continue
name, chars = s.split(';')
chars = tuple(int(char, 16) for char in chars.split())
# check that the structure defined in makeunicodename is OK
assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
"the NamedSequence struct and in unicodedata_lookup")
self.named_sequences.append((name, chars))
# also store these in the PUA 15
self.table[pua_index][1] = name
pua_index += 1
assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
self.exclusions = {}
with open_data(COMPOSITION_EXCLUSIONS, version) as file:
for s in file: