mirror of https://github.com/python/cpython
#12753: Add support for Unicode name aliases and named sequences.
This commit is contained in:
parent
3764a964ca
commit
931b8aac80
|
@ -29,6 +29,9 @@ following functions:
|
|||
Look up character by name. If a character with the given name is found, return
|
||||
the corresponding character. If not found, :exc:`KeyError` is raised.
|
||||
|
||||
.. versionchanged:: 3.3
|
||||
Support for name aliases [#]_ and named sequences [#]_ has been added.
|
||||
|
||||
|
||||
.. function:: name(chr[, default])
|
||||
|
||||
|
@ -160,3 +163,9 @@ Examples:
|
|||
>>> unicodedata.bidirectional('\u0660') # 'A'rabic, 'N'umber
|
||||
'AN'
|
||||
|
||||
|
||||
.. rubric:: Footnotes
|
||||
|
||||
.. [#] http://www.unicode.org/Public/6.0.0/ucd/NameAliases.txt
|
||||
|
||||
.. [#] http://www.unicode.org/Public/6.0.0/ucd/NamedSequences.txt
|
||||
|
|
|
@ -492,13 +492,13 @@ Escape sequences only recognized in string literals are:
|
|||
+-----------------+---------------------------------+-------+
|
||||
| Escape Sequence | Meaning | Notes |
|
||||
+=================+=================================+=======+
|
||||
| ``\N{name}`` | Character named *name* in the | |
|
||||
| ``\N{name}`` | Character named *name* in the | \(4) |
|
||||
| | Unicode database | |
|
||||
+-----------------+---------------------------------+-------+
|
||||
| ``\uxxxx`` | Character with 16-bit hex value | \(4) |
|
||||
| ``\uxxxx`` | Character with 16-bit hex value | \(5) |
|
||||
| | *xxxx* | |
|
||||
+-----------------+---------------------------------+-------+
|
||||
| ``\Uxxxxxxxx`` | Character with 32-bit hex value | \(5) |
|
||||
| ``\Uxxxxxxxx`` | Character with 32-bit hex value | \(6) |
|
||||
| | *xxxxxxxx* | |
|
||||
+-----------------+---------------------------------+-------+
|
||||
|
||||
|
@ -516,10 +516,14 @@ Notes:
|
|||
with the given value.
|
||||
|
||||
(4)
|
||||
.. versionchanged:: 3.3
|
||||
Support for name aliases [#]_ has been added.
|
||||
|
||||
(5)
|
||||
Individual code units which form parts of a surrogate pair can be encoded using
|
||||
this escape sequence. Exactly four hex digits are required.
|
||||
|
||||
(5)
|
||||
(6)
|
||||
Any Unicode character can be encoded this way, but characters outside the Basic
|
||||
Multilingual Plane (BMP) will be encoded using a surrogate pair if Python is
|
||||
compiled to use 16-bit code units (the default). Exactly eight hex digits
|
||||
|
@ -706,3 +710,8 @@ The following printing ASCII characters are not used in Python. Their
|
|||
occurrence outside string literals and comments is an unconditional error::
|
||||
|
||||
$ ? `
|
||||
|
||||
|
||||
.. rubric:: Footnotes
|
||||
|
||||
.. [#] http://www.unicode.org/Public/6.0.0/ucd/NameAliases.txt
|
||||
|
|
|
@ -179,6 +179,12 @@ Some smaller changes made to the core Python language are:
|
|||
|
||||
* Stub
|
||||
|
||||
Added support for Unicode name aliases and named sequences.
|
||||
Both :func:`unicodedata.lookup()` and ``'\N{...}'`` now resolve name aliases,
|
||||
and :func:`unicodedata.lookup()` resolves named sequences too.
|
||||
|
||||
(Contributed by Ezio Melotti in :issue:`12753`)
|
||||
|
||||
|
||||
New, Improved, and Deprecated Modules
|
||||
=====================================
|
||||
|
|
|
@ -19,11 +19,13 @@ typedef struct {
|
|||
success, zero if not. Does not set Python exceptions.
|
||||
If self is NULL, data come from the default version of the database.
|
||||
If it is not NULL, it should be a unicodedata.ucd_X_Y_Z object */
|
||||
int (*getname)(PyObject *self, Py_UCS4 code, char* buffer, int buflen);
|
||||
int (*getname)(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
|
||||
int with_alias_and_seq);
|
||||
|
||||
/* Get character code for a given name. Same error handling
|
||||
as for getname. */
|
||||
int (*getcode)(PyObject *self, const char* name, int namelen, Py_UCS4* code);
|
||||
int (*getcode)(PyObject *self, const char* name, int namelen, Py_UCS4* code,
|
||||
int with_named_seq);
|
||||
|
||||
} _PyUnicode_Name_CAPI;
|
||||
|
||||
|
|
|
@ -8,8 +8,11 @@ Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
|
|||
"""#"
|
||||
|
||||
import unittest
|
||||
import unicodedata
|
||||
|
||||
from test import support
|
||||
from http.client import HTTPException
|
||||
from test.test_normalization import check_version
|
||||
|
||||
class UnicodeNamesTest(unittest.TestCase):
|
||||
|
||||
|
@ -59,8 +62,6 @@ class UnicodeNamesTest(unittest.TestCase):
|
|||
)
|
||||
|
||||
def test_ascii_letters(self):
|
||||
import unicodedata
|
||||
|
||||
for char in "".join(map(chr, range(ord("a"), ord("z")))):
|
||||
name = "LATIN SMALL LETTER %s" % char.upper()
|
||||
code = unicodedata.lookup(name)
|
||||
|
@ -81,7 +82,6 @@ class UnicodeNamesTest(unittest.TestCase):
|
|||
self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
|
||||
self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")
|
||||
|
||||
import unicodedata
|
||||
self.assertRaises(ValueError, unicodedata.name, "\ud7a4")
|
||||
|
||||
def test_cjk_unified_ideographs(self):
|
||||
|
@ -97,14 +97,11 @@ class UnicodeNamesTest(unittest.TestCase):
|
|||
self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")
|
||||
|
||||
def test_bmp_characters(self):
|
||||
import unicodedata
|
||||
count = 0
|
||||
for code in range(0x10000):
|
||||
char = chr(code)
|
||||
name = unicodedata.name(char, None)
|
||||
if name is not None:
|
||||
self.assertEqual(unicodedata.lookup(name), char)
|
||||
count += 1
|
||||
|
||||
def test_misc_symbols(self):
|
||||
self.checkletter("PILCROW SIGN", "\u00b6")
|
||||
|
@ -112,8 +109,85 @@ class UnicodeNamesTest(unittest.TestCase):
|
|||
self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
|
||||
self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")
|
||||
|
||||
def test_aliases(self):
|
||||
# Check that the aliases defined in the NameAliases.txt file work.
|
||||
# This should be updated when new aliases are added or the file
|
||||
# should be downloaded and parsed instead. See #12753.
|
||||
aliases = [
|
||||
('LATIN CAPITAL LETTER GHA', 0x01A2),
|
||||
('LATIN SMALL LETTER GHA', 0x01A3),
|
||||
('KANNADA LETTER LLLA', 0x0CDE),
|
||||
('LAO LETTER FO FON', 0x0E9D),
|
||||
('LAO LETTER FO FAY', 0x0E9F),
|
||||
('LAO LETTER RO', 0x0EA3),
|
||||
('LAO LETTER LO', 0x0EA5),
|
||||
('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
|
||||
('YI SYLLABLE ITERATION MARK', 0xA015),
|
||||
('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
|
||||
('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5)
|
||||
]
|
||||
for alias, codepoint in aliases:
|
||||
self.checkletter(alias, chr(codepoint))
|
||||
name = unicodedata.name(chr(codepoint))
|
||||
self.assertNotEqual(name, alias)
|
||||
self.assertEqual(unicodedata.lookup(alias),
|
||||
unicodedata.lookup(name))
|
||||
with self.assertRaises(KeyError):
|
||||
unicodedata.ucd_3_2_0.lookup(alias)
|
||||
|
||||
def test_aliases_names_in_pua_range(self):
|
||||
# We are storing aliases in the PUA 15, but their names shouldn't leak
|
||||
for cp in range(0xf0000, 0xf0100):
|
||||
with self.assertRaises(ValueError) as cm:
|
||||
unicodedata.name(chr(cp))
|
||||
self.assertEqual(str(cm.exception), 'no such name')
|
||||
|
||||
def test_named_sequences_names_in_pua_range(self):
|
||||
# We are storing named seq in the PUA 15, but their names shouldn't leak
|
||||
for cp in range(0xf0100, 0xf0fff):
|
||||
with self.assertRaises(ValueError) as cm:
|
||||
unicodedata.name(chr(cp))
|
||||
self.assertEqual(str(cm.exception), 'no such name')
|
||||
|
||||
def test_named_sequences_sample(self):
|
||||
# Check a few named sequences. See #12753.
|
||||
sequences = [
|
||||
('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
|
||||
('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
|
||||
('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
|
||||
('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
|
||||
('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
|
||||
]
|
||||
for seqname, codepoints in sequences:
|
||||
self.assertEqual(unicodedata.lookup(seqname), codepoints)
|
||||
with self.assertRaises(SyntaxError):
|
||||
self.checkletter(seqname, None)
|
||||
with self.assertRaises(KeyError):
|
||||
unicodedata.ucd_3_2_0.lookup(seqname)
|
||||
|
||||
def test_named_sequences_full(self):
|
||||
# Check all the named sequences
|
||||
url = ("http://www.unicode.org/Public/%s/ucd/NamedSequences.txt" %
|
||||
unicodedata.unidata_version)
|
||||
try:
|
||||
testdata = support.open_urlresource(url, encoding="utf-8",
|
||||
check=check_version)
|
||||
except (IOError, HTTPException):
|
||||
self.skipTest("Could not retrieve " + url)
|
||||
self.addCleanup(testdata.close)
|
||||
for line in testdata:
|
||||
line = line.strip()
|
||||
if not line or line.startswith('#'):
|
||||
continue
|
||||
seqname, codepoints = line.split(';')
|
||||
codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split())
|
||||
self.assertEqual(unicodedata.lookup(seqname), codepoints)
|
||||
with self.assertRaises(SyntaxError):
|
||||
self.checkletter(seqname, None)
|
||||
with self.assertRaises(KeyError):
|
||||
unicodedata.ucd_3_2_0.lookup(seqname)
|
||||
|
||||
def test_errors(self):
|
||||
import unicodedata
|
||||
self.assertRaises(TypeError, unicodedata.name)
|
||||
self.assertRaises(TypeError, unicodedata.name, 'xx')
|
||||
self.assertRaises(TypeError, unicodedata.lookup)
|
||||
|
|
|
@ -10,6 +10,10 @@ What's New in Python 3.3 Alpha 1?
|
|||
Core and Builtins
|
||||
-----------------
|
||||
|
||||
- Issue #12753: Add support for Unicode name aliases and named sequences.
|
||||
Both :func:`unicodedata.lookup()` and ``'\N{...}'`` now resolve aliases,
|
||||
and :func:`unicodedata.lookup()` resolves named sequences too.
|
||||
|
||||
- Issue #12170: The count(), find(), rfind(), index() and rindex() methods
|
||||
of bytes and bytearray objects now accept an integer between 0 and 255
|
||||
as their first argument. Patch by Petri Lehtinen.
|
||||
|
|
|
@ -926,9 +926,19 @@ is_unified_ideograph(Py_UCS4 code)
|
|||
(0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
|
||||
}
|
||||
|
||||
/* macros used to determine if the given codepoint is in the PUA range that
|
||||
* we are using to store aliases and named sequences */
|
||||
#define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
|
||||
#define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \
|
||||
(cp < named_sequences_end))
|
||||
|
||||
static int
|
||||
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
|
||||
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
|
||||
int with_alias_and_seq)
|
||||
{
|
||||
/* Find the name associated with the given codepoint.
|
||||
* If with_alias_and_seq is 1, check for names in the Private Use Area 15
|
||||
* that we are using for aliases and named sequences. */
|
||||
int offset;
|
||||
int i;
|
||||
int word;
|
||||
|
@ -937,7 +947,14 @@ _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
|
|||
if (code >= 0x110000)
|
||||
return 0;
|
||||
|
||||
/* XXX should we just skip all the codepoints in the PUAs here? */
|
||||
if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
|
||||
return 0;
|
||||
|
||||
if (self && UCD_Check(self)) {
|
||||
/* in 3.2.0 there are no aliases and named sequences */
|
||||
if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
|
||||
return 0;
|
||||
const change_record *old = get_old_record(self, code);
|
||||
if (old->category_changed == 0) {
|
||||
/* unassigned */
|
||||
|
@ -1022,7 +1039,7 @@ _cmpname(PyObject *self, int code, const char* name, int namelen)
|
|||
/* check if code corresponds to the given name */
|
||||
int i;
|
||||
char buffer[NAME_MAXLEN];
|
||||
if (!_getucname(self, code, buffer, sizeof(buffer)))
|
||||
if (!_getucname(self, code, buffer, sizeof(buffer), 1))
|
||||
return 0;
|
||||
for (i = 0; i < namelen; i++) {
|
||||
if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
|
||||
|
@ -1052,8 +1069,28 @@ find_syllable(const char *str, int *len, int *pos, int count, int column)
|
|||
}
|
||||
|
||||
static int
|
||||
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
|
||||
_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
|
||||
{
|
||||
/* check if named sequences are allowed */
|
||||
if (!with_named_seq && IS_NAMED_SEQ(cp))
|
||||
return 0;
|
||||
/* if the codepoint is in the PUA range that we use for aliases,
|
||||
* convert it to obtain the right codepoint */
|
||||
if (IS_ALIAS(cp))
|
||||
*code = name_aliases[cp-aliases_start];
|
||||
else
|
||||
*code = cp;
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int
|
||||
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
|
||||
int with_named_seq)
|
||||
{
|
||||
/* Return the codepoint associated with the given name.
|
||||
* Named aliases are resolved too (unless self != NULL (i.e. we are using
|
||||
* 3.2.0)). If with_named_seq is 1, returns the PUA codepoint that we are
|
||||
* using for the named sequence, and the caller must then convert it. */
|
||||
unsigned int h, v;
|
||||
unsigned int mask = code_size-1;
|
||||
unsigned int i, incr;
|
||||
|
@ -1109,10 +1146,8 @@ _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
|
|||
v = code_hash[i];
|
||||
if (!v)
|
||||
return 0;
|
||||
if (_cmpname(self, v, name, namelen)) {
|
||||
*code = v;
|
||||
return 1;
|
||||
}
|
||||
if (_cmpname(self, v, name, namelen))
|
||||
return _check_alias_and_seq(v, code, with_named_seq);
|
||||
incr = (h ^ (h >> 3)) & mask;
|
||||
if (!incr)
|
||||
incr = mask;
|
||||
|
@ -1121,10 +1156,8 @@ _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
|
|||
v = code_hash[i];
|
||||
if (!v)
|
||||
return 0;
|
||||
if (_cmpname(self, v, name, namelen)) {
|
||||
*code = v;
|
||||
return 1;
|
||||
}
|
||||
if (_cmpname(self, v, name, namelen))
|
||||
return _check_alias_and_seq(v, code, with_named_seq);
|
||||
incr = incr << 1;
|
||||
if (incr > mask)
|
||||
incr = incr ^ code_poly;
|
||||
|
@ -1162,7 +1195,7 @@ unicodedata_name(PyObject* self, PyObject* args)
|
|||
if (c == (Py_UCS4)-1)
|
||||
return NULL;
|
||||
|
||||
if (!_getucname(self, c, name, sizeof(name))) {
|
||||
if (!_getucname(self, c, name, sizeof(name), 0)) {
|
||||
if (defobj == NULL) {
|
||||
PyErr_SetString(PyExc_ValueError, "no such name");
|
||||
return NULL;
|
||||
|
@ -1190,15 +1223,22 @@ unicodedata_lookup(PyObject* self, PyObject* args)
|
|||
|
||||
char* name;
|
||||
int namelen;
|
||||
unsigned int index;
|
||||
if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
|
||||
return NULL;
|
||||
|
||||
if (!_getcode(self, name, namelen, &code)) {
|
||||
PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
|
||||
name);
|
||||
if (!_getcode(self, name, namelen, &code, 1)) {
|
||||
PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// check if code is in the PUA range that we use for named sequences
|
||||
// and convert it
|
||||
if (IS_NAMED_SEQ(code)) {
|
||||
index = code-named_sequences_start;
|
||||
return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
|
||||
named_sequences[index].seq,
|
||||
named_sequences[index].seqlen);
|
||||
}
|
||||
return PyUnicode_FromOrdinal(code);
|
||||
}
|
||||
|
||||
|
|
35013
Modules/unicodename_db.h
35013
Modules/unicodename_db.h
File diff suppressed because it is too large
Load Diff
|
@ -5857,7 +5857,7 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
|
|||
message = "unknown Unicode character name";
|
||||
s++;
|
||||
if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
|
||||
&chr))
|
||||
&chr, 0))
|
||||
goto store;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,11 +21,17 @@
|
|||
# 2004-05-29 perky add east asian width information
|
||||
# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
|
||||
# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
|
||||
# 2011-10-21 ezio add support for name aliases and named sequences
|
||||
#
|
||||
# written by Fredrik Lundh (fredrik@pythonware.com)
|
||||
#
|
||||
|
||||
import sys, os, zipfile
|
||||
import os
|
||||
import sys
|
||||
import zipfile
|
||||
|
||||
from textwrap import dedent
|
||||
from operator import itemgetter
|
||||
|
||||
SCRIPT = sys.argv[0]
|
||||
VERSION = "3.2"
|
||||
|
@ -39,6 +45,17 @@ UNIHAN = "Unihan%s.zip"
|
|||
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
|
||||
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
|
||||
LINE_BREAK = "LineBreak%s.txt"
|
||||
NAME_ALIASES = "NameAliases%s.txt"
|
||||
NAMED_SEQUENCES = "NamedSequences%s.txt"
|
||||
|
||||
# Private Use Areas -- in planes 0 (BMP), 15, 16
|
||||
PUA_1 = range(0xE000, 0xF900)
|
||||
PUA_15 = range(0xF0000, 0xFFFFE)
|
||||
PUA_16 = range(0x100000, 0x10FFFE)
|
||||
|
||||
# we use these ranges of PUA_15 to store name aliases and named sequences
|
||||
NAME_ALIASES_START = 0xF0000
|
||||
NAMED_SEQUENCES_START = 0xF0100
|
||||
|
||||
old_versions = ["3.2.0"]
|
||||
|
||||
|
@ -692,6 +709,39 @@ def makeunicodename(unicode, trace):
|
|||
print("/* name->code dictionary */", file=fp)
|
||||
codehash.dump(fp, trace)
|
||||
|
||||
print(file=fp)
|
||||
print('static const unsigned int aliases_start = %#x;' %
|
||||
NAME_ALIASES_START, file=fp)
|
||||
print('static const unsigned int aliases_end = %#x;' %
|
||||
(NAME_ALIASES_START + len(unicode.aliases)), file=fp)
|
||||
|
||||
print('static const unsigned int name_aliases[] = {', file=fp)
|
||||
for name, codepoint in unicode.aliases:
|
||||
print(' 0x%04X,' % codepoint, file=fp)
|
||||
print('};', file=fp)
|
||||
|
||||
# In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
|
||||
# so we are using Py_UCS2 seq[4]. This needs to be updated if longer
|
||||
# sequences or sequences with non-BMP chars are added.
|
||||
# unicodedata_lookup should be adapted too.
|
||||
print(dedent("""
|
||||
typedef struct NamedSequence {
|
||||
int seqlen;
|
||||
Py_UCS2 seq[4];
|
||||
} named_sequence;
|
||||
"""), file=fp)
|
||||
|
||||
print('static const unsigned int named_sequences_start = %#x;' %
|
||||
NAMED_SEQUENCES_START, file=fp)
|
||||
print('static const unsigned int named_sequences_end = %#x;' %
|
||||
(NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)
|
||||
|
||||
print('static const named_sequence named_sequences[] = {', file=fp)
|
||||
for name, sequence in unicode.named_sequences:
|
||||
seq_str = ', '.join('0x%04X' % cp for cp in sequence)
|
||||
print(' {%d, {%s}},' % (len(sequence), seq_str), file=fp)
|
||||
print('};', file=fp)
|
||||
|
||||
fp.close()
|
||||
|
||||
|
||||
|
@ -726,7 +776,11 @@ def merge_old_version(version, new, old):
|
|||
for k in range(len(old.table[i])):
|
||||
if old.table[i][k] != new.table[i][k]:
|
||||
value = old.table[i][k]
|
||||
if k == 2:
|
||||
if k == 1 and i in PUA_15:
|
||||
# the name is not set in the old.table, but in the
|
||||
# new.table we are using it for aliases and named seq
|
||||
assert value == ''
|
||||
elif k == 2:
|
||||
#print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
|
||||
category_changes[i] = CATEGORY_NAMES.index(value)
|
||||
elif k == 4:
|
||||
|
@ -855,6 +909,50 @@ class UnicodeData:
|
|||
self.table = table
|
||||
self.chars = list(range(0x110000)) # unicode 3.2
|
||||
|
||||
# check for name aliases and named sequences, see #12753
|
||||
# aliases and named sequences are not in 3.2.0
|
||||
if version != '3.2.0':
|
||||
self.aliases = []
|
||||
# store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
|
||||
# in order to take advantage of the compression and lookup
|
||||
# algorithms used for the other characters
|
||||
pua_index = NAME_ALIASES_START
|
||||
with open_data(NAME_ALIASES, version) as file:
|
||||
for s in file:
|
||||
s = s.strip()
|
||||
if not s or s.startswith('#'):
|
||||
continue
|
||||
char, name = s.split(';')
|
||||
char = int(char, 16)
|
||||
self.aliases.append((name, char))
|
||||
# also store the name in the PUA 15
|
||||
self.table[pua_index][1] = name
|
||||
pua_index += 1
|
||||
assert pua_index - NAME_ALIASES_START == len(self.aliases)
|
||||
|
||||
self.named_sequences = []
|
||||
# store named sequences in the PUA 15, in range U+F0100..,
|
||||
# in order to take advantage of the compression and lookup
|
||||
# algorithms used for the other characters.
|
||||
|
||||
pua_index = NAMED_SEQUENCES_START
|
||||
with open_data(NAMED_SEQUENCES, version) as file:
|
||||
for s in file:
|
||||
s = s.strip()
|
||||
if not s or s.startswith('#'):
|
||||
continue
|
||||
name, chars = s.split(';')
|
||||
chars = tuple(int(char, 16) for char in chars.split())
|
||||
# check that the structure defined in makeunicodename is OK
|
||||
assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
|
||||
assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
|
||||
"the NamedSequence struct and in unicodedata_lookup")
|
||||
self.named_sequences.append((name, chars))
|
||||
# also store these in the PUA 15
|
||||
self.table[pua_index][1] = name
|
||||
pua_index += 1
|
||||
assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
|
||||
|
||||
self.exclusions = {}
|
||||
with open_data(COMPOSITION_EXCLUSIONS, version) as file:
|
||||
for s in file:
|
||||
|
|
Loading…
Reference in New Issue