Add XID_Start and XID_Continue properties to unicodectype.

This commit is contained in:
Martin v. Löwis 2007-08-14 22:37:03 +00:00
parent ff398c6f95
commit 13c3e380d1
4 changed files with 1076 additions and 955 deletions

View File

@ -205,6 +205,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
# define _PyUnicode_IsXidStart _PyUnicodeUCS2_IsXidStart
# define _PyUnicode_IsXidContinue _PyUnicodeUCS2_IsXidContinue
# define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
# define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit
@ -289,6 +291,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
# define _PyUnicode_IsXidStart _PyUnicodeUCS4_IsXidStart
# define _PyUnicode_IsXidContinue _PyUnicodeUCS4_IsXidContinue
# define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
# define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit
@ -1274,6 +1278,14 @@ PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Py_UNICODE ch /* Unicode character */
);
PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Py_UNICODE ch /* Unicode character */
);
PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Py_UNICODE ch /* Unicode character */
);
PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
const Py_UNICODE ch /* Unicode character */
);

View File

@ -19,6 +19,8 @@
#define SPACE_MASK 0x20
#define TITLE_MASK 0x40
#define UPPER_MASK 0x80
#define XID_START_MASK 0x100
#define XID_CONTINUE_MASK 0x200
typedef struct {
const Py_UNICODE upper;
@ -98,6 +100,26 @@ int _PyUnicode_IsTitlecase(Py_UNICODE ch)
return (ctype->flags & TITLE_MASK) != 0;
}
/* Returns 1 for Unicode characters having the XID_Start property, 0
otherwise. */
int _PyUnicode_IsXidStart(Py_UNICODE ch)
{
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
return (ctype->flags & XID_START_MASK) != 0;
}
/* Returns 1 for Unicode characters having the XID_Continue property,
0 otherwise. */
int _PyUnicode_IsXidContinue(Py_UNICODE ch)
{
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
return (ctype->flags & XID_CONTINUE_MASK) != 0;
}
/* Returns the integer decimal (0-9) for Unicode characters having
this property, -1 otherwise. */

File diff suppressed because it is too large Load Diff

View File

@ -34,6 +34,7 @@ UNIDATA_VERSION = "4.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
old_versions = ["3.2.0"]
@ -57,6 +58,8 @@ LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
def maketables(trace=0):
@ -65,16 +68,18 @@ def maketables(trace=0):
version = ""
unicode = UnicodeData(UNICODE_DATA % version,
COMPOSITION_EXCLUSIONS % version,
EASTASIAN_WIDTH % version)
EASTASIAN_WIDTH % version,
DERIVED_CORE_PROPERTIES % version)
print(len(filter(None, unicode.table)), "characters")
print(len(list(filter(None, unicode.table))), "characters")
for version in old_versions:
print("--- Reading", UNICODE_DATA % ("-"+version), "...")
old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
COMPOSITION_EXCLUSIONS % ("-"+version),
EASTASIAN_WIDTH % ("-"+version))
print(len(filter(None, old_unicode.table)), "characters")
EASTASIAN_WIDTH % ("-"+version),
DERIVED_CORE_PROPERTIES % ("-"+version))
print(len(list(filter(None, old_unicode.table))), "characters")
merge_old_version(version, unicode, old_unicode)
makeunicodename(unicode, trace)
@ -148,7 +153,7 @@ def makeunicodedata(unicode, trace):
assert prefix < 256
# content
decomp = [prefix + (len(decomp)<<8)] +\
map(lambda s: int(s, 16), decomp)
list(map(lambda s: int(s, 16), decomp))
# Collect NFC pairs
if not prefix and len(decomp) == 3 and \
char not in unicode.exclusions and \
@ -353,6 +358,7 @@ def makeunicodetype(unicode, trace):
# extract database properties
category = record[2]
bidirectional = record[4]
properties = record[16]
flags = 0
if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
flags |= ALPHA_MASK
@ -366,6 +372,10 @@ def makeunicodetype(unicode, trace):
flags |= TITLE_MASK
if category == "Lu":
flags |= UPPER_MASK
if "XID_Start" in properties:
flags |= XID_START_MASK
if "XID_Continue" in properties:
flags |= XID_CONTINUE_MASK
# use delta predictor for upper/lower/title
if record[12]:
upper = int(record[12], 16) - char
@ -447,7 +457,7 @@ def makeunicodename(unicode, trace):
if name and name[0] != "<":
names[char] = name + chr(0)
print(len(filter(lambda n: n is not None, names)), "distinct names")
print(len(list(filter(lambda n: n is not None, names))), "distinct names")
# collect unique words from names (note that we differ between
# words inside a sentence, and words ending a sentence. the
@ -470,10 +480,12 @@ def makeunicodename(unicode, trace):
print(n, "words in text;", b, "bytes")
wordlist = words.items()
wordlist = list(words.items())
# sort on falling frequency, then by name
def cmpwords((aword, alist),(bword, blist)):
def cmpwords(a,b):
aword, alist = a
bword, blist = b
r = -cmp(len(alist),len(blist))
if r:
return r
@ -526,7 +538,7 @@ def makeunicodename(unicode, trace):
words[w] = len(lexicon_offset)
lexicon_offset.append(o)
lexicon = map(ord, lexicon)
lexicon = list(map(ord, lexicon))
# generate phrasebook from names and lexicon
phrasebook = [0]
@ -660,11 +672,14 @@ def merge_old_version(version, new, old):
elif k == 14:
# change to simple titlecase mapping; ignore
pass
elif k == 16:
# derived property changes; not yet
pass
else:
class Difference(Exception):pass
raise Difference, (hex(i), k, old.table[i], new.table[i])
new.changed.append((version, zip(bidir_changes, category_changes,
decimal_changes, numeric_changes),
new.changed.append((version, list(zip(bidir_changes, category_changes,
decimal_changes, numeric_changes)),
normalization_changes))
@ -677,8 +692,14 @@ def merge_old_version(version, new, old):
import sys
class UnicodeData:
# Record structure:
# [ID, name, category, combining, bidi, decomp, (6)
# decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
# ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
# derived-props] (17)
def __init__(self, filename, exclusions, eastasianwidth, expand=1):
def __init__(self, filename, exclusions, eastasianwidth,
derivedprops, expand=1):
self.changed = []
file = open(filename)
table = [None] * 0x110000
@ -742,6 +763,28 @@ class UnicodeData:
if table[i] is not None:
table[i].append(widths[i])
for i in range(0, 0x110000):
if table[i] is not None:
table[i].append(set())
for s in open(derivedprops):
s = s.split('#', 1)[0].strip()
if not s:
continue
r, p = s.split(";")
r = r.strip()
p = p.strip()
if ".." in r:
first, last = [int(c, 16) for c in r.split('..')]
chars = range(first, last+1)
else:
chars = [int(r, 16)]
for char in chars:
if table[char]:
# Some properties (e.g. Default_Ignorable_Code_Point)
# apply to unassigned code points; ignore them
table[char][-1].add(p)
def uselatin1(self):
# restrict character range to ISO Latin 1
self.chars = range(256)