mirror of https://github.com/python/cpython
Add XID_Start and XID_Continue properties to unicodectype.
This commit is contained in:
parent
ff398c6f95
commit
13c3e380d1
|
@ -205,6 +205,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
||||||
# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
|
# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
|
||||||
# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
|
# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
|
||||||
# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
|
# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
|
||||||
|
# define _PyUnicode_IsXidStart _PyUnicodeUCS2_IsXidStart
|
||||||
|
# define _PyUnicode_IsXidContinue _PyUnicodeUCS2_IsXidContinue
|
||||||
# define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
|
# define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
|
||||||
# define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
|
# define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
|
||||||
# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit
|
# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit
|
||||||
|
@ -289,6 +291,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
||||||
# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
|
# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
|
||||||
# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
|
# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
|
||||||
# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
|
# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
|
||||||
|
# define _PyUnicode_IsXidStart _PyUnicodeUCS4_IsXidStart
|
||||||
|
# define _PyUnicode_IsXidContinue _PyUnicodeUCS4_IsXidContinue
|
||||||
# define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
|
# define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
|
||||||
# define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
|
# define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
|
||||||
# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit
|
# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit
|
||||||
|
@ -1274,6 +1278,14 @@ PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
|
||||||
Py_UNICODE ch /* Unicode character */
|
Py_UNICODE ch /* Unicode character */
|
||||||
);
|
);
|
||||||
|
|
||||||
|
PyAPI_FUNC(int) _PyUnicode_IsXidStart(
|
||||||
|
Py_UNICODE ch /* Unicode character */
|
||||||
|
);
|
||||||
|
|
||||||
|
PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
|
||||||
|
Py_UNICODE ch /* Unicode character */
|
||||||
|
);
|
||||||
|
|
||||||
PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
|
PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
|
||||||
const Py_UNICODE ch /* Unicode character */
|
const Py_UNICODE ch /* Unicode character */
|
||||||
);
|
);
|
||||||
|
|
|
@ -19,6 +19,8 @@
|
||||||
#define SPACE_MASK 0x20
|
#define SPACE_MASK 0x20
|
||||||
#define TITLE_MASK 0x40
|
#define TITLE_MASK 0x40
|
||||||
#define UPPER_MASK 0x80
|
#define UPPER_MASK 0x80
|
||||||
|
#define XID_START_MASK 0x100
|
||||||
|
#define XID_CONTINUE_MASK 0x200
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
const Py_UNICODE upper;
|
const Py_UNICODE upper;
|
||||||
|
@ -98,6 +100,26 @@ int _PyUnicode_IsTitlecase(Py_UNICODE ch)
|
||||||
return (ctype->flags & TITLE_MASK) != 0;
|
return (ctype->flags & TITLE_MASK) != 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Returns 1 for Unicode characters having the XID_Start property, 0
|
||||||
|
otherwise. */
|
||||||
|
|
||||||
|
int _PyUnicode_IsXidStart(Py_UNICODE ch)
|
||||||
|
{
|
||||||
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); /* type record looked up for ch */
|
||||||
|
|
||||||
|
return (ctype->flags & XID_START_MASK) != 0; /* 1 if the XID_START_MASK bit is set in flags, else 0 */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Returns 1 for Unicode characters having the XID_Continue property,
|
||||||
|
0 otherwise. */
|
||||||
|
|
||||||
|
int _PyUnicode_IsXidContinue(Py_UNICODE ch)
|
||||||
|
{
|
||||||
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); /* type record looked up for ch */
|
||||||
|
|
||||||
|
return (ctype->flags & XID_CONTINUE_MASK) != 0; /* 1 if the XID_CONTINUE_MASK bit is set in flags, else 0 */
|
||||||
|
}
|
||||||
|
|
||||||
/* Returns the integer decimal (0-9) for Unicode characters having
|
/* Returns the integer decimal (0-9) for Unicode characters having
|
||||||
this property, -1 otherwise. */
|
this property, -1 otherwise. */
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -34,6 +34,7 @@ UNIDATA_VERSION = "4.1.0"
|
||||||
UNICODE_DATA = "UnicodeData%s.txt"
|
UNICODE_DATA = "UnicodeData%s.txt"
|
||||||
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
|
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
|
||||||
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
|
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
|
||||||
|
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
|
||||||
|
|
||||||
old_versions = ["3.2.0"]
|
old_versions = ["3.2.0"]
|
||||||
|
|
||||||
|
@ -57,6 +58,8 @@ LINEBREAK_MASK = 0x10
|
||||||
SPACE_MASK = 0x20
|
SPACE_MASK = 0x20
|
||||||
TITLE_MASK = 0x40
|
TITLE_MASK = 0x40
|
||||||
UPPER_MASK = 0x80
|
UPPER_MASK = 0x80
|
||||||
|
XID_START_MASK = 0x100
|
||||||
|
XID_CONTINUE_MASK = 0x200
|
||||||
|
|
||||||
def maketables(trace=0):
|
def maketables(trace=0):
|
||||||
|
|
||||||
|
@ -65,16 +68,18 @@ def maketables(trace=0):
|
||||||
version = ""
|
version = ""
|
||||||
unicode = UnicodeData(UNICODE_DATA % version,
|
unicode = UnicodeData(UNICODE_DATA % version,
|
||||||
COMPOSITION_EXCLUSIONS % version,
|
COMPOSITION_EXCLUSIONS % version,
|
||||||
EASTASIAN_WIDTH % version)
|
EASTASIAN_WIDTH % version,
|
||||||
|
DERIVED_CORE_PROPERTIES % version)
|
||||||
|
|
||||||
print(len(filter(None, unicode.table)), "characters")
|
print(len(list(filter(None, unicode.table))), "characters")
|
||||||
|
|
||||||
for version in old_versions:
|
for version in old_versions:
|
||||||
print("--- Reading", UNICODE_DATA % ("-"+version), "...")
|
print("--- Reading", UNICODE_DATA % ("-"+version), "...")
|
||||||
old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
|
old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
|
||||||
COMPOSITION_EXCLUSIONS % ("-"+version),
|
COMPOSITION_EXCLUSIONS % ("-"+version),
|
||||||
EASTASIAN_WIDTH % ("-"+version))
|
EASTASIAN_WIDTH % ("-"+version),
|
||||||
print(len(filter(None, old_unicode.table)), "characters")
|
DERIVED_CORE_PROPERTIES % ("-"+version))
|
||||||
|
print(len(list(filter(None, old_unicode.table))), "characters")
|
||||||
merge_old_version(version, unicode, old_unicode)
|
merge_old_version(version, unicode, old_unicode)
|
||||||
|
|
||||||
makeunicodename(unicode, trace)
|
makeunicodename(unicode, trace)
|
||||||
|
@ -148,7 +153,7 @@ def makeunicodedata(unicode, trace):
|
||||||
assert prefix < 256
|
assert prefix < 256
|
||||||
# content
|
# content
|
||||||
decomp = [prefix + (len(decomp)<<8)] +\
|
decomp = [prefix + (len(decomp)<<8)] +\
|
||||||
map(lambda s: int(s, 16), decomp)
|
list(map(lambda s: int(s, 16), decomp))
|
||||||
# Collect NFC pairs
|
# Collect NFC pairs
|
||||||
if not prefix and len(decomp) == 3 and \
|
if not prefix and len(decomp) == 3 and \
|
||||||
char not in unicode.exclusions and \
|
char not in unicode.exclusions and \
|
||||||
|
@ -353,6 +358,7 @@ def makeunicodetype(unicode, trace):
|
||||||
# extract database properties
|
# extract database properties
|
||||||
category = record[2]
|
category = record[2]
|
||||||
bidirectional = record[4]
|
bidirectional = record[4]
|
||||||
|
properties = record[16]
|
||||||
flags = 0
|
flags = 0
|
||||||
if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
|
if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
|
||||||
flags |= ALPHA_MASK
|
flags |= ALPHA_MASK
|
||||||
|
@ -366,6 +372,10 @@ def makeunicodetype(unicode, trace):
|
||||||
flags |= TITLE_MASK
|
flags |= TITLE_MASK
|
||||||
if category == "Lu":
|
if category == "Lu":
|
||||||
flags |= UPPER_MASK
|
flags |= UPPER_MASK
|
||||||
|
if "XID_Start" in properties:
|
||||||
|
flags |= XID_START_MASK
|
||||||
|
if "XID_Continue" in properties:
|
||||||
|
flags |= XID_CONTINUE_MASK
|
||||||
# use delta predictor for upper/lower/title
|
# use delta predictor for upper/lower/title
|
||||||
if record[12]:
|
if record[12]:
|
||||||
upper = int(record[12], 16) - char
|
upper = int(record[12], 16) - char
|
||||||
|
@ -447,7 +457,7 @@ def makeunicodename(unicode, trace):
|
||||||
if name and name[0] != "<":
|
if name and name[0] != "<":
|
||||||
names[char] = name + chr(0)
|
names[char] = name + chr(0)
|
||||||
|
|
||||||
print(len(filter(lambda n: n is not None, names)), "distinct names")
|
print(len(list(filter(lambda n: n is not None, names))), "distinct names")
|
||||||
|
|
||||||
# collect unique words from names (note that we differ between
|
# collect unique words from names (note that we differ between
|
||||||
# words inside a sentence, and words ending a sentence. the
|
# words inside a sentence, and words ending a sentence. the
|
||||||
|
@ -470,10 +480,12 @@ def makeunicodename(unicode, trace):
|
||||||
|
|
||||||
print(n, "words in text;", b, "bytes")
|
print(n, "words in text;", b, "bytes")
|
||||||
|
|
||||||
wordlist = words.items()
|
wordlist = list(words.items())
|
||||||
|
|
||||||
# sort on falling frequency, then by name
|
# sort on falling frequency, then by name
|
||||||
def cmpwords((aword, alist),(bword, blist)):
|
def cmpwords(a,b):
|
||||||
|
aword, alist = a
|
||||||
|
bword, blist = b
|
||||||
r = -cmp(len(alist),len(blist))
|
r = -cmp(len(alist),len(blist))
|
||||||
if r:
|
if r:
|
||||||
return r
|
return r
|
||||||
|
@ -526,7 +538,7 @@ def makeunicodename(unicode, trace):
|
||||||
words[w] = len(lexicon_offset)
|
words[w] = len(lexicon_offset)
|
||||||
lexicon_offset.append(o)
|
lexicon_offset.append(o)
|
||||||
|
|
||||||
lexicon = map(ord, lexicon)
|
lexicon = list(map(ord, lexicon))
|
||||||
|
|
||||||
# generate phrasebook from names and lexicon
|
# generate phrasebook from names and lexicon
|
||||||
phrasebook = [0]
|
phrasebook = [0]
|
||||||
|
@ -660,11 +672,14 @@ def merge_old_version(version, new, old):
|
||||||
elif k == 14:
|
elif k == 14:
|
||||||
# change to simple titlecase mapping; ignore
|
# change to simple titlecase mapping; ignore
|
||||||
pass
|
pass
|
||||||
|
elif k == 16:
|
||||||
|
# derived property changes; not yet
|
||||||
|
pass
|
||||||
else:
|
else:
|
||||||
class Difference(Exception):pass
|
class Difference(Exception):pass
|
||||||
raise Difference, (hex(i), k, old.table[i], new.table[i])
|
raise Difference, (hex(i), k, old.table[i], new.table[i])
|
||||||
new.changed.append((version, zip(bidir_changes, category_changes,
|
new.changed.append((version, list(zip(bidir_changes, category_changes,
|
||||||
decimal_changes, numeric_changes),
|
decimal_changes, numeric_changes)),
|
||||||
normalization_changes))
|
normalization_changes))
|
||||||
|
|
||||||
|
|
||||||
|
@ -677,8 +692,14 @@ def merge_old_version(version, new, old):
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
class UnicodeData:
|
class UnicodeData:
|
||||||
|
# Record structure:
|
||||||
|
# [ID, name, category, combining, bidi, decomp, (6)
|
||||||
|
# decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
|
||||||
|
# ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
|
||||||
|
# derived-props] (17)
|
||||||
|
|
||||||
def __init__(self, filename, exclusions, eastasianwidth, expand=1):
|
def __init__(self, filename, exclusions, eastasianwidth,
|
||||||
|
derivedprops, expand=1):
|
||||||
self.changed = []
|
self.changed = []
|
||||||
file = open(filename)
|
file = open(filename)
|
||||||
table = [None] * 0x110000
|
table = [None] * 0x110000
|
||||||
|
@ -742,6 +763,28 @@ class UnicodeData:
|
||||||
if table[i] is not None:
|
if table[i] is not None:
|
||||||
table[i].append(widths[i])
|
table[i].append(widths[i])
|
||||||
|
|
||||||
|
for i in range(0, 0x110000):
|
||||||
|
if table[i] is not None:
|
||||||
|
table[i].append(set())
|
||||||
|
for s in open(derivedprops):
|
||||||
|
s = s.split('#', 1)[0].strip()
|
||||||
|
if not s:
|
||||||
|
continue
|
||||||
|
|
||||||
|
r, p = s.split(";")
|
||||||
|
r = r.strip()
|
||||||
|
p = p.strip()
|
||||||
|
if ".." in r:
|
||||||
|
first, last = [int(c, 16) for c in r.split('..')]
|
||||||
|
chars = range(first, last+1)
|
||||||
|
else:
|
||||||
|
chars = [int(r, 16)]
|
||||||
|
for char in chars:
|
||||||
|
if table[char]:
|
||||||
|
# Some properties (e.g. Default_Ignorable_Code_Point)
|
||||||
|
# apply to unassigned code points; ignore them
|
||||||
|
table[char][-1].add(p)
|
||||||
|
|
||||||
def uselatin1(self):
|
def uselatin1(self):
|
||||||
# restrict character range to ISO Latin 1
|
# restrict character range to ISO Latin 1
|
||||||
self.chars = range(256)
|
self.chars = range(256)
|
||||||
|
|
Loading…
Reference in New Issue