- SF #962502: Add two more methods to the unicode type, width() and
iswide(), for querying East Asian width. (Inspired by David Goodger, reviewed by Martin v. Loewis) - Move _PyUnicode_TypeRecord.flags to the end of the struct so that no padding is added for UCS-4 builds. (Suggested by Martin v. Loewis)
This commit is contained in:
parent
b6568b91fd
commit
974ed7cfa5
|
@ -850,6 +850,11 @@ functions depending on the Python configuration.
|
|||
character.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{int}{Py_UNICODE_ISWIDE}{Py_UNICODE ch}
|
||||
Returns 1/0 depending on whether \var{ch} is a wide or full-width
|
||||
character.
|
||||
\end{cfuncdesc}
|
||||
|
||||
These APIs can be used for fast direct character conversions:
|
||||
|
||||
\begin{cfuncdesc}{Py_UNICODE}{Py_UNICODE_TOLOWER}{Py_UNICODE ch}
|
||||
|
@ -908,6 +913,10 @@ use these APIs:
|
|||
Return the length of the Unicode object.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{int}{PyUnicode_GetWidth}{PyObject *unicode}
|
||||
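Return the length of the fixed-width representation of the Unicode object, counting each wide or full-width character as 2 and every other character as 1. Returns \code{-1} with an exception set if \var{unicode} is not a Unicode object.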
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_FromEncodedObject}{PyObject *obj,
|
||||
const char *encoding,
|
||||
const char *errors}
|
||||
|
|
|
@ -642,6 +642,12 @@ Return true if all cased characters in the string are uppercase and
|
|||
there is at least one cased character, false otherwise.
|
||||
\end{methoddesc}
|
||||
|
||||
\begin{methoddesc}[string]{iswide}{}
|
||||
Return true if all characters in the string are wide or full width and
|
||||
there is at least one wide or full width character, false otherwise.
|
||||
This method is supported by unicode type only.
|
||||
\end{methoddesc}
|
||||
|
||||
\begin{methoddesc}[string]{join}{seq}
|
||||
Return a string which is the concatenation of the strings in the
|
||||
sequence \var{seq}. The separator between elements is the string
|
||||
|
@ -774,6 +780,11 @@ character mapping codec using the \refmodule{codecs} module (see
|
|||
Return a copy of the string converted to uppercase.
|
||||
\end{methoddesc}
|
||||
|
||||
\begin{methoddesc}[string]{width}{}
|
||||
Return the length of the fixed-width representation of the string, counting each wide or full-width character as two and every other character as one. This method
|
||||
is supported by unicode type only.
|
||||
\end{methoddesc}
|
||||
|
||||
\begin{methoddesc}[string]{zfill}{width}
|
||||
Return the numeric string left filled with zeros in a string
|
||||
of length \var{width}. The original string is returned if
|
||||
|
|
|
@ -180,6 +180,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
|||
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
|
||||
# define PyUnicode_GetMax PyUnicodeUCS2_GetMax
|
||||
# define PyUnicode_GetSize PyUnicodeUCS2_GetSize
|
||||
# define PyUnicode_GetWidth PyUnicodeUCS2_GetWidth
|
||||
# define PyUnicode_Join PyUnicodeUCS2_Join
|
||||
# define PyUnicode_Replace PyUnicodeUCS2_Replace
|
||||
# define PyUnicode_Resize PyUnicodeUCS2_Resize
|
||||
|
@ -199,6 +200,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
|||
# define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak
|
||||
# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
|
||||
# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
|
||||
# define _PyUnicode_IsWide _PyUnicodeUCS2_IsWide
|
||||
# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
|
||||
# define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
|
||||
# define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
|
||||
|
@ -252,6 +254,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
|||
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
|
||||
# define PyUnicode_GetMax PyUnicodeUCS4_GetMax
|
||||
# define PyUnicode_GetSize PyUnicodeUCS4_GetSize
|
||||
# define PyUnicode_GetWidth PyUnicodeUCS4_GetWidth
|
||||
# define PyUnicode_Join PyUnicodeUCS4_Join
|
||||
# define PyUnicode_Replace PyUnicodeUCS4_Replace
|
||||
# define PyUnicode_Resize PyUnicodeUCS4_Resize
|
||||
|
@ -270,6 +273,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
|||
# define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak
|
||||
# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
|
||||
# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
|
||||
# define _PyUnicode_IsWide _PyUnicodeUCS4_IsWide
|
||||
# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
|
||||
# define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
|
||||
# define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
|
||||
|
@ -315,6 +319,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
|||
|
||||
#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
|
||||
|
||||
#define Py_UNICODE_ISWIDE(ch) _PyUnicode_IsWide(ch)
|
||||
|
||||
#else
|
||||
|
||||
#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
|
||||
|
@ -338,6 +344,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
|||
|
||||
#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
|
||||
|
||||
#define Py_UNICODE_ISWIDE(ch) _PyUnicode_IsWide(ch)
|
||||
|
||||
#endif
|
||||
|
||||
#define Py_UNICODE_ISALNUM(ch) \
|
||||
|
@ -430,6 +438,12 @@ PyAPI_FUNC(int) PyUnicode_GetSize(
|
|||
PyObject *unicode /* Unicode object */
|
||||
);
|
||||
|
||||
/* Get the fixed-width representation length of the Unicode object */
|
||||
|
||||
PyAPI_FUNC(int) PyUnicode_GetWidth(
|
||||
PyObject *unicode /* Unicode object */
|
||||
);
|
||||
|
||||
/* Get the maximum ordinal for a Unicode character. */
|
||||
PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
|
||||
|
||||
|
@ -1151,6 +1165,10 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha(
|
|||
Py_UNICODE ch /* Unicode character */
|
||||
);
|
||||
|
||||
PyAPI_FUNC(int) _PyUnicode_IsWide(
|
||||
Py_UNICODE ch /* Unicode character */
|
||||
);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -291,6 +291,26 @@ class UnicodeTest(
|
|||
|
||||
self.assertRaises(TypeError, u"abc".isnumeric, 42)
|
||||
|
||||
def test_iswide(self):
    # Table of (expected iswide() result, sample string).  The trailing
    # note names the East Asian Width class the sample is meant to cover.
    cases = [
        (False, u''),                    # empty string
        (False, u'\x1f'),                # Neutral
        (False, u'\x20'),                # Narrow
        (True, u'\u2329'),               # Wide
        (False, u'\uff64'),              # Half
        (True, u'\u3000'),               # Full
        (False, u'\u2460'),              # Ambiguous
        (True, u'\ud55c\uae00'),         # every character is wide
        (False, u'\ud55c\u2606\uae00'),  # one non-wide character in the middle
    ]
    for expected, sample in cases:
        self.checkequalnofix(expected, sample, 'iswide')
|
||||
|
||||
def test_wide(self):
    # Exercises unicode.width(): each pair is (sample string, expected
    # fixed-width length).
    expected_widths = [
        (u'', 0),
        (u'abcd', 4),
        (u'\u0187\u01c9', 2),
        (u'\u2460\u2329', 3),
        (u'\u2329\u2460', 3),
        (u'\ud55c\uae00', 4),
        (u'\ud55c\u2606\uae00', 5),
    ]
    for sample, columns in expected_widths:
        self.assertEqual(sample.width(), columns)
|
||||
|
||||
def test_contains(self):
|
||||
# Testing Unicode contains method
|
||||
self.assert_('a' in u'abdb')
|
||||
|
|
|
@ -12,6 +12,9 @@ What's New in Python 2.4 alpha 1?
|
|||
Core and builtins
|
||||
-----------------
|
||||
|
||||
- Unicode type got two new methods: iswide() and width(). They
|
||||
report East Asian width information as defined in Unicode TR11.
|
||||
|
||||
- Improved the tuple hashing algorithm to give fewer collisions in
|
||||
common cases. Fixes bug #942952.
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* this file was generated by Tools/unicode/makeunicodedata.py 2.2 */
|
||||
/* this file was generated by Tools/unicode/makeunicodedata.py 2.3 */
|
||||
|
||||
#define UNIDATA_VERSION "3.2.0"
|
||||
/* a list of unique database records */
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* this file was generated by Tools/unicode/makeunicodedata.py 2.2 */
|
||||
/* this file was generated by Tools/unicode/makeunicodedata.py 2.3 */
|
||||
|
||||
#define NAME_MAXLEN 256
|
||||
|
||||
|
|
|
@ -19,14 +19,15 @@
|
|||
#define SPACE_MASK 0x20
|
||||
#define TITLE_MASK 0x40
|
||||
#define UPPER_MASK 0x80
|
||||
#define WIDE_MASK 0x100
|
||||
|
||||
typedef struct {
|
||||
const unsigned short flags;
|
||||
const Py_UNICODE upper;
|
||||
const Py_UNICODE lower;
|
||||
const Py_UNICODE title;
|
||||
const unsigned char decimal;
|
||||
const unsigned char digit;
|
||||
const unsigned short flags;
|
||||
} _PyUnicode_TypeRecord;
|
||||
|
||||
#include "unicodetype_db.h"
|
||||
|
@ -322,6 +323,15 @@ int _PyUnicode_IsNumeric(Py_UNICODE ch)
|
|||
return 1;
|
||||
}
|
||||
|
||||
/* Returns 1 for Unicode characters having Full or Wide width, 0 otherwise */
|
||||
|
||||
int _PyUnicode_IsWide(Py_UNICODE ch)
|
||||
{
|
||||
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
||||
|
||||
return (ctype->flags & WIDE_MASK) != 0;
|
||||
}
|
||||
|
||||
#ifndef WANT_WCTYPE_FUNCTIONS
|
||||
|
||||
/* Returns 1 for Unicode characters having the bidirectional type
|
||||
|
|
|
@ -655,6 +655,27 @@ int PyUnicode_GetSize(PyObject *unicode)
|
|||
return -1;
|
||||
}
|
||||
|
||||
/* Return the fixed-width representation length of a Unicode object:
   characters for which Py_UNICODE_ISWIDE() is true count as 2, all
   others count as 1.  If the argument is not a Unicode object,
   PyErr_BadArgument() is called and -1 is returned. */
int PyUnicode_GetWidth(PyObject *unicode)
{
    const Py_UNICODE *chars;
    int i, size;
    int total = 0;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return -1;
    }

    chars = PyUnicode_AS_UNICODE(unicode);
    size = PyUnicode_GET_SIZE(unicode);
    for (i = 0; i < size; i++)
        total += Py_UNICODE_ISWIDE(chars[i]) ? 2 : 1;

    return total;
}
|
||||
|
||||
const char *PyUnicode_GetDefaultEncoding(void)
|
||||
{
|
||||
return unicode_default_encoding;
|
||||
|
@ -5316,6 +5337,35 @@ unicode_isnumeric(PyUnicodeObject *self)
|
|||
return PyBool_FromLong(1);
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(iswide__doc__,
|
||||
"S.iswide() -> bool\n\
|
||||
\n\
|
||||
Return True if all characters in S are wide width\n\
|
||||
and there is at least one character in S, False otherwise.");
|
||||
|
||||
static PyObject*
|
||||
unicode_iswide(PyUnicodeObject *self)
|
||||
{
|
||||
register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
|
||||
register const Py_UNICODE *e;
|
||||
|
||||
/* Shortcut for single character strings */
|
||||
if (PyUnicode_GET_SIZE(self) == 1 &&
|
||||
Py_UNICODE_ISWIDE(*p))
|
||||
Py_RETURN_TRUE;
|
||||
|
||||
/* Special case for empty strings */
|
||||
if (PyString_GET_SIZE(self) == 0)
|
||||
Py_RETURN_FALSE;
|
||||
|
||||
e = p + PyUnicode_GET_SIZE(self);
|
||||
for (; p < e; p++) {
|
||||
if (!Py_UNICODE_ISWIDE(*p))
|
||||
Py_RETURN_FALSE;
|
||||
}
|
||||
Py_RETURN_TRUE;
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(join__doc__,
|
||||
"S.join(sequence) -> unicode\n\
|
||||
\n\
|
||||
|
@ -5335,7 +5385,7 @@ unicode_length(PyUnicodeObject *self)
|
|||
}
|
||||
|
||||
PyDoc_STRVAR(ljust__doc__,
|
||||
"S.ljust(width[, fillchar]) -> unicode\n\
|
||||
"S.ljust(width[, fillchar]) -> int\n\
|
||||
\n\
|
||||
Return S left justified in a Unicode string of length width. Padding is\n\
|
||||
done using the specified fill character (default is a space).");
|
||||
|
@ -5927,6 +5977,21 @@ unicode_upper(PyUnicodeObject *self)
|
|||
return fixup(self, fixupper);
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(width__doc__,
|
||||
"S.width() -> unicode\n\
|
||||
\n\
|
||||
Return a fixed-width representation length of S.");
|
||||
|
||||
static PyObject*
|
||||
unicode_width(PyObject *self)
|
||||
{
|
||||
int width = PyUnicode_GetWidth(self);
|
||||
if (width == -1)
|
||||
return NULL;
|
||||
else
|
||||
return PyInt_FromLong((long)width);
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(zfill__doc__,
|
||||
"S.zfill(width) -> unicode\n\
|
||||
\n\
|
||||
|
@ -6090,6 +6155,8 @@ static PyMethodDef unicode_methods[] = {
|
|||
{"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
|
||||
{"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
|
||||
{"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
|
||||
{"iswide", (PyCFunction) unicode_iswide, METH_NOARGS, iswide__doc__},
|
||||
{"width", (PyCFunction) unicode_width, METH_NOARGS, width__doc__},
|
||||
{"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
|
||||
#if 0
|
||||
{"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -18,6 +18,7 @@
|
|||
# 2002-10-22 mvl generate NFC tables
|
||||
# 2002-11-24 mvl expand all ranges, sort names version-independently
|
||||
# 2002-11-25 mvl add UNIDATA_VERSION
|
||||
# 2004-05-29 perky add east asian width information
|
||||
#
|
||||
# written by Fredrik Lundh (fredrik@pythonware.com)
|
||||
#
|
||||
|
@ -25,12 +26,13 @@
|
|||
import sys
|
||||
|
||||
SCRIPT = sys.argv[0]
|
||||
VERSION = "2.2"
|
||||
VERSION = "2.3"
|
||||
|
||||
# The Unicode Database
|
||||
UNIDATA_VERSION = "3.2.0"
|
||||
UNICODE_DATA = "UnicodeData.txt"
|
||||
COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt"
|
||||
EASTASIAN_WIDTH = "EastAsianWidth.txt"
|
||||
|
||||
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
|
||||
"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
|
||||
|
@ -50,12 +52,14 @@ LINEBREAK_MASK = 0x10
|
|||
SPACE_MASK = 0x20
|
||||
TITLE_MASK = 0x40
|
||||
UPPER_MASK = 0x80
|
||||
WIDE_MASK = 0x100
|
||||
|
||||
def maketables(trace=0):
|
||||
|
||||
print "--- Reading", UNICODE_DATA, "..."
|
||||
|
||||
unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS)
|
||||
unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS,
|
||||
EASTASIAN_WIDTH)
|
||||
|
||||
print len(filter(None, unicode.table)), "characters"
|
||||
|
||||
|
@ -330,8 +334,10 @@ def makeunicodetype(unicode, trace):
|
|||
if record[7]:
|
||||
flags |= DIGIT_MASK
|
||||
digit = int(record[7])
|
||||
if record[15] in ('W', 'F'): # Wide or Full width
|
||||
flags |= WIDE_MASK
|
||||
item = (
|
||||
flags, upper, lower, title, decimal, digit
|
||||
upper, lower, title, decimal, digit, flags
|
||||
)
|
||||
# add entry to index and item tables
|
||||
i = cache.get(item)
|
||||
|
@ -538,7 +544,7 @@ import sys
|
|||
|
||||
class UnicodeData:
|
||||
|
||||
def __init__(self, filename, exclusions, expand=1):
|
||||
def __init__(self, filename, exclusions, eastasianwidth, expand=1):
|
||||
file = open(filename)
|
||||
table = [None] * 0x110000
|
||||
while 1:
|
||||
|
@ -581,6 +587,25 @@ class UnicodeData:
|
|||
char = int(s.split()[0],16)
|
||||
self.exclusions[char] = 1
|
||||
|
||||
widths = [None] * 0x110000
|
||||
for s in open(eastasianwidth):
|
||||
s = s.strip()
|
||||
if not s:
|
||||
continue
|
||||
if s[0] == '#':
|
||||
continue
|
||||
s = s.split()[0].split(';')
|
||||
if '..' in s[0]:
|
||||
first, last = [int(c, 16) for c in s[0].split('..')]
|
||||
chars = range(first, last+1)
|
||||
else:
|
||||
chars = [int(s[0], 16)]
|
||||
for char in chars:
|
||||
widths[char] = s[1]
|
||||
for i in range(0, 0x110000):
|
||||
if table[i] is not None:
|
||||
table[i].append(widths[i])
|
||||
|
||||
def uselatin1(self):
|
||||
# restrict character range to ISO Latin 1
|
||||
self.chars = range(256)
|
||||
|
|
Loading…
Reference in New Issue