- SF #962502: Add two more methods for unicode type; width() and

iswide() for east asian width manipulation. (Inspired by David
Goodger, Reviewed by Martin v. Loewis)
- Move _PyUnicode_TypeRecord.flags to the end of the struct so that
no padding is added for UCS-4 builds. (Suggested by Martin v. Loewis)
This commit is contained in:
Hye-Shik Chang 2004-06-02 16:49:17 +00:00
parent b6568b91fd
commit 974ed7cfa5
11 changed files with 683 additions and 459 deletions

View File

@ -850,6 +850,11 @@ functions depending on the Python configuration.
character.
\end{cfuncdesc}
\begin{cfuncdesc}{int}{Py_UNICODE_ISWIDE}{Py_UNICODE ch}
Returns 1/0 depending on whether \var{ch} is a wide or full-width
character.
\end{cfuncdesc}
These APIs can be used for fast direct character conversions:
\begin{cfuncdesc}{Py_UNICODE}{Py_UNICODE_TOLOWER}{Py_UNICODE ch}
@ -908,6 +913,10 @@ use these APIs:
Return the length of the Unicode object.
\end{cfuncdesc}
\begin{cfuncdesc}{int}{PyUnicode_GetWidth}{PyObject *unicode}
Return the length of the fixed-width representation of the Unicode
object: each wide or full-width character counts as 2 and every other
character counts as 1.
\end{cfuncdesc}
\begin{cfuncdesc}{PyObject*}{PyUnicode_FromEncodedObject}{PyObject *obj,
const char *encoding,
const char *errors}

View File

@ -642,6 +642,12 @@ Return true if all cased characters in the string are uppercase and
there is at least one cased character, false otherwise.
\end{methoddesc}
\begin{methoddesc}[string]{iswide}{}
Return true if all characters in the string are wide or full width and
there is at least one wide or full width character, false otherwise.
This method is supported by the unicode type only.
\end{methoddesc}
\begin{methoddesc}[string]{join}{seq}
Return a string which is the concatenation of the strings in the
sequence \var{seq}. The separator between elements is the string
@ -774,6 +780,11 @@ character mapping codec using the \refmodule{codecs} module (see
Return a copy of the string converted to uppercase.
\end{methoddesc}
\begin{methoddesc}[string]{width}{}
Return the length of the fixed-width representation of the string,
counting each wide or full-width character as 2 and every other
character as 1. This method is supported by the unicode type only.
\end{methoddesc}
\begin{methoddesc}[string]{zfill}{width}
Return the numeric string left filled with zeros in a string
of length \var{width}. The original string is returned if

View File

@ -180,6 +180,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
# define PyUnicode_GetMax PyUnicodeUCS2_GetMax
# define PyUnicode_GetSize PyUnicodeUCS2_GetSize
# define PyUnicode_GetWidth PyUnicodeUCS2_GetWidth
# define PyUnicode_Join PyUnicodeUCS2_Join
# define PyUnicode_Replace PyUnicodeUCS2_Replace
# define PyUnicode_Resize PyUnicodeUCS2_Resize
@ -199,6 +200,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak
# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
# define _PyUnicode_IsWide _PyUnicodeUCS2_IsWide
# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
# define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
# define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
@ -252,6 +254,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
# define PyUnicode_GetMax PyUnicodeUCS4_GetMax
# define PyUnicode_GetSize PyUnicodeUCS4_GetSize
# define PyUnicode_GetWidth PyUnicodeUCS4_GetWidth
# define PyUnicode_Join PyUnicodeUCS4_Join
# define PyUnicode_Replace PyUnicodeUCS4_Replace
# define PyUnicode_Resize PyUnicodeUCS4_Resize
@ -270,6 +273,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak
# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
# define _PyUnicode_IsWide _PyUnicodeUCS4_IsWide
# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
# define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
# define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
@ -315,6 +319,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
#define Py_UNICODE_ISWIDE(ch) _PyUnicode_IsWide(ch)
#else
#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
@ -338,6 +344,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
#define Py_UNICODE_ISWIDE(ch) _PyUnicode_IsWide(ch)
#endif
#define Py_UNICODE_ISALNUM(ch) \
@ -430,6 +438,12 @@ PyAPI_FUNC(int) PyUnicode_GetSize(
PyObject *unicode /* Unicode object */
);
/* Get the fixed-width representation length of the Unicode object:
   every character for which Py_UNICODE_ISWIDE() is true counts as 2
   and every other character counts as 1.
   Returns -1 after calling PyErr_BadArgument() if the argument is not
   a Unicode object. */
PyAPI_FUNC(int) PyUnicode_GetWidth(
PyObject *unicode /* Unicode object */
);
/* Get the maximum ordinal for a Unicode character. */
PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
@ -1151,6 +1165,10 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Py_UNICODE ch /* Unicode character */
);
/* Returns 1 for Unicode characters having Full or Wide width,
   0 otherwise. */
PyAPI_FUNC(int) _PyUnicode_IsWide(
Py_UNICODE ch /* Unicode character */
);
#ifdef __cplusplus
}
#endif

View File

@ -291,6 +291,26 @@ class UnicodeTest(
self.assertRaises(TypeError, u"abc".isnumeric, 42)
def test_iswide(self):
    # Each entry is (expected result, input); entries are checked in
    # order through checkequalnofix with the 'iswide' method name.
    cases = [
        (False, u''),
        (False, u'\x1f'),    # Neutral
        (False, u'\x20'),    # Narrow
        (True, u'\u2329'),   # Wide
        (False, u'\uff64'),  # Half
        (True, u'\u3000'),   # Full
        (False, u'\u2460'),  # Ambiguous
        (True, u'\ud55c\uae00'),
        (False, u'\ud55c\u2606\uae00'),
    ]
    for expected, text in cases:
        self.checkequalnofix(expected, text, 'iswide')
def test_wide(self):
    # Each entry is (input, expected width()); entries are checked in
    # order.
    cases = [
        (u'', 0),
        (u'abcd', 4),
        (u'\u0187\u01c9', 2),
        (u'\u2460\u2329', 3),
        (u'\u2329\u2460', 3),
        (u'\ud55c\uae00', 4),
        (u'\ud55c\u2606\uae00', 5),
    ]
    for text, expected in cases:
        self.assertEqual(text.width(), expected)
def test_contains(self):
# Testing Unicode contains method
self.assert_('a' in u'abdb')

View File

@ -12,6 +12,9 @@ What's New in Python 2.4 alpha 1?
Core and builtins
-----------------
- The unicode type gained two new methods: iswide() and width(). They
report East Asian Width information as defined in Unicode TR11.
- Improved the tuple hashing algorithm to give fewer collisions in
common cases. Fixes bug #942952.

View File

@ -1,4 +1,4 @@
/* this file was generated by Tools/unicode/makeunicodedata.py 2.2 */
/* this file was generated by Tools/unicode/makeunicodedata.py 2.3 */
#define UNIDATA_VERSION "3.2.0"
/* a list of unique database records */

View File

@ -1,4 +1,4 @@
/* this file was generated by Tools/unicode/makeunicodedata.py 2.2 */
/* this file was generated by Tools/unicode/makeunicodedata.py 2.3 */
#define NAME_MAXLEN 256

View File

@ -19,14 +19,15 @@
#define SPACE_MASK 0x20
#define TITLE_MASK 0x40
#define UPPER_MASK 0x80
#define WIDE_MASK 0x100
/* One record of per-character type information.

   NOTE(review): the member order presumably has to match the
   (upper, lower, title, decimal, digit, flags) item order produced by
   makeunicodetype() in Tools/unicode/makeunicodedata.py -- confirm
   against the generated table before reordering. */
typedef struct {
    const Py_UNICODE upper;
    const Py_UNICODE lower;
    const Py_UNICODE title;
    const unsigned char decimal;
    const unsigned char digit;
    /* Bit set built from the *_MASK values.  Declared exactly once and
       kept as the last member so that no padding is added for UCS-4
       builds. */
    const unsigned short flags;
} _PyUnicode_TypeRecord;
#include "unicodetype_db.h"
@ -322,6 +323,15 @@ int _PyUnicode_IsNumeric(Py_UNICODE ch)
return 1;
}
/* Tell whether ch has Full or Wide width: yields 1 when the WIDE_MASK
   bit is set in the flags of the character's type record, and 0 when
   it is not. */
int _PyUnicode_IsWide(Py_UNICODE ch)
{
    if (gettyperecord(ch)->flags & WIDE_MASK)
        return 1;
    return 0;
}
#ifndef WANT_WCTYPE_FUNCTIONS
/* Returns 1 for Unicode characters having the bidirectional type

View File

@ -655,6 +655,27 @@ int PyUnicode_GetSize(PyObject *unicode)
return -1;
}
/* Compute the fixed-width representation length of a Unicode object.

   Every character for which Py_UNICODE_ISWIDE() is true contributes 2
   to the result; every other character contributes 1.  If the argument
   is not a Unicode object, PyErr_BadArgument() is called and -1 is
   returned.

   NOTE(review): the total is accumulated in a plain int with no
   overflow check. */
int PyUnicode_GetWidth(PyObject *unicode)
{
    const Py_UNICODE *cur, *end;
    int total = 0;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return -1;
    }

    cur = PyUnicode_AS_UNICODE(unicode);
    end = cur + PyUnicode_GET_SIZE(unicode);
    while (cur < end) {
        total += Py_UNICODE_ISWIDE(*cur) ? 2 : 1;
        cur++;
    }
    return total;
}
const char *PyUnicode_GetDefaultEncoding(void)
{
return unicode_default_encoding;
@ -5316,6 +5337,35 @@ unicode_isnumeric(PyUnicodeObject *self)
return PyBool_FromLong(1);
}
PyDoc_STRVAR(iswide__doc__,
"S.iswide() -> bool\n\
\n\
Return True if all characters in S are wide width\n\
and there is at least one character in S, False otherwise.");

/* Implementation of unicode.iswide().

   Returns a new reference to True when S is non-empty and
   Py_UNICODE_ISWIDE() holds for every character of S; returns a new
   reference to False otherwise (including for the empty string). */
static PyObject*
unicode_iswide(PyUnicodeObject *self)
{
    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
    register const Py_UNICODE *e;

    /* Shortcut for single character strings */
    if (PyUnicode_GET_SIZE(self) == 1 &&
        Py_UNICODE_ISWIDE(*p))
        Py_RETURN_TRUE;

    /* Special case for empty strings.  self is a unicode object, so
       its length is read with PyUnicode_GET_SIZE rather than the
       str-object accessor PyString_GET_SIZE. */
    if (PyUnicode_GET_SIZE(self) == 0)
        Py_RETURN_FALSE;

    e = p + PyUnicode_GET_SIZE(self);
    for (; p < e; p++) {
        /* A single non-wide character makes the whole answer False. */
        if (!Py_UNICODE_ISWIDE(*p))
            Py_RETURN_FALSE;
    }
    Py_RETURN_TRUE;
}
PyDoc_STRVAR(join__doc__,
"S.join(sequence) -> unicode\n\
\n\
@ -5335,7 +5385,7 @@ unicode_length(PyUnicodeObject *self)
}
/* Docstring for unicode.ljust().  The signature line appears exactly
   once and advertises a unicode result, matching the description
   below ("Return S left justified in a Unicode string ..."). */
PyDoc_STRVAR(ljust__doc__,
"S.ljust(width[, fillchar]) -> unicode\n\
\n\
Return S left justified in a Unicode string of length width. Padding is\n\
done using the specified fill character (default is a space).");
@ -5927,6 +5977,21 @@ unicode_upper(PyUnicodeObject *self)
return fixup(self, fixupper);
}
/* Docstring for unicode.width().  The method returns an integer
   (built with PyInt_FromLong below), so the signature line says
   "-> int". */
PyDoc_STRVAR(width__doc__,
"S.width() -> int\n\
\n\
Return a fixed-width representation length of S.");

/* Implementation of unicode.width().

   Wraps PyUnicode_GetWidth(): returns a new int object holding the
   width, or NULL when PyUnicode_GetWidth() reports failure with -1. */
static PyObject*
unicode_width(PyObject *self)
{
    int width = PyUnicode_GetWidth(self);

    /* -1 is the error return of PyUnicode_GetWidth(); propagate it. */
    if (width == -1)
        return NULL;
    return PyInt_FromLong((long)width);
}
PyDoc_STRVAR(zfill__doc__,
"S.zfill(width) -> unicode\n\
\n\
@ -6090,6 +6155,8 @@ static PyMethodDef unicode_methods[] = {
{"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
{"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
{"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
{"iswide", (PyCFunction) unicode_iswide, METH_NOARGS, iswide__doc__},
{"width", (PyCFunction) unicode_width, METH_NOARGS, width__doc__},
{"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
#if 0
{"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},

File diff suppressed because it is too large Load Diff

View File

@ -18,6 +18,7 @@
# 2002-10-22 mvl generate NFC tables
# 2002-11-24 mvl expand all ranges, sort names version-independently
# 2002-11-25 mvl add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
@ -25,12 +26,13 @@
import sys
SCRIPT = sys.argv[0]
VERSION = "2.2"
VERSION = "2.3"
# The Unicode Database
UNIDATA_VERSION = "3.2.0"
UNICODE_DATA = "UnicodeData.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt"
EASTASIAN_WIDTH = "EastAsianWidth.txt"
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
@ -50,12 +52,14 @@ LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
WIDE_MASK = 0x100
def maketables(trace=0):
print "--- Reading", UNICODE_DATA, "..."
unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS)
unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS,
EASTASIAN_WIDTH)
print len(filter(None, unicode.table)), "characters"
@ -330,8 +334,10 @@ def makeunicodetype(unicode, trace):
if record[7]:
flags |= DIGIT_MASK
digit = int(record[7])
if record[15] in ('W', 'F'): # Wide or Full width
flags |= WIDE_MASK
item = (
flags, upper, lower, title, decimal, digit
upper, lower, title, decimal, digit, flags
)
# add entry to index and item tables
i = cache.get(item)
@ -538,7 +544,7 @@ import sys
class UnicodeData:
def __init__(self, filename, exclusions, expand=1):
def __init__(self, filename, exclusions, eastasianwidth, expand=1):
file = open(filename)
table = [None] * 0x110000
while 1:
@ -581,6 +587,25 @@ class UnicodeData:
char = int(s.split()[0],16)
self.exclusions[char] = 1
widths = [None] * 0x110000
for s in open(eastasianwidth):
s = s.strip()
if not s:
continue
if s[0] == '#':
continue
s = s.split()[0].split(';')
if '..' in s[0]:
first, last = [int(c, 16) for c in s[0].split('..')]
chars = range(first, last+1)
else:
chars = [int(s[0], 16)]
for char in chars:
widths[char] = s[1]
for i in range(0, 0x110000):
if table[i] is not None:
table[i].append(widths[i])
def uselatin1(self):
# restrict character range to ISO Latin 1
self.chars = range(256)