Additional test and documentation for the unicode() changes.

This patch should also be applied to the 2.2b1 trunk.
This commit is contained in:
Marc-André Lemburg 2001-10-19 12:02:29 +00:00
parent f6fb171c9d
commit b5507ecd3c
5 changed files with 93 additions and 37 deletions

View File

@@ -758,19 +758,33 @@ def my_import(name):
\versionadded{2.0} \versionadded{2.0}
\end{funcdesc} \end{funcdesc}
\begin{funcdesc}{unicode}{string\optional{, encoding\optional{, errors}}} \begin{funcdesc}{unicode}{object\optional{, encoding\optional{, errors}}}
Create a Unicode string from an 8-bit string \var{string} using the Return the Unicode string version of \var{object} using one of the
codec for \var{encoding}. The \var{encoding} parameter is a string following modes:
giving the name of an encoding. Error handling is done according to
\var{errors}; this specifies the treatment of characters which are If \var{encoding} and/or \var{errors} are given, \code{unicode()}
invalid in the input encoding. If \var{errors} is \code{'strict'} will decode the object which can either be an 8-bit string or a
(the default), a \exception{ValueError} is raised on errors, while a character buffer using the codec for \var{encoding}. The
value of \code{'ignore'} causes errors to be silently ignored, and a \var{encoding} parameter is a string giving the name of an encoding.
value of \code{'replace'} causes the official Unicode replacement Error handling is done according to \var{errors}; this specifies the
character, \code{U+FFFD}, to be used to replace input characters treatment of characters which are invalid in the input encoding. If
which cannot be decoded. The default behavior is to decode UTF-8 in \var{errors} is \code{'strict'} (the default), a
strict mode, meaning that encoding errors raise \exception{ValueError} is raised on errors, while a value of
\exception{ValueError}. See also the \refmodule{codecs} module. \code{'ignore'} causes errors to be silently ignored, and a value of
\code{'replace'} causes the official Unicode replacement character,
\code{U+FFFD}, to be used to replace input characters which cannot
be decoded. See also the \refmodule{codecs} module.
If no optional parameters are given, \code{unicode()} will mimic the
behaviour of \code{str()} except that it returns Unicode strings
instead of 8-bit strings. More precisely, if \var{object} is an
Unicode string or subclass it will return a Unicode string without
any additional decoding applied. For objects which provide a
\code{__unicode__} method, it will call this method without
arguments to create a Unicode string. For all other objects, the
8-bit string version or representation is requested and then
converted to a Unicode string using the codec for the default
encoding in \code{'strict'} mode.
\versionadded{2.0} \versionadded{2.0}
\end{funcdesc} \end{funcdesc}

View File

@@ -2,6 +2,7 @@ test_unicode
Testing Unicode comparisons... done. Testing Unicode comparisons... done.
Testing Unicode contains method... done. Testing Unicode contains method... done.
Testing Unicode formatting strings... done. Testing Unicode formatting strings... done.
Testing builtin unicode()... done.
Testing builtin codecs... done. Testing builtin codecs... done.
Testing standard mapping codecs... 0-127... 128-255... done. Testing standard mapping codecs... 0-127... 128-255... done.
Testing Unicode string concatenation... done. Testing Unicode string concatenation... done.

View File

@@ -389,6 +389,67 @@ verify('%i %*.*s' % (10, 5,3,u'abc',) == u'10   abc')
verify('%i%s %*.*s' % (10, 3, 5,3,u'abc',) == u'103 abc') verify('%i%s %*.*s' % (10, 3, 5,3,u'abc',) == u'103 abc')
print 'done.' print 'done.'
print 'Testing builtin unicode()...',
# unicode(obj) tests (this maps to PyObject_Unicode() at C level)
verify(unicode(u'unicode remains unicode') == u'unicode remains unicode')
class UnicodeSubclass(unicode):
pass
verify(unicode(UnicodeSubclass('unicode subclass becomes unicode'))
== u'unicode subclass becomes unicode')
verify(unicode('strings are converted to unicode')
== u'strings are converted to unicode')
class UnicodeCompat:
def __init__(self, x):
self.x = x
def __unicode__(self):
return self.x
verify(unicode(UnicodeCompat('__unicode__ compatible objects are recognized'))
== u'__unicode__ compatible objects are recognized')
class StringCompat:
def __init__(self, x):
self.x = x
def __str__(self):
return self.x
verify(unicode(StringCompat('__str__ compatible objects are recognized'))
== u'__str__ compatible objects are recognized')
# unicode(obj) is compatible to str():
o = StringCompat('unicode(obj) is compatible to str()')
verify(unicode(o) == u'unicode(obj) is compatible to str()')
verify(str(o) == 'unicode(obj) is compatible to str()')
for obj in (123, 123.45, 123L):
verify(unicode(obj) == unicode(str(obj)))
# unicode(obj, encoding, error) tests (this maps to
# PyUnicode_FromEncodedObject() at C level)
try:
unicode(u'decoding unicode is not supported', 'utf-8', 'strict')
except TypeError:
pass
else:
raise TestFailed, "decoding unicode should NOT be supported"
verify(unicode('strings are decoded to unicode', 'utf-8', 'strict')
== u'strings are decoded to unicode')
verify(unicode(buffer('character buffers are decoded to unicode'),
'utf-8', 'strict')
== u'character buffers are decoded to unicode')
print 'done.'
# Test builtin codecs # Test builtin codecs
print 'Testing builtin codecs...', print 'Testing builtin codecs...',
@@ -437,32 +498,11 @@ verify(unicode(''.join((chr(0xe2), chr(0x82), chr(0xac))),
# * strict decoding testing for all of the # * strict decoding testing for all of the
# UTF8_ERROR cases in PyUnicode_DecodeUTF8 # UTF8_ERROR cases in PyUnicode_DecodeUTF8
verify(unicode('hello','ascii') == u'hello') verify(unicode('hello','ascii') == u'hello')
verify(unicode('hello','utf-8') == u'hello') verify(unicode('hello','utf-8') == u'hello')
verify(unicode('hello','utf8') == u'hello') verify(unicode('hello','utf8') == u'hello')
verify(unicode('hello','latin-1') == u'hello') verify(unicode('hello','latin-1') == u'hello')
# Compatibility to str():
class String:
x = ''
def __str__(self):
return self.x
o = String()
o.x = 'abc'
verify(unicode(o) == u'abc')
verify(str(o) == 'abc')
o.x = u'abc'
verify(unicode(o) == u'abc')
verify(str(o) == 'abc')
for obj in (123, 123.45, 123L):
verify(unicode(obj) == unicode(str(obj)))
# Error handling # Error handling
try: try:
u'Andr\202 x'.encode('ascii') u'Andr\202 x'.encode('ascii')

View File

@@ -44,7 +44,7 @@ Core and builtins
- unicode(obj) now behaves more like str(obj), accepting arbitrary - unicode(obj) now behaves more like str(obj), accepting arbitrary
objects, and calling a __unicode__ method if it exists. objects, and calling a __unicode__ method if it exists.
unicode(obj, encoding) and unicode(obj, encoding, errors) still unicode(obj, encoding) and unicode(obj, encoding, errors) still
require an 8-bit string argument. require an 8-bit string or character buffer argument.
- isinstance() now allows any object as the first argument and a - isinstance() now allows any object as the first argument and a
class, a type or something with a __bases__ tuple attribute for the class, a type or something with a __bases__ tuple attribute for the

View File

@@ -426,8 +426,9 @@ PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
#if 0 #if 0
/* For b/w compatibility we also accept Unicode objects provided /* For b/w compatibility we also accept Unicode objects provided
that no encodings is given and then redirect to PyObject_Unicode() that no encodings is given and then redirect to
which then applies the additional logic for Unicode subclasses. PyObject_Unicode() which then applies the additional logic for
Unicode subclasses.
NOTE: This API should really only be used for object which NOTE: This API should really only be used for object which
represent *encoded* Unicode ! represent *encoded* Unicode !