mirror of https://github.com/python/cpython
Additional test and documentation for the unicode() changes.
This patch should also be applied to the 2.2b1 trunk.
This commit is contained in:
parent
f6fb171c9d
commit
b5507ecd3c
|
@ -758,19 +758,33 @@ def my_import(name):
|
||||||
\versionadded{2.0}
|
\versionadded{2.0}
|
||||||
\end{funcdesc}
|
\end{funcdesc}
|
||||||
|
|
||||||
\begin{funcdesc}{unicode}{string\optional{, encoding\optional{, errors}}}
|
\begin{funcdesc}{unicode}{object\optional{, encoding\optional{, errors}}}
|
||||||
Create a Unicode string from an 8-bit string \var{string} using the
|
Return the Unicode string version of \var{object} using one of the
|
||||||
codec for \var{encoding}. The \var{encoding} parameter is a string
|
following modes:
|
||||||
giving the name of an encoding. Error handling is done according to
|
|
||||||
\var{errors}; this specifies the treatment of characters which are
|
If \var{encoding} and/or \var{errors} are given, \code{unicode()}
|
||||||
invalid in the input encoding. If \var{errors} is \code{'strict'}
|
will decode the object which can either be an 8-bit string or a
|
||||||
(the default), a \exception{ValueError} is raised on errors, while a
|
character buffer using the codec for \var{encoding}. The
|
||||||
value of \code{'ignore'} causes errors to be silently ignored, and a
|
\var{encoding} parameter is a string giving the name of an encoding.
|
||||||
value of \code{'replace'} causes the official Unicode replacement
|
Error handling is done according to \var{errors}; this specifies the
|
||||||
character, \code{U+FFFD}, to be used to replace input characters
|
treatment of characters which are invalid in the input encoding. If
|
||||||
which cannot be decoded. The default behavior is to decode UTF-8 in
|
\var{errors} is \code{'strict'} (the default), a
|
||||||
strict mode, meaning that encoding errors raise
|
\exception{ValueError} is raised on errors, while a value of
|
||||||
\exception{ValueError}. See also the \refmodule{codecs} module.
|
\code{'ignore'} causes errors to be silently ignored, and a value of
|
||||||
|
\code{'replace'} causes the official Unicode replacement character,
|
||||||
|
\code{U+FFFD}, to be used to replace input characters which cannot
|
||||||
|
be decoded. See also the \refmodule{codecs} module.
|
||||||
|
|
||||||
|
If no optional parameters are given, \code{unicode()} will mimic the
|
||||||
|
behaviour of \code{str()} except that it returns Unicode strings
|
||||||
|
instead of 8-bit strings. More precisely, if \var{object} is a
|
||||||
|
Unicode string or subclass it will return a Unicode string without
|
||||||
|
any additional decoding applied. For objects which provide a
|
||||||
|
\code{__unicode__} method, it will call this method without
|
||||||
|
arguments to create a Unicode string. For all other objects, the
|
||||||
|
8-bit string version or representation is requested and then
|
||||||
|
converted to a Unicode string using the codec for the default
|
||||||
|
encoding in \code{'strict'} mode.
|
||||||
\versionadded{2.0}
|
\versionadded{2.0}
|
||||||
\end{funcdesc}
|
\end{funcdesc}
|
||||||
|
|
||||||
|
|
|
@ -2,6 +2,7 @@ test_unicode
|
||||||
Testing Unicode comparisons... done.
|
Testing Unicode comparisons... done.
|
||||||
Testing Unicode contains method... done.
|
Testing Unicode contains method... done.
|
||||||
Testing Unicode formatting strings... done.
|
Testing Unicode formatting strings... done.
|
||||||
|
Testing builtin unicode()... done.
|
||||||
Testing builtin codecs... done.
|
Testing builtin codecs... done.
|
||||||
Testing standard mapping codecs... 0-127... 128-255... done.
|
Testing standard mapping codecs... 0-127... 128-255... done.
|
||||||
Testing Unicode string concatenation... done.
|
Testing Unicode string concatenation... done.
|
||||||
|
|
|
@ -389,6 +389,67 @@ verify('%i %*.*s' % (10, 5,3,u'abc',) == u'10 abc')
|
||||||
verify('%i%s %*.*s' % (10, 3, 5,3,u'abc',) == u'103 abc')
|
verify('%i%s %*.*s' % (10, 3, 5,3,u'abc',) == u'103 abc')
|
||||||
print 'done.'
|
print 'done.'
|
||||||
|
|
||||||
|
print 'Testing builtin unicode()...',
|
||||||
|
|
||||||
|
# unicode(obj) tests (this maps to PyObject_Unicode() at C level)
|
||||||
|
|
||||||
|
verify(unicode(u'unicode remains unicode') == u'unicode remains unicode')
|
||||||
|
|
||||||
|
class UnicodeSubclass(unicode):
|
||||||
|
pass
|
||||||
|
|
||||||
|
verify(unicode(UnicodeSubclass('unicode subclass becomes unicode'))
|
||||||
|
== u'unicode subclass becomes unicode')
|
||||||
|
|
||||||
|
verify(unicode('strings are converted to unicode')
|
||||||
|
== u'strings are converted to unicode')
|
||||||
|
|
||||||
|
class UnicodeCompat:
|
||||||
|
def __init__(self, x):
|
||||||
|
self.x = x
|
||||||
|
def __unicode__(self):
|
||||||
|
return self.x
|
||||||
|
|
||||||
|
verify(unicode(UnicodeCompat('__unicode__ compatible objects are recognized'))
|
||||||
|
== u'__unicode__ compatible objects are recognized')
|
||||||
|
|
||||||
|
class StringCompat:
|
||||||
|
def __init__(self, x):
|
||||||
|
self.x = x
|
||||||
|
def __str__(self):
|
||||||
|
return self.x
|
||||||
|
|
||||||
|
verify(unicode(StringCompat('__str__ compatible objects are recognized'))
|
||||||
|
== u'__str__ compatible objects are recognized')
|
||||||
|
|
||||||
|
# unicode(obj) is compatible with str():
|
||||||
|
|
||||||
|
o = StringCompat('unicode(obj) is compatible to str()')
|
||||||
|
verify(unicode(o) == u'unicode(obj) is compatible to str()')
|
||||||
|
verify(str(o) == 'unicode(obj) is compatible to str()')
|
||||||
|
|
||||||
|
for obj in (123, 123.45, 123L):
|
||||||
|
verify(unicode(obj) == unicode(str(obj)))
|
||||||
|
|
||||||
|
# unicode(obj, encoding, error) tests (this maps to
|
||||||
|
# PyUnicode_FromEncodedObject() at C level)
|
||||||
|
|
||||||
|
try:
|
||||||
|
unicode(u'decoding unicode is not supported', 'utf-8', 'strict')
|
||||||
|
except TypeError:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
raise TestFailed, "decoding unicode should NOT be supported"
|
||||||
|
|
||||||
|
verify(unicode('strings are decoded to unicode', 'utf-8', 'strict')
|
||||||
|
== u'strings are decoded to unicode')
|
||||||
|
|
||||||
|
verify(unicode(buffer('character buffers are decoded to unicode'),
|
||||||
|
'utf-8', 'strict')
|
||||||
|
== u'character buffers are decoded to unicode')
|
||||||
|
|
||||||
|
print 'done.'
|
||||||
|
|
||||||
# Test builtin codecs
|
# Test builtin codecs
|
||||||
print 'Testing builtin codecs...',
|
print 'Testing builtin codecs...',
|
||||||
|
|
||||||
|
@ -437,32 +498,11 @@ verify(unicode(''.join((chr(0xe2), chr(0x82), chr(0xac))),
|
||||||
# * strict decoding testing for all of the
|
# * strict decoding testing for all of the
|
||||||
# UTF8_ERROR cases in PyUnicode_DecodeUTF8
|
# UTF8_ERROR cases in PyUnicode_DecodeUTF8
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
verify(unicode('hello','ascii') == u'hello')
|
verify(unicode('hello','ascii') == u'hello')
|
||||||
verify(unicode('hello','utf-8') == u'hello')
|
verify(unicode('hello','utf-8') == u'hello')
|
||||||
verify(unicode('hello','utf8') == u'hello')
|
verify(unicode('hello','utf8') == u'hello')
|
||||||
verify(unicode('hello','latin-1') == u'hello')
|
verify(unicode('hello','latin-1') == u'hello')
|
||||||
|
|
||||||
# Compatibility to str():
|
|
||||||
class String:
|
|
||||||
x = ''
|
|
||||||
def __str__(self):
|
|
||||||
return self.x
|
|
||||||
|
|
||||||
o = String()
|
|
||||||
|
|
||||||
o.x = 'abc'
|
|
||||||
verify(unicode(o) == u'abc')
|
|
||||||
verify(str(o) == 'abc')
|
|
||||||
|
|
||||||
o.x = u'abc'
|
|
||||||
verify(unicode(o) == u'abc')
|
|
||||||
verify(str(o) == 'abc')
|
|
||||||
|
|
||||||
for obj in (123, 123.45, 123L):
|
|
||||||
verify(unicode(obj) == unicode(str(obj)))
|
|
||||||
|
|
||||||
# Error handling
|
# Error handling
|
||||||
try:
|
try:
|
||||||
u'Andr\202 x'.encode('ascii')
|
u'Andr\202 x'.encode('ascii')
|
||||||
|
|
|
@ -44,7 +44,7 @@ Core and builtins
|
||||||
- unicode(obj) now behaves more like str(obj), accepting arbitrary
|
- unicode(obj) now behaves more like str(obj), accepting arbitrary
|
||||||
objects, and calling a __unicode__ method if it exists.
|
objects, and calling a __unicode__ method if it exists.
|
||||||
unicode(obj, encoding) and unicode(obj, encoding, errors) still
|
unicode(obj, encoding) and unicode(obj, encoding, errors) still
|
||||||
require an 8-bit string argument.
|
require an 8-bit string or character buffer argument.
|
||||||
|
|
||||||
- isinstance() now allows any object as the first argument and a
|
- isinstance() now allows any object as the first argument and a
|
||||||
class, a type or something with a __bases__ tuple attribute for the
|
class, a type or something with a __bases__ tuple attribute for the
|
||||||
|
|
|
@ -426,8 +426,9 @@ PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
/* For b/w compatibility we also accept Unicode objects provided
|
/* For b/w compatibility we also accept Unicode objects provided
|
||||||
that no encoding is given and then redirect to PyObject_Unicode()
|
that no encoding is given and then redirect to
|
||||||
which then applies the additional logic for Unicode subclasses.
|
PyObject_Unicode() which then applies the additional logic for
|
||||||
|
Unicode subclasses.
|
||||||
|
|
||||||
NOTE: This API should really only be used for objects which
|
NOTE: This API should really only be used for objects which
|
||||||
represent *encoded* Unicode !
|
represent *encoded* Unicode !
|
||||||
|
|
Loading…
Reference in New Issue