From b5507ecd3cfce17bab26311298f527572611af0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lemburg?= Date: Fri, 19 Oct 2001 12:02:29 +0000 Subject: [PATCH] Additional test and documentation for the unicode() changes. This patch should also be applied to the 2.2b1 trunk. --- Doc/lib/libfuncs.tex | 40 ++++++++++++------ Lib/test/output/test_unicode | 1 + Lib/test/test_unicode.py | 82 +++++++++++++++++++++++++++--------- Misc/NEWS | 2 +- Objects/unicodeobject.c | 5 ++- 5 files changed, 93 insertions(+), 37 deletions(-) diff --git a/Doc/lib/libfuncs.tex b/Doc/lib/libfuncs.tex index 080876105b7..b19d4a643a2 100644 --- a/Doc/lib/libfuncs.tex +++ b/Doc/lib/libfuncs.tex @@ -758,19 +758,33 @@ def my_import(name): \versionadded{2.0} \end{funcdesc} -\begin{funcdesc}{unicode}{string\optional{, encoding\optional{, errors}}} - Create a Unicode string from an 8-bit string \var{string} using the - codec for \var{encoding}. The \var{encoding} parameter is a string - giving the name of an encoding. Error handling is done according to - \var{errors}; this specifies the treatment of characters which are - invalid in the input encoding. If \var{errors} is \code{'strict'} - (the default), a \exception{ValueError} is raised on errors, while a - value of \code{'ignore'} causes errors to be silently ignored, and a - value of \code{'replace'} causes the official Unicode replacement - character, \code{U+FFFD}, to be used to replace input characters - which cannot be decoded. The default behavior is to decode UTF-8 in - strict mode, meaning that encoding errors raise - \exception{ValueError}. See also the \refmodule{codecs} module. +\begin{funcdesc}{unicode}{object\optional{, encoding\optional{, errors}}} + Return the Unicode string version of \var{object} using one of the + following modes: + + If \var{encoding} and/or \var{errors} are given, \code{unicode()} + will decode the object which can either be an 8-bit string or a + character buffer using the codec for \var{encoding}. 
The + \var{encoding} parameter is a string giving the name of an encoding. + Error handling is done according to \var{errors}; this specifies the + treatment of characters which are invalid in the input encoding. If + \var{errors} is \code{'strict'} (the default), a + \exception{ValueError} is raised on errors, while a value of + \code{'ignore'} causes errors to be silently ignored, and a value of + \code{'replace'} causes the official Unicode replacement character, + \code{U+FFFD}, to be used to replace input characters which cannot + be decoded. See also the \refmodule{codecs} module. + + If no optional parameters are given, \code{unicode()} will mimic the + behaviour of \code{str()} except that it returns Unicode strings + instead of 8-bit strings. More precisely, if \var{object} is a + Unicode string or subclass it will return a Unicode string without + any additional decoding applied. For objects which provide a + \code{__unicode__} method, it will call this method without + arguments to create a Unicode string. For all other objects, the + 8-bit string version or representation is requested and then + converted to a Unicode string using the codec for the default + encoding in \code{'strict'} mode. \versionadded{2.0} \end{funcdesc} diff --git a/Lib/test/output/test_unicode b/Lib/test/output/test_unicode index 783a4860ab4..82ed2400751 100644 --- a/Lib/test/output/test_unicode +++ b/Lib/test/output/test_unicode @@ -2,6 +2,7 @@ test_unicode Testing Unicode comparisons... done. Testing Unicode contains method... done. Testing Unicode formatting strings... done. +Testing builtin unicode()... done. Testing builtin codecs... done. Testing standard mapping codecs... 0-127... 128-255... done. Testing Unicode string concatenation... done.
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 68eae13115d..eff11cfa6a4 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -389,6 +389,67 @@ verify('%i %*.*s' % (10, 5,3,u'abc',) == u'10 abc') verify('%i%s %*.*s' % (10, 3, 5,3,u'abc',) == u'103 abc') print 'done.' +print 'Testing builtin unicode()...', + +# unicode(obj) tests (this maps to PyObject_Unicode() at C level) + +verify(unicode(u'unicode remains unicode') == u'unicode remains unicode') + +class UnicodeSubclass(unicode): + pass + +verify(unicode(UnicodeSubclass('unicode subclass becomes unicode')) + == u'unicode subclass becomes unicode') + +verify(unicode('strings are converted to unicode') + == u'strings are converted to unicode') + +class UnicodeCompat: + def __init__(self, x): + self.x = x + def __unicode__(self): + return self.x + +verify(unicode(UnicodeCompat('__unicode__ compatible objects are recognized')) + == u'__unicode__ compatible objects are recognized') + +class StringCompat: + def __init__(self, x): + self.x = x + def __str__(self): + return self.x + +verify(unicode(StringCompat('__str__ compatible objects are recognized')) + == u'__str__ compatible objects are recognized') + +# unicode(obj) is compatible to str(): + +o = StringCompat('unicode(obj) is compatible to str()') +verify(unicode(o) == u'unicode(obj) is compatible to str()') +verify(str(o) == 'unicode(obj) is compatible to str()') + +for obj in (123, 123.45, 123L): + verify(unicode(obj) == unicode(str(obj))) + +# unicode(obj, encoding, error) tests (this maps to +# PyUnicode_FromEncodedObject() at C level) + +try: + unicode(u'decoding unicode is not supported', 'utf-8', 'strict') +except TypeError: + pass +else: + raise TestFailed, "decoding unicode should NOT be supported" + +verify(unicode('strings are decoded to unicode', 'utf-8', 'strict') + == u'strings are decoded to unicode') + +verify(unicode(buffer('character buffers are decoded to unicode'), + 'utf-8', 'strict') + == 
u'character buffers are decoded to unicode') + +print 'done.' + # Test builtin codecs print 'Testing builtin codecs...', @@ -437,32 +498,11 @@ verify(unicode(''.join((chr(0xe2), chr(0x82), chr(0xac))), # * strict decoding testing for all of the # UTF8_ERROR cases in PyUnicode_DecodeUTF8 - - verify(unicode('hello','ascii') == u'hello') verify(unicode('hello','utf-8') == u'hello') verify(unicode('hello','utf8') == u'hello') verify(unicode('hello','latin-1') == u'hello') -# Compatibility to str(): -class String: - x = '' - def __str__(self): - return self.x - -o = String() - -o.x = 'abc' -verify(unicode(o) == u'abc') -verify(str(o) == 'abc') - -o.x = u'abc' -verify(unicode(o) == u'abc') -verify(str(o) == 'abc') - -for obj in (123, 123.45, 123L): - verify(unicode(obj) == unicode(str(obj))) - # Error handling try: u'Andr\202 x'.encode('ascii') diff --git a/Misc/NEWS b/Misc/NEWS index b7ea28ce067..522ba40778f 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -44,7 +44,7 @@ Core and builtins - unicode(obj) now behaves more like str(obj), accepting arbitrary objects, and calling a __unicode__ method if it exists. unicode(obj, encoding) and unicode(obj, encoding, errors) still - require an 8-bit string argument. + require an 8-bit string or character buffer argument. - isinstance() now allows any object as the first argument and a class, a type or something with a __bases__ tuple attribute for the diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index a29c75b5a34..57ef62a7138 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -426,8 +426,9 @@ PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, #if 0 /* For b/w compatibility we also accept Unicode objects provided - that no encodings is given and then redirect to PyObject_Unicode() - which then applies the additional logic for Unicode subclasses. + that no encodings is given and then redirect to + PyObject_Unicode() which then applies the additional logic for + Unicode subclasses. 
NOTE: This API should really only be used for objects which represent *encoded* Unicode!