Slightly revised version of patch #1538956:

Replace UnicodeDecodeErrors raised during == and !=
compares of Unicode and other objects with a new
UnicodeWarning.

All other comparisons continue to raise exceptions.
Exceptions other than UnicodeDecodeErrors are also left
untouched.
This commit is contained in:
Marc-André Lemburg 2006-08-14 10:55:19 +00:00
parent e6dd31c50b
commit 040f76b79c
11 changed files with 170 additions and 36 deletions

View File

@ -1560,6 +1560,31 @@ They all return \NULL{} or \code{-1} if an exception occurs.
greater than, respectively.
\end{cfuncdesc}
\begin{cfuncdesc}{int}{PyUnicode_RichCompare}{PyObject *left,
PyObject *right,
int op}
% This entry could use some polishing - my TeX is too
% rusty these days... (MAL)
Rich compare two strings and return one of the following:
\begin{verbatim}
- NULL in case an exception was raised
- Py_True or Py_False for successful comparisons
- Py_NotImplemented in case the type combination is unknown
\end{verbatim}
Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
case the conversion of the arguments to Unicode fails with a
UnicodeDecodeError.
Possible values for \var{op}:
\begin{verbatim}
Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
\end{verbatim}
\end{cfuncdesc}
\begin{cfuncdesc}{PyObject*}{PyUnicode_Format}{PyObject *format,
PyObject *args}
Return a new string object from \var{format} and \var{args}; this

View File

@ -288,10 +288,11 @@ for each thread.
names are \samp{PyExc_} followed by the Python exception name.
These have the type \ctype{PyObject*}; they are all class objects.
Their names are \cdata{PyExc_Warning}, \cdata{PyExc_UserWarning},
\cdata{PyExc_DeprecationWarning}, \cdata{PyExc_SyntaxWarning},
\cdata{PyExc_RuntimeWarning}, and \cdata{PyExc_FutureWarning}.
\cdata{PyExc_Warning} is a subclass of \cdata{PyExc_Exception}; the
other warning categories are subclasses of \cdata{PyExc_Warning}.
\cdata{PyExc_UnicodeWarning}, \cdata{PyExc_DeprecationWarning},
\cdata{PyExc_SyntaxWarning}, \cdata{PyExc_RuntimeWarning}, and
\cdata{PyExc_FutureWarning}. \cdata{PyExc_Warning} is a subclass of
\cdata{PyExc_Exception}; the other warning categories are subclasses
of \cdata{PyExc_Warning}.
For information about warning control, see the documentation for the
\module{warnings} module and the \programopt{-W} option in the

View File

@ -456,6 +456,11 @@ Base class for warnings about probable mistakes in module imports.
\versionadded{2.5}
\end{excdesc}
\begin{excdesc}{UnicodeWarning}
Base class for warnings related to Unicode.
\versionadded{2.5}
\end{excdesc}
The class hierarchy for built-in exceptions is:
\verbatiminput{../../Lib/test/exception_hierarchy.txt}

View File

@ -76,6 +76,9 @@ features that will be deprecated in the future (ignored by default).}
\lineii{ImportWarning}{Base category for warnings triggered during the
process of importing a module (ignored by default).}
\lineii{UnicodeWarning}{Base category for warnings related to Unicode.}
\end{tableii}
While these are technically built-in exceptions, they are documented

View File

@ -173,6 +173,7 @@ PyAPI_DATA(PyObject *) PyExc_SyntaxWarning;
PyAPI_DATA(PyObject *) PyExc_RuntimeWarning;
PyAPI_DATA(PyObject *) PyExc_FutureWarning;
PyAPI_DATA(PyObject *) PyExc_ImportWarning;
PyAPI_DATA(PyObject *) PyExc_UnicodeWarning;
/* Convenience functions */

View File

@ -189,6 +189,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_RSplit PyUnicodeUCS2_RSplit
# define PyUnicode_Replace PyUnicodeUCS2_Replace
# define PyUnicode_Resize PyUnicodeUCS2_Resize
# define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare
# define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding
# define PyUnicode_Split PyUnicodeUCS2_Split
# define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines
@ -266,6 +267,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_RSplit PyUnicodeUCS4_RSplit
# define PyUnicode_Replace PyUnicodeUCS4_Replace
# define PyUnicode_Resize PyUnicodeUCS4_Resize
# define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare
# define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding
# define PyUnicode_Split PyUnicodeUCS4_Split
# define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines
@ -1139,6 +1141,28 @@ PyAPI_FUNC(int) PyUnicode_Compare(
PyObject *right /* Right string */
);
/* Rich compare two strings and return one of the following:
- NULL in case an exception was raised
- Py_True or Py_False for successful comparisons
- Py_NotImplemented in case the type combination is unknown
Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
case the conversion of the arguments to Unicode fails with a
UnicodeDecodeError.
Possible values for op:
Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
*/
PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
PyObject *left, /* Left string */
PyObject *right, /* Right string */
int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
);
/* Apply an argument tuple or dictionary to a format string and return
the resulting Unicode string. */

View File

@ -45,3 +45,4 @@ BaseException
+-- UserWarning
+-- FutureWarning
+-- ImportWarning
+-- UnicodeWarning

View File

@ -12,18 +12,18 @@ What's New in Python 2.5 release candidate 1?
Core and builtins
-----------------
- Fix segfault when doing string formatting on subclasses of long.
- Unicode objects will no longer raise an exception when an equal or
unequal comparison against a string triggers a UnicodeDecodeError
exception, e.g. as a result of a decoding failure.
- Fix bug related to __len__ functions using values > 2**32 on 64-bit machines
with new-style classes.
- Fix bug related to __len__ functions returning negative values with
classic classes.
- Patch #1538606, Fix __index__() clipping. There were some problems
discovered with the API and how integers that didn't fit into Py_ssize_t
were handled. This patch attempts to provide enough alternatives
to effectively use __index__.
Instead, the equal (==) and unequal (!=) comparison operators will
now issue a UnicodeWarning and interpret the two objects as
unequal. The UnicodeWarning can be filtered as desired using
the warning framework, e.g. silenced completely, turned into an
exception, logged, etc.
Note that compare operators other than equal and unequal will still
raise UnicodeDecodeError exceptions as they've always done.
- Bug #1536021: __hash__ may now return long int; the final hash
value is obtained by invoking hash on the long int.
@ -99,6 +99,8 @@ Build
C API
-----
- New API for Unicode rich comparisons: PyUnicode_RichCompare()
- Bug #1069160. Internal correctness changes were made to
``PyThreadState_SetAsyncExc()``. A test case was added, and
the documentation was changed to state that the return value

View File

@ -1948,6 +1948,14 @@ SimpleExtendsException(PyExc_Warning, ImportWarning,
"Base class for warnings about probable mistakes in module imports");
/*
* UnicodeWarning extends Warning
*/
SimpleExtendsException(PyExc_Warning, UnicodeWarning,
"Base class for warnings about Unicode related problems, mostly\n"
"related to conversion problems.");
/* Pre-computed MemoryError instance. Best to create this as early as
* possible and not wait until a MemoryError is actually raised!
*/
@ -2048,6 +2056,7 @@ _PyExc_Init(void)
PRE_INIT(RuntimeWarning)
PRE_INIT(FutureWarning)
PRE_INIT(ImportWarning)
PRE_INIT(UnicodeWarning)
m = Py_InitModule4("exceptions", functions, exceptions_doc,
(PyObject *)NULL, PYTHON_API_VERSION);
@ -2113,6 +2122,7 @@ _PyExc_Init(void)
POST_INIT(RuntimeWarning)
POST_INIT(FutureWarning)
POST_INIT(ImportWarning)
POST_INIT(UnicodeWarning)
PyExc_MemoryErrorInst = BaseException_new(&_PyExc_MemoryError, NULL, NULL);
if (!PyExc_MemoryErrorInst)

View File

@ -731,23 +731,6 @@ default_3way_compare(PyObject *v, PyObject *w)
return (vv < ww) ? -1 : (vv > ww) ? 1 : 0;
}
#ifdef Py_USING_UNICODE
/* Special case for Unicode */
if (PyUnicode_Check(v) || PyUnicode_Check(w)) {
c = PyUnicode_Compare(v, w);
if (!PyErr_Occurred())
return c;
/* TypeErrors are ignored: if Unicode coercion fails due
to one of the arguments not having the right type, we
continue as defined by the coercion protocol (see
above). Luckily, decoding errors are reported as
ValueErrors and are not masked by this technique. */
if (!PyErr_ExceptionMatches(PyExc_TypeError))
return -2;
PyErr_Clear();
}
#endif
/* None is smaller than anything */
if (v == Py_None)
return -1;

View File

@ -5405,6 +5405,82 @@ onError:
return -1;
}
/* Rich comparison of two objects as Unicode strings.

   left and right are handed to PyUnicode_Compare(); its three-way
   result is then mapped onto the operator selected by op (Py_EQ,
   Py_NE, Py_LE, Py_GE, Py_LT or Py_GT).

   Returns:
   - the result of PyBool_FromLong() when the comparison succeeded,
   - Py_NotImplemented (with its reference count incremented) when
     PyUnicode_Compare() failed with a TypeError,
   - NULL, with the exception left set, for any other failure.

   For Py_EQ and Py_NE only, a UnicodeDecodeError is not propagated:
   it is cleared, a UnicodeWarning is issued instead, and the operands
   are reported as unequal. */
PyObject *PyUnicode_RichCompare(PyObject *left,
                                PyObject *right,
                                int op)
{
    int cmp = PyUnicode_Compare(left, right);

    /* -1 is also a legitimate "less than" result, so it only signals
       a failure when an exception is pending as well. */
    if (cmp == -1 && PyErr_Occurred()) {
        /* A TypeError means one of the arguments (usually the right
           hand side) could not be converted to Unicode, i.e. this
           type cannot handle the request.  The other object may still
           know how to compare itself, so give it a chance by
           answering Py_NotImplemented. */
        if (PyErr_ExceptionMatches(PyExc_TypeError)) {
            PyErr_Clear();
            Py_INCREF(Py_NotImplemented);
            return Py_NotImplemented;
        }

        /* Ordering comparisons, and failures other than a
           UnicodeDecodeError, keep propagating the pending
           exception. */
        if ((op != Py_EQ && op != Py_NE) ||
            !PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
            return NULL;
        }

        /* Equality test that hit a decoding failure: swap the
           exception for a UnicodeWarning.  op is known to be Py_EQ or
           Py_NE at this point. */
        PyErr_Clear();
        if (PyErr_Warn(PyExc_UnicodeWarning,
                       (op == Py_NE) ?
                       "Unicode unequal comparison "
                       "failed to convert both arguments to Unicode - "
                       "interpreting them as being unequal" :
                       "Unicode equal comparison "
                       "failed to convert both arguments to Unicode - "
                       "interpreting them as being unequal"
                       ) < 0) {
            /* The warning call itself reported an error. */
            return NULL;
        }

        /* The operands count as unequal: == is false, != is true. */
        return PyBool_FromLong(op == Py_NE);
    }

    /* Translate the three-way result into the requested Boolean. */
    switch (op) {
    case Py_EQ:
        return PyBool_FromLong(cmp == 0);
    case Py_NE:
        return PyBool_FromLong(cmp != 0);
    case Py_LE:
        return PyBool_FromLong(cmp <= 0);
    case Py_GE:
        return PyBool_FromLong(cmp >= 0);
    case Py_LT:
        return PyBool_FromLong(cmp == -1);
    case Py_GT:
        return PyBool_FromLong(cmp == 1);
    }

    /* An op outside the six constants above leaves the three-way
       result untouched; it is converted by truthiness. */
    return PyBool_FromLong(cmp);
}
int PyUnicode_Contains(PyObject *container,
PyObject *element)
{
@ -6985,11 +7061,14 @@ static PySequenceMethods unicode_as_sequence = {
PyUnicode_Contains, /* sq_contains */
};
#define HASINDEX(o) PyType_HasFeature((o)->ob_type, Py_TPFLAGS_HAVE_INDEX)
static PyObject*
unicode_subscript(PyUnicodeObject* self, PyObject* item)
{
if (PyIndex_Check(item)) {
Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
PyNumberMethods *nb = item->ob_type->tp_as_number;
if (nb != NULL && HASINDEX(item) && nb->nb_index != NULL) {
Py_ssize_t i = nb->nb_index(item);
if (i == -1 && PyErr_Occurred())
return NULL;
if (i < 0)
@ -7859,7 +7938,7 @@ PyTypeObject PyUnicode_Type = {
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
(cmpfunc) unicode_compare, /* tp_compare */
0, /* tp_compare */
unicode_repr, /* tp_repr */
&unicode_as_number, /* tp_as_number */
&unicode_as_sequence, /* tp_as_sequence */
@ -7875,7 +7954,7 @@ PyTypeObject PyUnicode_Type = {
unicode_doc, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
PyUnicode_RichCompare, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */