diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst index 4b6c7e5024c..ab578ea2817 100644 --- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -323,6 +323,18 @@ and implemented by all standard Python codecs: | | (only for encoding). | +-------------------------+-----------------------------------------------+ +In addition, the following error handlers are specific to a single codec: + ++------------------+---------+--------------------------------------------+ +| Value | Codec | Meaning | ++==================+=========+============================================+ +| ``'surrogates'`` | utf-8 | Allow encoding and decoding of surrogate | +| | | codes in UTF-8. | ++------------------+---------+--------------------------------------------+ + +.. versionadded:: 3.1 + The ``'surrogates'`` error handler. + The set of allowed values can be extended via :meth:`register_error`. diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py index a3ea40aa50f..992f3d2eecb 100644 --- a/Lib/test/test_bytes.py +++ b/Lib/test/test_bytes.py @@ -169,13 +169,13 @@ class BaseBytesTest(unittest.TestCase): self.assertEqual(b[start:stop:step], self.type2test(L[start:stop:step])) def test_encoding(self): - sample = "Hello world\n\u1234\u5678\u9abc\udef0" + sample = "Hello world\n\u1234\u5678\u9abc" for enc in ("utf8", "utf16"): b = self.type2test(sample, enc) self.assertEqual(b, self.type2test(sample.encode(enc))) self.assertRaises(UnicodeEncodeError, self.type2test, sample, "latin1") b = self.type2test(sample, "latin1", "ignore") - self.assertEqual(b, self.type2test(sample[:-4], "utf-8")) + self.assertEqual(b, self.type2test(sample[:-3], "utf-8")) def test_decode(self): sample = "Hello world\n\u1234\u5678\u9abc\def0\def0" diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 1730dbe5939..6706507335e 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -541,6 +541,17 @@ class UTF8Test(ReadTest): self.check_state_handling_decode(self.encoding, u, u.encode(self.encoding)) + def test_lone_surrogates(self): + self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8") + self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8") + + def test_surrogates_handler(self): + self.assertEquals("abc\ud800def".encode("utf-8", "surrogates"), + b"abc\xed\xa0\x80def") + self.assertEquals(b"abc\xed\xa0\x80def".decode("utf-8", "surrogates"), + "abc\ud800def") + self.assertTrue(codecs.lookup_error("surrogates")) + class UTF7Test(ReadTest): encoding = "utf-7" @@ -1023,12 +1034,12 @@ class NameprepTest(unittest.TestCase): # Skipped continue # The Unicode strings are given in UTF-8 - orig = str(orig, "utf-8") + orig = str(orig, "utf-8", "surrogates") if prepped is None: # Input contains prohibited characters self.assertRaises(UnicodeError, nameprep, orig) else: - prepped = str(prepped, "utf-8") + prepped = str(prepped, "utf-8", "surrogates") try: self.assertEquals(nameprep(orig), prepped) except Exception as e: diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 1fddc06c629..220a8eb26e8 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -886,10 +886,10 @@ class UnicodeTest( self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac') self.assertEqual('\ud800\udc02'.encode('utf-8'), b'\xf0\x90\x80\x82') self.assertEqual('\ud84d\udc56'.encode('utf-8'), b'\xf0\xa3\x91\x96') - self.assertEqual('\ud800'.encode('utf-8'), b'\xed\xa0\x80') - self.assertEqual('\udc00'.encode('utf-8'), b'\xed\xb0\x80') + self.assertEqual('\ud800'.encode('utf-8', 'surrogates'), b'\xed\xa0\x80') + self.assertEqual('\udc00'.encode('utf-8', 'surrogates'), b'\xed\xb0\x80') self.assertEqual( - ('\ud800\udc02'*1000).encode('utf-8'), + ('\ud800\udc02'*1000).encode('utf-8', 'surrogates'), b'\xf0\x90\x80\x82'*1000 ) self.assertEqual( diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index aed8eaa0fe6..b84aaaf5ec3 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -13,6 +13,7 @@ import subprocess import test.support encoding = 'utf-8' +errors = 'surrogates' ### Run tests @@ -61,7 +62,7 @@ class UnicodeMethodsTest(unittest.TestCase): (char + 'ABC').title(), ] - h.update(''.join(data).encode(encoding)) + h.update(''.join(data).encode(encoding, errors)) result = h.hexdigest() self.assertEqual(result, self.expectedchecksum) diff --git a/Misc/NEWS b/Misc/NEWS index 7f22b0d5b5a..f4116ad8e78 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -12,6 +12,8 @@ What's New in Python 3.1 beta 1? Core and Builtins ----------------- +- Issue #3672: Reject surrogates in utf-8 codec; add surrogates error handler. + - Issue #5883: In the io module, the BufferedIOBase and TextIOBase ABCs have received a new method, detach(). detach() disconnects the underlying stream from the buffer or text IO and returns it. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 68d4fc41ae9..cc70bad825a 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -154,6 +154,11 @@ const unsigned char _Py_ascii_whitespace[] = { 0, 0, 0, 0, 0, 0, 0, 0 }; +static PyObject *unicode_encode_call_errorhandler(const char *errors, + PyObject **errorHandler,const char *encoding, const char *reason, + const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, + Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); + /* Same for linebreaks */ static unsigned char ascii_linebreak[] = { 0, 0, 0, 0, 0, 0, 0, 0, @@ -2214,14 +2219,7 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, goto utf8Error; } ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); - if (ch < 0x0800) { - /* Note: UTF-8 encodings of surrogates are considered - legal UTF-8 sequences; - - XXX For wide builds (UCS-4) we should probably try - to recombine the surrogates into a single code - unit. - */ + if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) { errmsg = "illegal encoding"; startinpos = s-starts; endinpos = startinpos+3; @@ -2328,6 +2326,8 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s, Py_ssize_t nallocated; /* number of result bytes allocated */ Py_ssize_t nneeded; /* number of result bytes needed */ char stackbuf[MAX_SHORT_UNICHARS * 4]; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; assert(s != NULL); assert(size >= 0); @@ -2367,6 +2367,7 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s, else { /* Encode UCS2 Unicode ordinals */ if (ch < 0x10000) { +#ifndef Py_UNICODE_WIDE /* Special case: check for high surrogate */ if (0xD800 <= ch && ch <= 0xDBFF && i != size) { Py_UCS4 ch2 = s[i]; @@ -2379,6 +2380,36 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s, } /* Fall through: handles isolated high surrogates */ } +#endif + if (ch >= 0xd800 && ch <= 0xdfff) { + Py_ssize_t newpos; + PyObject *rep; + char *prep; + int k; + rep = unicode_encode_call_errorhandler + (errors, &errorHandler, "utf-8", "surrogates not allowed", + s, size, &exc, i-1, i, &newpos); + if (!rep) + goto error; + /* Implementation limitations: only support error handler that return + bytes, and only support up to four replacement bytes. */ + if (!PyBytes_Check(rep)) { + PyErr_SetString(PyExc_TypeError, "error handler should have returned bytes"); + Py_DECREF(rep); + goto error; + } + if (PyBytes_Size(rep) > 4) { + PyErr_SetString(PyExc_TypeError, "error handler returned too many bytes"); + Py_DECREF(rep); + goto error; + } + prep = PyBytes_AsString(rep); + for(k = PyBytes_Size(rep); k > 0; k--) + *p++ = *prep++; + Py_DECREF(rep); + continue; + + } *p++ = (char)(0xe0 | (ch >> 12)); *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); *p++ = (char)(0x80 | (ch & 0x3f)); @@ -2405,7 +2436,14 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s, assert(nneeded <= nallocated); _PyBytes_Resize(&result, nneeded); } + Py_XDECREF(errorHandler); + Py_XDECREF(exc); return result; + error: + Py_XDECREF(errorHandler); + Py_XDECREF(exc); + Py_XDECREF(result); + return NULL; #undef MAX_SHORT_UNICHARS } @@ -3897,7 +3935,7 @@ static PyObject *unicode_encode_call_errorhandler(const char *errors, Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos) { - static char *argparse = "O!n;encoding error handler must return (str, int) tuple"; + static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; PyObject *restuple; PyObject *resunicode; @@ -3918,15 +3956,20 @@ static PyObject *unicode_encode_call_errorhandler(const char *errors, if (restuple == NULL) return NULL; if (!PyTuple_Check(restuple)) { - PyErr_SetString(PyExc_TypeError, &argparse[4]); + PyErr_SetString(PyExc_TypeError, &argparse[3]); Py_DECREF(restuple); return NULL; } - if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, + if (!PyArg_ParseTuple(restuple, argparse, &resunicode, newpos)) { Py_DECREF(restuple); return NULL; } + if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { + PyErr_SetString(PyExc_TypeError, &argparse[3]); + Py_DECREF(restuple); + return NULL; + } if (*newpos<0) *newpos = size+*newpos; if (*newpos<0 || *newpos>size) { @@ -4064,6 +4107,12 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, collstart-startp, collend-startp, &newpos); if (repunicode == NULL) goto onError; + if (!PyUnicode_Check(repunicode)) { + /* Implementation limitation: byte results not supported yet. */ + PyErr_SetString(PyExc_TypeError, "error handler should return unicode"); + Py_DECREF(repunicode); + goto onError; + } /* need more space? (at least enough for what we have+the replacement+the rest of the string, so we won't have to check space for encodable characters) */ @@ -5027,6 +5076,12 @@ int charmap_encoding_error( collstartpos, collendpos, &newpos); if (repunicode == NULL) return -1; + if (!PyUnicode_Check(repunicode)) { + /* Implementation limitation: byte results not supported yet. */ + PyErr_SetString(PyExc_TypeError, "error handler should return unicode"); + Py_DECREF(repunicode); + return -1; + } /* generate replacement */ repsize = PyUnicode_GET_SIZE(repunicode); for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { @@ -5588,6 +5643,12 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s, collstart-s, collend-s, &newpos); if (repunicode == NULL) goto onError; + if (!PyUnicode_Check(repunicode)) { + /* Implementation limitation: byte results not supported yet. */ + PyErr_SetString(PyExc_TypeError, "error handler should return unicode"); + Py_DECREF(repunicode); + goto onError; + } /* generate replacement */ repsize = PyUnicode_GET_SIZE(repunicode); for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { diff --git a/Python/codecs.c b/Python/codecs.c index ebddc09d7b0..3f1412d00ca 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -748,6 +748,85 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) } } +PyObject *PyCodec_SurrogateErrors(PyObject *exc) +{ + PyObject *restuple; + PyObject *object; + Py_ssize_t start; + Py_ssize_t end; + PyObject *res; + if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { + Py_UNICODE *p; + Py_UNICODE *startp; + char *outp; + if (PyUnicodeEncodeError_GetStart(exc, &start)) + return NULL; + if (PyUnicodeEncodeError_GetEnd(exc, &end)) + return NULL; + if (!(object = PyUnicodeEncodeError_GetObject(exc))) + return NULL; + startp = PyUnicode_AS_UNICODE(object); + res = PyBytes_FromStringAndSize(NULL, 3*(end-start)); + if (!res) { + Py_DECREF(object); + return NULL; + } + outp = PyBytes_AsString(res); + for (p = startp+start; p < startp+end; p++) { + Py_UNICODE ch = *p; + if (ch < 0xd800 || ch > 0xdfff) { + /* Not a surrogate, fail with original exception */ + PyErr_SetObject(PyExceptionInstance_Class(exc), exc); + Py_DECREF(res); + Py_DECREF(object); + return NULL; + } + *outp++ = (char)(0xe0 | (ch >> 12)); + *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f)); + *outp++ = (char)(0x80 | (ch & 0x3f)); + } + restuple = Py_BuildValue("(On)", res, end); + Py_DECREF(res); + Py_DECREF(object); + return restuple; + } + else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) { + unsigned char *p; + Py_UNICODE ch = 0; + if (PyUnicodeDecodeError_GetStart(exc, &start)) + return NULL; + if (!(object = PyUnicodeDecodeError_GetObject(exc))) + return NULL; + if (!(p = (unsigned char*)PyBytes_AsString(object))) { + Py_DECREF(object); + return NULL; + } + /* Try decoding a single surrogate character. If + there are more, let the codec call us again. */ + p += start; + if ((p[0] & 0xf0) == 0xe0 || + (p[1] & 0xc0) == 0x80 || + (p[2] & 0xc0) == 0x80) { + /* it's a three-byte code */ + ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f); + if (ch < 0xd800 || ch > 0xdfff) + /* it's not a surrogate - fail */ + ch = 0; + } + Py_DECREF(object); + if (ch == 0) { + PyErr_SetObject(PyExceptionInstance_Class(exc), exc); + return NULL; + } + return Py_BuildValue("(u#n)", &ch, 1, start+3); + } + else { + wrong_exception_type(exc); + return NULL; + } +} + + static PyObject *strict_errors(PyObject *self, PyObject *exc) { return PyCodec_StrictErrors(exc); @@ -777,6 +856,11 @@ static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc) return PyCodec_BackslashReplaceErrors(exc); } +static PyObject *surrogates_errors(PyObject *self, PyObject *exc) +{ + return PyCodec_SurrogateErrors(exc); +} + static int _PyCodecRegistry_Init(void) { static struct { @@ -823,6 +907,14 @@ static int _PyCodecRegistry_Init(void) backslashreplace_errors, METH_O } + }, + { + "surrogates", + { + "surrogates", + surrogates_errors, + METH_O + } } }; diff --git a/Python/marshal.c b/Python/marshal.c index bf7a26b5b2a..4ad873eb77e 100644 --- a/Python/marshal.c +++ b/Python/marshal.c @@ -312,7 +312,9 @@ w_object(PyObject *v, WFILE *p) } else if (PyUnicode_CheckExact(v)) { PyObject *utf8; - utf8 = PyUnicode_AsUTF8String(v); + utf8 = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(v), + PyUnicode_GET_SIZE(v), + "surrogates"); if (utf8 == NULL) { p->depth--; p->error = WFERR_UNMARSHALLABLE; @@ -810,7 +812,7 @@ r_object(RFILE *p) retval = NULL; break; } - v = PyUnicode_DecodeUTF8(buffer, n, NULL); + v = PyUnicode_DecodeUTF8(buffer, n, "surrogates"); PyMem_DEL(buffer); retval = v; break;