Issue #5915: Implement PEP 383, Non-decodable Bytes in
System Character Interfaces.
This commit is contained in:
parent
93f65a177b
commit
011e842033
|
@ -322,6 +322,8 @@ and implemented by all standard Python codecs:
|
|||
| ``'backslashreplace'`` | Replace with backslashed escape sequences |
|
||||
| | (only for encoding). |
|
||||
+-------------------------+-----------------------------------------------+
|
||||
| ``'utf8b'`` | Replace byte with surrogate U+DCxx. |
|
||||
+-------------------------+-----------------------------------------------+
|
||||
|
||||
In addition, the following error handlers are specific to a single codec:
|
||||
|
||||
|
@ -333,7 +335,7 @@ In addition, the following error handlers are specific to a single codec:
|
|||
+------------------+---------+--------------------------------------------+
|
||||
|
||||
.. versionadded:: 3.1
|
||||
The ``'surrogates'`` error handler.
|
||||
The ``'utf8b'`` and ``'surrogates'`` error handlers.
|
||||
|
||||
The set of allowed values can be extended via :meth:`register_error`.
|
||||
|
||||
|
|
|
@ -51,6 +51,30 @@ the :mod:`os` module, but using them is of course a threat to portability!
|
|||
``'ce'``, ``'java'``.
|
||||
|
||||
|
||||
.. _os-filenames:
|
||||
|
||||
File Names, Command Line Arguments, and Environment Variables
|
||||
-------------------------------------------------------------
|
||||
|
||||
In Python, file names, command line arguments, and environment
|
||||
variables are represented using the string type. On some systems,
|
||||
converting these strings to and from bytes is necessary before passing
|
||||
them to the operating system. Python uses the file system encoding to
|
||||
perform this conversion (see :func:`sys.getfilesystemencoding`).
|
||||
|
||||
.. versionchanged:: 3.1
|
||||
On some systems, conversion using the file system encoding may
|
||||
fail. In this case, Python uses the ``utf8b`` encoding error
|
||||
handler, which means that undecodable bytes are replaced by a
|
||||
Unicode character U+DCxx on decoding, and these are again
|
||||
translated to the original byte on encoding.
|
||||
|
||||
|
||||
The file system encoding must guarantee to successfully decode all
|
||||
bytes below 128. If the file system encoding fails to provide this
|
||||
guarantee, API functions may raise UnicodeErrors.
|
||||
|
||||
|
||||
.. _os-procinfo:
|
||||
|
||||
Process Parameters
|
||||
|
@ -688,12 +712,8 @@ Files and Directories
|
|||
|
||||
.. function:: getcwd()
|
||||
|
||||
Return a string representing the current working directory. On Unix
|
||||
platforms, this function may raise :exc:`UnicodeDecodeError` if the name of
|
||||
the current directory is not decodable in the file system encoding. Use
|
||||
:func:`getcwdb` if you need the call to never fail. Availability: Unix,
|
||||
Windows.
|
||||
|
||||
Return a string representing the current working directory.
|
||||
Availability: Unix, Windows.
|
||||
|
||||
.. function:: getcwdb()
|
||||
|
||||
|
@ -800,10 +820,8 @@ Files and Directories
|
|||
entries ``'.'`` and ``'..'`` even if they are present in the directory.
|
||||
Availability: Unix, Windows.
|
||||
|
||||
This function can be called with a bytes or string argument. In the bytes
|
||||
case, all filenames will be listed as returned by the underlying API. In the
|
||||
string case, filenames will be decoded using the file system encoding, and
|
||||
skipped if a decoding error occurs.
|
||||
This function can be called with a bytes or string argument, and returns
|
||||
filenames of the same datatype.
|
||||
|
||||
|
||||
.. function:: lstat(path)
|
||||
|
|
|
@ -198,6 +198,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
|||
# define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize
|
||||
# define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode
|
||||
# define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar
|
||||
# define PyUnicode_FSConverter PyUnicodeUCS2_FSConverter
|
||||
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
|
||||
# define PyUnicode_GetMax PyUnicodeUCS2_GetMax
|
||||
# define PyUnicode_GetSize PyUnicodeUCS2_GetSize
|
||||
|
@ -296,6 +297,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
|||
# define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize
|
||||
# define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode
|
||||
# define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar
|
||||
# define PyUnicode_FSConverter PyUnicodeUCS4_FSConverter
|
||||
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
|
||||
# define PyUnicode_GetMax PyUnicodeUCS4_GetMax
|
||||
# define PyUnicode_GetSize PyUnicodeUCS4_GetSize
|
||||
|
@ -693,25 +695,6 @@ PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString(
|
|||
PyObject *unicode,
|
||||
const char *errors);
|
||||
|
||||
/* Decode a null-terminated string using Py_FileSystemDefaultEncoding.
|
||||
|
||||
If the encoding is supported by one of the built-in codecs (i.e., UTF-8,
|
||||
UTF-16, UTF-32, Latin-1 or MBCS), otherwise fallback to UTF-8 and replace
|
||||
invalid characters with '?'.
|
||||
|
||||
The function is intended to be used for paths and file names only
|
||||
during bootstrapping process where the codecs are not set up.
|
||||
*/
|
||||
|
||||
PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
|
||||
const char *s /* encoded string */
|
||||
);
|
||||
|
||||
PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
|
||||
const char *s, /* encoded string */
|
||||
Py_ssize_t size /* size */
|
||||
);
|
||||
|
||||
/* Returns a pointer to the default encoding (normally, UTF-8) of the
|
||||
Unicode object unicode and the size of the encoded representation
|
||||
in bytes stored in *size.
|
||||
|
@ -1252,6 +1235,33 @@ PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
|
|||
const char *errors /* error handling */
|
||||
);
|
||||
|
||||
/* --- File system encoding ---------------------------------------------- */
|
||||
|
||||
/* ParseTuple converter which converts a Unicode object into the file
|
||||
system encoding, using the PEP 383 error handler; bytes objects are
|
||||
output as-is. */
|
||||
|
||||
PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
|
||||
|
||||
/* Decode a null-terminated string using Py_FileSystemDefaultEncoding.
|
||||
|
||||
If the encoding is supported by one of the built-in codecs (i.e., UTF-8,
|
||||
UTF-16, UTF-32, Latin-1 or MBCS), that codec is used; otherwise fall back to UTF-8 and replace
|
||||
invalid characters with '?'.
|
||||
|
||||
The function is intended to be used for paths and file names only
|
||||
during bootstrapping process where the codecs are not set up.
|
||||
*/
|
||||
|
||||
PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
|
||||
const char *s /* encoded string */
|
||||
);
|
||||
|
||||
PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
|
||||
const char *s, /* encoded string */
|
||||
Py_ssize_t size /* size */
|
||||
);
|
||||
|
||||
/* --- Methods & Slots ----------------------------------------------------
|
||||
|
||||
These are capable of handling Unicode objects and strings on input
|
||||
|
|
|
@ -1516,6 +1516,34 @@ class TypesTest(unittest.TestCase):
|
|||
self.assertEquals(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
|
||||
self.assertEquals(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))
|
||||
|
||||
class Utf8bTest(unittest.TestCase):
    """Round-trip checks for the ``utf8b`` error handler with several codecs."""

    def test_utf8(self):
        # A single byte that is invalid in UTF-8 maps to U+DC80 and back.
        raw = b"foo\x80bar"
        escaped = "foo\udc80bar"
        self.assertEqual(raw.decode("utf-8", "utf8b"), escaped)
        self.assertEqual(escaped.encode("utf-8", "utf8b"), raw)
        # The UTF-8 byte sequence of a surrogate is not accepted as such;
        # each of its three bytes is escaped individually.
        raw_surrogate = b"\xed\xb0\x80"
        escaped_surrogate = "\udced\udcb0\udc80"
        self.assertEqual(raw_surrogate.decode("utf-8", "utf8b"),
                         escaped_surrogate)
        self.assertEqual(escaped_surrogate.encode("utf-8", "utf8b"),
                         raw_surrogate)

    def test_ascii(self):
        # A non-ASCII byte is escaped on decoding and restored on encoding.
        raw = b"foo\x80bar"
        escaped = "foo\udc80bar"
        self.assertEqual(raw.decode("ascii", "utf8b"), escaped)
        self.assertEqual(escaped.encode("ascii", "utf8b"), raw)

    def test_charmap(self):
        # \xa5 has no mapping in iso-8859-3, so it takes the escape path.
        raw = b"foo\xa5bar"
        escaped = "foo\udca5bar"
        self.assertEqual(raw.decode("iso-8859-3", "utf8b"), escaped)
        self.assertEqual(escaped.encode("iso-8859-3", "utf8b"), raw)
|
||||
|
||||
|
||||
def test_main():
|
||||
support.run_unittest(
|
||||
|
@ -1543,6 +1571,7 @@ def test_main():
|
|||
CharmapTest,
|
||||
WithStmtTest,
|
||||
TypesTest,
|
||||
Utf8bTest,
|
||||
)
|
||||
|
||||
|
||||
|
|
|
@ -7,6 +7,7 @@ import errno
|
|||
import unittest
|
||||
import warnings
|
||||
import sys
|
||||
import shutil
|
||||
from test import support
|
||||
|
||||
# Tests creating TESTFN
|
||||
|
@ -698,9 +699,44 @@ if sys.platform != 'win32':
|
|||
self.assertRaises(os.error, os.setregid, 0, 0)
|
||||
self.assertRaises(OverflowError, os.setregid, 1<<32, 0)
|
||||
self.assertRaises(OverflowError, os.setregid, 0, 1<<32)
|
||||
|
||||
class Pep383Tests(unittest.TestCase):
    """Exercise os/open with file names that are not valid UTF-8.

    One name contains a lone \\xf6 byte (undecodable as UTF-8); the other
    is the proper UTF-8 encoding of the same character.
    """

    filenames = [b'foo\xf6bar', 'foo\xf6bar'.encode("utf-8")]

    def setUp(self):
        # Force a UTF-8 file system encoding for the duration of the test;
        # the previous value is restored in tearDown().
        self.fsencoding = sys.getfilesystemencoding()
        sys.setfilesystemencoding("utf-8")
        self.dir = support.TESTFN
        self.bdir = self.dir.encode("utf-8", "utf8b")
        os.mkdir(self.dir)
        self.unicodefn = []
        for bytes_name in self.filenames:
            # Create the (empty) file through the bytes API ...
            open(os.path.join(self.bdir, bytes_name), "w").close()
            # ... and remember the str name it should be reported under.
            self.unicodefn.append(bytes_name.decode("utf-8", "utf8b"))

    def tearDown(self):
        shutil.rmtree(self.dir)
        sys.setfilesystemencoding(self.fsencoding)

    def test_listdir(self):
        # Listing with a str argument must report every file created above.
        wanted = set(self.unicodefn)
        listed = set(os.listdir(support.TESTFN))
        self.assertEquals(listed, wanted)

    def test_open(self):
        # Every escaped name must be usable to open the file again.
        for name in self.unicodefn:
            path = os.path.join(self.dir, name)
            handle = open(path)
            handle.close()

    def test_stat(self):
        # Every escaped name must be accepted by os.stat().
        for name in self.unicodefn:
            path = os.path.join(self.dir, name)
            os.stat(path)
|
||||
else:
|
||||
class PosixUidGidTests(unittest.TestCase):
|
||||
pass
|
||||
class Pep383Tests(unittest.TestCase):
|
||||
pass
|
||||
|
||||
def test_main():
|
||||
support.run_unittest(
|
||||
|
@ -714,7 +750,8 @@ def test_main():
|
|||
ExecTests,
|
||||
Win32ErrorTests,
|
||||
TestInvalidFD,
|
||||
PosixUidGidTests
|
||||
PosixUidGidTests,
|
||||
Pep383Tests
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
@ -12,6 +12,8 @@ What's New in Python 3.1 beta 1?
|
|||
Core and Builtins
|
||||
-----------------
|
||||
|
||||
- Implement PEP 383, Non-decodable Bytes in System Character Interfaces.
|
||||
|
||||
- Issue #5890: in subclasses of 'property' the __doc__ attribute was
|
||||
shadowed by classtype's, even if it was None. property now
|
||||
inserts the __doc__ into the subclass instance __dict__.
|
||||
|
|
|
@ -245,7 +245,7 @@ fileio_init(PyObject *oself, PyObject *args, PyObject *kwds)
|
|||
return -1;
|
||||
|
||||
stringobj = PyUnicode_AsEncodedString(
|
||||
u, Py_FileSystemDefaultEncoding, NULL);
|
||||
u, Py_FileSystemDefaultEncoding, "utf8b");
|
||||
Py_DECREF(u);
|
||||
if (stringobj == NULL)
|
||||
return -1;
|
||||
|
|
File diff suppressed because it is too large
Load Diff
113
Modules/python.c
113
Modules/python.c
|
@ -14,6 +14,93 @@ wmain(int argc, wchar_t **argv)
|
|||
return Py_Main(argc, argv);
|
||||
}
|
||||
#else
|
||||
/* Convert the NUL-terminated multibyte string `arg` to a wide-character
   string allocated with PyMem_Malloc().

   The conversion is first attempted with mbstowcs().  If that fails, the
   string is converted piecewise and every byte that cannot be decoded is
   escaped as the code point 0xdc00 + byte (the "utf8b" scheme).

   Returns the new buffer, or NULL after printing a diagnostic to stderr
   (out of memory, or an unexpected mbrtowc() result). */
static wchar_t*
char2wchar(char* arg)
{
    wchar_t *res;
#ifdef HAVE_BROKEN_MBSTOWCS
    /* Some platforms have a broken implementation of
     * mbstowcs which does not count the characters that
     * would result from conversion.  Use an upper bound.
     */
    size_t argsize = strlen(arg);
#else
    size_t argsize = mbstowcs(NULL, arg, 0);
#endif
    size_t count;
    unsigned char *in;
    wchar_t *out;
#ifdef HAVE_MBRTOWC
    mbstate_t mbs;
#endif
    if (argsize != (size_t)-1) {
        res = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t));
        if (!res)
            goto oom;
        count = mbstowcs(res, arg, argsize+1);
        if (count != (size_t)-1)
            return res;
        PyMem_Free(res);
    }
    /* Conversion failed. Fall back to escaping with utf8b. */
#ifdef HAVE_MBRTOWC
    /* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */

    /* Overallocate; as multi-byte characters are in the argument, the
       actual output could use less memory. */
    argsize = strlen(arg) + 1;
    res = PyMem_Malloc(argsize*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char*)arg;
    out = res;
    memset(&mbs, 0, sizeof mbs);
    while (argsize) {
        size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
        if (converted == 0)
            /* Reached end of string; null char stored. */
            break;
        if (converted == (size_t)-2) {
            /* Incomplete character. This should never happen,
               since we provide everything that we have -
               unless there is a bug in the C library, or I
               misunderstood how mbrtowc works. */
            fprintf(stderr, "unexpected mbrtowc result -2\n");
            /* Release the partially filled buffer; it is not returned. */
            PyMem_Free(res);
            return NULL;
        }
        if (converted == (size_t)-1) {
            /* Conversion error. Escape as UTF-8b, and start over
               in the initial shift state. */
            *out++ = 0xdc00 + *in++;
            argsize--;
            memset(&mbs, 0, sizeof mbs);
            continue;
        }
        /* successfully converted some bytes */
        in += converted;
        argsize -= converted;
        out++;
    }
#else
    /* Cannot use C locale for escaping; manually escape as if charset
       is ASCII (i.e. escape all bytes >= 128).  This will still roundtrip
       correctly in the locale's charset, which must be an ASCII superset. */
    res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char*)arg;
    out = res;
    while (*in) {
        if (*in < 128)
            *out++ = *in++;
        else
            *out++ = 0xdc00 + *in++;
    }
    *out = 0;
#endif
    return res;
oom:
    fprintf(stderr, "out of memory\n");
    return NULL;
}
|
||||
|
||||
int
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
|
@ -40,32 +127,10 @@ main(int argc, char **argv)
|
|||
oldloc = strdup(setlocale(LC_ALL, NULL));
|
||||
setlocale(LC_ALL, "");
|
||||
for (i = 0; i < argc; i++) {
|
||||
#ifdef HAVE_BROKEN_MBSTOWCS
|
||||
/* Some platforms have a broken implementation of
|
||||
* mbstowcs which does not count the characters that
|
||||
* would result from conversion. Use an upper bound.
|
||||
*/
|
||||
size_t argsize = strlen(argv[i]);
|
||||
#else
|
||||
size_t argsize = mbstowcs(NULL, argv[i], 0);
|
||||
#endif
|
||||
size_t count;
|
||||
if (argsize == (size_t)-1) {
|
||||
fprintf(stderr, "Could not convert argument %d to string\n", i);
|
||||
argv_copy2[i] = argv_copy[i] = char2wchar(argv[i]);
|
||||
if (!argv_copy[i])
|
||||
return 1;
|
||||
}
|
||||
argv_copy[i] = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t));
|
||||
argv_copy2[i] = argv_copy[i];
|
||||
if (!argv_copy[i]) {
|
||||
fprintf(stderr, "out of memory\n");
|
||||
return 1;
|
||||
}
|
||||
count = mbstowcs(argv_copy[i], argv[i], argsize+1);
|
||||
if (count == (size_t)-1) {
|
||||
fprintf(stderr, "Could not convert argument %d to string\n", i);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
setlocale(LC_ALL, oldloc);
|
||||
free(oldloc);
|
||||
res = Py_Main(argc, argv_copy);
|
||||
|
|
|
@ -1530,6 +1530,53 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
|
|||
}
|
||||
}
|
||||
|
||||
/* Convert the argument to a bytes object, according to the file
|
||||
system encoding */
|
||||
|
||||
int
|
||||
PyUnicode_FSConverter(PyObject* arg, void* addr)
|
||||
{
|
||||
PyObject *output = NULL;
|
||||
Py_ssize_t size;
|
||||
void *data;
|
||||
if (PyBytes_Check(arg) || PyByteArray_Check(arg)) {
|
||||
output = arg;
|
||||
Py_INCREF(output);
|
||||
}
|
||||
else {
|
||||
arg = PyUnicode_FromObject(arg);
|
||||
if (!arg)
|
||||
return 0;
|
||||
output = PyUnicode_AsEncodedObject(arg,
|
||||
Py_FileSystemDefaultEncoding,
|
||||
"utf8b");
|
||||
Py_DECREF(arg);
|
||||
if (!output)
|
||||
return 0;
|
||||
if (!PyBytes_Check(output)) {
|
||||
Py_DECREF(output);
|
||||
PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
if (PyBytes_Check(output)) {
|
||||
size = PyBytes_GET_SIZE(output);
|
||||
data = PyBytes_AS_STRING(output);
|
||||
}
|
||||
else {
|
||||
size = PyByteArray_GET_SIZE(output);
|
||||
data = PyByteArray_AS_STRING(output);
|
||||
}
|
||||
if (size != strlen(data)) {
|
||||
PyErr_SetString(PyExc_TypeError, "embedded NUL character");
|
||||
Py_DECREF(output);
|
||||
return 0;
|
||||
}
|
||||
*(PyObject**)addr = output;
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
char*
|
||||
_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
|
||||
{
|
||||
|
@ -4154,12 +4201,23 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
|
|||
collstart-startp, collend-startp, &newpos);
|
||||
if (repunicode == NULL)
|
||||
goto onError;
|
||||
if (!PyUnicode_Check(repunicode)) {
|
||||
/* Implementation limitation: byte results not supported yet. */
|
||||
PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
|
||||
if (PyBytes_Check(repunicode)) {
|
||||
/* Directly copy bytes result to output. */
|
||||
repsize = PyBytes_Size(repunicode);
|
||||
if (repsize > 1) {
|
||||
/* Make room for all additional bytes. */
|
||||
if (_PyBytes_Resize(&res, ressize+repsize-1)) {
|
||||
Py_DECREF(repunicode);
|
||||
goto onError;
|
||||
}
|
||||
ressize += repsize-1;
|
||||
}
|
||||
memcpy(str, PyBytes_AsString(repunicode), repsize);
|
||||
str += repsize;
|
||||
p = startp + newpos;
|
||||
Py_DECREF(repunicode);
|
||||
break;
|
||||
}
|
||||
/* need more space? (at least enough for what we
|
||||
have+the replacement+the rest of the string, so
|
||||
we won't have to check space for encodable characters) */
|
||||
|
@ -5123,12 +5181,25 @@ int charmap_encoding_error(
|
|||
collstartpos, collendpos, &newpos);
|
||||
if (repunicode == NULL)
|
||||
return -1;
|
||||
if (!PyUnicode_Check(repunicode)) {
|
||||
/* Implementation limitation: byte results not supported yet. */
|
||||
PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
|
||||
if (PyBytes_Check(repunicode)) {
|
||||
/* Directly copy bytes result to output. */
|
||||
Py_ssize_t outsize = PyBytes_Size(*res);
|
||||
Py_ssize_t requiredsize;
|
||||
repsize = PyBytes_Size(repunicode);
|
||||
requiredsize = *respos + repsize;
|
||||
if (requiredsize > outsize)
|
||||
/* Make room for all additional bytes. */
|
||||
if (charmapencode_resize(res, respos, requiredsize)) {
|
||||
Py_DECREF(repunicode);
|
||||
return -1;
|
||||
}
|
||||
memcpy(PyBytes_AsString(*res) + *respos,
|
||||
PyBytes_AsString(repunicode), repsize);
|
||||
*respos += repsize;
|
||||
*inpos = newpos;
|
||||
Py_DECREF(repunicode);
|
||||
break;
|
||||
}
|
||||
/* generate replacement */
|
||||
repsize = PyUnicode_GET_SIZE(repunicode);
|
||||
for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
|
||||
|
@ -5691,7 +5762,7 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
|
|||
if (repunicode == NULL)
|
||||
goto onError;
|
||||
if (!PyUnicode_Check(repunicode)) {
|
||||
/* Implementation limitation: byte results not supported yet. */
|
||||
/* Byte results not supported, since they have no decimal property. */
|
||||
PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
|
||||
Py_DECREF(repunicode);
|
||||
goto onError;
|
||||
|
|
|
@ -829,6 +829,82 @@ PyCodec_SurrogateErrors(PyObject *exc)
|
|||
}
|
||||
}
|
||||
|
||||
/* Error callback implementing the "utf8b" scheme.

   Encoding: every character in the failing range must lie in
   U+DC80..U+DCFF; each is turned back into the byte (ch - 0xdc00) and the
   bytes are returned together with the resume position `end`.  Any other
   character re-raises the original exception.

   Decoding: up to four leading non-ASCII bytes of the failing range are
   replaced by the code points 0xdc00 + byte.  If the first offending byte
   is ASCII, the original exception is re-raised.

   Returns a new (replacement, new_position) tuple, or NULL with an
   exception set. */
static PyObject *
PyCodec_UTF8bErrors(PyObject *exc)
{
    PyObject *object;
    Py_ssize_t start;
    Py_ssize_t end;

    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
        PyObject *replacement;
        PyObject *result;
        Py_UNICODE *text;
        Py_ssize_t pos;
        char *dest;

        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        object = PyUnicodeEncodeError_GetObject(exc);
        if (object == NULL)
            return NULL;
        text = PyUnicode_AS_UNICODE(object);
        /* One output byte per escaped character. */
        replacement = PyBytes_FromStringAndSize(NULL, end-start);
        if (replacement == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        dest = PyBytes_AsString(replacement);
        for (pos = start; pos < end; pos++) {
            Py_UNICODE ch = text[pos];
            if (!(ch >= 0xdc80 && ch <= 0xdcff)) {
                /* Not a UTF-8b surrogate, fail with original exception */
                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
                Py_DECREF(replacement);
                Py_DECREF(object);
                return NULL;
            }
            *dest++ = ch - 0xdc00;
        }
        result = Py_BuildValue("(On)", replacement, end);
        Py_DECREF(replacement);
        Py_DECREF(object);
        return result;
    }

    if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
        unsigned char *bytes;
        Py_UNICODE escaped[4]; /* decode up to 4 bad bytes. */
        int consumed;

        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        object = PyUnicodeDecodeError_GetObject(exc);
        if (object == NULL)
            return NULL;
        bytes = (unsigned char*)PyBytes_AsString(object);
        if (bytes == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        for (consumed = 0; consumed < 4 && consumed < end-start; consumed++) {
            unsigned char byte = bytes[start+consumed];
            /* Refuse to escape ASCII bytes. */
            if (byte < 128)
                break;
            escaped[consumed] = 0xdc00 + byte;
        }
        Py_DECREF(object);
        if (consumed == 0) {
            /* codec complained about ASCII byte. */
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
            return NULL;
        }
        return Py_BuildValue("(u#n)", escaped, consumed, start+consumed);
    }

    wrong_exception_type(exc);
    return NULL;
}
|
||||
|
||||
|
||||
static PyObject *strict_errors(PyObject *self, PyObject *exc)
|
||||
{
|
||||
|
@ -864,6 +940,11 @@ static PyObject *surrogates_errors(PyObject *self, PyObject *exc)
|
|||
return PyCodec_SurrogateErrors(exc);
|
||||
}
|
||||
|
||||
/* Method-table entry point for the "utf8b" error handler; `self` is not
   used and the work is delegated to PyCodec_UTF8bErrors(). */
static PyObject *
utf8b_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_UTF8bErrors(exc);
    return result;
}
|
||||
|
||||
static int _PyCodecRegistry_Init(void)
|
||||
{
|
||||
static struct {
|
||||
|
@ -918,6 +999,14 @@ static int _PyCodecRegistry_Init(void)
|
|||
surrogates_errors,
|
||||
METH_O
|
||||
}
|
||||
},
|
||||
{
|
||||
"utf8b",
|
||||
{
|
||||
"utf8b",
|
||||
utf8b_errors,
|
||||
METH_O
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -262,6 +262,22 @@ Py_InitializeEx(int install_sigs)
|
|||
|
||||
_PyImportHooks_Init();
|
||||
|
||||
#if defined(HAVE_LANGINFO_H) && defined(CODESET)
|
||||
/* On Unix, set the file system encoding according to the
|
||||
user's preference, if the CODESET names a well-known
|
||||
Python codec, and Py_FileSystemDefaultEncoding isn't
|
||||
initialized by other means. Also set the encoding of
|
||||
stdin and stdout if these are terminals. */
|
||||
|
||||
codeset = get_codeset();
|
||||
if (codeset) {
|
||||
if (!Py_FileSystemDefaultEncoding)
|
||||
Py_FileSystemDefaultEncoding = codeset;
|
||||
else
|
||||
free(codeset);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (install_sigs)
|
||||
initsigs(); /* Signal handling stuff, including initintr() */
|
||||
|
||||
|
@ -285,22 +301,6 @@ Py_InitializeEx(int install_sigs)
|
|||
#ifdef WITH_THREAD
|
||||
_PyGILState_Init(interp, tstate);
|
||||
#endif /* WITH_THREAD */
|
||||
|
||||
#if defined(HAVE_LANGINFO_H) && defined(CODESET)
|
||||
/* On Unix, set the file system encoding according to the
|
||||
user's preference, if the CODESET names a well-known
|
||||
Python codec, and Py_FileSystemDefaultEncoding isn't
|
||||
initialized by other means. Also set the encoding of
|
||||
stdin and stdout if these are terminals. */
|
||||
|
||||
codeset = get_codeset();
|
||||
if (codeset) {
|
||||
if (!Py_FileSystemDefaultEncoding)
|
||||
Py_FileSystemDefaultEncoding = codeset;
|
||||
else
|
||||
free(codeset);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
#! /bin/sh
|
||||
# From configure.in Revision: 71731 .
|
||||
# From configure.in Revision: 72144 .
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.61 for python 3.1.
|
||||
#
|
||||
|
@ -16297,13 +16297,14 @@ echo "${ECHO_T}MACHDEP_OBJS" >&6; }
|
|||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
for ac_func in alarm setitimer getitimer bind_textdomain_codeset chown \
|
||||
clock confstr ctermid execv fchmod fchown fork fpathconf ftime ftruncate \
|
||||
gai_strerror getgroups getlogin getloadavg getpeername getpgid getpid \
|
||||
getpriority getpwent getspnam getspent getsid getwd \
|
||||
kill killpg lchmod lchown lstat mkfifo mknod mktime \
|
||||
kill killpg lchmod lchown lstat mbrtowc mkfifo mknod mktime \
|
||||
mremap nice pathconf pause plock poll pthread_init \
|
||||
putenv readlink realpath \
|
||||
select sem_open sem_timedwait sem_getvalue sem_unlink setegid seteuid \
|
||||
|
|
|
@ -2403,7 +2403,7 @@ AC_CHECK_FUNCS(alarm setitimer getitimer bind_textdomain_codeset chown \
|
|||
clock confstr ctermid execv fchmod fchown fork fpathconf ftime ftruncate \
|
||||
gai_strerror getgroups getlogin getloadavg getpeername getpgid getpid \
|
||||
getpriority getpwent getspnam getspent getsid getwd \
|
||||
kill killpg lchmod lchown lstat mkfifo mknod mktime \
|
||||
kill killpg lchmod lchown lstat mbrtowc mkfifo mknod mktime \
|
||||
mremap nice pathconf pause plock poll pthread_init \
|
||||
putenv readlink realpath \
|
||||
select sem_open sem_timedwait sem_getvalue sem_unlink setegid seteuid \
|
||||
|
|
|
@ -419,6 +419,9 @@
|
|||
/* Define this if you have the makedev macro. */
|
||||
#undef HAVE_MAKEDEV
|
||||
|
||||
/* Define to 1 if you have the `mbrtowc' function. */
|
||||
#undef HAVE_MBRTOWC
|
||||
|
||||
/* Define to 1 if you have the `memmove' function. */
|
||||
#undef HAVE_MEMMOVE
|
||||
|
||||
|
|
Loading…
Reference in New Issue