Add PyUnicode_DecodeLocaleAndSize() and PyUnicode_DecodeLocale()

* PyUnicode_DecodeLocaleAndSize() and PyUnicode_DecodeLocale() decode a string
   from the current locale encoding
 * _Py_char2wchar() writes an "error code" in the size argument to indicate
   if the function failed because of memory allocation failure or because of a
   decoding error. The function doesn't write the error message directly to
   stderr.
 * Fix time.strftime() (if wcsftime() is missing): decode strftime() result
   from the current locale encoding, not from the filesystem encoding.
This commit is contained in:
Victor Stinner 2011-12-16 23:56:01 +01:00
parent 3607e3de27
commit af02e1c85a
7 changed files with 174 additions and 84 deletions

View File

@ -699,6 +699,39 @@ Extension modules can continue using them, as they will not be removed in Python
throughout the interpreter whenever coercion to Unicode is needed.
Locale Encoding
"""""""""""""""
The current locale encoding can be used to decode text from the operating
system.
.. c:function:: PyObject* PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, int surrogateescape)
Decode a string from the current locale encoding. The decoder is strict if
*surrogateescape* is equal to zero, otherwise it uses the
``'surrogateescape'`` error handler (:pep:`383`) to escape undecodable
bytes. If a byte sequence can be decoded as a surrogate character and
*surrogateescape* is not equal to zero, the byte sequence is escaped using
the ``'surrogateescape'`` error handler instead of being decoded. *str*
must end with a null character but cannot contain embedded null characters.
.. seealso::
Use :c:func:`PyUnicode_DecodeFSDefaultAndSize` to decode a string from
:c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at
Python startup).
.. versionadded:: 3.3
.. c:function:: PyObject* PyUnicode_DecodeLocale(const char *str, int surrogateescape)
Similar to :c:func:`PyUnicode_DecodeLocaleAndSize`, but compute the string
length using :c:func:`strlen`.
.. versionadded:: 3.3
File System Encoding
""""""""""""""""""""
@ -739,6 +772,13 @@ used, passing :c:func:`PyUnicode_FSDecoder` as the conversion function:
If :c:data:`Py_FileSystemDefaultEncoding` is not set, fall back to the
locale encoding.
.. seealso::
:c:data:`Py_FileSystemDefaultEncoding` is initialized at startup from the
locale encoding and cannot be modified later. If you need to decode a
string from the current locale encoding, use
:c:func:`PyUnicode_DecodeLocaleAndSize`.
.. versionchanged:: 3.2
Use ``'strict'`` error handler on Windows.

View File

@ -1595,6 +1595,28 @@ PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
);
#endif
/* --- Locale encoding --------------------------------------------------- */
/* Decode a string from the current locale encoding. The decoder is strict if
*surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape'
error handler (PEP 383) to escape undecodable bytes. If a byte sequence can
be decoded as a surrogate character and *surrogateescape* is not equal to
zero, the byte sequence is escaped using the 'surrogateescape' error handler
instead of being decoded. *str* must end with a null character but cannot
contain embedded null characters. */
PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
const char *str,
Py_ssize_t len,
int surrogateescape);
/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
length using strlen(). */
PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
const char *str,
int surrogateescape);
/* --- File system encoding ---------------------------------------------- */
/* ParseTuple converter: encode str objects to bytes using

View File

@ -42,43 +42,6 @@ PyDoc_STRVAR(locale__doc__, "Support for POSIX locales.");
static PyObject *Error;
/* Decode the null-terminated byte string s with mbstowcs(), i.e. according
   to the current locale, and return the result as a new Unicode object.
   Return NULL with an exception set on failure: ValueError if the length
   query reports a conversion error, MemoryError if the temporary wide
   character buffer cannot be allocated. */
static PyObject*
str2uni(const char* s)
{
    wchar_t stackbuf[30];
    wchar_t *wide = stackbuf;
    PyObject *result;
    size_t converted;
#ifdef HAVE_BROKEN_MBSTOWCS
    /* presumably mbstowcs(NULL, ...) cannot be trusted on such platforms;
       the byte length is used as the buffer size instead */
    size_t nchars = strlen(s);
#else
    size_t nchars = mbstowcs(NULL, s, 0);
#endif

    if (nchars == (size_t)-1) {
        PyErr_SetString(PyExc_ValueError, "Cannot convert byte to string");
        return NULL;
    }

    /* Short results are converted in the stack buffer; anything else needs
       a heap buffer with room for the terminating null character. */
    if (nchars*sizeof(wchar_t) >= sizeof(stackbuf)) {
        wide = PyMem_Malloc((nchars+1)*sizeof(wchar_t));
        if (wide == NULL)
            return PyErr_NoMemory();
    }

    /* The length was computed above, so this call is not expected to fail */
    converted = mbstowcs(wide, s, nchars+1);
#ifdef HAVE_BROKEN_MBSTOWCS
    assert(converted != (size_t)-1);
#else
    assert(converted == nchars);
#endif

    result = PyUnicode_FromWideChar(wide, converted);
    if (wide != stackbuf)
        PyMem_Free(wide);
    return result;
}
/* support functions for formatting floating point numbers */
PyDoc_STRVAR(setlocale__doc__,
@ -149,7 +112,7 @@ PyLocale_setlocale(PyObject* self, PyObject* args)
PyErr_SetString(Error, "unsupported locale setting");
return NULL;
}
result_object = str2uni(result);
result_object = PyUnicode_DecodeLocale(result, 0);
if (!result_object)
return NULL;
} else {
@ -159,7 +122,7 @@ PyLocale_setlocale(PyObject* self, PyObject* args)
PyErr_SetString(Error, "locale query failed");
return NULL;
}
result_object = str2uni(result);
result_object = PyUnicode_DecodeLocale(result, 0);
}
return result_object;
}
@ -185,7 +148,7 @@ PyLocale_localeconv(PyObject* self)
involved herein */
#define RESULT_STRING(s)\
x = str2uni(l->s); \
x = PyUnicode_DecodeLocale(l->s, 0); \
if (!x) goto failed;\
PyDict_SetItemString(result, #s, x);\
Py_XDECREF(x)
@ -476,7 +439,7 @@ PyLocale_nl_langinfo(PyObject* self, PyObject* args)
instead of an empty string for nl_langinfo(ERA). */
const char *result = nl_langinfo(item);
result = result != NULL ? result : "";
return str2uni(result);
return PyUnicode_DecodeLocale(result, 0);
}
PyErr_SetString(PyExc_ValueError, "unsupported langinfo constant");
return NULL;
@ -495,7 +458,7 @@ PyIntl_gettext(PyObject* self, PyObject *args)
char *in;
if (!PyArg_ParseTuple(args, "s", &in))
return 0;
return str2uni(gettext(in));
return PyUnicode_DecodeLocale(gettext(in), 0);
}
PyDoc_STRVAR(dgettext__doc__,
@ -508,7 +471,7 @@ PyIntl_dgettext(PyObject* self, PyObject *args)
char *domain, *in;
if (!PyArg_ParseTuple(args, "zs", &domain, &in))
return 0;
return str2uni(dgettext(domain, in));
return PyUnicode_DecodeLocale(dgettext(domain, in), 0);
}
PyDoc_STRVAR(dcgettext__doc__,
@ -522,7 +485,7 @@ PyIntl_dcgettext(PyObject *self, PyObject *args)
int category;
if (!PyArg_ParseTuple(args, "zsi", &domain, &msgid, &category))
return 0;
return str2uni(dcgettext(domain,msgid,category));
return PyUnicode_DecodeLocale(dcgettext(domain,msgid,category), 0);
}
PyDoc_STRVAR(textdomain__doc__,
@ -540,7 +503,7 @@ PyIntl_textdomain(PyObject* self, PyObject* args)
PyErr_SetFromErrno(PyExc_OSError);
return NULL;
}
return str2uni(domain);
return PyUnicode_DecodeLocale(domain, 0);
}
PyDoc_STRVAR(bindtextdomain__doc__,
@ -572,7 +535,7 @@ PyIntl_bindtextdomain(PyObject* self,PyObject*args)
PyErr_SetFromErrno(PyExc_OSError);
return NULL;
}
result = str2uni(current_dirname);
result = PyUnicode_DecodeLocale(current_dirname, 0);
Py_XDECREF(dirname_bytes);
return result;
}
@ -590,7 +553,7 @@ PyIntl_bind_textdomain_codeset(PyObject* self,PyObject*args)
return NULL;
codeset = bind_textdomain_codeset(domain, codeset);
if (codeset)
return str2uni(codeset);
return PyUnicode_DecodeLocale(codeset, 0);
Py_RETURN_NONE;
}
#endif

View File

@ -495,16 +495,13 @@ Py_Main(int argc, wchar_t **argv)
/* Use utf-8 on Mac OS X */
unicode = PyUnicode_FromString(p);
#else
wchar_t *wchar;
size_t len;
wchar = _Py_char2wchar(p, &len);
if (wchar == NULL)
continue;
unicode = PyUnicode_FromWideChar(wchar, len);
PyMem_Free(wchar);
unicode = PyUnicode_DecodeLocale(p, 1);
#endif
if (unicode == NULL)
if (unicode == NULL) {
/* ignore errors */
PyErr_Clear();
continue;
}
PySys_AddWarnOptionUnicode(unicode);
Py_DECREF(unicode);
}

View File

@ -532,7 +532,7 @@ time_strftime(PyObject *self, PyObject *args)
#ifdef HAVE_WCSFTIME
ret = PyUnicode_FromWideChar(outbuf, buflen);
#else
ret = PyUnicode_DecodeFSDefaultAndSize(outbuf, buflen);
ret = PyUnicode_DecodeLocaleAndSize(outbuf, buflen, 1);
#endif
PyMem_Free(outbuf);
break;
@ -764,8 +764,8 @@ PyInit_timezone(PyObject *m) {
#endif /* PYOS_OS2 */
#endif
PyModule_AddIntConstant(m, "daylight", daylight);
otz0 = PyUnicode_DecodeFSDefaultAndSize(tzname[0], strlen(tzname[0]));
otz1 = PyUnicode_DecodeFSDefaultAndSize(tzname[1], strlen(tzname[1]));
otz0 = PyUnicode_DecodeLocale(tzname[0], 1);
otz1 = PyUnicode_DecodeLocale(tzname[1], 1);
PyModule_AddObject(m, "tzname", Py_BuildValue("(NN)", otz0, otz1));
#else /* !HAVE_TZNAME || __GLIBC__ || __CYGWIN__*/
#ifdef HAVE_STRUCT_TM_TM_ZONE

View File

@ -3234,6 +3234,83 @@ PyUnicode_AsEncodedUnicode(PyObject *unicode,
return NULL;
}
/* Decode the byte string str of length len from the current locale encoding
   and return a new Unicode object.

   str[len] must be the terminating null character and str must not contain
   an embedded null character, otherwise TypeError is raised.

   If surrogateescape is non-zero, the string is decoded with
   _Py_char2wchar(); otherwise it is decoded strictly with mbstowcs().

   Return NULL with an exception set on error: MemoryError on allocation
   failure, OSError (built from errno) on decoding error. */
PyObject*
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
                              int surrogateescape)
{
    wchar_t smallbuf[256];
    size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
    wchar_t *wstr;
    size_t wlen, wlen2;
    PyObject *unicode;

    if (str[len] != '\0' || (size_t)len != strlen(str)) {
        PyErr_SetString(PyExc_TypeError, "embedded null character");
        return NULL;
    }

    if (surrogateescape) {
        wstr = _Py_char2wchar(str, &wlen);
        if (wstr == NULL) {
            /* _Py_char2wchar() stores (size_t)-1 in wlen on memory error;
               any other value denotes a decoding error */
            if (wlen == (size_t)-1)
                PyErr_NoMemory();
            else
                PyErr_SetFromErrno(PyExc_OSError);
            return NULL;
        }

        unicode = PyUnicode_FromWideChar(wstr, wlen);
        PyMem_Free(wstr);
    }
    else {
#ifndef HAVE_BROKEN_MBSTOWCS
        /* exact number of wide characters needed */
        wlen = mbstowcs(NULL, str, 0);
#else
        /* the length query is not usable: fall back to the byte length,
           which is only an upper bound on the number of wide characters */
        wlen = len;
#endif
        if (wlen == (size_t)-1) {
            PyErr_SetFromErrno(PyExc_OSError);
            return NULL;
        }
        if (wlen+1 <= smallbuf_len) {
            /* small strings are decoded on the stack */
            wstr = smallbuf;
        }
        else {
            /* guard (wlen+1) * sizeof(wchar_t) against overflow */
            if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
                return PyErr_NoMemory();

            wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
            if (!wstr)
                return PyErr_NoMemory();
        }

        /* This shouldn't fail now */
        wlen2 = mbstowcs(wstr, str, wlen+1);
        if (wlen2 == (size_t)-1) {
            if (wstr != smallbuf)
                PyMem_Free(wstr);
            PyErr_SetFromErrno(PyExc_OSError);
            return NULL;
        }
#ifndef HAVE_BROKEN_MBSTOWCS
        /* Only valid when wlen is the exact length reported by
           mbstowcs(NULL, str, 0).  With HAVE_BROKEN_MBSTOWCS wlen is the
           byte length, so wlen2 is smaller for any multi-byte sequence. */
        assert(wlen2 == wlen);
#endif
        unicode = PyUnicode_FromWideChar(wstr, wlen2);
        if (wstr != smallbuf)
            PyMem_Free(wstr);
    }
    return unicode;
}
PyObject*
PyUnicode_DecodeLocale(const char *str, int surrogateescape)
{
Py_ssize_t size = (Py_ssize_t)strlen(str);
return PyUnicode_DecodeLocaleAndSize(str, size, surrogateescape);
}
PyObject*
PyUnicode_DecodeFSDefault(const char *s) {
Py_ssize_t size = (Py_ssize_t)strlen(s);
@ -3264,23 +3341,7 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
"surrogateescape");
}
else {
/* locale encoding with surrogateescape */
wchar_t *wchar;
PyObject *unicode;
size_t len;
if (s[size] != '\0' || size != strlen(s)) {
PyErr_SetString(PyExc_TypeError, "embedded NUL character");
return NULL;
}
wchar = _Py_char2wchar(s, &len);
if (wchar == NULL)
return PyErr_NoMemory();
unicode = PyUnicode_FromWideChar(wchar, len);
PyMem_Free(wchar);
return unicode;
return PyUnicode_DecodeLocaleAndSize(s, size, 1);
}
#endif
}

View File

@ -16,7 +16,9 @@
Return a pointer to a newly allocated wide character string (use
PyMem_Free() to free the memory) and write the number of written wide
characters excluding the null character into *size if size is not NULL, or
NULL on error (conversion or memory allocation error).
NULL on error (decoding or memory allocation error). If size is not NULL,
*size is set to (size_t)-1 on memory error and (size_t)-2 on decoding
error.
Conversion errors should never happen, unless there is a bug in the C
library. */
@ -82,8 +84,9 @@ _Py_char2wchar(const char* arg, size_t *size)
since we provide everything that we have -
unless there is a bug in the C library, or I
misunderstood how mbrtowc works. */
fprintf(stderr, "unexpected mbrtowc result -2\n");
PyMem_Free(res);
if (size != NULL)
*size = (size_t)-2;
return NULL;
}
if (converted == (size_t)-1) {
@ -112,7 +115,8 @@ _Py_char2wchar(const char* arg, size_t *size)
is ASCII (i.e. escape all bytes > 128). This will still roundtrip
correctly in the locale's charset, which must be an ASCII superset. */
res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t));
if (!res) goto oom;
if (!res)
goto oom;
in = (unsigned char*)arg;
out = res;
while(*in)
@ -126,7 +130,8 @@ _Py_char2wchar(const char* arg, size_t *size)
*size = out - res;
return res;
oom:
fprintf(stderr, "out of memory\n");
if (size != NULL)
*size = (size_t)-1;
return NULL;
}
@ -137,10 +142,10 @@ oom:
This function is the reverse of _Py_char2wchar().
Return a pointer to a newly allocated byte string (use PyMem_Free() to free
the memory), or NULL on conversion or memory allocation error.
the memory), or NULL on encoding or memory allocation error.
If error_pos is not NULL: *error_pos is the index of the invalid character
on conversion error, or (size_t)-1 otherwise. */
on encoding error, or (size_t)-1 otherwise. */
char*
_Py_wchar2char(const wchar_t *text, size_t *error_pos)
{
@ -328,7 +333,7 @@ _Py_fopen(PyObject *path, const char *mode)
#ifdef HAVE_READLINK
/* Read value of symbolic link. Encode the path to the locale encoding, decode
the result from the locale encoding. */
the result from the locale encoding. Return -1 on error. */
int
_Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz)
@ -372,7 +377,8 @@ _Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz)
#ifdef HAVE_REALPATH
/* Return the canonicalized absolute pathname. Encode path to the locale
encoding, decode the result from the locale encoding. */
encoding, decode the result from the locale encoding.
Return NULL on error. */
wchar_t*
_Py_wrealpath(const wchar_t *path,
@ -410,7 +416,8 @@ _Py_wrealpath(const wchar_t *path,
#endif
/* Get the current directory. size is the buffer size in wide characters
including the null character. Decode the path from the locale encoding. */
including the null character. Decode the path from the locale encoding.
Return NULL on error. */
wchar_t*
_Py_wgetcwd(wchar_t *buf, size_t size)