M.-A. Lemburg <mal@lemburg.com>:

Added support for user settable default encodings. The
current implementation uses a per-process global which
defines the value of the encoding parameter in case it
is set to NULL (meaning: use the default encoding).
This commit is contained in:
Fred Drake 2000-05-09 19:53:39 +00:00
parent aff601804d
commit e4315f58d2
1 changed files with 71 additions and 20 deletions

View File

@ -117,6 +117,16 @@ static PyUnicodeObject *unicode_empty = NULL;
static PyUnicodeObject *unicode_freelist = NULL;
static int unicode_freelist_size = 0;
/* Default encoding to use and assume when NULL is passed as encoding
parameter; it is initialized by _PyUnicode_Init().
Always use the PyUnicode_SetDefaultEncoding() and
PyUnicode_GetDefaultEncoding() APIs to access this global.
*/
static char unicode_default_encoding[100];
/* --- Unicode Object ----------------------------------------------------- */
static
@ -366,7 +376,7 @@ PyObject *PyUnicode_FromObject(register PyObject *obj)
Py_INCREF(unicode_empty);
return (PyObject *)unicode_empty;
}
return PyUnicode_DecodeUTF8(s, len, "strict");
return PyUnicode_Decode(s, len, NULL, "strict");
}
PyObject *PyUnicode_Decode(const char *s,
@ -376,10 +386,16 @@ PyObject *PyUnicode_Decode(const char *s,
{
PyObject *buffer = NULL, *unicode;
/* Shortcut for the default encoding UTF-8 */
if (encoding == NULL ||
(strcmp(encoding, "utf-8") == 0))
if (encoding == NULL)
encoding = PyUnicode_GetDefaultEncoding();
/* Shortcuts for common default encodings */
if (strcmp(encoding, "utf-8") == 0)
return PyUnicode_DecodeUTF8(s, size, errors);
else if (strcmp(encoding, "latin-1") == 0)
return PyUnicode_DecodeLatin1(s, size, errors);
else if (strcmp(encoding, "ascii") == 0)
return PyUnicode_DecodeASCII(s, size, errors);
/* Decode via the codec registry */
buffer = PyBuffer_FromMemory((void *)s, size);
@ -428,11 +444,19 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
PyErr_BadArgument();
goto onError;
}
/* Shortcut for the default encoding UTF-8 */
if ((encoding == NULL ||
(strcmp(encoding, "utf-8") == 0)) &&
errors == NULL)
if (encoding == NULL)
encoding = PyUnicode_GetDefaultEncoding();
/* Shortcuts for common default encodings */
if (errors == NULL) {
if (strcmp(encoding, "utf-8") == 0)
return PyUnicode_AsUTF8String(unicode);
else if (strcmp(encoding, "latin-1") == 0)
return PyUnicode_AsLatin1String(unicode);
else if (strcmp(encoding, "ascii") == 0)
return PyUnicode_AsASCIIString(unicode);
}
/* Encode via the codec registry */
v = PyCodec_Encode(unicode, encoding, errors);
@ -476,6 +500,30 @@ int PyUnicode_GetSize(PyObject *unicode)
return -1;
}
const char *PyUnicode_GetDefaultEncoding()
{
return unicode_default_encoding;
}
int PyUnicode_SetDefaultEncoding(const char *encoding)
{
PyObject *v;
/* Make sure the encoding is valid. As side effect, this also
loads the encoding into the codec registry cache. */
v = _PyCodec_Lookup(encoding);
if (v == NULL)
goto onError;
Py_DECREF(v);
strncpy(unicode_default_encoding,
encoding,
sizeof(unicode_default_encoding));
return 0;
onError:
return -1;
}
/* --- UTF-8 Codec -------------------------------------------------------- */
static
@ -772,7 +820,8 @@ int utf16_decoding_error(const Py_UNICODE **source,
}
else {
PyErr_Format(PyExc_ValueError,
"UTF-16 decoding error; unknown error handling code: %.400s",
"UTF-16 decoding error; "
"unknown error handling code: %.400s",
errors);
return -1;
}
@ -3057,10 +3106,10 @@ unicode_count(PyUnicodeObject *self, PyObject *args)
static char encode__doc__[] =
"S.encode([encoding[,errors]]) -> string\n\
\n\
Return an encoded string version of S. Default encoding is 'UTF-8'.\n\
errors may be given to set a different error handling scheme. Default\n\
is 'strict' meaning that encoding errors raise a ValueError. Other\n\
possible values are 'ignore' and 'replace'.";
Return an encoded string version of S. Default encoding is the current\n\
default string encoding. errors may be given to set a different error\n\
handling scheme. Default is 'strict' meaning that encoding errors raise\n\
a ValueError. Other possible values are 'ignore' and 'replace'.";
static PyObject *
unicode_encode(PyUnicodeObject *self, PyObject *args)
@ -3816,7 +3865,7 @@ unicode_splitlines(PyUnicodeObject *self, PyObject *args)
static
PyObject *unicode_str(PyUnicodeObject *self)
{
return PyUnicode_AsUTF8String((PyObject *)self);
return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
}
static char strip__doc__[] =
@ -4246,6 +4295,8 @@ PyObject *PyUnicode_Format(PyObject *format,
return NULL;
}
uformat = PyUnicode_FromObject(format);
if (uformat == NULL)
return NULL;
fmt = PyUnicode_AS_UNICODE(uformat);
fmtcnt = PyUnicode_GET_SIZE(uformat);
@ -4322,13 +4373,10 @@ PyObject *PyUnicode_Format(PyObject *format,
"incomplete format key");
goto onError;
}
/* keys are converted to strings (using UTF-8) and
/* keys are converted to strings using UTF-8 and
then looked up since Python uses strings to hold
variables names etc. in its namespaces and we
wouldn't want to break common idioms. The
alternative would be using Unicode objects for the
lookup but u"abc" and "abc" have different hash
values (on purpose). */
wouldn't want to break common idioms. */
key = PyUnicode_EncodeUTF8(keystart,
keylen,
NULL);
@ -4472,8 +4520,9 @@ PyObject *PyUnicode_Format(PyObject *format,
"%s argument has non-string str()");
goto onError;
}
unicode = PyUnicode_DecodeUTF8(PyString_AS_STRING(temp),
unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
PyString_GET_SIZE(temp),
NULL,
"strict");
Py_DECREF(temp);
temp = unicode;
@ -4659,7 +4708,9 @@ void _PyUnicode_Init()
Py_FatalError("Unicode configuration error: "
"sizeof(Py_UNICODE) != 2 bytes");
/* Init the implementation */
unicode_empty = _PyUnicode_New(0);
strcpy(unicode_default_encoding, "utf-8");
}
/* Finalize the Unicode implementation */