mirror of https://github.com/python/cpython
Marc-Andre Lemburg:
Attached you find the latest update of the Unicode implementation. The patch is against the current CVS version. It includes the fix I posted yesterday for the core dump problem in codecs.c (was introduced by my previous patch set -- sorry), adds more tests for the codecs and two new parser markers "es" and "es#".
This commit is contained in:
parent
27fc3c05e1
commit
d8855fde88
|
@ -46,7 +46,7 @@ class Codec:
|
|||
handling schemes by providing the errors argument. These
|
||||
string values are defined:
|
||||
|
||||
'strict' - raise an error (or a subclass)
|
||||
'strict' - raise a ValueError (or a subclass)
|
||||
'ignore' - ignore the character and continue with the next
|
||||
'replace' - replace with a suitable replacement character;
|
||||
Python will use the official U+FFFD REPLACEMENT
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
test_unicode
|
||||
Testing Unicode comparisons... done.
|
||||
Testing Unicode contains method... done.
|
||||
Testing Unicode formatting strings... done.
|
||||
Testing unicodedata module... done.
|
||||
|
|
|
@ -293,3 +293,33 @@ else:
|
|||
assert unicodedata.combining(u'\u20e1') == 230
|
||||
|
||||
print 'done.'
|
||||
|
||||
# Test builtin codecs
|
||||
print 'Testing builtin codecs...',
|
||||
|
||||
assert unicode('hello','ascii') == u'hello'
|
||||
assert unicode('hello','utf-8') == u'hello'
|
||||
assert unicode('hello','utf8') == u'hello'
|
||||
assert unicode('hello','latin-1') == u'hello'
|
||||
|
||||
assert u'hello'.encode('ascii') == 'hello'
|
||||
assert u'hello'.encode('utf-8') == 'hello'
|
||||
assert u'hello'.encode('utf8') == 'hello'
|
||||
assert u'hello'.encode('utf-16-le') == 'h\000e\000l\000l\000o\000'
|
||||
assert u'hello'.encode('utf-16-be') == '\000h\000e\000l\000l\000o'
|
||||
assert u'hello'.encode('latin-1') == 'hello'
|
||||
|
||||
u = u''.join(map(unichr, range(1024)))
|
||||
for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
|
||||
'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
|
||||
assert unicode(u.encode(encoding),encoding) == u
|
||||
|
||||
u = u''.join(map(unichr, range(256)))
|
||||
for encoding in ('latin-1',):
|
||||
assert unicode(u.encode(encoding),encoding) == u
|
||||
|
||||
u = u''.join(map(unichr, range(128)))
|
||||
for encoding in ('ascii',):
|
||||
assert unicode(u.encode(encoding),encoding) == u
|
||||
|
||||
print 'done.'
|
||||
|
|
114
Misc/unicode.txt
114
Misc/unicode.txt
|
@ -715,21 +715,126 @@ Internal Argument Parsing:
|
|||
|
||||
These markers are used by the PyArg_ParseTuple() APIs:
|
||||
|
||||
'U': Check for Unicode object and return a pointer to it
|
||||
"U": Check for Unicode object and return a pointer to it
|
||||
|
||||
's': For Unicode objects: auto convert them to the <default encoding>
|
||||
"s": For Unicode objects: auto convert them to the <default encoding>
|
||||
and return a pointer to the object's <defencstr> buffer.
|
||||
|
||||
's#': Access to the Unicode object via the bf_getreadbuf buffer interface
|
||||
"s#": Access to the Unicode object via the bf_getreadbuf buffer interface
|
||||
(see Buffer Interface); note that the length relates to the buffer
|
||||
length, not the Unicode string length (this may be different
|
||||
depending on the Internal Format).
|
||||
|
||||
't#': Access to the Unicode object via the bf_getcharbuf buffer interface
|
||||
"t#": Access to the Unicode object via the bf_getcharbuf buffer interface
|
||||
(see Buffer Interface); note that the length relates to the buffer
|
||||
length, not necessarily to the Unicode string length (this may
|
||||
be different depending on the <default encoding>).
|
||||
|
||||
"es":
|
||||
Takes two parameters: encoding (const char *) and
|
||||
buffer (char **).
|
||||
|
||||
The input object is first coerced to Unicode in the usual way
|
||||
and then encoded into a string using the given encoding.
|
||||
|
||||
On output, a buffer of the needed size is allocated and
|
||||
returned through *buffer as a NULL-terminated string.
|
||||
The encoded string may not contain embedded NULL characters.
|
||||
The caller is responsible for free()ing the allocated *buffer
|
||||
after usage.
|
||||
|
||||
"es#":
|
||||
Takes three parameters: encoding (const char *),
|
||||
buffer (char **) and buffer_len (int *).
|
||||
|
||||
The input object is first coerced to Unicode in the usual way
|
||||
and then encoded into a string using the given encoding.
|
||||
|
||||
If *buffer is non-NULL, *buffer_len must be set to the buffer size
|
||||
on input. Output is then copied to *buffer.
|
||||
|
||||
If *buffer is NULL, a buffer of the needed size is
|
||||
allocated and output copied into it. *buffer is then
|
||||
updated to point to the allocated memory area. The caller
|
||||
is responsible for free()ing *buffer after usage.
|
||||
|
||||
In both cases *buffer_len is updated to the number of
|
||||
characters written (excluding the trailing NULL-byte).
|
||||
The output buffer is assured to be NULL-terminated.
|
||||
|
||||
Examples:
|
||||
|
||||
Using "es#" with auto-allocation:
|
||||
|
||||
static PyObject *
|
||||
test_parser(PyObject *self,
|
||||
PyObject *args)
|
||||
{
|
||||
PyObject *str;
|
||||
const char *encoding = "latin-1";
|
||||
char *buffer = NULL;
|
||||
int buffer_len = 0;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "es#:test_parser",
|
||||
encoding, &buffer, &buffer_len))
|
||||
return NULL;
|
||||
if (!buffer) {
|
||||
PyErr_SetString(PyExc_SystemError,
|
||||
"buffer is NULL");
|
||||
return NULL;
|
||||
}
|
||||
str = PyString_FromStringAndSize(buffer, buffer_len);
|
||||
free(buffer);
|
||||
return str;
|
||||
}
|
||||
|
||||
Using "es" with auto-allocation returning a NULL-terminated string:
|
||||
|
||||
static PyObject *
|
||||
test_parser(PyObject *self,
|
||||
PyObject *args)
|
||||
{
|
||||
PyObject *str;
|
||||
const char *encoding = "latin-1";
|
||||
char *buffer = NULL;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "es:test_parser",
|
||||
encoding, &buffer))
|
||||
return NULL;
|
||||
if (!buffer) {
|
||||
PyErr_SetString(PyExc_SystemError,
|
||||
"buffer is NULL");
|
||||
return NULL;
|
||||
}
|
||||
str = PyString_FromString(buffer);
|
||||
free(buffer);
|
||||
return str;
|
||||
}
|
||||
|
||||
Using "es#" with a pre-allocated buffer:
|
||||
|
||||
static PyObject *
|
||||
test_parser(PyObject *self,
|
||||
PyObject *args)
|
||||
{
|
||||
PyObject *str;
|
||||
const char *encoding = "latin-1";
|
||||
char _buffer[10];
|
||||
char *buffer = _buffer;
|
||||
int buffer_len = sizeof(_buffer);
|
||||
|
||||
if (!PyArg_ParseTuple(args, "es#:test_parser",
|
||||
encoding, &buffer, &buffer_len))
|
||||
return NULL;
|
||||
if (!buffer) {
|
||||
PyErr_SetString(PyExc_SystemError,
|
||||
"buffer is NULL");
|
||||
return NULL;
|
||||
}
|
||||
str = PyString_FromStringAndSize(buffer, buffer_len);
|
||||
return str;
|
||||
}
|
||||
|
||||
|
||||
File/Stream Output:
|
||||
-------------------
|
||||
|
@ -837,6 +942,7 @@ Encodings:
|
|||
|
||||
History of this Proposal:
|
||||
-------------------------
|
||||
1.3: Added new "es" and "es#" parser markers
|
||||
1.2: Removed POD about codecs.open()
|
||||
1.1: Added note about comparisons and hash values. Added note about
|
||||
case mapping algorithms. Changed stream codecs .read() and
|
||||
|
|
118
Python/getargs.c
118
Python/getargs.c
|
@ -178,6 +178,8 @@ vgetargs1(args, format, p_va, compat)
|
|||
}
|
||||
else if (level != 0)
|
||||
; /* Pass */
|
||||
else if (c == 'e')
|
||||
; /* Pass */
|
||||
else if (isalpha(c))
|
||||
max++;
|
||||
else if (c == '|')
|
||||
|
@ -654,6 +656,122 @@ convertsimple1(arg, p_format, p_va)
|
|||
break;
|
||||
}
|
||||
|
||||
case 'e': /* encoded string */
|
||||
{
|
||||
char **buffer;
|
||||
const char *encoding;
|
||||
PyObject *u, *s;
|
||||
int size;
|
||||
|
||||
/* Get 'e' parameter: the encoding name */
|
||||
encoding = (const char *)va_arg(*p_va, const char *);
|
||||
if (encoding == NULL)
|
||||
return "(encoding is NULL)";
|
||||
|
||||
/* Get 's' parameter: the output buffer to use */
|
||||
if (*format != 's')
|
||||
return "(unkown parser marker combination)";
|
||||
buffer = (char **)va_arg(*p_va, char **);
|
||||
format++;
|
||||
if (buffer == NULL)
|
||||
return "(buffer is NULL)";
|
||||
|
||||
/* Convert object to Unicode */
|
||||
u = PyUnicode_FromObject(arg);
|
||||
if (u == NULL)
|
||||
return "string, unicode or text buffer";
|
||||
|
||||
/* Encode object; use default error handling */
|
||||
s = PyUnicode_AsEncodedString(u,
|
||||
encoding,
|
||||
NULL);
|
||||
Py_DECREF(u);
|
||||
if (s == NULL)
|
||||
return "(encoding failed)";
|
||||
if (!PyString_Check(s)) {
|
||||
Py_DECREF(s);
|
||||
return "(encoder failed to return a string)";
|
||||
}
|
||||
size = PyString_GET_SIZE(s);
|
||||
|
||||
/* Write output; output is guaranteed to be
|
||||
0-terminated */
|
||||
if (*format == '#') {
|
||||
/* Using buffer length parameter '#':
|
||||
|
||||
- if *buffer is NULL, a new buffer
|
||||
of the needed size is allocated and
|
||||
the data copied into it; *buffer is
|
||||
updated to point to the new buffer;
|
||||
the caller is responsible for
|
||||
free()ing it after usage
|
||||
|
||||
- if *buffer is not NULL, the data
|
||||
is copied to *buffer; *buffer_len
|
||||
has to be set to the size of the
|
||||
buffer on input; buffer overflow is
|
||||
signalled with an error; buffer has
|
||||
to provide enough room for the
|
||||
encoded string plus the trailing
|
||||
0-byte
|
||||
|
||||
- in both cases, *buffer_len is
|
||||
updated to the size of the buffer
|
||||
/excluding/ the trailing 0-byte
|
||||
|
||||
*/
|
||||
int *buffer_len = va_arg(*p_va, int *);
|
||||
|
||||
format++;
|
||||
if (buffer_len == NULL)
|
||||
return "(buffer_len is NULL)";
|
||||
if (*buffer == NULL) {
|
||||
*buffer = PyMem_NEW(char, size + 1);
|
||||
if (*buffer == NULL) {
|
||||
Py_DECREF(s);
|
||||
return "(memory error)";
|
||||
}
|
||||
} else {
|
||||
if (size + 1 > *buffer_len) {
|
||||
Py_DECREF(s);
|
||||
return "(buffer overflow)";
|
||||
}
|
||||
}
|
||||
memcpy(*buffer,
|
||||
PyString_AS_STRING(s),
|
||||
size + 1);
|
||||
*buffer_len = size;
|
||||
} else {
|
||||
/* Using a 0-terminated buffer:
|
||||
|
||||
- the encoded string has to be
|
||||
0-terminated for this variant to
|
||||
work; if it is not, an error is raised
|
||||
|
||||
- a new buffer of the needed size
|
||||
is allocated and the data copied
|
||||
into it; *buffer is updated to
|
||||
point to the new buffer; the caller
|
||||
is responsible for free()ing it
|
||||
after usage
|
||||
|
||||
*/
|
||||
if (strlen(PyString_AS_STRING(s)) != size)
|
||||
return "(encoded string without "\
|
||||
"NULL bytes)";
|
||||
*buffer = PyMem_NEW(char, size + 1);
|
||||
if (*buffer == NULL) {
|
||||
Py_DECREF(s);
|
||||
return "(memory error)";
|
||||
}
|
||||
memcpy(*buffer,
|
||||
PyString_AS_STRING(s),
|
||||
size + 1);
|
||||
}
|
||||
Py_DECREF(s);
|
||||
break;
|
||||
}
|
||||
|
||||
case 'S': /* string object */
|
||||
{
|
||||
PyObject **p = va_arg(*p_va, PyObject **);
|
||||
|
|
Loading…
Reference in New Issue