From d8855fde885ffcd9956352edb75674f38c64acaa Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Fri, 24 Mar 2000 22:14:19 +0000 Subject: [PATCH] Marc-Andre Lemburg: Attached you find the latest update of the Unicode implementation. The patch is against the current CVS version. It includes the fix I posted yesterday for the core dump problem in codecs.c (was introduced by my previous patch set -- sorry), adds more tests for the codecs and two new parser markers "es" and "es#". --- Lib/codecs.py | 2 +- Lib/test/output/test_unicode | 1 - Lib/test/test_unicode.py | 30 +++++++++ Misc/unicode.txt | 114 +++++++++++++++++++++++++++++++-- Python/getargs.c | 118 +++++++++++++++++++++++++++++++++++ 5 files changed, 259 insertions(+), 6 deletions(-) diff --git a/Lib/codecs.py b/Lib/codecs.py index 7f478d7191d..c09f804f1b6 100644 --- a/Lib/codecs.py +++ b/Lib/codecs.py @@ -46,7 +46,7 @@ class Codec: handling schemes by providing the errors argument. These string values are defined: - 'strict' - raise an error (or a subclass) + 'strict' - raise a ValueError error (or a subclass) 'ignore' - ignore the character and continue with the next 'replace' - replace with a suitable replacement character; Python will use the official U+FFFD REPLACEMENT diff --git a/Lib/test/output/test_unicode b/Lib/test/output/test_unicode index 382a631fd3d..1ec9031045e 100644 --- a/Lib/test/output/test_unicode +++ b/Lib/test/output/test_unicode @@ -1,5 +1,4 @@ test_unicode Testing Unicode comparisons... done. -Testing Unicode contains method... done. Testing Unicode formatting strings... done. Testing unicodedata module... done. diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 69d4273ace8..3d15f22a4ef 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -293,3 +293,33 @@ else: assert unicodedata.combining(u'\u20e1') == 230 print 'done.' 
+ +# Test builtin codecs +print 'Testing builtin codecs...', + +assert unicode('hello','ascii') == u'hello' +assert unicode('hello','utf-8') == u'hello' +assert unicode('hello','utf8') == u'hello' +assert unicode('hello','latin-1') == u'hello' + +assert u'hello'.encode('ascii') == 'hello' +assert u'hello'.encode('utf-8') == 'hello' +assert u'hello'.encode('utf8') == 'hello' +assert u'hello'.encode('utf-16-le') == 'h\000e\000l\000l\000o\000' +assert u'hello'.encode('utf-16-be') == '\000h\000e\000l\000l\000o' +assert u'hello'.encode('latin-1') == 'hello' + +u = u''.join(map(unichr, range(1024))) +for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be', + 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'): + assert unicode(u.encode(encoding),encoding) == u + +u = u''.join(map(unichr, range(256))) +for encoding in ('latin-1',): + assert unicode(u.encode(encoding),encoding) == u + +u = u''.join(map(unichr, range(128))) +for encoding in ('ascii',): + assert unicode(u.encode(encoding),encoding) == u + +print 'done.' diff --git a/Misc/unicode.txt b/Misc/unicode.txt index 9a4832afce8..fc1f2c5a249 100644 --- a/Misc/unicode.txt +++ b/Misc/unicode.txt @@ -715,21 +715,126 @@ Internal Argument Parsing: These markers are used by the PyArg_ParseTuple() APIs: - 'U': Check for Unicode object and return a pointer to it + "U": Check for Unicode object and return a pointer to it - 's': For Unicode objects: auto convert them to the <default encoding> + "s": For Unicode objects: auto convert them to the <default encoding> and return a pointer to the object's buffer. - 's#': Access to the Unicode object via the bf_getreadbuf buffer interface + "s#": Access to the Unicode object via the bf_getreadbuf buffer interface (see Buffer Interface); note that the length relates to the buffer length, not the Unicode string length (this may be different depending on the Internal Format). 
- 't#': Access to the Unicode object via the bf_getcharbuf buffer interface + "t#": Access to the Unicode object via the bf_getcharbuf buffer interface (see Buffer Interface); note that the length relates to the buffer length, not necessarily to the Unicode string length (this may be different depending on the <default encoding>). + "es": + Takes two parameters: encoding (const char *) and + buffer (char **). + + The input object is first coerced to Unicode in the usual way + and then encoded into a string using the given encoding. + + On output, a buffer of the needed size is allocated and + returned through *buffer as a NULL-terminated string. + The encoded string may not contain embedded NULL characters. + The caller is responsible for free()ing the allocated *buffer + after usage. + + "es#": + Takes three parameters: encoding (const char *), + buffer (char **) and buffer_len (int *). + + The input object is first coerced to Unicode in the usual way + and then encoded into a string using the given encoding. + + If *buffer is non-NULL, *buffer_len must be set to sizeof(buffer) + on input. Output is then copied to *buffer. + + If *buffer is NULL, a buffer of the needed size is + allocated and output copied into it. *buffer is then + updated to point to the allocated memory area. The caller + is responsible for free()ing *buffer after usage. + + In both cases *buffer_len is updated to the number of + characters written (excluding the trailing NULL-byte). + The output buffer is assured to be NULL-terminated. 
+ +Examples: + +Using "es#" with auto-allocation: + + static PyObject * + test_parser(PyObject *self, + PyObject *args) + { + PyObject *str; + const char *encoding = "latin-1"; + char *buffer = NULL; + int buffer_len = 0; + + if (!PyArg_ParseTuple(args, "es#:test_parser", + encoding, &buffer, &buffer_len)) + return NULL; + if (!buffer) { + PyErr_SetString(PyExc_SystemError, + "buffer is NULL"); + return NULL; + } + str = PyString_FromStringAndSize(buffer, buffer_len); + free(buffer); + return str; + } + +Using "es" with auto-allocation returning a NULL-terminated string: + + static PyObject * + test_parser(PyObject *self, + PyObject *args) + { + PyObject *str; + const char *encoding = "latin-1"; + char *buffer = NULL; + + if (!PyArg_ParseTuple(args, "es:test_parser", + encoding, &buffer)) + return NULL; + if (!buffer) { + PyErr_SetString(PyExc_SystemError, + "buffer is NULL"); + return NULL; + } + str = PyString_FromString(buffer); + free(buffer); + return str; + } + +Using "es#" with a pre-allocated buffer: + + static PyObject * + test_parser(PyObject *self, + PyObject *args) + { + PyObject *str; + const char *encoding = "latin-1"; + char _buffer[10]; + char *buffer = _buffer; + int buffer_len = sizeof(_buffer); + + if (!PyArg_ParseTuple(args, "es#:test_parser", + encoding, &buffer, &buffer_len)) + return NULL; + if (!buffer) { + PyErr_SetString(PyExc_SystemError, + "buffer is NULL"); + return NULL; + } + str = PyString_FromStringAndSize(buffer, buffer_len); + return str; + } + File/Stream Output: ------------------- @@ -837,6 +942,7 @@ Encodings: History of this Proposal: ------------------------- +1.3: Added new "es" and "es#" parser markers 1.2: Removed POD about codecs.open() 1.1: Added note about comparisons and hash values. Added note about case mapping algorithms. 
Changed stream codecs .read() and diff --git a/Python/getargs.c b/Python/getargs.c index 4617d0515e4..a4b0fe4c8d7 100644 --- a/Python/getargs.c +++ b/Python/getargs.c @@ -178,6 +178,8 @@ vgetargs1(args, format, p_va, compat) } else if (level != 0) ; /* Pass */ + else if (c == 'e') + ; /* Pass */ else if (isalpha(c)) max++; else if (c == '|') @@ -654,6 +656,122 @@ convertsimple1(arg, p_format, p_va) break; } + case 'e': /* encoded string */ + { + char **buffer; + const char *encoding; + PyObject *u, *s; + int size; + + /* Get 'e' parameter: the encoding name */ + encoding = (const char *)va_arg(*p_va, const char *); + if (encoding == NULL) + return "(encoding is NULL)"; + + /* Get 's' parameter: the output buffer to use */ + if (*format != 's') + return "(unkown parser marker combination)"; + buffer = (char **)va_arg(*p_va, char **); + format++; + if (buffer == NULL) + return "(buffer is NULL)"; + + /* Convert object to Unicode */ + u = PyUnicode_FromObject(arg); + if (u == NULL) + return "string, unicode or text buffer"; + + /* Encode object; use default error handling */ + s = PyUnicode_AsEncodedString(u, + encoding, + NULL); + Py_DECREF(u); + if (s == NULL) + return "(encoding failed)"; + if (!PyString_Check(s)) { + Py_DECREF(s); + return "(encoder failed to return a string)"; + } + size = PyString_GET_SIZE(s); + + /* Write output; output is guaranteed to be + 0-terminated */ + if (*format == '#') { + /* Using buffer length parameter '#': + + - if *buffer is NULL, a new buffer + of the needed size is allocated and + the data copied into it; *buffer is + updated to point to the new buffer; + the caller is responsible for + free()ing it after usage + + - if *buffer is not NULL, the data + is copied to *buffer; *buffer_len + has to be set to the size of the + buffer on input; buffer overflow is + signalled with an error; buffer has + to provide enough room for the + encoded string plus the trailing + 0-byte + + - in both cases, *buffer_len is + updated to the size 
of the buffer + /excluding/ the trailing 0-byte + + */ + int *buffer_len = va_arg(*p_va, int *); + + format++; + if (buffer_len == NULL) + return "(buffer_len is NULL)"; + if (*buffer == NULL) { + *buffer = PyMem_NEW(char, size + 1); + if (*buffer == NULL) { + Py_DECREF(s); + return "(memory error)"; + } + } else { + if (size + 1 > *buffer_len) { + Py_DECREF(s); + return "(buffer overflow)"; + } + } + memcpy(*buffer, + PyString_AS_STRING(s), + size + 1); + *buffer_len = size; + } else { + /* Using a 0-terminated buffer: + + - the encoded string has to be + 0-terminated for this variant to + work; if it is not, an error raised + + - a new buffer of the needed size + is allocated and the data copied + into it; *buffer is updated to + point to the new buffer; the caller + is responsible for free()ing it + after usage + + */ + if (strlen(PyString_AS_STRING(s)) != size) + return "(encoded string without "\ + "NULL bytes)"; + *buffer = PyMem_NEW(char, size + 1); + if (*buffer == NULL) { + Py_DECREF(s); + return "(memory error)"; + } + memcpy(*buffer, + PyString_AS_STRING(s), + size + 1); + } + Py_DECREF(s); + break; + } + case 'S': /* string object */ { PyObject **p = va_arg(*p_va, PyObject **);