Issue #5108: Handle %s like %S and %R in PyUnicode_FromFormatV(): Call

PyUnicode_DecodeUTF8() once, remember the result and output it in a second
step. This avoids the problems caused by counting UTF-8 bytes manually, which
ignores the effect of the replace error handler in PyUnicode_DecodeUTF8().
This commit is contained in:
Walter Dörwald 2009-05-03 22:36:33 +00:00
parent 01fce5adc0
commit ed960ac404
2 changed files with 37 additions and 48 deletions

View File

@ -258,6 +258,11 @@ Core and Builtins
- Issue #5705: os.setuid() would not accept values > 2**31-1 but pwd.getpwnam()
returned them on 64bit platforms.
- Issue #5108: Handle %s like %S and %R in PyUnicode_FromFormatV(): Call
PyUnicode_DecodeUTF8() once, remember the result and output it in a second
step. This avoids the problems caused by counting UTF-8 bytes manually, which
ignores the effect of the replace error handler in PyUnicode_DecodeUTF8().
Library
-------

View File

@ -674,15 +674,25 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
count = vargs;
#endif
#endif
/* step 1: count the number of %S/%R format specifications
* (we call PyObject_Str()/PyObject_Repr() for these objects
* once during step 3 and put the result in an array) */
/* step 1: count the number of %S/%R/%A/%s format specifications
* (the conversion for each one, e.g. PyObject_Str()/PyObject_Repr()/
* PyUnicode_DecodeUTF8(), is called once during step 3 and its result is
* put in an array) */
for (f = format; *f; f++) {
if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
++callcount;
if (*f == '%') {
if (*(f+1)=='%')
continue;
if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
++callcount;
while (isdigit((unsigned)*f))
width = (width*10) + *f++ - '0';
while (*++f && *f != '%' && !isalpha((unsigned)*f))
;
if (*f == 's')
++callcount;
}
}
/* step 2: allocate memory for the results of
* PyObject_Str()/PyObject_Repr() calls */
* PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
if (callcount) {
callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
if (!callresults) {
@ -731,35 +741,13 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
case 's':
{
/* UTF-8 */
unsigned char*s;
s = va_arg(count, unsigned char*);
while (*s) {
if (*s < 128) {
n++; s++;
} else if (*s < 0xc0) {
/* invalid UTF-8 */
n++; s++;
} else if (*s < 0xc0) {
n++;
s++; if(!*s)break;
s++;
} else if (*s < 0xe0) {
n++;
s++; if(!*s)break;
s++; if(!*s)break;
s++;
} else {
#ifdef Py_UNICODE_WIDE
n++;
#else
n+=2;
#endif
s++; if(!*s)break;
s++; if(!*s)break;
s++; if(!*s)break;
s++;
}
}
unsigned char *s = va_arg(count, unsigned char*);
PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
if (!str)
goto fail;
n += PyUnicode_GET_SIZE(str);
/* Remember the str and switch to the next slot */
*callresult++ = str;
break;
}
case 'U':
@ -915,19 +903,15 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
break;
case 's':
{
/* Parameter must be UTF-8 encoded.
In case of encoding errors, use
the replacement character. */
PyObject *u;
p = va_arg(vargs, char*);
u = PyUnicode_DecodeUTF8(p, strlen(p),
"replace");
if (!u)
goto fail;
Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
PyUnicode_GET_SIZE(u));
s += PyUnicode_GET_SIZE(u);
Py_DECREF(u);
/* unused, since we already have the result */
(void) va_arg(vargs, char *);
Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
PyUnicode_GET_SIZE(*callresult));
s += PyUnicode_GET_SIZE(*callresult);
/* We're done with the stored decode result => release our reference */
Py_DECREF(*callresult);
/* switch to the next stored call result */
++callresult;
break;
}
case 'U':