diff --git a/Misc/NEWS b/Misc/NEWS index 579dcb49aaa..64e473ac589 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -258,6 +258,11 @@ Core and Builtins - Issue #5705: os.setuid() would not accept values > 2**31-1 but pwd.getpwnam() returned them on 64bit platforms. +- Issue #5108: Handle %s like %S and %R in PyUnicode_FromFormatV(): Call + PyUnicode_DecodeUTF8() once, remember the result and output it in a second + step. This avoids problems with counting UTF-8 bytes that ignores the effect + of using the replace error handler in PyUnicode_DecodeUTF8(). + Library ------- diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 62191ade0ab..8e18302a24b 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -674,15 +674,25 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) count = vargs; #endif #endif - /* step 1: count the number of %S/%R format specifications - * (we call PyObject_Str()/PyObject_Repr() for these objects - * once during step 3 and put the result in an array) */ + /* step 1: count the number of %S/%R/%s format specifications + * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these + * objects once during step 3 and put the result in an array) */ for (f = format; *f; f++) { - if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R')) - ++callcount; + if (*f == '%') { + if (*(f+1)=='%') + continue; + if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A') + ++callcount; + while (isdigit((unsigned)*f)) + width = (width*10) + *f++ - '0'; + while (*++f && *f != '%' && !isalpha((unsigned)*f)) + ; + if (*f == 's') + ++callcount; + } } /* step 2: allocate memory for the results of - * PyObject_Str()/PyObject_Repr() calls */ + * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ if (callcount) { callresults = PyObject_Malloc(sizeof(PyObject *)*callcount); if (!callresults) { @@ -731,35 +741,13 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) case 's': { /* UTF-8 */ - unsigned char*s; - s = va_arg(count, unsigned char*); - while (*s) { - if (*s < 128) { - n++; s++; - } else if (*s < 0xc0) { - /* invalid UTF-8 */ - n++; s++; - } else if (*s < 0xc0) { - n++; - s++; if(!*s)break; - s++; - } else if (*s < 0xe0) { - n++; - s++; if(!*s)break; - s++; if(!*s)break; - s++; - } else { -#ifdef Py_UNICODE_WIDE - n++; -#else - n+=2; -#endif - s++; if(!*s)break; - s++; if(!*s)break; - s++; if(!*s)break; - s++; - } - } + unsigned char *s = va_arg(count, unsigned char*); + PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace"); + if (!str) + goto fail; + n += PyUnicode_GET_SIZE(str); + /* Remember the str and switch to the next slot */ + *callresult++ = str; break; } case 'U': @@ -915,19 +903,15 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) break; case 's': { - /* Parameter must be UTF-8 encoded. - In case of encoding errors, use - the replacement character. */ - PyObject *u; - p = va_arg(vargs, char*); - u = PyUnicode_DecodeUTF8(p, strlen(p), - "replace"); - if (!u) - goto fail; - Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u), - PyUnicode_GET_SIZE(u)); - s += PyUnicode_GET_SIZE(u); - Py_DECREF(u); + /* unused, since we already have the result */ + (void) va_arg(vargs, char *); + Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult), + PyUnicode_GET_SIZE(*callresult)); + s += PyUnicode_GET_SIZE(*callresult); + /* We're done with the unicode()/repr() => forget it */ + Py_DECREF(*callresult); + /* switch to next unicode()/repr() result */ + ++callresult; break; } case 'U':