Issue #5108: Handle %s like %S and %R in PyUnicode_FromFormatV(): Call

PyUnicode_DecodeUTF8() once, remember the result and output it in a second step. This avoids the problems of the old manual UTF-8 byte counting, which ignored the effect of the replace error handler in PyUnicode_DecodeUTF8().
2009-05-03 22:36:33 +00:00 · 2009-05-03 22:36:33 +00:00 · ed960ac404
parent 01fce5adc0
commit ed960ac404
2 changed files with 37 additions and 48 deletions
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -258,6 +258,11 @@ Core and Builtins
 - Issue #5705: os.setuid() would not accept values > 2**31-1 but pwd.getpwnam()
  returned them on 64bit platforms.
 - Issue #5108: Handle %s like %S and %R in PyUnicode_FromFormatV(): Call
  PyUnicode_DecodeUTF8() once, remember the result and output it in a second
  step. This avoids the problems of the old manual UTF-8 byte counting, which
  ignored the effect of the replace error handler in PyUnicode_DecodeUTF8().
 Library
 -------
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -674,15 +674,25 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
    count = vargs;
 #endif
 #endif
-    /* step 1: count the number of %S/%R format specifications
+     /* step 1: count the number of %S/%R/%s format specifications
-     * (we call PyObject_Str()/PyObject_Repr() for these objects
+      * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
-     * once during step 3 and put the result in an array) */
+      * objects once during step 3 and put the result in an array) */
    for (f = format; *f; f++) {
-        if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
+         if (*f == '%') {
             if (*(f+1)=='%')
                 continue;
             if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
                 ++callcount;
             while (isdigit((unsigned)*f))
                 width = (width*10) + *f++ - '0';
             while (*++f && *f != '%' && !isalpha((unsigned)*f))
                 ;
             if (*f == 's')
                 ++callcount;
         }
    }
    /* step 2: allocate memory for the results of
-     * PyObject_Str()/PyObject_Repr() calls */
+     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
    if (callcount) {
        callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
        if (!callresults) {
@@ -731,35 +741,13 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
            case 's':
            {
                /* UTF-8 */
-                unsigned char*s;
+                unsigned char *s = va_arg(count, unsigned char*);
-                s = va_arg(count, unsigned char*);
+                PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
-                while (*s) {
+                if (!str)
-                    if (*s < 128) {
+                    goto fail;
-                        n++; s++;
+                n += PyUnicode_GET_SIZE(str);
-                    } else if (*s < 0xc0) {
+                /* Remember the str and switch to the next slot */
-                        /* invalid UTF-8 */
+                *callresult++ = str;
                        n++; s++;
                    } else if (*s < 0xc0) {
                        n++;
                        s++; if(!*s)break;
                        s++;
                    } else if (*s < 0xe0) {
                        n++;
                        s++; if(!*s)break;
                        s++; if(!*s)break;
                        s++;
                    } else {
 #ifdef Py_UNICODE_WIDE
                        n++;
 #else
                        n+=2;
 #endif
                        s++; if(!*s)break;
                        s++; if(!*s)break;
                        s++; if(!*s)break;
                        s++;
                    }
                }
                break;
            }
            case 'U':
@@ -915,19 +903,15 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
                break;
            case 's':
            {
-                /* Parameter must be UTF-8 encoded.
+                /* unused, since we already have the result */
-                   In case of encoding errors, use
+                (void) va_arg(vargs, char *);
-                   the replacement character. */
+                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
-                PyObject *u;
+                                PyUnicode_GET_SIZE(*callresult));
-                p = va_arg(vargs, char*);
+                s += PyUnicode_GET_SIZE(*callresult);
-                u = PyUnicode_DecodeUTF8(p, strlen(p),
+                /* We're done with the unicode()/repr() => forget it */
-                                         "replace");
+                Py_DECREF(*callresult);
-                if (!u)
+                /* switch to next unicode()/repr() result */
-                    goto fail;
+                ++callresult;
                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
                                PyUnicode_GET_SIZE(u));
                s += PyUnicode_GET_SIZE(u);
                Py_DECREF(u);
                break;
            }
            case 'U':