Merged revisions 72260 via svnmerge from

svn+ssh://pythondev@svn.python.org/python/trunk

........
  r72260 | walter.doerwald | 2009-05-04 00:36:33 +0200 (Mo, 04 Mai 2009) | 5 lines

  Issue #5108: Handle %s like %S and %R in PyUnicode_FromFormatV(): call
  PyUnicode_DecodeUTF8() once, remember the result, and output it in a second
  step. This avoids the problems of counting UTF-8 bytes by hand, which ignores
  the effect of the "replace" error handler in PyUnicode_DecodeUTF8().
........
This commit is contained in:
Walter Dörwald 2009-05-03 22:55:55 +00:00
parent 129ab1d809
commit c1651a0b96
2 changed files with 38 additions and 49 deletions

View File

@ -499,6 +499,11 @@ Core and Builtins
- The re.sub(), re.subn() and re.split() functions now accept a flags parameter. - The re.sub(), re.subn() and re.split() functions now accept a flags parameter.
- Issue #5108: Handle %s like %S, %R and %A in PyUnicode_FromFormatV(): call
PyUnicode_DecodeUTF8() once, remember the result, and output it in a second
step. This avoids the problems of counting UTF-8 bytes by hand, which ignores
the effect of the "replace" error handler in PyUnicode_DecodeUTF8().
Library Library
------- -------

View File

@ -723,16 +723,26 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
count = vargs; count = vargs;
#endif #endif
#endif #endif
/* step 1: count the number of %S/%R/%A format specifications /* step 1: count the number of %S/%R/%A/%s format specifications
* (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
* these objects once during step 3 and put the result in * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
an array) */ * result in an array) */
for (f = format; *f; f++) { for (f = format; *f; f++) {
if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')) if (*f == '%') {
++callcount; if (*(f+1)=='%')
continue;
if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
++callcount;
while (ISDIGIT((unsigned)*f))
width = (width*10) + *f++ - '0';
while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
;
if (*f == 's')
++callcount;
}
} }
/* step 2: allocate memory for the results of /* step 2: allocate memory for the results of
* PyObject_Str()/PyObject_Repr() calls */ * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
if (callcount) { if (callcount) {
callresults = PyObject_Malloc(sizeof(PyObject *)*callcount); callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
if (!callresults) { if (!callresults) {
@ -781,35 +791,13 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
case 's': case 's':
{ {
/* UTF-8 */ /* UTF-8 */
unsigned char*s; unsigned char *s = va_arg(count, unsigned char*);
s = va_arg(count, unsigned char*); PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
while (*s) { if (!str)
if (*s < 128) { goto fail;
n++; s++; n += PyUnicode_GET_SIZE(str);
} else if (*s < 0xc0) { /* Remember the str and switch to the next slot */
/* invalid UTF-8 */ *callresult++ = str;
n++; s++;
} else if (*s < 0xc0) {
n++;
s++; if(!*s)break;
s++;
} else if (*s < 0xe0) {
n++;
s++; if(!*s)break;
s++; if(!*s)break;
s++;
} else {
#ifdef Py_UNICODE_WIDE
n++;
#else
n+=2;
#endif
s++; if(!*s)break;
s++; if(!*s)break;
s++; if(!*s)break;
s++;
}
}
break; break;
} }
case 'U': case 'U':
@ -978,19 +966,15 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
break; break;
case 's': case 's':
{ {
/* Parameter must be UTF-8 encoded. /* unused, since we already have the result */
In case of encoding errors, use (void) va_arg(vargs, char *);
the replacement character. */ Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
PyObject *u; PyUnicode_GET_SIZE(*callresult));
p = va_arg(vargs, char*); s += PyUnicode_GET_SIZE(*callresult);
u = PyUnicode_DecodeUTF8(p, strlen(p), /* We're done with the unicode()/repr() => forget it */
"replace"); Py_DECREF(*callresult);
if (!u) /* switch to next unicode()/repr() result */
goto fail; ++callresult;
Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
PyUnicode_GET_SIZE(u));
s += PyUnicode_GET_SIZE(u);
Py_DECREF(u);
break; break;
} }
case 'U': case 'U':