mirror of https://github.com/python/cpython
gh-70278: Fix PyUnicode_FromFormat() with precision for %s and %V (GH-120365)
PyUnicode_FromFormat() no longer produces a trailing \ufffd character for a truncated C string when a precision is used with %s and %V. It now truncates the string just before the start of a truncated multibyte sequence.
This commit is contained in:
parent
22b8a35d6e
commit
6eb23b1311
|
@ -419,8 +419,29 @@ class CAPITest(unittest.TestCase):
|
||||||
# truncated string
|
# truncated string
|
||||||
check_format('abc',
|
check_format('abc',
|
||||||
b'%.3s', b'abcdef')
|
b'%.3s', b'abcdef')
|
||||||
|
check_format('abc[',
|
||||||
|
b'%.6s', 'abc[\u20ac]'.encode('utf8'))
|
||||||
|
check_format('abc[\u20ac',
|
||||||
|
b'%.7s', 'abc[\u20ac]'.encode('utf8'))
|
||||||
check_format('abc[\ufffd',
|
check_format('abc[\ufffd',
|
||||||
b'%.5s', 'abc[\u20ac]'.encode('utf8'))
|
b'%.5s', b'abc[\xff]')
|
||||||
|
check_format('abc[',
|
||||||
|
b'%.6s', b'abc[\xe2\x82]')
|
||||||
|
check_format('abc[\ufffd]',
|
||||||
|
b'%.7s', b'abc[\xe2\x82]')
|
||||||
|
check_format('abc[\ufffd',
|
||||||
|
b'%.7s', b'abc[\xe2\x82\0')
|
||||||
|
check_format(' abc[',
|
||||||
|
b'%10.6s', 'abc[\u20ac]'.encode('utf8'))
|
||||||
|
check_format(' abc[\u20ac',
|
||||||
|
b'%10.7s', 'abc[\u20ac]'.encode('utf8'))
|
||||||
|
check_format(' abc[\ufffd',
|
||||||
|
b'%10.5s', b'abc[\xff]')
|
||||||
|
check_format(' abc[',
|
||||||
|
b'%10.6s', b'abc[\xe2\x82]')
|
||||||
|
check_format(' abc[\ufffd]',
|
||||||
|
b'%10.7s', b'abc[\xe2\x82]')
|
||||||
|
|
||||||
check_format("'\\u20acABC'",
|
check_format("'\\u20acABC'",
|
||||||
b'%A', '\u20acABC')
|
b'%A', '\u20acABC')
|
||||||
check_format("'\\u20",
|
check_format("'\\u20",
|
||||||
|
@ -433,10 +454,31 @@ class CAPITest(unittest.TestCase):
|
||||||
b'%.3S', '\u20acABCDEF')
|
b'%.3S', '\u20acABCDEF')
|
||||||
check_format('\u20acAB',
|
check_format('\u20acAB',
|
||||||
b'%.3U', '\u20acABCDEF')
|
b'%.3U', '\u20acABCDEF')
|
||||||
|
|
||||||
check_format('\u20acAB',
|
check_format('\u20acAB',
|
||||||
b'%.3V', '\u20acABCDEF', None)
|
b'%.3V', '\u20acABCDEF', None)
|
||||||
|
check_format('abc[',
|
||||||
|
b'%.6V', None, 'abc[\u20ac]'.encode('utf8'))
|
||||||
|
check_format('abc[\u20ac',
|
||||||
|
b'%.7V', None, 'abc[\u20ac]'.encode('utf8'))
|
||||||
check_format('abc[\ufffd',
|
check_format('abc[\ufffd',
|
||||||
b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
|
b'%.5V', None, b'abc[\xff]')
|
||||||
|
check_format('abc[',
|
||||||
|
b'%.6V', None, b'abc[\xe2\x82]')
|
||||||
|
check_format('abc[\ufffd]',
|
||||||
|
b'%.7V', None, b'abc[\xe2\x82]')
|
||||||
|
check_format(' abc[',
|
||||||
|
b'%10.6V', None, 'abc[\u20ac]'.encode('utf8'))
|
||||||
|
check_format(' abc[\u20ac',
|
||||||
|
b'%10.7V', None, 'abc[\u20ac]'.encode('utf8'))
|
||||||
|
check_format(' abc[\ufffd',
|
||||||
|
b'%10.5V', None, b'abc[\xff]')
|
||||||
|
check_format(' abc[',
|
||||||
|
b'%10.6V', None, b'abc[\xe2\x82]')
|
||||||
|
check_format(' abc[\ufffd]',
|
||||||
|
b'%10.7V', None, b'abc[\xe2\x82]')
|
||||||
|
check_format(' abc[\ufffd',
|
||||||
|
b'%10.7V', None, b'abc[\xe2\x82\0')
|
||||||
|
|
||||||
# the following tests come from #7330
|
# the following tests come from #7330
|
||||||
# test width modifier and precision modifier with %S
|
# test width modifier and precision modifier with %S
|
||||||
|
|
|
@ -0,0 +1,4 @@
|
||||||
|
:c:func:`PyUnicode_FromFormat` no longer produces the ending ``\ufffd``
|
||||||
|
character for a truncated C string when a precision is used with ``%s`` and ``%V``.
|
||||||
|
It now truncates the string before the start of truncated multibyte
|
||||||
|
sequences.
|
|
@ -2581,6 +2581,7 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
|
||||||
Py_ssize_t width, Py_ssize_t precision, int flags)
|
Py_ssize_t width, Py_ssize_t precision, int flags)
|
||||||
{
|
{
|
||||||
/* UTF-8 */
|
/* UTF-8 */
|
||||||
|
Py_ssize_t *pconsumed = NULL;
|
||||||
Py_ssize_t length;
|
Py_ssize_t length;
|
||||||
if (precision == -1) {
|
if (precision == -1) {
|
||||||
length = strlen(str);
|
length = strlen(str);
|
||||||
|
@ -2590,15 +2591,23 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
|
||||||
while (length < precision && str[length]) {
|
while (length < precision && str[length]) {
|
||||||
length++;
|
length++;
|
||||||
}
|
}
|
||||||
|
if (length == precision) {
|
||||||
|
/* The input string is not NUL-terminated. If it ends with an
|
||||||
|
* incomplete UTF-8 sequence, truncate the string just before it.
|
||||||
|
* Incomplete sequences in the middle and sequences which cannot
|
||||||
|
* be valid prefixes are still treated as errors and replaced
|
||||||
|
* with \ufffd (U+FFFD REPLACEMENT CHARACTER). */
|
||||||
|
pconsumed = &length;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (width < 0) {
|
if (width < 0) {
|
||||||
return unicode_decode_utf8_writer(writer, str, length,
|
return unicode_decode_utf8_writer(writer, str, length,
|
||||||
_Py_ERROR_REPLACE, "replace", NULL);
|
_Py_ERROR_REPLACE, "replace", pconsumed);
|
||||||
}
|
}
|
||||||
|
|
||||||
PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
|
PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
|
||||||
"replace", NULL);
|
"replace", pconsumed);
|
||||||
if (unicode == NULL)
|
if (unicode == NULL)
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue