Issue #6097: Escape UTF-8 surrogates resulting from mbstowcs conversion

of the command line.
This commit is contained in:
Martin v. Löwis 2009-05-29 16:22:26 +00:00
parent e23c8683a5
commit 8ed91b2768
2 changed files with 21 additions and 2 deletions

View File

@ -12,6 +12,9 @@ What's New in Python 3.1 release candidate 1?
Core and Builtins
-----------------
- Issue #6097: Escape UTF-8 surrogates resulting from mbstowcs conversion
of the command line.
- Issue #6012: Add cleanup support to O& argument parsing.
- Issue #6089: Fixed str.format with certain invalid field specifiers

View File

@ -38,8 +38,16 @@ char2wchar(char* arg)
if (!res)
goto oom;
count = mbstowcs(res, arg, argsize+1);
if (count != (size_t)-1)
if (count != (size_t)-1) {
wchar_t *tmp;
/* Only use the result if it contains no
surrogate characters. */
for (tmp = res; *tmp != 0 &&
(*tmp < 0xd800 || *tmp > 0xdfff); tmp++)
;
if (*tmp == 0)
return res;
}
PyMem_Free(res);
}
/* Conversion failed. Fall back to escaping with surrogateescape. */
@ -75,6 +83,14 @@ char2wchar(char* arg)
memset(&mbs, 0, sizeof mbs);
continue;
}
if (*out >= 0xd800 && *out <= 0xdfff) {
/* Surrogate character. Escape the original
byte sequence with surrogateescape. */
argsize -= converted;
while (converted--)
*out++ = 0xdc00 + *in++;
continue;
}
/* successfully converted some bytes */
in += converted;
argsize -= converted;