Fix to the UTF-8 encoder: it failed on 0-length input strings.

Fix for the UTF-8 decoder: it will now accept isolated surrogates
(previously it raised an exception, which caused round-trips to
fail).

Added new tests for UTF-8 round-trip safety (we rely on UTF-8 for
marshalling Unicode objects, so we had better make sure it works for
all Unicode code points, including isolated surrogates).

Bumped the PYC magic in a non-standard way -- please review. This
was needed because the old PYC format used illegal UTF-8 sequences
for isolated high surrogates, which now raise an exception.
This commit is contained in:
Marc-André Lemburg 2002-02-07 11:33:49 +00:00
parent 9273ec726c
commit bd3be8f0ca
4 changed files with 71 additions and 31 deletions

View File

@ -1,5 +1,5 @@
test_unicodedata
Testing Unicode Database...
Methods: 6c7a7c02657b69d0fdd7a7d174f573194bba2e18
Methods: 84b72943b1d4320bc1e64a4888f7cdf62eea219a
Functions: 41e1d4792185d6474a43c83ce4f593b1bdb01f8a
API: ok

View File

@ -23,21 +23,23 @@ if not sys.platform.startswith('java'):
verify(repr(u"'\"") == """u'\\'"'""")
verify(repr(u"'") == '''u"'"''')
verify(repr(u'"') == """u'"'""")
verify(repr(u''.join(map(unichr, range(256)))) ==
"u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
"\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
"\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
"JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
"\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
"\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
"\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
"\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
"\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
"\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
"\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
"\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
"\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
"\\xfe\\xff'")
latin1repr = (
"u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
"\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
"\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
"JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
"\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
"\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
"\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
"\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
"\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
"\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
"\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
"\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
"\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
"\\xfe\\xff'")
testrepr = repr(u''.join(map(unichr, range(256))))
verify(testrepr == latin1repr)
def test(method, input, output, *args):
if verbose:
@ -495,6 +497,7 @@ else:
verify(unicode('+3ADYAA-', 'utf-7', 'replace') == u'\ufffd')
# UTF-8 specific encoding tests:
verify(u''.encode('utf-8') == '')
verify(u'\u20ac'.encode('utf-8') == '\xe2\x82\xac')
verify(u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82')
verify(u'\ud84d\udc56'.encode('utf-8') == '\xf0\xa3\x91\x96')
@ -552,14 +555,7 @@ for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
verify(unicode(u.encode(encoding),encoding) == u)
# Roundtrip safety for non-BMP (just a few chars)
u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
for encoding in ('utf-8',
'utf-16', 'utf-16-le', 'utf-16-be',
#'raw_unicode_escape',
'unicode_escape', 'unicode_internal'):
verify(unicode(u.encode(encoding),encoding) == u)
# Roundtrip safety for BMP (just the first 256 chars)
u = u''.join(map(unichr, range(256)))
for encoding in (
'latin-1',
@ -571,6 +567,7 @@ for encoding in (
except ValueError,why:
print '*** codec for "%s" failed: %s' % (encoding, why)
# Roundtrip safety for BMP (just the first 128 chars)
u = u''.join(map(unichr, range(128)))
for encoding in (
'ascii',
@ -582,6 +579,19 @@ for encoding in (
except ValueError,why:
print '*** codec for "%s" failed: %s' % (encoding, why)
# Roundtrip safety for non-BMP (just a few chars)
u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
for encoding in ('utf-8',
'utf-16', 'utf-16-le', 'utf-16-be',
#'raw_unicode_escape',
'unicode_escape', 'unicode_internal'):
verify(unicode(u.encode(encoding),encoding) == u)
# UTF-8 must be roundtrip safe for all UCS-2 code points
u = u''.join(map(unichr, range(0x10000)))
for encoding in ('utf-8',):
verify(unicode(u.encode(encoding),encoding) == u)
print 'done.'
print 'Testing standard mapping codecs...',

View File

@ -1065,12 +1065,19 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
goto utf8Error;
}
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
if (ch < 0x0800) {
/* Note: UTF-8 encodings of surrogates are considered
legal UTF-8 sequences;
XXX For wide builds (UCS-4) we should probably try
to recombine the surrogates into a single code
unit.
*/
errmsg = "illegal encoding";
goto utf8Error;
}
else
*p++ = (Py_UNICODE)ch;
*p++ = (Py_UNICODE)ch;
break;
case 4:
@ -1084,9 +1091,9 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
/* validate and convert to UTF-16 */
if ((ch < 0x10000) /* minimum value allowed for 4
byte encoding */
byte encoding */
|| (ch > 0x10ffff)) /* maximum value allowed for
UTF-16 */
UTF-16 */
{
errmsg = "illegal encoding";
goto utf8Error;
@ -1175,11 +1182,15 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
unsigned int cbWritten = 0;
int i = 0;
/* Short-cut for empty strings */
if (size == 0)
return PyString_FromStringAndSize(NULL, 0);
/* We allocate 4 more bytes to have room for at least one full
UTF-8 sequence; saves a few cycles in the loop below */
v = PyString_FromStringAndSize(NULL, cbAllocated + 4);
if (v == NULL)
return NULL;
if (size == 0)
return v;
p = PyString_AS_STRING(v);
while (i < size) {

View File

@ -41,8 +41,27 @@ extern time_t PyOS_GetLastModificationTime(char *, FILE *);
the Unicode -U option is in use. IMO (Tim's), that's a Bad Idea
(quite apart from that the -U option doesn't work so isn't used
anyway).
XXX MAL, 2002-02-07: I had to modify the MAGIC due to a fix of the
UTF-8 encoder (it previously produced invalid UTF-8 for unpaired
high surrogates), so I simply bumped the month value to 20 (invalid
month) and set the day to 1. This should be recognizable by any
algorithm relying on the above scheme. Perhaps we should simply
start counting in increments of 10 from now on ?!
Known values:
Python 1.5: 20121
Python 1.5.1: 20121
Python 1.5.2: 20121
Python 2.0: 50823
Python 2.0.1: 50823
Python 2.1: 60202
Python 2.1.1: 60202
Python 2.1.2: 60202
Python 2.2: 60717
Python 2.3a0: 62001
*/
#define MAGIC (60717 | ((long)'\r'<<16) | ((long)'\n'<<24))
#define MAGIC (62001 | ((long)'\r'<<16) | ((long)'\n'<<24))
/* Magic word as global; note that _PyImport_Init() can change the
value of this global to accommodate for alterations of how the