Port error handlers from Py_UNICODE indexing to code point indexing.

This commit is contained in:
Martin v. Löwis 2011-11-04 11:16:41 +01:00
parent 495dcbd5c1
commit b09af03b8a
2 changed files with 51 additions and 77 deletions

View File

@ -1513,6 +1513,11 @@ UnicodeEncodeError_init(PyObject *self, PyObject *args, PyObject *kwds)
return -1; return -1;
} }
/* PyUnicode_READY() returns 0 on success and -1 (with an exception set)
   on failure, so any negative result must be treated as an error.  The
   previous `< -1` comparison could never be true, which meant a failure
   to ready the string was silently ignored and later code point reads
   would operate on a non-ready object. */
if (PyUnicode_READY(err->object) < 0) {
    /* The Py_INCREF calls for the fields come after this check, so
       encoding is not yet an owned reference at this point; it is
       cleared before bailing out.
       NOTE(review): object and reason are likewise not yet INCREF'd
       here -- confirm whether they need to be cleared as well. */
    err->encoding = NULL;
    return -1;
}
Py_INCREF(err->encoding); Py_INCREF(err->encoding);
Py_INCREF(err->object); Py_INCREF(err->object);
Py_INCREF(err->reason); Py_INCREF(err->reason);

View File

@ -573,82 +573,72 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
PyObject *restuple; PyObject *restuple;
PyObject *object; PyObject *object;
Py_ssize_t i, o;
Py_ssize_t start; Py_ssize_t start;
Py_ssize_t end; Py_ssize_t end;
PyObject *res; PyObject *res;
Py_UNICODE *p; unsigned char *outp;
Py_UNICODE *startp;
Py_UNICODE *outp;
int ressize; int ressize;
Py_UCS4 ch;
if (PyUnicodeEncodeError_GetStart(exc, &start)) if (PyUnicodeEncodeError_GetStart(exc, &start))
return NULL; return NULL;
if (PyUnicodeEncodeError_GetEnd(exc, &end)) if (PyUnicodeEncodeError_GetEnd(exc, &end))
return NULL; return NULL;
if (!(object = PyUnicodeEncodeError_GetObject(exc))) if (!(object = PyUnicodeEncodeError_GetObject(exc)))
return NULL; return NULL;
startp = PyUnicode_AS_UNICODE(object); for (i = start, ressize = 0; i < end; ++i) {
for (p = startp+start, ressize = 0; p < startp+end; ++p) { /* object is guaranteed to be "ready" */
if (*p<10) ch = PyUnicode_READ_CHAR(object, i);
if (ch<10)
ressize += 2+1+1; ressize += 2+1+1;
else if (*p<100) else if (ch<100)
ressize += 2+2+1; ressize += 2+2+1;
else if (*p<1000) else if (ch<1000)
ressize += 2+3+1; ressize += 2+3+1;
else if (*p<10000) else if (ch<10000)
ressize += 2+4+1; ressize += 2+4+1;
#ifndef Py_UNICODE_WIDE else if (ch<100000)
else
ressize += 2+5+1; ressize += 2+5+1;
#else else if (ch<1000000)
else if (*p<100000)
ressize += 2+5+1;
else if (*p<1000000)
ressize += 2+6+1; ressize += 2+6+1;
else else
ressize += 2+7+1; ressize += 2+7+1;
#endif
} }
/* allocate replacement */ /* allocate replacement */
res = PyUnicode_FromUnicode(NULL, ressize); res = PyUnicode_New(ressize, 127);
if (res == NULL) { if (res == NULL) {
Py_DECREF(object); Py_DECREF(object);
return NULL; return NULL;
} }
outp = PyUnicode_1BYTE_DATA(res);
/* generate replacement */ /* generate replacement */
for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); for (i = start, o = 0; i < end; ++i) {
p < startp+end; ++p) { ch = PyUnicode_READ_CHAR(object, i);
Py_UNICODE c = *p;
int digits; int digits;
int base; int base;
*outp++ = '&'; *outp++ = '&';
*outp++ = '#'; *outp++ = '#';
if (*p<10) { if (ch<10) {
digits = 1; digits = 1;
base = 1; base = 1;
} }
else if (*p<100) { else if (ch<100) {
digits = 2; digits = 2;
base = 10; base = 10;
} }
else if (*p<1000) { else if (ch<1000) {
digits = 3; digits = 3;
base = 100; base = 100;
} }
else if (*p<10000) { else if (ch<10000) {
digits = 4; digits = 4;
base = 1000; base = 1000;
} }
#ifndef Py_UNICODE_WIDE else if (ch<100000) {
else {
digits = 5; digits = 5;
base = 10000; base = 10000;
} }
#else else if (ch<1000000) {
else if (*p<100000) {
digits = 5;
base = 10000;
}
else if (*p<1000000) {
digits = 6; digits = 6;
base = 100000; base = 100000;
} }
@ -656,10 +646,9 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
digits = 7; digits = 7;
base = 1000000; base = 1000000;
} }
#endif
while (digits-->0) { while (digits-->0) {
*outp++ = '0' + c/base; *outp++ = '0' + ch/base;
c %= base; ch %= base;
base /= 10; base /= 10;
} }
*outp++ = ';'; *outp++ = ';';
@ -677,58 +666,41 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{ {
#ifndef Py_UNICODE_WIDE
#define IS_SURROGATE_PAIR(p, end) \
(*p >= 0xD800 && *p <= 0xDBFF && (p + 1) < end && \
*(p + 1) >= 0xDC00 && *(p + 1) <= 0xDFFF)
#else
#define IS_SURROGATE_PAIR(p, end) 0
#endif
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
PyObject *restuple; PyObject *restuple;
PyObject *object; PyObject *object;
Py_ssize_t i;
Py_ssize_t start; Py_ssize_t start;
Py_ssize_t end; Py_ssize_t end;
PyObject *res; PyObject *res;
Py_UNICODE *p; unsigned char *outp;
Py_UNICODE *startp;
Py_UNICODE *outp;
int ressize; int ressize;
Py_UCS4 c;
if (PyUnicodeEncodeError_GetStart(exc, &start)) if (PyUnicodeEncodeError_GetStart(exc, &start))
return NULL; return NULL;
if (PyUnicodeEncodeError_GetEnd(exc, &end)) if (PyUnicodeEncodeError_GetEnd(exc, &end))
return NULL; return NULL;
if (!(object = PyUnicodeEncodeError_GetObject(exc))) if (!(object = PyUnicodeEncodeError_GetObject(exc)))
return NULL; return NULL;
startp = PyUnicode_AS_UNICODE(object); for (i = start, ressize = 0; i < end; ++i) {
for (p = startp+start, ressize = 0; p < startp+end; ++p) { /* object is guaranteed to be "ready" */
#ifdef Py_UNICODE_WIDE c = PyUnicode_READ_CHAR(object, i);
if (*p >= 0x00010000) if (c >= 0x10000) {
ressize += 1+1+8; ressize += 1+1+8;
else }
#endif else if (c >= 0x100) {
if (*p >= 0x100) { ressize += 1+1+4;
if (IS_SURROGATE_PAIR(p, startp+end)) {
ressize += 1+1+8;
++p;
}
else
ressize += 1+1+4;
} }
else else
ressize += 1+1+2; ressize += 1+1+2;
} }
res = PyUnicode_FromUnicode(NULL, ressize); res = PyUnicode_New(ressize, 127);
if (res==NULL) if (res==NULL)
return NULL; return NULL;
for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); for (i = start, outp = PyUnicode_1BYTE_DATA(res);
p < startp+end; ++p) { i < end; ++i) {
Py_UCS4 c = (Py_UCS4) *p; c = PyUnicode_READ_CHAR(object, i);
*outp++ = '\\'; *outp++ = '\\';
if (IS_SURROGATE_PAIR(p, startp+end)) {
c = ((*p & 0x3FF) << 10) + (*(p + 1) & 0x3FF) + 0x10000;
++p;
}
if (c >= 0x00010000) { if (c >= 0x00010000) {
*outp++ = 'U'; *outp++ = 'U';
*outp++ = Py_hexdigits[(c>>28)&0xf]; *outp++ = Py_hexdigits[(c>>28)&0xf];
@ -758,7 +730,6 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
wrong_exception_type(exc); wrong_exception_type(exc);
return NULL; return NULL;
} }
#undef IS_SURROGATE_PAIR
} }
/* This handler is declared static until someone demonstrates /* This handler is declared static until someone demonstrates
@ -768,12 +739,11 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
{ {
PyObject *restuple; PyObject *restuple;
PyObject *object; PyObject *object;
Py_ssize_t i;
Py_ssize_t start; Py_ssize_t start;
Py_ssize_t end; Py_ssize_t end;
PyObject *res; PyObject *res;
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Py_UNICODE *p;
Py_UNICODE *startp;
char *outp; char *outp;
if (PyUnicodeEncodeError_GetStart(exc, &start)) if (PyUnicodeEncodeError_GetStart(exc, &start))
return NULL; return NULL;
@ -781,15 +751,15 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
return NULL; return NULL;
if (!(object = PyUnicodeEncodeError_GetObject(exc))) if (!(object = PyUnicodeEncodeError_GetObject(exc)))
return NULL; return NULL;
startp = PyUnicode_AS_UNICODE(object);
res = PyBytes_FromStringAndSize(NULL, 3*(end-start)); res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
if (!res) { if (!res) {
Py_DECREF(object); Py_DECREF(object);
return NULL; return NULL;
} }
outp = PyBytes_AsString(res); outp = PyBytes_AsString(res);
for (p = startp+start; p < startp+end; p++) { for (i = start; i < end; i++) {
Py_UNICODE ch = *p; /* object is guaranteed to be "ready" */
Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
if (ch < 0xd800 || ch > 0xdfff) { if (ch < 0xd800 || ch > 0xdfff) {
/* Not a surrogate, fail with original exception */ /* Not a surrogate, fail with original exception */
PyErr_SetObject(PyExceptionInstance_Class(exc), exc); PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
@ -847,12 +817,11 @@ PyCodec_SurrogateEscapeErrors(PyObject *exc)
{ {
PyObject *restuple; PyObject *restuple;
PyObject *object; PyObject *object;
Py_ssize_t i;
Py_ssize_t start; Py_ssize_t start;
Py_ssize_t end; Py_ssize_t end;
PyObject *res; PyObject *res;
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Py_UNICODE *p;
Py_UNICODE *startp;
char *outp; char *outp;
if (PyUnicodeEncodeError_GetStart(exc, &start)) if (PyUnicodeEncodeError_GetStart(exc, &start))
return NULL; return NULL;
@ -860,15 +829,15 @@ PyCodec_SurrogateEscapeErrors(PyObject *exc)
return NULL; return NULL;
if (!(object = PyUnicodeEncodeError_GetObject(exc))) if (!(object = PyUnicodeEncodeError_GetObject(exc)))
return NULL; return NULL;
startp = PyUnicode_AS_UNICODE(object);
res = PyBytes_FromStringAndSize(NULL, end-start); res = PyBytes_FromStringAndSize(NULL, end-start);
if (!res) { if (!res) {
Py_DECREF(object); Py_DECREF(object);
return NULL; return NULL;
} }
outp = PyBytes_AsString(res); outp = PyBytes_AsString(res);
for (p = startp+start; p < startp+end; p++) { for (i = start; i < end; i++) {
Py_UNICODE ch = *p; /* object is guaranteed to be "ready" */
Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
if (ch < 0xdc80 || ch > 0xdcff) { if (ch < 0xdc80 || ch > 0xdcff) {
/* Not a UTF-8b surrogate, fail with original exception */ /* Not a UTF-8b surrogate, fail with original exception */
PyErr_SetObject(PyExceptionInstance_Class(exc), exc); PyErr_SetObject(PyExceptionInstance_Class(exc), exc);