mirror of https://github.com/python/cpython
Modify _PyBytes_DecodeEscapeRecode() to use the _PyBytesWriter API
* Don't overallocate by 400% when recoding is needed: only overallocate on demand, using _PyBytesWriter.
* Use _PyLong_DigitValue to convert a hexadecimal digit to an int.
* Create the _PyBytes_DecodeEscapeRecode() subfunction.
This commit is contained in:
parent
1285e5c805
commit
2ec8063cc9
|
@ -65,7 +65,8 @@ PyAPI_FUNC(PyObject *) PyLong_GetInfo(void);
|
||||||
# error "void* different in size from int, long and long long"
|
# error "void* different in size from int, long and long long"
|
||||||
#endif /* SIZEOF_VOID_P */
|
#endif /* SIZEOF_VOID_P */
|
||||||
|
|
||||||
/* Used by Python/mystrtoul.c and _PyBytes_FromHex(). */
|
/* Used by Python/mystrtoul.c, _PyBytes_FromHex(),
|
||||||
|
_PyBytes_DecodeEscapeRecode(), etc. */
|
||||||
#ifndef Py_LIMITED_API
|
#ifndef Py_LIMITED_API
|
||||||
PyAPI_DATA(unsigned char) _PyLong_DigitValue[256];
|
PyAPI_DATA(unsigned char) _PyLong_DigitValue[256];
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -1068,6 +1068,42 @@ bytes_dealloc(PyObject *op)
|
||||||
the string is UTF-8 encoded and should be re-encoded in the
|
the string is UTF-8 encoded and should be re-encoded in the
|
||||||
specified encoding. */
|
specified encoding. */
|
||||||
|
|
||||||
|
static char *
|
||||||
|
_PyBytes_DecodeEscapeRecode(const char **s, const char *end,
|
||||||
|
const char *errors, const char *recode_encoding,
|
||||||
|
_PyBytesWriter *writer, char *p)
|
||||||
|
{
|
||||||
|
PyObject *u, *w;
|
||||||
|
const char* t;
|
||||||
|
|
||||||
|
t = *s;
|
||||||
|
/* Decode non-ASCII bytes as UTF-8. */
|
||||||
|
while (t < end && (*t & 0x80))
|
||||||
|
t++;
|
||||||
|
u = PyUnicode_DecodeUTF8(*s, t - *s, errors);
|
||||||
|
if (u == NULL)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
/* Recode them in target encoding. */
|
||||||
|
w = PyUnicode_AsEncodedString(u, recode_encoding, errors);
|
||||||
|
Py_DECREF(u);
|
||||||
|
if (w == NULL)
|
||||||
|
return NULL;
|
||||||
|
assert(PyBytes_Check(w));
|
||||||
|
|
||||||
|
/* Append bytes to output buffer. */
|
||||||
|
writer->min_size--; /* substract 1 preallocated byte */
|
||||||
|
p = _PyBytesWriter_WriteBytes(writer, p,
|
||||||
|
PyBytes_AS_STRING(w),
|
||||||
|
PyBytes_GET_SIZE(w));
|
||||||
|
Py_DECREF(w);
|
||||||
|
if (p == NULL)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
*s = t;
|
||||||
|
return p;
|
||||||
|
}
|
||||||
|
|
||||||
PyObject *PyBytes_DecodeEscape(const char *s,
|
PyObject *PyBytes_DecodeEscape(const char *s,
|
||||||
Py_ssize_t len,
|
Py_ssize_t len,
|
||||||
const char *errors,
|
const char *errors,
|
||||||
|
@ -1075,54 +1111,42 @@ PyObject *PyBytes_DecodeEscape(const char *s,
|
||||||
const char *recode_encoding)
|
const char *recode_encoding)
|
||||||
{
|
{
|
||||||
int c;
|
int c;
|
||||||
char *p, *buf;
|
char *p;
|
||||||
const char *end;
|
const char *end;
|
||||||
PyObject *v;
|
_PyBytesWriter writer;
|
||||||
Py_ssize_t newlen = recode_encoding ? 4*len:len;
|
|
||||||
v = PyBytes_FromStringAndSize((char *)NULL, newlen);
|
_PyBytesWriter_Init(&writer);
|
||||||
if (v == NULL)
|
|
||||||
|
p = _PyBytesWriter_Alloc(&writer, len);
|
||||||
|
if (p == NULL)
|
||||||
return NULL;
|
return NULL;
|
||||||
p = buf = PyBytes_AsString(v);
|
writer.overallocate = 1;
|
||||||
|
|
||||||
end = s + len;
|
end = s + len;
|
||||||
while (s < end) {
|
while (s < end) {
|
||||||
if (*s != '\\') {
|
if (*s != '\\') {
|
||||||
non_esc:
|
non_esc:
|
||||||
if (recode_encoding && (*s & 0x80)) {
|
if (!(recode_encoding && (*s & 0x80))) {
|
||||||
PyObject *u, *w;
|
|
||||||
char *r;
|
|
||||||
const char* t;
|
|
||||||
Py_ssize_t rn;
|
|
||||||
t = s;
|
|
||||||
/* Decode non-ASCII bytes as UTF-8. */
|
|
||||||
while (t < end && (*t & 0x80)) t++;
|
|
||||||
u = PyUnicode_DecodeUTF8(s, t - s, errors);
|
|
||||||
if(!u) goto failed;
|
|
||||||
|
|
||||||
/* Recode them in target encoding. */
|
|
||||||
w = PyUnicode_AsEncodedString(
|
|
||||||
u, recode_encoding, errors);
|
|
||||||
Py_DECREF(u);
|
|
||||||
if (!w) goto failed;
|
|
||||||
|
|
||||||
/* Append bytes to output buffer. */
|
|
||||||
assert(PyBytes_Check(w));
|
|
||||||
r = PyBytes_AS_STRING(w);
|
|
||||||
rn = PyBytes_GET_SIZE(w);
|
|
||||||
Py_MEMCPY(p, r, rn);
|
|
||||||
p += rn;
|
|
||||||
Py_DECREF(w);
|
|
||||||
s = t;
|
|
||||||
} else {
|
|
||||||
*p++ = *s++;
|
*p++ = *s++;
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
/* non-ASCII character and need to recode */
|
||||||
|
p = _PyBytes_DecodeEscapeRecode(&s, end,
|
||||||
|
errors, recode_encoding,
|
||||||
|
&writer, p);
|
||||||
|
if (p == NULL)
|
||||||
|
goto failed;
|
||||||
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
s++;
|
s++;
|
||||||
if (s==end) {
|
if (s == end) {
|
||||||
PyErr_SetString(PyExc_ValueError,
|
PyErr_SetString(PyExc_ValueError,
|
||||||
"Trailing \\ in string");
|
"Trailing \\ in string");
|
||||||
goto failed;
|
goto failed;
|
||||||
}
|
}
|
||||||
|
|
||||||
switch (*s++) {
|
switch (*s++) {
|
||||||
/* XXX This assumes ASCII! */
|
/* XXX This assumes ASCII! */
|
||||||
case '\n': break;
|
case '\n': break;
|
||||||
|
@ -1147,28 +1171,18 @@ PyObject *PyBytes_DecodeEscape(const char *s,
|
||||||
*p++ = c;
|
*p++ = c;
|
||||||
break;
|
break;
|
||||||
case 'x':
|
case 'x':
|
||||||
if (s+1 < end && Py_ISXDIGIT(s[0]) && Py_ISXDIGIT(s[1])) {
|
if (s+1 < end) {
|
||||||
unsigned int x = 0;
|
int digit1, digit2;
|
||||||
c = Py_CHARMASK(*s);
|
digit1 = _PyLong_DigitValue[Py_CHARMASK(s[0])];
|
||||||
s++;
|
digit2 = _PyLong_DigitValue[Py_CHARMASK(s[1])];
|
||||||
if (Py_ISDIGIT(c))
|
if (digit1 < 16 && digit2 < 16) {
|
||||||
x = c - '0';
|
*p++ = (unsigned char)((digit1 << 4) + digit2);
|
||||||
else if (Py_ISLOWER(c))
|
s += 2;
|
||||||
x = 10 + c - 'a';
|
break;
|
||||||
else
|
}
|
||||||
x = 10 + c - 'A';
|
|
||||||
x = x << 4;
|
|
||||||
c = Py_CHARMASK(*s);
|
|
||||||
s++;
|
|
||||||
if (Py_ISDIGIT(c))
|
|
||||||
x += c - '0';
|
|
||||||
else if (Py_ISLOWER(c))
|
|
||||||
x += 10 + c - 'a';
|
|
||||||
else
|
|
||||||
x += 10 + c - 'A';
|
|
||||||
*p++ = x;
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
/* invalid hexadecimal digits */
|
||||||
|
|
||||||
if (!errors || strcmp(errors, "strict") == 0) {
|
if (!errors || strcmp(errors, "strict") == 0) {
|
||||||
PyErr_Format(PyExc_ValueError,
|
PyErr_Format(PyExc_ValueError,
|
||||||
"invalid \\x escape at position %d",
|
"invalid \\x escape at position %d",
|
||||||
|
@ -1190,6 +1204,7 @@ PyObject *PyBytes_DecodeEscape(const char *s,
|
||||||
if (s < end && Py_ISXDIGIT(s[0]))
|
if (s < end && Py_ISXDIGIT(s[0]))
|
||||||
s++; /* and a hexdigit */
|
s++; /* and a hexdigit */
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
*p++ = '\\';
|
*p++ = '\\';
|
||||||
s--;
|
s--;
|
||||||
|
@ -1197,11 +1212,11 @@ PyObject *PyBytes_DecodeEscape(const char *s,
|
||||||
UTF-8 bytes may follow. */
|
UTF-8 bytes may follow. */
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (p-buf < newlen)
|
|
||||||
_PyBytes_Resize(&v, p - buf);
|
return _PyBytesWriter_Finish(&writer, p);
|
||||||
return v;
|
|
||||||
failed:
|
failed:
|
||||||
Py_DECREF(v);
|
_PyBytesWriter_Dealloc(&writer);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue