Modify _PyBytes_DecodeEscapeRecode() to use the _PyBytesWriter API

* Don't overallocate by 400% when recode is needed: only overallocate on demand
  using _PyBytesWriter.
* Use _PyLong_DigitValue to convert hexadecimal digit to int
* Create _PyBytes_DecodeEscapeRecode() subfunction
This commit is contained in:
Victor Stinner 2015-10-14 13:32:13 +02:00
parent 1285e5c805
commit 2ec8063cc9
2 changed files with 75 additions and 59 deletions

View File

@ -65,7 +65,8 @@ PyAPI_FUNC(PyObject *) PyLong_GetInfo(void);
# error "void* different in size from int, long and long long" # error "void* different in size from int, long and long long"
#endif /* SIZEOF_VOID_P */ #endif /* SIZEOF_VOID_P */
/* Used by Python/mystrtoul.c and _PyBytes_FromHex(). */ /* Used by Python/mystrtoul.c, _PyBytes_FromHex(),
_PyBytes_DecodeEscapeRecode(), etc. */
#ifndef Py_LIMITED_API #ifndef Py_LIMITED_API
PyAPI_DATA(unsigned char) _PyLong_DigitValue[256]; PyAPI_DATA(unsigned char) _PyLong_DigitValue[256];
#endif #endif

View File

@ -1068,6 +1068,42 @@ bytes_dealloc(PyObject *op)
the string is UTF-8 encoded and should be re-encoded in the the string is UTF-8 encoded and should be re-encoded in the
specified encoding. */ specified encoding. */
/* Helper for PyBytes_DecodeEscape(): recode one run of non-ASCII bytes.

   Starting at *s, take the consecutive bytes (bounded by end) that have
   their high bit set, decode them as UTF-8, re-encode the result into
   recode_encoding, and append the encoded bytes to the writer at
   position p.

   On success, *s is advanced past the consumed run and the new write
   position is returned.  On failure, NULL is returned and *s is left
   unchanged; the writer itself is not released here. */
static char *
_PyBytes_DecodeEscapeRecode(const char **s, const char *end,
                            const char *errors, const char *recode_encoding,
                            _PyBytesWriter *writer, char *p)
{
    const char *run_start = *s;
    const char *run_end = run_start;
    PyObject *decoded;
    PyObject *encoded;

    /* Locate the end of the run of bytes with the high bit set. */
    for (; run_end < end; run_end++) {
        if (!(*run_end & 0x80)) {
            break;
        }
    }

    /* Interpret the run as UTF-8 ... */
    decoded = PyUnicode_DecodeUTF8(run_start, run_end - run_start, errors);
    if (decoded == NULL) {
        return NULL;
    }

    /* ... and convert it to the requested target encoding. */
    encoded = PyUnicode_AsEncodedString(decoded, recode_encoding, errors);
    Py_DECREF(decoded);
    if (encoded == NULL) {
        return NULL;
    }
    assert(PyBytes_Check(encoded));

    /* Subtract 1 preallocated byte before appending the recoded bytes. */
    writer->min_size--;
    p = _PyBytesWriter_WriteBytes(writer, p,
                                  PyBytes_AS_STRING(encoded),
                                  PyBytes_GET_SIZE(encoded));
    Py_DECREF(encoded);

    /* Only consume the input once the output was written successfully. */
    if (p != NULL) {
        *s = run_end;
    }
    return p;
}
PyObject *PyBytes_DecodeEscape(const char *s, PyObject *PyBytes_DecodeEscape(const char *s,
Py_ssize_t len, Py_ssize_t len,
const char *errors, const char *errors,
@ -1075,54 +1111,42 @@ PyObject *PyBytes_DecodeEscape(const char *s,
const char *recode_encoding) const char *recode_encoding)
{ {
int c; int c;
char *p, *buf; char *p;
const char *end; const char *end;
PyObject *v; _PyBytesWriter writer;
Py_ssize_t newlen = recode_encoding ? 4*len:len;
v = PyBytes_FromStringAndSize((char *)NULL, newlen); _PyBytesWriter_Init(&writer);
if (v == NULL)
p = _PyBytesWriter_Alloc(&writer, len);
if (p == NULL)
return NULL; return NULL;
p = buf = PyBytes_AsString(v); writer.overallocate = 1;
end = s + len; end = s + len;
while (s < end) { while (s < end) {
if (*s != '\\') { if (*s != '\\') {
non_esc: non_esc:
if (recode_encoding && (*s & 0x80)) { if (!(recode_encoding && (*s & 0x80))) {
PyObject *u, *w;
char *r;
const char* t;
Py_ssize_t rn;
t = s;
/* Decode non-ASCII bytes as UTF-8. */
while (t < end && (*t & 0x80)) t++;
u = PyUnicode_DecodeUTF8(s, t - s, errors);
if(!u) goto failed;
/* Recode them in target encoding. */
w = PyUnicode_AsEncodedString(
u, recode_encoding, errors);
Py_DECREF(u);
if (!w) goto failed;
/* Append bytes to output buffer. */
assert(PyBytes_Check(w));
r = PyBytes_AS_STRING(w);
rn = PyBytes_GET_SIZE(w);
Py_MEMCPY(p, r, rn);
p += rn;
Py_DECREF(w);
s = t;
} else {
*p++ = *s++; *p++ = *s++;
} }
else {
/* non-ASCII character and need to recode */
p = _PyBytes_DecodeEscapeRecode(&s, end,
errors, recode_encoding,
&writer, p);
if (p == NULL)
goto failed;
}
continue; continue;
} }
s++; s++;
if (s==end) { if (s == end) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"Trailing \\ in string"); "Trailing \\ in string");
goto failed; goto failed;
} }
switch (*s++) { switch (*s++) {
/* XXX This assumes ASCII! */ /* XXX This assumes ASCII! */
case '\n': break; case '\n': break;
@ -1147,28 +1171,18 @@ PyObject *PyBytes_DecodeEscape(const char *s,
*p++ = c; *p++ = c;
break; break;
case 'x': case 'x':
if (s+1 < end && Py_ISXDIGIT(s[0]) && Py_ISXDIGIT(s[1])) { if (s+1 < end) {
unsigned int x = 0; int digit1, digit2;
c = Py_CHARMASK(*s); digit1 = _PyLong_DigitValue[Py_CHARMASK(s[0])];
s++; digit2 = _PyLong_DigitValue[Py_CHARMASK(s[1])];
if (Py_ISDIGIT(c)) if (digit1 < 16 && digit2 < 16) {
x = c - '0'; *p++ = (unsigned char)((digit1 << 4) + digit2);
else if (Py_ISLOWER(c)) s += 2;
x = 10 + c - 'a'; break;
else }
x = 10 + c - 'A';
x = x << 4;
c = Py_CHARMASK(*s);
s++;
if (Py_ISDIGIT(c))
x += c - '0';
else if (Py_ISLOWER(c))
x += 10 + c - 'a';
else
x += 10 + c - 'A';
*p++ = x;
break;
} }
/* invalid hexadecimal digits */
if (!errors || strcmp(errors, "strict") == 0) { if (!errors || strcmp(errors, "strict") == 0) {
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"invalid \\x escape at position %d", "invalid \\x escape at position %d",
@ -1190,6 +1204,7 @@ PyObject *PyBytes_DecodeEscape(const char *s,
if (s < end && Py_ISXDIGIT(s[0])) if (s < end && Py_ISXDIGIT(s[0]))
s++; /* and a hexdigit */ s++; /* and a hexdigit */
break; break;
default: default:
*p++ = '\\'; *p++ = '\\';
s--; s--;
@ -1197,11 +1212,11 @@ PyObject *PyBytes_DecodeEscape(const char *s,
UTF-8 bytes may follow. */ UTF-8 bytes may follow. */
} }
} }
if (p-buf < newlen)
_PyBytes_Resize(&v, p - buf); return _PyBytesWriter_Finish(&writer, p);
return v;
failed: failed:
Py_DECREF(v); _PyBytesWriter_Dealloc(&writer);
return NULL; return NULL;
} }