bpo-34454: Clean up datetime.fromisoformat surrogate handling (GH-8959)

* Use _PyUnicode_Copy in sanitize_isoformat_str

* Use repr in fromisoformat error message

This reverses commit 67b74a98b2 per Serhiy Storchaka's suggestion:

     I suggested to use %R in the error message because including the raw
     string can be confusing in the case of empty string, or string
     containing trailing whitespaces, invisible or unprintable characters.

We agree that it is better to change both the C and pure Python versions
to use repr.

* Retain non-sanitized dtstr for error printing

This does not create an extra string; it just holds on to a reference to
the original input string so that it can be used in the error message.

* PEP 7 fixes to from_isoformat

* Separate handling of Unicode and other errors

In the initial implementation, encoding errors and all other errors
alike would raise an error indicating an invalid format, which would not
be true for errors like MemoryError.

* Drop needs_decref from _sanitize_isoformat_str

Instead, _sanitize_isoformat_str always returns a new reference, even
when the result is the original string.
This commit is contained in:
Paul Ganssle 2018-10-22 12:32:52 -04:00 committed by Victor Stinner
parent 5a95ba29da
commit 3df85404d4
3 changed files with 106 additions and 76 deletions

View File

@ -857,7 +857,7 @@ class date:
assert len(date_string) == 10 assert len(date_string) == 10
return cls(*_parse_isoformat_date(date_string)) return cls(*_parse_isoformat_date(date_string))
except Exception: except Exception:
raise ValueError('Invalid isoformat string: %s' % date_string) raise ValueError(f'Invalid isoformat string: {date_string!r}')
# Conversions to string # Conversions to string
@ -1369,7 +1369,7 @@ class time:
try: try:
return cls(*_parse_isoformat_time(time_string)) return cls(*_parse_isoformat_time(time_string))
except Exception: except Exception:
raise ValueError('Invalid isoformat string: %s' % time_string) raise ValueError(f'Invalid isoformat string: {time_string!r}')
def strftime(self, fmt): def strftime(self, fmt):
@ -1646,13 +1646,13 @@ class datetime(date):
try: try:
date_components = _parse_isoformat_date(dstr) date_components = _parse_isoformat_date(dstr)
except ValueError: except ValueError:
raise ValueError('Invalid isoformat string: %s' % date_string) raise ValueError(f'Invalid isoformat string: {date_string!r}')
if tstr: if tstr:
try: try:
time_components = _parse_isoformat_time(tstr) time_components = _parse_isoformat_time(tstr)
except ValueError: except ValueError:
raise ValueError('Invalid isoformat string: %s' % date_string) raise ValueError(f'Invalid isoformat string: {date_string!r}')
else: else:
time_components = [0, 0, 0, 0, None] time_components = [0, 0, 0, 0, None]

View File

@ -13,6 +13,7 @@ import sys
import os import os
import pickle import pickle
import random import random
import re
import struct import struct
import unittest import unittest
@ -2676,6 +2677,14 @@ class TestDateTime(TestDate):
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
self.theclass.fromisoformat(bad_str) self.theclass.fromisoformat(bad_str)
def test_fromisoformat_fails_surrogate(self):
# Test that when fromisoformat() fails with a surrogate character as
# the separator, the error message contains the original string
dtstr = "2018-01-03\ud80001:0113"
with self.assertRaisesRegex(ValueError, re.escape(repr(dtstr))):
self.theclass.fromisoformat(dtstr)
def test_fromisoformat_utc(self): def test_fromisoformat_utc(self):
dt_str = '2014-04-19T13:21:13+00:00' dt_str = '2014-04-19T13:21:13+00:00'
dt = self.theclass.fromisoformat(dt_str) dt = self.theclass.fromisoformat(dt_str)

View File

@ -668,8 +668,8 @@ set_date_fields(PyDateTime_Date *self, int y, int m, int d)
* String parsing utilities and helper functions * String parsing utilities and helper functions
*/ */
static const char* static const char *
parse_digits(const char* ptr, int* var, size_t num_digits) parse_digits(const char *ptr, int *var, size_t num_digits)
{ {
for (size_t i = 0; i < num_digits; ++i) { for (size_t i = 0; i < num_digits; ++i) {
unsigned int tmp = (unsigned int)(*(ptr++) - '0'); unsigned int tmp = (unsigned int)(*(ptr++) - '0');
@ -683,8 +683,9 @@ parse_digits(const char* ptr, int* var, size_t num_digits)
return ptr; return ptr;
} }
static int parse_isoformat_date(const char *dtstr, static int
int* year, int *month, int* day) { parse_isoformat_date(const char *dtstr, int *year, int *month, int *day)
{
/* Parse the date components of the result of date.isoformat() /* Parse the date components of the result of date.isoformat()
* *
* Return codes: * Return codes:
@ -720,8 +721,9 @@ static int parse_isoformat_date(const char *dtstr,
} }
static int static int
parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end, parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end, int *hour,
int* hour, int* minute, int *second, int *microsecond) { int *minute, int *second, int *microsecond)
{
const char *p = tstr; const char *p = tstr;
const char *p_end = tstr_end; const char *p_end = tstr_end;
int *vals[3] = {hour, minute, second}; int *vals[3] = {hour, minute, second};
@ -736,11 +738,14 @@ parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end,
char c = *(p++); char c = *(p++);
if (p >= p_end) { if (p >= p_end) {
return c != '\0'; return c != '\0';
} else if (c == ':') { }
else if (c == ':') {
continue; continue;
} else if (c == '.') { }
else if (c == '.') {
break; break;
} else { }
else {
return -4; // Malformed time separator return -4; // Malformed time separator
} }
} }
@ -765,9 +770,10 @@ parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end,
} }
static int static int
parse_isoformat_time(const char *dtstr, size_t dtlen, parse_isoformat_time(const char *dtstr, size_t dtlen, int *hour, int *minute,
int* hour, int *minute, int *second, int *microsecond, int *second, int *microsecond, int *tzoffset,
int* tzoffset, int *tzmicrosecond) { int *tzmicrosecond)
{
// Parse the time portion of a datetime.isoformat() string // Parse the time portion of a datetime.isoformat() string
// //
// Return codes: // Return codes:
@ -785,19 +791,21 @@ parse_isoformat_time(const char *dtstr, size_t dtlen,
if (*tzinfo_pos == '+' || *tzinfo_pos == '-') { if (*tzinfo_pos == '+' || *tzinfo_pos == '-') {
break; break;
} }
} while(++tzinfo_pos < p_end); } while (++tzinfo_pos < p_end);
int rv = parse_hh_mm_ss_ff(dtstr, tzinfo_pos, int rv = parse_hh_mm_ss_ff(dtstr, tzinfo_pos, hour, minute, second,
hour, minute, second, microsecond); microsecond);
if (rv < 0) { if (rv < 0) {
return rv; return rv;
} else if (tzinfo_pos == p_end) { }
else if (tzinfo_pos == p_end) {
// We know that there's no time zone, so if there's stuff at the // We know that there's no time zone, so if there's stuff at the
// end of the string it's an error. // end of the string it's an error.
if (rv == 1) { if (rv == 1) {
return -5; return -5;
} else { }
else {
return 0; return 0;
} }
} }
@ -812,19 +820,18 @@ parse_isoformat_time(const char *dtstr, size_t dtlen,
return -5; return -5;
} }
int tzsign = (*tzinfo_pos == '-')?-1:1; int tzsign = (*tzinfo_pos == '-') ? -1 : 1;
tzinfo_pos++; tzinfo_pos++;
int tzhour = 0, tzminute = 0, tzsecond = 0; int tzhour = 0, tzminute = 0, tzsecond = 0;
rv = parse_hh_mm_ss_ff(tzinfo_pos, p_end, rv = parse_hh_mm_ss_ff(tzinfo_pos, p_end, &tzhour, &tzminute, &tzsecond,
&tzhour, &tzminute, &tzsecond, tzmicrosecond); tzmicrosecond);
*tzoffset = tzsign * ((tzhour * 3600) + (tzminute * 60) + tzsecond); *tzoffset = tzsign * ((tzhour * 3600) + (tzminute * 60) + tzsecond);
*tzmicrosecond *= tzsign; *tzmicrosecond *= tzsign;
return rv?-5:1; return rv ? -5 : 1;
} }
/* --------------------------------------------------------------------------- /* ---------------------------------------------------------------------------
* Create various objects, mostly without range checking. * Create various objects, mostly without range checking.
*/ */
@ -839,30 +846,33 @@ new_date_ex(int year, int month, int day, PyTypeObject *type)
return NULL; return NULL;
} }
self = (PyDateTime_Date *) (type->tp_alloc(type, 0)); self = (PyDateTime_Date *)(type->tp_alloc(type, 0));
if (self != NULL) if (self != NULL)
set_date_fields(self, year, month, day); set_date_fields(self, year, month, day);
return (PyObject *) self; return (PyObject *)self;
} }
#define new_date(year, month, day) \ #define new_date(year, month, day) \
new_date_ex(year, month, day, &PyDateTime_DateType) new_date_ex(year, month, day, &PyDateTime_DateType)
// Forward declaration // Forward declaration
static PyObject * new_datetime_ex(int, int, int, int, int, int, int, static PyObject *
PyObject*, PyTypeObject*); new_datetime_ex(int, int, int, int, int, int, int, PyObject *, PyTypeObject *);
/* Create date instance with no range checking, or call subclass constructor */ /* Create date instance with no range checking, or call subclass constructor */
static PyObject * static PyObject *
new_date_subclass_ex(int year, int month, int day, PyObject *cls) { new_date_subclass_ex(int year, int month, int day, PyObject *cls)
{
PyObject *result; PyObject *result;
// We have "fast path" constructors for two subclasses: date and datetime // We have "fast path" constructors for two subclasses: date and datetime
if ((PyTypeObject *)cls == &PyDateTime_DateType) { if ((PyTypeObject *)cls == &PyDateTime_DateType) {
result = new_date_ex(year, month, day, (PyTypeObject *)cls); result = new_date_ex(year, month, day, (PyTypeObject *)cls);
} else if ((PyTypeObject *)cls == &PyDateTime_DateTimeType) { }
else if ((PyTypeObject *)cls == &PyDateTime_DateTimeType) {
result = new_datetime_ex(year, month, day, 0, 0, 0, 0, Py_None, result = new_datetime_ex(year, month, day, 0, 0, 0, 0, Py_None,
(PyTypeObject *)cls); (PyTypeObject *)cls);
} else { }
else {
result = PyObject_CallFunction(cls, "iii", year, month, day); result = PyObject_CallFunction(cls, "iii", year, month, day);
} }
@ -1281,7 +1291,8 @@ append_keyword_fold(PyObject *repr, int fold)
} }
static inline PyObject * static inline PyObject *
tzinfo_from_isoformat_results(int rv, int tzoffset, int tz_useconds) { tzinfo_from_isoformat_results(int rv, int tzoffset, int tz_useconds)
{
PyObject *tzinfo; PyObject *tzinfo;
if (rv == 1) { if (rv == 1) {
// Create a timezone from offset in seconds (0 returns UTC) // Create a timezone from offset in seconds (0 returns UTC)
@ -1296,7 +1307,8 @@ tzinfo_from_isoformat_results(int rv, int tzoffset, int tz_useconds) {
} }
tzinfo = new_timezone(delta, NULL); tzinfo = new_timezone(delta, NULL);
Py_DECREF(delta); Py_DECREF(delta);
} else { }
else {
tzinfo = Py_None; tzinfo = Py_None;
Py_INCREF(Py_None); Py_INCREF(Py_None);
} }
@ -2886,17 +2898,19 @@ date_fromordinal(PyObject *cls, PyObject *args)
/* Return the new date from a string as generated by date.isoformat() */ /* Return the new date from a string as generated by date.isoformat() */
static PyObject * static PyObject *
date_fromisoformat(PyObject *cls, PyObject *dtstr) { date_fromisoformat(PyObject *cls, PyObject *dtstr)
{
assert(dtstr != NULL); assert(dtstr != NULL);
if (!PyUnicode_Check(dtstr)) { if (!PyUnicode_Check(dtstr)) {
PyErr_SetString(PyExc_TypeError, "fromisoformat: argument must be str"); PyErr_SetString(PyExc_TypeError,
"fromisoformat: argument must be str");
return NULL; return NULL;
} }
Py_ssize_t len; Py_ssize_t len;
const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len); const char *dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len);
if (dt_ptr == NULL) { if (dt_ptr == NULL) {
goto invalid_string_error; goto invalid_string_error;
} }
@ -2906,7 +2920,8 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) {
int rv; int rv;
if (len == 10) { if (len == 10) {
rv = parse_isoformat_date(dt_ptr, &year, &month, &day); rv = parse_isoformat_date(dt_ptr, &year, &month, &day);
} else { }
else {
rv = -1; rv = -1;
} }
@ -2917,12 +2932,10 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) {
return new_date_subclass_ex(year, month, day, cls); return new_date_subclass_ex(year, month, day, cls);
invalid_string_error: invalid_string_error:
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", dtstr);
dtstr);
return NULL; return NULL;
} }
/* /*
* Date arithmetic. * Date arithmetic.
*/ */
@ -4863,53 +4876,66 @@ datetime_combine(PyObject *cls, PyObject *args, PyObject *kw)
} }
static PyObject * static PyObject *
_sanitize_isoformat_str(PyObject *dtstr, int *needs_decref) { _sanitize_isoformat_str(PyObject *dtstr)
{
// `fromisoformat` allows surrogate characters in exactly one position, // `fromisoformat` allows surrogate characters in exactly one position,
// the separator; to allow datetime_fromisoformat to make the simplifying // the separator; to allow datetime_fromisoformat to make the simplifying
// assumption that all valid strings can be encoded in UTF-8, this function // assumption that all valid strings can be encoded in UTF-8, this function
// replaces any surrogate character separators with `T`. // replaces any surrogate character separators with `T`.
//
// The result of this, if not NULL, returns a new reference
Py_ssize_t len = PyUnicode_GetLength(dtstr); Py_ssize_t len = PyUnicode_GetLength(dtstr);
*needs_decref = 0; if (len < 0) {
if (len <= 10 || !Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) { return NULL;
}
if (len <= 10 ||
!Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) {
Py_INCREF(dtstr);
return dtstr; return dtstr;
} }
PyObject *str_out = PyUnicode_New(len, PyUnicode_MAX_CHAR_VALUE(dtstr)); PyObject *str_out = _PyUnicode_Copy(dtstr);
if (str_out == NULL) { if (str_out == NULL) {
return NULL; return NULL;
} }
if (PyUnicode_CopyCharacters(str_out, 0, dtstr, 0, len) == -1 || if (PyUnicode_WriteChar(str_out, 10, (Py_UCS4)'T')) {
PyUnicode_WriteChar(str_out, 10, (Py_UCS4)'T')) {
Py_DECREF(str_out); Py_DECREF(str_out);
return NULL; return NULL;
} }
*needs_decref = 1;
return str_out; return str_out;
} }
static PyObject * static PyObject *
datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { datetime_fromisoformat(PyObject *cls, PyObject *dtstr)
{
assert(dtstr != NULL); assert(dtstr != NULL);
if (!PyUnicode_Check(dtstr)) { if (!PyUnicode_Check(dtstr)) {
PyErr_SetString(PyExc_TypeError, "fromisoformat: argument must be str"); PyErr_SetString(PyExc_TypeError,
"fromisoformat: argument must be str");
return NULL; return NULL;
} }
int needs_decref = 0; PyObject *dtstr_clean = _sanitize_isoformat_str(dtstr);
dtstr = _sanitize_isoformat_str(dtstr, &needs_decref); if (dtstr_clean == NULL) {
if (dtstr == NULL) {
goto error; goto error;
} }
Py_ssize_t len; Py_ssize_t len;
const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len); const char *dt_ptr = PyUnicode_AsUTF8AndSize(dtstr_clean, &len);
if (dt_ptr == NULL) { if (dt_ptr == NULL) {
if (PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) {
// Encoding errors are invalid string errors at this point
goto invalid_string_error; goto invalid_string_error;
} }
else {
goto error;
}
}
const char *p = dt_ptr; const char *p = dt_ptr;
@ -4924,8 +4950,9 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
// In UTF-8, the length of multi-byte characters is encoded in the MSB // In UTF-8, the length of multi-byte characters is encoded in the MSB
if ((p[10] & 0x80) == 0) { if ((p[10] & 0x80) == 0) {
p += 11; p += 11;
} else { }
switch(p[10] & 0xf0) { else {
switch (p[10] & 0xf0) {
case 0xe0: case 0xe0:
p += 13; p += 13;
break; break;
@ -4939,15 +4966,14 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
} }
len -= (p - dt_ptr); len -= (p - dt_ptr);
rv = parse_isoformat_time(p, len, rv = parse_isoformat_time(p, len, &hour, &minute, &second,
&hour, &minute, &second, &microsecond, &microsecond, &tzoffset, &tzusec);
&tzoffset, &tzusec);
} }
if (rv < 0) { if (rv < 0) {
goto invalid_string_error; goto invalid_string_error;
} }
PyObject* tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, tzusec); PyObject *tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, tzusec);
if (tzinfo == NULL) { if (tzinfo == NULL) {
goto error; goto error;
} }
@ -4956,23 +4982,18 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
second, microsecond, tzinfo, cls); second, microsecond, tzinfo, cls);
Py_DECREF(tzinfo); Py_DECREF(tzinfo);
if (needs_decref) { Py_DECREF(dtstr_clean);
Py_DECREF(dtstr);
}
return dt; return dt;
invalid_string_error: invalid_string_error:
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", dtstr); PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", dtstr);
error: error:
if (needs_decref) { Py_XDECREF(dtstr_clean);
Py_DECREF(dtstr);
}
return NULL; return NULL;
} }
/* /*
* Destructor. * Destructor.
*/ */