bpo-34454: Clean up datetime.fromisoformat surrogate handling (GH-8959)

* Use _PyUnicode_Copy in sanitize_isoformat_str

* Use repr in fromisoformat error message

This reverses commit 67b74a98b2 per Serhiy Storchaka's suggestion:

     I suggested to use %R in the error message because including the raw
     string can be confusing in the case of empty string, or string
     containing trailing whitespaces, invisible or unprintable characters.

We agree that it is better to change both the C and pure Python versions
to use repr.

* Retain non-sanitized dtstr for error printing

This does not create an extra string, it just holds on to a reference to
the original input string for purposes of creating the error message.

* PEP 7 fixes to from_isoformat

* Separate handling of Unicode and other errors

In the initial implementation, errors other than encoding errors would
both raise an error indicating an invalid format, which would not be
true for errors like MemoryError.

* Drop needs_decref from _sanitize_isoformat_str

Instead _sanitize_isoformat_str returns a new reference, even to the
original string.
This commit is contained in:
Paul Ganssle 2018-10-22 12:32:52 -04:00 committed by Victor Stinner
parent 5a95ba29da
commit 3df85404d4
3 changed files with 106 additions and 76 deletions

View File

@ -857,7 +857,7 @@ class date:
assert len(date_string) == 10
return cls(*_parse_isoformat_date(date_string))
except Exception:
raise ValueError('Invalid isoformat string: %s' % date_string)
raise ValueError(f'Invalid isoformat string: {date_string!r}')
# Conversions to string
@ -1369,7 +1369,7 @@ class time:
try:
return cls(*_parse_isoformat_time(time_string))
except Exception:
raise ValueError('Invalid isoformat string: %s' % time_string)
raise ValueError(f'Invalid isoformat string: {time_string!r}')
def strftime(self, fmt):
@ -1646,13 +1646,13 @@ class datetime(date):
try:
date_components = _parse_isoformat_date(dstr)
except ValueError:
raise ValueError('Invalid isoformat string: %s' % date_string)
raise ValueError(f'Invalid isoformat string: {date_string!r}')
if tstr:
try:
time_components = _parse_isoformat_time(tstr)
except ValueError:
raise ValueError('Invalid isoformat string: %s' % date_string)
raise ValueError(f'Invalid isoformat string: {date_string!r}')
else:
time_components = [0, 0, 0, 0, None]

View File

@ -13,6 +13,7 @@ import sys
import os
import pickle
import random
import re
import struct
import unittest
@ -2676,6 +2677,14 @@ class TestDateTime(TestDate):
with self.assertRaises(ValueError):
self.theclass.fromisoformat(bad_str)
def test_fromisoformat_fails_surrogate(self):
# Test that when fromisoformat() fails with a surrogate character as
# the separator, the error message contains the original string
dtstr = "2018-01-03\ud80001:0113"
with self.assertRaisesRegex(ValueError, re.escape(repr(dtstr))):
self.theclass.fromisoformat(dtstr)
def test_fromisoformat_utc(self):
dt_str = '2014-04-19T13:21:13+00:00'
dt = self.theclass.fromisoformat(dt_str)

View File

@ -668,8 +668,8 @@ set_date_fields(PyDateTime_Date *self, int y, int m, int d)
* String parsing utilities and helper functions
*/
static const char*
parse_digits(const char* ptr, int* var, size_t num_digits)
static const char *
parse_digits(const char *ptr, int *var, size_t num_digits)
{
for (size_t i = 0; i < num_digits; ++i) {
unsigned int tmp = (unsigned int)(*(ptr++) - '0');
@ -683,15 +683,16 @@ parse_digits(const char* ptr, int* var, size_t num_digits)
return ptr;
}
static int parse_isoformat_date(const char *dtstr,
int* year, int *month, int* day) {
static int
parse_isoformat_date(const char *dtstr, int *year, int *month, int *day)
{
/* Parse the date components of the result of date.isoformat()
*
* Return codes:
* 0: Success
* -1: Failed to parse date component
* -2: Failed to parse dateseparator
*/
*
* Return codes:
* 0: Success
* -1: Failed to parse date component
* -2: Failed to parse dateseparator
*/
const char *p = dtstr;
p = parse_digits(p, year, 4);
if (NULL == p) {
@ -720,8 +721,9 @@ static int parse_isoformat_date(const char *dtstr,
}
static int
parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end,
int* hour, int* minute, int *second, int *microsecond) {
parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end, int *hour,
int *minute, int *second, int *microsecond)
{
const char *p = tstr;
const char *p_end = tstr_end;
int *vals[3] = {hour, minute, second};
@ -736,12 +738,15 @@ parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end,
char c = *(p++);
if (p >= p_end) {
return c != '\0';
} else if (c == ':') {
}
else if (c == ':') {
continue;
} else if (c == '.') {
}
else if (c == '.') {
break;
} else {
return -4; // Malformed time separator
}
else {
return -4; // Malformed time separator
}
}
@ -765,9 +770,10 @@ parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end,
}
static int
parse_isoformat_time(const char *dtstr, size_t dtlen,
int* hour, int *minute, int *second, int *microsecond,
int* tzoffset, int *tzmicrosecond) {
parse_isoformat_time(const char *dtstr, size_t dtlen, int *hour, int *minute,
int *second, int *microsecond, int *tzoffset,
int *tzmicrosecond)
{
// Parse the time portion of a datetime.isoformat() string
//
// Return codes:
@ -785,19 +791,21 @@ parse_isoformat_time(const char *dtstr, size_t dtlen,
if (*tzinfo_pos == '+' || *tzinfo_pos == '-') {
break;
}
} while(++tzinfo_pos < p_end);
} while (++tzinfo_pos < p_end);
int rv = parse_hh_mm_ss_ff(dtstr, tzinfo_pos,
hour, minute, second, microsecond);
int rv = parse_hh_mm_ss_ff(dtstr, tzinfo_pos, hour, minute, second,
microsecond);
if (rv < 0) {
return rv;
} else if (tzinfo_pos == p_end) {
}
else if (tzinfo_pos == p_end) {
// We know that there's no time zone, so if there's stuff at the
// end of the string it's an error.
if (rv == 1) {
return -5;
} else {
}
else {
return 0;
}
}
@ -812,19 +820,18 @@ parse_isoformat_time(const char *dtstr, size_t dtlen,
return -5;
}
int tzsign = (*tzinfo_pos == '-')?-1:1;
int tzsign = (*tzinfo_pos == '-') ? -1 : 1;
tzinfo_pos++;
int tzhour = 0, tzminute = 0, tzsecond = 0;
rv = parse_hh_mm_ss_ff(tzinfo_pos, p_end,
&tzhour, &tzminute, &tzsecond, tzmicrosecond);
rv = parse_hh_mm_ss_ff(tzinfo_pos, p_end, &tzhour, &tzminute, &tzsecond,
tzmicrosecond);
*tzoffset = tzsign * ((tzhour * 3600) + (tzminute * 60) + tzsecond);
*tzmicrosecond *= tzsign;
return rv?-5:1;
return rv ? -5 : 1;
}
/* ---------------------------------------------------------------------------
* Create various objects, mostly without range checking.
*/
@ -839,30 +846,33 @@ new_date_ex(int year, int month, int day, PyTypeObject *type)
return NULL;
}
self = (PyDateTime_Date *) (type->tp_alloc(type, 0));
self = (PyDateTime_Date *)(type->tp_alloc(type, 0));
if (self != NULL)
set_date_fields(self, year, month, day);
return (PyObject *) self;
return (PyObject *)self;
}
#define new_date(year, month, day) \
new_date_ex(year, month, day, &PyDateTime_DateType)
// Forward declaration
static PyObject * new_datetime_ex(int, int, int, int, int, int, int,
PyObject*, PyTypeObject*);
static PyObject *
new_datetime_ex(int, int, int, int, int, int, int, PyObject *, PyTypeObject *);
/* Create date instance with no range checking, or call subclass constructor */
static PyObject *
new_date_subclass_ex(int year, int month, int day, PyObject *cls) {
new_date_subclass_ex(int year, int month, int day, PyObject *cls)
{
PyObject *result;
// We have "fast path" constructors for two subclasses: date and datetime
if ((PyTypeObject *)cls == &PyDateTime_DateType) {
result = new_date_ex(year, month, day, (PyTypeObject *)cls);
} else if ((PyTypeObject *)cls == &PyDateTime_DateTimeType) {
}
else if ((PyTypeObject *)cls == &PyDateTime_DateTimeType) {
result = new_datetime_ex(year, month, day, 0, 0, 0, 0, Py_None,
(PyTypeObject *)cls);
} else {
}
else {
result = PyObject_CallFunction(cls, "iii", year, month, day);
}
@ -1281,7 +1291,8 @@ append_keyword_fold(PyObject *repr, int fold)
}
static inline PyObject *
tzinfo_from_isoformat_results(int rv, int tzoffset, int tz_useconds) {
tzinfo_from_isoformat_results(int rv, int tzoffset, int tz_useconds)
{
PyObject *tzinfo;
if (rv == 1) {
// Create a timezone from offset in seconds (0 returns UTC)
@ -1296,7 +1307,8 @@ tzinfo_from_isoformat_results(int rv, int tzoffset, int tz_useconds) {
}
tzinfo = new_timezone(delta, NULL);
Py_DECREF(delta);
} else {
}
else {
tzinfo = Py_None;
Py_INCREF(Py_None);
}
@ -2886,17 +2898,19 @@ date_fromordinal(PyObject *cls, PyObject *args)
/* Return the new date from a string as generated by date.isoformat() */
static PyObject *
date_fromisoformat(PyObject *cls, PyObject *dtstr) {
date_fromisoformat(PyObject *cls, PyObject *dtstr)
{
assert(dtstr != NULL);
if (!PyUnicode_Check(dtstr)) {
PyErr_SetString(PyExc_TypeError, "fromisoformat: argument must be str");
PyErr_SetString(PyExc_TypeError,
"fromisoformat: argument must be str");
return NULL;
}
Py_ssize_t len;
const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len);
const char *dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len);
if (dt_ptr == NULL) {
goto invalid_string_error;
}
@ -2906,7 +2920,8 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) {
int rv;
if (len == 10) {
rv = parse_isoformat_date(dt_ptr, &year, &month, &day);
} else {
}
else {
rv = -1;
}
@ -2917,12 +2932,10 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) {
return new_date_subclass_ex(year, month, day, cls);
invalid_string_error:
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R",
dtstr);
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", dtstr);
return NULL;
}
/*
* Date arithmetic.
*/
@ -4863,52 +4876,65 @@ datetime_combine(PyObject *cls, PyObject *args, PyObject *kw)
}
static PyObject *
_sanitize_isoformat_str(PyObject *dtstr, int *needs_decref) {
_sanitize_isoformat_str(PyObject *dtstr)
{
// `fromisoformat` allows surrogate characters in exactly one position,
// the separator; to allow datetime_fromisoformat to make the simplifying
// assumption that all valid strings can be encoded in UTF-8, this function
// replaces any surrogate character separators with `T`.
//
// The result of this, if not NULL, returns a new reference
Py_ssize_t len = PyUnicode_GetLength(dtstr);
*needs_decref = 0;
if (len <= 10 || !Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) {
if (len < 0) {
return NULL;
}
if (len <= 10 ||
!Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) {
Py_INCREF(dtstr);
return dtstr;
}
PyObject *str_out = PyUnicode_New(len, PyUnicode_MAX_CHAR_VALUE(dtstr));
PyObject *str_out = _PyUnicode_Copy(dtstr);
if (str_out == NULL) {
return NULL;
}
if (PyUnicode_CopyCharacters(str_out, 0, dtstr, 0, len) == -1 ||
PyUnicode_WriteChar(str_out, 10, (Py_UCS4)'T')) {
if (PyUnicode_WriteChar(str_out, 10, (Py_UCS4)'T')) {
Py_DECREF(str_out);
return NULL;
}
*needs_decref = 1;
return str_out;
}
static PyObject *
datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
datetime_fromisoformat(PyObject *cls, PyObject *dtstr)
{
assert(dtstr != NULL);
if (!PyUnicode_Check(dtstr)) {
PyErr_SetString(PyExc_TypeError, "fromisoformat: argument must be str");
PyErr_SetString(PyExc_TypeError,
"fromisoformat: argument must be str");
return NULL;
}
int needs_decref = 0;
dtstr = _sanitize_isoformat_str(dtstr, &needs_decref);
if (dtstr == NULL) {
PyObject *dtstr_clean = _sanitize_isoformat_str(dtstr);
if (dtstr_clean == NULL) {
goto error;
}
Py_ssize_t len;
const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len);
const char *dt_ptr = PyUnicode_AsUTF8AndSize(dtstr_clean, &len);
if (dt_ptr == NULL) {
goto invalid_string_error;
if (PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) {
// Encoding errors are invalid string errors at this point
goto invalid_string_error;
}
else {
goto error;
}
}
const char *p = dt_ptr;
@ -4924,8 +4950,9 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
// In UTF-8, the length of multi-byte characters is encoded in the MSB
if ((p[10] & 0x80) == 0) {
p += 11;
} else {
switch(p[10] & 0xf0) {
}
else {
switch (p[10] & 0xf0) {
case 0xe0:
p += 13;
break;
@ -4939,15 +4966,14 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
}
len -= (p - dt_ptr);
rv = parse_isoformat_time(p, len,
&hour, &minute, &second, &microsecond,
&tzoffset, &tzusec);
rv = parse_isoformat_time(p, len, &hour, &minute, &second,
&microsecond, &tzoffset, &tzusec);
}
if (rv < 0) {
goto invalid_string_error;
}
PyObject* tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, tzusec);
PyObject *tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, tzusec);
if (tzinfo == NULL) {
goto error;
}
@ -4956,23 +4982,18 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
second, microsecond, tzinfo, cls);
Py_DECREF(tzinfo);
if (needs_decref) {
Py_DECREF(dtstr);
}
Py_DECREF(dtstr_clean);
return dt;
invalid_string_error:
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", dtstr);
error:
if (needs_decref) {
Py_DECREF(dtstr);
}
Py_XDECREF(dtstr_clean);
return NULL;
}
/*
* Destructor.
*/