bpo-34454: fix .fromisoformat() methods crashing on inputs with surrogate code points (GH-8862)

The current C implementations of the `.fromisoformat()` methods **crash** if the input includes a surrogate Unicode code point, which cannot be encoded in UTF-8. Important notes: 1. It is possible to pass a string that cannot be encoded in UTF-8 as the separator to the `.isoformat()` methods. 2. The pure-Python `datetime.fromisoformat()` implementation accepts strings with a surrogate as the separator. In the C implementation of `datetime.fromisoformat()`, in the special case of a separator that cannot be encoded in UTF-8, the new code takes a performance hit by making a copy of the input string and replacing the separator with 'T'. Co-authored-by: Alexey Izbyshev <izbyshev@ispras.ru> Co-authored-by: Paul Ganssle <paul@ganssle.io>
2018-08-23 11:06:20 -04:00 · 2018-08-23 11:06:20 -04:00 · 096329f0b2
parent c33bb5d401
commit 096329f0b2
3 changed files with 84 additions and 10 deletions
--- a/Lib/test/datetimetester.py
+++ b/Lib/test/datetimetester.py
@ -1667,6 +1667,7 @@ class TestDate(HarmlessMixedComparison, unittest.TestCase):
        # Test that fromisoformat() fails on invalid values
        bad_strs = [
            '',                 # Empty string
+            '\ud800',           # bpo-34454: Surrogate code point
            '009-03-04',        # Not 10 characters
            '123456789',        # Not a date
            '200a-12-04',       # Invalid character in year
@ -1675,6 +1676,7 @@ class TestDate(HarmlessMixedComparison, unittest.TestCase):
            '2009-01-32',       # Invalid day
            '2009-02-29',       # Invalid leap day
            '20090228',         # Valid ISO8601 output not from isoformat()
+            '2009\ud80002\ud80028',     # Separators are surrogate codepoints
        ]

        for bad_str in bad_strs:
@ -2587,7 +2589,8 @@ class TestDateTime(TestDate):
            ' ', 'T', '\u007f',     # 1-bit widths
            '\u0080', 'ʁ',          # 2-bit widths
            'ᛇ', '時',               # 3-bit widths
-            '🐍'                     # 4-bit widths
+            '🐍',                    # 4-bit widths
+            '\ud800',               # bpo-34454: Surrogate code point
        ]

        for sep in separators:
@ -2639,6 +2642,7 @@ class TestDateTime(TestDate):
        # Test that fromisoformat() fails on invalid values
        bad_strs = [
            '',                             # Empty string
+            '\ud800',                       # bpo-34454: Surrogate code point
            '2009.04-19T03',                # Wrong first separator
            '2009-04.19T03',                # Wrong second separator
            '2009-04-19T0a',                # Invalid hours
@ -2652,6 +2656,8 @@ class TestDateTime(TestDate):
            '2009-04-19T03:15:45.123456+24:30',    # Invalid time zone offset
            '2009-04-19T03:15:45.123456-24:30',    # Invalid negative offset
            '2009-04-10ᛇᛇᛇᛇᛇ12:15',         # Too many unicode separators
+            '2009-04\ud80010T12:15',        # Surrogate char in date
+            '2009-04-10T12\ud80015',        # Surrogate char in time
            '2009-04-19T1',                 # Incomplete hours
            '2009-04-19T12:3',              # Incomplete minutes
            '2009-04-19T12:30:4',           # Incomplete seconds
@ -3521,6 +3527,7 @@ class TestTimeTZ(TestTime, TZInfoBase, unittest.TestCase):
    def test_fromisoformat_fails(self):
        bad_strs = [
            '',                         # Empty string
+            '12\ud80000',               # Invalid separator - surrogate char
            '12:',                      # Ends on a separator
            '12:30:',                   # Ends on a separator
            '12:30:15.',                # Ends on a separator
--- a/Misc/NEWS.d/next/Library/2018-08-22-21-59-08.bpo-34454.z7uG4b.rst
+++ b/Misc/NEWS.d/next/Library/2018-08-22-21-59-08.bpo-34454.z7uG4b.rst
@ -0,0 +1,4 @@
+Fix the .fromisoformat() methods of datetime types crashing when given
+unicode with non-UTF-8-encodable code points.  Specifically,
+datetime.fromisoformat() now accepts surrogate unicode code points used as
+the separator. Report and tests by Alexey Izbyshev, patch by Paul Ganssle.
--- a/Modules/_datetimemodule.c
+++ b/Modules/_datetimemodule.c
@ -2883,6 +2883,9 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) {
    Py_ssize_t len;

    const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len);
+    if (dt_ptr == NULL) {
+        goto invalid_string_error;
+    }

    int year = 0, month = 0, day = 0;

@ -2894,12 +2897,15 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) {
    }

    if (rv < 0) {
-        PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s",
-                     dt_ptr);
-        return NULL;
+        goto invalid_string_error;
    }

    return new_date_subclass_ex(year, month, day, cls);
+
+invalid_string_error:
+    PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R",
+                 dtstr);
+    return NULL;
 }


@ -4258,6 +4264,10 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) {
    Py_ssize_t len;
    const char *p = PyUnicode_AsUTF8AndSize(tstr, &len);

+    if (p == NULL) {
+        goto invalid_string_error;
+    }
+
    int hour = 0, minute = 0, second = 0, microsecond = 0;
    int tzoffset, tzimicrosecond = 0;
    int rv = parse_isoformat_time(p, len,
@ -4265,8 +4275,7 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) {
                                  &tzoffset, &tzimicrosecond);

    if (rv < 0) {
-        PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s", p);
-        return NULL;
+        goto invalid_string_error;
    }

    PyObject *tzinfo = tzinfo_from_isoformat_results(rv, tzoffset,
@ -4286,6 +4295,10 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) {

    Py_DECREF(tzinfo);
    return t;
+
+invalid_string_error:
+    PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", tstr);
+    return NULL;
 }


@ -4839,6 +4852,33 @@ datetime_combine(PyObject *cls, PyObject *args, PyObject *kw)
    return result;
 }

+static PyObject *
+_sanitize_isoformat_str(PyObject *dtstr, int *needs_decref) {
+    // `fromisoformat` allows a surrogate character in exactly one position,
+    // the separator at index 10. A surrogate there is replaced with `T` so
+    // datetime_fromisoformat can assume valid input is UTF-8-encodable.
+    // Returns dtstr (*needs_decref = 0), a new copy (= 1), or NULL on error.
+    Py_ssize_t len = PyUnicode_GetLength(dtstr);
+    *needs_decref = 0;
+    if (len <= 10 || !Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) {
+        return dtstr;
+    }
+
+    PyObject *str_out = PyUnicode_New(len, PyUnicode_MAX_CHAR_VALUE(dtstr));
+    if (str_out == NULL) {
+        return NULL;
+    }
+
+    if (PyUnicode_CopyCharacters(str_out, 0, dtstr, 0, len) == -1 ||
+            PyUnicode_WriteChar(str_out, 10, (Py_UCS4)'T')) {
+        Py_DECREF(str_out);
+        return NULL;
+    }
+
+    *needs_decref = 1;
+    return str_out;
+}
+
 static PyObject *
 datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
    assert(dtstr != NULL);
@ -4848,9 +4888,20 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
        return NULL;
    }

+    int needs_decref = 0;
+    dtstr = _sanitize_isoformat_str(dtstr, &needs_decref);
+    if (dtstr == NULL) {
+        goto error;
+    }
+
    Py_ssize_t len;
    const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len);
-    const char * p = dt_ptr;
+
+    if (dt_ptr == NULL) {
+        goto invalid_string_error;
+    }
+
+    const char *p = dt_ptr;

    int year = 0, month = 0, day = 0;
    int hour = 0, minute = 0, second = 0, microsecond = 0;
@ -4883,20 +4934,32 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
                                  &tzoffset, &tzusec);
    }
    if (rv < 0) {
-        PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s", dt_ptr);
-        return NULL;
+        goto invalid_string_error;
    }

    PyObject* tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, tzusec);
    if (tzinfo == NULL) {
-        return NULL;
+        goto error;
    }

    PyObject *dt = new_datetime_subclass_ex(year, month, day, hour, minute,
                                            second, microsecond, tzinfo, cls);

    Py_DECREF(tzinfo);
+    if (needs_decref) {
+        Py_DECREF(dtstr);
+    }
    return dt;
+
+invalid_string_error:
+    PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", dtstr);
+
+error:
+    if (needs_decref) {
+        Py_DECREF(dtstr);
+    }
+
+    return NULL;
 }