bpo-29505: Add more fuzzing for re.compile, re.load and csv.reader (GH-14255)

Add more fuzz testing for re.compile, re.load and csv.reader
2019-06-30 01:54:43 -04:00 · 2019-06-30 01:54:43 -04:00 · 5cbbbd73a6
parent eb97b9211e
commit 5cbbbd73a6
9 changed files with 497 additions and 20 deletions
--- a/Lib/test/test_xxtestfuzz.py
+++ b/Lib/test/test_xxtestfuzz.py
@ -16,6 +16,8 @@ class TestFuzzer(unittest.TestCase):
        _xxtestfuzz.run(b" ")
        _xxtestfuzz.run(b"x")
        _xxtestfuzz.run(b"1")
        _xxtestfuzz.run(b"AAAAAAA")
        _xxtestfuzz.run(b"AAAAAA\0")
 if __name__ == "__main__":
--- a/Modules/_xxtestfuzz/dictionaries/fuzz_sre_compile.dict
+++ b/Modules/_xxtestfuzz/dictionaries/fuzz_sre_compile.dict
@ -0,0 +1,219 @@
 "?"
 "abc"
 "()"
 "[]"
 "abc|def"
 "abc|def|ghi"
 "^xxx$"
 "ab\\b\\d\\bcd"
 "\\w|\\d"
 "a*?"
 "abc+"
 "abc+?"
 "xyz?"
 "xyz??"
 "xyz{0,1}"
 "xyz{0,1}?"
 "xyz{93}"
 "xyz{1,32}"
 "xyz{1,32}?"
 "xyz{1,}"
 "xyz{1,}?"
 "a\\fb\\nc\\rd\\te\\vf"
 "a\\nb\\bc"
 "(?:foo)"
 "(?: foo )"
 "foo|(bar|baz)|quux"
 "foo(?=bar)baz"
 "foo(?!bar)baz"
 "foo(?<=bar)baz"
 "foo(?<!bar)baz"
 "()"
 "(?=)"
 "[]"
 "[x]"
 "[xyz]"
 "[a-zA-Z0-9]"
 "[-123]"
 "[^123]"
 "]"
 "}"
 "[a-b-c]"
 "[x\\dz]"
 "[\\d-z]"
 "[\\d-\\d]"
 "[z-\\d]"
 "\\cj\\cJ\\ci\\cI\\ck\\cK"
 "\\c!"
 "\\c_"
 "\\c~"
 "[\\c!]"
 "[\\c_]"
 "[\\c~]"
 "[\\ca]"
 "[\\cz]"
 "[\\cA]"
 "[\\cZ]"
 "[\\c1]"
 "\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ "
 "[\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ]"
 "\\8"
 "\\9"
 "\\11"
 "\\11a"
 "\\011"
 "\\118"
 "\\111"
 "\\1111"
 "(x)(x)(x)\\1"
 "(x)(x)(x)\\2"
 "(x)(x)(x)\\3"
 "(x)(x)(x)\\4"
 "(x)(x)(x)\\1*"
 "(x)(x)(x)\\3*"
 "(x)(x)(x)\\4*"
 "(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\10"
 "(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\11"
 "(a)\\1"
 "(a\\1)"
 "(\\1a)"
 "(\\2)(\\1)"
 "(?=a){0,10}a"
 "(?=a){1,10}a"
 "(?=a){9,10}a"
 "(?!a)?a"
 "\\1(a)"
 "(?!(a))\\1"
 "(?!\\1(a\\1)\\1)\\1"
 "\\1\\2(a(?:\\1(b\\1\\2))\\2)\\1"
 "[\\0]"
 "[\\11]"
 "[\\11a]"
 "[\\011]"
 "[\\00011]"
 "[\\118]"
 "[\\111]"
 "[\\1111]"
 "\\x60"
 "\\x3z"
 "\\c"
 "\\u0034"
 "\\u003z"
 "foo[z]*"
 "\\u{12345}"
 "\\u{12345}\\u{23456}"
 "\\u{12345}{3}"
 "\\u{12345}*"
 "\\ud808\\udf45*"
 "[\\ud808\\udf45-\\ud809\\udccc]"
 "a"
 "a|b"
 "a\\n"
 "a$"
 "a\\b!"
 "a\\Bb"
 "a*?"
 "a?"
 "a??"
 "a{0,1}?"
 "a{1,2}?"
 "a+?"
 "(a)"
 "(a)\\1"
 "(\\1a)"
 "\\1(a)"
 "a\\s"
 "a\\S"
 "a\\D"
 "a\\w"
 "a\\W"
 "a."
 "a\\q"
 "a[a]"
 "a[^a]"
 "a[a-z]"
 "a(?:b)"
 "a(?=b)"
 "a(?!b)"
 "\\x60"
 "\\u0060"
 "\\cA"
 "\\q"
 "\\1112"
 "(a)\\1"
 "(?!a)?a\\1"
 "(?:(?=a))a\\1"
 "a{}"
 "a{,}"
 "a{"
 "a{z}"
 "a{12z}"
 "a{12,"
 "a{12,3b"
 "{}"
 "{,}"
 "{"
 "{z}"
 "{1z}"
 "{12,"
 "{12,3b"
 "a"
 "abc"
 "a[bc]d"
 "a|bc"
 "ab|c"
 "a||bc"
 "(?:ab)"
 "(?:ab|cde)"
 "(?:ab)|cde"
 "(ab)"
 "(ab|cde)"
 "(ab)\\1"
 "(ab|cde)\\1"
 "(?:ab)?"
 "(?:ab)+"
 "a?"
 "a+"
 "a??"
 "a*?"
 "a+?"
 "(?:a?)?"
 "(?:a+)?"
 "(?:a?)+"
 "(?:a*)+"
 "(?:a+)+"
 "(?:a?)*"
 "(?:a*)*"
 "(?:a+)*"
 "a{0}"
 "(?:a+){0,0}"
 "a*b"
 "a+b"
 "a*b|c"
 "a+b|c"
 "(?:a{5,1000000}){3,1000000}"
 "(?:ab){4,7}"
 "a\\bc"
 "a\\sc"
 "a\\Sc"
 "a(?=b)c"
 "a(?=bbb|bb)c"
 "a(?!bbb|bb)c"
 "\xe2\x81\xa3"
 "[\xe2\x81\xa3]"
 "\xed\xb0\x80"
 "\xed\xa0\x80"
 "(\xed\xb0\x80)\x01"
 "((\xed\xa0\x80))\x02"
 "\xf0\x9f\x92\xa9"
 "\x01"
 "\x0f"
 "[-\xf0\x9f\x92\xa9]+"
 "[\xf0\x9f\x92\xa9-\xf4\x8f\xbf\xbf]"
 "(?<=)"
 "(?<=a)"
 "(?<!)"
 "(?<!a)"
 "(?<a>)"
 "(?<a>.)"
 "(?<a>.)\\k<a>"
--- a/Modules/_xxtestfuzz/fuzz_csv_reader_corpus/test.csv
+++ b/Modules/_xxtestfuzz/fuzz_csv_reader_corpus/test.csv
--- a/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/anchor_links
+++ b/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/anchor_links
@ -0,0 +1 @@
 XX<a\s*href=(.*?)[\s|>]
--- a/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/characters
+++ b/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/characters
@ -0,0 +1 @@
 XX^(Tim|Robert)\s+the\s+(Enchanter|Shrubber)$
--- a/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/isbn
+++ b/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/isbn
@ -0,0 +1 @@
 XX/((978[\--– ])?[0-9][0-9\--– ]{10}[\--– ][0-9xX])|((978)?[0-9]{9}[0-9Xx])/
--- a/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/phone_number
+++ b/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/phone_number
@ -0,0 +1 @@
 XX(\+1|1)?[ \-\.]?\(?(?<areacode>[0-9]{3})\)?[ \-\.]?(?<prefix>[0-9]{3})[ \-\.]?(?<number>[0-9]{4})[ \.]*(ext|x)?[ \.]*(?<extension>[0-9]{0,5})
--- a/Modules/_xxtestfuzz/fuzz_tests.txt
+++ b/Modules/_xxtestfuzz/fuzz_tests.txt
@ -2,3 +2,6 @@ fuzz_builtin_float
 fuzz_builtin_int
 fuzz_builtin_unicode
 fuzz_json_loads
 fuzz_sre_compile
 fuzz_sre_match
 fuzz_csv_reader
--- a/Modules/_xxtestfuzz/fuzzer.c
+++ b/Modules/_xxtestfuzz/fuzzer.c
@ -81,8 +81,17 @@ static int fuzz_builtin_unicode(const char* data, size_t size) {
 /* Size limit for the JSON fuzz target. Its use is not visible in this
    hunk -- presumably fuzz_json_loads skips larger inputs (TODO confirm). */
 #define MAX_JSON_TEST_SIZE 0x10000
 /* The json.loads callable. Set by init_json_loads(), which
    LLVMFuzzerTestOneInput runs before fuzz_json_loads is invoked. */
 PyObject* json_loads_method = NULL;
 /* Called by LLVMFuzzerTestOneInput for initialization.
    Looks up json.loads and stores it in json_loads_method.
    Returns 1 on success and 0 on failure; the caller reports the pending
    Python exception with PyErr_Print(). */
 static int init_json_loads() {
    /* Import json.loads */
    PyObject* json_module = PyImport_ImportModule("json");
    if (json_module == NULL) {
        return 0;
    }
    json_loads_method = PyObject_GetAttrString(json_module, "loads");
    /* The attribute lookup hands back its own reference, so the module
       reference obtained above must be released here or it leaks. */
    Py_DECREF(json_module);
    return json_loads_method != NULL;
 }
 /* Fuzz json.loads(x) */
 static int fuzz_json_loads(const char* data, size_t size) {
    /* Since python supports arbitrarily large ints in JSON,
@ -96,25 +105,230 @@ static int fuzz_json_loads(const char* data, size_t size) {
        return 0;
    }
    PyObject* parsed = PyObject_CallFunctionObjArgs(json_loads_method, input_bytes, NULL);
    if (parsed == NULL) {
        /* Ignore ValueError as the fuzzer will more than likely
           generate some invalid json and values */
-    if (parsed == NULL && PyErr_ExceptionMatches(PyExc_ValueError)) {
+        if (PyErr_ExceptionMatches(PyExc_ValueError) ||
        PyErr_Clear();
    }
        /* Ignore RecursionError as the fuzzer generates long sequences of
           arrays such as `[[[...` */
-    if (parsed == NULL && PyErr_ExceptionMatches(PyExc_RecursionError)) {
+            PyErr_ExceptionMatches(PyExc_RecursionError) ||
        /* Ignore unicode errors, invalid byte sequences are common */
            PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)
        ) {
            PyErr_Clear();
        }
    /* Ignore unicode errors, invalid byte sequences are common */
    if (parsed == NULL && PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
        PyErr_Clear();
    }
    Py_DECREF(input_bytes);
    Py_XDECREF(parsed);
    return 0;
 }
 /* Upper bound on the input size accepted by fuzz_sre_compile and
    fuzz_sre_match; larger inputs are skipped. */
 #define MAX_RE_TEST_SIZE 0x10000
 /* sre_compile.compile, set by init_sre_compile() */
 PyObject* sre_compile_method = NULL;
 /* sre_constants.error, set by init_sre_compile() */
 PyObject* sre_error_exception = NULL;
 /* Integer value of sre_constants.SRE_FLAG_DEBUG, set by init_sre_compile();
    fuzz_sre_compile masks this bit out of the fuzzed flags. */
 int SRE_FLAG_DEBUG = 0;
 /* Called by LLVMFuzzerTestOneInput for initialization.
    Stores sre_compile.compile in sre_compile_method, sre_constants.error in
    sre_error_exception and the integer value of sre_constants.SRE_FLAG_DEBUG
    in SRE_FLAG_DEBUG.
    Returns 1 on success and 0 on failure; the caller reports the pending
    Python exception with PyErr_Print(). */
 static int init_sre_compile() {
    /* Import sre_compile.compile and sre_constants.error */
    PyObject* sre_compile_module = PyImport_ImportModule("sre_compile");
    if (sre_compile_module == NULL) {
        return 0;
    }
    sre_compile_method = PyObject_GetAttrString(sre_compile_module, "compile");
    /* Only the looked-up attribute is kept; release the module reference */
    Py_DECREF(sre_compile_module);
    if (sre_compile_method == NULL) {
        return 0;
    }
    PyObject* sre_constants = PyImport_ImportModule("sre_constants");
    if (sre_constants == NULL) {
        return 0;
    }
    sre_error_exception = PyObject_GetAttrString(sre_constants, "error");
    if (sre_error_exception == NULL) {
        Py_DECREF(sre_constants);
        return 0;
    }
    PyObject* debug_flag = PyObject_GetAttrString(sre_constants, "SRE_FLAG_DEBUG");
    /* Last use of the module object, whether or not the lookup succeeded */
    Py_DECREF(sre_constants);
    if (debug_flag == NULL) {
        return 0;
    }
    long debug_value = PyLong_AsLong(debug_flag);
    Py_DECREF(debug_flag);
    /* PyLong_AsLong signals failure with -1 plus a pending exception */
    if (debug_value == -1 && PyErr_Occurred()) {
        return 0;
    }
    SRE_FLAG_DEBUG = (int) debug_value;
    return 1;
 }
 /* Fuzz sre_compile.compile(pattern, flags).
    The first two bytes of the input are used as the flags value (host byte
    order) and the remaining bytes as the pattern.
    Always returns 0. Exceptions that are expected from random input are
    cleared; any other exception is left pending for the caller. */
 static int fuzz_sre_compile(const char* data, size_t size) {
    /* Ignore really long regex patterns that will timeout the fuzzer */
    if (size > MAX_RE_TEST_SIZE) {
        return 0;
    }
    /* We treat the first 2 bytes of the input as a number for the flags */
    if (size < 2) {
        return 0;
    }
    /* Copy the bytes out rather than casting the pointer: data is not
       guaranteed to be suitably aligned for uint16_t, and reading char
       storage through a uint16_t lvalue violates the aliasing rules. */
    uint16_t flags;
    memcpy(&flags, data, sizeof(flags));
    /* We remove the SRE_FLAG_DEBUG if present. This is because it
       prints to stdout which greatly decreases fuzzing speed */
    flags = (uint16_t) (flags & ~SRE_FLAG_DEBUG);
    /* Pull the pattern from the remaining bytes */
    PyObject* pattern_bytes = PyBytes_FromStringAndSize(data + 2, size - 2);
    if (pattern_bytes == NULL) {
        return 0;
    }
    PyObject* flags_obj = PyLong_FromUnsignedLong(flags);
    if (flags_obj == NULL) {
        Py_DECREF(pattern_bytes);
        return 0;
    }
    /* compiled = sre_compile.compile(data[2:], data[0:2]) */
    PyObject* compiled = PyObject_CallFunctionObjArgs(
        sre_compile_method, pattern_bytes, flags_obj, NULL);
    /* All expected exceptions are tested in a single condition so that
       PyErr_ExceptionMatches is never called after the exception has
       already been cleared. Ignored:
         - ValueError: the fuzzer will more than likely generate some
           invalid combination of flags
         - OverflowError, AssertionError, IndexError: common errors thrown
           by sre_parse
         - re.error */
    if (compiled == NULL && (
            PyErr_ExceptionMatches(PyExc_ValueError) ||
            PyErr_ExceptionMatches(PyExc_OverflowError) ||
            PyErr_ExceptionMatches(PyExc_AssertionError) ||
            PyErr_ExceptionMatches(PyExc_IndexError) ||
            PyErr_ExceptionMatches(sre_error_exception))
    ) {
        PyErr_Clear();
    }
    Py_DECREF(pattern_bytes);
    Py_DECREF(flags_obj);
    Py_XDECREF(compiled);
    return 0;
 }
 /* Some random patterns used to test re.match.
    Be careful not to add catastrophically slow regexes here, we want to
    exercise the matching code without causing timeouts. */
 static const char* regex_patterns[] = {
    ".", "^", "abc", "abc|def", "^xxx$", "\\b", "()", "[a-zA-Z0-9]",
    "abc+", "[^A-Z]", "[x]", "(?=)", "a{z}", "a+b", "a*?", "a??", "a+?",
    "{}", "a{,}", "{", "}", "^\\(*\\d{3}\\)*( |-)*\\d{3}( |-)*\\d{4}$",
    "(?:a*)*", "a{1,2}?"
 };
 /* Number of entries in regex_patterns */
 const size_t NUM_PATTERNS = sizeof(regex_patterns) / sizeof(regex_patterns[0]);
 /* Array of NUM_PATTERNS compiled pattern objects, allocated and filled by
    init_sre_match() and indexed by fuzz_sre_match() */
 PyObject** compiled_patterns = NULL;
 /* Called by LLVMFuzzerTestOneInput for initialization.
    Compiles every entry of regex_patterns with re.compile and stores the
    results in the compiled_patterns array.
    Returns 1 on success. On failure returns 0 and leaves compiled_patterns
    set to NULL; the caller reports the pending Python exception. */
 static int init_sre_match() {
    PyObject* re_module = PyImport_ImportModule("re");
    if (re_module == NULL) {
        return 0;
    }
    compiled_patterns = (PyObject**) PyMem_RawMalloc(
        sizeof(PyObject*) * NUM_PATTERNS);
    if (compiled_patterns == NULL) {
        Py_DECREF(re_module);
        PyErr_NoMemory();
        return 0;
    }
    /* Precompile all the regex patterns on the first run for faster fuzzing */
    for (size_t i = 0; i < NUM_PATTERNS; i++) {
        PyObject* compiled = PyObject_CallMethod(
            re_module, "compile", "y", regex_patterns[i]);
        /* Bail if any of the patterns fail to compile */
        if (compiled == NULL) {
            /* Release the patterns compiled so far and the array itself so
               the global never points at a partially initialized array */
            while (i > 0) {
                i--;
                Py_DECREF(compiled_patterns[i]);
            }
            PyMem_RawFree(compiled_patterns);
            compiled_patterns = NULL;
            Py_DECREF(re_module);
            return 0;
        }
        compiled_patterns[i] = compiled;
    }
    /* The compiled patterns hold everything needed from here on */
    Py_DECREF(re_module);
    return 1;
 }
 /* Fuzz re.match(x).
    The first input byte selects one of the precompiled patterns (modulo
    NUM_PATTERNS); the remaining bytes are the bytes object to match against.
    Always returns 0. Any exception raised along the way is left pending
    for the caller. */
 static int fuzz_sre_match(const char* data, size_t size) {
    if (size < 1 || size > MAX_RE_TEST_SIZE) {
        return 0;
    }
    /* Use the first byte as a uint8_t specifying the index of the
       regex to use */
    unsigned char idx = (unsigned char) data[0];
    idx = idx % NUM_PATTERNS;
    /* Pull the string to match from the remaining bytes */
    PyObject* to_match = PyBytes_FromStringAndSize(data + 1, size - 1);
    if (to_match == NULL) {
        return 0;
    }
    PyObject* pattern = compiled_patterns[idx];
    PyObject* match_callable = PyObject_GetAttrString(pattern, "match");
    /* A failed lookup must not reach the call or Py_DECREF below:
       Py_DECREF does not accept NULL */
    if (match_callable == NULL) {
        Py_DECREF(to_match);
        return 0;
    }
    PyObject* matches = PyObject_CallFunctionObjArgs(match_callable, to_match, NULL);
    Py_XDECREF(matches);
    Py_DECREF(match_callable);
    Py_DECREF(to_match);
    return 0;
 }
 /* Upper bound on the input size accepted by fuzz_csv_reader; larger
    inputs are skipped. */
 #define MAX_CSV_TEST_SIZE 0x10000
 /* The csv module object, set by init_csv_reader(); fuzz_csv_reader calls
    its "reader" attribute. */
 PyObject* csv_module = NULL;
 /* csv.Error, set by init_csv_reader() */
 PyObject* csv_error = NULL;
 /* One-time setup for the csv fuzz target, run from LLVMFuzzerTestOneInput.
    Stores the csv module in csv_module and csv.Error in csv_error.
    Returns 1 when both were obtained, 0 otherwise. */
 static int init_csv_reader() {
    /* Keep the module itself: the fuzz target calls csv.reader through it */
    csv_module = PyImport_ImportModule("csv");
    if (csv_module != NULL) {
        /* Exception type the fuzz target is going to ignore */
        csv_error = PyObject_GetAttrString(csv_module, "Error");
        if (csv_error != NULL) {
            return 1;
        }
    }
    return 0;
 }
 /* Fuzz csv.reader([x]).
    Decodes the input (up to its first NUL byte) as a str, splits it on
    "\n" and iterates a csv.reader over the resulting list of lines.
    Always returns 0. csv.Error is cleared; any other exception is left
    pending for the caller. */
 static int fuzz_csv_reader(const char* data, size_t size) {
    if (size < 1 || size > MAX_CSV_TEST_SIZE) {
        return 0;
    }
    /* Ignore non null-terminated strings: PyUnicode_FromString below needs
       a terminator inside the buffer. Only the text before the first NUL
       is used, which also matters since _csv can't handle embedded nulls */
    if (memchr(data, '\0', size) == NULL) {
        return 0;
    }
    PyObject* s = PyUnicode_FromString(data);
    /* Ignore exceptions until we have a valid string */
    if (s == NULL) {
        PyErr_Clear();
        return 0;
    }
    /* Split on \n so we can test multiple lines */
    PyObject* lines = PyObject_CallMethod(s, "split", "s", "\n");
    if (lines == NULL) {
        Py_DECREF(s);
        return 0;
    }
    /* The "N" format hands the reference to lines over to the call, so
       lines is not released separately below */
    PyObject* reader = PyObject_CallMethod(csv_module, "reader", "N", lines);
    if (reader) {
        /* Consume all of the reader as an iterator */
        PyObject* parsed_line;
        while ((parsed_line = PyIter_Next(reader))) {
            Py_DECREF(parsed_line);
        }
    }
    /* Ignore csv.Error because we're probably going to generate
       some bad files (embedded new-lines, unterminated quotes etc).
       PyErr_ExceptionMatches may only be called while an exception is
       set, so check for one first: the loop above also ends normally. */
    if (PyErr_Occurred() && PyErr_ExceptionMatches(csv_error)) {
        PyErr_Clear();
    }
    Py_XDECREF(reader);
    Py_DECREF(s);
    return 0;
 }
 /* Run fuzzer and abort on failure. */
 static int _run_fuzz(const uint8_t *data, size_t size, int(*fuzzer)(const char* , size_t)) {
    int rv = fuzzer((const char*) data, size);
@ -152,12 +366,6 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
           initialize CPython ourselves on the first run. */
        Py_InitializeEx(0);
    }
 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_json_loads)
    if (json_loads_method == NULL) {
        PyObject* json_module = PyImport_ImportModule("json");
        json_loads_method = PyObject_GetAttrString(json_module, "loads");
    }
 #endif
    int rv = 0;
@ -171,7 +379,48 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
    rv |= _run_fuzz(data, size, fuzz_builtin_unicode);
 #endif
 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_json_loads)
    static int JSON_LOADS_INITIALIZED = 0;
    if (!JSON_LOADS_INITIALIZED && !init_json_loads()) {
        PyErr_Print();
        abort();
    } else {
        JSON_LOADS_INITIALIZED = 1;
    }
    rv |= _run_fuzz(data, size, fuzz_json_loads);
 #endif
 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_sre_compile)
    static int SRE_COMPILE_INITIALIZED = 0;
    if (!SRE_COMPILE_INITIALIZED && !init_sre_compile()) {
        PyErr_Print();
        abort();
    } else {
        SRE_COMPILE_INITIALIZED = 1;
    }
    rv |= _run_fuzz(data, size, fuzz_sre_compile);
 #endif
 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_sre_match)
    static int SRE_MATCH_INITIALIZED = 0;
    if (!SRE_MATCH_INITIALIZED && !init_sre_match()) {
        PyErr_Print();
        abort();
    } else {
        SRE_MATCH_INITIALIZED = 1;
    }
    rv |= _run_fuzz(data, size, fuzz_sre_match);
 #endif
 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_csv_reader)
    static int CSV_READER_INITIALIZED = 0;
    if (!CSV_READER_INITIALIZED && !init_csv_reader()) {
        PyErr_Print();
        abort();
    } else {
        CSV_READER_INITIALIZED = 1;
    }
    rv |= _run_fuzz(data, size, fuzz_csv_reader);
 #endif
  return rv;
 }
		`@ -0,0 +1 @@`
							`XX^(Tim|Robert)\s+the\s+(Enchanter|Shrubber)$`
		`@ -0,0 +1 @@`
							`XX/((978[\--– ])?[0-9][0-9\--– ]{10}[\--– ][0-9xX])|((978)?[0-9]{9}[0-9Xx])/`
		`@ -0,0 +1 @@`
							`XX(\+1|1)?[ \-\.]?\(?(?<areacode>[0-9]{3})\)?[ \-\.]?(?<prefix>[0-9]{3})[ \-\.]?(?<number>[0-9]{4})[ \.]*(ext|x)?[ \.]*(?<extension>[0-9]{0,5})`