gh-95534: Improve gzip reading speed by 10% (#97664)

Change summary:
+ There is now a `gzip.READ_BUFFER_SIZE` constant set to 128 KiB. Other programs, such as pigz and cat, also read in 128 KiB chunks, so this seems to be established practice among well-optimized tools. It is also faster than reading in 8 KiB chunks.
+ A `zlib._ZlibDecompressor` was added. It is the `_bz2.BZ2Decompressor` interface ported to zlib. Since the `zlib.Decompress` object is better suited to in-memory decompression, `_ZlibDecompressor` is kept private: it only makes sense for file decompression, which the gzip library now implements on top of it, so there is no need to expose it to users (a usage sketch follows this list).
+ `_ZlibDecompressor` uses the older CPython `arrange_output_buffer` functions, as those are faster and more appropriate for this use case.
+ `GzipFile.read` has been optimized. There is no longer an `unconsumed_tail` member to write back to the padded file; this is instead handled by `_ZlibDecompressor` itself, which keeps an internal input buffer. `_add_read_data` has been inlined, as it was just two calls.
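As a usage sketch of the new interface (not the stdlib code): the loop below mirrors what `_GzipReader.read` now does, but drives `zlib._ZlibDecompressor` against an in-memory plain zlib stream so it is self-contained. `READ_BUFFER_SIZE`, `needs_input`, `eof` and `decompress(data, max_length)` are taken from the diff and tests below; running it requires an interpreter that includes this change.

```python
import io
import zlib
from gzip import READ_BUFFER_SIZE  # 128 * 1024 after this change

payload = b"spam and eggs " * 10_000
stream = io.BytesIO(zlib.compress(payload))

decompressor = zlib._ZlibDecompressor()
chunks = []
while not decompressor.eof:
    if decompressor.needs_input:
        # Only touch the file when the internal input buffer is exhausted.
        buf = stream.read(READ_BUFFER_SIZE)
    else:
        # Unconsumed compressed data is buffered inside the decompressor.
        buf = b""
    # The second argument is max_length: return at most this many bytes.
    chunks.append(decompressor.decompress(buf, 8 * 1024))

assert b"".join(chunks) == payload
```

Because the decompressor buffers its own unconsumed input, the caller no longer has to juggle `unconsumed_tail` and prepend it back onto the file object.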

EDIT: While I am adding improvements anyway, I figured I could include another one-line optimization for the `python -m gzip` application. It previously read chunks of `io.DEFAULT_BUFFER_SIZE`, but now uses `READ_BUFFER_SIZE` chunks as well.
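A rough way to sanity-check the read-speed claim is to time a full streaming read of a large gzip file. This is an ad-hoc sketch, not part of the commit; `bigfile.gz` is a placeholder for any large local gzip file.

```python
import gzip
import time

def read_all(path, chunk_size=128 * 1024):
    # Stream-decompress the whole file and count the output bytes.
    total = 0
    with gzip.open(path, "rb") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                return total
            total += len(chunk)

start = time.perf_counter()
nbytes = read_all("bigfile.gz")  # placeholder: any large .gz file
elapsed = time.perf_counter() - start
print(f"decompressed {nbytes} bytes in {elapsed:.2f} s")
```

Run it against interpreters built before and after this commit to compare.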
Ruben Vorderman 2022-10-17 04:10:58 +02:00 committed by GitHub
parent bb38b39b33
commit eae7dad402
5 changed files with 850 additions and 80 deletions


@ -21,6 +21,8 @@ _COMPRESS_LEVEL_FAST = 1
_COMPRESS_LEVEL_TRADEOFF = 6
_COMPRESS_LEVEL_BEST = 9
READ_BUFFER_SIZE = 128 * 1024
def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
encoding=None, errors=None, newline=None):
@ -446,7 +448,7 @@ def _read_gzip_header(fp):
class _GzipReader(_compression.DecompressReader):
def __init__(self, fp):
super().__init__(_PaddedFile(fp), zlib.decompressobj,
super().__init__(_PaddedFile(fp), zlib._ZlibDecompressor,
wbits=-zlib.MAX_WBITS)
# Set flag indicating start of a new member
self._new_member = True
@ -494,12 +496,13 @@ class _GzipReader(_compression.DecompressReader):
self._new_member = False
# Read a chunk of data from the file
buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)
if self._decompressor.needs_input:
buf = self._fp.read(READ_BUFFER_SIZE)
uncompress = self._decompressor.decompress(buf, size)
else:
uncompress = self._decompressor.decompress(b"", size)
uncompress = self._decompressor.decompress(buf, size)
if self._decompressor.unconsumed_tail != b"":
self._fp.prepend(self._decompressor.unconsumed_tail)
elif self._decompressor.unused_data != b"":
if self._decompressor.unused_data != b"":
# Prepend the already read bytes to the fileobj so they can
# be seen by _read_eof() and _read_gzip_header()
self._fp.prepend(self._decompressor.unused_data)
@ -510,14 +513,11 @@ class _GzipReader(_compression.DecompressReader):
raise EOFError("Compressed file ended before the "
"end-of-stream marker was reached")
self._add_read_data( uncompress )
self._crc = zlib.crc32(uncompress, self._crc)
self._stream_size += len(uncompress)
self._pos += len(uncompress)
return uncompress
def _add_read_data(self, data):
self._crc = zlib.crc32(data, self._crc)
self._stream_size = self._stream_size + len(data)
def _read_eof(self):
# We've read to the end of the file
# We check that the computed CRC and size of the
@ -647,7 +647,7 @@ def main():
f = builtins.open(arg, "rb")
g = open(arg + ".gz", "wb")
while True:
chunk = f.read(io.DEFAULT_BUFFER_SIZE)
chunk = f.read(READ_BUFFER_SIZE)
if not chunk:
break
g.write(chunk)


@ -944,6 +944,173 @@ LAERTES
"""
class ZlibDecompressorTest():
# Test adopted from test_bz2.py
TEXT = HAMLET_SCENE
DATA = zlib.compress(HAMLET_SCENE)
BAD_DATA = b"Not a valid deflate block"
def test_Constructor(self):
self.assertRaises(TypeError, zlib._ZlibDecompressor, 42)
def testDecompress(self):
zlibd = zlib._ZlibDecompressor()
self.assertRaises(TypeError, zlibd.decompress)
text = zlibd.decompress(self.DATA)
self.assertEqual(text, self.TEXT)
def testDecompressChunks10(self):
zlibd = zlib._ZlibDecompressor()
text = b''
n = 0
while True:
str = self.DATA[n*10:(n+1)*10]
if not str:
break
text += zlibd.decompress(str)
n += 1
self.assertEqual(text, self.TEXT)
def testDecompressUnusedData(self):
zlibd = zlib._ZlibDecompressor()
unused_data = b"this is unused data"
text = zlibd.decompress(self.DATA+unused_data)
self.assertEqual(text, self.TEXT)
self.assertEqual(zlibd.unused_data, unused_data)
def testEOFError(self):
zlibd = zlib._ZlibDecompressor()
text = zlibd.decompress(self.DATA)
self.assertRaises(EOFError, zlibd.decompress, b"anything")
self.assertRaises(EOFError, zlibd.decompress, b"")
@support.skip_if_pgo_task
@bigmemtest(size=_4G + 100, memuse=3.3)
def testDecompress4G(self, size):
# "Test zlib._ZlibDecompressor.decompress() with >4GiB input"
blocksize = 10 * 1024 * 1024
block = random.randbytes(blocksize)
try:
data = block * (size // blocksize + 1)
compressed = zlib.compress(data)
zlibd = zlib._ZlibDecompressor()
decompressed = zlibd.decompress(compressed)
self.assertTrue(decompressed == data)
finally:
data = None
compressed = None
decompressed = None
def testPickle(self):
for proto in range(pickle.HIGHEST_PROTOCOL + 1):
with self.assertRaises(TypeError):
pickle.dumps(zlib._ZlibDecompressor(), proto)
def testDecompressorChunksMaxsize(self):
zlibd = zlib._ZlibDecompressor()
max_length = 100
out = []
# Feed some input
len_ = len(self.BIG_DATA) - 64
out.append(zlibd.decompress(self.BIG_DATA[:len_],
max_length=max_length))
self.assertFalse(zlibd.needs_input)
self.assertEqual(len(out[-1]), max_length)
# Retrieve more data without providing more input
out.append(zlibd.decompress(b'', max_length=max_length))
self.assertFalse(zlibd.needs_input)
self.assertEqual(len(out[-1]), max_length)
# Retrieve more data while providing more input
out.append(zlibd.decompress(self.BIG_DATA[len_:],
max_length=max_length))
self.assertLessEqual(len(out[-1]), max_length)
# Retrieve remaining uncompressed data
while not zlibd.eof:
out.append(zlibd.decompress(b'', max_length=max_length))
self.assertLessEqual(len(out[-1]), max_length)
out = b"".join(out)
self.assertEqual(out, self.BIG_TEXT)
self.assertEqual(zlibd.unused_data, b"")
def test_decompressor_inputbuf_1(self):
# Test reusing input buffer after moving existing
# contents to beginning
zlibd = zlib._ZlibDecompressor()
out = []
# Create input buffer and fill it
self.assertEqual(zlibd.decompress(self.DATA[:100],
max_length=0), b'')
# Retrieve some results, freeing capacity at beginning
# of input buffer
out.append(zlibd.decompress(b'', 2))
# Add more data that fits into input buffer after
# moving existing data to beginning
out.append(zlibd.decompress(self.DATA[100:105], 15))
# Decompress rest of data
out.append(zlibd.decompress(self.DATA[105:]))
self.assertEqual(b''.join(out), self.TEXT)
def test_decompressor_inputbuf_2(self):
# Test reusing input buffer by appending data at the
# end right away
zlibd = zlib._ZlibDecompressor()
out = []
# Create input buffer and empty it
self.assertEqual(zlibd.decompress(self.DATA[:200],
max_length=0), b'')
out.append(zlibd.decompress(b''))
# Fill buffer with new data
out.append(zlibd.decompress(self.DATA[200:280], 2))
# Append some more data, not enough to require resize
out.append(zlibd.decompress(self.DATA[280:300], 2))
# Decompress rest of data
out.append(zlibd.decompress(self.DATA[300:]))
self.assertEqual(b''.join(out), self.TEXT)
def test_decompressor_inputbuf_3(self):
# Test reusing input buffer after extending it
zlibd = zlib._ZlibDecompressor()
out = []
# Create almost full input buffer
out.append(zlibd.decompress(self.DATA[:200], 5))
# Add even more data to it, requiring resize
out.append(zlibd.decompress(self.DATA[200:300], 5))
# Decompress rest of data
out.append(zlibd.decompress(self.DATA[300:]))
self.assertEqual(b''.join(out), self.TEXT)
def test_failure(self):
zlibd = zlib._ZlibDecompressor()
self.assertRaises(Exception, zlibd.decompress, self.BAD_DATA * 30)
# Previously, a second call could crash due to internal inconsistency
self.assertRaises(Exception, zlibd.decompress, self.BAD_DATA * 30)
@support.refcount_test
def test_refleaks_in___init__(self):
gettotalrefcount = support.get_attribute(sys, 'gettotalrefcount')
zlibd = zlib._ZlibDecompressor()
refs_before = gettotalrefcount()
for i in range(100):
zlibd.__init__()
self.assertAlmostEqual(gettotalrefcount() - refs_before, 0, delta=10)
class CustomInt:
def __index__(self):
return 100


@ -0,0 +1 @@
:meth:`gzip.GzipFile.read` reads 10% faster.


@ -897,6 +897,104 @@ exit:
return return_value;
}
PyDoc_STRVAR(zlib_ZlibDecompressor_decompress__doc__,
"decompress($self, /, data, max_length=-1)\n"
"--\n"
"\n"
"Decompress *data*, returning uncompressed data as bytes.\n"
"\n"
"If *max_length* is nonnegative, returns at most *max_length* bytes of\n"
"decompressed data. If this limit is reached and further output can be\n"
"produced, *self.needs_input* will be set to ``False``. In this case, the next\n"
"call to *decompress()* may provide *data* as b\'\' to obtain more of the output.\n"
"\n"
"If all of the input data was decompressed and returned (either because this\n"
"was less than *max_length* bytes, or because *max_length* was negative),\n"
"*self.needs_input* will be set to True.\n"
"\n"
"Attempting to decompress data after the end of stream is reached raises an\n"
"EOFError. Any data found after the end of the stream is ignored and saved in\n"
"the unused_data attribute.");
#define ZLIB_ZLIBDECOMPRESSOR_DECOMPRESS_METHODDEF \
{"decompress", _PyCFunction_CAST(zlib_ZlibDecompressor_decompress), METH_FASTCALL|METH_KEYWORDS, zlib_ZlibDecompressor_decompress__doc__},
static PyObject *
zlib_ZlibDecompressor_decompress_impl(ZlibDecompressor *self,
Py_buffer *data, Py_ssize_t max_length);
static PyObject *
zlib_ZlibDecompressor_decompress(ZlibDecompressor *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
{
PyObject *return_value = NULL;
#if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
#define NUM_KEYWORDS 2
static struct {
PyGC_Head _this_is_not_used;
PyObject_VAR_HEAD
PyObject *ob_item[NUM_KEYWORDS];
} _kwtuple = {
.ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
.ob_item = { &_Py_ID(data), &_Py_ID(max_length), },
};
#undef NUM_KEYWORDS
#define KWTUPLE (&_kwtuple.ob_base.ob_base)
#else // !Py_BUILD_CORE
# define KWTUPLE NULL
#endif // !Py_BUILD_CORE
static const char * const _keywords[] = {"data", "max_length", NULL};
static _PyArg_Parser _parser = {
.keywords = _keywords,
.fname = "decompress",
.kwtuple = KWTUPLE,
};
#undef KWTUPLE
PyObject *argsbuf[2];
Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 1;
Py_buffer data = {NULL, NULL};
Py_ssize_t max_length = -1;
args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 1, 2, 0, argsbuf);
if (!args) {
goto exit;
}
if (PyObject_GetBuffer(args[0], &data, PyBUF_SIMPLE) != 0) {
goto exit;
}
if (!PyBuffer_IsContiguous(&data, 'C')) {
_PyArg_BadArgument("decompress", "argument 'data'", "contiguous buffer", args[0]);
goto exit;
}
if (!noptargs) {
goto skip_optional_pos;
}
{
Py_ssize_t ival = -1;
PyObject *iobj = _PyNumber_Index(args[1]);
if (iobj != NULL) {
ival = PyLong_AsSsize_t(iobj);
Py_DECREF(iobj);
}
if (ival == -1 && PyErr_Occurred()) {
goto exit;
}
max_length = ival;
}
skip_optional_pos:
return_value = zlib_ZlibDecompressor_decompress_impl(self, &data, max_length);
exit:
/* Cleanup for data */
if (data.obj) {
PyBuffer_Release(&data);
}
return return_value;
}
PyDoc_STRVAR(zlib_adler32__doc__,
"adler32($module, data, value=1, /)\n"
"--\n"
@ -1031,4 +1129,4 @@ exit:
#ifndef ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF
#define ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF
#endif /* !defined(ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF) */
/*[clinic end generated code: output=9e5f9911d0c273e1 input=a9049054013a1b77]*/
/*[clinic end generated code: output=57ff7b511ab23132 input=a9049054013a1b77]*/


@ -8,6 +8,11 @@
#include "Python.h"
#include "structmember.h" // PyMemberDef
#include "zlib.h"
#include "stdbool.h"
#if defined(ZLIB_VERNUM) && ZLIB_VERNUM < 0x1221
#error "At least zlib version 1.2.2.1 is required"
#endif
// Blocks output buffer wrappers
#include "pycore_blocks_output_buffer.h"
@ -171,9 +176,6 @@ OutputBuffer_WindowOnError(_BlocksOutputBuffer *buffer, _Uint32Window *window)
} } while (0)
#define LEAVE_ZLIB(obj) PyThread_release_lock((obj)->lock);
#if defined(ZLIB_VERNUM) && ZLIB_VERNUM >= 0x1221
# define AT_LEAST_ZLIB_1_2_2_1
#endif
/* The following parameters are copied from zutil.h, version 0.95 */
#define DEFLATED 8
@ -185,12 +187,14 @@ OutputBuffer_WindowOnError(_BlocksOutputBuffer *buffer, _Uint32Window *window)
/* Initial buffer size. */
#define DEF_BUF_SIZE (16*1024)
#define DEF_MAX_INITIAL_BUF_SIZE (16 * 1024 * 1024)
static PyModuleDef zlibmodule;
typedef struct {
PyTypeObject *Comptype;
PyTypeObject *Decomptype;
PyTypeObject *ZlibDecompressorType;
PyObject *ZlibError;
} zlibstate;
@ -209,7 +213,7 @@ typedef struct
PyObject *unused_data;
PyObject *unconsumed_tail;
char eof;
int is_initialised;
bool is_initialised;
PyObject *zdict;
PyThread_type_lock lock;
} compobject;
@ -320,7 +324,7 @@ static PyObject *
zlib_compress_impl(PyObject *module, Py_buffer *data, int level, int wbits)
/*[clinic end generated code: output=46bd152fadd66df2 input=c4d06ee5782a7e3f]*/
{
PyObject *RetVal;
PyObject *return_value;
int flush;
z_stream zst;
_BlocksOutputBuffer buffer = {.list = NULL};
@ -387,11 +391,11 @@ zlib_compress_impl(PyObject *module, Py_buffer *data, int level, int wbits)
err = deflateEnd(&zst);
if (err == Z_OK) {
RetVal = OutputBuffer_Finish(&buffer, zst.avail_out);
if (RetVal == NULL) {
return_value = OutputBuffer_Finish(&buffer, zst.avail_out);
if (return_value == NULL) {
goto error;
}
return RetVal;
return return_value;
}
else
zlib_error(state, zst, err, "while finishing compression");
@ -419,7 +423,7 @@ zlib_decompress_impl(PyObject *module, Py_buffer *data, int wbits,
Py_ssize_t bufsize)
/*[clinic end generated code: output=77c7e35111dc8c42 input=a9ac17beff1f893f]*/
{
PyObject *RetVal;
PyObject *return_value;
Byte *ibuf;
Py_ssize_t ibuflen;
int err, flush;
@ -514,9 +518,9 @@ zlib_decompress_impl(PyObject *module, Py_buffer *data, int wbits,
goto error;
}
RetVal = OutputBuffer_WindowFinish(&buffer, &window, zst.avail_out);
if (RetVal != NULL) {
return RetVal;
return_value = OutputBuffer_WindowFinish(&buffer, &window, zst.avail_out);
if (return_value != NULL) {
return return_value;
}
error:
@ -675,18 +679,10 @@ zlib_decompressobj_impl(PyObject *module, int wbits, PyObject *zdict)
case Z_OK:
self->is_initialised = 1;
if (self->zdict != NULL && wbits < 0) {
#ifdef AT_LEAST_ZLIB_1_2_2_1
if (set_inflate_zdict(state, self) < 0) {
Py_DECREF(self);
return NULL;
}
#else
PyErr_Format(state->ZlibError,
"zlib version %s does not allow raw inflate with dictionary",
ZLIB_VERSION);
Py_DECREF(self);
return NULL;
#endif
}
return (PyObject *)self;
case Z_STREAM_ERROR:
@ -753,7 +749,7 @@ zlib_Compress_compress_impl(compobject *self, PyTypeObject *cls,
Py_buffer *data)
/*[clinic end generated code: output=6731b3f0ff357ca6 input=04d00f65ab01d260]*/
{
PyObject *RetVal;
PyObject *return_value;
int err;
_BlocksOutputBuffer buffer = {.list = NULL};
zlibstate *state = PyType_GetModuleState(cls);
@ -791,17 +787,17 @@ zlib_Compress_compress_impl(compobject *self, PyTypeObject *cls,
} while (ibuflen != 0);
RetVal = OutputBuffer_Finish(&buffer, self->zst.avail_out);
if (RetVal != NULL) {
return_value = OutputBuffer_Finish(&buffer, self->zst.avail_out);
if (return_value != NULL) {
goto success;
}
error:
OutputBuffer_OnError(&buffer);
RetVal = NULL;
return_value = NULL;
success:
LEAVE_ZLIB(self);
return RetVal;
return return_value;
}
/* Helper for objdecompress() and flush(). Saves any unconsumed input data in
@ -875,7 +871,7 @@ zlib_Decompress_decompress_impl(compobject *self, PyTypeObject *cls,
{
int err = Z_OK;
Py_ssize_t ibuflen;
PyObject *RetVal;
PyObject *return_value;
_BlocksOutputBuffer buffer = {.list = NULL};
PyObject *module = PyType_GetModule(cls);
@ -953,17 +949,17 @@ zlib_Decompress_decompress_impl(compobject *self, PyTypeObject *cls,
goto abort;
}
RetVal = OutputBuffer_Finish(&buffer, self->zst.avail_out);
if (RetVal != NULL) {
return_value = OutputBuffer_Finish(&buffer, self->zst.avail_out);
if (return_value != NULL) {
goto success;
}
abort:
OutputBuffer_OnError(&buffer);
RetVal = NULL;
return_value = NULL;
success:
LEAVE_ZLIB(self);
return RetVal;
return return_value;
}
/*[clinic input]
@ -985,7 +981,7 @@ zlib_Compress_flush_impl(compobject *self, PyTypeObject *cls, int mode)
/*[clinic end generated code: output=c7efd13efd62add2 input=286146e29442eb6c]*/
{
int err;
PyObject *RetVal;
PyObject *return_value;
_BlocksOutputBuffer buffer = {.list = NULL};
zlibstate *state = PyType_GetModuleState(cls);
@ -1042,17 +1038,17 @@ zlib_Compress_flush_impl(compobject *self, PyTypeObject *cls, int mode)
goto error;
}
RetVal = OutputBuffer_Finish(&buffer, self->zst.avail_out);
if (RetVal != NULL) {
return_value = OutputBuffer_Finish(&buffer, self->zst.avail_out);
if (return_value != NULL) {
goto success;
}
error:
OutputBuffer_OnError(&buffer);
RetVal = NULL;
return_value = NULL;
success:
LEAVE_ZLIB(self);
return RetVal;
return return_value;
}
#ifdef HAVE_ZLIB_COPY
@ -1071,14 +1067,14 @@ zlib_Compress_copy_impl(compobject *self, PyTypeObject *cls)
{
zlibstate *state = PyType_GetModuleState(cls);
compobject *retval = newcompobject(state->Comptype);
if (!retval) return NULL;
compobject *return_value = newcompobject(state->Comptype);
if (!return_value) return NULL;
/* Copy the zstream state
* We use ENTER_ZLIB / LEAVE_ZLIB to make this thread-safe
*/
ENTER_ZLIB(self);
int err = deflateCopy(&retval->zst, &self->zst);
int err = deflateCopy(&return_value->zst, &self->zst);
switch (err) {
case Z_OK:
break;
@ -1094,22 +1090,22 @@ zlib_Compress_copy_impl(compobject *self, PyTypeObject *cls)
goto error;
}
Py_INCREF(self->unused_data);
Py_XSETREF(retval->unused_data, self->unused_data);
Py_XSETREF(return_value->unused_data, self->unused_data);
Py_INCREF(self->unconsumed_tail);
Py_XSETREF(retval->unconsumed_tail, self->unconsumed_tail);
Py_XSETREF(return_value->unconsumed_tail, self->unconsumed_tail);
Py_XINCREF(self->zdict);
Py_XSETREF(retval->zdict, self->zdict);
retval->eof = self->eof;
Py_XSETREF(return_value->zdict, self->zdict);
return_value->eof = self->eof;
/* Mark it as being initialized */
retval->is_initialised = 1;
return_value->is_initialised = 1;
LEAVE_ZLIB(self);
return (PyObject *)retval;
return (PyObject *)return_value;
error:
LEAVE_ZLIB(self);
Py_XDECREF(retval);
Py_XDECREF(return_value);
return NULL;
}
@ -1158,14 +1154,14 @@ zlib_Decompress_copy_impl(compobject *self, PyTypeObject *cls)
{
zlibstate *state = PyType_GetModuleState(cls);
compobject *retval = newcompobject(state->Decomptype);
if (!retval) return NULL;
compobject *return_value = newcompobject(state->Decomptype);
if (!return_value) return NULL;
/* Copy the zstream state
* We use ENTER_ZLIB / LEAVE_ZLIB to make this thread-safe
*/
ENTER_ZLIB(self);
int err = inflateCopy(&retval->zst, &self->zst);
int err = inflateCopy(&return_value->zst, &self->zst);
switch (err) {
case Z_OK:
break;
@ -1182,22 +1178,22 @@ zlib_Decompress_copy_impl(compobject *self, PyTypeObject *cls)
}
Py_INCREF(self->unused_data);
Py_XSETREF(retval->unused_data, self->unused_data);
Py_XSETREF(return_value->unused_data, self->unused_data);
Py_INCREF(self->unconsumed_tail);
Py_XSETREF(retval->unconsumed_tail, self->unconsumed_tail);
Py_XSETREF(return_value->unconsumed_tail, self->unconsumed_tail);
Py_XINCREF(self->zdict);
Py_XSETREF(retval->zdict, self->zdict);
retval->eof = self->eof;
Py_XSETREF(return_value->zdict, self->zdict);
return_value->eof = self->eof;
/* Mark it as being initialized */
retval->is_initialised = 1;
return_value->is_initialised = 1;
LEAVE_ZLIB(self);
return (PyObject *)retval;
return (PyObject *)return_value;
error:
LEAVE_ZLIB(self);
Py_XDECREF(retval);
Py_XDECREF(return_value);
return NULL;
}
@ -1252,7 +1248,7 @@ zlib_Decompress_flush_impl(compobject *self, PyTypeObject *cls,
{
int err, flush;
Py_buffer data;
PyObject *RetVal;
PyObject *return_value;
Py_ssize_t ibuflen;
_BlocksOutputBuffer buffer = {.list = NULL};
_Uint32Window window; // output buffer's UINT32_MAX sliding window
@ -1306,13 +1302,6 @@ zlib_Decompress_flush_impl(compobject *self, PyTypeObject *cls,
case Z_STREAM_END:
break;
default:
if (err == Z_NEED_DICT && self->zdict != NULL) {
if (set_inflate_zdict(state, self) < 0) {
goto abort;
}
else
break;
}
goto save;
}
@ -1336,18 +1325,475 @@ zlib_Decompress_flush_impl(compobject *self, PyTypeObject *cls,
}
}
RetVal = OutputBuffer_WindowFinish(&buffer, &window, self->zst.avail_out);
if (RetVal != NULL) {
return_value = OutputBuffer_WindowFinish(&buffer, &window, self->zst.avail_out);
if (return_value != NULL) {
goto success;
}
abort:
OutputBuffer_WindowOnError(&buffer, &window);
RetVal = NULL;
return_value = NULL;
success:
PyBuffer_Release(&data);
LEAVE_ZLIB(self);
return RetVal;
return return_value;
}
typedef struct {
PyObject_HEAD
z_stream zst;
PyObject *zdict;
PyThread_type_lock lock;
PyObject *unused_data;
uint8_t *input_buffer;
Py_ssize_t input_buffer_size;
/* zst.avail_in is only 32 bit, so we store the true length
separately. Conversion and looping is encapsulated in
decompress_buf() */
Py_ssize_t avail_in_real;
bool is_initialised;
char eof; /* T_BOOL expects a char */
char needs_input;
} ZlibDecompressor;
/*[clinic input]
class zlib.ZlibDecompressor "ZlibDecompressor *" "&ZlibDecompressorType"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=0658178ab94645df]*/
static void
ZlibDecompressor_dealloc(ZlibDecompressor *self)
{
PyObject *type = (PyObject *)Py_TYPE(self);
PyThread_free_lock(self->lock);
if (self->is_initialised) {
inflateEnd(&self->zst);
}
PyMem_Free(self->input_buffer);
Py_CLEAR(self->unused_data);
Py_CLEAR(self->zdict);
PyObject_Free(self);
Py_DECREF(type);
}
static int
set_inflate_zdict_ZlibDecompressor(zlibstate *state, ZlibDecompressor *self)
{
Py_buffer zdict_buf;
if (PyObject_GetBuffer(self->zdict, &zdict_buf, PyBUF_SIMPLE) == -1) {
return -1;
}
if ((size_t)zdict_buf.len > UINT_MAX) {
PyErr_SetString(PyExc_OverflowError,
"zdict length does not fit in an unsigned int");
PyBuffer_Release(&zdict_buf);
return -1;
}
int err;
err = inflateSetDictionary(&self->zst,
zdict_buf.buf, (unsigned int)zdict_buf.len);
PyBuffer_Release(&zdict_buf);
if (err != Z_OK) {
zlib_error(state, self->zst, err, "while setting zdict");
return -1;
}
return 0;
}
static Py_ssize_t
arrange_output_buffer_with_maximum(uint32_t *avail_out,
uint8_t **next_out,
PyObject **buffer,
Py_ssize_t length,
Py_ssize_t max_length)
{
Py_ssize_t occupied;
if (*buffer == NULL) {
if (!(*buffer = PyBytes_FromStringAndSize(NULL, length)))
return -1;
occupied = 0;
}
else {
occupied = *next_out - (uint8_t *)PyBytes_AS_STRING(*buffer);
if (length == occupied) {
Py_ssize_t new_length;
assert(length <= max_length);
/* can not scale the buffer over max_length */
if (length == max_length)
return -2;
if (length <= (max_length >> 1))
new_length = length << 1;
else
new_length = max_length;
if (_PyBytes_Resize(buffer, new_length) < 0)
return -1;
length = new_length;
}
}
*avail_out = (uint32_t)Py_MIN((size_t)(length - occupied), UINT32_MAX);
*next_out = (uint8_t *)PyBytes_AS_STRING(*buffer) + occupied;
return length;
}
static inline Py_ssize_t
arrange_output_buffer(uint32_t *avail_out,
uint8_t **next_out,
PyObject **buffer,
Py_ssize_t length)
{
Py_ssize_t ret;
ret = arrange_output_buffer_with_maximum(avail_out, next_out, buffer,
length,
PY_SSIZE_T_MAX);
if (ret == -2)
PyErr_NoMemory();
return ret;
}
/* Decompress data of length self->avail_in_real in self->state.next_in. The
output buffer is allocated dynamically and returned. If the max_length is
of sufficiently low size, max_length is allocated immediately. At most
max_length bytes are returned, so some of the input may not be consumed.
self->state.next_in and self->avail_in_real are updated to reflect the
consumed input. */
static PyObject*
decompress_buf(ZlibDecompressor *self, Py_ssize_t max_length)
{
/* data_size is strictly positive, but because we repeatedly have to
compare against max_length and PyBytes_GET_SIZE we declare it as
signed */
PyObject *return_value = NULL;
Py_ssize_t hard_limit;
Py_ssize_t obuflen;
zlibstate *state = PyType_GetModuleState(Py_TYPE(self));
int err = Z_OK;
/* When sys.maxsize is passed as default use DEF_BUF_SIZE as start buffer.
In this particular case the data may not necessarily be very big, so
it is better to grow dynamically.*/
if ((max_length < 0) || max_length == PY_SSIZE_T_MAX) {
hard_limit = PY_SSIZE_T_MAX;
obuflen = DEF_BUF_SIZE;
} else {
/* Assume that decompressor is used in file decompression with a fixed
block size of max_length. In that case we will reach max_length almost
always (except at the end of the file). So it makes sense to allocate
max_length. */
hard_limit = max_length;
obuflen = max_length;
if (obuflen > DEF_MAX_INITIAL_BUF_SIZE){
// Safeguard against memory overflow.
obuflen = DEF_MAX_INITIAL_BUF_SIZE;
}
}
do {
arrange_input_buffer(&(self->zst), &(self->avail_in_real));
do {
obuflen = arrange_output_buffer_with_maximum(&(self->zst.avail_out),
&(self->zst.next_out),
&return_value,
obuflen,
hard_limit);
if (obuflen == -1){
PyErr_SetString(PyExc_MemoryError,
"Insufficient memory for buffer allocation");
goto error;
}
else if (obuflen == -2) {
break;
}
Py_BEGIN_ALLOW_THREADS
err = inflate(&self->zst, Z_SYNC_FLUSH);
Py_END_ALLOW_THREADS
switch (err) {
case Z_OK: /* fall through */
case Z_BUF_ERROR: /* fall through */
case Z_STREAM_END:
break;
default:
if (err == Z_NEED_DICT) {
goto error;
}
else {
break;
}
}
} while (self->zst.avail_out == 0);
} while(err != Z_STREAM_END && self->avail_in_real != 0);
if (err == Z_STREAM_END) {
self->eof = 1;
self->is_initialised = 0;
/* Unlike the Decompress object we call inflateEnd here as there are no
backwards compatibility issues */
err = inflateEnd(&self->zst);
if (err != Z_OK) {
zlib_error(state, self->zst, err, "while finishing decompression");
goto error;
}
} else if (err != Z_OK && err != Z_BUF_ERROR) {
zlib_error(state, self->zst, err, "while decompressing data");
}
self->avail_in_real += self->zst.avail_in;
if (_PyBytes_Resize(&return_value, self->zst.next_out -
(uint8_t *)PyBytes_AS_STRING(return_value)) != 0) {
goto error;
}
goto success;
error:
Py_CLEAR(return_value);
success:
return return_value;
}
static PyObject *
decompress(ZlibDecompressor *self, uint8_t *data,
size_t len, Py_ssize_t max_length)
{
bool input_buffer_in_use;
PyObject *result;
/* Prepend unconsumed input if necessary */
if (self->zst.next_in != NULL) {
size_t avail_now, avail_total;
/* Number of bytes we can append to input buffer */
avail_now = (self->input_buffer + self->input_buffer_size)
- (self->zst.next_in + self->avail_in_real);
/* Number of bytes we can append if we move existing
contents to beginning of buffer (overwriting
consumed input) */
avail_total = self->input_buffer_size - self->avail_in_real;
if (avail_total < len) {
size_t offset = self->zst.next_in - self->input_buffer;
uint8_t *tmp;
size_t new_size = self->input_buffer_size + len - avail_now;
/* Assign to temporary variable first, so we don't
lose address of allocated buffer if realloc fails */
tmp = PyMem_Realloc(self->input_buffer, new_size);
if (tmp == NULL) {
PyErr_SetNone(PyExc_MemoryError);
return NULL;
}
self->input_buffer = tmp;
self->input_buffer_size = new_size;
self->zst.next_in = self->input_buffer + offset;
}
else if (avail_now < len) {
memmove(self->input_buffer, self->zst.next_in,
self->avail_in_real);
self->zst.next_in = self->input_buffer;
}
memcpy((void*)(self->zst.next_in + self->avail_in_real), data, len);
self->avail_in_real += len;
input_buffer_in_use = 1;
}
else {
self->zst.next_in = data;
self->avail_in_real = len;
input_buffer_in_use = 0;
}
result = decompress_buf(self, max_length);
if(result == NULL) {
self->zst.next_in = NULL;
return NULL;
}
if (self->eof) {
self->needs_input = 0;
if (self->avail_in_real > 0) {
PyObject *unused_data = PyBytes_FromStringAndSize(
(char *)self->zst.next_in, self->avail_in_real);
if (unused_data == NULL) {
goto error;
}
Py_XSETREF(self->unused_data, unused_data);
}
}
else if (self->avail_in_real == 0) {
self->zst.next_in = NULL;
self->needs_input = 1;
}
else {
self->needs_input = 0;
/* If we did not use the input buffer, we now have
to copy the tail from the caller's buffer into the
input buffer */
if (!input_buffer_in_use) {
/* Discard buffer if it's too small
(resizing it may needlessly copy the current contents) */
if (self->input_buffer != NULL &&
self->input_buffer_size < self->avail_in_real) {
PyMem_Free(self->input_buffer);
self->input_buffer = NULL;
}
/* Allocate if necessary */
if (self->input_buffer == NULL) {
self->input_buffer = PyMem_Malloc(self->avail_in_real);
if (self->input_buffer == NULL) {
PyErr_SetNone(PyExc_MemoryError);
goto error;
}
self->input_buffer_size = self->avail_in_real;
}
/* Copy tail */
memcpy(self->input_buffer, self->zst.next_in, self->avail_in_real);
self->zst.next_in = self->input_buffer;
}
}
return result;
error:
Py_XDECREF(result);
return NULL;
}
/*[clinic input]
zlib.ZlibDecompressor.decompress
data: Py_buffer
max_length: Py_ssize_t=-1
Decompress *data*, returning uncompressed data as bytes.
If *max_length* is nonnegative, returns at most *max_length* bytes of
decompressed data. If this limit is reached and further output can be
produced, *self.needs_input* will be set to ``False``. In this case, the next
call to *decompress()* may provide *data* as b'' to obtain more of the output.
If all of the input data was decompressed and returned (either because this
was less than *max_length* bytes, or because *max_length* was negative),
*self.needs_input* will be set to True.
Attempting to decompress data after the end of stream is reached raises an
EOFError. Any data found after the end of the stream is ignored and saved in
the unused_data attribute.
[clinic start generated code]*/
static PyObject *
zlib_ZlibDecompressor_decompress_impl(ZlibDecompressor *self,
Py_buffer *data, Py_ssize_t max_length)
/*[clinic end generated code: output=990d32787b775f85 input=0b29d99715250b96]*/
{
PyObject *result = NULL;
ENTER_ZLIB(self);
if (self->eof) {
PyErr_SetString(PyExc_EOFError, "End of stream already reached");
}
else {
result = decompress(self, data->buf, data->len, max_length);
}
LEAVE_ZLIB(self);
return result;
}
PyDoc_STRVAR(ZlibDecompressor__new____doc__,
"_ZlibDecompressor(wbits=15, zdict=b\'\')\n"
"--\n"
"\n"
"Create a decompressor object for decompressing data incrementally.\n"
"\n"
" wbits = 15\n"
" zdict\n"
" The predefined compression dictionary. This is a sequence of bytes\n"
" (such as a bytes object) containing subsequences that are expected\n"
" to occur frequently in the data that is to be compressed. Those\n"
" subsequences that are expected to be most common should come at the\n"
" end of the dictionary. This must be the same dictionary as used by the\n"
" compressor that produced the input data.\n"
"\n");
static PyObject *
ZlibDecompressor__new__(PyTypeObject *cls,
PyObject *args,
PyObject *kwargs)
{
static char *keywords[] = {"wbits", "zdict", NULL};
static char *format = "|iO:_ZlibDecompressor";
int wbits = MAX_WBITS;
PyObject *zdict = NULL;
zlibstate *state = PyType_GetModuleState(cls);
if (!PyArg_ParseTupleAndKeywords(
args, kwargs, format, keywords, &wbits, &zdict)) {
return NULL;
}
ZlibDecompressor *self = PyObject_New(ZlibDecompressor, cls);
self->eof = 0;
self->needs_input = 1;
self->avail_in_real = 0;
self->input_buffer = NULL;
self->input_buffer_size = 0;
if (zdict != NULL) {
Py_INCREF(zdict);
}
self->zdict = zdict;
self->zst.opaque = NULL;
self->zst.zalloc = PyZlib_Malloc;
self->zst.zfree = PyZlib_Free;
self->zst.next_in = NULL;
self->zst.avail_in = 0;
self->unused_data = PyBytes_FromStringAndSize(NULL, 0);
if (self->unused_data == NULL) {
Py_CLEAR(self);
return NULL;
}
self->lock = PyThread_allocate_lock();
if (self->lock == NULL) {
Py_DECREF(self);
PyErr_SetString(PyExc_MemoryError, "Unable to allocate lock");
return NULL;
}
int err = inflateInit2(&(self->zst), wbits);
switch (err) {
case Z_OK:
self->is_initialised = 1;
if (self->zdict != NULL && wbits < 0) {
if (set_inflate_zdict_ZlibDecompressor(state, self) < 0) {
Py_DECREF(self);
return NULL;
}
}
return (PyObject *)self;
case Z_STREAM_ERROR:
Py_DECREF(self);
PyErr_SetString(PyExc_ValueError, "Invalid initialization option");
return NULL;
case Z_MEM_ERROR:
Py_DECREF(self);
PyErr_SetString(PyExc_MemoryError,
"Can't allocate memory for decompression object");
return NULL;
default:
zlib_error(state, self->zst, err, "while creating decompression object");
Py_DECREF(self);
return NULL;
}
}
#include "clinic/zlibmodule.c.h"
@ -1372,6 +1818,11 @@ static PyMethodDef Decomp_methods[] =
{NULL, NULL}
};
static PyMethodDef ZlibDecompressor_methods[] = {
ZLIB_ZLIBDECOMPRESSOR_DECOMPRESS_METHODDEF
{NULL}
};
#define COMP_OFF(x) offsetof(compobject, x)
static PyMemberDef Decomp_members[] = {
{"unused_data", T_OBJECT, COMP_OFF(unused_data), READONLY},
@ -1380,6 +1831,26 @@ static PyMemberDef Decomp_members[] = {
{NULL},
};
PyDoc_STRVAR(ZlibDecompressor_eof__doc__,
"True if the end-of-stream marker has been reached.");
PyDoc_STRVAR(ZlibDecompressor_unused_data__doc__,
"Data found after the end of the compressed stream.");
PyDoc_STRVAR(ZlibDecompressor_needs_input_doc,
"True if more input is needed before more decompressed data can be produced.");
static PyMemberDef ZlibDecompressor_members[] = {
{"eof", T_BOOL, offsetof(ZlibDecompressor, eof),
READONLY, ZlibDecompressor_eof__doc__},
{"unused_data", T_OBJECT_EX, offsetof(ZlibDecompressor, unused_data),
READONLY, ZlibDecompressor_unused_data__doc__},
{"needs_input", T_BOOL, offsetof(ZlibDecompressor, needs_input), READONLY,
ZlibDecompressor_needs_input_doc},
{NULL},
};
/*[clinic input]
zlib.adler32
@ -1497,6 +1968,25 @@ static PyType_Spec Decomptype_spec = {
.slots = Decomptype_slots,
};
static PyType_Slot ZlibDecompressor_type_slots[] = {
{Py_tp_dealloc, ZlibDecompressor_dealloc},
{Py_tp_members, ZlibDecompressor_members},
{Py_tp_new, ZlibDecompressor__new__},
{Py_tp_doc, (char *)ZlibDecompressor__new____doc__},
{Py_tp_methods, ZlibDecompressor_methods},
{0, 0},
};
static PyType_Spec ZlibDecompressor_type_spec = {
.name = "zlib._ZlibDecompressor",
.basicsize = sizeof(ZlibDecompressor),
// Calling PyType_GetModuleState() on a subclass is not safe.
// ZlibDecompressor_type_spec does not have Py_TPFLAGS_BASETYPE flag
// which prevents to create a subclass.
// So calling PyType_GetModuleState() in this file is always safe.
.flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
.slots = ZlibDecompressor_type_slots,
};
PyDoc_STRVAR(zlib_module_documentation,
"The functions in this module allow compression and decompression using the\n"
"zlib library, which is based on GNU zip.\n"
@ -1518,6 +2008,7 @@ zlib_clear(PyObject *mod)
zlibstate *state = get_zlib_state(mod);
Py_CLEAR(state->Comptype);
Py_CLEAR(state->Decomptype);
Py_CLEAR(state->ZlibDecompressorType);
Py_CLEAR(state->ZlibError);
return 0;
}
@ -1528,6 +2019,7 @@ zlib_traverse(PyObject *mod, visitproc visit, void *arg)
zlibstate *state = get_zlib_state(mod);
Py_VISIT(state->Comptype);
Py_VISIT(state->Decomptype);
Py_VISIT(state->ZlibDecompressorType);
Py_VISIT(state->ZlibError);
return 0;
}
@ -1555,6 +2047,12 @@ zlib_exec(PyObject *mod)
return -1;
}
state->ZlibDecompressorType = (PyTypeObject *)PyType_FromModuleAndSpec(
mod, &ZlibDecompressor_type_spec, NULL);
if (state->ZlibDecompressorType == NULL) {
return -1;
}
state->ZlibError = PyErr_NewException("zlib.error", NULL, NULL);
if (state->ZlibError == NULL) {
return -1;
@ -1565,6 +2063,12 @@ zlib_exec(PyObject *mod)
Py_DECREF(state->ZlibError);
return -1;
}
Py_INCREF(state->ZlibDecompressorType);
if (PyModule_AddObject(mod, "_ZlibDecompressor",
(PyObject *)state->ZlibDecompressorType) < 0) {
Py_DECREF(state->ZlibDecompressorType);
return -1;
}
#define ZLIB_ADD_INT_MACRO(c) \
do { \