From 9af485436b83003b5705a6e54bdeb900c70e0c69 Mon Sep 17 00:00:00 2001 From: Arjun Date: Mon, 8 May 2023 10:55:59 -0700 Subject: [PATCH] gh-89550: Buffer GzipFile.write to reduce execution time by ~15% (#101251) Use `io.BufferedWriter` to buffer gzip writes. --------- Co-authored-by: Alex Waygood Co-authored-by: Gregory P. Smith --- Lib/gzip.py | 40 ++++++++++++++++--- ...3-01-22-14-53-12.gh-issue-89550.c1U23f.rst | 2 + 2 files changed, 37 insertions(+), 5 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2023-01-22-14-53-12.gh-issue-89550.c1U23f.rst diff --git a/Lib/gzip.py b/Lib/gzip.py index 75c6ddc3f2c..8796c8d9fd9 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -22,6 +22,7 @@ _COMPRESS_LEVEL_TRADEOFF = 6 _COMPRESS_LEVEL_BEST = 9 READ_BUFFER_SIZE = 128 * 1024 +_WRITE_BUFFER_SIZE = 4 * io.DEFAULT_BUFFER_SIZE def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST, @@ -120,6 +121,21 @@ class BadGzipFile(OSError): """Exception raised in some cases for invalid gzip files.""" +class _WriteBufferStream(io.RawIOBase): + """Minimal object to pass WriteBuffer flushes into GzipFile""" + def __init__(self, gzip_file): + self.gzip_file = gzip_file + + def write(self, data): + return self.gzip_file._write_raw(data) + + def seekable(self): + return False + + def writable(self): + return True + + class GzipFile(_compression.BaseStream): """The GzipFile class simulates most of the methods of a file object with the exception of the truncate() method. @@ -184,6 +200,7 @@ class GzipFile(_compression.BaseStream): if mode is None: mode = getattr(fileobj, 'mode', 'rb') + if mode.startswith('r'): self.mode = READ raw = _GzipReader(fileobj) @@ -206,6 +223,9 @@ class GzipFile(_compression.BaseStream): zlib.DEF_MEM_LEVEL, 0) self._write_mtime = mtime + self._buffer_size = _WRITE_BUFFER_SIZE + self._buffer = io.BufferedWriter(_WriteBufferStream(self), + buffer_size=self._buffer_size) else: raise ValueError("Invalid mode: {!r}".format(mode)) @@ -231,6 +251,11 @@ class GzipFile(_compression.BaseStream): self.bufsize = 0 self.offset = 0 # Current file offset for seek(), tell(), etc + def tell(self): + self._check_not_closed() + self._buffer.flush() + return super().tell() + def _write_gzip_header(self, compresslevel): self.fileobj.write(b'\037\213') # magic header self.fileobj.write(b'\010') # compression method @@ -272,6 +297,10 @@ class GzipFile(_compression.BaseStream): if self.fileobj is None: raise ValueError("write() on closed GzipFile object") + return self._buffer.write(data) + + def _write_raw(self, data): + # Called by our self._buffer underlying WriteBufferStream. if isinstance(data, (bytes, bytearray)): length = len(data) else: @@ -322,9 +351,9 @@ class GzipFile(_compression.BaseStream): fileobj = self.fileobj if fileobj is None: return - self.fileobj = None try: if self.mode == WRITE: + self._buffer.flush() fileobj.write(self.compress.flush()) write32u(fileobj, self.crc) # self.size may exceed 2 GiB, or even 4 GiB @@ -332,6 +361,7 @@ class GzipFile(_compression.BaseStream): elif self.mode == READ: self._buffer.close() finally: + self.fileobj = None myfileobj = self.myfileobj if myfileobj: self.myfileobj = None @@ -341,7 +371,7 @@ class GzipFile(_compression.BaseStream): self._check_not_closed() if self.mode == WRITE: # Ensure the compressor's buffer is flushed - self.fileobj.write(self.compress.flush(zlib_mode)) + self._buffer.flush() self.fileobj.flush() def fileno(self): @@ -378,10 +408,10 @@ class GzipFile(_compression.BaseStream): if offset < self.offset: raise OSError('Negative seek in write mode') count = offset - self.offset - chunk = b'\0' * 1024 - for i in range(count // 1024): + chunk = b'\0' * self._buffer_size + for i in range(count // self._buffer_size): self.write(chunk) - self.write(b'\0' * (count % 1024)) + self.write(b'\0' * (count % self._buffer_size)) elif self.mode == READ: self._check_not_closed() return self._buffer.seek(offset, whence) diff --git a/Misc/NEWS.d/next/Library/2023-01-22-14-53-12.gh-issue-89550.c1U23f.rst b/Misc/NEWS.d/next/Library/2023-01-22-14-53-12.gh-issue-89550.c1U23f.rst new file mode 100644 index 00000000000..556db0eae00 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-01-22-14-53-12.gh-issue-89550.c1U23f.rst @@ -0,0 +1,2 @@ +Decrease execution time of some :mod:`gzip` file writes by 15% by +adding more appropriate buffering.