mirror of https://github.com/python/cpython
gh-89550: Buffer GzipFile.write to reduce execution time by ~15% (#101251)
Use `io.BufferedWriter` to buffer gzip writes.

Co-authored-by: Alex Waygood <Alex.Waygood@Gmail.com>
Co-authored-by: Gregory P. Smith <greg@krypto.org>
This commit is contained in:
parent
405eacc1b8
commit
9af485436b
40
Lib/gzip.py
40
Lib/gzip.py
|
@ -22,6 +22,7 @@ _COMPRESS_LEVEL_TRADEOFF = 6
|
||||||
_COMPRESS_LEVEL_BEST = 9
|
_COMPRESS_LEVEL_BEST = 9
|
||||||
|
|
||||||
READ_BUFFER_SIZE = 128 * 1024
|
READ_BUFFER_SIZE = 128 * 1024
|
||||||
|
_WRITE_BUFFER_SIZE = 4 * io.DEFAULT_BUFFER_SIZE
|
||||||
|
|
||||||
|
|
||||||
def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
|
def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
|
||||||
|
@ -120,6 +121,21 @@ class BadGzipFile(OSError):
|
||||||
"""Exception raised in some cases for invalid gzip files."""
|
"""Exception raised in some cases for invalid gzip files."""
|
||||||
|
|
||||||
|
|
||||||
|
class _WriteBufferStream(io.RawIOBase):
    """Minimal raw stream that passes BufferedWriter flushes into a GzipFile.

    An instance is meant to be wrapped in an ``io.BufferedWriter``; each time
    that buffer is flushed, the buffered bytes arrive in :meth:`write` and are
    forwarded to the owning object's ``_write_raw`` method.

    Only a weak reference to the owner is kept.  The owner holds the
    BufferedWriter, which holds this stream, so a strong back-reference
    would close a reference cycle and keep an unclosed owner alive until
    the cyclic garbage collector runs.
    """

    def __init__(self, gzip_file):
        """Remember *gzip_file* (weakly) as the target of all writes."""
        # Function-scope import so the block does not depend on the
        # module-level import list, which is not visible here.
        import weakref

        self._gzip_file_ref = weakref.ref(gzip_file)

    @property
    def gzip_file(self):
        """The owning object, or None once it has been garbage collected."""
        return self._gzip_file_ref()

    def write(self, data):
        """Forward *data* to the owner's ``_write_raw`` and return its result.

        Raises RuntimeError if the owner no longer exists, instead of failing
        with an AttributeError on None.
        """
        gzip_file = self._gzip_file_ref()
        if gzip_file is None:
            raise RuntimeError("lost gzip_file")
        return gzip_file._write_raw(data)

    def seekable(self):
        """Report that this stream does not support seeking."""
        return False

    def writable(self):
        """Report that this stream accepts writes."""
        return True
|
||||||
|
|
||||||
|
|
||||||
class GzipFile(_compression.BaseStream):
|
class GzipFile(_compression.BaseStream):
|
||||||
"""The GzipFile class simulates most of the methods of a file object with
|
"""The GzipFile class simulates most of the methods of a file object with
|
||||||
the exception of the truncate() method.
|
the exception of the truncate() method.
|
||||||
|
@ -184,6 +200,7 @@ class GzipFile(_compression.BaseStream):
|
||||||
if mode is None:
|
if mode is None:
|
||||||
mode = getattr(fileobj, 'mode', 'rb')
|
mode = getattr(fileobj, 'mode', 'rb')
|
||||||
|
|
||||||
|
|
||||||
if mode.startswith('r'):
|
if mode.startswith('r'):
|
||||||
self.mode = READ
|
self.mode = READ
|
||||||
raw = _GzipReader(fileobj)
|
raw = _GzipReader(fileobj)
|
||||||
|
@ -206,6 +223,9 @@ class GzipFile(_compression.BaseStream):
|
||||||
zlib.DEF_MEM_LEVEL,
|
zlib.DEF_MEM_LEVEL,
|
||||||
0)
|
0)
|
||||||
self._write_mtime = mtime
|
self._write_mtime = mtime
|
||||||
|
self._buffer_size = _WRITE_BUFFER_SIZE
|
||||||
|
self._buffer = io.BufferedWriter(_WriteBufferStream(self),
|
||||||
|
buffer_size=self._buffer_size)
|
||||||
else:
|
else:
|
||||||
raise ValueError("Invalid mode: {!r}".format(mode))
|
raise ValueError("Invalid mode: {!r}".format(mode))
|
||||||
|
|
||||||
|
@ -231,6 +251,11 @@ class GzipFile(_compression.BaseStream):
|
||||||
self.bufsize = 0
|
self.bufsize = 0
|
||||||
self.offset = 0 # Current file offset for seek(), tell(), etc
|
self.offset = 0 # Current file offset for seek(), tell(), etc
|
||||||
|
|
||||||
|
def tell(self):
    """Return the stream position reported by the base class.

    Raises whatever ``_check_not_closed()`` raises when the file is closed.
    ``self._buffer`` is flushed first; presumably this is so that bytes
    still sitting in the write buffer are counted in the reported position
    (the offset bookkeeping itself is outside this block -- confirm).
    """
    self._check_not_closed()
    pending = self._buffer
    pending.flush()
    position = super().tell()
    return position
|
||||||
|
|
||||||
def _write_gzip_header(self, compresslevel):
|
def _write_gzip_header(self, compresslevel):
|
||||||
self.fileobj.write(b'\037\213') # magic header
|
self.fileobj.write(b'\037\213') # magic header
|
||||||
self.fileobj.write(b'\010') # compression method
|
self.fileobj.write(b'\010') # compression method
|
||||||
|
@ -272,6 +297,10 @@ class GzipFile(_compression.BaseStream):
|
||||||
if self.fileobj is None:
|
if self.fileobj is None:
|
||||||
raise ValueError("write() on closed GzipFile object")
|
raise ValueError("write() on closed GzipFile object")
|
||||||
|
|
||||||
|
return self._buffer.write(data)
|
||||||
|
|
||||||
|
def _write_raw(self, data):
|
||||||
|
# Called by our self._buffer underlying WriteBufferStream.
|
||||||
if isinstance(data, (bytes, bytearray)):
|
if isinstance(data, (bytes, bytearray)):
|
||||||
length = len(data)
|
length = len(data)
|
||||||
else:
|
else:
|
||||||
|
@ -322,9 +351,9 @@ class GzipFile(_compression.BaseStream):
|
||||||
fileobj = self.fileobj
|
fileobj = self.fileobj
|
||||||
if fileobj is None:
|
if fileobj is None:
|
||||||
return
|
return
|
||||||
self.fileobj = None
|
|
||||||
try:
|
try:
|
||||||
if self.mode == WRITE:
|
if self.mode == WRITE:
|
||||||
|
self._buffer.flush()
|
||||||
fileobj.write(self.compress.flush())
|
fileobj.write(self.compress.flush())
|
||||||
write32u(fileobj, self.crc)
|
write32u(fileobj, self.crc)
|
||||||
# self.size may exceed 2 GiB, or even 4 GiB
|
# self.size may exceed 2 GiB, or even 4 GiB
|
||||||
|
@ -332,6 +361,7 @@ class GzipFile(_compression.BaseStream):
|
||||||
elif self.mode == READ:
|
elif self.mode == READ:
|
||||||
self._buffer.close()
|
self._buffer.close()
|
||||||
finally:
|
finally:
|
||||||
|
self.fileobj = None
|
||||||
myfileobj = self.myfileobj
|
myfileobj = self.myfileobj
|
||||||
if myfileobj:
|
if myfileobj:
|
||||||
self.myfileobj = None
|
self.myfileobj = None
|
||||||
|
@ -341,7 +371,7 @@ class GzipFile(_compression.BaseStream):
|
||||||
self._check_not_closed()
|
self._check_not_closed()
|
||||||
if self.mode == WRITE:
|
if self.mode == WRITE:
|
||||||
# Ensure the compressor's buffer is flushed
|
# Ensure the compressor's buffer is flushed
|
||||||
self.fileobj.write(self.compress.flush(zlib_mode))
|
self._buffer.flush()
|
||||||
self.fileobj.flush()
|
self.fileobj.flush()
|
||||||
|
|
||||||
def fileno(self):
|
def fileno(self):
|
||||||
|
@ -378,10 +408,10 @@ class GzipFile(_compression.BaseStream):
|
||||||
if offset < self.offset:
|
if offset < self.offset:
|
||||||
raise OSError('Negative seek in write mode')
|
raise OSError('Negative seek in write mode')
|
||||||
count = offset - self.offset
|
count = offset - self.offset
|
||||||
chunk = b'\0' * 1024
|
chunk = b'\0' * self._buffer_size
|
||||||
for i in range(count // 1024):
|
for i in range(count // self._buffer_size):
|
||||||
self.write(chunk)
|
self.write(chunk)
|
||||||
self.write(b'\0' * (count % 1024))
|
self.write(b'\0' * (count % self._buffer_size))
|
||||||
elif self.mode == READ:
|
elif self.mode == READ:
|
||||||
self._check_not_closed()
|
self._check_not_closed()
|
||||||
return self._buffer.seek(offset, whence)
|
return self._buffer.seek(offset, whence)
|
||||||
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
Decrease execution time of some :mod:`gzip` file writes by 15% by
|
||||||
|
adding more appropriate buffering.
|
Loading…
Reference in New Issue