bpo-34043: Optimize tarfile uncompress performance (GH-8089)

tarfile._Stream has two buffers: one for compressed data and one for
uncompressed data. These buffers are not aligned, so unnecessary bytes
slicing happens for every chunk that is read.

This commit bypasses the buffering of compressed data.

In this benchmark [1], user time drops from 300ms to 250ms.

[1]: https://bugs.python.org/msg320763
This commit is contained in:
INADA Naoki 2018-07-06 14:06:00 +09:00 committed by GitHub
parent f12028809b
commit 8d130913cb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 13 additions and 18 deletions

View File

@ -513,21 +513,10 @@ class _Stream:
raise StreamError("seeking backwards is not allowed")
return self.pos
def read(self, size=None):
"""Return the next size number of bytes from the stream.
If size is not defined, return all bytes of the stream
up to EOF.
"""
if size is None:
t = []
while True:
buf = self._read(self.bufsize)
if not buf:
break
t.append(buf)
buf = b"".join(t)
else:
buf = self._read(size)
def read(self, size):
"""Return the next size number of bytes from the stream."""
assert size is not None
buf = self._read(size)
self.pos += len(buf)
return buf
@ -540,9 +529,14 @@ class _Stream:
c = len(self.dbuf)
t = [self.dbuf]
while c < size:
buf = self.__read(self.bufsize)
if not buf:
break
# Skip underlying buffer to avoid unaligned double buffering.
if self.buf:
buf = self.buf
self.buf = b""
else:
buf = self.fileobj.read(self.bufsize)
if not buf:
break
try:
buf = self.cmp.decompress(buf)
except self.exception:

View File

@ -0,0 +1 @@
Improve tarfile decompression performance by about 15% when gzip is used.