Merge #16034: Fix performance regressions in the new BZ2File implementation.

Thanks to Victor Hooi for the bug report, and Serhiy Storchaka for the initial patch.
This commit is contained in:
Nadeem Vawda 2012-09-30 04:01:31 +02:00
commit 65c848e484
2 changed files with 58 additions and 26 deletions

View File

@@ -79,7 +79,8 @@ class BZ2File(io.BufferedIOBase):
mode = "rb" mode = "rb"
mode_code = _MODE_READ mode_code = _MODE_READ
self._decompressor = BZ2Decompressor() self._decompressor = BZ2Decompressor()
self._buffer = None self._buffer = b""
self._buffer_offset = 0
elif mode in ("w", "wb"): elif mode in ("w", "wb"):
mode = "wb" mode = "wb"
mode_code = _MODE_WRITE mode_code = _MODE_WRITE
@@ -124,7 +125,8 @@ class BZ2File(io.BufferedIOBase):
self._fp = None self._fp = None
self._closefp = False self._closefp = False
self._mode = _MODE_CLOSED self._mode = _MODE_CLOSED
self._buffer = None self._buffer = b""
self._buffer_offset = 0
@property @property
def closed(self): def closed(self):
@@ -174,16 +176,13 @@ class BZ2File(io.BufferedIOBase):
# Fill the readahead buffer if it is empty. Returns False on EOF. # Fill the readahead buffer if it is empty. Returns False on EOF.
def _fill_buffer(self): def _fill_buffer(self):
if self._mode == _MODE_READ_EOF:
return False
# Depending on the input data, our call to the decompressor may not # Depending on the input data, our call to the decompressor may not
# return any data. In this case, try again after reading another block. # return any data. In this case, try again after reading another block.
while True: while self._buffer_offset == len(self._buffer):
if self._buffer: rawblock = (self._decompressor.unused_data or
return True self._fp.read(_BUFFER_SIZE))
if self._decompressor.unused_data:
rawblock = self._decompressor.unused_data
else:
rawblock = self._fp.read(_BUFFER_SIZE)
if not rawblock: if not rawblock:
if self._decompressor.eof: if self._decompressor.eof:
@@ -199,30 +198,48 @@ class BZ2File(io.BufferedIOBase):
self._decompressor = BZ2Decompressor() self._decompressor = BZ2Decompressor()
self._buffer = self._decompressor.decompress(rawblock) self._buffer = self._decompressor.decompress(rawblock)
self._buffer_offset = 0
return True
# Read data until EOF. # Read data until EOF.
# If return_data is false, consume the data without returning it. # If return_data is false, consume the data without returning it.
def _read_all(self, return_data=True): def _read_all(self, return_data=True):
# The loop assumes that _buffer_offset is 0. Ensure that this is true.
self._buffer = self._buffer[self._buffer_offset:]
self._buffer_offset = 0
blocks = [] blocks = []
while self._fill_buffer(): while self._fill_buffer():
if return_data: if return_data:
blocks.append(self._buffer) blocks.append(self._buffer)
self._pos += len(self._buffer) self._pos += len(self._buffer)
self._buffer = None self._buffer = b""
if return_data: if return_data:
return b"".join(blocks) return b"".join(blocks)
# Read a block of up to n bytes. # Read a block of up to n bytes.
# If return_data is false, consume the data without returning it. # If return_data is false, consume the data without returning it.
def _read_block(self, n, return_data=True): def _read_block(self, n, return_data=True):
# If we have enough data buffered, return immediately.
end = self._buffer_offset + n
if end <= len(self._buffer):
data = self._buffer[self._buffer_offset : end]
self._buffer_offset = end
self._pos += len(data)
return data
# The loop assumes that _buffer_offset is 0. Ensure that this is true.
self._buffer = self._buffer[self._buffer_offset:]
self._buffer_offset = 0
blocks = [] blocks = []
while n > 0 and self._fill_buffer(): while n > 0 and self._fill_buffer():
if n < len(self._buffer): if n < len(self._buffer):
data = self._buffer[:n] data = self._buffer[:n]
self._buffer = self._buffer[n:] self._buffer_offset = n
else: else:
data = self._buffer data = self._buffer
self._buffer = None self._buffer = b""
if return_data: if return_data:
blocks.append(data) blocks.append(data)
self._pos += len(data) self._pos += len(data)
@@ -238,9 +255,9 @@ class BZ2File(io.BufferedIOBase):
""" """
with self._lock: with self._lock:
self._check_can_read() self._check_can_read()
if self._mode == _MODE_READ_EOF or not self._fill_buffer(): if not self._fill_buffer():
return b"" return b""
return self._buffer return self._buffer[self._buffer_offset:]
def read(self, size=-1): def read(self, size=-1):
"""Read up to size uncompressed bytes from the file. """Read up to size uncompressed bytes from the file.
@@ -250,7 +267,7 @@ class BZ2File(io.BufferedIOBase):
""" """
with self._lock: with self._lock:
self._check_can_read() self._check_can_read()
if self._mode == _MODE_READ_EOF or size == 0: if size == 0:
return b"" return b""
elif size < 0: elif size < 0:
return self._read_all() return self._read_all()
@@ -268,15 +285,19 @@ class BZ2File(io.BufferedIOBase):
# In this case we make multiple reads, to avoid returning b"". # In this case we make multiple reads, to avoid returning b"".
with self._lock: with self._lock:
self._check_can_read() self._check_can_read()
if (size == 0 or self._mode == _MODE_READ_EOF or if (size == 0 or
not self._fill_buffer()): # Only call _fill_buffer() if the buffer is actually empty.
# This gives a significant speedup if *size* is small.
(self._buffer_offset == len(self._buffer) and not self._fill_buffer())):
return b"" return b""
if 0 < size < len(self._buffer): if size > 0:
data = self._buffer[:size] data = self._buffer[self._buffer_offset :
self._buffer = self._buffer[size:] self._buffer_offset + size]
self._buffer_offset += len(data)
else: else:
data = self._buffer data = self._buffer[self._buffer_offset:]
self._buffer = None self._buffer = b""
self._buffer_offset = 0
self._pos += len(data) self._pos += len(data)
return data return data
@@ -299,6 +320,14 @@ class BZ2File(io.BufferedIOBase):
raise TypeError("Integer argument expected") raise TypeError("Integer argument expected")
size = size.__index__() size = size.__index__()
with self._lock: with self._lock:
# Shortcut for the common case - the whole line is in the buffer.
if size < 0:
end = self._buffer.find(b"\n", self._buffer_offset) + 1
if end > 0:
line = self._buffer[self._buffer_offset : end]
self._buffer_offset = end
self._pos += len(line)
return line
return io.BufferedIOBase.readline(self, size) return io.BufferedIOBase.readline(self, size)
def readlines(self, size=-1): def readlines(self, size=-1):
@@ -345,7 +374,8 @@ class BZ2File(io.BufferedIOBase):
self._mode = _MODE_READ self._mode = _MODE_READ
self._pos = 0 self._pos = 0
self._decompressor = BZ2Decompressor() self._decompressor = BZ2Decompressor()
self._buffer = None self._buffer = b""
self._buffer_offset = 0
def seek(self, offset, whence=0): def seek(self, offset, whence=0):
"""Change the file position. """Change the file position.
@@ -385,7 +415,6 @@ class BZ2File(io.BufferedIOBase):
offset -= self._pos offset -= self._pos
# Read and discard data until we reach the desired position. # Read and discard data until we reach the desired position.
if self._mode != _MODE_READ_EOF:
self._read_block(offset, return_data=False) self._read_block(offset, return_data=False)
return self._pos return self._pos

View File

@@ -15,6 +15,9 @@ Core and Builtins
Library Library
------- -------
- Issue #16034: Fix performance regressions in the new BZ2File implementation.
Initial patch by Serhiy Storchaka.
- pty.spawn() now returns the child process status returned by os.waitpid(). - pty.spawn() now returns the child process status returned by os.waitpid().
- Issue #15756: subprocess.poll() now properly handles errno.ECHILD to - Issue #15756: subprocess.poll() now properly handles errno.ECHILD to