From 6c57318c3a88e4803cc374cce2309c11382c9a37 Mon Sep 17 00:00:00 2001 From: Nadeem Vawda Date: Sun, 30 Sep 2012 03:57:33 +0200 Subject: [PATCH] Issue #16034: Fix performance regressions in the new BZ2File implementation. Thanks to Victor Hooi for the bug report, and Serhiy Storchaka for the initial patch. --- Lib/bz2.py | 81 ++++++++++++++++++++++++++++++++++++------------------ Misc/NEWS | 3 ++ 2 files changed, 58 insertions(+), 26 deletions(-) diff --git a/Lib/bz2.py b/Lib/bz2.py index a50adf79f78..61d989bcace 100644 --- a/Lib/bz2.py +++ b/Lib/bz2.py @@ -79,7 +79,8 @@ class BZ2File(io.BufferedIOBase): mode = "rb" mode_code = _MODE_READ self._decompressor = BZ2Decompressor() - self._buffer = None + self._buffer = b"" + self._buffer_offset = 0 elif mode in ("w", "wb"): mode = "wb" mode_code = _MODE_WRITE @@ -124,7 +125,8 @@ class BZ2File(io.BufferedIOBase): self._fp = None self._closefp = False self._mode = _MODE_CLOSED - self._buffer = None + self._buffer = b"" + self._buffer_offset = 0 @property def closed(self): @@ -174,16 +176,13 @@ class BZ2File(io.BufferedIOBase): # Fill the readahead buffer if it is empty. Returns False on EOF. def _fill_buffer(self): + if self._mode == _MODE_READ_EOF: + return False # Depending on the input data, our call to the decompressor may not # return any data. In this case, try again after reading another block. - while True: - if self._buffer: - return True - - if self._decompressor.unused_data: - rawblock = self._decompressor.unused_data - else: - rawblock = self._fp.read(_BUFFER_SIZE) + while self._buffer_offset == len(self._buffer): + rawblock = (self._decompressor.unused_data or + self._fp.read(_BUFFER_SIZE)) if not rawblock: if self._decompressor.eof: @@ -199,30 +198,48 @@ class BZ2File(io.BufferedIOBase): self._decompressor = BZ2Decompressor() self._buffer = self._decompressor.decompress(rawblock) + self._buffer_offset = 0 + return True # Read data until EOF. # If return_data is false, consume the data without returning it. def _read_all(self, return_data=True): + # The loop assumes that _buffer_offset is 0. Ensure that this is true. + self._buffer = self._buffer[self._buffer_offset:] + self._buffer_offset = 0 + blocks = [] while self._fill_buffer(): if return_data: blocks.append(self._buffer) self._pos += len(self._buffer) - self._buffer = None + self._buffer = b"" if return_data: return b"".join(blocks) # Read a block of up to n bytes. # If return_data is false, consume the data without returning it. def _read_block(self, n, return_data=True): + # If we have enough data buffered, return immediately. + end = self._buffer_offset + n + if end <= len(self._buffer): + data = self._buffer[self._buffer_offset : end] + self._buffer_offset = end + self._pos += len(data) + return data + + # The loop assumes that _buffer_offset is 0. Ensure that this is true. + self._buffer = self._buffer[self._buffer_offset:] + self._buffer_offset = 0 + blocks = [] while n > 0 and self._fill_buffer(): if n < len(self._buffer): data = self._buffer[:n] - self._buffer = self._buffer[n:] + self._buffer_offset = n else: data = self._buffer - self._buffer = None + self._buffer = b"" if return_data: blocks.append(data) self._pos += len(data) @@ -238,9 +255,9 @@ class BZ2File(io.BufferedIOBase): """ with self._lock: self._check_can_read() - if self._mode == _MODE_READ_EOF or not self._fill_buffer(): + if not self._fill_buffer(): return b"" - return self._buffer + return self._buffer[self._buffer_offset:] def read(self, size=-1): """Read up to size uncompressed bytes from the file. @@ -250,7 +267,7 @@ class BZ2File(io.BufferedIOBase): """ with self._lock: self._check_can_read() - if self._mode == _MODE_READ_EOF or size == 0: + if size == 0: return b"" elif size < 0: return self._read_all() @@ -268,15 +285,19 @@ class BZ2File(io.BufferedIOBase): # In this case we make multiple reads, to avoid returning b"". with self._lock: self._check_can_read() - if (size == 0 or self._mode == _MODE_READ_EOF or - not self._fill_buffer()): + if (size == 0 or + # Only call _fill_buffer() if the buffer is actually empty. + # This gives a significant speedup if *size* is small. + (self._buffer_offset == len(self._buffer) and not self._fill_buffer())): return b"" - if 0 < size < len(self._buffer): - data = self._buffer[:size] - self._buffer = self._buffer[size:] + if size > 0: + data = self._buffer[self._buffer_offset : + self._buffer_offset + size] + self._buffer_offset += len(data) else: - data = self._buffer - self._buffer = None + data = self._buffer[self._buffer_offset:] + self._buffer = b"" + self._buffer_offset = 0 self._pos += len(data) return data @@ -299,6 +320,14 @@ class BZ2File(io.BufferedIOBase): raise TypeError("Integer argument expected") size = size.__index__() with self._lock: + # Shortcut for the common case - the whole line is in the buffer. + if size < 0: + end = self._buffer.find(b"\n", self._buffer_offset) + 1 + if end > 0: + line = self._buffer[self._buffer_offset : end] + self._buffer_offset = end + self._pos += len(line) + return line return io.BufferedIOBase.readline(self, size) def readlines(self, size=-1): @@ -345,7 +374,8 @@ class BZ2File(io.BufferedIOBase): self._mode = _MODE_READ self._pos = 0 self._decompressor = BZ2Decompressor() - self._buffer = None + self._buffer = b"" + self._buffer_offset = 0 def seek(self, offset, whence=0): """Change the file position. @@ -385,8 +415,7 @@ class BZ2File(io.BufferedIOBase): offset -= self._pos # Read and discard data until we reach the desired position. - if self._mode != _MODE_READ_EOF: - self._read_block(offset, return_data=False) + self._read_block(offset, return_data=False) return self._pos diff --git a/Misc/NEWS b/Misc/NEWS index 245cf1a677e..7bfa6498aca 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -13,6 +13,9 @@ Core and Builtins Library ------- +- Issue #16034: Fix performance regressions in the new BZ2File implementation. + Initial patch by Serhiy Storchaka. + - Issue #15756: subprocess.poll() now properly handles errno.ECHILD to return a returncode of 0 when the child has already exited or cannot be waited on.