Merge #16034: Fix performance regressions in the new BZ2File implementation.
Thanks to Victor Hooi for the bug report, and Serhiy Storchaka for the initial patch.
This commit is contained in:
commit
65c848e484
79
Lib/bz2.py
79
Lib/bz2.py
|
@@ -79,7 +79,8 @@ class BZ2File(io.BufferedIOBase):
|
||||||
mode = "rb"
|
mode = "rb"
|
||||||
mode_code = _MODE_READ
|
mode_code = _MODE_READ
|
||||||
self._decompressor = BZ2Decompressor()
|
self._decompressor = BZ2Decompressor()
|
||||||
self._buffer = None
|
self._buffer = b""
|
||||||
|
self._buffer_offset = 0
|
||||||
elif mode in ("w", "wb"):
|
elif mode in ("w", "wb"):
|
||||||
mode = "wb"
|
mode = "wb"
|
||||||
mode_code = _MODE_WRITE
|
mode_code = _MODE_WRITE
|
||||||
|
@@ -124,7 +125,8 @@ class BZ2File(io.BufferedIOBase):
|
||||||
self._fp = None
|
self._fp = None
|
||||||
self._closefp = False
|
self._closefp = False
|
||||||
self._mode = _MODE_CLOSED
|
self._mode = _MODE_CLOSED
|
||||||
self._buffer = None
|
self._buffer = b""
|
||||||
|
self._buffer_offset = 0
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def closed(self):
|
def closed(self):
|
||||||
|
@@ -174,16 +176,13 @@ class BZ2File(io.BufferedIOBase):
|
||||||
|
|
||||||
# Fill the readahead buffer if it is empty. Returns False on EOF.
|
# Fill the readahead buffer if it is empty. Returns False on EOF.
|
||||||
def _fill_buffer(self):
|
def _fill_buffer(self):
|
||||||
|
if self._mode == _MODE_READ_EOF:
|
||||||
|
return False
|
||||||
# Depending on the input data, our call to the decompressor may not
|
# Depending on the input data, our call to the decompressor may not
|
||||||
# return any data. In this case, try again after reading another block.
|
# return any data. In this case, try again after reading another block.
|
||||||
while True:
|
while self._buffer_offset == len(self._buffer):
|
||||||
if self._buffer:
|
rawblock = (self._decompressor.unused_data or
|
||||||
return True
|
self._fp.read(_BUFFER_SIZE))
|
||||||
|
|
||||||
if self._decompressor.unused_data:
|
|
||||||
rawblock = self._decompressor.unused_data
|
|
||||||
else:
|
|
||||||
rawblock = self._fp.read(_BUFFER_SIZE)
|
|
||||||
|
|
||||||
if not rawblock:
|
if not rawblock:
|
||||||
if self._decompressor.eof:
|
if self._decompressor.eof:
|
||||||
|
@@ -199,30 +198,48 @@ class BZ2File(io.BufferedIOBase):
|
||||||
self._decompressor = BZ2Decompressor()
|
self._decompressor = BZ2Decompressor()
|
||||||
|
|
||||||
self._buffer = self._decompressor.decompress(rawblock)
|
self._buffer = self._decompressor.decompress(rawblock)
|
||||||
|
self._buffer_offset = 0
|
||||||
|
return True
|
||||||
|
|
||||||
# Read data until EOF.
|
# Read data until EOF.
|
||||||
# If return_data is false, consume the data without returning it.
|
# If return_data is false, consume the data without returning it.
|
||||||
def _read_all(self, return_data=True):
|
def _read_all(self, return_data=True):
|
||||||
|
# The loop assumes that _buffer_offset is 0. Ensure that this is true.
|
||||||
|
self._buffer = self._buffer[self._buffer_offset:]
|
||||||
|
self._buffer_offset = 0
|
||||||
|
|
||||||
blocks = []
|
blocks = []
|
||||||
while self._fill_buffer():
|
while self._fill_buffer():
|
||||||
if return_data:
|
if return_data:
|
||||||
blocks.append(self._buffer)
|
blocks.append(self._buffer)
|
||||||
self._pos += len(self._buffer)
|
self._pos += len(self._buffer)
|
||||||
self._buffer = None
|
self._buffer = b""
|
||||||
if return_data:
|
if return_data:
|
||||||
return b"".join(blocks)
|
return b"".join(blocks)
|
||||||
|
|
||||||
# Read a block of up to n bytes.
|
# Read a block of up to n bytes.
|
||||||
# If return_data is false, consume the data without returning it.
|
# If return_data is false, consume the data without returning it.
|
||||||
def _read_block(self, n, return_data=True):
|
def _read_block(self, n, return_data=True):
|
||||||
|
# If we have enough data buffered, return immediately.
|
||||||
|
end = self._buffer_offset + n
|
||||||
|
if end <= len(self._buffer):
|
||||||
|
data = self._buffer[self._buffer_offset : end]
|
||||||
|
self._buffer_offset = end
|
||||||
|
self._pos += len(data)
|
||||||
|
return data
|
||||||
|
|
||||||
|
# The loop assumes that _buffer_offset is 0. Ensure that this is true.
|
||||||
|
self._buffer = self._buffer[self._buffer_offset:]
|
||||||
|
self._buffer_offset = 0
|
||||||
|
|
||||||
blocks = []
|
blocks = []
|
||||||
while n > 0 and self._fill_buffer():
|
while n > 0 and self._fill_buffer():
|
||||||
if n < len(self._buffer):
|
if n < len(self._buffer):
|
||||||
data = self._buffer[:n]
|
data = self._buffer[:n]
|
||||||
self._buffer = self._buffer[n:]
|
self._buffer_offset = n
|
||||||
else:
|
else:
|
||||||
data = self._buffer
|
data = self._buffer
|
||||||
self._buffer = None
|
self._buffer = b""
|
||||||
if return_data:
|
if return_data:
|
||||||
blocks.append(data)
|
blocks.append(data)
|
||||||
self._pos += len(data)
|
self._pos += len(data)
|
||||||
|
@@ -238,9 +255,9 @@ class BZ2File(io.BufferedIOBase):
|
||||||
"""
|
"""
|
||||||
with self._lock:
|
with self._lock:
|
||||||
self._check_can_read()
|
self._check_can_read()
|
||||||
if self._mode == _MODE_READ_EOF or not self._fill_buffer():
|
if not self._fill_buffer():
|
||||||
return b""
|
return b""
|
||||||
return self._buffer
|
return self._buffer[self._buffer_offset:]
|
||||||
|
|
||||||
def read(self, size=-1):
|
def read(self, size=-1):
|
||||||
"""Read up to size uncompressed bytes from the file.
|
"""Read up to size uncompressed bytes from the file.
|
||||||
|
@@ -250,7 +267,7 @@ class BZ2File(io.BufferedIOBase):
|
||||||
"""
|
"""
|
||||||
with self._lock:
|
with self._lock:
|
||||||
self._check_can_read()
|
self._check_can_read()
|
||||||
if self._mode == _MODE_READ_EOF or size == 0:
|
if size == 0:
|
||||||
return b""
|
return b""
|
||||||
elif size < 0:
|
elif size < 0:
|
||||||
return self._read_all()
|
return self._read_all()
|
||||||
|
@@ -268,15 +285,19 @@ class BZ2File(io.BufferedIOBase):
|
||||||
# In this case we make multiple reads, to avoid returning b"".
|
# In this case we make multiple reads, to avoid returning b"".
|
||||||
with self._lock:
|
with self._lock:
|
||||||
self._check_can_read()
|
self._check_can_read()
|
||||||
if (size == 0 or self._mode == _MODE_READ_EOF or
|
if (size == 0 or
|
||||||
not self._fill_buffer()):
|
# Only call _fill_buffer() if the buffer is actually empty.
|
||||||
|
# This gives a significant speedup if *size* is small.
|
||||||
|
(self._buffer_offset == len(self._buffer) and not self._fill_buffer())):
|
||||||
return b""
|
return b""
|
||||||
if 0 < size < len(self._buffer):
|
if size > 0:
|
||||||
data = self._buffer[:size]
|
data = self._buffer[self._buffer_offset :
|
||||||
self._buffer = self._buffer[size:]
|
self._buffer_offset + size]
|
||||||
|
self._buffer_offset += len(data)
|
||||||
else:
|
else:
|
||||||
data = self._buffer
|
data = self._buffer[self._buffer_offset:]
|
||||||
self._buffer = None
|
self._buffer = b""
|
||||||
|
self._buffer_offset = 0
|
||||||
self._pos += len(data)
|
self._pos += len(data)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
@@ -299,6 +320,14 @@ class BZ2File(io.BufferedIOBase):
|
||||||
raise TypeError("Integer argument expected")
|
raise TypeError("Integer argument expected")
|
||||||
size = size.__index__()
|
size = size.__index__()
|
||||||
with self._lock:
|
with self._lock:
|
||||||
|
# Shortcut for the common case - the whole line is in the buffer.
|
||||||
|
if size < 0:
|
||||||
|
end = self._buffer.find(b"\n", self._buffer_offset) + 1
|
||||||
|
if end > 0:
|
||||||
|
line = self._buffer[self._buffer_offset : end]
|
||||||
|
self._buffer_offset = end
|
||||||
|
self._pos += len(line)
|
||||||
|
return line
|
||||||
return io.BufferedIOBase.readline(self, size)
|
return io.BufferedIOBase.readline(self, size)
|
||||||
|
|
||||||
def readlines(self, size=-1):
|
def readlines(self, size=-1):
|
||||||
|
@@ -345,7 +374,8 @@ class BZ2File(io.BufferedIOBase):
|
||||||
self._mode = _MODE_READ
|
self._mode = _MODE_READ
|
||||||
self._pos = 0
|
self._pos = 0
|
||||||
self._decompressor = BZ2Decompressor()
|
self._decompressor = BZ2Decompressor()
|
||||||
self._buffer = None
|
self._buffer = b""
|
||||||
|
self._buffer_offset = 0
|
||||||
|
|
||||||
def seek(self, offset, whence=0):
|
def seek(self, offset, whence=0):
|
||||||
"""Change the file position.
|
"""Change the file position.
|
||||||
|
@@ -385,7 +415,6 @@ class BZ2File(io.BufferedIOBase):
|
||||||
offset -= self._pos
|
offset -= self._pos
|
||||||
|
|
||||||
# Read and discard data until we reach the desired position.
|
# Read and discard data until we reach the desired position.
|
||||||
if self._mode != _MODE_READ_EOF:
|
|
||||||
self._read_block(offset, return_data=False)
|
self._read_block(offset, return_data=False)
|
||||||
|
|
||||||
return self._pos
|
return self._pos
|
||||||
|
|
|
@@ -15,6 +15,9 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #16034: Fix performance regressions in the new BZ2File implementation.
|
||||||
|
Initial patch by Serhiy Storchaka.
|
||||||
|
|
||||||
- pty.spawn() now returns the child process status returned by os.waitpid().
|
- pty.spawn() now returns the child process status returned by os.waitpid().
|
||||||
|
|
||||||
- Issue #15756: subprocess.poll() now properly handles errno.ECHILD to
|
- Issue #15756: subprocess.poll() now properly handles errno.ECHILD to
|
||||||
|
|
Loading…
Reference in New Issue