Merged revisions 77288 via svnmerge from

svn+ssh://pythondev@svn.python.org/python/trunk

........
  r77288 | antoine.pitrou | 2010-01-03 23:29:56 +0100 (dim., 03 janv. 2010) | 5 lines

  Issue #7471: Improve the performance of GzipFile's buffering mechanism,
  and make it implement the `io.BufferedIOBase` ABC to allow for further
  speedups by wrapping it in an `io.BufferedReader`.  Patch by Nir Aides.
........
This commit is contained in:
Antoine Pitrou 2010-01-03 22:37:40 +00:00
parent a81d881e13
commit b1f8835b21
3 changed files with 58 additions and 58 deletions

View File

@ -8,6 +8,7 @@ but random access is not allowed."""
import struct, sys, time, os import struct, sys, time, os
import zlib import zlib
import builtins import builtins
import io
__all__ = ["GzipFile","open"] __all__ = ["GzipFile","open"]
@ -44,7 +45,7 @@ def open(filename, mode="rb", compresslevel=9):
""" """
return GzipFile(filename, mode, compresslevel) return GzipFile(filename, mode, compresslevel)
class GzipFile: class GzipFile(io.BufferedIOBase):
"""The GzipFile class simulates most of the methods of a file object with """The GzipFile class simulates most of the methods of a file object with
the exception of the readinto() and truncate() methods. the exception of the readinto() and truncate() methods.
@ -109,8 +110,12 @@ class GzipFile:
self.mode = READ self.mode = READ
# Set flag indicating start of a new member # Set flag indicating start of a new member
self._new_member = True self._new_member = True
# Buffer data read from gzip file. extrastart is offset in
# stream where buffer starts. extrasize is number of
# bytes remaining in buffer from current stream position.
self.extrabuf = b"" self.extrabuf = b""
self.extrasize = 0 self.extrasize = 0
self.extrastart = 0
self.name = filename self.name = filename
# Starts small, scales exponentially # Starts small, scales exponentially
self.min_readsize = 100 self.min_readsize = 100
@ -214,7 +219,6 @@ class GzipFile:
if flag & FHCRC: if flag & FHCRC:
self.fileobj.read(2) # Read & discard the 16-bit header CRC self.fileobj.read(2) # Read & discard the 16-bit header CRC
def write(self,data): def write(self,data):
if self.mode != WRITE: if self.mode != WRITE:
import errno import errno
@ -222,12 +226,19 @@ class GzipFile:
if self.fileobj is None: if self.fileobj is None:
raise ValueError("write() on closed GzipFile object") raise ValueError("write() on closed GzipFile object")
# Convert data type if called by io.BufferedWriter.
if isinstance(data, memoryview):
data = data.tobytes()
if len(data) > 0: if len(data) > 0:
self.size = self.size + len(data) self.size = self.size + len(data)
self.crc = zlib.crc32(data, self.crc) & 0xffffffff self.crc = zlib.crc32(data, self.crc) & 0xffffffff
self.fileobj.write( self.compress.compress(data) ) self.fileobj.write( self.compress.compress(data) )
self.offset += len(data) self.offset += len(data)
return len(data)
def read(self, size=-1): def read(self, size=-1):
if self.mode != READ: if self.mode != READ:
import errno import errno
@ -253,15 +264,14 @@ class GzipFile:
if size > self.extrasize: if size > self.extrasize:
size = self.extrasize size = self.extrasize
chunk = self.extrabuf[:size] offset = self.offset - self.extrastart
self.extrabuf = self.extrabuf[size:] chunk = self.extrabuf[offset: offset + size]
self.extrasize = self.extrasize - size self.extrasize = self.extrasize - size
self.offset += size self.offset += size
return chunk return chunk
def _unread(self, buf): def _unread(self, buf):
self.extrabuf = buf + self.extrabuf
self.extrasize = len(buf) + self.extrasize self.extrasize = len(buf) + self.extrasize
self.offset -= len(buf) self.offset -= len(buf)
@ -317,8 +327,10 @@ class GzipFile:
def _add_read_data(self, data): def _add_read_data(self, data):
self.crc = zlib.crc32(data, self.crc) & 0xffffffff self.crc = zlib.crc32(data, self.crc) & 0xffffffff
self.extrabuf = self.extrabuf + data offset = self.offset - self.extrastart
self.extrabuf = self.extrabuf[offset:] + data
self.extrasize = self.extrasize + len(data) self.extrasize = self.extrasize + len(data)
self.extrastart = self.offset
self.size = self.size + len(data) self.size = self.size + len(data)
def _read_eof(self): def _read_eof(self):
@ -336,6 +348,10 @@ class GzipFile:
elif isize != (self.size & 0xffffffff): elif isize != (self.size & 0xffffffff):
raise IOError("Incorrect length of data produced") raise IOError("Incorrect length of data produced")
@property
def closed(self):
return self.fileobj is None
def close(self): def close(self):
if self.fileobj is None: if self.fileobj is None:
return return
@ -351,15 +367,6 @@ class GzipFile:
self.myfileobj.close() self.myfileobj.close()
self.myfileobj = None self.myfileobj = None
def __del__(self):
try:
if (self.myfileobj is None and
self.fileobj is None):
return
except AttributeError:
return
self.close()
def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH): def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
if self.mode == WRITE: if self.mode == WRITE:
# Ensure the compressor's buffer is flushed # Ensure the compressor's buffer is flushed
@ -374,12 +381,6 @@ class GzipFile:
""" """
return self.fileobj.fileno() return self.fileobj.fileno()
def isatty(self):
return False
def tell(self):
return self.offset
def rewind(self): def rewind(self):
'''Return the uncompressed stream file position indicator to the '''Return the uncompressed stream file position indicator to the
beginning of the file''' beginning of the file'''
@ -389,8 +390,18 @@ class GzipFile:
self._new_member = True self._new_member = True
self.extrabuf = b"" self.extrabuf = b""
self.extrasize = 0 self.extrasize = 0
self.extrastart = 0
self.offset = 0 self.offset = 0
def readable(self):
return self.mode == READ
def writable(self):
return self.mode == WRITE
def seekable(self):
return True
def seek(self, offset, whence=0): def seek(self, offset, whence=0):
if whence: if whence:
if whence == 1: if whence == 1:
@ -414,8 +425,18 @@ class GzipFile:
self.read(1024) self.read(1024)
self.read(count % 1024) self.read(count % 1024)
return self.offset
def readline(self, size=-1): def readline(self, size=-1):
if size < 0: if size < 0:
# Shortcut common case - newline found in buffer.
offset = self.offset - self.extrastart
i = self.extrabuf.find(b'\n', offset) + 1
if i > 0:
self.extrasize -= i - offset
self.offset += i - offset
return self.extrabuf[offset: i]
size = sys.maxsize size = sys.maxsize
readsize = self.min_readsize readsize = self.min_readsize
else: else:
@ -445,42 +466,6 @@ class GzipFile:
self.min_readsize = min(readsize, self.min_readsize * 2, 512) self.min_readsize = min(readsize, self.min_readsize * 2, 512)
return b''.join(bufs) # Return resulting line return b''.join(bufs) # Return resulting line
def readlines(self, sizehint=0):
# Negative numbers result in reading all the lines
if sizehint <= 0:
sizehint = sys.maxsize
L = []
while sizehint > 0:
line = self.readline()
if line == b"":
break
L.append(line)
sizehint = sizehint - len(line)
return L
def writelines(self, L):
for line in L:
self.write(line)
def __iter__(self):
return self
def __next__(self):
line = self.readline()
if line:
return line
else:
raise StopIteration
def __enter__(self):
if self.fileobj is None:
raise ValueError("I/O operation on closed GzipFile object")
return self
def __exit__(self, *args):
self.close()
def _test(): def _test():
# Act like gzip; with -d, act like gunzip. # Act like gzip; with -d, act like gunzip.

View File

@ -5,6 +5,7 @@
import unittest import unittest
from test import support from test import support
import os import os
import io
import struct import struct
gzip = support.import_module('gzip') gzip = support.import_module('gzip')
@ -80,6 +81,16 @@ class TestGzip(unittest.TestCase):
zgfile.close() zgfile.close()
self.assertEquals(contents, b'a'*201) self.assertEquals(contents, b'a'*201)
def test_buffered_reader(self):
# Issue #7471: a GzipFile can be wrapped in a BufferedReader for
# performance.
self.test_write()
f = gzip.GzipFile(self.filename, 'rb')
with io.BufferedReader(f) as r:
lines = [line for line in r]
self.assertEqual(lines, 50 * data1.splitlines(True))
def test_readline(self): def test_readline(self):
self.test_write() self.test_write()

View File

@ -191,7 +191,11 @@ C-API
Library Library
------- -------
_ Issue #3972: http.client.HTTPConnection now accepts an optional source_address - Issue #7471: Improve the performance of GzipFile's buffering mechanism,
and make it implement the `io.BufferedIOBase` ABC to allow for further
speedups by wrapping it in an `io.BufferedReader`. Patch by Nir Aides.
- Issue #3972: http.client.HTTPConnection now accepts an optional source_address
parameter to allow specifying where your connections come from. parameter to allow specifying where your connections come from.
- socket.create_connection now accepts an optional source_address parameter. - socket.create_connection now accepts an optional source_address parameter.