From b1f8835b213411d059d0e2ba4b78125328afeee6 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Sun, 3 Jan 2010 22:37:40 +0000 Subject: [PATCH] Merged revisions 77288 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r77288 | antoine.pitrou | 2010-01-03 23:29:56 +0100 (dim., 03 janv. 2010) | 5 lines Issue #7471: Improve the performance of GzipFile's buffering mechanism, and make it implement the `io.BufferedIOBase` ABC to allow for further speedups by wrapping it in an `io.BufferedReader`. Patch by Nir Aides. ........ --- Lib/gzip.py | 99 ++++++++++++++++++------------------------- Lib/test/test_gzip.py | 11 +++++ Misc/NEWS | 6 ++- 3 files changed, 58 insertions(+), 58 deletions(-) diff --git a/Lib/gzip.py b/Lib/gzip.py index f9a59d7ff0a..66fc88daa0d 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -8,6 +8,7 @@ but random access is not allowed.""" import struct, sys, time, os import zlib import builtins +import io __all__ = ["GzipFile","open"] @@ -44,7 +45,7 @@ def open(filename, mode="rb", compresslevel=9): """ return GzipFile(filename, mode, compresslevel) -class GzipFile: +class GzipFile(io.BufferedIOBase): """The GzipFile class simulates most of the methods of a file object with the exception of the readinto() and truncate() methods. @@ -109,8 +110,12 @@ class GzipFile: self.mode = READ # Set flag indicating start of a new member self._new_member = True + # Buffer data read from gzip file. extrastart is offset in + # stream where buffer starts. extrasize is number of + # bytes remaining in buffer from current stream position. self.extrabuf = b"" self.extrasize = 0 + self.extrastart = 0 self.name = filename # Starts small, scales exponentially self.min_readsize = 100 @@ -214,7 +219,6 @@ class GzipFile: if flag & FHCRC: self.fileobj.read(2) # Read & discard the 16-bit header CRC - def write(self,data): if self.mode != WRITE: import errno @@ -222,12 +226,19 @@ class GzipFile: if self.fileobj is None: raise ValueError("write() on closed GzipFile object") + + # Convert data type if called by io.BufferedWriter. + if isinstance(data, memoryview): + data = data.tobytes() + if len(data) > 0: self.size = self.size + len(data) self.crc = zlib.crc32(data, self.crc) & 0xffffffff self.fileobj.write( self.compress.compress(data) ) self.offset += len(data) + return len(data) + def read(self, size=-1): if self.mode != READ: import errno @@ -253,15 +264,14 @@ class GzipFile: if size > self.extrasize: size = self.extrasize - chunk = self.extrabuf[:size] - self.extrabuf = self.extrabuf[size:] + offset = self.offset - self.extrastart + chunk = self.extrabuf[offset: offset + size] self.extrasize = self.extrasize - size self.offset += size return chunk def _unread(self, buf): - self.extrabuf = buf + self.extrabuf self.extrasize = len(buf) + self.extrasize self.offset -= len(buf) @@ -317,8 +327,10 @@ class GzipFile: def _add_read_data(self, data): self.crc = zlib.crc32(data, self.crc) & 0xffffffff - self.extrabuf = self.extrabuf + data + offset = self.offset - self.extrastart + self.extrabuf = self.extrabuf[offset:] + data self.extrasize = self.extrasize + len(data) + self.extrastart = self.offset self.size = self.size + len(data) def _read_eof(self): @@ -336,6 +348,10 @@ class GzipFile: elif isize != (self.size & 0xffffffff): raise IOError("Incorrect length of data produced") + @property + def closed(self): + return self.fileobj is None + def close(self): if self.fileobj is None: return @@ -351,15 +367,6 @@ class GzipFile: self.myfileobj.close() self.myfileobj = None - def __del__(self): - try: - if (self.myfileobj is None and - self.fileobj is None): - return - except AttributeError: - return - self.close() - def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH): if self.mode == WRITE: # Ensure the compressor's buffer is flushed @@ -374,12 +381,6 @@ class GzipFile: """ return self.fileobj.fileno() - def isatty(self): - return False - - def tell(self): - return self.offset - def rewind(self): '''Return the uncompressed stream file position indicator to the beginning of the file''' @@ -389,8 +390,18 @@ class GzipFile: self._new_member = True self.extrabuf = b"" self.extrasize = 0 + self.extrastart = 0 self.offset = 0 + def readable(self): + return self.mode == READ + + def writable(self): + return self.mode == WRITE + + def seekable(self): + return True + def seek(self, offset, whence=0): if whence: if whence == 1: @@ -414,8 +425,18 @@ class GzipFile: self.read(1024) self.read(count % 1024) + return self.offset + def readline(self, size=-1): if size < 0: + # Shortcut common case - newline found in buffer. + offset = self.offset - self.extrastart + i = self.extrabuf.find(b'\n', offset) + 1 + if i > 0: + self.extrasize -= i - offset + self.offset += i - offset + return self.extrabuf[offset: i] + size = sys.maxsize readsize = self.min_readsize else: @@ -445,42 +466,6 @@ class GzipFile: self.min_readsize = min(readsize, self.min_readsize * 2, 512) return b''.join(bufs) # Return resulting line - def readlines(self, sizehint=0): - # Negative numbers result in reading all the lines - if sizehint <= 0: - sizehint = sys.maxsize - L = [] - while sizehint > 0: - line = self.readline() - if line == b"": - break - L.append(line) - sizehint = sizehint - len(line) - - return L - - def writelines(self, L): - for line in L: - self.write(line) - - def __iter__(self): - return self - - def __next__(self): - line = self.readline() - if line: - return line - else: - raise StopIteration - - def __enter__(self): - if self.fileobj is None: - raise ValueError("I/O operation on closed GzipFile object") - return self - - def __exit__(self, *args): - self.close() - def _test(): # Act like gzip; with -d, act like gunzip. diff --git a/Lib/test/test_gzip.py b/Lib/test/test_gzip.py index fa91dc02623..320adfda8a0 100644 --- a/Lib/test/test_gzip.py +++ b/Lib/test/test_gzip.py @@ -5,6 +5,7 @@ import unittest from test import support import os +import io import struct gzip = support.import_module('gzip') @@ -80,6 +81,16 @@ class TestGzip(unittest.TestCase): zgfile.close() self.assertEquals(contents, b'a'*201) + def test_buffered_reader(self): + # Issue #7471: a GzipFile can be wrapped in a BufferedReader for + # performance. + self.test_write() + + f = gzip.GzipFile(self.filename, 'rb') + with io.BufferedReader(f) as r: + lines = [line for line in r] + + self.assertEqual(lines, 50 * data1.splitlines(True)) def test_readline(self): self.test_write() diff --git a/Misc/NEWS b/Misc/NEWS index 451a2a06d4a..815e3922b9c 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -191,7 +191,11 @@ C-API Library ------- -_ Issue #3972: http.client.HTTPConnection now accepts an optional source_address +- Issue #7471: Improve the performance of GzipFile's buffering mechanism, + and make it implement the `io.BufferedIOBase` ABC to allow for further + speedups by wrapping it in an `io.BufferedReader`. Patch by Nir Aides. + +- Issue #3972: http.client.HTTPConnection now accepts an optional source_address parameter to allow specifying where your connections come from. - socket.create_connection now accepts an optional source_address parameter.