From 7b9698435d890b2df5107a6a3efedc1aebc178a3 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 23 Sep 2010 16:22:51 +0000 Subject: [PATCH] Issue #1675951: Allow GzipFile to work with unseekable file objects. Patch by Florian Festi. --- Doc/library/gzip.rst | 3 ++ Lib/gzip.py | 91 ++++++++++++++++++++++++++++++++++--------- Lib/test/test_gzip.py | 21 ++++++++++ Misc/ACKS | 1 + Misc/NEWS | 3 ++ 5 files changed, 101 insertions(+), 18 deletions(-) diff --git a/Doc/library/gzip.rst b/Doc/library/gzip.rst index edd5587b4a8..934fcb38a50 100644 --- a/Doc/library/gzip.rst +++ b/Doc/library/gzip.rst @@ -74,6 +74,9 @@ The module defines the following items: .. versionchanged:: 3.2 Support for zero-padded files was added. + .. versionchanged:: 3.2 + Support for unseekable files was added. + .. function:: open(filename, mode='rb', compresslevel=9) diff --git a/Lib/gzip.py b/Lib/gzip.py index 83311cc0deb..3edc8395d68 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -45,6 +45,62 @@ def open(filename, mode="rb", compresslevel=9): """ return GzipFile(filename, mode, compresslevel) +class _PaddedFile: + """Minimal read-only file object that prepends a string to the contents + of an actual file. 
Shouldn't be used outside of gzip.py, as it lacks + essential functionality.""" + + def __init__(self, f, prepend=b''): + self._buffer = prepend + self._length = len(prepend) + self.file = f + self._read = 0 + + def read(self, size): + if self._read is None: + return self.file.read(size) + if self._read + size <= self._length: + read = self._read + self._read += size + return self._buffer[read:self._read] + else: + read = self._read + self._read = None + return self._buffer[read:] + \ + self.file.read(size-self._length+read) + + def prepend(self, prepend=b'', readprevious=False): + if self._read is None: + self._buffer = prepend + elif readprevious and len(prepend) <= self._read: + self._read -= len(prepend) + return + else: + self._buffer = self._buffer[read:] + prepend + self._length = len(self._buffer) + self._read = 0 + + def unused(self): + if self._read is None: + return b'' + return self._buffer[self._read:] + + def seek(self, offset, whence=0): + # This is only ever called with offset=whence=0 + if whence == 1 and self._read is not None: + if 0 <= offset + self._read <= self._length: + self._read += offset + return + else: + offset += self._length - self._read + self._read = None + self._buffer = None + return self.file.seek(offset, whence) + + def __getattr__(self, name): + return getattr(name, self.file) + + class GzipFile(io.BufferedIOBase): """The GzipFile class simulates most of the methods of a file object with the exception of the readinto() and truncate() methods. 
@@ -119,6 +175,7 @@ class GzipFile(io.BufferedIOBase): self.name = filename # Starts small, scales exponentially self.min_readsize = 100 + fileobj = _PaddedFile(fileobj) elif mode[0:1] == 'w' or mode[0:1] == 'a': self.mode = WRITE @@ -188,6 +245,9 @@ class GzipFile(io.BufferedIOBase): def _read_gzip_header(self): magic = self.fileobj.read(2) + if magic == b'': + raise EOFError("Reached EOF") + if magic != b'\037\213': raise IOError('Not a gzipped file') method = ord( self.fileobj.read(1) ) @@ -219,6 +279,11 @@ class GzipFile(io.BufferedIOBase): if flag & FHCRC: self.fileobj.read(2) # Read & discard the 16-bit header CRC + unused = self.fileobj.unused() + if unused: + uncompress = self.decompress.decompress(unused) + self._add_read_data(uncompress) + def write(self,data): if self.mode != WRITE: import errno @@ -282,16 +347,6 @@ class GzipFile(io.BufferedIOBase): if self._new_member: # If the _new_member flag is set, we have to # jump to the next member, if there is one. - # - # First, check if we're at the end of the file; - # if so, it's time to stop; no more members to read. - pos = self.fileobj.tell() # Save current position - self.fileobj.seek(0, 2) # Seek to end of file - if pos == self.fileobj.tell(): - raise EOFError("Reached EOF") - else: - self.fileobj.seek( pos ) # Return to original position - self._init_read() self._read_gzip_header() self.decompress = zlib.decompressobj(-zlib.MAX_WBITS) @@ -305,6 +360,9 @@ class GzipFile(io.BufferedIOBase): if buf == b"": uncompress = self.decompress.flush() + # Prepend the already read bytes to the fileobj so they can be + # seen by _read_eof() + self.fileobj.prepend(self.decompress.unused_data, True) self._read_eof() self._add_read_data( uncompress ) raise EOFError('Reached EOF') @@ -316,10 +374,9 @@ class GzipFile(io.BufferedIOBase): # Ending case: we've come to the end of a member in the file, # so seek back to the start of the unused data, finish up # this member, and read a new gzip header. 
- # (The number of bytes to seek back is the length of the unused - # data, minus 8 because _read_eof() will rewind a further 8 bytes) - self.fileobj.seek( -len(self.decompress.unused_data)+8, 1) - + # Prepend the already read bytes to the fileobj so they can be + # seen by _read_eof() and _read_gzip_header() + self.fileobj.prepend(self.decompress.unused_data, True) # Check the CRC and file size, and set the flag so we read # a new member on the next call self._read_eof() @@ -334,12 +391,10 @@ class GzipFile(io.BufferedIOBase): self.size = self.size + len(data) def _read_eof(self): - # We've read to the end of the file, so we have to rewind in order - # to reread the 8 bytes containing the CRC and the file size. + # We've read to the end of the file # We check the that the computed CRC and size of the # uncompressed data matches the stored values. Note that the size # stored is the true file size mod 2**32. - self.fileobj.seek(-8, 1) crc32 = read32(self.fileobj) isize = read32(self.fileobj) # may exceed 2GB if crc32 != self.crc: @@ -355,7 +410,7 @@ class GzipFile(io.BufferedIOBase): while c == b"\x00": c = self.fileobj.read(1) if c: - self.fileobj.seek(-1, 1) + self.fileobj.prepend(c, True) @property def closed(self): diff --git a/Lib/test/test_gzip.py b/Lib/test/test_gzip.py index a95af058a39..e49fe00802b 100644 --- a/Lib/test/test_gzip.py +++ b/Lib/test/test_gzip.py @@ -22,6 +22,17 @@ data2 = b"""/* zlibmodule.c -- gzip-compatible data compression */ """ +class UnseekableIO(io.BytesIO): + def seekable(self): + return False + + def tell(self): + raise io.UnsupportedOperation + + def seek(self, *args): + raise io.UnsupportedOperation + + class TestGzip(unittest.TestCase): filename = support.TESTFN @@ -265,6 +276,16 @@ class TestGzip(unittest.TestCase): d = f.read() self.assertEqual(d, data1 * 50, "Incorrect data in file") + def test_non_seekable_file(self): + uncompressed = data1 * 50 + buf = UnseekableIO() + with gzip.GzipFile(fileobj=buf, mode="wb") as f: + 
f.write(uncompressed) + compressed = buf.getvalue() + buf = UnseekableIO(compressed) + with gzip.GzipFile(fileobj=buf, mode="rb") as f: + self.assertEqual(f.read(), uncompressed) + # Testing compress/decompress shortcut functions def test_compress(self): diff --git a/Misc/ACKS b/Misc/ACKS index 4f2780ac242..5d12c1f6c7a 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -260,6 +260,7 @@ Bill Fancher Mark Favas Niels Ferguson Sebastian Fernandez +Florian Festi Vincent Fiack Tomer Filiba Jeffrey Finkelstein diff --git a/Misc/NEWS b/Misc/NEWS index 15b4da2eebb..514ea6d6a37 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -62,6 +62,9 @@ Core and Builtins Library ------- +- Issue #1675951: Allow GzipFile to work with unseekable file objects. + Patch by Florian Festi. + - Logging: Added QueueListener class to facilitate logging usage for performance-critical threads.