Issue #1675951: Allow GzipFile to work with unseekable file objects.

Patch by Florian Festi.
This commit is contained in:
Antoine Pitrou 2010-09-23 16:22:51 +00:00
parent dda7fdf194
commit 7b9698435d
5 changed files with 101 additions and 18 deletions

View File

@ -74,6 +74,9 @@ The module defines the following items:
.. versionchanged:: 3.2 .. versionchanged:: 3.2
Support for zero-padded files was added. Support for zero-padded files was added.
.. versionchanged:: 3.2
Support for unseekable files was added.
.. function:: open(filename, mode='rb', compresslevel=9) .. function:: open(filename, mode='rb', compresslevel=9)

View File

@ -45,6 +45,62 @@ def open(filename, mode="rb", compresslevel=9):
""" """
return GzipFile(filename, mode, compresslevel) return GzipFile(filename, mode, compresslevel)
class _PaddedFile:
"""Minimal read-only file object that prepends a string to the contents
of an actual file. Shouldn't be used outside of gzip.py, as it lacks
essential functionality."""
def __init__(self, f, prepend=b''):
self._buffer = prepend
self._length = len(prepend)
self.file = f
self._read = 0
def read(self, size):
if self._read is None:
return self.file.read(size)
if self._read + size <= self._length:
read = self._read
self._read += size
return self._buffer[read:self._read]
else:
read = self._read
self._read = None
return self._buffer[read:] + \
self.file.read(size-self._length+read)
def prepend(self, prepend=b'', readprevious=False):
if self._read is None:
self._buffer = prepend
elif readprevious and len(prepend) <= self._read:
self._read -= len(prepend)
return
else:
self._buffer = self._buffer[read:] + prepend
self._length = len(self._buffer)
self._read = 0
def unused(self):
if self._read is None:
return b''
return self._buffer[self._read:]
def seek(self, offset, whence=0):
# This is only ever called with offset=whence=0
if whence == 1 and self._read is not None:
if 0 <= offset + self._read <= self._length:
self._read += offset
return
else:
offset += self._length - self._read
self._read = None
self._buffer = None
return self.file.seek(offset, whence)
def __getattr__(self, name):
return getattr(name, self.file)
class GzipFile(io.BufferedIOBase): class GzipFile(io.BufferedIOBase):
"""The GzipFile class simulates most of the methods of a file object with """The GzipFile class simulates most of the methods of a file object with
the exception of the readinto() and truncate() methods. the exception of the readinto() and truncate() methods.
@ -119,6 +175,7 @@ class GzipFile(io.BufferedIOBase):
self.name = filename self.name = filename
# Starts small, scales exponentially # Starts small, scales exponentially
self.min_readsize = 100 self.min_readsize = 100
fileobj = _PaddedFile(fileobj)
elif mode[0:1] == 'w' or mode[0:1] == 'a': elif mode[0:1] == 'w' or mode[0:1] == 'a':
self.mode = WRITE self.mode = WRITE
@ -188,6 +245,9 @@ class GzipFile(io.BufferedIOBase):
def _read_gzip_header(self): def _read_gzip_header(self):
magic = self.fileobj.read(2) magic = self.fileobj.read(2)
if magic == b'':
raise EOFError("Reached EOF")
if magic != b'\037\213': if magic != b'\037\213':
raise IOError('Not a gzipped file') raise IOError('Not a gzipped file')
method = ord( self.fileobj.read(1) ) method = ord( self.fileobj.read(1) )
@ -219,6 +279,11 @@ class GzipFile(io.BufferedIOBase):
if flag & FHCRC: if flag & FHCRC:
self.fileobj.read(2) # Read & discard the 16-bit header CRC self.fileobj.read(2) # Read & discard the 16-bit header CRC
unused = self.fileobj.unused()
if unused:
uncompress = self.decompress.decompress(unused)
self._add_read_data(uncompress)
def write(self,data): def write(self,data):
if self.mode != WRITE: if self.mode != WRITE:
import errno import errno
@ -282,16 +347,6 @@ class GzipFile(io.BufferedIOBase):
if self._new_member: if self._new_member:
# If the _new_member flag is set, we have to # If the _new_member flag is set, we have to
# jump to the next member, if there is one. # jump to the next member, if there is one.
#
# First, check if we're at the end of the file;
# if so, it's time to stop; no more members to read.
pos = self.fileobj.tell() # Save current position
self.fileobj.seek(0, 2) # Seek to end of file
if pos == self.fileobj.tell():
raise EOFError("Reached EOF")
else:
self.fileobj.seek( pos ) # Return to original position
self._init_read() self._init_read()
self._read_gzip_header() self._read_gzip_header()
self.decompress = zlib.decompressobj(-zlib.MAX_WBITS) self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
@ -305,6 +360,9 @@ class GzipFile(io.BufferedIOBase):
if buf == b"": if buf == b"":
uncompress = self.decompress.flush() uncompress = self.decompress.flush()
# Prepend the already read bytes to the fileobj to they can be
# seen by _read_eof()
self.fileobj.prepend(self.decompress.unused_data, True)
self._read_eof() self._read_eof()
self._add_read_data( uncompress ) self._add_read_data( uncompress )
raise EOFError('Reached EOF') raise EOFError('Reached EOF')
@ -316,10 +374,9 @@ class GzipFile(io.BufferedIOBase):
# Ending case: we've come to the end of a member in the file, # Ending case: we've come to the end of a member in the file,
# so seek back to the start of the unused data, finish up # so seek back to the start of the unused data, finish up
# this member, and read a new gzip header. # this member, and read a new gzip header.
# (The number of bytes to seek back is the length of the unused # Prepend the already read bytes to the fileobj to they can be
# data, minus 8 because _read_eof() will rewind a further 8 bytes) # seen by _read_eof() and _read_gzip_header()
self.fileobj.seek( -len(self.decompress.unused_data)+8, 1) self.fileobj.prepend(self.decompress.unused_data, True)
# Check the CRC and file size, and set the flag so we read # Check the CRC and file size, and set the flag so we read
# a new member on the next call # a new member on the next call
self._read_eof() self._read_eof()
@ -334,12 +391,10 @@ class GzipFile(io.BufferedIOBase):
self.size = self.size + len(data) self.size = self.size + len(data)
def _read_eof(self): def _read_eof(self):
# We've read to the end of the file, so we have to rewind in order # We've read to the end of the file
# to reread the 8 bytes containing the CRC and the file size.
# We check the that the computed CRC and size of the # We check the that the computed CRC and size of the
# uncompressed data matches the stored values. Note that the size # uncompressed data matches the stored values. Note that the size
# stored is the true file size mod 2**32. # stored is the true file size mod 2**32.
self.fileobj.seek(-8, 1)
crc32 = read32(self.fileobj) crc32 = read32(self.fileobj)
isize = read32(self.fileobj) # may exceed 2GB isize = read32(self.fileobj) # may exceed 2GB
if crc32 != self.crc: if crc32 != self.crc:
@ -355,7 +410,7 @@ class GzipFile(io.BufferedIOBase):
while c == b"\x00": while c == b"\x00":
c = self.fileobj.read(1) c = self.fileobj.read(1)
if c: if c:
self.fileobj.seek(-1, 1) self.fileobj.prepend(c, True)
@property @property
def closed(self): def closed(self):

View File

@ -22,6 +22,17 @@ data2 = b"""/* zlibmodule.c -- gzip-compatible data compression */
""" """
class UnseekableIO(io.BytesIO):
def seekable(self):
return False
def tell(self):
raise io.UnsupportedOperation
def seek(self, *args):
raise io.UnsupportedOperation
class TestGzip(unittest.TestCase): class TestGzip(unittest.TestCase):
filename = support.TESTFN filename = support.TESTFN
@ -265,6 +276,16 @@ class TestGzip(unittest.TestCase):
d = f.read() d = f.read()
self.assertEqual(d, data1 * 50, "Incorrect data in file") self.assertEqual(d, data1 * 50, "Incorrect data in file")
def test_non_seekable_file(self):
uncompressed = data1 * 50
buf = UnseekableIO()
with gzip.GzipFile(fileobj=buf, mode="wb") as f:
f.write(uncompressed)
compressed = buf.getvalue()
buf = UnseekableIO(compressed)
with gzip.GzipFile(fileobj=buf, mode="rb") as f:
self.assertEqual(f.read(), uncompressed)
# Testing compress/decompress shortcut functions # Testing compress/decompress shortcut functions
def test_compress(self): def test_compress(self):

View File

@ -260,6 +260,7 @@ Bill Fancher
Mark Favas Mark Favas
Niels Ferguson Niels Ferguson
Sebastian Fernandez Sebastian Fernandez
Florian Festi
Vincent Fiack Vincent Fiack
Tomer Filiba Tomer Filiba
Jeffrey Finkelstein Jeffrey Finkelstein

View File

@ -62,6 +62,9 @@ Core and Builtins
Library Library
------- -------
- Issue #1675951: Allow GzipFile to work with unseekable file objects.
Patch by Florian Festi.
- Logging: Added QueueListener class to facilitate logging usage for - Logging: Added QueueListener class to facilitate logging usage for
performance-critical threads. performance-critical threads.