mirror of https://github.com/python/cpython
Issue #1675951: Allow GzipFile to work with unseekable file objects.
Patch by Florian Festi.
This commit is contained in:
parent
dda7fdf194
commit
7b9698435d
|
@ -74,6 +74,9 @@ The module defines the following items:
|
|||
.. versionchanged:: 3.2
|
||||
Support for zero-padded files was added.
|
||||
|
||||
.. versionchanged:: 3.2
|
||||
Support for unseekable files was added.
|
||||
|
||||
|
||||
.. function:: open(filename, mode='rb', compresslevel=9)
|
||||
|
||||
|
|
91
Lib/gzip.py
91
Lib/gzip.py
|
@ -45,6 +45,62 @@ def open(filename, mode="rb", compresslevel=9):
|
|||
"""
|
||||
return GzipFile(filename, mode, compresslevel)
|
||||
|
||||
class _PaddedFile:
|
||||
"""Minimal read-only file object that prepends a string to the contents
|
||||
of an actual file. Shouldn't be used outside of gzip.py, as it lacks
|
||||
essential functionality."""
|
||||
|
||||
def __init__(self, f, prepend=b''):
|
||||
self._buffer = prepend
|
||||
self._length = len(prepend)
|
||||
self.file = f
|
||||
self._read = 0
|
||||
|
||||
def read(self, size):
|
||||
if self._read is None:
|
||||
return self.file.read(size)
|
||||
if self._read + size <= self._length:
|
||||
read = self._read
|
||||
self._read += size
|
||||
return self._buffer[read:self._read]
|
||||
else:
|
||||
read = self._read
|
||||
self._read = None
|
||||
return self._buffer[read:] + \
|
||||
self.file.read(size-self._length+read)
|
||||
|
||||
def prepend(self, prepend=b'', readprevious=False):
|
||||
if self._read is None:
|
||||
self._buffer = prepend
|
||||
elif readprevious and len(prepend) <= self._read:
|
||||
self._read -= len(prepend)
|
||||
return
|
||||
else:
|
||||
self._buffer = self._buffer[read:] + prepend
|
||||
self._length = len(self._buffer)
|
||||
self._read = 0
|
||||
|
||||
def unused(self):
|
||||
if self._read is None:
|
||||
return b''
|
||||
return self._buffer[self._read:]
|
||||
|
||||
def seek(self, offset, whence=0):
|
||||
# This is only ever called with offset=whence=0
|
||||
if whence == 1 and self._read is not None:
|
||||
if 0 <= offset + self._read <= self._length:
|
||||
self._read += offset
|
||||
return
|
||||
else:
|
||||
offset += self._length - self._read
|
||||
self._read = None
|
||||
self._buffer = None
|
||||
return self.file.seek(offset, whence)
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(name, self.file)
|
||||
|
||||
|
||||
class GzipFile(io.BufferedIOBase):
|
||||
"""The GzipFile class simulates most of the methods of a file object with
|
||||
the exception of the readinto() and truncate() methods.
|
||||
|
@ -119,6 +175,7 @@ class GzipFile(io.BufferedIOBase):
|
|||
self.name = filename
|
||||
# Starts small, scales exponentially
|
||||
self.min_readsize = 100
|
||||
fileobj = _PaddedFile(fileobj)
|
||||
|
||||
elif mode[0:1] == 'w' or mode[0:1] == 'a':
|
||||
self.mode = WRITE
|
||||
|
@ -188,6 +245,9 @@ class GzipFile(io.BufferedIOBase):
|
|||
|
||||
def _read_gzip_header(self):
|
||||
magic = self.fileobj.read(2)
|
||||
if magic == b'':
|
||||
raise EOFError("Reached EOF")
|
||||
|
||||
if magic != b'\037\213':
|
||||
raise IOError('Not a gzipped file')
|
||||
method = ord( self.fileobj.read(1) )
|
||||
|
@ -219,6 +279,11 @@ class GzipFile(io.BufferedIOBase):
|
|||
if flag & FHCRC:
|
||||
self.fileobj.read(2) # Read & discard the 16-bit header CRC
|
||||
|
||||
unused = self.fileobj.unused()
|
||||
if unused:
|
||||
uncompress = self.decompress.decompress(unused)
|
||||
self._add_read_data(uncompress)
|
||||
|
||||
def write(self,data):
|
||||
if self.mode != WRITE:
|
||||
import errno
|
||||
|
@ -282,16 +347,6 @@ class GzipFile(io.BufferedIOBase):
|
|||
if self._new_member:
|
||||
# If the _new_member flag is set, we have to
|
||||
# jump to the next member, if there is one.
|
||||
#
|
||||
# First, check if we're at the end of the file;
|
||||
# if so, it's time to stop; no more members to read.
|
||||
pos = self.fileobj.tell() # Save current position
|
||||
self.fileobj.seek(0, 2) # Seek to end of file
|
||||
if pos == self.fileobj.tell():
|
||||
raise EOFError("Reached EOF")
|
||||
else:
|
||||
self.fileobj.seek( pos ) # Return to original position
|
||||
|
||||
self._init_read()
|
||||
self._read_gzip_header()
|
||||
self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
|
||||
|
@ -305,6 +360,9 @@ class GzipFile(io.BufferedIOBase):
|
|||
|
||||
if buf == b"":
|
||||
uncompress = self.decompress.flush()
|
||||
# Prepend the already read bytes to the fileobj to they can be
|
||||
# seen by _read_eof()
|
||||
self.fileobj.prepend(self.decompress.unused_data, True)
|
||||
self._read_eof()
|
||||
self._add_read_data( uncompress )
|
||||
raise EOFError('Reached EOF')
|
||||
|
@ -316,10 +374,9 @@ class GzipFile(io.BufferedIOBase):
|
|||
# Ending case: we've come to the end of a member in the file,
|
||||
# so seek back to the start of the unused data, finish up
|
||||
# this member, and read a new gzip header.
|
||||
# (The number of bytes to seek back is the length of the unused
|
||||
# data, minus 8 because _read_eof() will rewind a further 8 bytes)
|
||||
self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
|
||||
|
||||
# Prepend the already read bytes to the fileobj to they can be
|
||||
# seen by _read_eof() and _read_gzip_header()
|
||||
self.fileobj.prepend(self.decompress.unused_data, True)
|
||||
# Check the CRC and file size, and set the flag so we read
|
||||
# a new member on the next call
|
||||
self._read_eof()
|
||||
|
@ -334,12 +391,10 @@ class GzipFile(io.BufferedIOBase):
|
|||
self.size = self.size + len(data)
|
||||
|
||||
def _read_eof(self):
|
||||
# We've read to the end of the file, so we have to rewind in order
|
||||
# to reread the 8 bytes containing the CRC and the file size.
|
||||
# We've read to the end of the file
|
||||
# We check the that the computed CRC and size of the
|
||||
# uncompressed data matches the stored values. Note that the size
|
||||
# stored is the true file size mod 2**32.
|
||||
self.fileobj.seek(-8, 1)
|
||||
crc32 = read32(self.fileobj)
|
||||
isize = read32(self.fileobj) # may exceed 2GB
|
||||
if crc32 != self.crc:
|
||||
|
@ -355,7 +410,7 @@ class GzipFile(io.BufferedIOBase):
|
|||
while c == b"\x00":
|
||||
c = self.fileobj.read(1)
|
||||
if c:
|
||||
self.fileobj.seek(-1, 1)
|
||||
self.fileobj.prepend(c, True)
|
||||
|
||||
@property
|
||||
def closed(self):
|
||||
|
|
|
@ -22,6 +22,17 @@ data2 = b"""/* zlibmodule.c -- gzip-compatible data compression */
|
|||
"""
|
||||
|
||||
|
||||
class UnseekableIO(io.BytesIO):
|
||||
def seekable(self):
|
||||
return False
|
||||
|
||||
def tell(self):
|
||||
raise io.UnsupportedOperation
|
||||
|
||||
def seek(self, *args):
|
||||
raise io.UnsupportedOperation
|
||||
|
||||
|
||||
class TestGzip(unittest.TestCase):
|
||||
filename = support.TESTFN
|
||||
|
||||
|
@ -265,6 +276,16 @@ class TestGzip(unittest.TestCase):
|
|||
d = f.read()
|
||||
self.assertEqual(d, data1 * 50, "Incorrect data in file")
|
||||
|
||||
def test_non_seekable_file(self):
|
||||
uncompressed = data1 * 50
|
||||
buf = UnseekableIO()
|
||||
with gzip.GzipFile(fileobj=buf, mode="wb") as f:
|
||||
f.write(uncompressed)
|
||||
compressed = buf.getvalue()
|
||||
buf = UnseekableIO(compressed)
|
||||
with gzip.GzipFile(fileobj=buf, mode="rb") as f:
|
||||
self.assertEqual(f.read(), uncompressed)
|
||||
|
||||
# Testing compress/decompress shortcut functions
|
||||
|
||||
def test_compress(self):
|
||||
|
|
|
@ -260,6 +260,7 @@ Bill Fancher
|
|||
Mark Favas
|
||||
Niels Ferguson
|
||||
Sebastian Fernandez
|
||||
Florian Festi
|
||||
Vincent Fiack
|
||||
Tomer Filiba
|
||||
Jeffrey Finkelstein
|
||||
|
|
Loading…
Reference in New Issue