Related to SF patch 618135: gzip.py and files > 2G.

Fixed the signed/unsigned confusions when dealing with files >= 2GB.
4GB is still a hard limitation of the gzip file format, though.

Testing this was a bitch on Win98SE due to frequent system freezes.  It
didn't freeze while running gzip, it kept freezing while trying to *create*
a > 2GB test file!  This wasn't Python's doing.  I don't know of a
reasonable way to test this functionality in regrtest.py, so I'm not
checking in a test case (a test case would necessarily require creating
a 2GB+ file first, using gzip to zip it, using gzip to unzip it again,
and then comparing before-and-after; so >4GB free space would be required,
and a loooong time; I did all this "by hand" once).

Bugfix candidate, I guess.
This commit is contained in:
Tim Peters 2002-11-04 19:50:11 +00:00
parent 47ca2bc661
commit fb0ea525d5
2 changed files with 39 additions and 19 deletions

View File

@ -15,12 +15,21 @@ FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
READ, WRITE = 1, 2 READ, WRITE = 1, 2
def U32(i):
"""Return i as an unsigned integer, assuming it fits in 32 bits.
If it's >= 2GB when viewed as a 32-bit unsigned int, return a long.
"""
if i < 0:
i += 1L << 32
return i
def write32(output, value): def write32(output, value):
output.write(struct.pack("<l", value)) output.write(struct.pack("<l", value))
def write32u(output, value): def write32u(output, value):
if value < 0: # The L format writes the bit pattern correctly whether signed
value = value + 0x100000000L # or unsigned.
output.write(struct.pack("<L", value)) output.write(struct.pack("<L", value))
def read32(input): def read32(input):
@ -164,12 +173,14 @@ class GzipFile:
# Read and discard a null-terminated string containing the filename # Read and discard a null-terminated string containing the filename
while True: while True:
s = self.fileobj.read(1) s = self.fileobj.read(1)
if not s or s=='\000': break if not s or s=='\000':
break
if flag & FCOMMENT: if flag & FCOMMENT:
# Read and discard a null-terminated string containing a comment # Read and discard a null-terminated string containing a comment
while True: while True:
s = self.fileobj.read(1) s = self.fileobj.read(1)
if not s or s=='\000': break if not s or s=='\000':
break
if flag & FHCRC: if flag & FHCRC:
self.fileobj.read(2) # Read & discard the 16-bit header CRC self.fileobj.read(2) # Read & discard the 16-bit header CRC
@ -225,7 +236,8 @@ class GzipFile:
self.offset -= len(buf) self.offset -= len(buf)
def _read(self, size=1024): def _read(self, size=1024):
if self.fileobj is None: raise EOFError, "Reached EOF" if self.fileobj is None:
raise EOFError, "Reached EOF"
if self._new_member: if self._new_member:
# If the _new_member flag is set, we have to # If the _new_member flag is set, we have to
@ -286,8 +298,8 @@ class GzipFile:
# uncompressed data matches the stored values. # uncompressed data matches the stored values.
self.fileobj.seek(-8, 1) self.fileobj.seek(-8, 1)
crc32 = read32(self.fileobj) crc32 = read32(self.fileobj)
isize = read32(self.fileobj) isize = U32(read32(self.fileobj)) # may exceed 2GB
if crc32%0x100000000L != self.crc%0x100000000L: if U32(crc32) != U32(self.crc):
raise ValueError, "CRC check failed" raise ValueError, "CRC check failed"
elif isize != self.size: elif isize != self.size:
raise ValueError, "Incorrect length of data produced" raise ValueError, "Incorrect length of data produced"
@ -296,7 +308,8 @@ class GzipFile:
if self.mode == WRITE: if self.mode == WRITE:
self.fileobj.write(self.compress.flush()) self.fileobj.write(self.compress.flush())
write32(self.fileobj, self.crc) write32(self.fileobj, self.crc)
write32(self.fileobj, self.size) # self.size may exceed 2GB
write32u(self.fileobj, self.size)
self.fileobj = None self.fileobj = None
elif self.mode == READ: elif self.mode == READ:
self.fileobj = None self.fileobj = None
@ -338,7 +351,7 @@ class GzipFile:
if offset < self.offset: if offset < self.offset:
raise IOError('Negative seek in write mode') raise IOError('Negative seek in write mode')
count = offset - self.offset count = offset - self.offset
for i in range(count/1024): for i in range(count // 1024):
self.write(1024 * '\0') self.write(1024 * '\0')
self.write((count % 1024) * '\0') self.write((count % 1024) * '\0')
elif self.mode == READ: elif self.mode == READ:
@ -346,7 +359,8 @@ class GzipFile:
# for negative seek, rewind and do positive seek # for negative seek, rewind and do positive seek
self.rewind() self.rewind()
count = offset - self.offset count = offset - self.offset
for i in range(count/1024): self.read(1024) for i in range(count // 1024):
self.read(1024)
self.read(count % 1024) self.read(count % 1024)
def readline(self, size=-1): def readline(self, size=-1):
@ -379,11 +393,13 @@ class GzipFile:
def readlines(self, sizehint=0): def readlines(self, sizehint=0):
# Negative numbers result in reading all the lines # Negative numbers result in reading all the lines
if sizehint <= 0: sizehint = sys.maxint if sizehint <= 0:
sizehint = sys.maxint
L = [] L = []
while sizehint > 0: while sizehint > 0:
line = self.readline() line = self.readline()
if line == "": break if line == "":
break
L.append(line) L.append(line)
sizehint = sizehint - len(line) sizehint = sizehint - len(line)

View File

@ -355,6 +355,10 @@ Extension modules
Library Library
------- -------
- gzip.py now handles files exceeding 2GB. Note that 4GB is still a
fundamental limitation of the underlying gzip file format (it only
has 32 bits to record the file size).
- xml.sax.saxutils.unescape has been added, to replace entity references - xml.sax.saxutils.unescape has been added, to replace entity references
with their entity value. with their entity value.