Related to SF patch 618135: gzip.py and files > 2G.
Fixed the signed/unsigned confusion when dealing with files >= 2GB. 4GB is still a hard limitation of the gzip file format, though. Testing this was painful on Win98SE due to frequent system freezes: it didn't freeze while running gzip, it kept freezing while trying to *create* a > 2GB test file! This wasn't Python's doing. I don't know of a reasonable way to test this functionality in regrtest.py, so I'm not checking in a test case (a test case would necessarily require creating a 2GB+ file first, using gzip to zip it, using gzip to unzip it again, and then comparing before-and-after; so >4GB free space would be required, and a loooong time; I did all this "by hand" once). Bugfix candidate, I guess.
This commit is contained in:
parent
47ca2bc661
commit
fb0ea525d5
40
Lib/gzip.py
40
Lib/gzip.py
|
@ -15,12 +15,21 @@ FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
|
||||||
|
|
||||||
READ, WRITE = 1, 2
|
READ, WRITE = 1, 2
|
||||||
|
|
||||||
|
def U32(i):
|
||||||
|
"""Return i as an unsigned integer, assuming it fits in 32 bits.
|
||||||
|
|
||||||
|
If it's >= 2GB when viewed as a 32-bit unsigned int, return a long.
|
||||||
|
"""
|
||||||
|
if i < 0:
|
||||||
|
i += 1L << 32
|
||||||
|
return i
|
||||||
|
|
||||||
def write32(output, value):
|
def write32(output, value):
|
||||||
output.write(struct.pack("<l", value))
|
output.write(struct.pack("<l", value))
|
||||||
|
|
||||||
def write32u(output, value):
|
def write32u(output, value):
|
||||||
if value < 0:
|
# The L format writes the bit pattern correctly whether signed
|
||||||
value = value + 0x100000000L
|
# or unsigned.
|
||||||
output.write(struct.pack("<L", value))
|
output.write(struct.pack("<L", value))
|
||||||
|
|
||||||
def read32(input):
|
def read32(input):
|
||||||
|
@ -164,12 +173,14 @@ class GzipFile:
|
||||||
# Read and discard a null-terminated string containing the filename
|
# Read and discard a null-terminated string containing the filename
|
||||||
while True:
|
while True:
|
||||||
s = self.fileobj.read(1)
|
s = self.fileobj.read(1)
|
||||||
if not s or s=='\000': break
|
if not s or s=='\000':
|
||||||
|
break
|
||||||
if flag & FCOMMENT:
|
if flag & FCOMMENT:
|
||||||
# Read and discard a null-terminated string containing a comment
|
# Read and discard a null-terminated string containing a comment
|
||||||
while True:
|
while True:
|
||||||
s = self.fileobj.read(1)
|
s = self.fileobj.read(1)
|
||||||
if not s or s=='\000': break
|
if not s or s=='\000':
|
||||||
|
break
|
||||||
if flag & FHCRC:
|
if flag & FHCRC:
|
||||||
self.fileobj.read(2) # Read & discard the 16-bit header CRC
|
self.fileobj.read(2) # Read & discard the 16-bit header CRC
|
||||||
|
|
||||||
|
@ -225,7 +236,8 @@ class GzipFile:
|
||||||
self.offset -= len(buf)
|
self.offset -= len(buf)
|
||||||
|
|
||||||
def _read(self, size=1024):
|
def _read(self, size=1024):
|
||||||
if self.fileobj is None: raise EOFError, "Reached EOF"
|
if self.fileobj is None:
|
||||||
|
raise EOFError, "Reached EOF"
|
||||||
|
|
||||||
if self._new_member:
|
if self._new_member:
|
||||||
# If the _new_member flag is set, we have to
|
# If the _new_member flag is set, we have to
|
||||||
|
@ -286,8 +298,8 @@ class GzipFile:
|
||||||
# uncompressed data matches the stored values.
|
# uncompressed data matches the stored values.
|
||||||
self.fileobj.seek(-8, 1)
|
self.fileobj.seek(-8, 1)
|
||||||
crc32 = read32(self.fileobj)
|
crc32 = read32(self.fileobj)
|
||||||
isize = read32(self.fileobj)
|
isize = U32(read32(self.fileobj)) # may exceed 2GB
|
||||||
if crc32%0x100000000L != self.crc%0x100000000L:
|
if U32(crc32) != U32(self.crc):
|
||||||
raise ValueError, "CRC check failed"
|
raise ValueError, "CRC check failed"
|
||||||
elif isize != self.size:
|
elif isize != self.size:
|
||||||
raise ValueError, "Incorrect length of data produced"
|
raise ValueError, "Incorrect length of data produced"
|
||||||
|
@ -296,7 +308,8 @@ class GzipFile:
|
||||||
if self.mode == WRITE:
|
if self.mode == WRITE:
|
||||||
self.fileobj.write(self.compress.flush())
|
self.fileobj.write(self.compress.flush())
|
||||||
write32(self.fileobj, self.crc)
|
write32(self.fileobj, self.crc)
|
||||||
write32(self.fileobj, self.size)
|
# self.size may exceed 2GB
|
||||||
|
write32u(self.fileobj, self.size)
|
||||||
self.fileobj = None
|
self.fileobj = None
|
||||||
elif self.mode == READ:
|
elif self.mode == READ:
|
||||||
self.fileobj = None
|
self.fileobj = None
|
||||||
|
@ -338,7 +351,7 @@ class GzipFile:
|
||||||
if offset < self.offset:
|
if offset < self.offset:
|
||||||
raise IOError('Negative seek in write mode')
|
raise IOError('Negative seek in write mode')
|
||||||
count = offset - self.offset
|
count = offset - self.offset
|
||||||
for i in range(count/1024):
|
for i in range(count // 1024):
|
||||||
self.write(1024 * '\0')
|
self.write(1024 * '\0')
|
||||||
self.write((count % 1024) * '\0')
|
self.write((count % 1024) * '\0')
|
||||||
elif self.mode == READ:
|
elif self.mode == READ:
|
||||||
|
@ -346,7 +359,8 @@ class GzipFile:
|
||||||
# for negative seek, rewind and do positive seek
|
# for negative seek, rewind and do positive seek
|
||||||
self.rewind()
|
self.rewind()
|
||||||
count = offset - self.offset
|
count = offset - self.offset
|
||||||
for i in range(count/1024): self.read(1024)
|
for i in range(count // 1024):
|
||||||
|
self.read(1024)
|
||||||
self.read(count % 1024)
|
self.read(count % 1024)
|
||||||
|
|
||||||
def readline(self, size=-1):
|
def readline(self, size=-1):
|
||||||
|
@ -379,11 +393,13 @@ class GzipFile:
|
||||||
|
|
||||||
def readlines(self, sizehint=0):
|
def readlines(self, sizehint=0):
|
||||||
# Negative numbers result in reading all the lines
|
# Negative numbers result in reading all the lines
|
||||||
if sizehint <= 0: sizehint = sys.maxint
|
if sizehint <= 0:
|
||||||
|
sizehint = sys.maxint
|
||||||
L = []
|
L = []
|
||||||
while sizehint > 0:
|
while sizehint > 0:
|
||||||
line = self.readline()
|
line = self.readline()
|
||||||
if line == "": break
|
if line == "":
|
||||||
|
break
|
||||||
L.append(line)
|
L.append(line)
|
||||||
sizehint = sizehint - len(line)
|
sizehint = sizehint - len(line)
|
||||||
|
|
||||||
|
|
|
@ -355,6 +355,10 @@ Extension modules
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- gzip.py now handles files exceeding 2GB. Note that 4GB is still a
|
||||||
|
fundamental limitation of the underlying gzip file format (it only
|
||||||
|
has 32 bits to record the file size).
|
||||||
|
|
||||||
- xml.sax.saxutils.unescape has been added, to replace entity references
|
- xml.sax.saxutils.unescape has been added, to replace entity references
|
||||||
with their entity value.
|
with their entity value.
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue