Related to SF patch 618135: gzip.py and files > 2G.

Fixed the signed/unsigned confusions when dealing with files >= 2GB. 4GB is still a hard limitation of the gzip file format, though. Testing this was a bitch on Win98SE due to frequent system freezes. It didn't freeze while running gzip, it kept freezing while trying to *create* a > 2GB test file! This wasn't Python's doing. I don't know of a reasonable way to test this functionality in regrtest.py, so I'm not checking in a test case (a test case would necessarily require creating a 2GB+ file first, using gzip to zip it, using gzip to unzip it again, and then compare before-and-after; so >4GB free space would be required, and a loooong time; I did all this "by hand" once). Bugfix candidate, I guess.
2002-11-04 19:50:11 +00:00 · 2002-11-04 19:50:11 +00:00 · fb0ea525d5
parent 47ca2bc661
commit fb0ea525d5
2 changed files with 39 additions and 19 deletions
--- a/Lib/gzip.py
+++ b/Lib/gzip.py
@ -15,12 +15,21 @@ FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

 READ, WRITE = 1, 2

+def U32(i):
+    """Return i as an unsigned integer, assuming it fits in 32 bits.
+
+    If it's >= 2GB when viewed as a 32-bit unsigned int, return a long.
+    """
+    if i < 0:
+        i += 1L << 32
+    return i
+
 def write32(output, value):
    output.write(struct.pack("<l", value))

 def write32u(output, value):
-    if value < 0:
-        value = value + 0x100000000L
+    # The L format writes the bit pattern correctly whether signed
+    # or unsigned.
    output.write(struct.pack("<L", value))

 def read32(input):
@ -157,19 +166,21 @@ class GzipFile:

        if flag & FEXTRA:
            # Read & discard the extra field, if present
-            xlen=ord(self.fileobj.read(1))
-            xlen=xlen+256*ord(self.fileobj.read(1))
+            xlen = ord(self.fileobj.read(1))
+            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
-                s=self.fileobj.read(1)
-                if not s or s=='\000': break
+                s = self.fileobj.read(1)
+                if not s or s=='\000':
+                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
-                s=self.fileobj.read(1)
-                if not s or s=='\000': break
+                s = self.fileobj.read(1)
+                if not s or s=='\000':
+                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

@ -225,7 +236,8 @@ class GzipFile:
        self.offset -= len(buf)

    def _read(self, size=1024):
-        if self.fileobj is None: raise EOFError, "Reached EOF"
+        if self.fileobj is None:
+            raise EOFError, "Reached EOF"

        if self._new_member:
            # If the _new_member flag is set, we have to
@ -286,8 +298,8 @@ class GzipFile:
        # uncompressed data matches the stored values.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
-        isize = read32(self.fileobj)
-        if crc32%0x100000000L != self.crc%0x100000000L:
+        isize = U32(read32(self.fileobj))   # may exceed 2GB
+        if U32(crc32) != U32(self.crc):
            raise ValueError, "CRC check failed"
        elif isize != self.size:
            raise ValueError, "Incorrect length of data produced"
@ -296,7 +308,8 @@ class GzipFile:
        if self.mode == WRITE:
            self.fileobj.write(self.compress.flush())
            write32(self.fileobj, self.crc)
-            write32(self.fileobj, self.size)
+            # self.size may exceed 2GB
+            write32u(self.fileobj, self.size)
            self.fileobj = None
        elif self.mode == READ:
            self.fileobj = None
@ -338,15 +351,16 @@ class GzipFile:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
-            for i in range(count/1024):
-                self.write(1024*'\0')
-            self.write((count%1024)*'\0')
+            for i in range(count // 1024):
+                self.write(1024 * '\0')
+            self.write((count % 1024) * '\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
-            for i in range(count/1024): self.read(1024)
+            for i in range(count // 1024):
+                self.read(1024)
            self.read(count % 1024)

    def readline(self, size=-1):
@ -379,11 +393,13 @@ class GzipFile:

    def readlines(self, sizehint=0):
        # Negative numbers result in reading all the lines
-        if sizehint <= 0: sizehint = sys.maxint
+        if sizehint <= 0:
+            sizehint = sys.maxint
        L = []
        while sizehint > 0:
            line = self.readline()
-            if line == "": break
+            if line == "":
+                break
            L.append(line)
            sizehint = sizehint - len(line)

--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -355,6 +355,10 @@ Extension modules
 Library
 -------

+- gzip.py now handles files exceeding 2GB.  Note that 4GB is still a
+  fundamental limitation of the underlying gzip file format (it only
+  has 32 bits to record the file size).
+
 - xml.sax.saxutils.unescape has been added, to replace entity references
  with their entity value.