From 9cbdd75ec5deda8f55edd7caab42dff65d009da2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lars=20Gust=C3=A4bel?= Date: Fri, 29 Oct 2010 09:08:19 +0000 Subject: [PATCH] Add read support for all missing variants of the GNU sparse extensions. Thus, in addition to GNUTYPE_SPARSE headers, sparse information in pax headers created by GNU tar can now be decoded. All three formats 0.0, 0.1 and 1.0 are supported. On filesystems that support this, holes in files are now restored whenever a sparse member is extracted. --- Doc/library/tarfile.rst | 3 +- Lib/tarfile.py | 231 +++++++++++++++++++-------------------- Lib/test/test_tarfile.py | 70 ++++++++++-- Lib/test/testtar.tar | Bin 298496 -> 427008 bytes Misc/NEWS | 3 + 5 files changed, 180 insertions(+), 127 deletions(-) diff --git a/Doc/library/tarfile.rst b/Doc/library/tarfile.rst index 853406c2194..0dfb065252a 100644 --- a/Doc/library/tarfile.rst +++ b/Doc/library/tarfile.rst @@ -20,7 +20,8 @@ Some facts and figures: * read/write support for the POSIX.1-1988 (ustar) format. * read/write support for the GNU tar format including *longname* and *longlink* - extensions, read-only support for the *sparse* extension. + extensions, read-only support for all variants of the *sparse* extension + including restoration of sparse files. * read/write support for the POSIX.1-2001 (pax) format. diff --git a/Lib/tarfile.py b/Lib/tarfile.py index cc7514d0a63..e33b9820812 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -701,13 +701,29 @@ class _FileInFile(object): object. """ - def __init__(self, fileobj, offset, size, sparse=None): + def __init__(self, fileobj, offset, size, blockinfo=None): self.fileobj = fileobj self.offset = offset self.size = size - self.sparse = sparse self.position = 0 + if blockinfo is None: + blockinfo = [(0, size)] + + # Construct a map with data and zero blocks. 
+ self.map_index = 0 + self.map = [] + lastpos = 0 + realpos = self.offset + for offset, size in blockinfo: + if offset > lastpos: + self.map.append((False, lastpos, offset, None)) + self.map.append((True, offset, offset + size, realpos)) + realpos += size + lastpos = offset + size + if lastpos < self.size: + self.map.append((False, lastpos, self.size, None)) + def seekable(self): if not hasattr(self.fileobj, "seekable"): # XXX gzip.GzipFile and bz2.BZ2File @@ -732,48 +748,26 @@ class _FileInFile(object): else: size = min(size, self.size - self.position) - if self.sparse is None: - return self.readnormal(size) - else: - return self.readsparse(size) - - def readnormal(self, size): - """Read operation for regular files. - """ - self.fileobj.seek(self.offset + self.position) - self.position += size - return self.fileobj.read(size) - - def readsparse(self, size): - """Read operation for sparse files. - """ - data = b"" + buf = b"" while size > 0: - buf = self.readsparsesection(size) - if not buf: - break - size -= len(buf) - data += buf - return data - - def readsparsesection(self, size): - """Read a single section of a sparse file. 
- """ - section = self.sparse.find(self.position) - - if section is None: - return b"" - - size = min(size, section.offset + section.size - self.position) - - if isinstance(section, _data): - realpos = section.realpos + self.position - section.offset - self.fileobj.seek(self.offset + realpos) - self.position += size - return self.fileobj.read(size) - else: - self.position += size - return NUL * size + while True: + data, start, stop, offset = self.map[self.map_index] + if start <= self.position < stop: + break + else: + self.map_index += 1 + if self.map_index == len(self.map): + self.map_index = 0 + length = min(size, stop - self.position) + if data: + self.fileobj.seek(offset) + block = self.fileobj.read(stop - start) + buf += block[self.position - start:self.position + length] + else: + buf += NUL * length + size -= length + self.position += length + return buf #class _FileInFile @@ -1367,28 +1361,15 @@ class TarInfo(object): numbytes = nti(buf[pos + 12:pos + 24]) except ValueError: break - structs.append((offset, numbytes)) + if offset and numbytes: + structs.append((offset, numbytes)) pos += 24 isextended = bool(buf[504]) - - # Transform the sparse structures to something we can use - # in ExFileObject. - self.sparse = _ringbuffer() - lastpos = 0 - realpos = 0 - for offset, numbytes in structs: - if offset > lastpos: - self.sparse.append(_hole(lastpos, offset - lastpos)) - self.sparse.append(_data(offset, numbytes, realpos)) - realpos += numbytes - lastpos = offset + numbytes - if lastpos < origsize: - self.sparse.append(_hole(lastpos, origsize - lastpos)) + self.sparse = structs self.offset_data = tarfile.fileobj.tell() tarfile.offset = self.offset_data + self._block(self.size) self.size = origsize - return self def _proc_pax(self, tarfile): @@ -1464,6 +1445,19 @@ class TarInfo(object): except HeaderError: raise SubsequentHeaderError("missing or bad subsequent header") + # Process GNU sparse information. 
+ if "GNU.sparse.map" in pax_headers: + # GNU extended sparse format version 0.1. + self._proc_gnusparse_01(next, pax_headers) + + elif "GNU.sparse.size" in pax_headers: + # GNU extended sparse format version 0.0. + self._proc_gnusparse_00(next, pax_headers, buf) + + elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0": + # GNU extended sparse format version 1.0. + self._proc_gnusparse_10(next, pax_headers, tarfile) + if self.type in (XHDTYPE, SOLARIS_XHDTYPE): # Patch the TarInfo object with the extended header info. next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors) @@ -1480,24 +1474,59 @@ class TarInfo(object): return next + def _proc_gnusparse_00(self, next, pax_headers, buf): + """Process a GNU tar extended sparse header, version 0.0. + """ + offsets = [] + for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf): + offsets.append(int(match.group(1))) + numbytes = [] + for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf): + numbytes.append(int(match.group(1))) + next.sparse = list(zip(offsets, numbytes)) + + def _proc_gnusparse_01(self, next, pax_headers): + """Process a GNU tar extended sparse header, version 0.1. + """ + sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")] + next.sparse = list(zip(sparse[::2], sparse[1::2])) + + def _proc_gnusparse_10(self, next, pax_headers, tarfile): + """Process a GNU tar extended sparse header, version 1.0. 
+ """ + fields = None + sparse = [] + buf = tarfile.fileobj.read(BLOCKSIZE) + fields, buf = buf.split(b"\n", 1) + fields = int(fields) + while len(sparse) < fields * 2: + if b"\n" not in buf: + buf += tarfile.fileobj.read(BLOCKSIZE) + number, buf = buf.split(b"\n", 1) + sparse.append(int(number)) + next.offset_data = tarfile.fileobj.tell() + next.sparse = list(zip(sparse[::2], sparse[1::2])) + def _apply_pax_info(self, pax_headers, encoding, errors): """Replace fields with supplemental information from a previous pax extended or global header. """ for keyword, value in pax_headers.items(): - if keyword not in PAX_FIELDS: - continue - - if keyword == "path": - value = value.rstrip("/") - - if keyword in PAX_NUMBER_FIELDS: - try: - value = PAX_NUMBER_FIELDS[keyword](value) - except ValueError: - value = 0 - - setattr(self, keyword, value) + if keyword == "GNU.sparse.name": + setattr(self, "path", value) + elif keyword == "GNU.sparse.size": + setattr(self, "size", int(value)) + elif keyword == "GNU.sparse.realsize": + setattr(self, "size", int(value)) + elif keyword in PAX_FIELDS: + if keyword in PAX_NUMBER_FIELDS: + try: + value = PAX_NUMBER_FIELDS[keyword](value) + except ValueError: + value = 0 + if keyword == "path": + value = value.rstrip("/") + setattr(self, keyword, value) self.pax_headers = pax_headers.copy() @@ -1535,7 +1564,7 @@ class TarInfo(object): def isfifo(self): return self.type == FIFOTYPE def issparse(self): - return self.type == GNUTYPE_SPARSE + return self.sparse is not None def isdev(self): return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE) # class TarInfo @@ -2255,10 +2284,17 @@ class TarFile(object): def makefile(self, tarinfo, targetpath): """Make a file called targetpath. 
""" - source = self.extractfile(tarinfo) + source = self.fileobj + source.seek(tarinfo.offset_data) target = bltn_open(targetpath, "wb") - copyfileobj(source, target) - source.close() + if tarinfo.sparse is not None: + for offset, size in tarinfo.sparse: + target.seek(offset) + copyfileobj(source, target, size) + else: + copyfileobj(source, target, tarinfo.size) + target.seek(tarinfo.size) + target.truncate() target.close() def makeunknown(self, tarinfo, targetpath): @@ -2544,49 +2580,6 @@ class TarIter: self.index += 1 return tarinfo -# Helper classes for sparse file support -class _section: - """Base class for _data and _hole. - """ - def __init__(self, offset, size): - self.offset = offset - self.size = size - def __contains__(self, offset): - return self.offset <= offset < self.offset + self.size - -class _data(_section): - """Represent a data section in a sparse file. - """ - def __init__(self, offset, size, realpos): - _section.__init__(self, offset, size) - self.realpos = realpos - -class _hole(_section): - """Represent a hole section in a sparse file. - """ - pass - -class _ringbuffer(list): - """Ringbuffer class which increases performance - over a regular list. 
- """ - def __init__(self): - self.idx = 0 - def find(self, offset): - idx = self.idx - while True: - item = self[idx] - if offset in item: - break - idx += 1 - if idx == len(self): - idx = 0 - if idx == self.idx: - # End of File - return None - self.idx = idx - return item - #-------------------- # exported functions #-------------------- diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index 3a217dc8150..8dc3ff9aa27 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -526,6 +526,22 @@ class MemberReadTest(ReadTest): tarinfo = self.tar.getmember("ustar/sparse") self._test_member(tarinfo, size=86016, chksum=md5_sparse) + def test_find_gnusparse(self): + tarinfo = self.tar.getmember("gnu/sparse") + self._test_member(tarinfo, size=86016, chksum=md5_sparse) + + def test_find_gnusparse_00(self): + tarinfo = self.tar.getmember("gnu/sparse-0.0") + self._test_member(tarinfo, size=86016, chksum=md5_sparse) + + def test_find_gnusparse_01(self): + tarinfo = self.tar.getmember("gnu/sparse-0.1") + self._test_member(tarinfo, size=86016, chksum=md5_sparse) + + def test_find_gnusparse_10(self): + tarinfo = self.tar.getmember("gnu/sparse-1.0") + self._test_member(tarinfo, size=86016, chksum=md5_sparse) + def test_find_umlauts(self): tarinfo = self.tar.getmember("ustar/umlauts-\xc4\xd6\xdc\xe4\xf6\xfc\xdf") self._test_member(tarinfo, size=7011, chksum=md5_regtype) @@ -589,13 +605,53 @@ class GNUReadTest(LongnameTest): subdir = "gnu" longnametype = tarfile.GNUTYPE_LONGNAME - def test_sparse_file(self): - tarinfo1 = self.tar.getmember("ustar/sparse") - fobj1 = self.tar.extractfile(tarinfo1) - tarinfo2 = self.tar.getmember("gnu/sparse") - fobj2 = self.tar.extractfile(tarinfo2) - self.assertEqual(fobj1.read(), fobj2.read(), - "sparse file extraction failed") + # Since 3.2 tarfile is supposed to accurately restore sparse members and + # produce files with holes. This is what we actually want to test here. 
+ # Unfortunately, not all platforms/filesystems support sparse files, and + # even on platforms that do it is non-trivial to make reliable assertions + # about holes in files. Therefore, we first do one basic test which works + # on all platforms, and after that a test that will work only on + # platforms/filesystems that prove to support sparse files. + def _test_sparse_file(self, name): + self.tar.extract(name, TEMPDIR) + filename = os.path.join(TEMPDIR, name) + with open(filename, "rb") as fobj: + data = fobj.read() + self.assertEqual(md5sum(data), md5_sparse, + "wrong md5sum for %s" % name) + + if self._fs_supports_holes(): + s = os.stat(filename) + self.assertTrue(s.st_blocks * 512 < s.st_size) + + def test_sparse_file_old(self): + self._test_sparse_file("gnu/sparse") + + def test_sparse_file_00(self): + self._test_sparse_file("gnu/sparse-0.0") + + def test_sparse_file_01(self): + self._test_sparse_file("gnu/sparse-0.1") + + def test_sparse_file_10(self): + self._test_sparse_file("gnu/sparse-1.0") + + @staticmethod + def _fs_supports_holes(): + # Return True if the platform knows the st_blocks stat attribute and + # uses st_blocks units of 512 bytes, and if the filesystem is able to + # store holes in files. + if sys.platform == "linux2": + # Linux evidently has 512 byte st_blocks units. 
+ name = os.path.join(TEMPDIR, "sparse-test") + with open(name, "wb") as fobj: + fobj.seek(4096) + fobj.truncate() + s = os.stat(name) + os.remove(name) + return s.st_blocks == 0 + else: + return False class PaxReadTest(LongnameTest): diff --git a/Lib/test/testtar.tar b/Lib/test/testtar.tar index dc1942c19d05fc6c966a6f5b722b064ac5966db2..b93210453d132cf0db07ab26cb50d06f83ee579a 100644 GIT binary patch delta 1739 zcmb_cJ#W)M7`B}dAy3e)r7HOWsr%mf?zJ;SLP!M$gpd$SET(m7ARldiPY_D)&HpNoT!|>{J97G5!jD;BR!dM!6OMIq}A&SD>Qe#4b{0AxNB37i&#R{aL z68Ng5q)IxhqhDlYsoMoX$X6gGvsdQ3ki0w7Kz%<{qkgj;E2m+dYZoiWL3M9#?fiID zdT5Voj$78wSDt>Rtv^1jIosEnkepdny{er)tk}Q4KELr}z?_c@B80&#LTnbH&2<4d zOHlA16Xc6RP9k+%o$P6lnalh_$lB2nrKqN3R%1Fe269MM5;CmaZ6F!eKr*3$c_cLaN?twqjjTbNt@T=@cO6%j$%_ zf5{NbSKa11=l)bD2>~@7!}+G;dCuF>t8TBMoK;+HcGN1kf7eI-yUXkO(|U2#%6I3y zg@xj5El(P8hT6j&dm(A_mz|QPiX8wf2oO-21ev~WfJ_!?OjS=)CDV%xV}6_F%-UMF uAwg4cW`IhIpy?rkrdtf8x*%vOfS`#rxR!a&&qk%U_GReaPn_?Q($7CQiR;$@ delta 19 bcmZqZkZPDC)X>7Xgy}-tF_vY=SoZ+{PR0mJ diff --git a/Misc/NEWS b/Misc/NEWS index e194d44ce6f..7fd0d7b66e2 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -54,6 +54,9 @@ Core and Builtins Library ------- +- tarfile.py: Add support for all missing variants of the GNU sparse + extensions and create files with holes when extracting sparse members. + - Issue #10218: Return timeout status from ``Condition.wait`` in threading. - Issue #7351: Add ``zipfile.BadZipFile`` spelling of the exception name