From c2ea8c6c3ace398ed757f104d59b32ecad046281 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lars=20Gust=C3=A4bel?= Date: Mon, 14 Apr 2008 10:05:48 +0000 Subject: [PATCH] Issue #2058: Remove the buf attribute and add __slots__ to the TarInfo class in order to reduce tarfile's memory usage. --- Lib/tarfile.py | 77 ++++++++++++++++++++++++++++---------------------- Misc/NEWS | 3 ++ 2 files changed, 46 insertions(+), 34 deletions(-) diff --git a/Lib/tarfile.py b/Lib/tarfile.py index b789ccad8e4..c744951d706 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -767,7 +767,7 @@ class ExFileObject(object): self.fileobj = _FileInFile(tarfile.fileobj, tarinfo.offset_data, tarinfo.size, - getattr(tarinfo, "sparse", None)) + tarinfo.sparse) self.name = tarinfo.name self.mode = "r" self.closed = False @@ -906,6 +906,12 @@ class TarInfo(object): usually created internally. """ + __slots__ = ("name", "mode", "uid", "gid", "size", "mtime", + "chksum", "type", "linkname", "uname", "gname", + "devmajor", "devminor", + "offset", "offset_data", "pax_headers", "sparse", + "tarfile", "_sparse_structs", "_link_target") + def __init__(self, name=""): """Construct a TarInfo object. name is the optional name of the member. @@ -927,6 +933,7 @@ class TarInfo(object): self.offset = 0 # the tar header starts here self.offset_data = 0 # the file's data starts here + self.sparse = None # sparse member information self.pax_headers = {} # pax header information # In pax headers the "name" and "linkname" field are called @@ -1181,7 +1188,6 @@ class TarInfo(object): raise HeaderError("bad checksum") obj = cls() - obj.buf = buf obj.name = nts(buf[0:100], encoding, errors) obj.mode = nti(buf[100:108]) obj.uid = nti(buf[108:116]) @@ -1202,6 +1208,24 @@ class TarInfo(object): if obj.type == AREGTYPE and obj.name.endswith("/"): obj.type = DIRTYPE + # The old GNU sparse format occupies some of the unused + # space in the buffer for up to 4 sparse structures. 
+ # Save them for later processing in _proc_sparse(). + if obj.type == GNUTYPE_SPARSE: + pos = 386 + structs = [] + for i in range(4): + try: + offset = nti(buf[pos:pos + 12]) + numbytes = nti(buf[pos + 12:pos + 24]) + except ValueError: + break + structs.append((offset, numbytes)) + pos += 24 + isextended = bool(buf[482]) + origsize = nti(buf[483:495]) + obj._sparse_structs = (structs, isextended, origsize) + # Remove redundant slashes from directories. if obj.isdir(): obj.name = obj.name.rstrip("/") @@ -1288,31 +1312,11 @@ class TarInfo(object): def _proc_sparse(self, tarfile): """Process a GNU sparse header plus extra headers. """ - buf = self.buf - sp = _ringbuffer() - pos = 386 - lastpos = 0 - realpos = 0 - # There are 4 possible sparse structs in the - # first header. - for i in range(4): - try: - offset = nti(buf[pos:pos + 12]) - numbytes = nti(buf[pos + 12:pos + 24]) - except ValueError: - break - if offset > lastpos: - sp.append(_hole(lastpos, offset - lastpos)) - sp.append(_data(offset, numbytes, realpos)) - realpos += numbytes - lastpos = offset + numbytes - pos += 24 + # We already collected some sparse structures in frombuf(). + structs, isextended, origsize = self._sparse_structs + del self._sparse_structs - isextended = bool(buf[482]) - origsize = nti(buf[483:495]) - - # If the isextended flag is given, - # there are extra headers to process. + # Collect sparse structures from extended header blocks. while isextended: buf = tarfile.fileobj.read(BLOCKSIZE) pos = 0 @@ -1322,18 +1326,23 @@ class TarInfo(object): numbytes = nti(buf[pos + 12:pos + 24]) except ValueError: break - if offset > lastpos: - sp.append(_hole(lastpos, offset - lastpos)) - sp.append(_data(offset, numbytes, realpos)) - realpos += numbytes - lastpos = offset + numbytes + structs.append((offset, numbytes)) pos += 24 isextended = bool(buf[504]) + # Transform the sparse structures to something we can use + # in ExFileObject. 
+ self.sparse = _ringbuffer() + lastpos = 0 + realpos = 0 + for offset, numbytes in structs: + if offset > lastpos: + self.sparse.append(_hole(lastpos, offset - lastpos)) + self.sparse.append(_data(offset, numbytes, realpos)) + realpos += numbytes + lastpos = offset + numbytes if lastpos < origsize: - sp.append(_hole(lastpos, origsize - lastpos)) - - self.sparse = sp + self.sparse.append(_hole(lastpos, origsize - lastpos)) self.offset_data = tarfile.fileobj.tell() tarfile.offset = self.offset_data + self._block(self.size) diff --git a/Misc/NEWS b/Misc/NEWS index 40de49663a2..58524ee830b 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -29,6 +29,9 @@ Extension Modules Library ------- +- Issue #2058: Remove the buf attribute and add __slots__ to the TarInfo + class in order to reduce tarfile's memory usage. + - Bug #2606: Avoid calling .sort() on a dict_keys object. - The bundled libffi copy is now in sync with the recently released