diff --git a/Doc/library/tarfile.rst b/Doc/library/tarfile.rst index 891af1bcf7e..2f330f018a4 100644 --- a/Doc/library/tarfile.rst +++ b/Doc/library/tarfile.rst @@ -318,7 +318,7 @@ be finalized; only the internally used file object will be closed. See the .. versionadded:: 3.2 Added support for the context management protocol. -.. class:: TarFile(name=None, mode='r', fileobj=None, format=DEFAULT_FORMAT, tarinfo=TarInfo, dereference=False, ignore_zeros=False, encoding=ENCODING, errors='surrogateescape', pax_headers=None, debug=0, errorlevel=1) +.. class:: TarFile(name=None, mode='r', fileobj=None, format=DEFAULT_FORMAT, tarinfo=TarInfo, dereference=False, ignore_zeros=False, encoding=ENCODING, errors='surrogateescape', pax_headers=None, debug=0, errorlevel=1, stream=False) All following arguments are optional and can be accessed as instance attributes as well. @@ -369,6 +369,9 @@ be finalized; only the internally used file object will be closed. See the The *pax_headers* argument is an optional dictionary of strings which will be added as a pax global header if *format* is :const:`PAX_FORMAT`. + If *stream* is set to :const:`True` then while reading the archive info about files + in the archive are not cached, saving memory. + .. versionchanged:: 3.2 Use ``'surrogateescape'`` as the default for the *errors* argument. @@ -378,6 +381,8 @@ be finalized; only the internally used file object will be closed. See the .. versionchanged:: 3.6 The *name* parameter accepts a :term:`path-like object`. + .. versionchanged:: 3.13 + Add the *stream* parameter. .. classmethod:: TarFile.open(...) diff --git a/Lib/tarfile.py b/Lib/tarfile.py index 7781a430839..df4e41f7a0d 100755 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -1633,7 +1633,7 @@ class TarFile(object): def __init__(self, name=None, mode="r", fileobj=None, format=None, tarinfo=None, dereference=None, ignore_zeros=None, encoding=None, errors="surrogateescape", pax_headers=None, debug=None, - errorlevel=None, copybufsize=None): + errorlevel=None, copybufsize=None, stream=False): """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to read from an existing archive, 'a' to append data to an existing file or 'w' to create a new file overwriting an existing one. `mode' @@ -1665,6 +1665,8 @@ class TarFile(object): self.name = os.path.abspath(name) if name else None self.fileobj = fileobj + self.stream = stream + # Init attributes. if format is not None: self.format = format @@ -2631,7 +2633,9 @@ class TarFile(object): break if tarinfo is not None: - self.members.append(tarinfo) + # if streaming the file we do not want to cache the tarinfo + if not self.stream: + self.members.append(tarinfo) else: self._loaded = True @@ -2682,11 +2686,12 @@ class TarFile(object): def _load(self): """Read through the entire archive file and look for readable - members. + members. This should not run if the file is set to stream. """ - while self.next() is not None: - pass - self._loaded = True + if not self.stream: + while self.next() is not None: + pass + self._loaded = True def _check(self, mode=None): """Check if TarFile is still open, and if the operation's mode diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index e8d322d20a5..2eda7fc4cea 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -100,6 +100,14 @@ class ReadTest(TarTest): def tearDown(self): self.tar.close() +class StreamModeTest(ReadTest): + + # Only needs to change how the tarfile is opened to set + # stream mode + def setUp(self): + self.tar = tarfile.open(self.tarname, mode=self.mode, + encoding="iso8859-1", + stream=True) class UstarReadTest(ReadTest, unittest.TestCase): @@ -852,6 +860,21 @@ class Bz2StreamReadTest(Bz2Test, StreamReadTest): class LzmaStreamReadTest(LzmaTest, StreamReadTest): pass +class TarStreamModeReadTest(StreamModeTest, unittest.TestCase): + + def test_stream_mode_no_cache(self): + for _ in self.tar: + pass + self.assertEqual(self.tar.members, []) + +class GzipStreamModeReadTest(GzipTest, TarStreamModeReadTest): + pass + +class Bz2StreamModeReadTest(Bz2Test, TarStreamModeReadTest): + pass + +class LzmaStreamModeReadTest(LzmaTest, TarStreamModeReadTest): + pass class DetectReadTest(TarTest, unittest.TestCase): def _testfunc_file(self, name, mode): diff --git a/Misc/NEWS.d/next/Library/2023-03-08-19-30-53.gh-issue-102120.xkQ5Wr.rst b/Misc/NEWS.d/next/Library/2023-03-08-19-30-53.gh-issue-102120.xkQ5Wr.rst new file mode 100644 index 00000000000..ca50242fdbe --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-03-08-19-30-53.gh-issue-102120.xkQ5Wr.rst @@ -0,0 +1,2 @@ +Added a stream mode to ``tarfile`` that allows for reading +archives without caching info about the inner files.