Issue #27199: In tarfile, expose copyfileobj bufsize to improve throughput

Patch by Jason Fried.
This commit is contained in:
Łukasz Langa 2016-09-09 19:48:14 -07:00
parent f5781958af
commit 04bedfa3ce
2 changed files with 21 additions and 15 deletions

View File

@@ -228,21 +228,21 @@ def calc_chksums(buf):
signed_chksum = 256 + sum(struct.unpack_from("148b8x356b", buf)) signed_chksum = 256 + sum(struct.unpack_from("148b8x356b", buf))
return unsigned_chksum, signed_chksum return unsigned_chksum, signed_chksum
def copyfileobj(src, dst, length=None, exception=OSError): def copyfileobj(src, dst, length=None, exception=OSError, bufsize=None):
"""Copy length bytes from fileobj src to fileobj dst. """Copy length bytes from fileobj src to fileobj dst.
If length is None, copy the entire content. If length is None, copy the entire content.
""" """
bufsize = bufsize or 16 * 1024
if length == 0: if length == 0:
return return
if length is None: if length is None:
shutil.copyfileobj(src, dst) shutil.copyfileobj(src, dst, bufsize)
return return
BUFSIZE = 16 * 1024 blocks, remainder = divmod(length, bufsize)
blocks, remainder = divmod(length, BUFSIZE)
for b in range(blocks): for b in range(blocks):
buf = src.read(BUFSIZE) buf = src.read(bufsize)
if len(buf) < BUFSIZE: if len(buf) < bufsize:
raise exception("unexpected end of data") raise exception("unexpected end of data")
dst.write(buf) dst.write(buf)
@@ -1403,7 +1403,8 @@ class TarFile(object):
def __init__(self, name=None, mode="r", fileobj=None, format=None, def __init__(self, name=None, mode="r", fileobj=None, format=None,
tarinfo=None, dereference=None, ignore_zeros=None, encoding=None, tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None): errors="surrogateescape", pax_headers=None, debug=None,
errorlevel=None, copybufsize=None):
"""Open an (uncompressed) tar archive `name'. `mode' is either 'r' to """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
read from an existing archive, 'a' to append data to an existing read from an existing archive, 'a' to append data to an existing
file or 'w' to create a new file overwriting an existing one. `mode' file or 'w' to create a new file overwriting an existing one. `mode'
@@ -1459,6 +1460,7 @@ class TarFile(object):
self.errorlevel = errorlevel self.errorlevel = errorlevel
# Init datastructures. # Init datastructures.
self.copybufsize = copybufsize
self.closed = False self.closed = False
self.members = [] # list of members as TarInfo objects self.members = [] # list of members as TarInfo objects
self._loaded = False # flag if all members have been read self._loaded = False # flag if all members have been read
@@ -1558,7 +1560,7 @@ class TarFile(object):
saved_pos = fileobj.tell() saved_pos = fileobj.tell()
try: try:
return func(name, "r", fileobj, **kwargs) return func(name, "r", fileobj, **kwargs)
except (ReadError, CompressionError) as e: except (ReadError, CompressionError):
if fileobj is not None: if fileobj is not None:
fileobj.seek(saved_pos) fileobj.seek(saved_pos)
continue continue
@@ -1963,10 +1965,10 @@ class TarFile(object):
buf = tarinfo.tobuf(self.format, self.encoding, self.errors) buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
self.fileobj.write(buf) self.fileobj.write(buf)
self.offset += len(buf) self.offset += len(buf)
bufsize=self.copybufsize
# If there's data to follow, append it. # If there's data to follow, append it.
if fileobj is not None: if fileobj is not None:
copyfileobj(fileobj, self.fileobj, tarinfo.size) copyfileobj(fileobj, self.fileobj, tarinfo.size, bufsize=bufsize)
blocks, remainder = divmod(tarinfo.size, BLOCKSIZE) blocks, remainder = divmod(tarinfo.size, BLOCKSIZE)
if remainder > 0: if remainder > 0:
self.fileobj.write(NUL * (BLOCKSIZE - remainder)) self.fileobj.write(NUL * (BLOCKSIZE - remainder))
@@ -2148,15 +2150,16 @@ class TarFile(object):
""" """
source = self.fileobj source = self.fileobj
source.seek(tarinfo.offset_data) source.seek(tarinfo.offset_data)
bufsize = self.copybufsize
with bltn_open(targetpath, "wb") as target: with bltn_open(targetpath, "wb") as target:
if tarinfo.sparse is not None: if tarinfo.sparse is not None:
for offset, size in tarinfo.sparse: for offset, size in tarinfo.sparse:
target.seek(offset) target.seek(offset)
copyfileobj(source, target, size, ReadError) copyfileobj(source, target, size, ReadError, bufsize)
target.seek(tarinfo.size) target.seek(tarinfo.size)
target.truncate() target.truncate()
else: else:
copyfileobj(source, target, tarinfo.size, ReadError) copyfileobj(source, target, tarinfo.size, ReadError, bufsize)
def makeunknown(self, tarinfo, targetpath): def makeunknown(self, tarinfo, targetpath):
"""Make a file from a TarInfo object with an unknown type """Make a file from a TarInfo object with an unknown type
@@ -2235,7 +2238,7 @@ class TarFile(object):
os.lchown(targetpath, u, g) os.lchown(targetpath, u, g)
else: else:
os.chown(targetpath, u, g) os.chown(targetpath, u, g)
except OSError as e: except OSError:
raise ExtractError("could not change owner") raise ExtractError("could not change owner")
def chmod(self, tarinfo, targetpath): def chmod(self, tarinfo, targetpath):
@@ -2244,7 +2247,7 @@ class TarFile(object):
if hasattr(os, 'chmod'): if hasattr(os, 'chmod'):
try: try:
os.chmod(targetpath, tarinfo.mode) os.chmod(targetpath, tarinfo.mode)
except OSError as e: except OSError:
raise ExtractError("could not change mode") raise ExtractError("could not change mode")
def utime(self, tarinfo, targetpath): def utime(self, tarinfo, targetpath):
@@ -2254,7 +2257,7 @@ class TarFile(object):
return return
try: try:
os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime)) os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
except OSError as e: except OSError:
raise ExtractError("could not change modification time") raise ExtractError("could not change modification time")
#-------------------------------------------------------------------------- #--------------------------------------------------------------------------

View File

@@ -10,6 +10,9 @@ What's New in Python 3.6.0 beta 1
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #27199: In tarfile, expose copyfileobj bufsize to improve throughput.
Patch by Jason Fried.
- Issue #27948: In f-strings, only allow backslashes inside the braces - Issue #27948: In f-strings, only allow backslashes inside the braces
(where the expressions are). This is a breaking change from the 3.6 (where the expressions are). This is a breaking change from the 3.6
alpha releases, where backslashes are allowed anywhere in an alpha releases, where backslashes are allowed anywhere in an