bpo-33695 shutil.copytree() + os.scandir() cache (#7874)

This commit is contained in:
Giampaolo Rodola 2018-11-12 06:18:15 -08:00 committed by GitHub
parent cd449806fa
commit 19c46a4c96
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 98 additions and 58 deletions

View File

@ -277,6 +277,14 @@ Optimizations
See :ref:`shutil-platform-dependent-efficient-copy-operations` section.
(Contributed by Giampaolo Rodola' in :issue:`25427`.)
* :func:`shutil.copytree` uses :func:`os.scandir` function and all copy
functions depending from it use cached :func:`os.stat` values. The speedup
for copying a directory with 8000 files is around +9% on Linux, +20% on
Windows and +30% on a Windows SMB share. Also the number of :func:`os.stat`
syscalls is reduced by 38% making :func:`shutil.copytree` especially faster
on network filesystems. (Contributed by Giampaolo Rodola' in :issue:`33695`.)
* The default protocol in the :mod:`pickle` module is now Protocol 4,
first introduced in Python 3.4. It offers better performance and smaller
size compared to Protocol 3 available since Python 3.0.

View File

@ -200,6 +200,12 @@ def copyfileobj(fsrc, fdst, length=COPY_BUFSIZE):
def _samefile(src, dst):
# Macintosh, Unix.
if isinstance(src, os.DirEntry) and hasattr(os.path, 'samestat'):
try:
return os.path.samestat(src.stat(), os.stat(dst))
except OSError:
return False
if hasattr(os.path, 'samefile'):
try:
return os.path.samefile(src, dst)
@ -210,6 +216,12 @@ def _samefile(src, dst):
return (os.path.normcase(os.path.abspath(src)) ==
os.path.normcase(os.path.abspath(dst)))
def _stat(fn):
return fn.stat() if isinstance(fn, os.DirEntry) else os.stat(fn)
def _islink(fn):
return fn.is_symlink() if isinstance(fn, os.DirEntry) else os.path.islink(fn)
def copyfile(src, dst, *, follow_symlinks=True):
"""Copy data from src to dst in the most efficient way possible.
@ -223,18 +235,19 @@ def copyfile(src, dst, *, follow_symlinks=True):
file_size = 0
for i, fn in enumerate([src, dst]):
try:
st = os.stat(fn)
st = _stat(fn)
except OSError:
# File most likely does not exist
pass
else:
# XXX What about other special files? (sockets, devices...)
if stat.S_ISFIFO(st.st_mode):
fn = fn.path if isinstance(fn, os.DirEntry) else fn
raise SpecialFileError("`%s` is a named pipe" % fn)
if _WINDOWS and i == 0:
file_size = st.st_size
if not follow_symlinks and os.path.islink(src):
if not follow_symlinks and _islink(src):
os.symlink(os.readlink(src), dst)
else:
with open(src, 'rb') as fsrc, open(dst, 'wb') as fdst:
@ -270,13 +283,13 @@ def copymode(src, dst, *, follow_symlinks=True):
(e.g. Linux) this method does nothing.
"""
if not follow_symlinks and os.path.islink(src) and os.path.islink(dst):
if not follow_symlinks and _islink(src) and os.path.islink(dst):
if hasattr(os, 'lchmod'):
stat_func, chmod_func = os.lstat, os.lchmod
else:
return
elif hasattr(os, 'chmod'):
stat_func, chmod_func = os.stat, os.chmod
stat_func, chmod_func = _stat, os.chmod
else:
return
@ -325,7 +338,7 @@ def copystat(src, dst, *, follow_symlinks=True):
pass
# follow symlinks (aka don't not follow symlinks)
follow = follow_symlinks or not (os.path.islink(src) and os.path.islink(dst))
follow = follow_symlinks or not (_islink(src) and os.path.islink(dst))
if follow:
# use the real function if it exists
def lookup(name):
@ -339,6 +352,9 @@ def copystat(src, dst, *, follow_symlinks=True):
return fn
return _nop
if isinstance(src, os.DirEntry):
st = src.stat(follow_symlinks=follow)
else:
st = lookup("stat")(src, follow_symlinks=follow)
mode = stat.S_IMODE(st.st_mode)
lookup("utime")(dst, ns=(st.st_atime_ns, st.st_mtime_ns),
@ -415,6 +431,63 @@ def ignore_patterns(*patterns):
return set(ignored_names)
return _ignore_patterns
def _copytree(entries, src, dst, symlinks, ignore, copy_function,
ignore_dangling_symlinks):
if ignore is not None:
ignored_names = ignore(src, set(os.listdir(src)))
else:
ignored_names = set()
os.makedirs(dst)
errors = []
use_srcentry = copy_function is copy2 or copy_function is copy
for srcentry in entries:
if srcentry.name in ignored_names:
continue
srcname = os.path.join(src, srcentry.name)
dstname = os.path.join(dst, srcentry.name)
srcobj = srcentry if use_srcentry else srcname
try:
if srcentry.is_symlink():
linkto = os.readlink(srcname)
if symlinks:
# We can't just leave it to `copy_function` because legacy
# code with a custom `copy_function` may rely on copytree
# doing the right thing.
os.symlink(linkto, dstname)
copystat(srcobj, dstname, follow_symlinks=not symlinks)
else:
# ignore dangling symlink if the flag is on
if not os.path.exists(linkto) and ignore_dangling_symlinks:
continue
# otherwise let the copy occurs. copy2 will raise an error
if srcentry.is_dir():
copytree(srcobj, dstname, symlinks, ignore,
copy_function)
else:
copy_function(srcobj, dstname)
elif srcentry.is_dir():
copytree(srcobj, dstname, symlinks, ignore, copy_function)
else:
# Will raise a SpecialFileError for unsupported file types
copy_function(srcentry, dstname)
# catch the Error from the recursive copytree so that we can
# continue with other files
except Error as err:
errors.extend(err.args[0])
except OSError as why:
errors.append((srcname, dstname, str(why)))
try:
copystat(src, dst)
except OSError as why:
# Copying file access times may fail on Windows
if getattr(why, 'winerror', None) is None:
errors.append((src, dst, str(why)))
if errors:
raise Error(errors)
return dst
def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
ignore_dangling_symlinks=False):
"""Recursively copy a directory tree.
@ -451,58 +524,10 @@ def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
function that supports the same signature (like copy()) can be used.
"""
names = os.listdir(src)
if ignore is not None:
ignored_names = ignore(src, names)
else:
ignored_names = set()
os.makedirs(dst)
errors = []
for name in names:
if name in ignored_names:
continue
srcname = os.path.join(src, name)
dstname = os.path.join(dst, name)
try:
if os.path.islink(srcname):
linkto = os.readlink(srcname)
if symlinks:
# We can't just leave it to `copy_function` because legacy
# code with a custom `copy_function` may rely on copytree
# doing the right thing.
os.symlink(linkto, dstname)
copystat(srcname, dstname, follow_symlinks=not symlinks)
else:
# ignore dangling symlink if the flag is on
if not os.path.exists(linkto) and ignore_dangling_symlinks:
continue
# otherwise let the copy occurs. copy2 will raise an error
if os.path.isdir(srcname):
copytree(srcname, dstname, symlinks, ignore,
copy_function)
else:
copy_function(srcname, dstname)
elif os.path.isdir(srcname):
copytree(srcname, dstname, symlinks, ignore, copy_function)
else:
# Will raise a SpecialFileError for unsupported file types
copy_function(srcname, dstname)
# catch the Error from the recursive copytree so that we can
# continue with other files
except Error as err:
errors.extend(err.args[0])
except OSError as why:
errors.append((srcname, dstname, str(why)))
try:
copystat(src, dst)
except OSError as why:
# Copying file access times may fail on Windows
if getattr(why, 'winerror', None) is None:
errors.append((src, dst, str(why)))
if errors:
raise Error(errors)
return dst
with os.scandir(src) as entries:
return _copytree(entries=entries, src=src, dst=dst, symlinks=symlinks,
ignore=ignore, copy_function=copy_function,
ignore_dangling_symlinks=ignore_dangling_symlinks)
# version vulnerable to race conditions
def _rmtree_unsafe(path, onerror):

View File

@ -0,0 +1,7 @@
:func:`shutil.copytree` uses :func:`os.scandir` function and all copy
functions depending from it use cached :func:`os.stat` values. The speedup
for copying a directory with 8000 files is around +9% on Linux, +20% on
Windows and + 30% on a Windows SMB share. Also the number of :func:`os.stat`
syscalls is reduced by 38% making :func:`shutil.copytree` especially faster
on network filesystems.
(Contributed by Giampaolo Rodola' in :issue:`33695`.)