bpo-33695 shutil.copytree() + os.scandir() cache (#7874)

This commit is contained in:
Giampaolo Rodola 2018-11-12 06:18:15 -08:00 committed by GitHub
parent cd449806fa
commit 19c46a4c96
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 98 additions and 58 deletions

View File

@ -277,6 +277,14 @@ Optimizations
See :ref:`shutil-platform-dependent-efficient-copy-operations` section. See :ref:`shutil-platform-dependent-efficient-copy-operations` section.
(Contributed by Giampaolo Rodola' in :issue:`25427`.) (Contributed by Giampaolo Rodola' in :issue:`25427`.)
* :func:`shutil.copytree` uses the :func:`os.scandir` function and all copy
functions depending on it use cached :func:`os.stat` values. The speedup
for copying a directory with 8000 files is around +9% on Linux, +20% on
Windows and +30% on a Windows SMB share. Also the number of :func:`os.stat`
syscalls is reduced by 38% making :func:`shutil.copytree` especially faster
on network filesystems. (Contributed by Giampaolo Rodola' in :issue:`33695`.)
* The default protocol in the :mod:`pickle` module is now Protocol 4, * The default protocol in the :mod:`pickle` module is now Protocol 4,
first introduced in Python 3.4. It offers better performance and smaller first introduced in Python 3.4. It offers better performance and smaller
size compared to Protocol 3 available since Python 3.0. size compared to Protocol 3 available since Python 3.0.

View File

@ -200,6 +200,12 @@ def copyfileobj(fsrc, fdst, length=COPY_BUFSIZE):
def _samefile(src, dst): def _samefile(src, dst):
# Macintosh, Unix. # Macintosh, Unix.
if isinstance(src, os.DirEntry) and hasattr(os.path, 'samestat'):
try:
return os.path.samestat(src.stat(), os.stat(dst))
except OSError:
return False
if hasattr(os.path, 'samefile'): if hasattr(os.path, 'samefile'):
try: try:
return os.path.samefile(src, dst) return os.path.samefile(src, dst)
@ -210,6 +216,12 @@ def _samefile(src, dst):
return (os.path.normcase(os.path.abspath(src)) == return (os.path.normcase(os.path.abspath(src)) ==
os.path.normcase(os.path.abspath(dst))) os.path.normcase(os.path.abspath(dst)))
def _stat(fn):
    """Return the stat result for *fn*.

    *fn* may be an os.DirEntry, in which case its own stat() method is
    used, or anything accepted by os.stat().
    """
    if isinstance(fn, os.DirEntry):
        return fn.stat()
    return os.stat(fn)
def _islink(fn):
    """Return True if *fn* is a symbolic link.

    *fn* may be an os.DirEntry, in which case its is_symlink() method is
    used, or a path handled by os.path.islink().
    """
    if isinstance(fn, os.DirEntry):
        return fn.is_symlink()
    return os.path.islink(fn)
def copyfile(src, dst, *, follow_symlinks=True): def copyfile(src, dst, *, follow_symlinks=True):
"""Copy data from src to dst in the most efficient way possible. """Copy data from src to dst in the most efficient way possible.
@ -223,18 +235,19 @@ def copyfile(src, dst, *, follow_symlinks=True):
file_size = 0 file_size = 0
for i, fn in enumerate([src, dst]): for i, fn in enumerate([src, dst]):
try: try:
st = os.stat(fn) st = _stat(fn)
except OSError: except OSError:
# File most likely does not exist # File most likely does not exist
pass pass
else: else:
# XXX What about other special files? (sockets, devices...) # XXX What about other special files? (sockets, devices...)
if stat.S_ISFIFO(st.st_mode): if stat.S_ISFIFO(st.st_mode):
fn = fn.path if isinstance(fn, os.DirEntry) else fn
raise SpecialFileError("`%s` is a named pipe" % fn) raise SpecialFileError("`%s` is a named pipe" % fn)
if _WINDOWS and i == 0: if _WINDOWS and i == 0:
file_size = st.st_size file_size = st.st_size
if not follow_symlinks and os.path.islink(src): if not follow_symlinks and _islink(src):
os.symlink(os.readlink(src), dst) os.symlink(os.readlink(src), dst)
else: else:
with open(src, 'rb') as fsrc, open(dst, 'wb') as fdst: with open(src, 'rb') as fsrc, open(dst, 'wb') as fdst:
@ -270,13 +283,13 @@ def copymode(src, dst, *, follow_symlinks=True):
(e.g. Linux) this method does nothing. (e.g. Linux) this method does nothing.
""" """
if not follow_symlinks and os.path.islink(src) and os.path.islink(dst): if not follow_symlinks and _islink(src) and os.path.islink(dst):
if hasattr(os, 'lchmod'): if hasattr(os, 'lchmod'):
stat_func, chmod_func = os.lstat, os.lchmod stat_func, chmod_func = os.lstat, os.lchmod
else: else:
return return
elif hasattr(os, 'chmod'): elif hasattr(os, 'chmod'):
stat_func, chmod_func = os.stat, os.chmod stat_func, chmod_func = _stat, os.chmod
else: else:
return return
@ -325,7 +338,7 @@ def copystat(src, dst, *, follow_symlinks=True):
pass pass
# follow symlinks (aka don't not follow symlinks) # follow symlinks (aka don't not follow symlinks)
follow = follow_symlinks or not (os.path.islink(src) and os.path.islink(dst)) follow = follow_symlinks or not (_islink(src) and os.path.islink(dst))
if follow: if follow:
# use the real function if it exists # use the real function if it exists
def lookup(name): def lookup(name):
@ -339,7 +352,10 @@ def copystat(src, dst, *, follow_symlinks=True):
return fn return fn
return _nop return _nop
st = lookup("stat")(src, follow_symlinks=follow) if isinstance(src, os.DirEntry):
st = src.stat(follow_symlinks=follow)
else:
st = lookup("stat")(src, follow_symlinks=follow)
mode = stat.S_IMODE(st.st_mode) mode = stat.S_IMODE(st.st_mode)
lookup("utime")(dst, ns=(st.st_atime_ns, st.st_mtime_ns), lookup("utime")(dst, ns=(st.st_atime_ns, st.st_mtime_ns),
follow_symlinks=follow) follow_symlinks=follow)
@ -415,6 +431,63 @@ def ignore_patterns(*patterns):
return set(ignored_names) return set(ignored_names)
return _ignore_patterns return _ignore_patterns
def _copytree(entries, src, dst, symlinks, ignore, copy_function,
              ignore_dangling_symlinks):
    """Copy the scanned *entries* of directory *src* into a new *dst*.

    Implementation helper for copytree().

    entries -- iterable of os.DirEntry objects for the contents of *src*.
    src -- the source directory (a path, or an os.DirEntry on recursion).
    dst -- the destination directory; it is created with os.makedirs().
    symlinks -- if true, symbolic links are recreated as links in *dst*;
        otherwise the content they point to is copied.
    ignore -- optional callable ``ignore(path, names)`` returning the
        names that must be skipped.
    copy_function -- callable ``copy_function(src, dst)`` used for files.
    ignore_dangling_symlinks -- if true (and *symlinks* is false), links
        whose target does not exist are skipped instead of copied.

    Returns *dst*.  Problems met while copying individual entries are
    collected and raised together as a single Error at the end.
    """
    # An os.scandir() iterator can be consumed only once: keep the entries
    # so their names can be handed to `ignore` without listing the
    # directory a second time.
    entries = list(entries)
    if ignore is not None:
        # `ignore` gets a plain path and a list of names, as it did when
        # the names came from os.listdir(); `src` may be an os.DirEntry
        # when this is a recursive call.
        ignored_names = ignore(os.fspath(src),
                               [entry.name for entry in entries])
    else:
        ignored_names = set()

    os.makedirs(dst)
    errors = []
    # copy() and copy2() are handed the DirEntry itself; any other
    # copy function gets a plain path string.
    use_srcentry = copy_function is copy2 or copy_function is copy

    for srcentry in entries:
        if srcentry.name in ignored_names:
            continue
        srcname = os.path.join(src, srcentry.name)
        dstname = os.path.join(dst, srcentry.name)
        srcobj = srcentry if use_srcentry else srcname
        try:
            if srcentry.is_symlink():
                linkto = os.readlink(srcname)
                if symlinks:
                    # We can't just leave it to `copy_function` because
                    # legacy code with a custom `copy_function` may rely on
                    # copytree doing the right thing.
                    os.symlink(linkto, dstname)
                    copystat(srcobj, dstname, follow_symlinks=not symlinks)
                else:
                    # ignore dangling symlink if the flag is on
                    # NOTE(review): a relative `linkto` is checked against
                    # the current working directory here -- confirm whether
                    # that is intended.
                    if not os.path.exists(linkto) and ignore_dangling_symlinks:
                        continue
                    # otherwise let the copy occur; copy2 will raise an error
                    if srcentry.is_dir():
                        copytree(srcobj, dstname, symlinks, ignore,
                                 copy_function,
                                 ignore_dangling_symlinks=ignore_dangling_symlinks)
                    else:
                        copy_function(srcobj, dstname)
            elif srcentry.is_dir():
                copytree(srcobj, dstname, symlinks, ignore, copy_function,
                         ignore_dangling_symlinks=ignore_dangling_symlinks)
            else:
                # Will raise a SpecialFileError for unsupported file types.
                # Pass `srcobj`, not `srcentry`, so that custom copy
                # functions keep receiving a path string.
                copy_function(srcobj, dstname)
        # catch the Error from the recursive copytree so that we can
        # continue with other files
        except Error as err:
            errors.extend(err.args[0])
        except OSError as why:
            errors.append((srcname, dstname, str(why)))
    try:
        copystat(src, dst)
    except OSError as why:
        # Copying file access times may fail on Windows
        if getattr(why, 'winerror', None) is None:
            errors.append((src, dst, str(why)))
    if errors:
        raise Error(errors)
    return dst
def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2, def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
ignore_dangling_symlinks=False): ignore_dangling_symlinks=False):
"""Recursively copy a directory tree. """Recursively copy a directory tree.
@ -451,58 +524,10 @@ def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
function that supports the same signature (like copy()) can be used. function that supports the same signature (like copy()) can be used.
""" """
names = os.listdir(src) with os.scandir(src) as entries:
if ignore is not None: return _copytree(entries=entries, src=src, dst=dst, symlinks=symlinks,
ignored_names = ignore(src, names) ignore=ignore, copy_function=copy_function,
else: ignore_dangling_symlinks=ignore_dangling_symlinks)
ignored_names = set()
os.makedirs(dst)
errors = []
for name in names:
if name in ignored_names:
continue
srcname = os.path.join(src, name)
dstname = os.path.join(dst, name)
try:
if os.path.islink(srcname):
linkto = os.readlink(srcname)
if symlinks:
# We can't just leave it to `copy_function` because legacy
# code with a custom `copy_function` may rely on copytree
# doing the right thing.
os.symlink(linkto, dstname)
copystat(srcname, dstname, follow_symlinks=not symlinks)
else:
# ignore dangling symlink if the flag is on
if not os.path.exists(linkto) and ignore_dangling_symlinks:
continue
# otherwise let the copy occurs. copy2 will raise an error
if os.path.isdir(srcname):
copytree(srcname, dstname, symlinks, ignore,
copy_function)
else:
copy_function(srcname, dstname)
elif os.path.isdir(srcname):
copytree(srcname, dstname, symlinks, ignore, copy_function)
else:
# Will raise a SpecialFileError for unsupported file types
copy_function(srcname, dstname)
# catch the Error from the recursive copytree so that we can
# continue with other files
except Error as err:
errors.extend(err.args[0])
except OSError as why:
errors.append((srcname, dstname, str(why)))
try:
copystat(src, dst)
except OSError as why:
# Copying file access times may fail on Windows
if getattr(why, 'winerror', None) is None:
errors.append((src, dst, str(why)))
if errors:
raise Error(errors)
return dst
# version vulnerable to race conditions # version vulnerable to race conditions
def _rmtree_unsafe(path, onerror): def _rmtree_unsafe(path, onerror):

View File

@ -0,0 +1,7 @@
:func:`shutil.copytree` uses the :func:`os.scandir` function and all copy
functions depending on it use cached :func:`os.stat` values. The speedup
for copying a directory with 8000 files is around +9% on Linux, +20% on
Windows and +30% on a Windows SMB share. Also the number of :func:`os.stat`
syscalls is reduced by 38% making :func:`shutil.copytree` especially faster
on network filesystems.
(Contributed by Giampaolo Rodola' in :issue:`33695`.)