closes bpo-31650: PEP 552 (Deterministic pycs) implementation (#4575)

Python now supports checking bytecode cache up-to-dateness with a hash of the
source contents rather than volatile source metadata. See the PEP for details.

While a fairly straightforward idea, quite a lot of code had to be modified due
to the pervasiveness of pyc implementation details in the codebase. Changes in
this commit include:

- The core changes to importlib to understand how to read, validate, and
  regenerate hash-based pycs.

- Support for generating hash-based pycs in py_compile and compileall.

- Modifications to our siphash implementation to support passing a custom
  key. We then expose it to importlib through _imp.

- Updates to all places in the interpreter, standard library, and tests that
  manually generate or parse pyc files to grok the new format.

- Support in the interpreter command line code for long options like
  --check-hash-based-pycs.

- Tests and documentation for all of the above.
This commit is contained in:
Benjamin Peterson 2017-12-09 10:26:52 -08:00 committed by GitHub
parent 28d8d14013
commit 42aa93b8ff
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
33 changed files with 3364 additions and 2505 deletions

View File

@ -458,6 +458,12 @@ Glossary
is believed that overcoming this performance issue would make the
implementation much more complicated and therefore costlier to maintain.
hash-based pyc
A bytecode cache file that uses the hash rather than the last-modified
time of the corresponding source file to determine its validity. See
:ref:`pyc-invalidation`.
hashable
An object is *hashable* if it has a hash value which never changes during
its lifetime (it needs a :meth:`__hash__` method), and can be compared to

View File

@ -83,6 +83,16 @@ compile Python sources.
If ``0`` is used, then the result of :func:`os.cpu_count()`
will be used.
.. cmdoption:: --invalidation-mode [timestamp|checked-hash|unchecked-hash]
Control how the generated pycs will be invalidated at runtime. The default
setting, ``timestamp``, means that ``.pyc`` files with the source timestamp
and size embedded will be generated. The ``checked-hash`` and
``unchecked-hash`` values cause hash-based pycs to be generated. Hash-based
pycs embed a hash of the source file contents rather than a timestamp. See
:ref:`pyc-invalidation` for more information on how Python validates bytecode
cache files at runtime.
.. versionchanged:: 3.2
Added the ``-i``, ``-b`` and ``-h`` options.
@ -91,6 +101,9 @@ compile Python sources.
was changed to a multilevel value. ``-b`` will always produce a
byte-code file ending in ``.pyc``, never ``.pyo``.
.. versionchanged:: 3.7
Added the ``--invalidation-mode`` parameter.
There is no command-line option to control the optimization level used by the
:func:`compile` function, because the Python interpreter itself already
@ -99,7 +112,7 @@ provides the option: :program:`python -O -m compileall`.
Public functions
----------------
.. function:: compile_dir(dir, maxlevels=10, ddir=None, force=False, rx=None, quiet=0, legacy=False, optimize=-1, workers=1)
.. function:: compile_dir(dir, maxlevels=10, ddir=None, force=False, rx=None, quiet=0, legacy=False, optimize=-1, workers=1, invalidation_mode=py_compile.PycInvalidationMode.TIMESTAMP)
Recursively descend the directory tree named by *dir*, compiling all :file:`.py`
files along the way. Return a true value if all the files compiled successfully,
@ -140,6 +153,10 @@ Public functions
then sequential compilation will be used as a fallback. If *workers* is
lower than ``0``, a :exc:`ValueError` will be raised.
*invalidation_mode* should be a member of the
:class:`py_compile.PycInvalidationMode` enum and controls how the generated
pycs are invalidated at runtime.
.. versionchanged:: 3.2
Added the *legacy* and *optimize* parameters.
@ -156,7 +173,10 @@ Public functions
.. versionchanged:: 3.6
Accepts a :term:`path-like object`.
.. function:: compile_file(fullname, ddir=None, force=False, rx=None, quiet=0, legacy=False, optimize=-1)
.. versionchanged:: 3.7
The *invalidation_mode* parameter was added.
.. function:: compile_file(fullname, ddir=None, force=False, rx=None, quiet=0, legacy=False, optimize=-1, invalidation_mode=py_compile.PycInvalidationMode.TIMESTAMP)
Compile the file with path *fullname*. Return a true value if the file
compiled successfully, and a false value otherwise.
@ -184,6 +204,10 @@ Public functions
*optimize* specifies the optimization level for the compiler. It is passed to
the built-in :func:`compile` function.
*invalidation_mode* should be a member of the
:class:`py_compile.PycInvalidationMode` enum and controls how the generated
pycs are invalidated at runtime.
.. versionadded:: 3.2
.. versionchanged:: 3.5
@ -193,7 +217,10 @@ Public functions
The *legacy* parameter only writes out ``.pyc`` files, not ``.pyo`` files
no matter what the value of *optimize* is.
.. function:: compile_path(skip_curdir=True, maxlevels=0, force=False, quiet=0, legacy=False, optimize=-1)
.. versionchanged:: 3.7
The *invalidation_mode* parameter was added.
.. function:: compile_path(skip_curdir=True, maxlevels=0, force=False, quiet=0, legacy=False, optimize=-1, invalidation_mode=py_compile.PycInvalidationMode.TIMESTAMP)
Byte-compile all the :file:`.py` files found along ``sys.path``. Return a
true value if all the files compiled successfully, and a false value otherwise.
@ -213,6 +240,9 @@ Public functions
The *legacy* parameter only writes out ``.pyc`` files, not ``.pyo`` files
no matter what the value of *optimize* is.
.. versionchanged:: 3.7
The *invalidation_mode* parameter was added.
To force a recompile of all the :file:`.py` files in the :file:`Lib/`
subdirectory and all its subdirectories::

View File

@ -67,6 +67,9 @@ generically as an :term:`importer`) to participate in the import process.
:pep:`489`
Multi-phase extension module initialization
:pep:`552`
Deterministic pycs
:pep:`3120`
Using UTF-8 as the Default Source Encoding
@ -1327,6 +1330,14 @@ an :term:`importer`.
.. versionchanged:: 3.6
Accepts a :term:`path-like object`.
.. function:: source_hash(source_bytes)
Return the hash of *source_bytes* as bytes. A hash-based ``.pyc`` file embeds
the :func:`source_hash` of the corresponding source file's contents in its
header.
.. versionadded:: 3.7
.. class:: LazyLoader(loader)
A class which postpones the execution of the loader of a module until the

View File

@ -27,7 +27,7 @@ byte-code cache files in the directory containing the source code.
Exception raised when an error occurs while attempting to compile the file.
.. function:: compile(file, cfile=None, dfile=None, doraise=False, optimize=-1)
.. function:: compile(file, cfile=None, dfile=None, doraise=False, optimize=-1, invalidation_mode=PycInvalidationMode.TIMESTAMP)
Compile a source file to byte-code and write out the byte-code cache file.
The source code is loaded from the file named *file*. The byte-code is
@ -53,6 +53,10 @@ byte-code cache files in the directory containing the source code.
:func:`compile` function. The default of ``-1`` selects the optimization
level of the current interpreter.
*invalidation_mode* should be a member of the :class:`PycInvalidationMode`
enum and controls how the generated ``.pyc`` files are invalidated at
runtime.
.. versionchanged:: 3.2
Changed default value of *cfile* to be :PEP:`3147`-compliant. Previous
default was *file* + ``'c'`` (``'o'`` if optimization was enabled).
@ -65,6 +69,41 @@ byte-code cache files in the directory containing the source code.
caveat that :exc:`FileExistsError` is raised if *cfile* is a symlink or
non-regular file.
.. versionchanged:: 3.7
The *invalidation_mode* parameter was added as specified in :pep:`552`.
.. class:: PycInvalidationMode
An enumeration of possible methods the interpreter can use to determine
whether a bytecode file is up to date with a source file. The ``.pyc`` file
indicates the desired invalidation mode in its header. See
:ref:`pyc-invalidation` for more information on how Python invalidates
``.pyc`` files at runtime.
.. versionadded:: 3.7
.. attribute:: TIMESTAMP
The ``.pyc`` file includes the timestamp and size of the source file,
which Python will compare against the metadata of the source file at
runtime to determine if the ``.pyc`` file needs to be regenerated.
.. attribute:: CHECKED_HASH
The ``.pyc`` file includes a hash of the source file content, which Python
will compare against the source at runtime to determine if the ``.pyc``
file needs to be regenerated.
.. attribute:: UNCHECKED_HASH
Like :attr:`CHECKED_HASH`, the ``.pyc`` file includes a hash of the source
file content. However, Python will at runtime assume the ``.pyc`` file is
up to date and not validate the ``.pyc`` against the source file at all.
This option is useful when the ``.pyc`` files are kept up to date by some
system external to Python like a build system.
.. function:: main(args=None)

View File

@ -675,6 +675,33 @@ Here are the exact rules used:
:meth:`~importlib.abc.Loader.module_repr` method, if defined, before
trying either approach described above. However, the method is deprecated.
.. _pyc-invalidation:
Cached bytecode invalidation
----------------------------
Before Python loads cached bytecode from a ``.pyc`` file, it checks whether the
cache is up-to-date with the source ``.py`` file. By default, Python does this
by storing the source's last-modified timestamp and size in the cache file when
writing it. At runtime, the import system then validates the cache file by
checking the stored metadata in the cache file against the source's
metadata.
Python also supports "hash-based" cache files, which store a hash of the source
file's contents rather than its metadata. There are two variants of hash-based
``.pyc`` files: checked and unchecked. For checked hash-based ``.pyc`` files,
Python validates the cache file by hashing the source file and comparing the
resulting hash with the hash in the cache file. If a checked hash-based cache
file is found to be invalid, Python regenerates it and writes a new checked
hash-based cache file. For unchecked hash-based ``.pyc`` files, Python simply
assumes the cache file is valid if it exists. The validation behavior of
hash-based ``.pyc`` files may be overridden with the
:option:`--check-hash-based-pycs` flag.
.. versionchanged:: 3.7
Added hash-based ``.pyc`` files. Previously, Python only supported
timestamp-based invalidation of bytecode caches.
The Path Based Finder
=====================

View File

@ -210,6 +210,20 @@ Miscellaneous options
import of source modules. See also :envvar:`PYTHONDONTWRITEBYTECODE`.
.. cmdoption:: --check-hash-based-pycs default|always|never
Control the validation behavior of hash-based ``.pyc`` files. See
:ref:`pyc-invalidation`. When set to ``default``, checked and unchecked
hash-based bytecode cache files are validated according to their default
semantics. When set to ``always``, all hash-based ``.pyc`` files, whether
checked or unchecked, are validated against their corresponding source
file. When set to ``never``, hash-based ``.pyc`` files are not validated
against their corresponding source files.
The semantics of timestamp-based ``.pyc`` files are unaffected by this
option.
.. cmdoption:: -d
Turn on parser debugging output (for expert only, depending on compilation

View File

@ -197,6 +197,33 @@ variable is not set in practice.
See :option:`-X` ``dev`` for the details.
Hash-based pycs
---------------
Python has traditionally checked the up-to-dateness of bytecode cache files
(i.e., ``.pyc`` files) by comparing the source metadata (last-modified timestamp
and size) with source metadata saved in the cache file header when it was
generated. While effective, this invalidation method has its drawbacks. When
filesystem timestamps are too coarse, Python can miss source updates, leading to
user confusion. Additionally, having a timestamp in the cache file is
problematic for `build reproducibility <https://reproducible-builds.org/>`_ and
content-based build systems.
:pep:`552` extends the pyc format to allow the hash of the source file to be
used for invalidation instead of the source timestamp. Such ``.pyc`` files are
called "hash-based". By default, Python still uses timestamp-based invalidation
and does not generate hash-based ``.pyc`` files at runtime. Hash-based ``.pyc``
files may be generated with :mod:`py_compile` or :mod:`compileall`.
Hash-based ``.pyc`` files come in two variants: checked and unchecked. Python
validates checked hash-based ``.pyc`` files against the corresponding source
files at runtime but doesn't do so for unchecked hash-based pycs. Unchecked
hash-based ``.pyc`` files are a useful performance optimization for environments
where a system external to Python (e.g., the build system) is responsible for
keeping ``.pyc`` files up-to-date.
See :ref:`pyc-invalidation` for more information.
Other Language Changes
======================

6
Include/internal/hash.h Normal file
View File

@ -0,0 +1,6 @@
/* Internal hashing interface; the guard below prevents multiple inclusion. */
#ifndef Py_INTERNAL_HASH_H
#define Py_INTERNAL_HASH_H
/* Returns a 64-bit hash value.
   NOTE(review): the parameters are unnamed in this declaration; presumably
   they are (key, data buffer, buffer length in bytes), i.e. the buffer is
   hashed with a caller-supplied 64-bit key -- confirm against the
   definition. */
uint64_t _Py_KeyedHash(uint64_t, const char *, Py_ssize_t);
#endif /* !Py_INTERNAL_HASH_H */

View File

@ -0,0 +1,6 @@
/* Internal import-system declarations; the guard below prevents multiple
   inclusion. */
#ifndef Py_INTERNAL_IMPORT_H
#define Py_INTERNAL_IMPORT_H
/* Declared here, defined in another translation unit.
   NOTE(review): presumably holds the mode chosen with the
   --check-hash-based-pycs command-line option ("default", "always" or
   "never") -- confirm against the definition and its users. */
extern const char *_Py_CheckHashBasedPycsMode;
#endif /* !Py_INTERNAL_IMPORT_H */

View File

@ -12,7 +12,14 @@ PyAPI_DATA(wchar_t *) _PyOS_optarg;
PyAPI_FUNC(void) _PyOS_ResetGetOpt(void);
PyAPI_FUNC(int) _PyOS_GetOpt(int argc, wchar_t **argv, wchar_t *optstring);
/* Describes one long command-line option; an array of these is passed to
   _PyOS_GetOpt() through its longopts parameter. */
typedef struct {
/* Option name.  NOTE(review): presumably spelled without the leading "--" --
   confirm against the parser. */
const wchar_t *name;
/* NOTE(review): presumably nonzero when the option takes an argument --
   confirm. */
int has_arg;
/* NOTE(review): presumably the value _PyOS_GetOpt() reports when this option
   is matched -- confirm. */
int val;
} _PyOS_LongOption;
PyAPI_FUNC(int) _PyOS_GetOpt(int argc, wchar_t **argv, wchar_t *optstring,
const _PyOS_LongOption *longopts, int *longindex);
#endif /* !Py_LIMITED_API */
#ifdef __cplusplus

View File

@ -52,7 +52,8 @@ def _walk_dir(dir, ddir=None, maxlevels=10, quiet=0):
maxlevels=maxlevels - 1, quiet=quiet)
def compile_dir(dir, maxlevels=10, ddir=None, force=False, rx=None,
quiet=0, legacy=False, optimize=-1, workers=1):
quiet=0, legacy=False, optimize=-1, workers=1,
invalidation_mode=py_compile.PycInvalidationMode.TIMESTAMP):
"""Byte-compile all modules in the given directory tree.
Arguments (only dir is required):
@ -67,6 +68,7 @@ def compile_dir(dir, maxlevels=10, ddir=None, force=False, rx=None,
legacy: if True, produce legacy pyc paths instead of PEP 3147 paths
optimize: optimization level or -1 for level of the interpreter
workers: maximum number of parallel workers
invalidation_mode: how the up-to-dateness of the pyc will be checked
"""
if workers is not None and workers < 0:
raise ValueError('workers must be greater or equal to 0')
@ -81,18 +83,20 @@ def compile_dir(dir, maxlevels=10, ddir=None, force=False, rx=None,
ddir=ddir, force=force,
rx=rx, quiet=quiet,
legacy=legacy,
optimize=optimize),
optimize=optimize,
invalidation_mode=invalidation_mode),
files)
success = min(results, default=True)
else:
for file in files:
if not compile_file(file, ddir, force, rx, quiet,
legacy, optimize):
legacy, optimize, invalidation_mode):
success = False
return success
def compile_file(fullname, ddir=None, force=False, rx=None, quiet=0,
legacy=False, optimize=-1):
legacy=False, optimize=-1,
invalidation_mode=py_compile.PycInvalidationMode.TIMESTAMP):
"""Byte-compile one file.
Arguments (only fullname is required):
@ -105,6 +109,7 @@ def compile_file(fullname, ddir=None, force=False, rx=None, quiet=0,
no output with 2
legacy: if True, produce legacy pyc paths instead of PEP 3147 paths
optimize: optimization level or -1 for level of the interpreter
invalidation_mode: how the up-to-dateness of the pyc will be checked
"""
success = True
if quiet < 2 and isinstance(fullname, os.PathLike):
@ -134,10 +139,10 @@ def compile_file(fullname, ddir=None, force=False, rx=None, quiet=0,
if not force:
try:
mtime = int(os.stat(fullname).st_mtime)
expect = struct.pack('<4sl', importlib.util.MAGIC_NUMBER,
mtime)
expect = struct.pack('<4sll', importlib.util.MAGIC_NUMBER,
0, mtime)
with open(cfile, 'rb') as chandle:
actual = chandle.read(8)
actual = chandle.read(12)
if expect == actual:
return success
except OSError:
@ -146,7 +151,8 @@ def compile_file(fullname, ddir=None, force=False, rx=None, quiet=0,
print('Compiling {!r}...'.format(fullname))
try:
ok = py_compile.compile(fullname, cfile, dfile, True,
optimize=optimize)
optimize=optimize,
invalidation_mode=invalidation_mode)
except py_compile.PyCompileError as err:
success = False
if quiet >= 2:
@ -175,7 +181,8 @@ def compile_file(fullname, ddir=None, force=False, rx=None, quiet=0,
return success
def compile_path(skip_curdir=1, maxlevels=0, force=False, quiet=0,
legacy=False, optimize=-1):
legacy=False, optimize=-1,
invalidation_mode=py_compile.PycInvalidationMode.TIMESTAMP):
"""Byte-compile all module on sys.path.
Arguments (all optional):
@ -186,6 +193,7 @@ def compile_path(skip_curdir=1, maxlevels=0, force=False, quiet=0,
quiet: as for compile_dir() (default 0)
legacy: as for compile_dir() (default False)
optimize: as for compile_dir() (default -1)
invalidation_mode: as for compiler_dir()
"""
success = True
for dir in sys.path:
@ -193,9 +201,16 @@ def compile_path(skip_curdir=1, maxlevels=0, force=False, quiet=0,
if quiet < 2:
print('Skipping current directory')
else:
success = success and compile_dir(dir, maxlevels, None,
force, quiet=quiet,
legacy=legacy, optimize=optimize)
success = success and compile_dir(
dir,
maxlevels,
None,
force,
quiet=quiet,
legacy=legacy,
optimize=optimize,
invalidation_mode=invalidation_mode,
)
return success
@ -238,6 +253,11 @@ def main():
'to the equivalent of -l sys.path'))
parser.add_argument('-j', '--workers', default=1,
type=int, help='Run compileall concurrently')
invalidation_modes = [mode.name.lower().replace('_', '-')
for mode in py_compile.PycInvalidationMode]
parser.add_argument('--invalidation-mode', default='timestamp',
choices=sorted(invalidation_modes),
help='How the pycs will be invalidated at runtime')
args = parser.parse_args()
compile_dests = args.compile_dest
@ -266,23 +286,29 @@ def main():
if args.workers is not None:
args.workers = args.workers or None
ivl_mode = args.invalidation_mode.replace('-', '_').upper()
invalidation_mode = py_compile.PycInvalidationMode[ivl_mode]
success = True
try:
if compile_dests:
for dest in compile_dests:
if os.path.isfile(dest):
if not compile_file(dest, args.ddir, args.force, args.rx,
args.quiet, args.legacy):
args.quiet, args.legacy,
invalidation_mode=invalidation_mode):
success = False
else:
if not compile_dir(dest, maxlevels, args.ddir,
args.force, args.rx, args.quiet,
args.legacy, workers=args.workers):
args.legacy, workers=args.workers,
invalidation_mode=invalidation_mode):
success = False
return success
else:
return compile_path(legacy=args.legacy, force=args.force,
quiet=args.quiet)
quiet=args.quiet,
invalidation_mode=invalidation_mode)
except KeyboardInterrupt:
if args.quiet < 2:
print("\n[interrupted]")

View File

@ -242,6 +242,7 @@ _code_type = type(_write_atomic.__code__)
# Python 3.6rc1 3379 (more thorough __class__ validation #23722)
# Python 3.7a0 3390 (add LOAD_METHOD and CALL_METHOD opcodes)
# Python 3.7a0 3391 (update GET_AITER #31709)
# Python 3.7a0 3392 (PEP 552: Deterministic pycs)
#
# MAGIC must change whenever the bytecode emitted by the compiler may no
# longer be understood by older implementations of the eval loop (usually
@ -250,7 +251,7 @@ _code_type = type(_write_atomic.__code__)
# Whenever MAGIC_NUMBER is changed, the ranges in the magic_values array
# in PC/launcher.c must also be updated.
MAGIC_NUMBER = (3391).to_bytes(2, 'little') + b'\r\n'
MAGIC_NUMBER = (3392).to_bytes(2, 'little') + b'\r\n'
_RAW_MAGIC_NUMBER = int.from_bytes(MAGIC_NUMBER, 'little') # For import.c
_PYCACHE = '__pycache__'
@ -429,63 +430,93 @@ def _find_module_shim(self, fullname):
return loader
def _validate_bytecode_header(data, source_stats=None, name=None, path=None):
"""Validate the header of the passed-in bytecode against source_stats (if
given) and returning the bytecode that can be compiled by compile().
def _classify_pyc(data, name, exc_details):
"""Perform basic validity checking of a pyc header and return the flags field,
which determines how the pyc should be further validated against the source.
All other arguments are used to enhance error reporting.
*data* is the contents of the pyc file. (Only the first 16 bytes are
required, though.)
ImportError is raised when the magic number is incorrect or the bytecode is
found to be stale. EOFError is raised when the data is found to be
truncated.
*name* is the name of the module being imported. It is used for logging.
*exc_details* is a dictionary passed to ImportError if it raised for
improved debugging.
ImportError is raised when the magic number is incorrect or when the flags
field is invalid. EOFError is raised when the data is found to be truncated.
"""
exc_details = {}
if name is not None:
exc_details['name'] = name
else:
# To prevent having to make all messages have a conditional name.
name = '<bytecode>'
if path is not None:
exc_details['path'] = path
magic = data[:4]
raw_timestamp = data[4:8]
raw_size = data[8:12]
if magic != MAGIC_NUMBER:
message = 'bad magic number in {!r}: {!r}'.format(name, magic)
message = f'bad magic number in {name!r}: {magic!r}'
_bootstrap._verbose_message('{}', message)
raise ImportError(message, **exc_details)
elif len(raw_timestamp) != 4:
message = 'reached EOF while reading timestamp in {!r}'.format(name)
if len(data) < 16:
message = f'reached EOF while reading pyc header of {name!r}'
_bootstrap._verbose_message('{}', message)
raise EOFError(message)
elif len(raw_size) != 4:
message = 'reached EOF while reading size of source in {!r}'.format(name)
flags = _r_long(data[4:8])
# Only the first two flags are defined.
if flags & ~0b11:
message = f'invalid flags {flags!r} in {name!r}'
raise ImportError(message, **exc_details)
return flags
def _validate_timestamp_pyc(data, source_mtime, source_size, name,
exc_details):
"""Validate a pyc against the source last-modified time.
*data* is the contents of the pyc file. (Only the first 16 bytes are
required.)
*source_mtime* is the last modified timestamp of the source file.
*source_size* is None or the size of the source file in bytes.
*name* is the name of the module being imported. It is used for logging.
*exc_details* is a dictionary passed to ImportError if it raised for
improved debugging.
An ImportError is raised if the bytecode is stale.
"""
if _r_long(data[8:12]) != (source_mtime & 0xFFFFFFFF):
message = f'bytecode is stale for {name!r}'
_bootstrap._verbose_message('{}', message)
raise EOFError(message)
if source_stats is not None:
try:
source_mtime = int(source_stats['mtime'])
except KeyError:
pass
else:
if _r_long(raw_timestamp) != source_mtime:
message = 'bytecode is stale for {!r}'.format(name)
_bootstrap._verbose_message('{}', message)
raise ImportError(message, **exc_details)
try:
source_size = source_stats['size'] & 0xFFFFFFFF
except KeyError:
pass
else:
if _r_long(raw_size) != source_size:
raise ImportError('bytecode is stale for {!r}'.format(name),
**exc_details)
return data[12:]
raise ImportError(message, **exc_details)
if (source_size is not None and
_r_long(data[12:16]) != (source_size & 0xFFFFFFFF)):
raise ImportError(f'bytecode is stale for {name!r}', **exc_details)
def _validate_hash_pyc(data, source_hash, name, exc_details):
"""Validate a hash-based pyc by checking the real source hash against the one in
the pyc header.
*data* is the contents of the pyc file. (Only the first 16 bytes are
required.)
*source_hash* is the importlib.util.source_hash() of the source file.
*name* is the name of the module being imported. It is used for logging.
*exc_details* is a dictionary passed to ImportError if it raised for
improved debugging.
An ImportError is raised if the bytecode is stale.
"""
if data[8:16] != source_hash:
raise ImportError(
f'hash in bytecode doesn\'t match hash of source {name!r}',
**exc_details,
)
def _compile_bytecode(data, name=None, bytecode_path=None, source_path=None):
"""Compile bytecode as returned by _validate_bytecode_header()."""
"""Compile bytecode as found in a pyc."""
code = marshal.loads(data)
if isinstance(code, _code_type):
_bootstrap._verbose_message('code object from {!r}', bytecode_path)
@ -496,16 +527,28 @@ def _compile_bytecode(data, name=None, bytecode_path=None, source_path=None):
raise ImportError('Non-code object in {!r}'.format(bytecode_path),
name=name, path=bytecode_path)
def _code_to_bytecode(code, mtime=0, source_size=0):
"""Compile a code object into bytecode for writing out to a byte-compiled
file."""
def _code_to_timestamp_pyc(code, mtime=0, source_size=0):
"Produce the data for a timestamp-based pyc."
data = bytearray(MAGIC_NUMBER)
data.extend(_w_long(0))
data.extend(_w_long(mtime))
data.extend(_w_long(source_size))
data.extend(marshal.dumps(code))
return data
def _code_to_hash_pyc(code, source_hash, checked=True):
"Produce the data for a hash-based pyc."
data = bytearray(MAGIC_NUMBER)
flags = 0b1 | checked << 1
data.extend(_w_long(flags))
assert len(source_hash) == 8
data.extend(source_hash)
data.extend(marshal.dumps(code))
return data
def decode_source(source_bytes):
"""Decode bytes representing source code and return the string.
@ -751,6 +794,10 @@ class SourceLoader(_LoaderBasics):
"""
source_path = self.get_filename(fullname)
source_mtime = None
source_bytes = None
source_hash = None
hash_based = False
check_source = True
try:
bytecode_path = cache_from_source(source_path)
except NotImplementedError:
@ -767,10 +814,34 @@ class SourceLoader(_LoaderBasics):
except OSError:
pass
else:
exc_details = {
'name': fullname,
'path': bytecode_path,
}
try:
bytes_data = _validate_bytecode_header(data,
source_stats=st, name=fullname,
path=bytecode_path)
flags = _classify_pyc(data, fullname, exc_details)
bytes_data = memoryview(data)[16:]
hash_based = flags & 0b1 != 0
if hash_based:
check_source = flags & 0b10 != 0
if (_imp.check_hash_based_pycs != 'never' and
(check_source or
_imp.check_hash_based_pycs == 'always')):
source_bytes = self.get_data(source_path)
source_hash = _imp.source_hash(
_RAW_MAGIC_NUMBER,
source_bytes,
)
_validate_hash_pyc(data, source_hash, fullname,
exc_details)
else:
_validate_timestamp_pyc(
data,
source_mtime,
st['size'],
fullname,
exc_details,
)
except (ImportError, EOFError):
pass
else:
@ -779,13 +850,19 @@ class SourceLoader(_LoaderBasics):
return _compile_bytecode(bytes_data, name=fullname,
bytecode_path=bytecode_path,
source_path=source_path)
source_bytes = self.get_data(source_path)
if source_bytes is None:
source_bytes = self.get_data(source_path)
code_object = self.source_to_code(source_bytes, source_path)
_bootstrap._verbose_message('code object from {}', source_path)
if (not sys.dont_write_bytecode and bytecode_path is not None and
source_mtime is not None):
data = _code_to_bytecode(code_object, source_mtime,
len(source_bytes))
if hash_based:
if source_hash is None:
source_hash = _imp.source_hash(source_bytes)
data = _code_to_hash_pyc(code_object, source_hash, check_source)
else:
data = _code_to_timestamp_pyc(code_object, source_mtime,
len(source_bytes))
try:
self._cache_bytecode(source_path, bytecode_path, data)
_bootstrap._verbose_message('wrote {!r}', bytecode_path)
@ -887,8 +964,18 @@ class SourcelessFileLoader(FileLoader, _LoaderBasics):
def get_code(self, fullname):
path = self.get_filename(fullname)
data = self.get_data(path)
bytes_data = _validate_bytecode_header(data, name=fullname, path=path)
return _compile_bytecode(bytes_data, name=fullname, bytecode_path=path)
# Call _classify_pyc to do basic validation of the pyc but ignore the
# result. There's no source to check against.
exc_details = {
'name': fullname,
'path': path,
}
_classify_pyc(data, fullname, exc_details)
return _compile_bytecode(
memoryview(data)[16:],
name=fullname,
bytecode_path=path,
)
def get_source(self, fullname):
"""Return None as there is no source code."""

View File

@ -5,18 +5,25 @@ from ._bootstrap import _resolve_name
from ._bootstrap import spec_from_loader
from ._bootstrap import _find_spec
from ._bootstrap_external import MAGIC_NUMBER
from ._bootstrap_external import _RAW_MAGIC_NUMBER
from ._bootstrap_external import cache_from_source
from ._bootstrap_external import decode_source
from ._bootstrap_external import source_from_cache
from ._bootstrap_external import spec_from_file_location
from contextlib import contextmanager
import _imp
import functools
import sys
import types
import warnings
def source_hash(source_bytes):
"Return the hash of *source_bytes* as used in hash-based pyc files."
return _imp.source_hash(_RAW_MAGIC_NUMBER, source_bytes)
def resolve_name(name, package):
"""Resolve a relative module name to an absolute one."""
if not name.startswith('.'):

View File

@ -287,11 +287,12 @@ class ModuleFinder:
co = compile(fp.read()+'\n', pathname, 'exec')
elif type == imp.PY_COMPILED:
try:
marshal_data = importlib._bootstrap_external._validate_bytecode_header(fp.read())
data = fp.read()
importlib._bootstrap_external._classify_pyc(data, fqname, {})
except ImportError as exc:
self.msgout(2, "raise ImportError: " + str(exc), pathname)
raise
co = marshal.loads(marshal_data)
co = marshal.loads(memoryview(data)[16:])
else:
co = None
m = self.add_module(fqname)

View File

@ -46,7 +46,7 @@ def read_code(stream):
if magic != importlib.util.MAGIC_NUMBER:
return None
stream.read(8) # Skip timestamp and size
stream.read(12) # Skip rest of the header
return marshal.load(stream)

View File

@ -3,6 +3,7 @@
This module has intimate knowledge of the format of .pyc files.
"""
import enum
import importlib._bootstrap_external
import importlib.machinery
import importlib.util
@ -11,7 +12,7 @@ import os.path
import sys
import traceback
__all__ = ["compile", "main", "PyCompileError"]
__all__ = ["compile", "main", "PyCompileError", "PycInvalidationMode"]
class PyCompileError(Exception):
@ -62,7 +63,14 @@ class PyCompileError(Exception):
return self.msg
def compile(file, cfile=None, dfile=None, doraise=False, optimize=-1):
class PycInvalidationMode(enum.Enum):
TIMESTAMP = 1
CHECKED_HASH = 2
UNCHECKED_HASH = 3
def compile(file, cfile=None, dfile=None, doraise=False, optimize=-1,
invalidation_mode=PycInvalidationMode.TIMESTAMP):
"""Byte-compile one Python source file to Python bytecode.
:param file: The source file name.
@ -79,6 +87,7 @@ def compile(file, cfile=None, dfile=None, doraise=False, optimize=-1):
:param optimize: The optimization level for the compiler. Valid values
are -1, 0, 1 and 2. A value of -1 means to use the optimization
level of the current interpreter, as given by -O command line options.
:param invalidation_mode:
:return: Path to the resulting byte compiled file.
@ -136,9 +145,17 @@ def compile(file, cfile=None, dfile=None, doraise=False, optimize=-1):
os.makedirs(dirname)
except FileExistsError:
pass
source_stats = loader.path_stats(file)
bytecode = importlib._bootstrap_external._code_to_bytecode(
if invalidation_mode == PycInvalidationMode.TIMESTAMP:
source_stats = loader.path_stats(file)
bytecode = importlib._bootstrap_external._code_to_timestamp_pyc(
code, source_stats['mtime'], source_stats['size'])
else:
source_hash = importlib.util.source_hash(source_bytes)
bytecode = importlib._bootstrap_external._code_to_hash_pyc(
code,
source_hash,
(invalidation_mode == PycInvalidationMode.CHECKED_HASH),
)
mode = importlib._bootstrap_external._calc_mode(file)
importlib._bootstrap_external._write_atomic(cfile, bytecode, mode)
return cfile

View File

@ -48,9 +48,9 @@ class CompileallTests(unittest.TestCase):
def data(self):
with open(self.bc_path, 'rb') as file:
data = file.read(8)
data = file.read(12)
mtime = int(os.stat(self.source_path).st_mtime)
compare = struct.pack('<4sl', importlib.util.MAGIC_NUMBER, mtime)
compare = struct.pack('<4sll', importlib.util.MAGIC_NUMBER, 0, mtime)
return data, compare
@unittest.skipUnless(hasattr(os, 'stat'), 'test needs os.stat()')
@ -70,8 +70,8 @@ class CompileallTests(unittest.TestCase):
def test_mtime(self):
# Test a change in mtime leads to a new .pyc.
self.recreation_check(struct.pack('<4sl', importlib.util.MAGIC_NUMBER,
1))
self.recreation_check(struct.pack('<4sll', importlib.util.MAGIC_NUMBER,
0, 1))
def test_magic_number(self):
# Test a change in the magic number leads to a new .pyc.
@ -519,6 +519,19 @@ class CommandLineTests(unittest.TestCase):
out = self.assertRunOK('badfilename')
self.assertRegex(out, b"Can't list 'badfilename'")
def test_pyc_invalidation_mode(self):
script_helper.make_script(self.pkgdir, 'f1', '')
pyc = importlib.util.cache_from_source(
os.path.join(self.pkgdir, 'f1.py'))
self.assertRunOK('--invalidation-mode=checked-hash', self.pkgdir)
with open(pyc, 'rb') as fp:
data = fp.read()
self.assertEqual(int.from_bytes(data[4:8], 'little'), 0b11)
self.assertRunOK('--invalidation-mode=unchecked-hash', self.pkgdir)
with open(pyc, 'rb') as fp:
data = fp.read()
self.assertEqual(int.from_bytes(data[4:8], 'little'), 0b01)
@skipUnless(_have_multiprocessing, "requires multiprocessing")
def test_workers(self):
bar2fn = script_helper.make_script(self.directory, 'bar2', '')

View File

@ -4,11 +4,13 @@ import os
import os.path
import sys
from test import support
from test.support import script_helper
import unittest
import warnings
with warnings.catch_warnings():
warnings.simplefilter('ignore', DeprecationWarning)
import imp
import _imp
def requires_load_dynamic(meth):
@ -329,6 +331,25 @@ class ImportTests(unittest.TestCase):
with self.assertRaises(TypeError):
create_dynamic(BadSpec())
def test_source_hash(self):
self.assertEqual(_imp.source_hash(42, b'hi'), b'\xc6\xe7Z\r\x03:}\xab')
self.assertEqual(_imp.source_hash(43, b'hi'), b'\x85\x9765\xf8\x9a\x8b9')
def test_pyc_invalidation_mode_from_cmdline(self):
cases = [
([], "default"),
(["--check-hash-based-pycs", "default"], "default"),
(["--check-hash-based-pycs", "always"], "always"),
(["--check-hash-based-pycs", "never"], "never"),
]
for interp_args, expected in cases:
args = interp_args + [
"-c",
"import _imp; print(_imp.check_hash_based_pycs)",
]
res = script_helper.assert_python_ok(*args)
self.assertEqual(res.out.strip().decode('utf-8'), expected)
class ReloadTests(unittest.TestCase):

View File

@ -598,7 +598,7 @@ func_filename = func.__code__.co_filename
def test_foreign_code(self):
py_compile.compile(self.file_name)
with open(self.compiled_name, "rb") as f:
header = f.read(12)
header = f.read(16)
code = marshal.load(f)
constants = list(code.co_consts)
foreign_code = importlib.import_module.__code__

View File

@ -235,6 +235,123 @@ class SimpleTest(abc.LoaderTests):
warnings.simplefilter('ignore', DeprecationWarning)
loader.load_module('bad name')
@util.writes_bytecode_files
def test_checked_hash_based_pyc(self):
with util.create_modules('_temp') as mapping:
source = mapping['_temp']
pyc = self.util.cache_from_source(source)
with open(source, 'wb') as fp:
fp.write(b'state = "old"')
os.utime(source, (50, 50))
py_compile.compile(
source,
invalidation_mode=py_compile.PycInvalidationMode.CHECKED_HASH,
)
loader = self.machinery.SourceFileLoader('_temp', source)
mod = types.ModuleType('_temp')
mod.__spec__ = self.util.spec_from_loader('_temp', loader)
loader.exec_module(mod)
self.assertEqual(mod.state, 'old')
# Write a new source with the same mtime and size as before.
with open(source, 'wb') as fp:
fp.write(b'state = "new"')
os.utime(source, (50, 50))
loader.exec_module(mod)
self.assertEqual(mod.state, 'new')
with open(pyc, 'rb') as fp:
data = fp.read()
self.assertEqual(int.from_bytes(data[4:8], 'little'), 0b11)
self.assertEqual(
self.util.source_hash(b'state = "new"'),
data[8:16],
)
@util.writes_bytecode_files
def test_overriden_checked_hash_based_pyc(self):
with util.create_modules('_temp') as mapping, \
unittest.mock.patch('_imp.check_hash_based_pycs', 'never'):
source = mapping['_temp']
pyc = self.util.cache_from_source(source)
with open(source, 'wb') as fp:
fp.write(b'state = "old"')
os.utime(source, (50, 50))
py_compile.compile(
source,
invalidation_mode=py_compile.PycInvalidationMode.CHECKED_HASH,
)
loader = self.machinery.SourceFileLoader('_temp', source)
mod = types.ModuleType('_temp')
mod.__spec__ = self.util.spec_from_loader('_temp', loader)
loader.exec_module(mod)
self.assertEqual(mod.state, 'old')
# Write a new source with the same mtime and size as before.
with open(source, 'wb') as fp:
fp.write(b'state = "new"')
os.utime(source, (50, 50))
loader.exec_module(mod)
self.assertEqual(mod.state, 'old')
@util.writes_bytecode_files
def test_unchecked_hash_based_pyc(self):
with util.create_modules('_temp') as mapping:
source = mapping['_temp']
pyc = self.util.cache_from_source(source)
with open(source, 'wb') as fp:
fp.write(b'state = "old"')
os.utime(source, (50, 50))
py_compile.compile(
source,
invalidation_mode=py_compile.PycInvalidationMode.UNCHECKED_HASH,
)
loader = self.machinery.SourceFileLoader('_temp', source)
mod = types.ModuleType('_temp')
mod.__spec__ = self.util.spec_from_loader('_temp', loader)
loader.exec_module(mod)
self.assertEqual(mod.state, 'old')
# Update the source file, which should be ignored.
with open(source, 'wb') as fp:
fp.write(b'state = "new"')
loader.exec_module(mod)
self.assertEqual(mod.state, 'old')
with open(pyc, 'rb') as fp:
data = fp.read()
self.assertEqual(int.from_bytes(data[4:8], 'little'), 0b1)
self.assertEqual(
self.util.source_hash(b'state = "old"'),
data[8:16],
)
@util.writes_bytecode_files
def test_overiden_unchecked_hash_based_pyc(self):
with util.create_modules('_temp') as mapping, \
unittest.mock.patch('_imp.check_hash_based_pycs', 'always'):
source = mapping['_temp']
pyc = self.util.cache_from_source(source)
with open(source, 'wb') as fp:
fp.write(b'state = "old"')
os.utime(source, (50, 50))
py_compile.compile(
source,
invalidation_mode=py_compile.PycInvalidationMode.UNCHECKED_HASH,
)
loader = self.machinery.SourceFileLoader('_temp', source)
mod = types.ModuleType('_temp')
mod.__spec__ = self.util.spec_from_loader('_temp', loader)
loader.exec_module(mod)
self.assertEqual(mod.state, 'old')
# Update the source file; with check_hash_based_pycs patched to 'always'
# the change must be picked up even though the pyc is unchecked.
with open(source, 'wb') as fp:
fp.write(b'state = "new"')
loader.exec_module(mod)
self.assertEqual(mod.state, 'new')
with open(pyc, 'rb') as fp:
data = fp.read()
self.assertEqual(int.from_bytes(data[4:8], 'little'), 0b1)
self.assertEqual(
self.util.source_hash(b'state = "new"'),
data[8:16],
)
(Frozen_SimpleTest,
Source_SimpleTest
@ -247,15 +364,17 @@ class BadBytecodeTest:
def import_(self, file, module_name):
raise NotImplementedError
def manipulate_bytecode(self, name, mapping, manipulator, *,
del_source=False):
def manipulate_bytecode(self,
name, mapping, manipulator, *,
del_source=False,
invalidation_mode=py_compile.PycInvalidationMode.TIMESTAMP):
"""Manipulate the bytecode of a module by passing it into a callable
that returns what to use as the new bytecode."""
try:
del sys.modules['_temp']
except KeyError:
pass
py_compile.compile(mapping[name])
py_compile.compile(mapping[name], invalidation_mode=invalidation_mode)
if not del_source:
bytecode_path = self.util.cache_from_source(mapping[name])
else:
@ -294,24 +413,51 @@ class BadBytecodeTest:
del_source=del_source)
test('_temp', mapping, bc_path)
def _test_partial_timestamp(self, test, *, del_source=False):
def _test_partial_flags(self, test, *, del_source=False):
with util.create_modules('_temp') as mapping:
bc_path = self.manipulate_bytecode('_temp', mapping,
lambda bc: bc[:7],
del_source=del_source)
lambda bc: bc[:7],
del_source=del_source)
test('_temp', mapping, bc_path)
def _test_partial_size(self, test, *, del_source=False):
def _test_partial_hash(self, test, *, del_source=False):
with util.create_modules('_temp') as mapping:
bc_path = self.manipulate_bytecode(
'_temp',
mapping,
lambda bc: bc[:13],
del_source=del_source,
invalidation_mode=py_compile.PycInvalidationMode.CHECKED_HASH,
)
test('_temp', mapping, bc_path)
with util.create_modules('_temp') as mapping:
bc_path = self.manipulate_bytecode(
'_temp',
mapping,
lambda bc: bc[:13],
del_source=del_source,
invalidation_mode=py_compile.PycInvalidationMode.UNCHECKED_HASH,
)
test('_temp', mapping, bc_path)
def _test_partial_timestamp(self, test, *, del_source=False):
with util.create_modules('_temp') as mapping:
bc_path = self.manipulate_bytecode('_temp', mapping,
lambda bc: bc[:11],
del_source=del_source)
test('_temp', mapping, bc_path)
def _test_partial_size(self, test, *, del_source=False):
with util.create_modules('_temp') as mapping:
bc_path = self.manipulate_bytecode('_temp', mapping,
lambda bc: bc[:15],
del_source=del_source)
test('_temp', mapping, bc_path)
def _test_no_marshal(self, *, del_source=False):
with util.create_modules('_temp') as mapping:
bc_path = self.manipulate_bytecode('_temp', mapping,
lambda bc: bc[:12],
lambda bc: bc[:16],
del_source=del_source)
file_path = mapping['_temp'] if not del_source else bc_path
with self.assertRaises(EOFError):
@ -320,7 +466,7 @@ class BadBytecodeTest:
def _test_non_code_marshal(self, *, del_source=False):
with util.create_modules('_temp') as mapping:
bytecode_path = self.manipulate_bytecode('_temp', mapping,
lambda bc: bc[:12] + marshal.dumps(b'abcd'),
lambda bc: bc[:16] + marshal.dumps(b'abcd'),
del_source=del_source)
file_path = mapping['_temp'] if not del_source else bytecode_path
with self.assertRaises(ImportError) as cm:
@ -331,7 +477,7 @@ class BadBytecodeTest:
def _test_bad_marshal(self, *, del_source=False):
with util.create_modules('_temp') as mapping:
bytecode_path = self.manipulate_bytecode('_temp', mapping,
lambda bc: bc[:12] + b'<test>',
lambda bc: bc[:16] + b'<test>',
del_source=del_source)
file_path = mapping['_temp'] if not del_source else bytecode_path
with self.assertRaises(EOFError):
@ -376,7 +522,7 @@ class SourceLoaderBadBytecodeTest:
def test(name, mapping, bytecode_path):
self.import_(mapping[name], name)
with open(bytecode_path, 'rb') as file:
self.assertGreater(len(file.read()), 12)
self.assertGreater(len(file.read()), 16)
self._test_empty_file(test)
@ -384,7 +530,7 @@ class SourceLoaderBadBytecodeTest:
def test(name, mapping, bytecode_path):
self.import_(mapping[name], name)
with open(bytecode_path, 'rb') as file:
self.assertGreater(len(file.read()), 12)
self.assertGreater(len(file.read()), 16)
self._test_partial_magic(test)
@ -395,7 +541,7 @@ class SourceLoaderBadBytecodeTest:
def test(name, mapping, bytecode_path):
self.import_(mapping[name], name)
with open(bytecode_path, 'rb') as file:
self.assertGreater(len(file.read()), 12)
self.assertGreater(len(file.read()), 16)
self._test_magic_only(test)
@ -418,10 +564,30 @@ class SourceLoaderBadBytecodeTest:
def test(name, mapping, bc_path):
self.import_(mapping[name], name)
with open(bc_path, 'rb') as file:
self.assertGreater(len(file.read()), 12)
self.assertGreater(len(file.read()), 16)
self._test_partial_timestamp(test)
@util.writes_bytecode_files
def test_partial_flags(self):
# When the flags field is partial, regenerate the .pyc, else raise EOFError.
def test(name, mapping, bc_path):
self.import_(mapping[name], name)
with open(bc_path, 'rb') as file:
self.assertGreater(len(file.read()), 16)
self._test_partial_flags(test)
@util.writes_bytecode_files
def test_partial_hash(self):
# When the hash is partial, regenerate the .pyc, else raise EOFError.
def test(name, mapping, bc_path):
self.import_(mapping[name], name)
with open(bc_path, 'rb') as file:
self.assertGreater(len(file.read()), 16)
self._test_partial_hash(test)
@util.writes_bytecode_files
def test_partial_size(self):
# When the size is partial, regenerate the .pyc, else
@ -429,7 +595,7 @@ class SourceLoaderBadBytecodeTest:
def test(name, mapping, bc_path):
self.import_(mapping[name], name)
with open(bc_path, 'rb') as file:
self.assertGreater(len(file.read()), 12)
self.assertGreater(len(file.read()), 16)
self._test_partial_size(test)
@ -459,13 +625,13 @@ class SourceLoaderBadBytecodeTest:
py_compile.compile(mapping['_temp'])
bytecode_path = self.util.cache_from_source(mapping['_temp'])
with open(bytecode_path, 'r+b') as bytecode_file:
bytecode_file.seek(4)
bytecode_file.seek(8)
bytecode_file.write(zeros)
self.import_(mapping['_temp'], '_temp')
source_mtime = os.path.getmtime(mapping['_temp'])
source_timestamp = self.importlib._w_long(source_mtime)
with open(bytecode_path, 'rb') as bytecode_file:
bytecode_file.seek(4)
bytecode_file.seek(8)
self.assertEqual(bytecode_file.read(4), source_timestamp)
# [bytecode read-only]
@ -560,6 +726,20 @@ class SourcelessLoaderBadBytecodeTest:
self._test_partial_timestamp(test, del_source=True)
def test_partial_flags(self):
def test(name, mapping, bytecode_path):
with self.assertRaises(EOFError):
self.import_(bytecode_path, name)
self._test_partial_flags(test, del_source=True)
def test_partial_hash(self):
def test(name, mapping, bytecode_path):
with self.assertRaises(EOFError):
self.import_(bytecode_path, name)
self._test_partial_hash(test, del_source=True)
def test_partial_size(self):
def test(name, mapping, bytecode_path):
with self.assertRaises(EOFError):

View File

@ -673,6 +673,7 @@ class SourceLoader(SourceOnlyLoader):
if magic is None:
magic = self.util.MAGIC_NUMBER
data = bytearray(magic)
data.extend(self.init._w_long(0))
data.extend(self.init._w_long(self.source_mtime))
data.extend(self.init._w_long(self.source_size))
code_object = compile(self.source, self.path, 'exec',
@ -836,6 +837,7 @@ class SourceLoaderBytecodeTests(SourceLoaderTestHarness):
if bytecode_written:
self.assertIn(self.cached, self.loader.written)
data = bytearray(self.util.MAGIC_NUMBER)
data.extend(self.init._w_long(0))
data.extend(self.init._w_long(self.loader.source_mtime))
data.extend(self.init._w_long(self.loader.source_size))
data.extend(marshal.dumps(code_object))

View File

@ -122,6 +122,24 @@ class PyCompileTests(unittest.TestCase):
# Specifying optimized bytecode should lead to a path reflecting that.
self.assertIn('opt-2', py_compile.compile(self.source_path, optimize=2))
def test_invalidation_mode(self):
py_compile.compile(
self.source_path,
invalidation_mode=py_compile.PycInvalidationMode.CHECKED_HASH,
)
with open(self.cache_path, 'rb') as fp:
flags = importlib._bootstrap_external._classify_pyc(
fp.read(), 'test', {})
self.assertEqual(flags, 0b11)
py_compile.compile(
self.source_path,
invalidation_mode=py_compile.PycInvalidationMode.UNCHECKED_HASH,
)
with open(self.cache_path, 'rb') as fp:
flags = importlib._bootstrap_external._classify_pyc(
fp.read(), 'test', {})
self.assertEqual(flags, 0b1)
if __name__ == "__main__":
unittest.main()

View File

@ -40,7 +40,7 @@ def make_pyc(co, mtime, size):
else:
mtime = int(-0x100000000 + int(mtime))
pyc = (importlib.util.MAGIC_NUMBER +
struct.pack("<ii", int(mtime), size & 0xFFFFFFFF) + data)
struct.pack("<iii", 0, int(mtime), size & 0xFFFFFFFF) + data)
return pyc
def module_path_to_dotted_name(path):
@ -187,6 +187,20 @@ class UncompressedZipImportTestCase(ImportHooksBaseTestCase):
TESTMOD + pyc_ext: (NOW, test_pyc)}
self.doTest(pyc_ext, files, TESTMOD)
def testUncheckedHashBasedPyc(self):
source = b"state = 'old'"
source_hash = importlib.util.source_hash(source)
bytecode = importlib._bootstrap_external._code_to_hash_pyc(
compile(source, "???", "exec"),
source_hash,
False, # unchecked
)
files = {TESTMOD + ".py": (NOW, "state = 'new'"),
TESTMOD + ".pyc": (NOW - 20, bytecode)}
def check(mod):
self.assertEqual(mod.state, 'old')
self.doTest(None, files, TESTMOD, call=check)
def testEmptyPy(self):
files = {TESTMOD + ".py": (NOW, "")}
self.doTest(None, files, TESTMOD)
@ -215,7 +229,7 @@ class UncompressedZipImportTestCase(ImportHooksBaseTestCase):
badtime_pyc = bytearray(test_pyc)
# flip the second bit -- not the first as that one isn't stored in the
# .py's mtime in the zip archive.
badtime_pyc[7] ^= 0x02
badtime_pyc[11] ^= 0x02
files = {TESTMOD + ".py": (NOW, test_src),
TESTMOD + pyc_ext: (NOW, badtime_pyc)}
self.doTest(".py", files, TESTMOD)

View File

@ -0,0 +1,3 @@
Implement PEP 552 (Deterministic pycs). Python now supports invalidating
bytecode cache files based on a source content hash rather than source
last-modified time.

View File

@ -72,6 +72,11 @@ python \- an interpreted, interactive, object-oriented programming language
]
.B \-?
]
.br
[
.B \--check-hash-based-pycs
\'default\'|\'always\'|\'never\'
]
.br
[
.B \-c
@ -123,6 +128,9 @@ Specify the command to execute (see next section).
This terminates the option list (following options are passed as
arguments to the command).
.TP
.BI "\-\-check-hash-based-pycs " mode
Configure how Python evaluates the up-to-dateness of hash-based .pyc files.
.TP
.B \-d
Turn on parser debugging output (for expert only, depending on
compilation options).

View File

@ -2,6 +2,7 @@
#include "Python.h"
#include "osdefs.h"
#include "internal/import.h"
#include "internal/pystate.h"
#include <locale.h>
@ -61,6 +62,11 @@ static int orig_argc;
#define PROGRAM_OPTS BASE_OPTS
/* Long (double-dash) options accepted on the command line.  The table is
   handed to _PyOS_GetOpt() alongside PROGRAM_OPTS; the entry whose name is
   NULL terminates it.
   NOTE(review): the initializer order is presumably {name, has_arg, val} --
   confirm against the _PyOS_LongOption declaration. */
static const _PyOS_LongOption longoptions[] = {
{L"check-hash-based-pycs", 1, 0},
{NULL, 0, 0},
};
/* Short usage message (with %s for argv0) */
static const char usage_line[] =
"usage: %ls [option] ... [-c cmd | -m mod | file | -] [arg] ...\n";
@ -98,6 +104,8 @@ static const char usage_3[] = "\
also PYTHONWARNINGS=arg\n\
-x : skip first line of source, allowing use of non-Unix forms of #!cmd\n\
-X opt : set implementation-specific option\n\
--check-hash-based-pycs always|default|never:\n\
control how Python invalidates hash-based .pyc files\n\
";
static const char usage_4[] = "\
file : program read from script file\n\
@ -393,6 +401,7 @@ typedef struct {
int quiet_flag; /* Py_QuietFlag, -q */
int skip_first_line; /* -x option */
_Py_OptList xoptions; /* -X options */
const char *check_hash_pycs_mode; /* --check-hash-based-pycs */
#ifdef MS_WINDOWS
int legacy_windows_fs_encoding; /* Py_LegacyWindowsFSEncodingFlag,
PYTHONLEGACYWINDOWSFSENCODING */
@ -577,7 +586,9 @@ pymain_parse_cmdline_impl(_PyMain *pymain)
_PyOS_ResetGetOpt();
do {
int c = _PyOS_GetOpt(pymain->argc, pymain->argv, PROGRAM_OPTS);
int longindex = -1;
int c = _PyOS_GetOpt(pymain->argc, pymain->argv, PROGRAM_OPTS,
longoptions, &longindex);
if (c == EOF) {
break;
}
@ -608,6 +619,22 @@ pymain_parse_cmdline_impl(_PyMain *pymain)
}
switch (c) {
case 0:
// Handle long option.
assert(longindex == 0); // Only one long option now.
if (!wcscmp(_PyOS_optarg, L"always")) {
cmdline->check_hash_pycs_mode = "always";
} else if (!wcscmp(_PyOS_optarg, L"never")) {
cmdline->check_hash_pycs_mode = "never";
} else if (!wcscmp(_PyOS_optarg, L"default")) {
cmdline->check_hash_pycs_mode = "default";
} else {
fprintf(stderr, "--check-hash-based-pycs must be one of "
"'default', 'always', or 'never'\n");
return 1;
}
break;
case 'b':
cmdline->bytes_warning++;
break;
@ -1085,6 +1112,8 @@ pymain_set_global_config(_PyMain *pymain)
pymain_set_flag(&Py_UnbufferedStdioFlag, cmdline->use_unbuffered_io);
pymain_set_flag(&Py_VerboseFlag, cmdline->verbosity);
pymain_set_flag(&Py_QuietFlag, cmdline->quiet_flag);
if (cmdline->check_hash_pycs_mode)
_Py_CheckHashBasedPycsMode = cmdline->check_hash_pycs_mode;
#ifdef MS_WINDOWS
pymain_set_flag(&Py_LegacyWindowsFSEncodingFlag, cmdline->legacy_windows_fs_encoding);
pymain_set_flag(&Py_LegacyWindowsStdioFlag, cmdline->legacy_windows_stdio);

View File

@ -1,4 +1,5 @@
#include "Python.h"
#include "internal/import.h"
#include "internal/pystate.h"
#include "structmember.h"
#include "osdefs.h"
@ -1305,7 +1306,7 @@ unmarshal_code(PyObject *pathname, PyObject *data, time_t mtime)
unsigned char *buf = (unsigned char *)PyBytes_AsString(data);
Py_ssize_t size = PyBytes_Size(data);
if (size < 12) {
if (size < 16) {
PyErr_SetString(ZipImportError,
"bad pyc data");
return NULL;
@ -1319,7 +1320,16 @@ unmarshal_code(PyObject *pathname, PyObject *data, time_t mtime)
Py_RETURN_NONE; /* signal caller to try alternative */
}
if (mtime != 0 && !eq_mtime(get_uint32(buf + 4), mtime)) {
uint32_t flags = get_uint32(buf + 4);
if (flags != 0) {
// Hash-based pyc. We currently refuse to handle checked hash-based
// pycs. We could validate hash-based pycs against the source, but it
// seems likely that most people putting hash-based pycs in a zipfile
// will use unchecked ones.
if (strcmp(_Py_CheckHashBasedPycsMode, "never") &&
(flags != 0x1 || !strcmp(_Py_CheckHashBasedPycsMode, "always")))
Py_RETURN_NONE;
} else if ((mtime != 0 && !eq_mtime(get_uint32(buf + 8), mtime))) {
if (Py_VerboseFlag) {
PySys_FormatStderr("# %R has bad mtime\n",
pathname);
@ -1329,7 +1339,7 @@ unmarshal_code(PyObject *pathname, PyObject *data, time_t mtime)
/* XXX the pyc's size field is ignored; timestamp collisions are probably
unimportant with zip files. */
code = PyMarshal_ReadObjectFromString((char *)buf + 12, size - 12);
code = PyMarshal_ReadObjectFromString((char *)buf + 16, size - 16);
if (code == NULL) {
return NULL;
}

View File

@ -354,6 +354,41 @@ exit:
return return_value;
}
PyDoc_STRVAR(_imp_source_hash__doc__,
"source_hash($module, /, key, source)\n"
"--\n"
"\n");
#define _IMP_SOURCE_HASH_METHODDEF \
{"source_hash", (PyCFunction)_imp_source_hash, METH_FASTCALL|METH_KEYWORDS, _imp_source_hash__doc__},
static PyObject *
_imp_source_hash_impl(PyObject *module, long key, Py_buffer *source);
static PyObject *
_imp_source_hash(PyObject *module, PyObject **args, Py_ssize_t nargs, PyObject *kwnames)
{
PyObject *return_value = NULL;
static const char * const _keywords[] = {"key", "source", NULL};
static _PyArg_Parser _parser = {"ly*:source_hash", _keywords, 0};
long key;
Py_buffer source = {NULL, NULL};
if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &_parser,
&key, &source)) {
goto exit;
}
return_value = _imp_source_hash_impl(module, key, &source);
exit:
/* Cleanup for source */
if (source.obj) {
PyBuffer_Release(&source);
}
return return_value;
}
#ifndef _IMP_CREATE_DYNAMIC_METHODDEF
#define _IMP_CREATE_DYNAMIC_METHODDEF
#endif /* !defined(_IMP_CREATE_DYNAMIC_METHODDEF) */
@ -361,4 +396,4 @@ exit:
#ifndef _IMP_EXEC_DYNAMIC_METHODDEF
#define _IMP_EXEC_DYNAMIC_METHODDEF
#endif /* !defined(_IMP_EXEC_DYNAMIC_METHODDEF) */
/*[clinic end generated code: output=d068dd493e513604 input=a9049054013a1b77]*/
/*[clinic end generated code: output=e8b2c0b0d0a75da8 input=a9049054013a1b77]*/

View File

@ -51,7 +51,8 @@ void _PyOS_ResetGetOpt(void)
opt_ptr = L"";
}
int _PyOS_GetOpt(int argc, wchar_t **argv, wchar_t *optstring)
int _PyOS_GetOpt(int argc, wchar_t **argv, wchar_t *optstring,
const _PyOS_LongOption *longopts, int *longindex)
{
wchar_t *ptr;
wchar_t option;
@ -86,13 +87,41 @@ int _PyOS_GetOpt(int argc, wchar_t **argv, wchar_t *optstring)
return 'V';
}
opt_ptr = &argv[_PyOS_optind++][1];
}
if ((option = *opt_ptr++) == L'\0')
return -1;
if (option == L'-') {
// Parse long option.
if (*opt_ptr == L'\0') {
fprintf(stderr, "expected long option\n");
return -1;
}
*longindex = 0;
const _PyOS_LongOption *opt;
for (opt = &longopts[*longindex]; opt->name; opt = &longopts[++(*longindex)]) {
if (!wcscmp(opt->name, opt_ptr))
break;
}
if (!opt->name) {
fprintf(stderr, "unknown option %ls\n", argv[_PyOS_optind - 1]);
return '_';
}
opt_ptr = L"";
if (!opt->has_arg) {
return opt->val;
}
if (_PyOS_optind >= argc) {
fprintf(stderr, "Argument expected for the %ls options\n",
argv[_PyOS_optind - 1]);
return '_';
}
_PyOS_optarg = argv[_PyOS_optind++];
return opt->val;
}
if (option == 'J') {
if (_PyOS_opterr)
fprintf(stderr, "-J is reserved for Jython\n");

View File

@ -5,6 +5,8 @@
#include "Python-ast.h"
#undef Yield /* undefine macro conflicting with winbase.h */
#include "internal/hash.h"
#include "internal/import.h"
#include "internal/pystate.h"
#include "errcode.h"
#include "marshal.h"
@ -2184,6 +2186,34 @@ _imp_exec_builtin_impl(PyObject *module, PyObject *mod)
return exec_builtin_or_dynamic(mod);
}
/*[clinic input]
_imp.source_hash
key: long
source: Py_buffer
[clinic start generated code]*/
static PyObject *
_imp_source_hash_impl(PyObject *module, long key, Py_buffer *source)
/*[clinic end generated code: output=edb292448cf399ea input=9aaad1e590089789]*/
{
    /* Hash the contents of `source` with _Py_KeyedHash() using `key`, and
       return the 64-bit result as an 8-byte bytes object laid out in
       little-endian order (the result of PyBytes_FromStringAndSize() is
       returned unchanged). */
    uint64_t hash = _Py_KeyedHash((uint64_t)key, source->buf, source->len);
    unsigned char data[sizeof(hash)];

    /* Emit the least significant byte first.  Shifting makes the output
       independent of host byte order, so no PY_LITTLE_ENDIAN special case is
       needed.  The previous big-endian path reversed the bytes in place with
       one forward loop over all eight positions; the second half of that
       loop read bytes it had already overwritten, producing a mirrored
       value instead of a byte-swapped one. */
    for (size_t i = 0; i < sizeof(data); i++) {
        data[i] = (unsigned char)((hash >> (8 * i)) & 0xff);
    }
    return PyBytes_FromStringAndSize((const char *)data, sizeof(data));
}
PyDoc_STRVAR(doc_imp,
"(Extremely) low-level import machinery bits as used by importlib and imp.");
@ -2203,6 +2233,7 @@ static PyMethodDef imp_methods[] = {
_IMP_EXEC_DYNAMIC_METHODDEF
_IMP_EXEC_BUILTIN_METHODDEF
_IMP__FIX_CO_FILENAME_METHODDEF
_IMP_SOURCE_HASH_METHODDEF
{NULL, NULL} /* sentinel */
};
@ -2219,6 +2250,8 @@ static struct PyModuleDef impmodule = {
NULL
};
/* Current --check-hash-based-pycs mode: "default", "always" or "never".
   Starts out as "default"; PyInit_imp() publishes the value to Python code
   as _imp.check_hash_based_pycs. */
const char *_Py_CheckHashBasedPycsMode = "default";
PyMODINIT_FUNC
PyInit_imp(void)
{
@ -2230,6 +2263,15 @@ PyInit_imp(void)
d = PyModule_GetDict(m);
if (d == NULL)
goto failure;
PyObject *pyc_mode = PyUnicode_FromString(_Py_CheckHashBasedPycsMode);
if (pyc_mode == NULL) {
goto failure;
}
if (PyDict_SetItemString(d, "check_hash_based_pycs", pyc_mode) < 0) {
Py_DECREF(pyc_mode);
goto failure;
}
Py_DECREF(pyc_mode);
return m;
failure:

File diff suppressed because it is too large Load Diff

View File

@ -284,7 +284,6 @@ static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T,
#endif /* Py_HASH_ALGORITHM == Py_HASH_FNV */
#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
/* **************************************************************************
<MIT License>
Copyright (c) 2013 Marek Majkowski <marek@popcount.org>
@ -364,10 +363,10 @@ static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T,
HALF_ROUND(v2,v1,v0,v3,17,21);
static Py_hash_t
siphash24(const void *src, Py_ssize_t src_sz) {
uint64_t k0 = _le64toh(_Py_HashSecret.siphash.k0);
uint64_t k1 = _le64toh(_Py_HashSecret.siphash.k1);
static uint64_t
siphash24(uint64_t key0, uint64_t key1, const void *src, Py_ssize_t src_sz) {
uint64_t k0 = _le64toh(key0);
uint64_t k1 = _le64toh(key1);
uint64_t b = (uint64_t)src_sz << 56;
const uint64_t *in = (uint64_t*)src;
@ -412,12 +411,26 @@ siphash24(const void *src, Py_ssize_t src_sz) {
/* modified */
t = (v0 ^ v1) ^ (v2 ^ v3);
return (Py_hash_t)t;
return t;
}
static PyHash_FuncDef PyHash_Func = {siphash24, "siphash24", 64, 128};
/* PyHash_Func entry point: run siphash24() over the buffer using the two
   key words held in _Py_HashSecret.siphash and narrow the 64-bit result to
   Py_hash_t. */
static Py_hash_t
pysiphash(const void *src, Py_ssize_t src_sz) {
    const uint64_t secret0 = _Py_HashSecret.siphash.k0;
    const uint64_t secret1 = _Py_HashSecret.siphash.k1;
    const uint64_t digest = siphash24(secret0, secret1, src, src_sz);

    return (Py_hash_t)digest;
}
#endif /* Py_HASH_ALGORITHM == Py_HASH_SIPHASH24 */
/* Hash `src_sz` bytes starting at `src` with siphash24(), keyed by
 * (key, 0), and return the raw 64-bit result.
 *
 * siphash24() passes both key words through _le64toh() before using them.
 * `key` is a plain host-order integer, so the same conversion is applied
 * here first.  _le64toh() is either a no-op or a byte swap, so the two
 * applications cancel and the effective key is numerically `key` on every
 * platform.  Without this, a big-endian build would hash with a byte-swapped
 * key and disagree with little-endian builds, defeating the point of a
 * deterministic source hash.  The second key word is 0 and is unaffected.
 */
uint64_t
_Py_KeyedHash(uint64_t key, const void *src, Py_ssize_t src_sz)
{
    return siphash24(_le64toh(key), 0, src, src_sz);
}
#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash24", 64, 128};
#endif
#ifdef __cplusplus
}

View File

@ -1053,7 +1053,8 @@ run_pyc_file(FILE *fp, const char *filename, PyObject *globals,
"Bad magic number in .pyc file");
return NULL;
}
/* Skip mtime and size */
/* Skip the rest of the header. */
(void) PyMarshal_ReadLongFromFile(fp);
(void) PyMarshal_ReadLongFromFile(fp);
(void) PyMarshal_ReadLongFromFile(fp);
if (PyErr_Occurred())