gh-101282: Apply BOLT optimizations to libpython for shared builds (#104709)

Apply BOLT optimizations to libpython for shared builds. Most of the C
code is in libpython so it is critical to apply BOLT there to fully realize
BOLT benefits.

This change also reworks how BOLT instrumentation is applied. It
effectively removes the readelf based logic added in gh-101525 and
replaces it with a mechanism that saves a copy of the pre-bolt binary
and restores that copy when necessary. This allows us to perform BOLT
optimizations without having to manually delete the output binary to
force a new bolt run.

Also:
- add a clean-bolt target for purging BOLT files and hook that up to the
  clean target
- .gitignore BOLT related files

Before and after this refactor, `make` will no-op after a previous run.
Both versions should also share common make DAG deficiencies where
targets fail to trigger as often as they need to or can trigger
prematurely in certain scenarios. e.g. after this change you may need to
`rm profile-bolt-stamp` to force a BOLT run because there aren't
appropriate non-phony targets for BOLT's make target to depend on.

To make it easier to iterate on custom BOLT settings, the flags to pass
to instrumentation and application are now defined in configure and can
be overridden by passing BOLT_INSTRUMENT_FLAGS and BOLT_APPLY_FLAGS.
This commit is contained in:
Gregory Szorc 2023-05-22 04:45:20 -07:00 committed by GitHub
parent 729b252241
commit 5360cb3d56
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 154 additions and 131 deletions

5
.gitignore vendored
View File

@ -23,6 +23,10 @@
*.gc??
*.profclang?
*.profraw
# Copies of binaries before BOLT optimizations.
*.prebolt
# BOLT profile data.
*.fdata
*.dyn
.gdb_history
.purify
@ -124,6 +128,7 @@ Tools/unicode/data/
/platform
/profile-clean-stamp
/profile-run-stamp
/profile-bolt-stamp
/Python/deepfreeze/*.c
/pybuilddir.txt
/pyconfig.h

View File

@ -314,6 +314,13 @@ also be used to improve performance.
is dependent on a combination of the build environment + the other
optimization configure args + the CPU architecture, and not all combinations
are supported.
Versions of BOLT before LLVM 16 are known to crash under some scenarios.
Use of LLVM 16 or newer for BOLT optimization is strongly encouraged.
The :envvar:`!BOLT_INSTRUMENT_FLAGS` and :envvar:`!BOLT_APPLY_FLAGS`
:program:`configure` variables can be defined to override the default set of
arguments for :program:`llvm-bolt` to instrument and apply BOLT data to
binaries, respectively.
.. versionadded:: 3.12

View File

@ -672,21 +672,55 @@ profile-opt: profile-run-stamp
-rm -f profile-clean-stamp
$(MAKE) @DEF_MAKE_RULE@ CFLAGS_NODIST="$(CFLAGS_NODIST) $(PGO_PROF_USE_FLAG)" LDFLAGS_NODIST="$(LDFLAGS_NODIST)"
# readelf-based BOLT rule. After building @PREBOLT_RULE@ it clears stale
# *.fdata files, then uses $(READELF) to look for the string "BOLT" in the
# .note.bolt_info section of $(BUILDPYTHON). If found, the binary is treated
# as already optimized and nothing else is done. Otherwise it:
#   1. writes an instrumented copy to $(BUILDPYTHON).bolt_inst,
#   2. runs $(PROFILE_TASK) with it (a failing task is tolerated via "|| true"),
#   3. merges the per-process *.fdata files into $(BUILDPYTHON).fdata,
#   4. runs llvm-bolt with that profile to produce $(BUILDPYTHON).bolt,
#   5. removes the temporary files and moves the result over $(BUILDPYTHON).
# The whole if/else is one shell command (backslash-continued lines), so no
# comments can be placed inside it.
.PHONY: bolt-opt
bolt-opt: @PREBOLT_RULE@
rm -f *.fdata
@if $(READELF) -p .note.bolt_info $(BUILDPYTHON) | grep BOLT > /dev/null; then\
echo "skip: $(BUILDPYTHON) is already BOLTed."; \
else \
@LLVM_BOLT@ ./$(BUILDPYTHON) -instrument -instrumentation-file-append-pid -instrumentation-file=$(abspath $(BUILDPYTHON).bolt) -o $(BUILDPYTHON).bolt_inst; \
./$(BUILDPYTHON).bolt_inst $(PROFILE_TASK) || true; \
@MERGE_FDATA@ $(BUILDPYTHON).*.fdata > $(BUILDPYTHON).fdata; \
@LLVM_BOLT@ ./$(BUILDPYTHON) -o $(BUILDPYTHON).bolt -data=$(BUILDPYTHON).fdata -update-debug-sections -reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions -icf=1 -inline-all -split-eh -reorder-functions-use-hot-size -peepholes=none -jump-tables=aggressive -inline-ap -indirect-call-promotion=all -dyno-stats -use-gnu-stack -frame-opt=hot; \
rm -f *.fdata; \
rm -f $(BUILDPYTHON).bolt_inst; \
mv $(BUILDPYTHON).bolt $(BUILDPYTHON); \
fi
# List of binaries that BOLT runs on.
BOLT_BINARIES := @BOLT_BINARIES@
# Extra llvm-bolt arguments appended to the instrumentation command line.
BOLT_INSTRUMENT_FLAGS := @BOLT_INSTRUMENT_FLAGS@
# llvm-bolt arguments used when producing the optimized binary from the
# merged profile data.
BOLT_APPLY_FLAGS := @BOLT_APPLY_FLAGS@
.PHONY: clean-bolt
# Purge every BOLT by-product from the build directory in one pass:
#   *.fdata      profile data
#   *.prebolt    pristine binaries saved before BOLT optimization
#   *.bolt_inst  BOLT instrumented binaries
clean-bolt:
	rm -f *.fdata *.prebolt *.bolt_inst
# Instrument, profile and BOLT-optimize every binary in $(BOLT_BINARIES),
# then record success in the stamp file.
#
# Each multi-command loop starts with "set -e" so that the first failing
# command aborts the recipe regardless of how $(SHELL) is configured.
# Without it, commands chained with ";" mask failures: an llvm-bolt or
# merge-fdata error on any binary but the last would go unnoticed and the
# stamp would still be touched.
profile-bolt-stamp: $(BUILDPYTHON)
	# Ensure a pristine, pre-BOLT copy of the binary and no profile data from last run.
	set -e; \
	for bin in $(BOLT_BINARIES); do \
	  prebolt="$${bin}.prebolt"; \
	  if [ -e "$${prebolt}" ]; then \
	    echo "Restoring pre-BOLT binary $${prebolt}"; \
	    mv "$${prebolt}" "$${bin}"; \
	  fi; \
	  cp "$${bin}" "$${prebolt}"; \
	  rm -f "$${bin}".bolt.*.fdata "$${bin}.fdata"; \
	done
	# Instrument each binary; the instrumented build replaces the original in place.
	set -e; \
	for bin in $(BOLT_BINARIES); do \
	  @LLVM_BOLT@ "$${bin}" -instrument -instrumentation-file-append-pid -instrumentation-file=$(abspath $${bin}.bolt) -o "$${bin}.bolt_inst" $(BOLT_INSTRUMENT_FLAGS); \
	  mv "$${bin}.bolt_inst" "$${bin}"; \
	done
	# Run instrumented binaries to collect data. A failing profile task is
	# deliberately tolerated here.
	$(RUNSHARED) ./$(BUILDPYTHON) $(PROFILE_TASK) || true
	# Merge all the data files together. With "set -e" a merge failure (for
	# example no profile files were produced) now stops the build.
	set -e; \
	for bin in $(BOLT_BINARIES); do \
	  @MERGE_FDATA@ "$${bin}".*.fdata > "$${bin}.fdata"; \
	  rm -f "$${bin}".*.fdata; \
	done
	# Run bolt against the merged data to produce an optimized binary. The
	# input is the saved pristine copy, not the instrumented binary.
	set -e; \
	for bin in $(BOLT_BINARIES); do \
	  @LLVM_BOLT@ "$${bin}.prebolt" -o "$${bin}.bolt" -data="$${bin}.fdata" $(BOLT_APPLY_FLAGS); \
	  mv "$${bin}.bolt" "$${bin}"; \
	done
	touch $@
# Entry point for a BOLT-optimized build. It runs two sub-makes in order:
# first the @PREBOLT_RULE@ target, then profile-bolt-stamp, which instruments,
# profiles and optimizes the binaries. The steps are recipe lines rather than
# prerequisites, so the second always starts after the first has finished.
.PHONY: bolt-opt
bolt-opt:
$(MAKE) @PREBOLT_RULE@
$(MAKE) profile-bolt-stamp
# Compile and run with gcov
.PHONY: coverage
@ -2623,10 +2657,11 @@ profile-removal:
rm -f $(COVERAGE_INFO)
rm -rf $(COVERAGE_REPORT)
rm -f profile-run-stamp
rm -f profile-bolt-stamp
.PHONY: clean
clean: clean-retain-profile
@if test @DEF_MAKE_ALL_RULE@ = profile-opt; then \
clean: clean-retain-profile clean-bolt
@if test @DEF_MAKE_ALL_RULE@ = profile-opt -o @DEF_MAKE_ALL_RULE@ = bolt-opt; then \
rm -f profile-gen-stamp profile-clean-stamp; \
$(MAKE) profile-removal; \
fi

View File

@ -0,0 +1,4 @@
BOLT optimization is now applied to the libpython shared library if building
a shared library. BOLT instrumentation and application settings can now be
influenced via the ``BOLT_INSTRUMENT_FLAGS`` and ``BOLT_APPLY_FLAGS``
configure variables.

147
configure generated vendored
View File

@ -883,10 +883,11 @@ CFLAGS_NODIST
BASECFLAGS
CFLAGS_ALIASING
OPT
BOLT_APPLY_FLAGS
BOLT_INSTRUMENT_FLAGS
BOLT_BINARIES
MERGE_FDATA
LLVM_BOLT
ac_ct_READELF
READELF
PREBOLT_RULE
LLVM_PROF_FOUND
LLVM_PROFDATA
@ -1105,6 +1106,8 @@ CPPFLAGS
CPP
HOSTRUNNER
PROFILE_TASK
BOLT_INSTRUMENT_FLAGS
BOLT_APPLY_FLAGS
LIBUUID_CFLAGS
LIBUUID_LIBS
LIBFFI_CFLAGS
@ -1916,6 +1919,10 @@ Some influential environment variables:
HOSTRUNNER Program to run CPython for the host platform
PROFILE_TASK
Python args for PGO generation task
BOLT_INSTRUMENT_FLAGS
Arguments to llvm-bolt when instrumenting binaries
BOLT_APPLY_FLAGS
Arguments to llvm-bolt when creating a BOLT optimized binary
LIBUUID_CFLAGS
C compiler flags for LIBUUID, overriding pkg-config
LIBUUID_LIBS
@ -8106,112 +8113,6 @@ if test "$Py_BOLT" = 'true' ; then
DEF_MAKE_ALL_RULE="bolt-opt"
DEF_MAKE_RULE="build_all"
if test -n "$ac_tool_prefix"; then
for ac_prog in readelf
do
# Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
set dummy $ac_tool_prefix$ac_prog; ac_word=$2
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
$as_echo_n "checking for $ac_word... " >&6; }
if ${ac_cv_prog_READELF+:} false; then :
$as_echo_n "(cached) " >&6
else
if test -n "$READELF"; then
ac_cv_prog_READELF="$READELF" # Let the user override the test.
else
as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
for as_dir in $PATH
do
IFS=$as_save_IFS
test -z "$as_dir" && as_dir=.
for ac_exec_ext in '' $ac_executable_extensions; do
if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
ac_cv_prog_READELF="$ac_tool_prefix$ac_prog"
$as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
break 2
fi
done
done
IFS=$as_save_IFS
fi
fi
READELF=$ac_cv_prog_READELF
if test -n "$READELF"; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $READELF" >&5
$as_echo "$READELF" >&6; }
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
fi
test -n "$READELF" && break
done
fi
if test -z "$READELF"; then
ac_ct_READELF=$READELF
for ac_prog in readelf
do
# Extract the first word of "$ac_prog", so it can be a program name with args.
set dummy $ac_prog; ac_word=$2
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
$as_echo_n "checking for $ac_word... " >&6; }
if ${ac_cv_prog_ac_ct_READELF+:} false; then :
$as_echo_n "(cached) " >&6
else
if test -n "$ac_ct_READELF"; then
ac_cv_prog_ac_ct_READELF="$ac_ct_READELF" # Let the user override the test.
else
as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
for as_dir in $PATH
do
IFS=$as_save_IFS
test -z "$as_dir" && as_dir=.
for ac_exec_ext in '' $ac_executable_extensions; do
if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
ac_cv_prog_ac_ct_READELF="$ac_prog"
$as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
break 2
fi
done
done
IFS=$as_save_IFS
fi
fi
ac_ct_READELF=$ac_cv_prog_ac_ct_READELF
if test -n "$ac_ct_READELF"; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_READELF" >&5
$as_echo "$ac_ct_READELF" >&6; }
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
fi
test -n "$ac_ct_READELF" && break
done
if test "x$ac_ct_READELF" = x; then
READELF=""notfound""
else
case $cross_compiling:$ac_tool_warned in
yes:)
{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
ac_tool_warned=yes ;;
esac
READELF=$ac_ct_READELF
fi
fi
if test "$READELF" == "notfound"
then
as_fn_error $? "readelf is required for a --enable-bolt build but could not be found." "$LINENO" 5
fi
# -fno-reorder-blocks-and-partition is required for bolt to work.
# Possibly GCC only.
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -fno-reorder-blocks-and-partition" >&5
@ -8474,6 +8375,36 @@ $as_echo "\"Found merge-fdata\"" >&6; }
fi
fi
BOLT_BINARIES='$(BUILDPYTHON)'
if test "x$enable_shared" = xyes; then :
BOLT_BINARIES="${BOLT_BINARIES} \$(INSTSONAME)"
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking BOLT_INSTRUMENT_FLAGS" >&5
$as_echo_n "checking BOLT_INSTRUMENT_FLAGS... " >&6; }
if test -z "${BOLT_INSTRUMENT_FLAGS}"
then
BOLT_INSTRUMENT_FLAGS=
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $BOLT_INSTRUMENT_FLAGS" >&5
$as_echo "$BOLT_INSTRUMENT_FLAGS" >&6; }
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking BOLT_APPLY_FLAGS" >&5
$as_echo_n "checking BOLT_APPLY_FLAGS... " >&6; }
if test -z "${BOLT_APPLY_FLAGS}"
then
  # The default is a multi-word value and must be quoted. Unquoted, the shell
  # parses only the first word as the assignment and tries to run the next
  # word as a command, so the default is never set.
  BOLT_APPLY_FLAGS="-update-debug-sections -reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions -icf=1 -inline-all -split-eh -reorder-functions-use-hot-size -peepholes=none -jump-tables=aggressive -inline-ap -indirect-call-promotion=all -dyno-stats -use-gnu-stack -frame-opt=hot"
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $BOLT_APPLY_FLAGS" >&5
$as_echo "$BOLT_APPLY_FLAGS" >&6; }
# XXX Shouldn't the code above that fiddles with BASECFLAGS and OPT be
# merged with this chunk of code?

View File

@ -2028,13 +2028,6 @@ if test "$Py_BOLT" = 'true' ; then
DEF_MAKE_ALL_RULE="bolt-opt"
DEF_MAKE_RULE="build_all"
AC_SUBST(READELF)
AC_CHECK_TOOLS(READELF, [readelf], "notfound")
if test "$READELF" == "notfound"
then
AC_MSG_ERROR([readelf is required for a --enable-bolt build but could not be found.])
fi
# -fno-reorder-blocks-and-partition is required for bolt to work.
# Possibly GCC only.
AX_CHECK_COMPILE_FLAG([-fno-reorder-blocks-and-partition],[
@ -2067,6 +2060,54 @@ if test "$Py_BOLT" = 'true' ; then
fi
fi
dnl BOLT always processes the interpreter binary; when a shared libpython is
dnl built it is added to the list as well.
AC_SUBST([BOLT_BINARIES])
BOLT_BINARIES='$(BUILDPYTHON)'
AS_IF([test "x$enable_shared" = xyes],
  [BOLT_BINARIES="${BOLT_BINARIES} \$(INSTSONAME)"])
dnl User-overridable llvm-bolt arguments for the instrumentation step.
dnl The default is empty.
AC_ARG_VAR([BOLT_INSTRUMENT_FLAGS],
  [Arguments to llvm-bolt when instrumenting binaries])
AC_MSG_CHECKING([BOLT_INSTRUMENT_FLAGS])
AS_IF([test -z "${BOLT_INSTRUMENT_FLAGS}"], [BOLT_INSTRUMENT_FLAGS=])
AC_MSG_RESULT([$BOLT_INSTRUMENT_FLAGS])
dnl User-overridable llvm-bolt arguments for producing the optimized binary.
dnl When the variable is unset or empty, fall back to the default flag set.
AC_ARG_VAR(
  [BOLT_APPLY_FLAGS],
  [Arguments to llvm-bolt when creating a BOLT optimized binary]
)
AC_MSG_CHECKING([BOLT_APPLY_FLAGS])
if test -z "${BOLT_APPLY_FLAGS}"
then
  dnl AS_VAR_SET emits "VAR=value" verbatim, so the space-joined flag list
  dnl must be wrapped in shell double quotes. Unquoted, the generated
  dnl configure assigns only the first flag and tries to execute the second
  dnl one as a command.
  AS_VAR_SET(
    [BOLT_APPLY_FLAGS],
    ["m4_join([ ],
      [-update-debug-sections],
      [-reorder-blocks=ext-tsp],
      [-reorder-functions=hfsort+],
      [-split-functions],
      [-icf=1],
      [-inline-all],
      [-split-eh],
      [-reorder-functions-use-hot-size],
      [-peepholes=none],
      [-jump-tables=aggressive],
      [-inline-ap],
      [-indirect-call-promotion=all],
      [-dyno-stats],
      [-use-gnu-stack],
      [-frame-opt=hot]
    )"]
  )
fi
AC_MSG_RESULT([$BOLT_APPLY_FLAGS])
# XXX Shouldn't the code above that fiddles with BASECFLAGS and OPT be
# merged with this chunk of code?