From 2b2c307eac8af452f8b770996ff5648225000dcd Mon Sep 17 00:00:00 2001 From: Pavel Kirienko Date: Sat, 18 Mar 2017 17:16:20 +0300 Subject: [PATCH] Performance audit (intentionally duplicates #6829) (#6847) * UAVCAN ESC output: removing ESC output channels from published message that are always zero. This allows the UAVCAN stack to always transfer only the minimum number of output values, avoiding redundant zeroes and the associated increase in bus load and CPU time * Added a separate mixer file for CAN quadrotor * Sampling profiler improvements * PMSP: Output more endpoints * Matrix update * libc usage workaround * Removed UAVCAN perfcounters * Matrix submodule update --- Debug/poor-mans-profiler.sh | 18 ++++++++++---- ROMFS/px4fmu_common/init.d/4012_quad_x_can | 1 + .../px4fmu_common/mixers/quad_x_can.main.mix | 1 + src/lib/matrix | 2 +- src/modules/uavcan/actuators/esc.cpp | 19 +++++++++++++++ src/modules/uavcan/actuators/esc.hpp | 1 + src/modules/uavcan/uavcan_main.cpp | 24 ------------------- src/modules/uavcan/uavcan_main.hpp | 4 ---- src/systemcmds/tests/test_matrix.cpp | 6 ++--- 9 files changed, 40 insertions(+), 36 deletions(-) create mode 100644 ROMFS/px4fmu_common/mixers/quad_x_can.main.mix diff --git a/Debug/poor-mans-profiler.sh b/Debug/poor-mans-profiler.sh index ab06a1b66a..4b03e56310 100755 --- a/Debug/poor-mans-profiler.sh +++ b/Debug/poor-mans-profiler.sh @@ -1,12 +1,20 @@ #!/bin/bash # +# Author: Pavel Kirienko +# # Poor man's sampling profiler for NuttX. # # Usage: Install flamegraph.pl in your PATH, configure your .gdbinit, run the script with proper arguments and go # have a coffee. When you're back, you'll see the flamegraph. Note that frequent calls to GDB significantly # interfere with normal operation of the target, which means that you can't profile real-time tasks with it. +# For best results, ensure that the PC is not overloaded, and the USB host controller to which the debugger is +# connected is not congested. 
You should also allow the current user to set negative nice values. # -# Requirements: ARM GDB with Python support +# The FlameGraph script can be downloaded from https://github.com/brendangregg/FlameGraph. Thanks Mr. Gregg. +# +# Requirements: ARM GDB with Python support. You can get one by downloading the sources from +# https://launchpad.net/gcc-arm-embedded and building them with correct flags. +# Note that Python support is not required if no per-task sampling is needed. # set -e @@ -33,7 +41,7 @@ which flamegraph.pl > /dev/null || die "Install flamegraph.pl first" nsamples=0 sleeptime=0.1 # Doctors recommend 7-8 hours a day taskname= -elf=$root/Build/px4fmu-v2_default.build/firmware.elf +elf= append=0 fgfontsize=10 fgwidth=1900 @@ -69,6 +77,8 @@ do shift done +[[ -z "$elf" ]] && die "Please specify the ELF file location, e.g.: build_px4fmu-v4_default/src/firmware/nuttx/firmware_nuttx" + # # Temporary files # @@ -237,8 +247,8 @@ for s, f in sorted(stacks.items(), key=lambda (s, f): s): print('Total stack frames:', num_stack_frames, file=sys.stderr) print('Top consumers (distribution of the stack tops):', file=sys.stderr) -for name,num in sorted(stack_tops.items(), key=lambda (name, num): num, reverse=True)[:10]: - print('% 5.1f%% ' % (100 * num / num_stack_frames), name, file=sys.stderr) +for name,num in sorted(stack_tops.items(), key=lambda (name, num): num, reverse=True)[:300]: + print('% 7.3f%% ' % (100 * num / num_stack_frames), name, file=sys.stderr) EOF cat $stacksfile | python /tmp/pmpn-folder.py > $foldfile diff --git a/ROMFS/px4fmu_common/init.d/4012_quad_x_can b/ROMFS/px4fmu_common/init.d/4012_quad_x_can index f6246fef8e..01f90b4704 100644 --- a/ROMFS/px4fmu_common/init.d/4012_quad_x_can +++ b/ROMFS/px4fmu_common/init.d/4012_quad_x_can @@ -25,4 +25,5 @@ then param set MC_YAWRATE_D 0.0 fi +set MIXER quad_x_can set OUTPUT_MODE uavcan_esc diff --git a/ROMFS/px4fmu_common/mixers/quad_x_can.main.mix b/ROMFS/px4fmu_common/mixers/quad_x_can.main.mix new 
file mode 100644 index 0000000000..d6a2980148 --- /dev/null +++ b/ROMFS/px4fmu_common/mixers/quad_x_can.main.mix @@ -0,0 +1 @@ +R: 4x 10000 10000 10000 0 diff --git a/src/lib/matrix b/src/lib/matrix index cf924956d7..499b897e5f 160000 --- a/src/lib/matrix +++ b/src/lib/matrix @@ -1 +1 @@ -Subproject commit cf924956d7d62ce18bfc4f8441e9177ddb69c0dc +Subproject commit 499b897e5f270c3207a0e88d2f7239c5885d1681 diff --git a/src/modules/uavcan/actuators/esc.cpp b/src/modules/uavcan/actuators/esc.cpp index d24ad9a1fa..92d12ba32c 100644 --- a/src/modules/uavcan/actuators/esc.cpp +++ b/src/modules/uavcan/actuators/esc.cpp @@ -136,6 +136,25 @@ void UavcanEscController::update_outputs(float *outputs, unsigned num_outputs) } } + /* + * Remove channels that are always zero. + * The objective of this optimization is to avoid broadcasting multi-frame transfers when a single-frame + * transfer would be enough. This is a valid optimization as the UAVCAN specification implies that all + * non-specified ESC setpoints should be considered zero. + * The positive outcome is (marginally) lower bus traffic and lower CPU load. + * + * From the standpoint of the PX4 architecture, however, this is a hack. It should be investigated why + * the mixer returns more outputs than are actually used. 
+ */ + for (int index = int(msg.cmd.size()) - 1; index >= _max_number_of_nonzero_outputs; index--) { + if (msg.cmd[index] != 0) { + _max_number_of_nonzero_outputs = index + 1; + break; + } + } + + msg.cmd.resize(_max_number_of_nonzero_outputs); + /* * Publish the command message to the bus * Note that for a quadrotor it takes one CAN frame diff --git a/src/modules/uavcan/actuators/esc.hpp b/src/modules/uavcan/actuators/esc.hpp index 40b151e308..ced372f7c6 100644 --- a/src/modules/uavcan/actuators/esc.hpp +++ b/src/modules/uavcan/actuators/esc.hpp @@ -107,6 +107,7 @@ private: * ESC states */ uint32_t _armed_mask = 0; + uint8_t _max_number_of_nonzero_outputs = 0; /* * Perf counters diff --git a/src/modules/uavcan/uavcan_main.cpp b/src/modules/uavcan/uavcan_main.cpp index 204d8c79a3..e6bbc0d078 100644 --- a/src/modules/uavcan/uavcan_main.cpp +++ b/src/modules/uavcan/uavcan_main.cpp @@ -110,18 +110,6 @@ UavcanNode::UavcanNode(uavcan::ICanDriver &can_driver, uavcan::ISystemClock &sys } /* _server_command_sem use case is a signal */ px4_sem_setprotocol(&_server_command_sem, SEM_PRIO_NONE); - - if (_perfcnt_node_spin_elapsed == nullptr) { - errx(1, "uavcan: couldn't allocate _perfcnt_node_spin_elapsed"); - } - - if (_perfcnt_esc_mixer_output_elapsed == nullptr) { - errx(1, "uavcan: couldn't allocate _perfcnt_esc_mixer_output_elapsed"); - } - - if (_perfcnt_esc_mixer_total_elapsed == nullptr) { - errx(1, "uavcan: couldn't allocate _perfcnt_esc_mixer_total_elapsed"); - } } UavcanNode::~UavcanNode() @@ -164,9 +152,6 @@ UavcanNode::~UavcanNode() _instance = nullptr; - perf_free(_perfcnt_node_spin_elapsed); - perf_free(_perfcnt_esc_mixer_output_elapsed); - perf_free(_perfcnt_esc_mixer_total_elapsed); pthread_mutex_destroy(&_node_mutex); px4_sem_destroy(&_server_command_sem); @@ -697,7 +682,6 @@ int UavcanNode::init(uavcan::NodeID node_id) void UavcanNode::node_spin_once() { - perf_begin(_perfcnt_node_spin_elapsed); const int spin_res = _node.spinOnce(); if (spin_res < 0) { @@ 
-708,8 +692,6 @@ void UavcanNode::node_spin_once() if (_tx_injector != nullptr) { _tx_injector->injectTxFramesInto(_node); } - - perf_end(_perfcnt_node_spin_elapsed); } /* @@ -868,12 +850,8 @@ int UavcanNode::run() // Mutex is unlocked while the thread is blocked on IO multiplexing (void)pthread_mutex_unlock(&_node_mutex); - perf_end(_perfcnt_esc_mixer_total_elapsed); // end goes first, it's not a mistake - const int poll_ret = ::poll(_poll_fds, _poll_fds_num, PollTimeoutMs); - perf_begin(_perfcnt_esc_mixer_total_elapsed); - (void)pthread_mutex_lock(&_node_mutex); node_spin_once(); // Non-blocking @@ -965,9 +943,7 @@ int UavcanNode::run() // Output to the bus _outputs.timestamp = hrt_absolute_time(); - perf_begin(_perfcnt_esc_mixer_output_elapsed); _esc_controller.update_outputs(_outputs.output, _outputs.noutputs); - perf_end(_perfcnt_esc_mixer_output_elapsed); } diff --git a/src/modules/uavcan/uavcan_main.hpp b/src/modules/uavcan/uavcan_main.hpp index b962333a06..f84dff1630 100644 --- a/src/modules/uavcan/uavcan_main.hpp +++ b/src/modules/uavcan/uavcan_main.hpp @@ -209,10 +209,6 @@ private: // index into _poll_fds for each _control_subs handle uint8_t _poll_ids[NUM_ACTUATOR_CONTROL_GROUPS_UAVCAN]; - perf_counter_t _perfcnt_node_spin_elapsed = perf_alloc(PC_ELAPSED, "uavcan_node_spin_elapsed"); - perf_counter_t _perfcnt_esc_mixer_output_elapsed = perf_alloc(PC_ELAPSED, "uavcan_esc_mixer_output_elapsed"); - perf_counter_t _perfcnt_esc_mixer_total_elapsed = perf_alloc(PC_ELAPSED, "uavcan_esc_mixer_total_elapsed"); - void handle_time_sync(const uavcan::TimerEvent &); typedef uavcan::MethodBinder TimerCallback; diff --git a/src/systemcmds/tests/test_matrix.cpp b/src/systemcmds/tests/test_matrix.cpp index d2e574810f..029f36ceb5 100644 --- a/src/systemcmds/tests/test_matrix.cpp +++ b/src/systemcmds/tests/test_matrix.cpp @@ -317,9 +317,9 @@ bool MatrixTest::filterTests() bool MatrixTest::helperTests() { - ut_test(fabs(wrap_pi(4.0) - (4.0 - 2 * M_PI)) < 1e-5); - 
ut_test(fabs(wrap_pi(-4.0) - (-4.0 + 2 * M_PI)) < 1e-5); - ut_test(fabs(wrap_pi(3.0) - (3.0)) < 1e-3); + ut_test(::fabs(wrap_pi(4.0) - (4.0 - 2 * M_PI)) < 1e-5); + ut_test(::fabs(wrap_pi(-4.0) - (-4.0 + 2 * M_PI)) < 1e-5); + ut_test(::fabs(wrap_pi(3.0) - (3.0)) < 1e-3); wrap_pi(NAN); Vector3f a(1, 2, 3);