Performance audit (intentionally duplicates #6829) (#6847)

* UAVCAN ESC output: removing ESC output channels from published message that are always zero. This allows the UAVCAN stack to always transfer only the minimum number of output values, avoiding redundant zeroes and the associated increase in bus load and CPU time

* Added a separate mixer file for CAN quadrotor

* Sampling profiler improvements

* PMSP: Output more endpoints

* Matrix update

* libc usage workaround

* Removed UAVCAN perfcounters

* Matrix submodule update
This commit is contained in:
Pavel Kirienko 2017-03-18 17:16:20 +03:00 committed by David Sidrane
parent c20b85e6ad
commit 2b2c307eac
9 changed files with 40 additions and 36 deletions

View File

@ -1,12 +1,20 @@
#!/bin/bash
#
# Author: Pavel Kirienko <pavel.kirienko@zubax.com>
#
# Poor man's sampling profiler for NuttX.
#
# Usage: Install flamegraph.pl in your PATH, configure your .gdbinit, run the script with proper arguments and go
# have a coffee. When you're back, you'll see the flamegraph. Note that frequent calls to GDB significantly
# interfere with normal operation of the target, which means that you can't profile real-time tasks with it.
# For best results, ensure that the PC is not overloaded, the USB host controller to which the debugger is
# connected is not congested. You should also allow the current user to set negative nice values.
#
# Requirements: ARM GDB with Python support
# The FlameGraph script can be downloaded from https://github.com/brendangregg/FlameGraph. Thanks Mr. Gregg.
#
# Requirements: ARM GDB with Python support. You can get one by downloading the sources from
# https://launchpad.net/gcc-arm-embedded and building them with correct flags.
# Note that Python support is not required if no per-task sampling is needed.
#
set -e
@ -33,7 +41,7 @@ which flamegraph.pl > /dev/null || die "Install flamegraph.pl first"
nsamples=0
sleeptime=0.1 # Doctors recommend 7-8 hours a day
taskname=
elf=$root/Build/px4fmu-v2_default.build/firmware.elf
elf=
append=0
fgfontsize=10
fgwidth=1900
@ -69,6 +77,8 @@ do
shift
done
[[ -z "$elf" ]] && die "Please specify the ELF file location, e.g.: build_px4fmu-v4_default/src/firmware/nuttx/firmware_nuttx"
#
# Temporary files
#
@ -237,8 +247,8 @@ for s, f in sorted(stacks.items(), key=lambda (s, f): s):
print('Total stack frames:', num_stack_frames, file=sys.stderr)
print('Top consumers (distribution of the stack tops):', file=sys.stderr)
for name,num in sorted(stack_tops.items(), key=lambda (name, num): num, reverse=True)[:10]:
print('% 5.1f%% ' % (100 * num / num_stack_frames), name, file=sys.stderr)
for name,num in sorted(stack_tops.items(), key=lambda (name, num): num, reverse=True)[:300]:
print('% 7.3f%% ' % (100 * num / num_stack_frames), name, file=sys.stderr)
EOF
cat $stacksfile | python /tmp/pmpn-folder.py > $foldfile

View File

@ -25,4 +25,5 @@ then
param set MC_YAWRATE_D 0.0
fi
set MIXER quad_x_can
set OUTPUT_MODE uavcan_esc

View File

@ -0,0 +1 @@
R: 4x 10000 10000 10000 0

@ -1 +1 @@
Subproject commit cf924956d7d62ce18bfc4f8441e9177ddb69c0dc
Subproject commit 499b897e5f270c3207a0e88d2f7239c5885d1681

View File

@ -136,6 +136,25 @@ void UavcanEscController::update_outputs(float *outputs, unsigned num_outputs)
}
}
/*
* Remove channels that are always zero.
* The objective of this optimization is to avoid broadcasting multi-frame transfers when a single frame
* transfer would be enough. This is a valid optimization as the UAVCAN specification implies that all
* non-specified ESC setpoints should be considered zero.
* The positive outcome is a (marginally) lower bus traffic and lower CPU load.
*
* From the standpoint of the PX4 architecture, however, this is a hack. It should be investigated why
* the mixer returns more outputs than are actually used.
*/
for (int index = int(msg.cmd.size()) - 1; index >= _max_number_of_nonzero_outputs; index--) {
if (msg.cmd[index] != 0) {
_max_number_of_nonzero_outputs = index + 1;
break;
}
}
msg.cmd.resize(_max_number_of_nonzero_outputs);
/*
* Publish the command message to the bus
* Note that for a quadrotor it takes one CAN frame

View File

@ -107,6 +107,7 @@ private:
* ESC states
*/
uint32_t _armed_mask = 0;
uint8_t _max_number_of_nonzero_outputs = 0;
/*
* Perf counters

View File

@ -110,18 +110,6 @@ UavcanNode::UavcanNode(uavcan::ICanDriver &can_driver, uavcan::ISystemClock &sys
}
/* _server_command_sem use case is a signal */
px4_sem_setprotocol(&_server_command_sem, SEM_PRIO_NONE);
if (_perfcnt_node_spin_elapsed == nullptr) {
errx(1, "uavcan: couldn't allocate _perfcnt_node_spin_elapsed");
}
if (_perfcnt_esc_mixer_output_elapsed == nullptr) {
errx(1, "uavcan: couldn't allocate _perfcnt_esc_mixer_output_elapsed");
}
if (_perfcnt_esc_mixer_total_elapsed == nullptr) {
errx(1, "uavcan: couldn't allocate _perfcnt_esc_mixer_total_elapsed");
}
}
UavcanNode::~UavcanNode()
@ -164,9 +152,6 @@ UavcanNode::~UavcanNode()
_instance = nullptr;
perf_free(_perfcnt_node_spin_elapsed);
perf_free(_perfcnt_esc_mixer_output_elapsed);
perf_free(_perfcnt_esc_mixer_total_elapsed);
pthread_mutex_destroy(&_node_mutex);
px4_sem_destroy(&_server_command_sem);
@ -697,7 +682,6 @@ int UavcanNode::init(uavcan::NodeID node_id)
void UavcanNode::node_spin_once()
{
perf_begin(_perfcnt_node_spin_elapsed);
const int spin_res = _node.spinOnce();
if (spin_res < 0) {
@ -708,8 +692,6 @@ void UavcanNode::node_spin_once()
if (_tx_injector != nullptr) {
_tx_injector->injectTxFramesInto(_node);
}
perf_end(_perfcnt_node_spin_elapsed);
}
/*
@ -868,12 +850,8 @@ int UavcanNode::run()
// Mutex is unlocked while the thread is blocked on IO multiplexing
(void)pthread_mutex_unlock(&_node_mutex);
perf_end(_perfcnt_esc_mixer_total_elapsed); // end goes first, it's not a mistake
const int poll_ret = ::poll(_poll_fds, _poll_fds_num, PollTimeoutMs);
perf_begin(_perfcnt_esc_mixer_total_elapsed);
(void)pthread_mutex_lock(&_node_mutex);
node_spin_once(); // Non-blocking
@ -965,9 +943,7 @@ int UavcanNode::run()
// Output to the bus
_outputs.timestamp = hrt_absolute_time();
perf_begin(_perfcnt_esc_mixer_output_elapsed);
_esc_controller.update_outputs(_outputs.output, _outputs.noutputs);
perf_end(_perfcnt_esc_mixer_output_elapsed);
}

View File

@ -209,10 +209,6 @@ private:
// index into _poll_fds for each _control_subs handle
uint8_t _poll_ids[NUM_ACTUATOR_CONTROL_GROUPS_UAVCAN];
perf_counter_t _perfcnt_node_spin_elapsed = perf_alloc(PC_ELAPSED, "uavcan_node_spin_elapsed");
perf_counter_t _perfcnt_esc_mixer_output_elapsed = perf_alloc(PC_ELAPSED, "uavcan_esc_mixer_output_elapsed");
perf_counter_t _perfcnt_esc_mixer_total_elapsed = perf_alloc(PC_ELAPSED, "uavcan_esc_mixer_total_elapsed");
void handle_time_sync(const uavcan::TimerEvent &);
typedef uavcan::MethodBinder<UavcanNode *, void (UavcanNode::*)(const uavcan::TimerEvent &)> TimerCallback;

View File

@ -317,9 +317,9 @@ bool MatrixTest::filterTests()
bool MatrixTest::helperTests()
{
ut_test(fabs(wrap_pi(4.0) - (4.0 - 2 * M_PI)) < 1e-5);
ut_test(fabs(wrap_pi(-4.0) - (-4.0 + 2 * M_PI)) < 1e-5);
ut_test(fabs(wrap_pi(3.0) - (3.0)) < 1e-3);
ut_test(::fabs(wrap_pi(4.0) - (4.0 - 2 * M_PI)) < 1e-5);
ut_test(::fabs(wrap_pi(-4.0) - (-4.0 + 2 * M_PI)) < 1e-5);
ut_test(::fabs(wrap_pi(3.0) - (3.0)) < 1e-3);
wrap_pi(NAN);
Vector3f a(1, 2, 3);