AP_HAL_ChibiOS: hardware M4-Cortex and M7-Cortex (and H7) implementation of HAL FFT abstraction

implements an FFT engine based on the betaflight feature using ARM hardware accelerated CMSIS library
make the FFT feature optional
add dynamic gyro windows
add quinns and candans estimators and record in DSP state
disable DSP for boards with limited flash
calculate power spectrum rather than amplitude
start/analyse version of analysis to support threading
allocate memory in a specific region
constrain window size by CPU class
control inclusion of DSP based on board size
This commit is contained in:
Andy Piper 2019-08-09 17:04:28 +01:00 committed by Andrew Tridgell
parent f4a99a1589
commit 3d0cf7e12a
12 changed files with 425 additions and 3 deletions

View File

@ -4,6 +4,7 @@ namespace ChibiOS {
class AnalogIn; class AnalogIn;
class AnalogSource; class AnalogSource;
class DigitalSource; class DigitalSource;
class DSP;
class GPIO; class GPIO;
class I2CBus; class I2CBus;
class I2CDevice; class I2CDevice;

View File

@ -15,3 +15,4 @@
#include "RCOutput.h" #include "RCOutput.h"
#include "I2CDevice.h" #include "I2CDevice.h"
#include "Flash.h" #include "Flash.h"
#include "DSP.h"

View File

@ -0,0 +1,307 @@
/*
* This file is free software: you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This file is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program. If not, see <http://www.gnu.org/licenses/>.
*
* Code by Andy Piper and the betaflight team
*/
#include "AP_HAL_ChibiOS.h"
#if HAL_WITH_DSP
#include <AP_HAL/AP_HAL.h>
#include <AP_Math/AP_Math.h>
#include <GCS_MAVLink/GCS.h>
#include "DSP.h"
#include <cmath>
using namespace ChibiOS;
#if DEBUG_FFT
#define TIMER_START(timer) \
void *istate = hal.scheduler->disable_interrupts_save(); \
uint32_t timer##now = AP_HAL::micros()
#define TIMER_END(timer) timer.time(timer##now); \
hal.scheduler->restore_interrupts(istate)
#else
#define TIMER_START(timer)
#define TIMER_END(timer)
#endif
#define TICK_CYCLE 10
extern const AP_HAL::HAL& hal;
// The algorithms originally came from betaflight but are now substantially modified based on theory and experiment.
// https://holometer.fnal.gov/GH_FFT.pdf "Spectrum and spectral density estimation by the Discrete Fourier transform (DFT),
// including a comprehensive list of window functions and some new flat-top windows." - Heinzel et. al is a great reference
// for understanding the underlying theory although we do not use spectral density here since time resolution is equally
// important as frequency resolution. Referred to as [Heinz] throughout the code.
// initialize the FFT state machine
AP_HAL::DSP::FFTWindowState* DSP::fft_init(uint16_t window_size, uint16_t sample_rate)
{
DSP::FFTWindowStateARM* fft = new DSP::FFTWindowStateARM(window_size, sample_rate);
if (fft->_hanning_window == nullptr || fft->_rfft_data == nullptr || fft->_freq_bins == nullptr) {
delete fft;
return nullptr;
}
return fft;
}
// start an FFT analysis
void DSP::fft_start(AP_HAL::DSP::FFTWindowState* state, const float* samples, uint16_t buffer_index, uint16_t buffer_size)
{
step_hanning((FFTWindowStateARM*)state, samples, buffer_index, buffer_size);
}
// perform remaining steps of an FFT analysis
uint16_t DSP::fft_analyse(AP_HAL::DSP::FFTWindowState* state, uint16_t start_bin, uint16_t end_bin, uint8_t harmonics, float noise_att_cutoff)
{
FFTWindowStateARM* fft = (FFTWindowStateARM*)state;
step_arm_cfft_f32(fft);
step_bitreversal(fft);
step_stage_rfft_f32(fft);
step_arm_cmplx_mag_f32(fft, start_bin, end_bin, harmonics, noise_att_cutoff);
return step_calc_frequencies_f32(fft, start_bin, end_bin);
}
// create an instance of the FFT state machine
DSP::FFTWindowStateARM::FFTWindowStateARM(uint16_t window_size, uint16_t sample_rate)
: AP_HAL::DSP::FFTWindowState::FFTWindowState(window_size, sample_rate)
{
if (_freq_bins == nullptr || _hanning_window == nullptr || _rfft_data == nullptr) {
gcs().send_text(MAV_SEVERITY_WARNING, "Failed to allocate %u bytes for window %u for DSP",
unsigned(sizeof(float) * (window_size * 3 + 2)), unsigned(window_size));
return;
}
// initialize the ARM data structure.
// it's important not to use arm_rfft_fast_init_f32() as this links all of the twiddle tables
// by being selective we save 70k in text space
switch (window_size) {
case 32:
arm_rfft_32_fast_init_f32(&_fft_instance);
break;
case 64:
arm_rfft_64_fast_init_f32(&_fft_instance);
break;
case 128:
arm_rfft_128_fast_init_f32(&_fft_instance);
break;
case 256:
arm_rfft_256_fast_init_f32(&_fft_instance);
break;
#if defined(STM32H7)
// Don't pull in the larger FFT tables unless we have to
case 512:
arm_rfft_512_fast_init_f32(&_fft_instance);
break;
case 1024:
arm_rfft_1024_fast_init_f32(&_fft_instance);
break;
#endif
}
}
DSP::FFTWindowStateARM::~FFTWindowStateARM()
{
}
extern "C" {
void stage_rfft_f32(arm_rfft_fast_instance_f32 *S, float32_t *p, float32_t *pOut);
void arm_cfft_radix8by2_f32(arm_cfft_instance_f32 *S, float32_t *p1);
void arm_cfft_radix8by4_f32(arm_cfft_instance_f32 *S, float32_t *p1);
void arm_radix8_butterfly_f32(float32_t *pSrc, uint16_t fftLen, const float32_t *pCoef, uint16_t twidCoefModifier);
void arm_bitreversal_32(uint32_t *pSrc, const uint16_t bitRevLen, const uint16_t *pBitRevTable);
}
// step 1: filter the incoming samples through a Hanning window
void DSP::step_hanning(FFTWindowStateARM* fft, const float* samples, uint16_t buffer_index, uint16_t buffer_size)
{
TIMER_START(_hanning_timer);
// 5us
// apply hanning window to gyro samples and store result in _freq_bins
// hanning starts and ends with 0, could be skipped for minor speed improvement
const uint16_t ring_buf_idx = MIN(buffer_size - buffer_index, fft->_window_size);
arm_mult_f32(&samples[buffer_index], &fft->_hanning_window[0], &fft->_freq_bins[0], ring_buf_idx);
if (buffer_index > 0) {
arm_mult_f32(&samples[0], &fft->_hanning_window[ring_buf_idx], &fft->_freq_bins[ring_buf_idx], fft->_window_size - ring_buf_idx);
}
TIMER_END(_hanning_timer);
}
// step 2: guts of complex fft processing
void DSP::step_arm_cfft_f32(FFTWindowStateARM* fft)
{
arm_cfft_instance_f32 *Sint = &(fft->_fft_instance.Sint);
Sint->fftLen = fft->_fft_instance.fftLenRFFT / 2;
TIMER_START(_arm_cfft_f32_timer);
switch (fft->_bin_count) {
case 16: // window 32
// 16us (BF)
// 5us F7, 7us F4, 8us H7
case 128: // window 256
// 37us F7, 81us F4, 17us H7
arm_cfft_radix8by2_f32(Sint, fft->_freq_bins);
break;
case 32: // window 64
// 35us (BF)
// 10us F7, 24us F4
case 256: // window 512
// 66us F7, 174us F4, 37us H7
arm_cfft_radix8by4_f32(Sint, fft->_freq_bins);
break;
case 64: // window 128
// 70us BF
// 21us F7, 34us F4
case 512: // window 1024
// 152us F7, 73us H7
arm_radix8_butterfly_f32(fft->_freq_bins, fft->_bin_count, Sint->pTwiddle, 1);
break;
}
TIMER_END(_arm_cfft_f32_timer);
}
// step 3: reverse the bits of the output
void DSP::step_bitreversal(FFTWindowStateARM* fft)
{
TIMER_START(_bitreversal_timer);
// 6us (BF)
// 32 - 2us F7, 3us F4, 1us H7
// 64 - 3us F7, 6us F4
// 128 - 4us F7, 9us F4
// 256 - 10us F7, 20us F4, 5us H7
// 512 - 22us F7, 54us F4, 15us H7
// 1024 - 42us F7, 15us H7
arm_bitreversal_32((uint32_t *)fft->_freq_bins, fft->_fft_instance.Sint.bitRevLength, fft->_fft_instance.Sint.pBitRevTable);
TIMER_END(_bitreversal_timer);
}
// step 4: convert from complex to real data
void DSP::step_stage_rfft_f32(FFTWindowStateARM* fft)
{
TIMER_START(_stage_rfft_f32_timer);
// 14us (BF)
// 32 - 2us F7, 5us F4, 2us H7
// 64 - 5us F7, 16us F4
// 128 - 17us F7, 26us F4
// 256 - 21us F7, 70us F4, 9us H7
// 512 - 35us F7, 71us F4, 17us H7
// 1024 - 76us F7, 33us H7
// this does not work in place => _freq_bins AND _rfft_data needed
stage_rfft_f32(&fft->_fft_instance, fft->_freq_bins, fft->_rfft_data);
TIMER_END(_stage_rfft_f32_timer);
}
// step 5: find the magnitudes of the complex data
void DSP::step_arm_cmplx_mag_f32(FFTWindowStateARM* fft, uint16_t start_bin, uint16_t end_bin, uint8_t harmonics, float noise_att_cutoff)
{
TIMER_START(_arm_cmplx_mag_f32_timer);
// 8us (BF)
// 32 - 4us F7, 5us F4, 5us H7
// 64 - 7us F7, 13us F4
// 128 - 14us F7, 17us F4
// 256 - 29us F7, 28us F4, 7us H7
// 512 - 55us F7, 93us F4, 13us H7
// 1024 - 131us F7, 25us H7
// General case for the magnitudes - see https://stackoverflow.com/questions/42299932/dsp-libraries-rfft-strange-results
// The frequency of each of those frequency components are given by k*fs/N
arm_cmplx_mag_squared_f32(&fft->_rfft_data[2], &fft->_freq_bins[1], fft->_bin_count - 1);
fft->_freq_bins[0] = sq(fft->_rfft_data[0]); // DC
fft->_freq_bins[fft->_bin_count] = sq(fft->_rfft_data[1]); // Nyquist
fft->_rfft_data[fft->_window_size] = fft->_rfft_data[1]; // Nyquist for the interpolator
fft->_rfft_data[fft->_window_size + 1] = 0;
step_cmplx_mag(fft, start_bin, end_bin, harmonics, noise_att_cutoff);
TIMER_END(_arm_cmplx_mag_f32_timer);
}
// step 6: find the bin with the highest energy and interpolate the required frequency
uint16_t DSP::step_calc_frequencies_f32(FFTWindowStateARM* fft, uint16_t start_bin, uint16_t end_bin)
{
TIMER_START(_step_calc_frequencies);
// 4us H7
step_calc_frequencies(fft, start_bin, end_bin);
TIMER_END(_step_calc_frequencies);
#if DEBUG_FFT
_output_count++;
// outputs at approx 1hz
if (_output_count % 400 == 0) {
gcs().send_text(MAV_SEVERITY_WARNING, "FFT(us): t1:%lu,t2:%lu,t3:%lu,t4:%lu,t5:%lu,t6:%lu",
_hanning_timer._timer_avg, _arm_cfft_f32_timer._timer_avg, _bitreversal_timer._timer_avg, _stage_rfft_f32_timer._timer_avg, _arm_cmplx_mag_f32_timer._timer_avg, _step_calc_frequencies._timer_avg);
}
#endif
return fft->_max_energy_bin;
}
static const float PI_N = M_PI / 32.0f;
static const float CANDAN_FACTOR = tanf(PI_N) / PI_N;
// Interpolate center frequency using http://users.metu.edu.tr/ccandan//pub_dir/FineDopplerEst_IEEE_SPL_June2011.pdf
// This is slightly less accurate than Quinn, but much cheaper to calculate
float DSP::calculate_candans_estimator(const FFTWindowStateARM* fft, uint16_t k_max) const
{
if (k_max <= 1 || k_max == fft->_bin_count) {
return 0.0f;
}
const uint16_t k_m1 = (k_max - 1) * 2;
const uint16_t k_p1 = (k_max + 1) * 2;
const uint16_t k = k_max * 2;
const float npr = fft->_rfft_data[k_m1] - fft->_rfft_data[k_p1];
const float npc = fft->_rfft_data[k_m1 + 1] - fft->_rfft_data[k_p1 + 1];
const float dpr = 2.0f * fft->_rfft_data[k] - fft->_rfft_data[k_m1] - fft->_rfft_data[k_p1];
const float dpc = 2.0f * fft->_rfft_data[k + 1] - fft->_rfft_data[k_m1 + 1] - fft->_rfft_data[k_p1 + 1];
const float realn = npr * dpr + npc * dpc;
const float reald = dpr * dpr + dpc * dpc;
// sanity check
if (is_zero(reald)) {
return 0.0f;
}
float d = CANDAN_FACTOR * (realn / reald);
// -0.5 < d < 0.5 which is the fraction of the sample spacing about the center element
return constrain_float(d, -0.5f, 0.5f);
}
#if DEBUG_FFT
void DSP::StepTimer::time(uint32_t start)
{
_timer_total += (AP_HAL::micros() - start);
_time_ticks = (_time_ticks + 1) % TICK_CYCLE;
if (_time_ticks == 0) {
_timer_avg = _timer_total / TICK_CYCLE;
_timer_total = 0;
}
}
#endif
#endif

View File

@ -0,0 +1,92 @@
/*
* This file is free software: you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This file is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program. If not, see <http://www.gnu.org/licenses/>.
*
* Code by Andy Piper and the betaflight team
*/
#pragma once
#include <AP_HAL/AP_HAL.h>
#include "AP_HAL_ChibiOS_Namespace.h"
#if HAL_WITH_DSP
#include <arm_math.h>
#define DEBUG_FFT 0
// ChibiOS implementation of FFT analysis to run on STM32 processors
class ChibiOS::DSP : public AP_HAL::DSP {
public:
// initialise an FFT instance
virtual FFTWindowState* fft_init(uint16_t window_size, uint16_t sample_rate) override;
// start an FFT analysis
virtual void fft_start(FFTWindowState* state, const float* samples, uint16_t buffer_index, uint16_t buffer_size) override;
// perform remaining steps of an FFT analysis
virtual uint16_t fft_analyse(FFTWindowState* state, uint16_t start_bin, uint16_t end_bin, uint8_t harmonics, float noise_att_cutoff) override;
// STM32-based FFT state
class FFTWindowStateARM : public AP_HAL::DSP::FFTWindowState {
friend class ChibiOS::DSP;
protected:
FFTWindowStateARM(uint16_t window_size, uint16_t sample_rate);
~FFTWindowStateARM();
private:
// underlying CMSIS data structure for FFT analysis
arm_rfft_fast_instance_f32 _fft_instance;
};
protected:
void vector_max_float(const float* vin, uint16_t len, float* maxValue, uint16_t* maxIndex) const override {
uint32_t mindex;
arm_max_f32(vin, len, maxValue, &mindex);
*maxIndex = mindex;
}
void vector_scale_float(const float* vin, float scale, float* vout, uint16_t len) const override {
arm_scale_f32(vin, scale, vout, len);
}
private:
// following are the six independent steps for calculating an FFT
void step_hanning(FFTWindowStateARM* fft, const float* samples, uint16_t buffer_index, uint16_t buffer_size);
void step_arm_cfft_f32(FFTWindowStateARM* fft);
void step_bitreversal(FFTWindowStateARM* fft);
void step_stage_rfft_f32(FFTWindowStateARM* fft);
void step_arm_cmplx_mag_f32(FFTWindowStateARM* fft, uint16_t start_bin, uint16_t end_bin, uint8_t harmonics, float noise_att_cutoff);
uint16_t step_calc_frequencies_f32(FFTWindowStateARM* fft, uint16_t start_bin, uint16_t end_bin);
// candan's frequency interpolator
float calculate_candans_estimator(const FFTWindowStateARM* fft, uint16_t k) const;
#if DEBUG_FFT
class StepTimer {
public:
uint32_t _timer_total;
uint32_t _timer_avg;
uint8_t _time_ticks;
void time(uint32_t start);
};
uint32_t _output_count;
StepTimer _hanning_timer;
StepTimer _arm_cfft_f32_timer;
StepTimer _bitreversal_timer;
StepTimer _stage_rfft_f32_timer;
StepTimer _arm_cmplx_mag_f32_timer;
StepTimer _step_calc_frequencies;
#endif
};
#endif

View File

@ -91,6 +91,12 @@ static ChibiOS::Scheduler schedulerInstance;
static ChibiOS::Util utilInstance; static ChibiOS::Util utilInstance;
static Empty::OpticalFlow opticalFlowDriver; static Empty::OpticalFlow opticalFlowDriver;
#if HAL_WITH_DSP
static ChibiOS::DSP dspDriver;
#else
static Empty::DSP dspDriver;
#endif
#ifndef HAL_NO_FLASH_SUPPORT #ifndef HAL_NO_FLASH_SUPPORT
static ChibiOS::Flash flashDriver; static ChibiOS::Flash flashDriver;
#else #else
@ -126,6 +132,7 @@ HAL_ChibiOS::HAL_ChibiOS() :
&utilInstance, &utilInstance,
&opticalFlowDriver, &opticalFlowDriver,
&flashDriver, &flashDriver,
&dspDriver,
nullptr nullptr
) )
{} {}

View File

@ -161,3 +161,4 @@ define STM32_PWM_USE_ADVANCED TRUE
# we are low on flash on this board # we are low on flash on this board
define HAL_MINIMIZE_FEATURES 1 define HAL_MINIMIZE_FEATURES 1
define HAL_WITH_DSP 1

View File

@ -158,8 +158,6 @@ ROMFS_WILDCARD libraries/AP_OSD/fonts/font*.bin
define BOARD_PWM_COUNT_DEFAULT 6 define BOARD_PWM_COUNT_DEFAULT 6
define STM32_PWM_USE_ADVANCED TRUE define STM32_PWM_USE_ADVANCED TRUE
define HAL_MINIMIZE_FEATURES 1
# disable SMBUS and fuel battery monitors to save flash # disable SMBUS and fuel battery monitors to save flash
define HAL_BATTMON_SMBUS_ENABLE 0 define HAL_BATTMON_SMBUS_ENABLE 0
define HAL_BATTMON_FUEL_ENABLE 0 define HAL_BATTMON_FUEL_ENABLE 0
@ -170,3 +168,7 @@ define HAL_SPRAYER_ENABLED 0
# reduce max size of embedded params for apj_tool.py # reduce max size of embedded params for apj_tool.py
define AP_PARAM_MAX_EMBEDDED_PARAM 1024 define AP_PARAM_MAX_EMBEDDED_PARAM 1024
# we are low on flash on this board
define HAL_MINIMIZE_FEATURES 1
define HAL_WITH_DSP 1

View File

@ -196,3 +196,4 @@ define HAL_NAVEKF2_AVAILABLE 0
# reduce max size of embedded params for apj_tool.py # reduce max size of embedded params for apj_tool.py
define AP_PARAM_MAX_EMBEDDED_PARAM 1024 define AP_PARAM_MAX_EMBEDDED_PARAM 1024
define HAL_WITH_DSP FALSE

View File

@ -133,6 +133,7 @@ define HAL_I2C_INTERNAL_MASK 0
define GPS_MAX_RECEIVERS 1 define GPS_MAX_RECEIVERS 1
define GPS_MAX_INSTANCES 1 define GPS_MAX_INSTANCES 1
define HAL_COMPASS_MAX_SENSORS 1 define HAL_COMPASS_MAX_SENSORS 1
define HAL_WITH_DSP FALSE
# use the app descriptor needed by MissionPlanner for CAN upload # use the app descriptor needed by MissionPlanner for CAN upload
env APP_DESCRIPTOR MissionPlanner env APP_DESCRIPTOR MissionPlanner

View File

@ -142,3 +142,4 @@ IOMCU_FW 1
MAIN_STACK 0x200 MAIN_STACK 0x200
PROCESS_STACK 0x250 PROCESS_STACK 0x250
define HAL_DISABLE_LOOP_DELAY define HAL_DISABLE_LOOP_DELAY
define HAL_WITH_DSP FALSE

View File

@ -138,4 +138,5 @@ define HAL_LOGGING_DATAFLASH
# 8 PWM available by default # 8 PWM available by default
define BOARD_PWM_COUNT_DEFAULT 8 define BOARD_PWM_COUNT_DEFAULT 8
define HAL_WITH_DSP FALSE

View File

@ -718,11 +718,18 @@ def write_mcu_config(f):
env_vars['CPU_FLAGS'] = ["-mcpu=%s" % cortex] env_vars['CPU_FLAGS'] = ["-mcpu=%s" % cortex]
build_info['MCU'] = cortex build_info['MCU'] = cortex
else: else:
# default to F4
cortex = "cortex-m4" cortex = "cortex-m4"
env_vars['CPU_FLAGS'] = ["-mcpu=%s" % cortex, "-mfpu=fpv4-sp-d16", "-mfloat-abi=hard"] env_vars['CPU_FLAGS'] = ["-mcpu=%s" % cortex, "-mfpu=fpv4-sp-d16", "-mfloat-abi=hard"]
build_info['MCU'] = cortex build_info['MCU'] = cortex
env_vars['CORTEX'] = cortex
if not args.bootloader:
if cortex == 'cortex-m4':
env_vars['CPU_FLAGS'].append('-DARM_MATH_CM4')
elif cortex == 'cortex-m7':
env_vars['CPU_FLAGS'].append('-DARM_MATH_CM7')
if not mcu_series.startswith("STM32F1") and not args.bootloader: if not mcu_series.startswith("STM32F1") and not args.bootloader:
env_vars['CPU_FLAGS'].append('-u_printf_float') env_vars['CPU_FLAGS'].append('-u_printf_float')
build_info['ENV_UDEFS'] = "-DCHPRINTF_USE_FLOAT=1" build_info['ENV_UDEFS'] = "-DCHPRINTF_USE_FLOAT=1"