From 6ab94e0b318884bbcb95e2ea3835f951502e1d99 Mon Sep 17 00:00:00 2001 From: jaseg Date: Wed, 14 Oct 2020 12:47:28 +0200 Subject: Move firmware into subdirectory --- .../arm_biquad_cascade_df1_32x64_init_q31.c | 98 ++ .../arm_biquad_cascade_df1_32x64_q31.c | 549 +++++++ .../arm_biquad_cascade_df1_f32.c | 412 ++++++ .../arm_biquad_cascade_df1_fast_q15.c | 273 ++++ .../arm_biquad_cascade_df1_fast_q31.c | 292 ++++ .../arm_biquad_cascade_df1_init_f32.c | 97 ++ .../arm_biquad_cascade_df1_init_q15.c | 99 ++ .../arm_biquad_cascade_df1_init_q31.c | 98 ++ .../arm_biquad_cascade_df1_q15.c | 398 ++++++ .../arm_biquad_cascade_df1_q31.c | 392 +++++ .../arm_biquad_cascade_df2T_f32.c | 590 ++++++++ .../arm_biquad_cascade_df2T_f64.c | 590 ++++++++ .../arm_biquad_cascade_df2T_init_f32.c | 89 ++ .../arm_biquad_cascade_df2T_init_f64.c | 89 ++ .../arm_biquad_cascade_stereo_df2T_f32.c | 670 +++++++++ .../arm_biquad_cascade_stereo_df2T_init_f32.c | 89 ++ .../DSP/Source/FilteringFunctions/arm_conv_f32.c | 635 +++++++++ .../FilteringFunctions/arm_conv_fast_opt_q15.c | 531 +++++++ .../Source/FilteringFunctions/arm_conv_fast_q15.c | 1398 ++++++++++++++++++ .../Source/FilteringFunctions/arm_conv_fast_q31.c | 565 ++++++++ .../Source/FilteringFunctions/arm_conv_opt_q15.c | 533 +++++++ .../Source/FilteringFunctions/arm_conv_opt_q7.c | 423 ++++++ .../FilteringFunctions/arm_conv_partial_f32.c | 678 +++++++++ .../arm_conv_partial_fast_opt_q15.c | 756 ++++++++++ .../FilteringFunctions/arm_conv_partial_fast_q15.c | 1494 ++++++++++++++++++++ .../FilteringFunctions/arm_conv_partial_fast_q31.c | 620 ++++++++ .../FilteringFunctions/arm_conv_partial_opt_q15.c | 753 ++++++++++ .../FilteringFunctions/arm_conv_partial_opt_q7.c | 791 +++++++++++ .../FilteringFunctions/arm_conv_partial_q15.c | 795 +++++++++++ .../FilteringFunctions/arm_conv_partial_q31.c | 616 ++++++++ .../FilteringFunctions/arm_conv_partial_q7.c | 750 ++++++++++ .../DSP/Source/FilteringFunctions/arm_conv_q15.c | 722 ++++++++++ .../DSP/Source/FilteringFunctions/arm_conv_q31.c | 553 ++++++++ .../DSP/Source/FilteringFunctions/arm_conv_q7.c | 678 +++++++++ .../Source/FilteringFunctions/arm_correlate_f32.c | 727 ++++++++++ .../arm_correlate_fast_opt_q15.c | 500 +++++++ .../FilteringFunctions/arm_correlate_fast_q15.c | 1307 +++++++++++++++++ .../FilteringFunctions/arm_correlate_fast_q31.c | 600 ++++++++ .../FilteringFunctions/arm_correlate_opt_q15.c | 501 +++++++ .../FilteringFunctions/arm_correlate_opt_q7.c | 452 ++++++ .../Source/FilteringFunctions/arm_correlate_q15.c | 707 +++++++++ .../Source/FilteringFunctions/arm_correlate_q31.c | 653 +++++++++ .../Source/FilteringFunctions/arm_correlate_q7.c | 778 ++++++++++ .../FilteringFunctions/arm_fir_decimate_f32.c | 512 +++++++ .../FilteringFunctions/arm_fir_decimate_fast_q15.c | 586 ++++++++ .../FilteringFunctions/arm_fir_decimate_fast_q31.c | 339 +++++ .../FilteringFunctions/arm_fir_decimate_init_f32.c | 105 ++ .../FilteringFunctions/arm_fir_decimate_init_q15.c | 107 ++ .../FilteringFunctions/arm_fir_decimate_init_q31.c | 105 ++ .../FilteringFunctions/arm_fir_decimate_q15.c | 684 +++++++++ .../FilteringFunctions/arm_fir_decimate_q31.c | 299 ++++ .../DSP/Source/FilteringFunctions/arm_fir_f32.c | 985 +++++++++++++ .../Source/FilteringFunctions/arm_fir_fast_q15.c | 333 +++++ .../Source/FilteringFunctions/arm_fir_fast_q31.c | 293 ++++ .../Source/FilteringFunctions/arm_fir_init_f32.c | 84 ++ .../Source/FilteringFunctions/arm_fir_init_q15.c | 142 ++ .../Source/FilteringFunctions/arm_fir_init_q31.c | 84 ++ .../Source/FilteringFunctions/arm_fir_init_q7.c | 82 ++ .../FilteringFunctions/arm_fir_interpolate_f32.c | 569 ++++++++ .../arm_fir_interpolate_init_f32.c | 109 ++ .../arm_fir_interpolate_init_q15.c | 108 ++ .../arm_fir_interpolate_init_q31.c | 109 ++ .../FilteringFunctions/arm_fir_interpolate_q15.c | 496 +++++++ .../FilteringFunctions/arm_fir_interpolate_q31.c | 492 +++++++ .../FilteringFunctions/arm_fir_lattice_f32.c | 494 +++++++ .../FilteringFunctions/arm_fir_lattice_init_f32.c | 71 + .../FilteringFunctions/arm_fir_lattice_init_q15.c | 71 + .../FilteringFunctions/arm_fir_lattice_init_q31.c | 71 + .../FilteringFunctions/arm_fir_lattice_q15.c | 524 +++++++ .../FilteringFunctions/arm_fir_lattice_q31.c | 341 +++++ .../DSP/Source/FilteringFunctions/arm_fir_q15.c | 679 +++++++++ .../DSP/Source/FilteringFunctions/arm_fir_q31.c | 353 +++++ .../DSP/Source/FilteringFunctions/arm_fir_q7.c | 385 +++++ .../Source/FilteringFunctions/arm_fir_sparse_f32.c | 433 ++++++ .../FilteringFunctions/arm_fir_sparse_init_f32.c | 95 ++ .../FilteringFunctions/arm_fir_sparse_init_q15.c | 95 ++ .../FilteringFunctions/arm_fir_sparse_init_q31.c | 94 ++ .../FilteringFunctions/arm_fir_sparse_init_q7.c | 95 ++ .../Source/FilteringFunctions/arm_fir_sparse_q15.c | 470 ++++++ .../Source/FilteringFunctions/arm_fir_sparse_q31.c | 450 ++++++ .../Source/FilteringFunctions/arm_fir_sparse_q7.c | 469 ++++++ .../FilteringFunctions/arm_iir_lattice_f32.c | 435 ++++++ .../FilteringFunctions/arm_iir_lattice_init_f32.c | 79 ++ .../FilteringFunctions/arm_iir_lattice_init_q15.c | 79 ++ .../FilteringFunctions/arm_iir_lattice_init_q31.c | 79 ++ .../FilteringFunctions/arm_iir_lattice_q15.c | 452 ++++++ .../FilteringFunctions/arm_iir_lattice_q31.c | 338 +++++ .../DSP/Source/FilteringFunctions/arm_lms_f32.c | 430 ++++++ .../Source/FilteringFunctions/arm_lms_init_f32.c | 83 ++ .../Source/FilteringFunctions/arm_lms_init_q15.c | 93 ++ .../Source/FilteringFunctions/arm_lms_init_q31.c | 93 ++ .../Source/FilteringFunctions/arm_lms_norm_f32.c | 454 ++++++ .../FilteringFunctions/arm_lms_norm_init_f32.c | 93 ++ .../FilteringFunctions/arm_lms_norm_init_q15.c | 100 ++ .../FilteringFunctions/arm_lms_norm_init_q31.c | 99 ++ .../Source/FilteringFunctions/arm_lms_norm_q15.c | 428 ++++++ .../Source/FilteringFunctions/arm_lms_norm_q31.c | 419 ++++++ .../DSP/Source/FilteringFunctions/arm_lms_q15.c | 368 +++++ .../DSP/Source/FilteringFunctions/arm_lms_q31.c | 357 +++++ 99 files changed, 40633 insertions(+) create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_init_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f32.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f64.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f32.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f64.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f32.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_init_f32.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_f32.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_opt_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_opt_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_opt_q7.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_f32.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_opt_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q7.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q7.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q7.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_f32.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_opt_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_opt_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_opt_q7.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q7.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_f32.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_f32.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_f32.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_fast_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_fast_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_f32.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q7.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_f32.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_f32.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_f32.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_f32.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q7.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_f32.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_f32.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q7.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q7.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_f32.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_f32.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_f32.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_f32.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_f32.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_f32.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_q31.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_q15.c create mode 100644 fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_q31.c (limited to 'fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions') diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_init_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_init_q31.c new file mode 100644 index 0000000..8a29213 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_init_q31.c @@ -0,0 +1,98 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_biquad_cascade_df1_32x64_init_q31.c + * Description: High precision Q31 Biquad cascade filter initialization function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup BiquadCascadeDF1_32x64 + * @{ + */ + +/** + * @details + * + * @param[in,out] *S points to an instance of the high precision Q31 Biquad cascade filter structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] *pCoeffs points to the filter coefficients. + * @param[in] *pState points to the state buffer. + * @param[in] postShift Shift to be applied after the accumulator. Varies according to the coefficients format. + * @return none + * + * Coefficient and State Ordering: + * + * \par + * The coefficients are stored in the array pCoeffs in the following order: + *
+ *     {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
+ * 
+ * where b1x and a1x are the coefficients for the first stage, + * b2x and a2x are the coefficients for the second stage, + * and so on. The pCoeffs array contains a total of 5*numStages values. + * + * \par + * The pState points to state variables array and size of each state variable is 1.63 format. + * Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2]. + * The state variables are arranged in the state array as: + *
+ *     {x[n-1], x[n-2], y[n-1], y[n-2]}
+ * 
+ * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on. + * The state array has a total length of 4*numStages values. + * The state variables are updated after each block of data is processed; the coefficients are untouched. + */ + +void arm_biquad_cas_df1_32x64_init_q31( + arm_biquad_cas_df1_32x64_ins_q31 * S, + uint8_t numStages, + q31_t * pCoeffs, + q63_t * pState, + uint8_t postShift) +{ + /* Assign filter stages */ + S->numStages = numStages; + + /* Assign postShift to be applied to the output */ + S->postShift = postShift; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always 4 * numStages */ + memset(pState, 0, (4U * (uint32_t) numStages) * sizeof(q63_t)); + + /* Assign state pointer */ + S->pState = pState; +} + +/** + * @} end of BiquadCascadeDF1_32x64 group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_q31.c new file mode 100644 index 0000000..d241f76 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_q31.c @@ -0,0 +1,549 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_biquad_cascade_df1_32x64_q31.c + * Description: High precision Q31 Biquad cascade filter processing function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup BiquadCascadeDF1_32x64 High Precision Q31 Biquad Cascade Filter + * + * This function implements a high precision Biquad cascade filter which operates on + * Q31 data values. The filter coefficients are in 1.31 format and the state variables + * are in 1.63 format. The double precision state variables reduce quantization noise + * in the filter and provide a cleaner output. + * These filters are particularly useful when implementing filters in which the + * singularities are close to the unit circle. This is common for low pass or high + * pass filters with very low cutoff frequencies. + * + * The function operates on blocks of input and output data + * and each call to the function processes blockSize samples through + * the filter. pSrc and pDst points to input and output arrays + * containing blockSize Q31 values. + * + * \par Algorithm + * Each Biquad stage implements a second order filter using the difference equation: + *
+ *     y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ * 
+ * A Direct Form I algorithm is used with 5 coefficients and 4 state variables per stage. + * \image html Biquad.gif "Single Biquad filter stage" + * Coefficients b0, b1, and b2 multiply the input signal x[n] and are referred to as the feedforward coefficients. + * Coefficients a1 and a2 multiply the output signal y[n] and are referred to as the feedback coefficients. + * Pay careful attention to the sign of the feedback coefficients. + * Some design tools use the difference equation + *
+ *     y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] - a1 * y[n-1] - a2 * y[n-2]
+ * 
+ * In this case the feedback coefficients a1 and a2 must be negated when used with the CMSIS DSP Library. + * + * \par + * Higher order filters are realized as a cascade of second order sections. + * numStages refers to the number of second order stages used. + * For example, an 8th order filter would be realized with numStages=4 second order stages. + * \image html BiquadCascade.gif "8th order filter using a cascade of Biquad stages" + * A 9th order filter would be realized with numStages=5 second order stages with the coefficients for one of the stages configured as a first order filter (b2=0 and a2=0). + * + * \par + * The pState points to state variables array . + * Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2] and each state variable in 1.63 format to improve precision. + * The state variables are arranged in the array as: + *
+ *     {x[n-1], x[n-2], y[n-1], y[n-2]}
+ * 
+ * + * \par + * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on. + * The state array has a total length of 4*numStages values of data in 1.63 format. + * The state variables are updated after each block of data is processed; the coefficients are untouched. + * + * \par Instance Structure + * The coefficients and state variables for a filter are stored together in an instance data structure. + * A separate instance structure must be defined for each filter. + * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared. + * + * \par Init Function + * There is also an associated initialization function which performs the following operations: + * - Sets the values of the internal structure fields. + * - Zeros out the values in the state buffer. + * To do this manually without calling the init function, assign the follow subfields of the instance structure: + * numStages, pCoeffs, postShift, pState. Also set all of the values in pState to zero. + * + * \par + * Use of the initialization function is optional. + * However, if the initialization function is used, then the instance structure cannot be placed into a const data section. + * To place an instance structure into a const data section, the instance structure must be manually initialized. + * Set the values in the state buffer to zeros before static initialization. + * For example, to statically initialize the filter instance structure use + *
+ *     arm_biquad_cas_df1_32x64_ins_q31 S1 = {numStages, pState, pCoeffs, postShift};
+ * 
+ * where numStages is the number of Biquad stages in the filter; pState is the address of the state buffer; + * pCoeffs is the address of the coefficient buffer; postShift shift to be applied which is described in detail below. + * \par Fixed-Point Behavior + * Care must be taken while using Biquad Cascade 32x64 filter function. + * Following issues must be considered: + * - Scaling of coefficients + * - Filter gain + * - Overflow and saturation + * + * \par + * Filter coefficients are represented as fractional values and + * restricted to lie in the range [-1 +1). + * The processing function has an additional scaling parameter postShift + * which allows the filter coefficients to exceed the range [+1 -1). + * At the output of the filter's accumulator is a shift register which shifts the result by postShift bits. + * \image html BiquadPostshift.gif "Fixed-point Biquad with shift by postShift bits after accumulator" + * This essentially scales the filter coefficients by 2^postShift. + * For example, to realize the coefficients + *
+ *    {1.5, -0.8, 1.2, 1.6, -0.9}
+ * 
+ * set the Coefficient array to: + *
+ *    {0.75, -0.4, 0.6, 0.8, -0.45}
+ * 
+ * and set postShift=1 + * + * \par + * The second thing to keep in mind is the gain through the filter. + * The frequency response of a Biquad filter is a function of its coefficients. + * It is possible for the gain through the filter to exceed 1.0 meaning that the filter increases the amplitude of certain frequencies. + * This means that an input signal with amplitude < 1.0 may result in an output > 1.0 and these are saturated or overflowed based on the implementation of the filter. + * To avoid this behavior the filter needs to be scaled down such that its peak gain < 1.0 or the input signal must be scaled down so that the combination of input and filter are never overflowed. + * + * \par + * The third item to consider is the overflow and saturation behavior of the fixed-point Q31 version. + * This is described in the function specific documentation below. + */ + +/** + * @addtogroup BiquadCascadeDF1_32x64 + * @{ + */ + +/** + * @details + + * @param[in] *S points to an instance of the high precision Q31 Biquad cascade filter. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + * @return none. + * + * \par + * The function is implemented using an internal 64-bit accumulator. + * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * Thus, if the accumulator result overflows it wraps around rather than clip. + * In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25). + * After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by postShift bits and the result truncated to + * 1.31 format by discarding the low 32 bits. + * + * \par + * Two related functions are provided in the CMSIS DSP library. + * arm_biquad_cascade_df1_q31() implements a Biquad cascade with 32-bit coefficients and state variables with a Q63 accumulator. + * arm_biquad_cascade_df1_fast_q31() implements a Biquad cascade with 32-bit coefficients and state variables with a Q31 accumulator. + */ + +void arm_biquad_cas_df1_32x64_q31( + const arm_biquad_cas_df1_32x64_ins_q31 * S, + q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + q31_t *pIn = pSrc; /* input pointer initialization */ + q31_t *pOut = pDst; /* output pointer initialization */ + q63_t *pState = S->pState; /* state pointer initialization */ + q31_t *pCoeffs = S->pCoeffs; /* coeff pointer initialization */ + q63_t acc; /* accumulator */ + q31_t Xn1, Xn2; /* Input Filter state variables */ + q63_t Yn1, Yn2; /* Output Filter state variables */ + q31_t b0, b1, b2, a1, a2; /* Filter coefficients */ + q31_t Xn; /* temporary input */ + int32_t shift = (int32_t) S->postShift + 1; /* Shift to be applied to the output */ + uint32_t sample, stage = S->numStages; /* loop counters */ + q31_t acc_l, acc_h; /* temporary output */ + uint32_t uShift = ((uint32_t) S->postShift + 1U); + uint32_t lShift = 32U - uShift; /* Shift to be applied to the output */ + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + do + { + /* Reading the coefficients */ + b0 = *pCoeffs++; + b1 = *pCoeffs++; + b2 = *pCoeffs++; + a1 = *pCoeffs++; + a2 = *pCoeffs++; + + /* Reading the state values */ + Xn1 = (q31_t) (pState[0]); + Xn2 = (q31_t) (pState[1]); + Yn1 = pState[2]; + Yn2 = pState[3]; + + /* Apply loop unrolling and compute 4 output values simultaneously. */ + /* The variable acc hold output value that is being computed and + * stored in the destination buffer + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + */ + + sample = blockSize >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (sample > 0U) + { + /* Read the input */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + + /* acc = b0 * x[n] */ + acc = (q63_t) Xn *b0; + + /* acc += b1 * x[n-1] */ + acc += (q63_t) Xn1 *b1; + + /* acc += b[2] * x[n-2] */ + acc += (q63_t) Xn2 *b2; + + /* acc += a1 * y[n-1] */ + acc += mult32x64(Yn1, a1); + + /* acc += a2 * y[n-2] */ + acc += mult32x64(Yn2, a2); + + /* The result is converted to 1.63 , Yn2 variable is reused */ + Yn2 = acc << shift; + + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Store the output in the destination buffer in 1.31 format. */ + *pOut = acc_h; + + /* Read the second input into Xn2, to reuse the value */ + Xn2 = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + + /* acc += b1 * x[n-1] */ + acc = (q63_t) Xn *b1; + + /* acc = b0 * x[n] */ + acc += (q63_t) Xn2 *b0; + + /* acc += b[2] * x[n-2] */ + acc += (q63_t) Xn1 *b2; + + /* acc += a1 * y[n-1] */ + acc += mult32x64(Yn2, a1); + + /* acc += a2 * y[n-2] */ + acc += mult32x64(Yn1, a2); + + /* The result is converted to 1.63, Yn1 variable is reused */ + Yn1 = acc << shift; + + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Read the third input into Xn1, to reuse the value */ + Xn1 = *pIn++; + + /* The result is converted to 1.31 */ + /* Store the output in the destination buffer. */ + *(pOut + 1U) = acc_h; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + + /* acc = b0 * x[n] */ + acc = (q63_t) Xn1 *b0; + + /* acc += b1 * x[n-1] */ + acc += (q63_t) Xn2 *b1; + + /* acc += b[2] * x[n-2] */ + acc += (q63_t) Xn *b2; + + /* acc += a1 * y[n-1] */ + acc += mult32x64(Yn1, a1); + + /* acc += a2 * y[n-2] */ + acc += mult32x64(Yn2, a2); + + /* The result is converted to 1.63, Yn2 variable is reused */ + Yn2 = acc << shift; + + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Store the output in the destination buffer in 1.31 format. */ + *(pOut + 2U) = acc_h; + + /* Read the fourth input into Xn, to reuse the value */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + /* acc = b0 * x[n] */ + acc = (q63_t) Xn *b0; + + /* acc += b1 * x[n-1] */ + acc += (q63_t) Xn1 *b1; + + /* acc += b[2] * x[n-2] */ + acc += (q63_t) Xn2 *b2; + + /* acc += a1 * y[n-1] */ + acc += mult32x64(Yn2, a1); + + /* acc += a2 * y[n-2] */ + acc += mult32x64(Yn1, a2); + + /* The result is converted to 1.63, Yn1 variable is reused */ + Yn1 = acc << shift; + + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Store the output in the destination buffer in 1.31 format. */ + *(pOut + 3U) = acc_h; + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + Xn2 = Xn1; + Xn1 = Xn; + + /* update output pointer */ + pOut += 4U; + + /* decrement the loop counter */ + sample--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + sample = (blockSize & 0x3U); + + while (sample > 0U) + { + /* Read the input */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + + /* acc = b0 * x[n] */ + acc = (q63_t) Xn *b0; + /* acc += b1 * x[n-1] */ + acc += (q63_t) Xn1 *b1; + /* acc += b[2] * x[n-2] */ + acc += (q63_t) Xn2 *b2; + /* acc += a1 * y[n-1] */ + acc += mult32x64(Yn1, a1); + /* acc += a2 * y[n-2] */ + acc += mult32x64(Yn2, a2); + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + Xn2 = Xn1; + Xn1 = Xn; + Yn2 = Yn1; + /* The result is converted to 1.63, Yn1 variable is reused */ + Yn1 = acc << shift; + + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Store the output in the destination buffer in 1.31 format. */ + *pOut++ = acc_h; + /* Yn1 = acc << shift; */ + + /* Store the output in the destination buffer in 1.31 format. */ +/* *pOut++ = (q31_t) (acc >> (32 - shift)); */ + + /* decrement the loop counter */ + sample--; + } + + /* The first stage output is given as input to the second stage. */ + pIn = pDst; + + /* Reset to destination buffer working pointer */ + pOut = pDst; + + /* Store the updated state variables back into the pState array */ + /* Store the updated state variables back into the pState array */ + *pState++ = (q63_t) Xn1; + *pState++ = (q63_t) Xn2; + *pState++ = Yn1; + *pState++ = Yn2; + + } while (--stage); + +#else + + /* Run the below code for Cortex-M0 */ + + do + { + /* Reading the coefficients */ + b0 = *pCoeffs++; + b1 = *pCoeffs++; + b2 = *pCoeffs++; + a1 = *pCoeffs++; + a2 = *pCoeffs++; + + /* Reading the state values */ + Xn1 = pState[0]; + Xn2 = pState[1]; + Yn1 = pState[2]; + Yn2 = pState[3]; + + /* The variable acc hold output value that is being computed and + * stored in the destination buffer + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + */ + + sample = blockSize; + + while (sample > 0U) + { + /* Read the input */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + /* acc = b0 * x[n] */ + acc = (q63_t) Xn *b0; + /* acc += b1 * x[n-1] */ + acc += (q63_t) Xn1 *b1; + /* acc += b[2] * x[n-2] */ + acc += (q63_t) Xn2 *b2; + /* acc += a1 * y[n-1] */ + acc += mult32x64(Yn1, a1); + /* acc += a2 * y[n-2] */ + acc += mult32x64(Yn2, a2); + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + Xn2 = Xn1; + Xn1 = Xn; + Yn2 = Yn1; + + /* The result is converted to 1.63, Yn1 variable is reused */ + Yn1 = acc << shift; + + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Store the output in the destination buffer in 1.31 format. */ + *pOut++ = acc_h; + + /* Yn1 = acc << shift; */ + + /* Store the output in the destination buffer in 1.31 format. */ + /* *pOut++ = (q31_t) (acc >> (32 - shift)); */ + + /* decrement the loop counter */ + sample--; + } + + /* The first stage output is given as input to the second stage. */ + pIn = pDst; + + /* Reset to destination buffer working pointer */ + pOut = pDst; + + /* Store the updated state variables back into the pState array */ + *pState++ = (q63_t) Xn1; + *pState++ = (q63_t) Xn2; + *pState++ = Yn1; + *pState++ = Yn2; + + } while (--stage); + +#endif /* #if defined (ARM_MATH_DSP) */ +} + + /** + * @} end of BiquadCascadeDF1_32x64 group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c new file mode 100644 index 0000000..658e395 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c @@ -0,0 +1,412 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_biquad_cascade_df1_f32.c + * Description: Processing function for the floating-point Biquad cascade DirectFormI(DF1) filter + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup BiquadCascadeDF1 Biquad Cascade IIR Filters Using Direct Form I Structure + * + * This set of functions implements arbitrary order recursive (IIR) filters. + * The filters are implemented as a cascade of second order Biquad sections. + * The functions support Q15, Q31 and floating-point data types. + * Fast version of Q15 and Q31 also supported on CortexM4 and Cortex-M3. + * + * The functions operate on blocks of input and output data and each call to the function + * processes blockSize samples through the filter. + * pSrc points to the array of input data and + * pDst points to the array of output data. + * Both arrays contain blockSize values. + * + * \par Algorithm + * Each Biquad stage implements a second order filter using the difference equation: + *
+ *     y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ * 
+ * A Direct Form I algorithm is used with 5 coefficients and 4 state variables per stage. + * \image html Biquad.gif "Single Biquad filter stage" + * Coefficients b0, b1 and b2 multiply the input signal x[n] and are referred to as the feedforward coefficients. + * Coefficients a1 and a2 multiply the output signal y[n] and are referred to as the feedback coefficients. + * Pay careful attention to the sign of the feedback coefficients. + * Some design tools use the difference equation + *
+ *     y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] - a1 * y[n-1] - a2 * y[n-2]
+ * 
+ * In this case the feedback coefficients a1 and a2 must be negated when used with the CMSIS DSP Library. + * + * \par + * Higher order filters are realized as a cascade of second order sections. + * numStages refers to the number of second order stages used. + * For example, an 8th order filter would be realized with numStages=4 second order stages. + * \image html BiquadCascade.gif "8th order filter using a cascade of Biquad stages" + * A 9th order filter would be realized with numStages=5 second order stages with the coefficients for one of the stages configured as a first order filter (b2=0 and a2=0). + * + * \par + * The pState points to state variables array. + * Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2]. + * The state variables are arranged in the pState array as: + *
+ *     {x[n-1], x[n-2], y[n-1], y[n-2]}
+ * 
+ * + * \par + * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on. + * The state array has a total length of 4*numStages values. + * The state variables are updated after each block of data is processed, the coefficients are untouched. + * + * \par Instance Structure + * The coefficients and state variables for a filter are stored together in an instance data structure. + * A separate instance structure must be defined for each filter. + * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared. + * There are separate instance structure declarations for each of the 3 supported data types. + * + * \par Init Functions + * There is also an associated initialization function for each data type. + * The initialization function performs following operations: + * - Sets the values of the internal structure fields. + * - Zeros out the values in the state buffer. + * To do this manually without calling the init function, assign the follow subfields of the instance structure: + * numStages, pCoeffs, pState. Also set all of the values in pState to zero. + * + * \par + * Use of the initialization function is optional. + * However, if the initialization function is used, then the instance structure cannot be placed into a const data section. + * To place an instance structure into a const data section, the instance structure must be manually initialized. + * Set the values in the state buffer to zeros before static initialization. + * The code below statically initializes each of the 3 different data type filter instance structures + *
+ *     arm_biquad_casd_df1_inst_f32 S1 = {numStages, pState, pCoeffs};
+ *     arm_biquad_casd_df1_inst_q15 S2 = {numStages, pState, pCoeffs, postShift};
+ *     arm_biquad_casd_df1_inst_q31 S3 = {numStages, pState, pCoeffs, postShift};
+ * 
+ * where numStages is the number of Biquad stages in the filter; pState is the address of the state buffer; + * pCoeffs is the address of the coefficient buffer; postShift shift to be applied. + * + * \par Fixed-Point Behavior + * Care must be taken when using the Q15 and Q31 versions of the Biquad Cascade filter functions. + * Following issues must be considered: + * - Scaling of coefficients + * - Filter gain + * - Overflow and saturation + * + * \par + * Scaling of coefficients: + * Filter coefficients are represented as fractional values and + * coefficients are restricted to lie in the range [-1 +1). + * The fixed-point functions have an additional scaling parameter postShift + * which allow the filter coefficients to exceed the range [+1 -1). + * At the output of the filter's accumulator is a shift register which shifts the result by postShift bits. + * \image html BiquadPostshift.gif "Fixed-point Biquad with shift by postShift bits after accumulator" + * This essentially scales the filter coefficients by 2^postShift. + * For example, to realize the coefficients + *
+ *    {1.5, -0.8, 1.2, 1.6, -0.9}
+ * 
+ * set the pCoeffs array to: + *
+ *    {0.75, -0.4, 0.6, 0.8, -0.45}
+ * 
+ * and set postShift=1 + * + * \par + * Filter gain: + * The frequency response of a Biquad filter is a function of its coefficients. + * It is possible for the gain through the filter to exceed 1.0 meaning that the filter increases the amplitude of certain frequencies. + * This means that an input signal with amplitude < 1.0 may result in an output > 1.0 and these are saturated or overflowed based on the implementation of the filter. + * To avoid this behavior the filter needs to be scaled down such that its peak gain < 1.0 or the input signal must be scaled down so that the combination of input and filter are never overflowed. + * + * \par + * Overflow and saturation: + * For Q15 and Q31 versions, it is described separately as part of the function specific documentation below. + */ + +/** + * @addtogroup BiquadCascadeDF1 + * @{ + */ + +/** + * @param[in] *S points to an instance of the floating-point Biquad cascade structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process per call. + * @return none. + * + */ + +void arm_biquad_cascade_df1_f32( + const arm_biquad_casd_df1_inst_f32 * S, + float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + float32_t *pIn = pSrc; /* source pointer */ + float32_t *pOut = pDst; /* destination pointer */ + float32_t *pState = S->pState; /* pState pointer */ + float32_t *pCoeffs = S->pCoeffs; /* coefficient pointer */ + float32_t acc; /* Simulates the accumulator */ + float32_t b0, b1, b2, a1, a2; /* Filter coefficients */ + float32_t Xn1, Xn2, Yn1, Yn2; /* Filter pState variables */ + float32_t Xn; /* temporary input */ + uint32_t sample, stage = S->numStages; /* loop counters */ + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + do + { + /* Reading the coefficients */ + b0 = *pCoeffs++; + b1 = *pCoeffs++; + b2 = *pCoeffs++; + a1 = *pCoeffs++; + a2 = *pCoeffs++; + + /* Reading the pState values */ + Xn1 = pState[0]; + Xn2 = pState[1]; + Yn1 = pState[2]; + Yn2 = pState[3]; + + /* Apply loop unrolling and compute 4 output values simultaneously. */ + /* The variable acc hold output values that are being computed: + * + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + */ + + sample = blockSize >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (sample > 0U) + { + /* Read the first input */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + Yn2 = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2); + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = Yn2; + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + + /* Read the second input */ + Xn2 = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + Yn1 = (b0 * Xn2) + (b1 * Xn) + (b2 * Xn1) + (a1 * Yn2) + (a2 * Yn1); + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = Yn1; + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + + /* Read the third input */ + Xn1 = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + Yn2 = (b0 * Xn1) + (b1 * Xn2) + (b2 * Xn) + (a1 * Yn1) + (a2 * Yn2); + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = Yn2; + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + + /* Read the forth input */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + Yn1 = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn2) + (a2 * Yn1); + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = Yn1; + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + Xn2 = Xn1; + Xn1 = Xn; + + /* decrement the loop counter */ + sample--; + + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + sample = blockSize & 0x3U; + + while (sample > 0U) + { + /* Read the input */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + acc = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2); + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = acc; + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + Xn2 = Xn1; + Xn1 = Xn; + Yn2 = Yn1; + Yn1 = acc; + + /* decrement the loop counter */ + sample--; + + } + + /* Store the updated state variables back into the pState array */ + *pState++ = Xn1; + *pState++ = Xn2; + *pState++ = Yn1; + *pState++ = Yn2; + + /* The first stage goes from the input buffer to the output buffer. */ + /* Subsequent numStages occur in-place in the output buffer */ + pIn = pDst; + + /* Reset the output pointer */ + pOut = pDst; + + /* decrement the loop counter */ + stage--; + + } while (stage > 0U); + +#else + + /* Run the below code for Cortex-M0 */ + + do + { + /* Reading the coefficients */ + b0 = *pCoeffs++; + b1 = *pCoeffs++; + b2 = *pCoeffs++; + a1 = *pCoeffs++; + a2 = *pCoeffs++; + + /* Reading the pState values */ + Xn1 = pState[0]; + Xn2 = pState[1]; + Yn1 = pState[2]; + Yn2 = pState[3]; + + /* The variables acc holds the output value that is computed: + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + */ + + sample = blockSize; + + while (sample > 0U) + { + /* Read the input */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + acc = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2); + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = acc; + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + Xn2 = Xn1; + Xn1 = Xn; + Yn2 = Yn1; + Yn1 = acc; + + /* decrement the loop counter */ + sample--; + } + + /* Store the updated state variables back into the pState array */ + *pState++ = Xn1; + *pState++ = Xn2; + *pState++ = Yn1; + *pState++ = Yn2; + + /* The first stage goes from the input buffer to the output buffer. */ + /* Subsequent numStages occur in-place in the output buffer */ + pIn = pDst; + + /* Reset the output pointer */ + pOut = pDst; + + /* decrement the loop counter */ + stage--; + + } while (stage > 0U); + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + + + /** + * @} end of BiquadCascadeDF1 group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q15.c new file mode 100644 index 0000000..2a08968 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q15.c @@ -0,0 +1,273 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_biquad_cascade_df1_fast_q15.c + * Description: Fast processing function for the Q15 Biquad cascade filter + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup BiquadCascadeDF1 + * @{ + */ + +/** + * @details + * @param[in] *S points to an instance of the Q15 Biquad cascade structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process per call. + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * This fast version uses a 32-bit accumulator with 2.30 format. + * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * Thus, if the accumulator result overflows it wraps around and distorts the result. + * In order to avoid overflows completely the input signal must be scaled down by two bits and lie in the range [-0.25 +0.25). + * The 2.30 accumulator is then shifted by postShift bits and the result truncated to 1.15 format by discarding the low 16 bits. + * + * \par + * Refer to the function arm_biquad_cascade_df1_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. Both the slow and the fast versions use the same instance structure. + * Use the function arm_biquad_cascade_df1_init_q15() to initialize the filter structure. + * + */ + +void arm_biquad_cascade_df1_fast_q15( + const arm_biquad_casd_df1_inst_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pIn = pSrc; /* Source pointer */ + q15_t *pOut = pDst; /* Destination pointer */ + q31_t in; /* Temporary variable to hold input value */ + q31_t out; /* Temporary variable to hold output value */ + q31_t b0; /* Temporary variable to hold bo value */ + q31_t b1, a1; /* Filter coefficients */ + q31_t state_in, state_out; /* Filter state variables */ + q31_t acc; /* Accumulator */ + int32_t shift = (int32_t) (15 - S->postShift); /* Post shift */ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + uint32_t sample, stage = S->numStages; /* Stage loop counter */ + + + + do + { + + /* Read the b0 and 0 coefficients using SIMD */ + b0 = *__SIMD32(pCoeffs)++; + + /* Read the b1 and b2 coefficients using SIMD */ + b1 = *__SIMD32(pCoeffs)++; + + /* Read the a1 and a2 coefficients using SIMD */ + a1 = *__SIMD32(pCoeffs)++; + + /* Read the input state values from the state buffer: x[n-1], x[n-2] */ + state_in = *__SIMD32(pState)++; + + /* Read the output state values from the state buffer: y[n-1], y[n-2] */ + state_out = *__SIMD32(pState)--; + + /* Apply loop unrolling and compute 2 output values simultaneously. */ + /* The variable acc hold output values that are being computed: + * + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + */ + sample = blockSize >> 1U; + + /* First part of the processing with loop unrolling. Compute 2 outputs at a time. + ** a second loop below computes the remaining 1 sample. */ + while (sample > 0U) + { + + /* Read the input */ + in = *__SIMD32(pIn)++; + + /* out = b0 * x[n] + 0 * 0 */ + out = __SMUAD(b0, in); + /* acc = b1 * x[n-1] + acc += b2 * x[n-2] + out */ + acc = __SMLAD(b1, state_in, out); + /* acc += a1 * y[n-1] + acc += a2 * y[n-2] */ + acc = __SMLAD(a1, state_out, acc); + + /* The result is converted from 3.29 to 1.31 and then saturation is applied */ + out = __SSAT((acc >> shift), 16); + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */ + /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */ + +#ifndef ARM_MATH_BIG_ENDIAN + + state_in = __PKHBT(in, state_in, 16); + state_out = __PKHBT(out, state_out, 16); + +#else + + state_in = __PKHBT(state_in >> 16, (in >> 16), 16); + state_out = __PKHBT(state_out >> 16, (out), 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* out = b0 * x[n] + 0 * 0 */ + out = __SMUADX(b0, in); + /* acc0 = b1 * x[n-1] , acc0 += b2 * x[n-2] + out */ + acc = __SMLAD(b1, state_in, out); + /* acc += a1 * y[n-1] + acc += a2 * y[n-2] */ + acc = __SMLAD(a1, state_out, acc); + + /* The result is converted from 3.29 to 1.31 and then saturation is applied */ + out = __SSAT((acc >> shift), 16); + + + /* Store the output in the destination buffer. */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pOut)++ = __PKHBT(state_out, out, 16); + +#else + + *__SIMD32(pOut)++ = __PKHBT(out, state_out >> 16, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */ + /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */ + +#ifndef ARM_MATH_BIG_ENDIAN + + state_in = __PKHBT(in >> 16, state_in, 16); + state_out = __PKHBT(out, state_out, 16); + +#else + + state_in = __PKHBT(state_in >> 16, in, 16); + state_out = __PKHBT(state_out >> 16, out, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + + /* Decrement the loop counter */ + sample--; + + } + + /* If the blockSize is not a multiple of 2, compute any remaining output samples here. + ** No loop unrolling is used. */ + + if ((blockSize & 0x1U) != 0U) + { + /* Read the input */ + in = *pIn++; + + /* out = b0 * x[n] + 0 * 0 */ + +#ifndef ARM_MATH_BIG_ENDIAN + + out = __SMUAD(b0, in); + +#else + + out = __SMUADX(b0, in); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* acc = b1 * x[n-1], acc += b2 * x[n-2] + out */ + acc = __SMLAD(b1, state_in, out); + /* acc += a1 * y[n-1] + acc += a2 * y[n-2] */ + acc = __SMLAD(a1, state_out, acc); + + /* The result is converted from 3.29 to 1.31 and then saturation is applied */ + out = __SSAT((acc >> shift), 16); + + /* Store the output in the destination buffer. */ + *pOut++ = (q15_t) out; + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */ + /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */ + +#ifndef ARM_MATH_BIG_ENDIAN + + state_in = __PKHBT(in, state_in, 16); + state_out = __PKHBT(out, state_out, 16); + +#else + + state_in = __PKHBT(state_in >> 16, in, 16); + state_out = __PKHBT(state_out >> 16, out, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + } + + /* The first stage goes from the input buffer to the output buffer. */ + /* Subsequent (numStages - 1) occur in-place in the output buffer */ + pIn = pDst; + + /* Reset the output pointer */ + pOut = pDst; + + /* Store the updated state variables back into the state array */ + *__SIMD32(pState)++ = state_in; + *__SIMD32(pState)++ = state_out; + + + /* Decrement the loop counter */ + stage--; + + } while (stage > 0U); +} + + +/** + * @} end of BiquadCascadeDF1 group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q31.c new file mode 100644 index 0000000..5e41faa --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q31.c @@ -0,0 +1,292 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_biquad_cascade_df1_fast_q31.c + * Description: Processing function for the Q31 Fast Biquad cascade DirectFormI(DF1) filter + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup BiquadCascadeDF1 + * @{ + */ + +/** + * @details + * + * @param[in] *S points to an instance of the Q31 Biquad cascade structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process per call. + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * This function is optimized for speed at the expense of fixed-point precision and overflow protection. + * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format. + * These intermediate results are added to a 2.30 accumulator. + * Finally, the accumulator is saturated and converted to a 1.31 result. + * The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result. + * In order to avoid overflows completely the input signal must be scaled down by two bits and lie in the range [-0.25 +0.25). Use the intialization function + * arm_biquad_cascade_df1_init_q31() to initialize filter structure. + * + * \par + * Refer to the function arm_biquad_cascade_df1_q31() for a slower implementation of this function which uses 64-bit accumulation to provide higher precision. Both the slow and the fast versions use the same instance structure. + * Use the function arm_biquad_cascade_df1_init_q31() to initialize the filter structure. + */ + +void arm_biquad_cascade_df1_fast_q31( + const arm_biquad_casd_df1_inst_q31 * S, + q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + q31_t acc = 0; /* accumulator */ + q31_t Xn1, Xn2, Yn1, Yn2; /* Filter state variables */ + q31_t b0, b1, b2, a1, a2; /* Filter coefficients */ + q31_t *pIn = pSrc; /* input pointer initialization */ + q31_t *pOut = pDst; /* output pointer initialization */ + q31_t *pState = S->pState; /* pState pointer initialization */ + q31_t *pCoeffs = S->pCoeffs; /* coeff pointer initialization */ + q31_t Xn; /* temporary input */ + int32_t shift = (int32_t) S->postShift + 1; /* Shift to be applied to the output */ + uint32_t sample, stage = S->numStages; /* loop counters */ + + + do + { + /* Reading the coefficients */ + b0 = *pCoeffs++; + b1 = *pCoeffs++; + b2 = *pCoeffs++; + a1 = *pCoeffs++; + a2 = *pCoeffs++; + + /* Reading the state values */ + Xn1 = pState[0]; + Xn2 = pState[1]; + Yn1 = pState[2]; + Yn2 = pState[3]; + + /* Apply loop unrolling and compute 4 output values simultaneously. */ + /* The variables acc ... acc3 hold output values that are being computed: + * + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + */ + + sample = blockSize >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (sample > 0U) + { + /* Read the input */ + Xn = *pIn; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + /* acc = b0 * x[n] */ + /*acc = (q31_t) (((q63_t) b1 * Xn1) >> 32);*/ + mult_32x32_keep32_R(acc, b1, Xn1); + /* acc += b1 * x[n-1] */ + /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b0 * (Xn))) >> 32);*/ + multAcc_32x32_keep32_R(acc, b0, Xn); + /* acc += b[2] * x[n-2] */ + /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);*/ + multAcc_32x32_keep32_R(acc, b2, Xn2); + /* acc += a1 * y[n-1] */ + /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/ + multAcc_32x32_keep32_R(acc, a1, Yn1); + /* acc += a2 * y[n-2] */ + /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/ + multAcc_32x32_keep32_R(acc, a2, Yn2); + + /* The result is converted to 1.31 , Yn2 variable is reused */ + Yn2 = acc << shift; + + /* Read the second input */ + Xn2 = *(pIn + 1U); + + /* Store the output in the destination buffer. */ + *pOut = Yn2; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + /* acc = b0 * x[n] */ + /*acc = (q31_t) (((q63_t) b0 * (Xn2)) >> 32);*/ + mult_32x32_keep32_R(acc, b0, Xn2); + /* acc += b1 * x[n-1] */ + /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn))) >> 32);*/ + multAcc_32x32_keep32_R(acc, b1, Xn); + /* acc += b[2] * x[n-2] */ + /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn1))) >> 32);*/ + multAcc_32x32_keep32_R(acc, b2, Xn1); + /* acc += a1 * y[n-1] */ + /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn2))) >> 32);*/ + multAcc_32x32_keep32_R(acc, a1, Yn2); + /* acc += a2 * y[n-2] */ + /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn1))) >> 32);*/ + multAcc_32x32_keep32_R(acc, a2, Yn1); + + /* The result is converted to 1.31, Yn1 variable is reused */ + Yn1 = acc << shift; + + /* Read the third input */ + Xn1 = *(pIn + 2U); + + /* Store the output in the destination buffer. */ + *(pOut + 1U) = Yn1; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + /* acc = b0 * x[n] */ + /*acc = (q31_t) (((q63_t) b0 * (Xn1)) >> 32);*/ + mult_32x32_keep32_R(acc, b0, Xn1); + /* acc += b1 * x[n-1] */ + /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn2))) >> 32);*/ + multAcc_32x32_keep32_R(acc, b1, Xn2); + /* acc += b[2] * x[n-2] */ + /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn))) >> 32);*/ + multAcc_32x32_keep32_R(acc, b2, Xn); + /* acc += a1 * y[n-1] */ + /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/ + multAcc_32x32_keep32_R(acc, a1, Yn1); + /* acc += a2 * y[n-2] */ + /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/ + multAcc_32x32_keep32_R(acc, a2, Yn2); + + /* The result is converted to 1.31, Yn2 variable is reused */ + Yn2 = acc << shift; + + /* Read the forth input */ + Xn = *(pIn + 3U); + + /* Store the output in the destination buffer. */ + *(pOut + 2U) = Yn2; + pIn += 4U; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + /* acc = b0 * x[n] */ + /*acc = (q31_t) (((q63_t) b0 * (Xn)) >> 32);*/ + mult_32x32_keep32_R(acc, b0, Xn); + /* acc += b1 * x[n-1] */ + /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn1))) >> 32);*/ + multAcc_32x32_keep32_R(acc, b1, Xn1); + /* acc += b[2] * x[n-2] */ + /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);*/ + multAcc_32x32_keep32_R(acc, b2, Xn2); + /* acc += a1 * y[n-1] */ + /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn2))) >> 32);*/ + multAcc_32x32_keep32_R(acc, a1, Yn2); + /* acc += a2 * y[n-2] */ + /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn1))) >> 32);*/ + multAcc_32x32_keep32_R(acc, a2, Yn1); + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + Xn2 = Xn1; + + /* The result is converted to 1.31, Yn1 variable is reused */ + Yn1 = acc << shift; + + /* Xn1 = Xn */ + Xn1 = Xn; + + /* Store the output in the destination buffer. */ + *(pOut + 3U) = Yn1; + pOut += 4U; + + /* decrement the loop counter */ + sample--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + sample = (blockSize & 0x3U); + + while (sample > 0U) + { + /* Read the input */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + /* acc = b0 * x[n] */ + /*acc = (q31_t) (((q63_t) b0 * (Xn)) >> 32);*/ + mult_32x32_keep32_R(acc, b0, Xn); + /* acc += b1 * x[n-1] */ + /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn1))) >> 32);*/ + multAcc_32x32_keep32_R(acc, b1, Xn1); + /* acc += b[2] * x[n-2] */ + /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);*/ + multAcc_32x32_keep32_R(acc, b2, Xn2); + /* acc += a1 * y[n-1] */ + /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/ + multAcc_32x32_keep32_R(acc, a1, Yn1); + /* acc += a2 * y[n-2] */ + /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/ + multAcc_32x32_keep32_R(acc, a2, Yn2); + + /* The result is converted to 1.31 */ + acc = acc << shift; + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + Xn2 = Xn1; + Xn1 = Xn; + Yn2 = Yn1; + Yn1 = acc; + + /* Store the output in the destination buffer. */ + *pOut++ = acc; + + /* decrement the loop counter */ + sample--; + } + + /* The first stage goes from the input buffer to the output buffer. */ + /* Subsequent stages occur in-place in the output buffer */ + pIn = pDst; + + /* Reset to destination pointer */ + pOut = pDst; + + /* Store the updated state variables back into the pState array */ + *pState++ = Xn1; + *pState++ = Xn2; + *pState++ = Yn1; + *pState++ = Yn2; + + } while (--stage); +} + +/** + * @} end of BiquadCascadeDF1 group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f32.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f32.c new file mode 100644 index 0000000..147c8c5 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f32.c @@ -0,0 +1,97 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_biquad_cascade_df1_init_f32.c + * Description: Floating-point Biquad cascade DirectFormI(DF1) filter initialization function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup BiquadCascadeDF1 + * @{ + */ + +/** + * @details + * @brief Initialization function for the floating-point Biquad cascade filter. + * @param[in,out] *S points to an instance of the floating-point Biquad cascade structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] *pCoeffs points to the filter coefficients array. + * @param[in] *pState points to the state array. + * @return none + * + * + * Coefficient and State Ordering: + * + * \par + * The coefficients are stored in the array pCoeffs in the following order: + *
+ *     {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
+ * 
+ * + * \par + * where b1x and a1x are the coefficients for the first stage, + * b2x and a2x are the coefficients for the second stage, + * and so on. The pCoeffs array contains a total of 5*numStages values. + * + * \par + * The pState is a pointer to state array. + * Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2]. + * The state variables are arranged in the pState array as: + *
+ *     {x[n-1], x[n-2], y[n-1], y[n-2]}
+ * 
+ * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on. + * The state array has a total length of 4*numStages values. + * The state variables are updated after each block of data is processed; the coefficients are untouched. + * + */ + +void arm_biquad_cascade_df1_init_f32( + arm_biquad_casd_df1_inst_f32 * S, + uint8_t numStages, + float32_t * pCoeffs, + float32_t * pState) +{ + /* Assign filter stages */ + S->numStages = numStages; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always 4 * numStages */ + memset(pState, 0, (4U * (uint32_t) numStages) * sizeof(float32_t)); + + /* Assign state pointer */ + S->pState = pState; +} + +/** + * @} end of BiquadCascadeDF1 group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q15.c new file mode 100644 index 0000000..dd46fb4 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q15.c @@ -0,0 +1,99 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_biquad_cascade_df1_init_q15.c + * Description: Q15 Biquad cascade DirectFormI(DF1) filter initialization function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup BiquadCascadeDF1 + * @{ + */ + +/** + * @details + * + * @param[in,out] *S points to an instance of the Q15 Biquad cascade structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] *pCoeffs points to the filter coefficients. + * @param[in] *pState points to the state buffer. + * @param[in] postShift Shift to be applied to the accumulator result. Varies according to the coefficients format + * @return none + * + * Coefficient and State Ordering: + * + * \par + * The coefficients are stored in the array pCoeffs in the following order: + *
+ *     {b10, 0, b11, b12, a11, a12, b20, 0, b21, b22, a21, a22, ...}
+ * 
+ * where b1x and a1x are the coefficients for the first stage, + * b2x and a2x are the coefficients for the second stage, + * and so on. The pCoeffs array contains a total of 6*numStages values. + * The zero coefficient between b1 and b2 facilities use of 16-bit SIMD instructions on the Cortex-M4. + * + * \par + * The state variables are stored in the array pState. + * Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2]. + * The state variables are arranged in the pState array as: + *
+ *     {x[n-1], x[n-2], y[n-1], y[n-2]}
+ * 
+ * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on. + * The state array has a total length of 4*numStages values. + * The state variables are updated after each block of data is processed; the coefficients are untouched. + */ + +void arm_biquad_cascade_df1_init_q15( + arm_biquad_casd_df1_inst_q15 * S, + uint8_t numStages, + q15_t * pCoeffs, + q15_t * pState, + int8_t postShift) +{ + /* Assign filter stages */ + S->numStages = numStages; + + /* Assign postShift to be applied to the output */ + S->postShift = postShift; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always 4 * numStages */ + memset(pState, 0, (4U * (uint32_t) numStages) * sizeof(q15_t)); + + /* Assign state pointer */ + S->pState = pState; +} + +/** + * @} end of BiquadCascadeDF1 group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q31.c new file mode 100644 index 0000000..10fb6bc --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q31.c @@ -0,0 +1,98 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_biquad_cascade_df1_init_q31.c + * Description: Q31 Biquad cascade DirectFormI(DF1) filter initialization function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup BiquadCascadeDF1 + * @{ + */ + +/** + * @details + * + * @param[in,out] *S points to an instance of the Q31 Biquad cascade structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] *pCoeffs points to the filter coefficients buffer. + * @param[in] *pState points to the state buffer. + * @param[in] postShift Shift to be applied after the accumulator. Varies according to the coefficients format + * @return none + * + * Coefficient and State Ordering: + * + * \par + * The coefficients are stored in the array pCoeffs in the following order: + *
+ *     {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
+ * 
+ * where b1x and a1x are the coefficients for the first stage, + * b2x and a2x are the coefficients for the second stage, + * and so on. The pCoeffs array contains a total of 5*numStages values. + * + * \par + * The pState points to state variables array. + * Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2]. + * The state variables are arranged in the pState array as: + *
+ *     {x[n-1], x[n-2], y[n-1], y[n-2]}
+ * 
+ * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on. + * The state array has a total length of 4*numStages values. + * The state variables are updated after each block of data is processed; the coefficients are untouched. + */ + +void arm_biquad_cascade_df1_init_q31( + arm_biquad_casd_df1_inst_q31 * S, + uint8_t numStages, + q31_t * pCoeffs, + q31_t * pState, + int8_t postShift) +{ + /* Assign filter stages */ + S->numStages = numStages; + + /* Assign postShift to be applied to the output */ + S->postShift = postShift; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always 4 * numStages */ + memset(pState, 0, (4U * (uint32_t) numStages) * sizeof(q31_t)); + + /* Assign state pointer */ + S->pState = pState; +} + +/** + * @} end of BiquadCascadeDF1 group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c new file mode 100644 index 0000000..c524756 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c @@ -0,0 +1,398 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_biquad_cascade_df1_q15.c + * Description: Processing function for the Q15 Biquad cascade DirectFormI(DF1) filter + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup BiquadCascadeDF1 + * @{ + */ + +/** + * @brief Processing function for the Q15 Biquad cascade filter. + * @param[in] *S points to an instance of the Q15 Biquad cascade structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the location where the output result is written. + * @param[in] blockSize number of samples to process per call. + * @return none. + * + * + * Scaling and Overflow Behavior: + * \par + * The function is implemented using a 64-bit internal accumulator. + * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result. + * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. + * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved. + * The accumulator is then shifted by postShift bits to truncate the result to 1.15 format by discarding the low 16 bits. + * Finally, the result is saturated to 1.15 format. + * + * \par + * Refer to the function arm_biquad_cascade_df1_fast_q15() for a faster but less precise implementation of this filter for Cortex-M3 and Cortex-M4. + */ + +void arm_biquad_cascade_df1_q15( + const arm_biquad_casd_df1_inst_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q15_t *pIn = pSrc; /* Source pointer */ + q15_t *pOut = pDst; /* Destination pointer */ + q31_t in; /* Temporary variable to hold input value */ + q31_t out; /* Temporary variable to hold output value */ + q31_t b0; /* Temporary variable to hold bo value */ + q31_t b1, a1; /* Filter coefficients */ + q31_t state_in, state_out; /* Filter state variables */ + q31_t acc_l, acc_h; + q63_t acc; /* Accumulator */ + int32_t lShift = (15 - (int32_t) S->postShift); /* Post shift */ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + uint32_t sample, stage = (uint32_t) S->numStages; /* Stage loop counter */ + int32_t uShift = (32 - lShift); + + do + { + /* Read the b0 and 0 coefficients using SIMD */ + b0 = *__SIMD32(pCoeffs)++; + + /* Read the b1 and b2 coefficients using SIMD */ + b1 = *__SIMD32(pCoeffs)++; + + /* Read the a1 and a2 coefficients using SIMD */ + a1 = *__SIMD32(pCoeffs)++; + + /* Read the input state values from the state buffer: x[n-1], x[n-2] */ + state_in = *__SIMD32(pState)++; + + /* Read the output state values from the state buffer: y[n-1], y[n-2] */ + state_out = *__SIMD32(pState)--; + + /* Apply loop unrolling and compute 2 output values simultaneously. */ + /* The variable acc hold output values that are being computed: + * + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + */ + sample = blockSize >> 1U; + + /* First part of the processing with loop unrolling. Compute 2 outputs at a time. + ** a second loop below computes the remaining 1 sample. */ + while (sample > 0U) + { + + /* Read the input */ + in = *__SIMD32(pIn)++; + + /* out = b0 * x[n] + 0 * 0 */ + out = __SMUAD(b0, in); + + /* acc += b1 * x[n-1] + b2 * x[n-2] + out */ + acc = __SMLALD(b1, state_in, out); + /* acc += a1 * y[n-1] + a2 * y[n-2] */ + acc = __SMLALD(a1, state_out, acc); + + /* The result is converted from 3.29 to 1.31 if postShift = 1, and then saturation is applied */ + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + out = (uint32_t) acc_l >> lShift | acc_h << uShift; + + out = __SSAT(out, 16); + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */ + /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */ + +#ifndef ARM_MATH_BIG_ENDIAN + + state_in = __PKHBT(in, state_in, 16); + state_out = __PKHBT(out, state_out, 16); + +#else + + state_in = __PKHBT(state_in >> 16, (in >> 16), 16); + state_out = __PKHBT(state_out >> 16, (out), 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* out = b0 * x[n] + 0 * 0 */ + out = __SMUADX(b0, in); + /* acc += b1 * x[n-1] + b2 * x[n-2] + out */ + acc = __SMLALD(b1, state_in, out); + /* acc += a1 * y[n-1] + a2 * y[n-2] */ + acc = __SMLALD(a1, state_out, acc); + + /* The result is converted from 3.29 to 1.31 if postShift = 1, and then saturation is applied */ + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + out = (uint32_t) acc_l >> lShift | acc_h << uShift; + + out = __SSAT(out, 16); + + /* Store the output in the destination buffer. */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pOut)++ = __PKHBT(state_out, out, 16); + +#else + + *__SIMD32(pOut)++ = __PKHBT(out, state_out >> 16, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */ + /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */ +#ifndef ARM_MATH_BIG_ENDIAN + + state_in = __PKHBT(in >> 16, state_in, 16); + state_out = __PKHBT(out, state_out, 16); + +#else + + state_in = __PKHBT(state_in >> 16, in, 16); + state_out = __PKHBT(state_out >> 16, out, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + + /* Decrement the loop counter */ + sample--; + + } + + /* If the blockSize is not a multiple of 2, compute any remaining output samples here. + ** No loop unrolling is used. */ + + if ((blockSize & 0x1U) != 0U) + { + /* Read the input */ + in = *pIn++; + + /* out = b0 * x[n] + 0 * 0 */ + +#ifndef ARM_MATH_BIG_ENDIAN + + out = __SMUAD(b0, in); + +#else + + out = __SMUADX(b0, in); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* acc = b1 * x[n-1] + b2 * x[n-2] + out */ + acc = __SMLALD(b1, state_in, out); + /* acc += a1 * y[n-1] + a2 * y[n-2] */ + acc = __SMLALD(a1, state_out, acc); + + /* The result is converted from 3.29 to 1.31 if postShift = 1, and then saturation is applied */ + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + out = (uint32_t) acc_l >> lShift | acc_h << uShift; + + out = __SSAT(out, 16); + + /* Store the output in the destination buffer. */ + *pOut++ = (q15_t) out; + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */ + /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */ + +#ifndef ARM_MATH_BIG_ENDIAN + + state_in = __PKHBT(in, state_in, 16); + state_out = __PKHBT(out, state_out, 16); + +#else + + state_in = __PKHBT(state_in >> 16, in, 16); + state_out = __PKHBT(state_out >> 16, out, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + } + + /* The first stage goes from the input wire to the output wire. */ + /* Subsequent numStages occur in-place in the output wire */ + pIn = pDst; + + /* Reset the output pointer */ + pOut = pDst; + + /* Store the updated state variables back into the state array */ + *__SIMD32(pState)++ = state_in; + *__SIMD32(pState)++ = state_out; + + + /* Decrement the loop counter */ + stage--; + + } while (stage > 0U); + +#else + + /* Run the below code for Cortex-M0 */ + + q15_t *pIn = pSrc; /* Source pointer */ + q15_t *pOut = pDst; /* Destination pointer */ + q15_t b0, b1, b2, a1, a2; /* Filter coefficients */ + q15_t Xn1, Xn2, Yn1, Yn2; /* Filter state variables */ + q15_t Xn; /* temporary input */ + q63_t acc; /* Accumulator */ + int32_t shift = (15 - (int32_t) S->postShift); /* Post shift */ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + uint32_t sample, stage = (uint32_t) S->numStages; /* Stage loop counter */ + + do + { + /* Reading the coefficients */ + b0 = *pCoeffs++; + pCoeffs++; // skip the 0 coefficient + b1 = *pCoeffs++; + b2 = *pCoeffs++; + a1 = *pCoeffs++; + a2 = *pCoeffs++; + + /* Reading the state values */ + Xn1 = pState[0]; + Xn2 = pState[1]; + Yn1 = pState[2]; + Yn2 = pState[3]; + + /* The variables acc holds the output value that is computed: + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + */ + + sample = blockSize; + + while (sample > 0U) + { + /* Read the input */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + /* acc = b0 * x[n] */ + acc = (q31_t) b0 *Xn; + + /* acc += b1 * x[n-1] */ + acc += (q31_t) b1 *Xn1; + /* acc += b[2] * x[n-2] */ + acc += (q31_t) b2 *Xn2; + /* acc += a1 * y[n-1] */ + acc += (q31_t) a1 *Yn1; + /* acc += a2 * y[n-2] */ + acc += (q31_t) a2 *Yn2; + + /* The result is converted to 1.31 */ + acc = __SSAT((acc >> shift), 16); + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + Xn2 = Xn1; + Xn1 = Xn; + Yn2 = Yn1; + Yn1 = (q15_t) acc; + + /* Store the output in the destination buffer. */ + *pOut++ = (q15_t) acc; + + /* decrement the loop counter */ + sample--; + } + + /* The first stage goes from the input buffer to the output buffer. */ + /* Subsequent stages occur in-place in the output buffer */ + pIn = pDst; + + /* Reset to destination pointer */ + pOut = pDst; + + /* Store the updated state variables back into the pState array */ + *pState++ = Xn1; + *pState++ = Xn2; + *pState++ = Yn1; + *pState++ = Yn2; + + } while (--stage); + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + + +/** + * @} end of BiquadCascadeDF1 group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c new file mode 100644 index 0000000..da367ec --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c @@ -0,0 +1,392 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_biquad_cascade_df1_q31.c + * Description: Processing function for the Q31 Biquad cascade filter + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup BiquadCascadeDF1 + * @{ + */ + +/** + * @brief Processing function for the Q31 Biquad cascade filter. + * @param[in] *S points to an instance of the Q31 Biquad cascade structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process per call. + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * The function is implemented using an internal 64-bit accumulator. + * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * Thus, if the accumulator result overflows it wraps around rather than clip. + * In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25). + * After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by postShift bits and the result truncated to + * 1.31 format by discarding the low 32 bits. + * + * \par + * Refer to the function arm_biquad_cascade_df1_fast_q31() for a faster but less precise implementation of this filter for Cortex-M3 and Cortex-M4. + */ + +void arm_biquad_cascade_df1_q31( + const arm_biquad_casd_df1_inst_q31 * S, + q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + q63_t acc; /* accumulator */ + uint32_t uShift = ((uint32_t) S->postShift + 1U); + uint32_t lShift = 32U - uShift; /* Shift to be applied to the output */ + q31_t *pIn = pSrc; /* input pointer initialization */ + q31_t *pOut = pDst; /* output pointer initialization */ + q31_t *pState = S->pState; /* pState pointer initialization */ + q31_t *pCoeffs = S->pCoeffs; /* coeff pointer initialization */ + q31_t Xn1, Xn2, Yn1, Yn2; /* Filter state variables */ + q31_t b0, b1, b2, a1, a2; /* Filter coefficients */ + q31_t Xn; /* temporary input */ + uint32_t sample, stage = S->numStages; /* loop counters */ + + +#if defined (ARM_MATH_DSP) + + q31_t acc_l, acc_h; /* temporary output variables */ + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + do + { + /* Reading the coefficients */ + b0 = *pCoeffs++; + b1 = *pCoeffs++; + b2 = *pCoeffs++; + a1 = *pCoeffs++; + a2 = *pCoeffs++; + + /* Reading the state values */ + Xn1 = pState[0]; + Xn2 = pState[1]; + Yn1 = pState[2]; + Yn2 = pState[3]; + + /* Apply loop unrolling and compute 4 output values simultaneously. */ + /* The variable acc hold output values that are being computed: + * + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + */ + + sample = blockSize >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (sample > 0U) + { + /* Read the input */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + + /* acc = b0 * x[n] */ + acc = (q63_t) b0 *Xn; + /* acc += b1 * x[n-1] */ + acc += (q63_t) b1 *Xn1; + /* acc += b[2] * x[n-2] */ + acc += (q63_t) b2 *Xn2; + /* acc += a1 * y[n-1] */ + acc += (q63_t) a1 *Yn1; + /* acc += a2 * y[n-2] */ + acc += (q63_t) a2 *Yn2; + + /* The result is converted to 1.31 , Yn2 variable is reused */ + + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + Yn2 = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Store the output in the destination buffer. */ + *pOut++ = Yn2; + + /* Read the second input */ + Xn2 = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + + /* acc = b0 * x[n] */ + acc = (q63_t) b0 *Xn2; + /* acc += b1 * x[n-1] */ + acc += (q63_t) b1 *Xn; + /* acc += b[2] * x[n-2] */ + acc += (q63_t) b2 *Xn1; + /* acc += a1 * y[n-1] */ + acc += (q63_t) a1 *Yn2; + /* acc += a2 * y[n-2] */ + acc += (q63_t) a2 *Yn1; + + + /* The result is converted to 1.31, Yn1 variable is reused */ + + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + + /* Apply shift for lower part of acc and upper part of acc */ + Yn1 = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Store the output in the destination buffer. */ + *pOut++ = Yn1; + + /* Read the third input */ + Xn1 = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + + /* acc = b0 * x[n] */ + acc = (q63_t) b0 *Xn1; + /* acc += b1 * x[n-1] */ + acc += (q63_t) b1 *Xn2; + /* acc += b[2] * x[n-2] */ + acc += (q63_t) b2 *Xn; + /* acc += a1 * y[n-1] */ + acc += (q63_t) a1 *Yn1; + /* acc += a2 * y[n-2] */ + acc += (q63_t) a2 *Yn2; + + /* The result is converted to 1.31, Yn2 variable is reused */ + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + + /* Apply shift for lower part of acc and upper part of acc */ + Yn2 = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Store the output in the destination buffer. */ + *pOut++ = Yn2; + + /* Read the forth input */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + + /* acc = b0 * x[n] */ + acc = (q63_t) b0 *Xn; + /* acc += b1 * x[n-1] */ + acc += (q63_t) b1 *Xn1; + /* acc += b[2] * x[n-2] */ + acc += (q63_t) b2 *Xn2; + /* acc += a1 * y[n-1] */ + acc += (q63_t) a1 *Yn2; + /* acc += a2 * y[n-2] */ + acc += (q63_t) a2 *Yn1; + + /* The result is converted to 1.31, Yn1 variable is reused */ + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + Yn1 = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + Xn2 = Xn1; + Xn1 = Xn; + + /* Store the output in the destination buffer. */ + *pOut++ = Yn1; + + /* decrement the loop counter */ + sample--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + sample = (blockSize & 0x3U); + + while (sample > 0U) + { + /* Read the input */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + + /* acc = b0 * x[n] */ + acc = (q63_t) b0 *Xn; + /* acc += b1 * x[n-1] */ + acc += (q63_t) b1 *Xn1; + /* acc += b[2] * x[n-2] */ + acc += (q63_t) b2 *Xn2; + /* acc += a1 * y[n-1] */ + acc += (q63_t) a1 *Yn1; + /* acc += a2 * y[n-2] */ + acc += (q63_t) a2 *Yn2; + + /* The result is converted to 1.31 */ + acc = acc >> lShift; + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + Xn2 = Xn1; + Xn1 = Xn; + Yn2 = Yn1; + Yn1 = (q31_t) acc; + + /* Store the output in the destination buffer. */ + *pOut++ = (q31_t) acc; + + /* decrement the loop counter */ + sample--; + } + + /* The first stage goes from the input buffer to the output buffer. */ + /* Subsequent stages occur in-place in the output buffer */ + pIn = pDst; + + /* Reset to destination pointer */ + pOut = pDst; + + /* Store the updated state variables back into the pState array */ + *pState++ = Xn1; + *pState++ = Xn2; + *pState++ = Yn1; + *pState++ = Yn2; + + } while (--stage); + +#else + + /* Run the below code for Cortex-M0 */ + + do + { + /* Reading the coefficients */ + b0 = *pCoeffs++; + b1 = *pCoeffs++; + b2 = *pCoeffs++; + a1 = *pCoeffs++; + a2 = *pCoeffs++; + + /* Reading the state values */ + Xn1 = pState[0]; + Xn2 = pState[1]; + Yn1 = pState[2]; + Yn2 = pState[3]; + + /* The variables acc holds the output value that is computed: + * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + */ + + sample = blockSize; + + while (sample > 0U) + { + /* Read the input */ + Xn = *pIn++; + + /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ + /* acc = b0 * x[n] */ + acc = (q63_t) b0 *Xn; + + /* acc += b1 * x[n-1] */ + acc += (q63_t) b1 *Xn1; + /* acc += b[2] * x[n-2] */ + acc += (q63_t) b2 *Xn2; + /* acc += a1 * y[n-1] */ + acc += (q63_t) a1 *Yn1; + /* acc += a2 * y[n-2] */ + acc += (q63_t) a2 *Yn2; + + /* The result is converted to 1.31 */ + acc = acc >> lShift; + + /* Every time after the output is computed state should be updated. */ + /* The states should be updated as: */ + /* Xn2 = Xn1 */ + /* Xn1 = Xn */ + /* Yn2 = Yn1 */ + /* Yn1 = acc */ + Xn2 = Xn1; + Xn1 = Xn; + Yn2 = Yn1; + Yn1 = (q31_t) acc; + + /* Store the output in the destination buffer. */ + *pOut++ = (q31_t) acc; + + /* decrement the loop counter */ + sample--; + } + + /* The first stage goes from the input buffer to the output buffer. */ + /* Subsequent stages occur in-place in the output buffer */ + pIn = pDst; + + /* Reset to destination pointer */ + pOut = pDst; + + /* Store the updated state variables back into the pState array */ + *pState++ = Xn1; + *pState++ = Xn2; + *pState++ = Yn1; + *pState++ = Yn2; + + } while (--stage); + +#endif /* #if defined (ARM_MATH_DSP) */ +} + + + + +/** + * @} end of BiquadCascadeDF1 group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c new file mode 100644 index 0000000..3f1ce03 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c @@ -0,0 +1,590 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_biquad_cascade_df2T_f32.c + * Description: Processing function for floating-point transposed direct form II Biquad cascade filter + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** +* @ingroup groupFilters +*/ + +/** +* @defgroup BiquadCascadeDF2T Biquad Cascade IIR Filters Using a Direct Form II Transposed Structure +* +* This set of functions implements arbitrary order recursive (IIR) filters using a transposed direct form II structure. +* The filters are implemented as a cascade of second order Biquad sections. +* These functions provide a slight memory savings as compared to the direct form I Biquad filter functions. +* Only floating-point data is supported. +* +* This function operate on blocks of input and output data and each call to the function +* processes blockSize samples through the filter. +* pSrc points to the array of input data and +* pDst points to the array of output data. +* Both arrays contain blockSize values. +* +* \par Algorithm +* Each Biquad stage implements a second order filter using the difference equation: +*
+*    y[n] = b0 * x[n] + d1
+*    d1 = b1 * x[n] + a1 * y[n] + d2
+*    d2 = b2 * x[n] + a2 * y[n]
+* 
+* where d1 and d2 represent the two state values. +* +* \par +* A Biquad filter using a transposed Direct Form II structure is shown below. +* \image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad" +* Coefficients b0, b1, and b2 multiply the input signal x[n] and are referred to as the feedforward coefficients. +* Coefficients a1 and a2 multiply the output signal y[n] and are referred to as the feedback coefficients. +* Pay careful attention to the sign of the feedback coefficients. +* Some design tools flip the sign of the feedback coefficients: +*
+*    y[n] = b0 * x[n] + d1;
+*    d1 = b1 * x[n] - a1 * y[n] + d2;
+*    d2 = b2 * x[n] - a2 * y[n];
+* 
+* In this case the feedback coefficients a1 and a2 must be negated when used with the CMSIS DSP Library. +* +* \par +* Higher order filters are realized as a cascade of second order sections. +* numStages refers to the number of second order stages used. +* For example, an 8th order filter would be realized with numStages=4 second order stages. +* A 9th order filter would be realized with numStages=5 second order stages with the +* coefficients for one of the stages configured as a first order filter (b2=0 and a2=0). +* +* \par +* pState points to the state variable array. +* Each Biquad stage has 2 state variables d1 and d2. +* The state variables are arranged in the pState array as: +*
+*     {d11, d12, d21, d22, ...}
+* 
+* where d1x refers to the state variables for the first Biquad and +* d2x refers to the state variables for the second Biquad. +* The state array has a total length of 2*numStages values. +* The state variables are updated after each block of data is processed; the coefficients are untouched. +* +* \par +* The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II. +* The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types. +* That is why the Direct Form I structure supports Q15 and Q31 data types. +* The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables d1 and d2. +* Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad. +* The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage. +* +* \par Instance Structure +* The coefficients and state variables for a filter are stored together in an instance data structure. +* A separate instance structure must be defined for each filter. +* Coefficient arrays may be shared among several instances while state variable arrays cannot be shared. +* +* \par Init Functions +* There is also an associated initialization function. +* The initialization function performs following operations: +* - Sets the values of the internal structure fields. +* - Zeros out the values in the state buffer. +* To do this manually without calling the init function, assign the follow subfields of the instance structure: +* numStages, pCoeffs, pState. Also set all of the values in pState to zero. +* +* \par +* Use of the initialization function is optional. +* However, if the initialization function is used, then the instance structure cannot be placed into a const data section. +* To place an instance structure into a const data section, the instance structure must be manually initialized. +* Set the values in the state buffer to zeros before static initialization. +* For example, to statically initialize the instance structure use +*
+*     arm_biquad_cascade_df2T_instance_f32 S1 = {numStages, pState, pCoeffs};
+* 
+* where numStages is the number of Biquad stages in the filter; pState is the address of the state buffer. +* pCoeffs is the address of the coefficient buffer; +* +*/ + +/** +* @addtogroup BiquadCascadeDF2T +* @{ +*/ + +/** +* @brief Processing function for the floating-point transposed direct form II Biquad cascade filter. +* @param[in] *S points to an instance of the filter data structure. +* @param[in] *pSrc points to the block of input data. +* @param[out] *pDst points to the block of output data +* @param[in] blockSize number of samples to process. +* @return none. +*/ + + +LOW_OPTIMIZATION_ENTER +void arm_biquad_cascade_df2T_f32( +const arm_biquad_cascade_df2T_instance_f32 * S, +float32_t * pSrc, +float32_t * pDst, +uint32_t blockSize) +{ + + float32_t *pIn = pSrc; /* source pointer */ + float32_t *pOut = pDst; /* destination pointer */ + float32_t *pState = S->pState; /* State pointer */ + float32_t *pCoeffs = S->pCoeffs; /* coefficient pointer */ + float32_t acc1; /* accumulator */ + float32_t b0, b1, b2, a1, a2; /* Filter coefficients */ + float32_t Xn1; /* temporary input */ + float32_t d1, d2; /* state variables */ + uint32_t sample, stage = S->numStages; /* loop counters */ + +#if defined(ARM_MATH_CM7) + + float32_t Xn2, Xn3, Xn4, Xn5, Xn6, Xn7, Xn8; /* Input State variables */ + float32_t Xn9, Xn10, Xn11, Xn12, Xn13, Xn14, Xn15, Xn16; + float32_t acc2, acc3, acc4, acc5, acc6, acc7; /* Simulates the accumulator */ + float32_t acc8, acc9, acc10, acc11, acc12, acc13, acc14, acc15, acc16; + + do + { + /* Reading the coefficients */ + b0 = pCoeffs[0]; + b1 = pCoeffs[1]; + b2 = pCoeffs[2]; + a1 = pCoeffs[3]; + /* Apply loop unrolling and compute 16 output values simultaneously. */ + sample = blockSize >> 4U; + a2 = pCoeffs[4]; + + /*Reading the state values */ + d1 = pState[0]; + d2 = pState[1]; + + pCoeffs += 5U; + + + /* First part of the processing with loop unrolling. Compute 16 outputs at a time. + ** a second loop below computes the remaining 1 to 15 samples. */ + while (sample > 0U) { + + /* y[n] = b0 * x[n] + d1 */ + /* d1 = b1 * x[n] + a1 * y[n] + d2 */ + /* d2 = b2 * x[n] + a2 * y[n] */ + + /* Read the first 2 inputs. 2 cycles */ + Xn1 = pIn[0 ]; + Xn2 = pIn[1 ]; + + /* Sample 1. 5 cycles */ + Xn3 = pIn[2 ]; + acc1 = b0 * Xn1 + d1; + + Xn4 = pIn[3 ]; + d1 = b1 * Xn1 + d2; + + Xn5 = pIn[4 ]; + d2 = b2 * Xn1; + + Xn6 = pIn[5 ]; + d1 += a1 * acc1; + + Xn7 = pIn[6 ]; + d2 += a2 * acc1; + + /* Sample 2. 5 cycles */ + Xn8 = pIn[7 ]; + acc2 = b0 * Xn2 + d1; + + Xn9 = pIn[8 ]; + d1 = b1 * Xn2 + d2; + + Xn10 = pIn[9 ]; + d2 = b2 * Xn2; + + Xn11 = pIn[10]; + d1 += a1 * acc2; + + Xn12 = pIn[11]; + d2 += a2 * acc2; + + /* Sample 3. 5 cycles */ + Xn13 = pIn[12]; + acc3 = b0 * Xn3 + d1; + + Xn14 = pIn[13]; + d1 = b1 * Xn3 + d2; + + Xn15 = pIn[14]; + d2 = b2 * Xn3; + + Xn16 = pIn[15]; + d1 += a1 * acc3; + + pIn += 16; + d2 += a2 * acc3; + + /* Sample 4. 5 cycles */ + acc4 = b0 * Xn4 + d1; + d1 = b1 * Xn4 + d2; + d2 = b2 * Xn4; + d1 += a1 * acc4; + d2 += a2 * acc4; + + /* Sample 5. 5 cycles */ + acc5 = b0 * Xn5 + d1; + d1 = b1 * Xn5 + d2; + d2 = b2 * Xn5; + d1 += a1 * acc5; + d2 += a2 * acc5; + + /* Sample 6. 5 cycles */ + acc6 = b0 * Xn6 + d1; + d1 = b1 * Xn6 + d2; + d2 = b2 * Xn6; + d1 += a1 * acc6; + d2 += a2 * acc6; + + /* Sample 7. 5 cycles */ + acc7 = b0 * Xn7 + d1; + d1 = b1 * Xn7 + d2; + d2 = b2 * Xn7; + d1 += a1 * acc7; + d2 += a2 * acc7; + + /* Sample 8. 5 cycles */ + acc8 = b0 * Xn8 + d1; + d1 = b1 * Xn8 + d2; + d2 = b2 * Xn8; + d1 += a1 * acc8; + d2 += a2 * acc8; + + /* Sample 9. 5 cycles */ + acc9 = b0 * Xn9 + d1; + d1 = b1 * Xn9 + d2; + d2 = b2 * Xn9; + d1 += a1 * acc9; + d2 += a2 * acc9; + + /* Sample 10. 5 cycles */ + acc10 = b0 * Xn10 + d1; + d1 = b1 * Xn10 + d2; + d2 = b2 * Xn10; + d1 += a1 * acc10; + d2 += a2 * acc10; + + /* Sample 11. 5 cycles */ + acc11 = b0 * Xn11 + d1; + d1 = b1 * Xn11 + d2; + d2 = b2 * Xn11; + d1 += a1 * acc11; + d2 += a2 * acc11; + + /* Sample 12. 5 cycles */ + acc12 = b0 * Xn12 + d1; + d1 = b1 * Xn12 + d2; + d2 = b2 * Xn12; + d1 += a1 * acc12; + d2 += a2 * acc12; + + /* Sample 13. 5 cycles */ + acc13 = b0 * Xn13 + d1; + d1 = b1 * Xn13 + d2; + d2 = b2 * Xn13; + + pOut[0 ] = acc1 ; + d1 += a1 * acc13; + + pOut[1 ] = acc2 ; + d2 += a2 * acc13; + + /* Sample 14. 5 cycles */ + pOut[2 ] = acc3 ; + acc14 = b0 * Xn14 + d1; + + pOut[3 ] = acc4 ; + d1 = b1 * Xn14 + d2; + + pOut[4 ] = acc5 ; + d2 = b2 * Xn14; + + pOut[5 ] = acc6 ; + d1 += a1 * acc14; + + pOut[6 ] = acc7 ; + d2 += a2 * acc14; + + /* Sample 15. 5 cycles */ + pOut[7 ] = acc8 ; + pOut[8 ] = acc9 ; + acc15 = b0 * Xn15 + d1; + + pOut[9 ] = acc10; + d1 = b1 * Xn15 + d2; + + pOut[10] = acc11; + d2 = b2 * Xn15; + + pOut[11] = acc12; + d1 += a1 * acc15; + + pOut[12] = acc13; + d2 += a2 * acc15; + + /* Sample 16. 5 cycles */ + pOut[13] = acc14; + acc16 = b0 * Xn16 + d1; + + pOut[14] = acc15; + d1 = b1 * Xn16 + d2; + + pOut[15] = acc16; + d2 = b2 * Xn16; + + sample--; + d1 += a1 * acc16; + + pOut += 16; + d2 += a2 * acc16; + } + + sample = blockSize & 0xFu; + while (sample > 0U) { + Xn1 = *pIn; + acc1 = b0 * Xn1 + d1; + + pIn++; + d1 = b1 * Xn1 + d2; + + *pOut = acc1; + d2 = b2 * Xn1; + + pOut++; + d1 += a1 * acc1; + + sample--; + d2 += a2 * acc1; + } + + /* Store the updated state variables back into the state array */ + pState[0] = d1; + /* The current stage input is given as the output to the next stage */ + pIn = pDst; + + pState[1] = d2; + /* decrement the loop counter */ + stage--; + + pState += 2U; + + /*Reset the output working pointer */ + pOut = pDst; + + } while (stage > 0U); + +#elif defined(ARM_MATH_CM0_FAMILY) + + /* Run the below code for Cortex-M0 */ + + do + { + /* Reading the coefficients */ + b0 = *pCoeffs++; + b1 = *pCoeffs++; + b2 = *pCoeffs++; + a1 = *pCoeffs++; + a2 = *pCoeffs++; + + /*Reading the state values */ + d1 = pState[0]; + d2 = pState[1]; + + + sample = blockSize; + + while (sample > 0U) + { + /* Read the input */ + Xn1 = *pIn++; + + /* y[n] = b0 * x[n] + d1 */ + acc1 = (b0 * Xn1) + d1; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = acc1; + + /* Every time after the output is computed state should be updated. */ + /* d1 = b1 * x[n] + a1 * y[n] + d2 */ + d1 = ((b1 * Xn1) + (a1 * acc1)) + d2; + + /* d2 = b2 * x[n] + a2 * y[n] */ + d2 = (b2 * Xn1) + (a2 * acc1); + + /* decrement the loop counter */ + sample--; + } + + /* Store the updated state variables back into the state array */ + *pState++ = d1; + *pState++ = d2; + + /* The current stage input is given as the output to the next stage */ + pIn = pDst; + + /*Reset the output working pointer */ + pOut = pDst; + + /* decrement the loop counter */ + stage--; + + } while (stage > 0U); + +#else + + float32_t Xn2, Xn3, Xn4; /* Input State variables */ + float32_t acc2, acc3, acc4; /* accumulator */ + + + float32_t p0, p1, p2, p3, p4, A1; + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + do + { + /* Reading the coefficients */ + b0 = *pCoeffs++; + b1 = *pCoeffs++; + b2 = *pCoeffs++; + a1 = *pCoeffs++; + a2 = *pCoeffs++; + + + /*Reading the state values */ + d1 = pState[0]; + d2 = pState[1]; + + /* Apply loop unrolling and compute 4 output values simultaneously. */ + sample = blockSize >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (sample > 0U) { + + /* y[n] = b0 * x[n] + d1 */ + /* d1 = b1 * x[n] + a1 * y[n] + d2 */ + /* d2 = b2 * x[n] + a2 * y[n] */ + + /* Read the four inputs */ + Xn1 = pIn[0]; + Xn2 = pIn[1]; + Xn3 = pIn[2]; + Xn4 = pIn[3]; + pIn += 4; + + p0 = b0 * Xn1; + p1 = b1 * Xn1; + acc1 = p0 + d1; + p0 = b0 * Xn2; + p3 = a1 * acc1; + p2 = b2 * Xn1; + A1 = p1 + p3; + p4 = a2 * acc1; + d1 = A1 + d2; + d2 = p2 + p4; + + p1 = b1 * Xn2; + acc2 = p0 + d1; + p0 = b0 * Xn3; + p3 = a1 * acc2; + p2 = b2 * Xn2; + A1 = p1 + p3; + p4 = a2 * acc2; + d1 = A1 + d2; + d2 = p2 + p4; + + p1 = b1 * Xn3; + acc3 = p0 + d1; + p0 = b0 * Xn4; + p3 = a1 * acc3; + p2 = b2 * Xn3; + A1 = p1 + p3; + p4 = a2 * acc3; + d1 = A1 + d2; + d2 = p2 + p4; + + acc4 = p0 + d1; + p1 = b1 * Xn4; + p3 = a1 * acc4; + p2 = b2 * Xn4; + A1 = p1 + p3; + p4 = a2 * acc4; + d1 = A1 + d2; + d2 = p2 + p4; + + pOut[0] = acc1; + pOut[1] = acc2; + pOut[2] = acc3; + pOut[3] = acc4; + pOut += 4; + + sample--; + } + + sample = blockSize & 0x3U; + while (sample > 0U) { + Xn1 = *pIn++; + + p0 = b0 * Xn1; + p1 = b1 * Xn1; + acc1 = p0 + d1; + p3 = a1 * acc1; + p2 = b2 * Xn1; + A1 = p1 + p3; + p4 = a2 * acc1; + d1 = A1 + d2; + d2 = p2 + p4; + + *pOut++ = acc1; + + sample--; + } + + /* Store the updated state variables back into the state array */ + *pState++ = d1; + *pState++ = d2; + + /* The current stage input is given as the output to the next stage */ + pIn = pDst; + + /*Reset the output working pointer */ + pOut = pDst; + + /* decrement the loop counter */ + stage--; + + } while (stage > 0U); + +#endif + +} +LOW_OPTIMIZATION_EXIT + +/** + * @} end of BiquadCascadeDF2T group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f64.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f64.c new file mode 100644 index 0000000..8f8a830 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f64.c @@ -0,0 +1,590 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_biquad_cascade_df2T_f64.c + * Description: Processing function for floating-point transposed direct form II Biquad cascade filter + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** +* @ingroup groupFilters +*/ + +/** +* @defgroup BiquadCascadeDF2T Biquad Cascade IIR Filters Using a Direct Form II Transposed Structure +* +* This set of functions implements arbitrary order recursive (IIR) filters using a transposed direct form II structure. +* The filters are implemented as a cascade of second order Biquad sections. +* These functions provide a slight memory savings as compared to the direct form I Biquad filter functions. +* Only floating-point data is supported. +* +* This function operate on blocks of input and output data and each call to the function +* processes blockSize samples through the filter. +* pSrc points to the array of input data and +* pDst points to the array of output data. +* Both arrays contain blockSize values. +* +* \par Algorithm +* Each Biquad stage implements a second order filter using the difference equation: +*
+*    y[n] = b0 * x[n] + d1
+*    d1 = b1 * x[n] + a1 * y[n] + d2
+*    d2 = b2 * x[n] + a2 * y[n]
+* 
+* where d1 and d2 represent the two state values. +* +* \par +* A Biquad filter using a transposed Direct Form II structure is shown below. +* \image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad" +* Coefficients b0, b1, and b2 multiply the input signal x[n] and are referred to as the feedforward coefficients. +* Coefficients a1 and a2 multiply the output signal y[n] and are referred to as the feedback coefficients. +* Pay careful attention to the sign of the feedback coefficients. +* Some design tools flip the sign of the feedback coefficients: +*
+*    y[n] = b0 * x[n] + d1;
+*    d1 = b1 * x[n] - a1 * y[n] + d2;
+*    d2 = b2 * x[n] - a2 * y[n];
+* 
+* In this case the feedback coefficients a1 and a2 must be negated when used with the CMSIS DSP Library. +* +* \par +* Higher order filters are realized as a cascade of second order sections. +* numStages refers to the number of second order stages used. +* For example, an 8th order filter would be realized with numStages=4 second order stages. +* A 9th order filter would be realized with numStages=5 second order stages with the +* coefficients for one of the stages configured as a first order filter (b2=0 and a2=0). +* +* \par +* pState points to the state variable array. +* Each Biquad stage has 2 state variables d1 and d2. +* The state variables are arranged in the pState array as: +*
+*     {d11, d12, d21, d22, ...}
+* 
+* where d1x refers to the state variables for the first Biquad and +* d2x refers to the state variables for the second Biquad. +* The state array has a total length of 2*numStages values. +* The state variables are updated after each block of data is processed; the coefficients are untouched. +* +* \par +* The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II. +* The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types. +* That is why the Direct Form I structure supports Q15 and Q31 data types. +* The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables d1 and d2. +* Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad. +* The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage. +* +* \par Instance Structure +* The coefficients and state variables for a filter are stored together in an instance data structure. +* A separate instance structure must be defined for each filter. +* Coefficient arrays may be shared among several instances while state variable arrays cannot be shared. +* +* \par Init Functions +* There is also an associated initialization function. +* The initialization function performs following operations: +* - Sets the values of the internal structure fields. +* - Zeros out the values in the state buffer. +* To do this manually without calling the init function, assign the follow subfields of the instance structure: +* numStages, pCoeffs, pState. Also set all of the values in pState to zero. +* +* \par +* Use of the initialization function is optional. +* However, if the initialization function is used, then the instance structure cannot be placed into a const data section. +* To place an instance structure into a const data section, the instance structure must be manually initialized. +* Set the values in the state buffer to zeros before static initialization. +* For example, to statically initialize the instance structure use +*
+*     arm_biquad_cascade_df2T_instance_f64 S1 = {numStages, pState, pCoeffs};
+* 
+* where numStages is the number of Biquad stages in the filter; pState is the address of the state buffer. +* pCoeffs is the address of the coefficient buffer; +* +*/ + +/** +* @addtogroup BiquadCascadeDF2T +* @{ +*/ + +/** +* @brief Processing function for the floating-point transposed direct form II Biquad cascade filter. +* @param[in] *S points to an instance of the filter data structure. +* @param[in] *pSrc points to the block of input data. +* @param[out] *pDst points to the block of output data +* @param[in] blockSize number of samples to process. +* @return none. +*/ + + +LOW_OPTIMIZATION_ENTER +void arm_biquad_cascade_df2T_f64( +const arm_biquad_cascade_df2T_instance_f64 * S, +float64_t * pSrc, +float64_t * pDst, +uint32_t blockSize) +{ + + float64_t *pIn = pSrc; /* source pointer */ + float64_t *pOut = pDst; /* destination pointer */ + float64_t *pState = S->pState; /* State pointer */ + float64_t *pCoeffs = S->pCoeffs; /* coefficient pointer */ + float64_t acc1; /* accumulator */ + float64_t b0, b1, b2, a1, a2; /* Filter coefficients */ + float64_t Xn1; /* temporary input */ + float64_t d1, d2; /* state variables */ + uint32_t sample, stage = S->numStages; /* loop counters */ + +#if defined(ARM_MATH_CM7) + + float64_t Xn2, Xn3, Xn4, Xn5, Xn6, Xn7, Xn8; /* Input State variables */ + float64_t Xn9, Xn10, Xn11, Xn12, Xn13, Xn14, Xn15, Xn16; + float64_t acc2, acc3, acc4, acc5, acc6, acc7; /* Simulates the accumulator */ + float64_t acc8, acc9, acc10, acc11, acc12, acc13, acc14, acc15, acc16; + + do + { + /* Reading the coefficients */ + b0 = pCoeffs[0]; + b1 = pCoeffs[1]; + b2 = pCoeffs[2]; + a1 = pCoeffs[3]; + /* Apply loop unrolling and compute 16 output values simultaneously. */ + sample = blockSize >> 4U; + a2 = pCoeffs[4]; + + /*Reading the state values */ + d1 = pState[0]; + d2 = pState[1]; + + pCoeffs += 5U; + + + /* First part of the processing with loop unrolling. Compute 16 outputs at a time. + ** a second loop below computes the remaining 1 to 15 samples. */ + while (sample > 0U) { + + /* y[n] = b0 * x[n] + d1 */ + /* d1 = b1 * x[n] + a1 * y[n] + d2 */ + /* d2 = b2 * x[n] + a2 * y[n] */ + + /* Read the first 2 inputs. 2 cycles */ + Xn1 = pIn[0 ]; + Xn2 = pIn[1 ]; + + /* Sample 1. 5 cycles */ + Xn3 = pIn[2 ]; + acc1 = b0 * Xn1 + d1; + + Xn4 = pIn[3 ]; + d1 = b1 * Xn1 + d2; + + Xn5 = pIn[4 ]; + d2 = b2 * Xn1; + + Xn6 = pIn[5 ]; + d1 += a1 * acc1; + + Xn7 = pIn[6 ]; + d2 += a2 * acc1; + + /* Sample 2. 5 cycles */ + Xn8 = pIn[7 ]; + acc2 = b0 * Xn2 + d1; + + Xn9 = pIn[8 ]; + d1 = b1 * Xn2 + d2; + + Xn10 = pIn[9 ]; + d2 = b2 * Xn2; + + Xn11 = pIn[10]; + d1 += a1 * acc2; + + Xn12 = pIn[11]; + d2 += a2 * acc2; + + /* Sample 3. 5 cycles */ + Xn13 = pIn[12]; + acc3 = b0 * Xn3 + d1; + + Xn14 = pIn[13]; + d1 = b1 * Xn3 + d2; + + Xn15 = pIn[14]; + d2 = b2 * Xn3; + + Xn16 = pIn[15]; + d1 += a1 * acc3; + + pIn += 16; + d2 += a2 * acc3; + + /* Sample 4. 5 cycles */ + acc4 = b0 * Xn4 + d1; + d1 = b1 * Xn4 + d2; + d2 = b2 * Xn4; + d1 += a1 * acc4; + d2 += a2 * acc4; + + /* Sample 5. 5 cycles */ + acc5 = b0 * Xn5 + d1; + d1 = b1 * Xn5 + d2; + d2 = b2 * Xn5; + d1 += a1 * acc5; + d2 += a2 * acc5; + + /* Sample 6. 5 cycles */ + acc6 = b0 * Xn6 + d1; + d1 = b1 * Xn6 + d2; + d2 = b2 * Xn6; + d1 += a1 * acc6; + d2 += a2 * acc6; + + /* Sample 7. 5 cycles */ + acc7 = b0 * Xn7 + d1; + d1 = b1 * Xn7 + d2; + d2 = b2 * Xn7; + d1 += a1 * acc7; + d2 += a2 * acc7; + + /* Sample 8. 5 cycles */ + acc8 = b0 * Xn8 + d1; + d1 = b1 * Xn8 + d2; + d2 = b2 * Xn8; + d1 += a1 * acc8; + d2 += a2 * acc8; + + /* Sample 9. 5 cycles */ + acc9 = b0 * Xn9 + d1; + d1 = b1 * Xn9 + d2; + d2 = b2 * Xn9; + d1 += a1 * acc9; + d2 += a2 * acc9; + + /* Sample 10. 5 cycles */ + acc10 = b0 * Xn10 + d1; + d1 = b1 * Xn10 + d2; + d2 = b2 * Xn10; + d1 += a1 * acc10; + d2 += a2 * acc10; + + /* Sample 11. 5 cycles */ + acc11 = b0 * Xn11 + d1; + d1 = b1 * Xn11 + d2; + d2 = b2 * Xn11; + d1 += a1 * acc11; + d2 += a2 * acc11; + + /* Sample 12. 5 cycles */ + acc12 = b0 * Xn12 + d1; + d1 = b1 * Xn12 + d2; + d2 = b2 * Xn12; + d1 += a1 * acc12; + d2 += a2 * acc12; + + /* Sample 13. 5 cycles */ + acc13 = b0 * Xn13 + d1; + d1 = b1 * Xn13 + d2; + d2 = b2 * Xn13; + + pOut[0 ] = acc1 ; + d1 += a1 * acc13; + + pOut[1 ] = acc2 ; + d2 += a2 * acc13; + + /* Sample 14. 5 cycles */ + pOut[2 ] = acc3 ; + acc14 = b0 * Xn14 + d1; + + pOut[3 ] = acc4 ; + d1 = b1 * Xn14 + d2; + + pOut[4 ] = acc5 ; + d2 = b2 * Xn14; + + pOut[5 ] = acc6 ; + d1 += a1 * acc14; + + pOut[6 ] = acc7 ; + d2 += a2 * acc14; + + /* Sample 15. 5 cycles */ + pOut[7 ] = acc8 ; + pOut[8 ] = acc9 ; + acc15 = b0 * Xn15 + d1; + + pOut[9 ] = acc10; + d1 = b1 * Xn15 + d2; + + pOut[10] = acc11; + d2 = b2 * Xn15; + + pOut[11] = acc12; + d1 += a1 * acc15; + + pOut[12] = acc13; + d2 += a2 * acc15; + + /* Sample 16. 5 cycles */ + pOut[13] = acc14; + acc16 = b0 * Xn16 + d1; + + pOut[14] = acc15; + d1 = b1 * Xn16 + d2; + + pOut[15] = acc16; + d2 = b2 * Xn16; + + sample--; + d1 += a1 * acc16; + + pOut += 16; + d2 += a2 * acc16; + } + + sample = blockSize & 0xFu; + while (sample > 0U) { + Xn1 = *pIn; + acc1 = b0 * Xn1 + d1; + + pIn++; + d1 = b1 * Xn1 + d2; + + *pOut = acc1; + d2 = b2 * Xn1; + + pOut++; + d1 += a1 * acc1; + + sample--; + d2 += a2 * acc1; + } + + /* Store the updated state variables back into the state array */ + pState[0] = d1; + /* The current stage input is given as the output to the next stage */ + pIn = pDst; + + pState[1] = d2; + /* decrement the loop counter */ + stage--; + + pState += 2U; + + /*Reset the output working pointer */ + pOut = pDst; + + } while (stage > 0U); + +#elif defined(ARM_MATH_CM0_FAMILY) + + /* Run the below code for Cortex-M0 */ + + do + { + /* Reading the coefficients */ + b0 = *pCoeffs++; + b1 = *pCoeffs++; + b2 = *pCoeffs++; + a1 = *pCoeffs++; + a2 = *pCoeffs++; + + /*Reading the state values */ + d1 = pState[0]; + d2 = pState[1]; + + + sample = blockSize; + + while (sample > 0U) + { + /* Read the input */ + Xn1 = *pIn++; + + /* y[n] = b0 * x[n] + d1 */ + acc1 = (b0 * Xn1) + d1; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = acc1; + + /* Every time after the output is computed state should be updated. */ + /* d1 = b1 * x[n] + a1 * y[n] + d2 */ + d1 = ((b1 * Xn1) + (a1 * acc1)) + d2; + + /* d2 = b2 * x[n] + a2 * y[n] */ + d2 = (b2 * Xn1) + (a2 * acc1); + + /* decrement the loop counter */ + sample--; + } + + /* Store the updated state variables back into the state array */ + *pState++ = d1; + *pState++ = d2; + + /* The current stage input is given as the output to the next stage */ + pIn = pDst; + + /*Reset the output working pointer */ + pOut = pDst; + + /* decrement the loop counter */ + stage--; + + } while (stage > 0U); + +#else + + float64_t Xn2, Xn3, Xn4; /* Input State variables */ + float64_t acc2, acc3, acc4; /* accumulator */ + + + float64_t p0, p1, p2, p3, p4, A1; + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + do + { + /* Reading the coefficients */ + b0 = *pCoeffs++; + b1 = *pCoeffs++; + b2 = *pCoeffs++; + a1 = *pCoeffs++; + a2 = *pCoeffs++; + + + /*Reading the state values */ + d1 = pState[0]; + d2 = pState[1]; + + /* Apply loop unrolling and compute 4 output values simultaneously. */ + sample = blockSize >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (sample > 0U) { + + /* y[n] = b0 * x[n] + d1 */ + /* d1 = b1 * x[n] + a1 * y[n] + d2 */ + /* d2 = b2 * x[n] + a2 * y[n] */ + + /* Read the four inputs */ + Xn1 = pIn[0]; + Xn2 = pIn[1]; + Xn3 = pIn[2]; + Xn4 = pIn[3]; + pIn += 4; + + p0 = b0 * Xn1; + p1 = b1 * Xn1; + acc1 = p0 + d1; + p0 = b0 * Xn2; + p3 = a1 * acc1; + p2 = b2 * Xn1; + A1 = p1 + p3; + p4 = a2 * acc1; + d1 = A1 + d2; + d2 = p2 + p4; + + p1 = b1 * Xn2; + acc2 = p0 + d1; + p0 = b0 * Xn3; + p3 = a1 * acc2; + p2 = b2 * Xn2; + A1 = p1 + p3; + p4 = a2 * acc2; + d1 = A1 + d2; + d2 = p2 + p4; + + p1 = b1 * Xn3; + acc3 = p0 + d1; + p0 = b0 * Xn4; + p3 = a1 * acc3; + p2 = b2 * Xn3; + A1 = p1 + p3; + p4 = a2 * acc3; + d1 = A1 + d2; + d2 = p2 + p4; + + acc4 = p0 + d1; + p1 = b1 * Xn4; + p3 = a1 * acc4; + p2 = b2 * Xn4; + A1 = p1 + p3; + p4 = a2 * acc4; + d1 = A1 + d2; + d2 = p2 + p4; + + pOut[0] = acc1; + pOut[1] = acc2; + pOut[2] = acc3; + pOut[3] = acc4; + pOut += 4; + + sample--; + } + + sample = blockSize & 0x3U; + while (sample > 0U) { + Xn1 = *pIn++; + + p0 = b0 * Xn1; + p1 = b1 * Xn1; + acc1 = p0 + d1; + p3 = a1 * acc1; + p2 = b2 * Xn1; + A1 = p1 + p3; + p4 = a2 * acc1; + d1 = A1 + d2; + d2 = p2 + p4; + + *pOut++ = acc1; + + sample--; + } + + /* Store the updated state variables back into the state array */ + *pState++ = d1; + *pState++ = d2; + + /* The current stage input is given as the output to the next stage */ + pIn = pDst; + + /*Reset the output working pointer */ + pOut = pDst; + + /* decrement the loop counter */ + stage--; + + } while (stage > 0U); + +#endif + +} +LOW_OPTIMIZATION_EXIT + +/** + * @} end of BiquadCascadeDF2T group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f32.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f32.c new file mode 100644 index 0000000..6dfc985 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f32.c @@ -0,0 +1,89 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_biquad_cascade_df2T_init_f32.c + * Description: Initialization function for floating-point transposed direct form II Biquad cascade filter + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup BiquadCascadeDF2T + * @{ + */ + +/** + * @brief Initialization function for the floating-point transposed direct form II Biquad cascade filter. + * @param[in,out] *S points to an instance of the filter data structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] *pCoeffs points to the filter coefficients. + * @param[in] *pState points to the state buffer. + * @return none + * + * Coefficient and State Ordering: + * \par + * The coefficients are stored in the array pCoeffs in the following order: + *
+ *     {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
+ * 
+ * + * \par + * where b1x and a1x are the coefficients for the first stage, + * b2x and a2x are the coefficients for the second stage, + * and so on. The pCoeffs array contains a total of 5*numStages values. + * + * \par + * The pState is a pointer to state array. + * Each Biquad stage has 2 state variables d1, and d2. + * The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on. + * The state array has a total length of 2*numStages values. + * The state variables are updated after each block of data is processed; the coefficients are untouched. + */ + +void arm_biquad_cascade_df2T_init_f32( + arm_biquad_cascade_df2T_instance_f32 * S, + uint8_t numStages, + float32_t * pCoeffs, + float32_t * pState) +{ + /* Assign filter stages */ + S->numStages = numStages; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always 2 * numStages */ + memset(pState, 0, (2U * (uint32_t) numStages) * sizeof(float32_t)); + + /* Assign state pointer */ + S->pState = pState; +} + +/** + * @} end of BiquadCascadeDF2T group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f64.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f64.c new file mode 100644 index 0000000..8141da5 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f64.c @@ -0,0 +1,89 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_biquad_cascade_df2T_init_f64.c + * Description: Initialization function for floating-point transposed direct form II Biquad cascade filter + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup BiquadCascadeDF2T + * @{ + */ + +/** + * @brief Initialization function for the floating-point transposed direct form II Biquad cascade filter. + * @param[in,out] *S points to an instance of the filter data structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] *pCoeffs points to the filter coefficients. + * @param[in] *pState points to the state buffer. + * @return none + * + * Coefficient and State Ordering: + * \par + * The coefficients are stored in the array pCoeffs in the following order: + *
+ *     {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
+ * 
+ * + * \par + * where b1x and a1x are the coefficients for the first stage, + * b2x and a2x are the coefficients for the second stage, + * and so on. The pCoeffs array contains a total of 5*numStages values. + * + * \par + * The pState is a pointer to state array. + * Each Biquad stage has 2 state variables d1, and d2. + * The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on. + * The state array has a total length of 2*numStages values. + * The state variables are updated after each block of data is processed; the coefficients are untouched. + */ + +void arm_biquad_cascade_df2T_init_f64( + arm_biquad_cascade_df2T_instance_f64 * S, + uint8_t numStages, + float64_t * pCoeffs, + float64_t * pState) +{ + /* Assign filter stages */ + S->numStages = numStages; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always 2 * numStages */ + memset(pState, 0, (2U * (uint32_t) numStages) * sizeof(float64_t)); + + /* Assign state pointer */ + S->pState = pState; +} + +/** + * @} end of BiquadCascadeDF2T group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f32.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f32.c new file mode 100644 index 0000000..36084e5 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f32.c @@ -0,0 +1,670 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_biquad_cascade_stereo_df2T_f32.c + * Description: Processing function for floating-point transposed direct form II Biquad cascade filter. 2 channels + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** +* @ingroup groupFilters +*/ + +/** +* @defgroup BiquadCascadeDF2T Biquad Cascade IIR Filters Using a Direct Form II Transposed Structure +* +* This set of functions implements arbitrary order recursive (IIR) filters using a transposed direct form II structure. +* The filters are implemented as a cascade of second order Biquad sections. +* These functions provide a slight memory savings as compared to the direct form I Biquad filter functions. +* Only floating-point data is supported. +* +* This function operate on blocks of input and output data and each call to the function +* processes blockSize samples through the filter. +* pSrc points to the array of input data and +* pDst points to the array of output data. +* Both arrays contain blockSize values. +* +* \par Algorithm +* Each Biquad stage implements a second order filter using the difference equation: +*
+*    y[n] = b0 * x[n] + d1
+*    d1 = b1 * x[n] + a1 * y[n] + d2
+*    d2 = b2 * x[n] + a2 * y[n]
+* 
+* where d1 and d2 represent the two state values. +* +* \par +* A Biquad filter using a transposed Direct Form II structure is shown below. +* \image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad" +* Coefficients b0, b1, and b2 multiply the input signal x[n] and are referred to as the feedforward coefficients. +* Coefficients a1 and a2 multiply the output signal y[n] and are referred to as the feedback coefficients. +* Pay careful attention to the sign of the feedback coefficients. +* Some design tools flip the sign of the feedback coefficients: +*
+*    y[n] = b0 * x[n] + d1;
+*    d1 = b1 * x[n] - a1 * y[n] + d2;
+*    d2 = b2 * x[n] - a2 * y[n];
+* 
+* In this case the feedback coefficients a1 and a2 must be negated when used with the CMSIS DSP Library. +* +* \par +* Higher order filters are realized as a cascade of second order sections. +* numStages refers to the number of second order stages used. +* For example, an 8th order filter would be realized with numStages=4 second order stages. +* A 9th order filter would be realized with numStages=5 second order stages with the +* coefficients for one of the stages configured as a first order filter (b2=0 and a2=0). +* +* \par +* pState points to the state variable array. +* Each Biquad stage has 2 state variables d1 and d2. +* The state variables are arranged in the pState array as: +*
+*     {d11, d12, d21, d22, ...}
+* 
+* where d1x refers to the state variables for the first Biquad and +* d2x refers to the state variables for the second Biquad. +* The state array has a total length of 2*numStages values. +* The state variables are updated after each block of data is processed; the coefficients are untouched. +* +* \par +* The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II. +* The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types. +* That is why the Direct Form I structure supports Q15 and Q31 data types. +* The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables d1 and d2. +* Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad. +* The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage. +* +* \par Instance Structure +* The coefficients and state variables for a filter are stored together in an instance data structure. +* A separate instance structure must be defined for each filter. +* Coefficient arrays may be shared among several instances while state variable arrays cannot be shared. +* +* \par Init Functions +* There is also an associated initialization function. +* The initialization function performs following operations: +* - Sets the values of the internal structure fields. +* - Zeros out the values in the state buffer. +* To do this manually without calling the init function, assign the follow subfields of the instance structure: +* numStages, pCoeffs, pState. Also set all of the values in pState to zero. +* +* \par +* Use of the initialization function is optional. +* However, if the initialization function is used, then the instance structure cannot be placed into a const data section. +* To place an instance structure into a const data section, the instance structure must be manually initialized. +* Set the values in the state buffer to zeros before static initialization. +* For example, to statically initialize the instance structure use +*
+*     arm_biquad_cascade_df2T_instance_f32 S1 = {numStages, pState, pCoeffs};
+* 
+* where numStages is the number of Biquad stages in the filter; pState is the address of the state buffer. +* pCoeffs is the address of the coefficient buffer; +* +*/ + +/** +* @addtogroup BiquadCascadeDF2T +* @{ +*/ + +/** +* @brief Processing function for the floating-point transposed direct form II Biquad cascade filter. +* @param[in] *S points to an instance of the filter data structure. +* @param[in] *pSrc points to the block of input data. +* @param[out] *pDst points to the block of output data +* @param[in] blockSize number of samples to process. +* @return none. +*/ + + +LOW_OPTIMIZATION_ENTER +void arm_biquad_cascade_stereo_df2T_f32( +const arm_biquad_cascade_stereo_df2T_instance_f32 * S, +float32_t * pSrc, +float32_t * pDst, +uint32_t blockSize) +{ + + float32_t *pIn = pSrc; /* source pointer */ + float32_t *pOut = pDst; /* destination pointer */ + float32_t *pState = S->pState; /* State pointer */ + float32_t *pCoeffs = S->pCoeffs; /* coefficient pointer */ + float32_t acc1a, acc1b; /* accumulator */ + float32_t b0, b1, b2, a1, a2; /* Filter coefficients */ + float32_t Xn1a, Xn1b; /* temporary input */ + float32_t d1a, d2a, d1b, d2b; /* state variables */ + uint32_t sample, stage = S->numStages; /* loop counters */ + +#if defined(ARM_MATH_CM7) + + float32_t Xn2a, Xn3a, Xn4a, Xn5a, Xn6a, Xn7a, Xn8a; /* Input State variables */ + float32_t Xn2b, Xn3b, Xn4b, Xn5b, Xn6b, Xn7b, Xn8b; /* Input State variables */ + float32_t acc2a, acc3a, acc4a, acc5a, acc6a, acc7a, acc8a; /* Simulates the accumulator */ + float32_t acc2b, acc3b, acc4b, acc5b, acc6b, acc7b, acc8b; /* Simulates the accumulator */ + + do + { + /* Reading the coefficients */ + b0 = pCoeffs[0]; + b1 = pCoeffs[1]; + b2 = pCoeffs[2]; + a1 = pCoeffs[3]; + /* Apply loop unrolling and compute 8 output values simultaneously. */ + sample = blockSize >> 3U; + a2 = pCoeffs[4]; + + /*Reading the state values */ + d1a = pState[0]; + d2a = pState[1]; + d1b = pState[2]; + d2b = pState[3]; + + pCoeffs += 5U; + + /* First part of the processing with loop unrolling. Compute 8 outputs at a time. + ** a second loop below computes the remaining 1 to 7 samples. */ + while (sample > 0U) { + + /* y[n] = b0 * x[n] + d1 */ + /* d1 = b1 * x[n] + a1 * y[n] + d2 */ + /* d2 = b2 * x[n] + a2 * y[n] */ + + /* Read the first 2 inputs. 2 cycles */ + Xn1a = pIn[0 ]; + Xn1b = pIn[1 ]; + + /* Sample 1. 5 cycles */ + Xn2a = pIn[2 ]; + acc1a = b0 * Xn1a + d1a; + + Xn2b = pIn[3 ]; + d1a = b1 * Xn1a + d2a; + + Xn3a = pIn[4 ]; + d2a = b2 * Xn1a; + + Xn3b = pIn[5 ]; + d1a += a1 * acc1a; + + Xn4a = pIn[6 ]; + d2a += a2 * acc1a; + + /* Sample 2. 5 cycles */ + Xn4b = pIn[7 ]; + acc1b = b0 * Xn1b + d1b; + + Xn5a = pIn[8 ]; + d1b = b1 * Xn1b + d2b; + + Xn5b = pIn[9 ]; + d2b = b2 * Xn1b; + + Xn6a = pIn[10]; + d1b += a1 * acc1b; + + Xn6b = pIn[11]; + d2b += a2 * acc1b; + + /* Sample 3. 5 cycles */ + Xn7a = pIn[12]; + acc2a = b0 * Xn2a + d1a; + + Xn7b = pIn[13]; + d1a = b1 * Xn2a + d2a; + + Xn8a = pIn[14]; + d2a = b2 * Xn2a; + + Xn8b = pIn[15]; + d1a += a1 * acc2a; + + pIn += 16; + d2a += a2 * acc2a; + + /* Sample 4. 5 cycles */ + acc2b = b0 * Xn2b + d1b; + d1b = b1 * Xn2b + d2b; + d2b = b2 * Xn2b; + d1b += a1 * acc2b; + d2b += a2 * acc2b; + + /* Sample 5. 5 cycles */ + acc3a = b0 * Xn3a + d1a; + d1a = b1 * Xn3a + d2a; + d2a = b2 * Xn3a; + d1a += a1 * acc3a; + d2a += a2 * acc3a; + + /* Sample 6. 5 cycles */ + acc3b = b0 * Xn3b + d1b; + d1b = b1 * Xn3b + d2b; + d2b = b2 * Xn3b; + d1b += a1 * acc3b; + d2b += a2 * acc3b; + + /* Sample 7. 5 cycles */ + acc4a = b0 * Xn4a + d1a; + d1a = b1 * Xn4a + d2a; + d2a = b2 * Xn4a; + d1a += a1 * acc4a; + d2a += a2 * acc4a; + + /* Sample 8. 5 cycles */ + acc4b = b0 * Xn4b + d1b; + d1b = b1 * Xn4b + d2b; + d2b = b2 * Xn4b; + d1b += a1 * acc4b; + d2b += a2 * acc4b; + + /* Sample 9. 5 cycles */ + acc5a = b0 * Xn5a + d1a; + d1a = b1 * Xn5a + d2a; + d2a = b2 * Xn5a; + d1a += a1 * acc5a; + d2a += a2 * acc5a; + + /* Sample 10. 5 cycles */ + acc5b = b0 * Xn5b + d1b; + d1b = b1 * Xn5b + d2b; + d2b = b2 * Xn5b; + d1b += a1 * acc5b; + d2b += a2 * acc5b; + + /* Sample 11. 5 cycles */ + acc6a = b0 * Xn6a + d1a; + d1a = b1 * Xn6a + d2a; + d2a = b2 * Xn6a; + d1a += a1 * acc6a; + d2a += a2 * acc6a; + + /* Sample 12. 5 cycles */ + acc6b = b0 * Xn6b + d1b; + d1b = b1 * Xn6b + d2b; + d2b = b2 * Xn6b; + d1b += a1 * acc6b; + d2b += a2 * acc6b; + + /* Sample 13. 5 cycles */ + acc7a = b0 * Xn7a + d1a; + d1a = b1 * Xn7a + d2a; + + pOut[0 ] = acc1a ; + d2a = b2 * Xn7a; + + pOut[1 ] = acc1b ; + d1a += a1 * acc7a; + + pOut[2 ] = acc2a ; + d2a += a2 * acc7a; + + /* Sample 14. 5 cycles */ + pOut[3 ] = acc2b ; + acc7b = b0 * Xn7b + d1b; + + pOut[4 ] = acc3a ; + d1b = b1 * Xn7b + d2b; + + pOut[5 ] = acc3b ; + d2b = b2 * Xn7b; + + pOut[6 ] = acc4a ; + d1b += a1 * acc7b; + + pOut[7 ] = acc4b ; + d2b += a2 * acc7b; + + /* Sample 15. 5 cycles */ + pOut[8 ] = acc5a ; + acc8a = b0 * Xn8a + d1a; + + pOut[9 ] = acc5b; + d1a = b1 * Xn8a + d2a; + + pOut[10] = acc6a; + d2a = b2 * Xn8a; + + pOut[11] = acc6b; + d1a += a1 * acc8a; + + pOut[12] = acc7a; + d2a += a2 * acc8a; + + /* Sample 16. 5 cycles */ + pOut[13] = acc7b; + acc8b = b0 * Xn8b + d1b; + + pOut[14] = acc8a; + d1b = b1 * Xn8b + d2b; + + pOut[15] = acc8b; + d2b = b2 * Xn8b; + + sample--; + d1b += a1 * acc8b; + + pOut += 16; + d2b += a2 * acc8b; + } + + sample = blockSize & 0x7U; + while (sample > 0U) { + /* Read the input */ + Xn1a = *pIn++; //Channel a + Xn1b = *pIn++; //Channel b + + /* y[n] = b0 * x[n] + d1 */ + acc1a = (b0 * Xn1a) + d1a; + acc1b = (b0 * Xn1b) + d1b; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = acc1a; + *pOut++ = acc1b; + + /* Every time after the output is computed state should be updated. */ + /* d1 = b1 * x[n] + a1 * y[n] + d2 */ + d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a; + d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b; + + /* d2 = b2 * x[n] + a2 * y[n] */ + d2a = (b2 * Xn1a) + (a2 * acc1a); + d2b = (b2 * Xn1b) + (a2 * acc1b); + + sample--; + } + + /* Store the updated state variables back into the state array */ + pState[0] = d1a; + pState[1] = d2a; + + pState[2] = d1b; + pState[3] = d2b; + + /* The current stage input is given as the output to the next stage */ + pIn = pDst; + /* decrement the loop counter */ + stage--; + + pState += 4U; + /*Reset the output working pointer */ + pOut = pDst; + + } while (stage > 0U); + +#elif defined(ARM_MATH_CM0_FAMILY) + + /* Run the below code for Cortex-M0 */ + + do + { + /* Reading the coefficients */ + b0 = *pCoeffs++; + b1 = *pCoeffs++; + b2 = *pCoeffs++; + a1 = *pCoeffs++; + a2 = *pCoeffs++; + + /*Reading the state values */ + d1a = pState[0]; + d2a = pState[1]; + d1b = pState[2]; + d2b = pState[3]; + + + sample = blockSize; + + while (sample > 0U) + { + /* Read the input */ + Xn1a = *pIn++; //Channel a + Xn1b = *pIn++; //Channel b + + /* y[n] = b0 * x[n] + d1 */ + acc1a = (b0 * Xn1a) + d1a; + acc1b = (b0 * Xn1b) + d1b; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = acc1a; + *pOut++ = acc1b; + + /* Every time after the output is computed state should be updated. */ + /* d1 = b1 * x[n] + a1 * y[n] + d2 */ + d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a; + d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b; + + /* d2 = b2 * x[n] + a2 * y[n] */ + d2a = (b2 * Xn1a) + (a2 * acc1a); + d2b = (b2 * Xn1b) + (a2 * acc1b); + + /* decrement the loop counter */ + sample--; + } + + /* Store the updated state variables back into the state array */ + *pState++ = d1a; + *pState++ = d2a; + *pState++ = d1b; + *pState++ = d2b; + + /* The current stage input is given as the output to the next stage */ + pIn = pDst; + + /*Reset the output working pointer */ + pOut = pDst; + + /* decrement the loop counter */ + stage--; + + } while (stage > 0U); + +#else + + float32_t Xn2a, Xn3a, Xn4a; /* Input State variables */ + float32_t Xn2b, Xn3b, Xn4b; /* Input State variables */ + float32_t acc2a, acc3a, acc4a; /* accumulator */ + float32_t acc2b, acc3b, acc4b; /* accumulator */ + float32_t p0a, p1a, p2a, p3a, p4a, A1a; + float32_t p0b, p1b, p2b, p3b, p4b, A1b; + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + do + { + /* Reading the coefficients */ + b0 = *pCoeffs++; + b1 = *pCoeffs++; + b2 = *pCoeffs++; + a1 = *pCoeffs++; + a2 = *pCoeffs++; + + /*Reading the state values */ + d1a = pState[0]; + d2a = pState[1]; + d1b = pState[2]; + d2b = pState[3]; + + /* Apply loop unrolling and compute 4 output values simultaneously. */ + sample = blockSize >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (sample > 0U) { + + /* y[n] = b0 * x[n] + d1 */ + /* d1 = b1 * x[n] + a1 * y[n] + d2 */ + /* d2 = b2 * x[n] + a2 * y[n] */ + + /* Read the four inputs */ + Xn1a = pIn[0]; + Xn1b = pIn[1]; + Xn2a = pIn[2]; + Xn2b = pIn[3]; + Xn3a = pIn[4]; + Xn3b = pIn[5]; + Xn4a = pIn[6]; + Xn4b = pIn[7]; + pIn += 8; + + p0a = b0 * Xn1a; + p0b = b0 * Xn1b; + p1a = b1 * Xn1a; + p1b = b1 * Xn1b; + acc1a = p0a + d1a; + acc1b = p0b + d1b; + p0a = b0 * Xn2a; + p0b = b0 * Xn2b; + p3a = a1 * acc1a; + p3b = a1 * acc1b; + p2a = b2 * Xn1a; + p2b = b2 * Xn1b; + A1a = p1a + p3a; + A1b = p1b + p3b; + p4a = a2 * acc1a; + p4b = a2 * acc1b; + d1a = A1a + d2a; + d1b = A1b + d2b; + d2a = p2a + p4a; + d2b = p2b + p4b; + + p1a = b1 * Xn2a; + p1b = b1 * Xn2b; + acc2a = p0a + d1a; + acc2b = p0b + d1b; + p0a = b0 * Xn3a; + p0b = b0 * Xn3b; + p3a = a1 * acc2a; + p3b = a1 * acc2b; + p2a = b2 * Xn2a; + p2b = b2 * Xn2b; + A1a = p1a + p3a; + A1b = p1b + p3b; + p4a = a2 * acc2a; + p4b = a2 * acc2b; + d1a = A1a + d2a; + d1b = A1b + d2b; + d2a = p2a + p4a; + d2b = p2b + p4b; + + p1a = b1 * Xn3a; + p1b = b1 * Xn3b; + acc3a = p0a + d1a; + acc3b = p0b + d1b; + p0a = b0 * Xn4a; + p0b = b0 * Xn4b; + p3a = a1 * acc3a; + p3b = a1 * acc3b; + p2a = b2 * Xn3a; + p2b = b2 * Xn3b; + A1a = p1a + p3a; + A1b = p1b + p3b; + p4a = a2 * acc3a; + p4b = a2 * acc3b; + d1a = A1a + d2a; + d1b = A1b + d2b; + d2a = p2a + p4a; + d2b = p2b + p4b; + + acc4a = p0a + d1a; + acc4b = p0b + d1b; + p1a = b1 * Xn4a; + p1b = b1 * Xn4b; + p3a = a1 * acc4a; + p3b = a1 * acc4b; + p2a = b2 * Xn4a; + p2b = b2 * Xn4b; + A1a = p1a + p3a; + A1b = p1b + p3b; + p4a = a2 * acc4a; + p4b = a2 * acc4b; + d1a = A1a + d2a; + d1b = A1b + d2b; + d2a = p2a + p4a; + d2b = p2b + p4b; + + pOut[0] = acc1a; + pOut[1] = acc1b; + pOut[2] = acc2a; + pOut[3] = acc2b; + pOut[4] = acc3a; + pOut[5] = acc3b; + pOut[6] = acc4a; + pOut[7] = acc4b; + pOut += 8; + + sample--; + } + + sample = blockSize & 0x3U; + while (sample > 0U) { + Xn1a = *pIn++; + Xn1b = *pIn++; + + p0a = b0 * Xn1a; + p0b = b0 * Xn1b; + p1a = b1 * Xn1a; + p1b = b1 * Xn1b; + acc1a = p0a + d1a; + acc1b = p0b + d1b; + p3a = a1 * acc1a; + p3b = a1 * acc1b; + p2a = b2 * Xn1a; + p2b = b2 * Xn1b; + A1a = p1a + p3a; + A1b = p1b + p3b; + p4a = a2 * acc1a; + p4b = a2 * acc1b; + d1a = A1a + d2a; + d1b = A1b + d2b; + d2a = p2a + p4a; + d2b = p2b + p4b; + + *pOut++ = acc1a; + *pOut++ = acc1b; + + sample--; + } + + /* Store the updated state variables back into the state array */ + *pState++ = d1a; + *pState++ = d2a; + *pState++ = d1b; + *pState++ = d2b; + + /* The current stage input is given as the output to the next stage */ + pIn = pDst; + + /*Reset the output working pointer */ + pOut = pDst; + + /* decrement the loop counter */ + stage--; + + } while (stage > 0U); + +#endif + +} +LOW_OPTIMIZATION_EXIT + +/** + * @} end of BiquadCascadeDF2T group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_init_f32.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_init_f32.c new file mode 100644 index 0000000..b847c6e --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_init_f32.c @@ -0,0 +1,89 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_biquad_cascade_stereo_df2T_init_f32.c + * Description: Initialization function for floating-point transposed direct form II Biquad cascade filter + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup BiquadCascadeDF2T + * @{ + */ + +/** + * @brief Initialization function for the floating-point transposed direct form II Biquad cascade filter. + * @param[in,out] *S points to an instance of the filter data structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] *pCoeffs points to the filter coefficients. + * @param[in] *pState points to the state buffer. + * @return none + * + * Coefficient and State Ordering: + * \par + * The coefficients are stored in the array pCoeffs in the following order: + *
+ *     {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
+ * 
+ * + * \par + * where b1x and a1x are the coefficients for the first stage, + * b2x and a2x are the coefficients for the second stage, + * and so on. The pCoeffs array contains a total of 5*numStages values. + * + * \par + * The pState is a pointer to state array. + * Each Biquad stage has 2 state variables d1, and d2 for each channel. + * The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on. + * The state array has a total length of 2*numStages values. + * The state variables are updated after each block of data is processed; the coefficients are untouched. + */ + +void arm_biquad_cascade_stereo_df2T_init_f32( + arm_biquad_cascade_stereo_df2T_instance_f32 * S, + uint8_t numStages, + float32_t * pCoeffs, + float32_t * pState) +{ + /* Assign filter stages */ + S->numStages = numStages; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always 4 * numStages */ + memset(pState, 0, (4U * (uint32_t) numStages) * sizeof(float32_t)); + + /* Assign state pointer */ + S->pState = pState; +} + +/** + * @} end of BiquadCascadeDF2T group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_f32.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_f32.c new file mode 100644 index 0000000..906f7ab --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_f32.c @@ -0,0 +1,635 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_conv_f32.c + * Description: Convolution of floating-point sequences + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup Conv Convolution + * + * Convolution is a mathematical operation that operates on two finite length vectors to generate a finite length output vector. + * Convolution is similar to correlation and is frequently used in filtering and data analysis. + * The CMSIS DSP library contains functions for convolving Q7, Q15, Q31, and floating-point data types. + * The library also provides fast versions of the Q15 and Q31 functions on Cortex-M4 and Cortex-M3. + * + * \par Algorithm + * Let a[n] and b[n] be sequences of length srcALen and srcBLen samples respectively. + * Then the convolution + * + *
+ *                   c[n] = a[n] * b[n]
+ * 
+ * + * \par + * is defined as + * \image html ConvolutionEquation.gif + * \par + * Note that c[n] is of length srcALen + srcBLen - 1 and is defined over the interval n=0, 1, 2, ..., srcALen + srcBLen - 2. + * pSrcA points to the first input vector of length srcALen and + * pSrcB points to the second input vector of length srcBLen. + * The output result is written to pDst and the calling function must allocate srcALen+srcBLen-1 words for the result. + * + * \par + * Conceptually, when two signals a[n] and b[n] are convolved, + * the signal b[n] slides over a[n]. + * For each offset \c n, the overlapping portions of a[n] and b[n] are multiplied and summed together. + * + * \par + * Note that convolution is a commutative operation: + * + *
+ *                   a[n] * b[n] = b[n] * a[n].
+ * 
+ * + * \par + * This means that switching the A and B arguments to the convolution functions has no effect. + * + * Fixed-Point Behavior + * + * \par + * Convolution requires summing up a large number of intermediate products. + * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation. + * Refer to the function specific documentation below for further details of the particular algorithm used. + * + * + * Fast Versions + * + * \par + * Fast versions are supported for Q31 and Q15. Cycles for Fast versions are less compared to Q31 and Q15 of conv and the design requires + * the input signals should be scaled down to avoid intermediate overflows. + * + * + * Opt Versions + * + * \par + * Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation. + * These versions are optimised in cycles and consumes more memory(Scratch memory) compared to Q15 and Q7 versions + */ + +/** + * @addtogroup Conv + * @{ + */ + +/** + * @brief Convolution of floating-point sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. + * @return none. + */ + +void arm_conv_f32( + float32_t * pSrcA, + uint32_t srcALen, + float32_t * pSrcB, + uint32_t srcBLen, + float32_t * pDst) +{ + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + float32_t *pIn1; /* inputA pointer */ + float32_t *pIn2; /* inputB pointer */ + float32_t *pOut = pDst; /* output pointer */ + float32_t *px; /* Intermediate inputA pointer */ + float32_t *py; /* Intermediate inputB pointer */ + float32_t *pSrc1, *pSrc2; /* Intermediate pointers */ + float32_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ + float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counters */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* The algorithm is implemented in three stages. + The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1U; + blockSize2 = srcALen - (srcBLen - 1U); + blockSize3 = blockSize1; + + /* -------------------------- + * initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1U; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first stage starts here */ + while (blockSize1 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* x[0] * y[srcBLen - 1] */ + sum += *px++ * *py--; + + /* x[1] * y[srcBLen - 2] */ + sum += *px++ * *py--; + + /* x[2] * y[srcBLen - 3] */ + sum += *px++ * *py--; + + /* x[3] * y[srcBLen - 4] */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum; + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pIn2 + count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + py = pSrc2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0U; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if (srcBLen >= 4U) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2U; + + while (blkCnt > 0U) + { + /* Set all accumulators to zero */ + acc0 = 0.0f; + acc1 = 0.0f; + acc2 = 0.0f; + acc3 = 0.0f; + + /* read x[0], x[1], x[2] samples */ + x0 = *(px++); + x1 = *(px++); + x2 = *(px++); + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read y[srcBLen - 1] sample */ + c0 = *(py--); + + /* Read x[3] sample */ + x3 = *(px); + + /* Perform the multiply-accumulate */ + /* acc0 += x[0] * y[srcBLen - 1] */ + acc0 += x0 * c0; + + /* acc1 += x[1] * y[srcBLen - 1] */ + acc1 += x1 * c0; + + /* acc2 += x[2] * y[srcBLen - 1] */ + acc2 += x2 * c0; + + /* acc3 += x[3] * y[srcBLen - 1] */ + acc3 += x3 * c0; + + /* Read y[srcBLen - 2] sample */ + c0 = *(py--); + + /* Read x[4] sample */ + x0 = *(px + 1U); + + /* Perform the multiply-accumulate */ + /* acc0 += x[1] * y[srcBLen - 2] */ + acc0 += x1 * c0; + /* acc1 += x[2] * y[srcBLen - 2] */ + acc1 += x2 * c0; + /* acc2 += x[3] * y[srcBLen - 2] */ + acc2 += x3 * c0; + /* acc3 += x[4] * y[srcBLen - 2] */ + acc3 += x0 * c0; + + /* Read y[srcBLen - 3] sample */ + c0 = *(py--); + + /* Read x[5] sample */ + x1 = *(px + 2U); + + /* Perform the multiply-accumulates */ + /* acc0 += x[2] * y[srcBLen - 3] */ + acc0 += x2 * c0; + /* acc1 += x[3] * y[srcBLen - 2] */ + acc1 += x3 * c0; + /* acc2 += x[4] * y[srcBLen - 2] */ + acc2 += x0 * c0; + /* acc3 += x[5] * y[srcBLen - 2] */ + acc3 += x1 * c0; + + /* Read y[srcBLen - 4] sample */ + c0 = *(py--); + + /* Read x[6] sample */ + x2 = *(px + 3U); + px += 4U; + + /* Perform the multiply-accumulates */ + /* acc0 += x[3] * y[srcBLen - 4] */ + acc0 += x3 * c0; + /* acc1 += x[4] * y[srcBLen - 4] */ + acc1 += x0 * c0; + /* acc2 += x[5] * y[srcBLen - 4] */ + acc2 += x1 * c0; + /* acc3 += x[6] * y[srcBLen - 4] */ + acc3 += x2 * c0; + + + } while (--k); + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Read y[srcBLen - 5] sample */ + c0 = *(py--); + + /* Read x[7] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[srcBLen - 5] */ + acc0 += x0 * c0; + /* acc1 += x[5] * y[srcBLen - 5] */ + acc1 += x1 * c0; + /* acc2 += x[6] * y[srcBLen - 5] */ + acc2 += x2 * c0; + /* acc3 += x[7] * y[srcBLen - 5] */ + acc3 += x3 * c0; + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = acc0; + *pOut++ = acc1; + *pOut++ = acc2; + *pOut++ = acc3; + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4U; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + + /* Decrement the loop counter */ + blkCnt--; + } + + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4U; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += *px++ * *py--; + sum += *px++ * *py--; + sum += *px++ * *py--; + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The blockSize3 variable holds the number of MAC operations performed */ + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + py = pSrc2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while (blockSize3 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = blockSize3 >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ + sum += *px++ * *py--; + + /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ + sum += *px++ * *py--; + + /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ + sum += *px++ * *py--; + + /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = blockSize3 % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen-1] * y[srcBLen-1] */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the loop counter */ + blockSize3--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + float32_t *pIn1 = pSrcA; /* inputA pointer */ + float32_t *pIn2 = pSrcB; /* inputB pointer */ + float32_t sum; /* Accumulator */ + uint32_t i, j; /* loop counters */ + + /* Loop to calculate convolution for output length number of times */ + for (i = 0U; i < ((srcALen + srcBLen) - 1U); i++) + { + /* Initialize sum with zero to carry out MAC operations */ + sum = 0.0f; + + /* Loop to perform MAC operations according to convolution equation */ + for (j = 0U; j <= i; j++) + { + /* Check the array limitations */ + if ((((i - j) < srcBLen) && (j < srcALen))) + { + /* z[i] += x[i-j] * y[j] */ + sum += pIn1[j] * pIn2[i - j]; + } + } + /* Store the output in the destination buffer */ + pDst[i] = sum; + } + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + +/** + * @} end of Conv group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_opt_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_opt_q15.c new file mode 100644 index 0000000..26c37f0 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_opt_q15.c @@ -0,0 +1,531 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_conv_fast_opt_q15.c + * Description: Fast Q15 Convolution + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Conv + * @{ + */ + +/** + * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. + * @param[in] *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] *pScratch2 points to scratch buffer of size min(srcALen, srcBLen). + * @return none. + * + * \par Restrictions + * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE + * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit + * + * Scaling and Overflow Behavior: + * + * \par + * This fast version uses a 32-bit accumulator with 2.30 format. + * The accumulator maintains full precision of the intermediate multiplication results + * but provides only a single guard bit. There is no saturation on intermediate additions. + * Thus, if the accumulator overflows it wraps around and distorts the result. + * The input signals should be scaled down to avoid intermediate overflows. + * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows, + * as maximum of min(srcALen, srcBLen) number of additions are carried internally. + * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result. + * + * \par + * See arm_conv_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. + */ + +void arm_conv_fast_opt_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + q15_t * pScratch1, + q15_t * pScratch2) +{ + q31_t acc0, acc1, acc2, acc3; /* Accumulators */ + q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */ + q31_t y1, y2; /* State variables */ + q15_t *pOut = pDst; /* output pointer */ + q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */ + q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */ + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + uint32_t j, k, blkCnt; /* loop counter */ + uint32_t tapCnt; /* loop count */ +#ifdef UNALIGNED_SUPPORT_DISABLE + + q15_t a, b; + +#endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* Pointer to take end of scratch2 buffer */ + pScr2 = pScratch2 + srcBLen - 1; + + /* points to smaller length sequence */ + px = pIn2; + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + + /* Copy smaller length input sequence in reverse order into second scratch buffer */ + while (k > 0U) + { + /* copy second buffer in reversal manner */ + *pScr2-- = *px++; + *pScr2-- = *px++; + *pScr2-- = *px++; + *pScr2-- = *px++; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr2-- = *px++; + + /* Decrement the loop counter */ + k--; + } + + /* Initialze temporary scratch pointer */ + pScr1 = pScratch1; + + /* Assuming scratch1 buffer is aligned by 32-bit */ + /* Fill (srcBLen - 1U) zeros in scratch1 buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1U)); + + /* Update temporary scratch pointer */ + pScr1 += (srcBLen - 1U); + + /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */ + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Copy (srcALen) samples in scratch buffer */ + arm_copy_q15(pIn1, pScr1, srcALen); + + /* Update pointers */ + pScr1 += srcALen; + +#else + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcALen >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* copy second buffer in reversal manner */ + *pScr1++ = *pIn1++; + *pScr1++ = *pIn1++; + *pScr1++ = *pIn1++; + *pScr1++ = *pIn1++; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcALen % 0x4U; + + while (k > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr1++ = *pIn1++; + + /* Decrement the loop counter */ + k--; + } + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Fill (srcBLen - 1U) zeros at end of scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1U)); + + /* Update pointer */ + pScr1 += (srcBLen - 1U); + +#else + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = (srcBLen - 1U) >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* copy second buffer in reversal manner */ + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = (srcBLen - 1U) % 0x4U; + + while (k > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + /* Temporary pointer for scratch2 */ + py = pScratch2; + + + /* Initialization of pIn2 pointer */ + pIn2 = py; + + /* First part of the processing with loop unrolling process 4 data points at a time. + ** a second loop below process for the remaining 1 to 3 samples. */ + + /* Actual convolution process starts here */ + blkCnt = (srcALen + srcBLen - 1U) >> 2; + + while (blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Read two samples from scratch1 buffer */ + x1 = *__SIMD32(pScr1)++; + + /* Read next two samples from scratch1 buffer */ + x2 = *__SIMD32(pScr1)++; + + tapCnt = (srcBLen) >> 2U; + + while (tapCnt > 0U) + { + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Read four samples from smaller buffer */ + y1 = _SIMD32_OFFSET(pIn2); + y2 = _SIMD32_OFFSET(pIn2 + 2U); + + /* multiply and accumlate */ + acc0 = __SMLAD(x1, y1, acc0); + acc2 = __SMLAD(x2, y1, acc2); + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + /* multiply and accumlate */ + acc1 = __SMLADX(x3, y1, acc1); + + /* Read next two samples from scratch1 buffer */ + x1 = _SIMD32_OFFSET(pScr1); + + /* multiply and accumlate */ + acc0 = __SMLAD(x2, y2, acc0); + acc2 = __SMLAD(x1, y2, acc2); + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x1, x2, 0); +#else + x3 = __PKHBT(x2, x1, 0); +#endif + + acc3 = __SMLADX(x3, y1, acc3); + acc1 = __SMLADX(x3, y2, acc1); + + x2 = _SIMD32_OFFSET(pScr1 + 2U); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc3 = __SMLADX(x3, y2, acc3); + +#else + + /* Read four samples from smaller buffer */ + a = *pIn2; + b = *(pIn2 + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + y1 = __PKHBT(a, b, 16); +#else + y1 = __PKHBT(b, a, 16); +#endif + + a = *(pIn2 + 2); + b = *(pIn2 + 3); +#ifndef ARM_MATH_BIG_ENDIAN + y2 = __PKHBT(a, b, 16); +#else + y2 = __PKHBT(b, a, 16); +#endif + + acc0 = __SMLAD(x1, y1, acc0); + + acc2 = __SMLAD(x2, y1, acc2); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc1 = __SMLADX(x3, y1, acc1); + + a = *pScr1; + b = *(pScr1 + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(a, b, 16); +#else + x1 = __PKHBT(b, a, 16); +#endif + + acc0 = __SMLAD(x2, y2, acc0); + + acc2 = __SMLAD(x1, y2, acc2); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x1, x2, 0); +#else + x3 = __PKHBT(x2, x1, 0); +#endif + + acc3 = __SMLADX(x3, y1, acc3); + + acc1 = __SMLADX(x3, y2, acc1); + + a = *(pScr1 + 2); + b = *(pScr1 + 3); + +#ifndef ARM_MATH_BIG_ENDIAN + x2 = __PKHBT(a, b, 16); +#else + x2 = __PKHBT(b, a, 16); +#endif + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc3 = __SMLADX(x3, y2, acc3); + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + /* update scratch pointers */ + pIn2 += 4U; + pScr1 += 4U; + + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Update scratch pointer for remaining samples of smaller length sequence */ + pScr1 -= 4U; + + /* apply same above for remaining samples of smaller length sequence */ + tapCnt = (srcBLen) & 3U; + + while (tapCnt > 0U) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pIn2); + acc1 += (*pScr1++ * *pIn2); + acc2 += (*pScr1++ * *pIn2); + acc3 += (*pScr1++ * *pIn2++); + + pScr1 -= 3U; + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + + /* Store the results in the accumulators in the destination buffer. */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); + + +#else + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); + + + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch1 += 4U; + + } + + + blkCnt = (srcALen + srcBLen - 1U) & 0x3; + + /* Calculate convolution for remaining samples of Bigger length sequence */ + while (blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + + tapCnt = (srcBLen) >> 1U; + + while (tapCnt > 0U) + { + + acc0 += (*pScr1++ * *pIn2++); + acc0 += (*pScr1++ * *pIn2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (srcBLen) & 1U; + + /* apply same above for remaining samples of smaller length sequence */ + while (tapCnt > 0U) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pIn2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* The result is in 2.30 format. Convert to 1.15 with saturation. + ** Then store the output in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch1 += 1U; + + } + +} + +/** + * @} end of Conv group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_q15.c new file mode 100644 index 0000000..16b0424 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_q15.c @@ -0,0 +1,1398 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_conv_fast_q15.c + * Description: Fast Q15 Convolution + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Conv + * @{ + */ + +/** + * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. + * @return none. + * + * Scaling and Overflow Behavior: + * + * \par + * This fast version uses a 32-bit accumulator with 2.30 format. + * The accumulator maintains full precision of the intermediate multiplication results + * but provides only a single guard bit. There is no saturation on intermediate additions. + * Thus, if the accumulator overflows it wraps around and distorts the result. + * The input signals should be scaled down to avoid intermediate overflows. + * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows, + * as maximum of min(srcALen, srcBLen) number of additions are carried internally. + * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result. + * + * \par + * See arm_conv_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. + */ + +void arm_conv_fast_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst) +{ +#ifndef UNALIGNED_SUPPORT_DISABLE + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *pOut = pDst; /* output pointer */ + q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* The algorithm is implemented in three stages. + The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1U; + blockSize2 = srcALen - (srcBLen - 1U); + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1U; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* For loop unrolling by 4, this stage is divided into two. */ + /* First part of this stage computes the MAC operations less than 4 */ + /* Second part of this stage computes the MAC operations greater than or equal to 4 */ + + /* The first part of the stage starts here */ + while ((count < 4U) && (blockSize1 > 0U)) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Loop over number of MAC operations between + * inputA samples and inputB samples */ + k = count; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum = __SMLAD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pIn2 + count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* The second part of the stage starts here */ + /* The internal loop, over count, is unrolled by 4 */ + /* To, read the last two inputB samples using SIMD: + * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ + py = py - 1; + + while (blockSize1 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ + sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ + sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* For the next MAC operations, the pointer py is used without SIMD + * So, py is incremented by 1 */ + py = py + 1U; + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum = __SMLAD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pIn2 + (count - 1U); + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + py = pSrc2; + + /* count is the index by which the pointer pIn1 to be incremented */ + count = 0U; + + + /* -------------------- + * Stage2 process + * -------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if (srcBLen >= 4U) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2U; + + while (blkCnt > 0U) + { + py = py - 1U; + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + + /* read x[0], x[1] samples */ + x0 = *__SIMD32(px); + /* read x[1], x[2] samples */ + x1 = _SIMD32_OFFSET(px+1); + px+= 2U; + + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read the last two inputB samples using SIMD: + * y[srcBLen - 1] and y[srcBLen - 2] */ + c0 = *__SIMD32(py)--; + + /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ + acc0 = __SMLADX(x0, c0, acc0); + + /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ + acc1 = __SMLADX(x1, c0, acc1); + + /* Read x[2], x[3] */ + x2 = *__SIMD32(px); + + /* Read x[3], x[4] */ + x3 = _SIMD32_OFFSET(px+1); + + /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ + acc2 = __SMLADX(x2, c0, acc2); + + /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ + acc3 = __SMLADX(x3, c0, acc3); + + /* Read y[srcBLen - 3] and y[srcBLen - 4] */ + c0 = *__SIMD32(py)--; + + /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ + acc0 = __SMLADX(x2, c0, acc0); + + /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ + acc1 = __SMLADX(x3, c0, acc1); + + /* Read x[4], x[5] */ + x0 = _SIMD32_OFFSET(px+2); + + /* Read x[5], x[6] */ + x1 = _SIMD32_OFFSET(px+3); + px += 4U; + + /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ + acc2 = __SMLADX(x0, c0, acc2); + + /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ + acc3 = __SMLADX(x1, c0, acc3); + + } while (--k); + + /* For the next MAC operations, SIMD is not used + * So, the 16 bit pointer if inputB, py is updated */ + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + if (k == 1U) + { + /* Read y[srcBLen - 5] */ + c0 = *(py+1); + +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16U; + +#else + + c0 = c0 & 0x0000FFFF; + +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[7] */ + x3 = *__SIMD32(px); + px++; + + /* Perform the multiply-accumulates */ + acc0 = __SMLAD(x0, c0, acc0); + acc1 = __SMLAD(x1, c0, acc1); + acc2 = __SMLADX(x1, c0, acc2); + acc3 = __SMLADX(x3, c0, acc3); + } + + if (k == 2U) + { + /* Read y[srcBLen - 5], y[srcBLen - 6] */ + c0 = _SIMD32_OFFSET(py); + + /* Read x[7], x[8] */ + x3 = *__SIMD32(px); + + /* Read x[9] */ + x2 = _SIMD32_OFFSET(px+1); + px += 2U; + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x0, c0, acc0); + acc1 = __SMLADX(x1, c0, acc1); + acc2 = __SMLADX(x3, c0, acc2); + acc3 = __SMLADX(x2, c0, acc3); + } + + if (k == 3U) + { + /* Read y[srcBLen - 5], y[srcBLen - 6] */ + c0 = _SIMD32_OFFSET(py); + + /* Read x[7], x[8] */ + x3 = *__SIMD32(px); + + /* Read x[9] */ + x2 = _SIMD32_OFFSET(px+1); + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x0, c0, acc0); + acc1 = __SMLADX(x1, c0, acc1); + acc2 = __SMLADX(x3, c0, acc2); + acc3 = __SMLADX(x2, c0, acc3); + + /* Read y[srcBLen - 7] */ + c0 = *(py-1); +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16U; +#else + + c0 = c0 & 0x0000FFFF; +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[10] */ + x3 = _SIMD32_OFFSET(px+2); + px += 3U; + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x1, c0, acc0); + acc1 = __SMLAD(x2, c0, acc1); + acc2 = __SMLADX(x2, c0, acc2); + acc3 = __SMLADX(x3, c0, acc3); + } + + /* Store the results in the accumulators in the destination buffer. */ +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pOut)++ = __PKHBT((acc0 >> 15), (acc1 >> 15), 16); + *__SIMD32(pOut)++ = __PKHBT((acc2 >> 15), (acc3 >> 15), 16); + +#else + + *__SIMD32(pOut)++ = __PKHBT((acc1 >> 15), (acc0 >> 15), 16); + *__SIMD32(pOut)++ = __PKHBT((acc3 >> 15), (acc2 >> 15), 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4U; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4U; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The blockSize3 variable holds the number of MAC operations performed */ + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + pIn2 = pSrc2 - 1U; + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + /* For loop unrolling by 4, this stage is divided into two. */ + /* First part of this stage computes the MAC operations greater than 4 */ + /* Second part of this stage computes the MAC operations less than or equal to 4 */ + + /* The first part of the stage starts here */ + j = blockSize3 >> 2U; + + while ((j > 0U) && (blockSize3 > 0U)) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = blockSize3 >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied + * with y[srcBLen - 1], y[srcBLen - 2] respectively */ + sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied + * with y[srcBLen - 3], y[srcBLen - 4] respectively */ + sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* For the next MAC operations, the pointer py is used without SIMD + * So, py is incremented by 1 */ + py = py + 1U; + + /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = blockSize3 % 0x4U; + + while (k > 0U) + { + /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ + sum = __SMLAD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the loop counter */ + blockSize3--; + + j--; + } + + /* The second part of the stage starts here */ + /* SIMD is not used for the next MAC operations, + * so pointer py is updated to read only one sample at a time */ + py = py + 1U; + + while (blockSize3 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = blockSize3; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen-1] * y[srcBLen-1] */ + sum = __SMLAD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the loop counter */ + blockSize3--; + } + +#else + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *pOut = pDst; /* output pointer */ + q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */ + q15_t a, b; + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* The algorithm is implemented in three stages. + The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1U; + blockSize2 = srcALen - (srcBLen - 1U); + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1U; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* For loop unrolling by 4, this stage is divided into two. */ + /* First part of this stage computes the MAC operations less than 4 */ + /* Second part of this stage computes the MAC operations greater than or equal to 4 */ + + /* The first part of the stage starts here */ + while ((count < 4U) && (blockSize1 > 0U)) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Loop over number of MAC operations between + * inputA samples and inputB samples */ + k = count; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pIn2 + count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* The second part of the stage starts here */ + /* The internal loop, over count, is unrolled by 4 */ + /* To, read the last two inputB samples using SIMD: + * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ + py = py - 1; + + while (blockSize1 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + py++; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pIn2 + (count - 1U); + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + py = pSrc2; + + /* count is the index by which the pointer pIn1 to be incremented */ + count = 0U; + + + /* -------------------- + * Stage2 process + * -------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if (srcBLen >= 4U) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2U; + + while (blkCnt > 0U) + { + py = py - 1U; + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* read x[0], x[1] samples */ + a = *px++; + b = *px++; + +#ifndef ARM_MATH_BIG_ENDIAN + + x0 = __PKHBT(a, b, 16); + a = *px; + x1 = __PKHBT(b, a, 16); + +#else + + x0 = __PKHBT(b, a, 16); + a = *px; + x1 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read the last two inputB samples using SIMD: + * y[srcBLen - 1] and y[srcBLen - 2] */ + a = *py; + b = *(py+1); + py -= 2; + +#ifndef ARM_MATH_BIG_ENDIAN + + c0 = __PKHBT(a, b, 16); + +#else + + c0 = __PKHBT(b, a, 16);; + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ + acc0 = __SMLADX(x0, c0, acc0); + + /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ + acc1 = __SMLADX(x1, c0, acc1); + + a = *px; + b = *(px + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + + x2 = __PKHBT(a, b, 16); + a = *(px + 2); + x3 = __PKHBT(b, a, 16); + +#else + + x2 = __PKHBT(b, a, 16); + a = *(px + 2); + x3 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ + acc2 = __SMLADX(x2, c0, acc2); + + /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ + acc3 = __SMLADX(x3, c0, acc3); + + /* Read y[srcBLen - 3] and y[srcBLen - 4] */ + a = *py; + b = *(py+1); + py -= 2; + +#ifndef ARM_MATH_BIG_ENDIAN + + c0 = __PKHBT(a, b, 16); + +#else + + c0 = __PKHBT(b, a, 16);; + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ + acc0 = __SMLADX(x2, c0, acc0); + + /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ + acc1 = __SMLADX(x3, c0, acc1); + + /* Read x[4], x[5], x[6] */ + a = *(px + 2); + b = *(px + 3); + +#ifndef ARM_MATH_BIG_ENDIAN + + x0 = __PKHBT(a, b, 16); + a = *(px + 4); + x1 = __PKHBT(b, a, 16); + +#else + + x0 = __PKHBT(b, a, 16); + a = *(px + 4); + x1 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + px += 4U; + + /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ + acc2 = __SMLADX(x0, c0, acc2); + + /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ + acc3 = __SMLADX(x1, c0, acc3); + + } while (--k); + + /* For the next MAC operations, SIMD is not used + * So, the 16 bit pointer if inputB, py is updated */ + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + if (k == 1U) + { + /* Read y[srcBLen - 5] */ + c0 = *(py+1); + +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16U; + +#else + + c0 = c0 & 0x0000FFFF; + +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[7] */ + a = *px; + b = *(px+1); + px++; + +#ifndef ARM_MATH_BIG_ENDIAN + + x3 = __PKHBT(a, b, 16); + +#else + + x3 = __PKHBT(b, a, 16);; + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + + /* Perform the multiply-accumulates */ + acc0 = __SMLAD(x0, c0, acc0); + acc1 = __SMLAD(x1, c0, acc1); + acc2 = __SMLADX(x1, c0, acc2); + acc3 = __SMLADX(x3, c0, acc3); + } + + if (k == 2U) + { + /* Read y[srcBLen - 5], y[srcBLen - 6] */ + a = *py; + b = *(py+1); + +#ifndef ARM_MATH_BIG_ENDIAN + + c0 = __PKHBT(a, b, 16); + +#else + + c0 = __PKHBT(b, a, 16);; + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Read x[7], x[8], x[9] */ + a = *px; + b = *(px + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + + x3 = __PKHBT(a, b, 16); + a = *(px + 2); + x2 = __PKHBT(b, a, 16); + +#else + + x3 = __PKHBT(b, a, 16); + a = *(px + 2); + x2 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + px += 2U; + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x0, c0, acc0); + acc1 = __SMLADX(x1, c0, acc1); + acc2 = __SMLADX(x3, c0, acc2); + acc3 = __SMLADX(x2, c0, acc3); + } + + if (k == 3U) + { + /* Read y[srcBLen - 5], y[srcBLen - 6] */ + a = *py; + b = *(py+1); + +#ifndef ARM_MATH_BIG_ENDIAN + + c0 = __PKHBT(a, b, 16); + +#else + + c0 = __PKHBT(b, a, 16);; + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Read x[7], x[8], x[9] */ + a = *px; + b = *(px + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + + x3 = __PKHBT(a, b, 16); + a = *(px + 2); + x2 = __PKHBT(b, a, 16); + +#else + + x3 = __PKHBT(b, a, 16); + a = *(px + 2); + x2 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x0, c0, acc0); + acc1 = __SMLADX(x1, c0, acc1); + acc2 = __SMLADX(x3, c0, acc2); + acc3 = __SMLADX(x2, c0, acc3); + + /* Read y[srcBLen - 7] */ + c0 = *(py-1); +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16U; +#else + + c0 = c0 & 0x0000FFFF; +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[10] */ + a = *(px+2); + b = *(px+3); + +#ifndef ARM_MATH_BIG_ENDIAN + + x3 = __PKHBT(a, b, 16); + +#else + + x3 = __PKHBT(b, a, 16);; + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + px += 3U; + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x1, c0, acc0); + acc1 = __SMLAD(x2, c0, acc1); + acc2 = __SMLADX(x2, c0, acc2); + acc3 = __SMLADX(x3, c0, acc3); + } + + /* Store the results in the accumulators in the destination buffer. */ + *pOut++ = (q15_t)(acc0 >> 15); + *pOut++ = (q15_t)(acc1 >> 15); + *pOut++ = (q15_t)(acc2 >> 15); + *pOut++ = (q15_t)(acc3 >> 15); + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4U; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4U; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The blockSize3 variable holds the number of MAC operations performed */ + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + pIn2 = pSrc2 - 1U; + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + /* For loop unrolling by 4, this stage is divided into two. */ + /* First part of this stage computes the MAC operations greater than 4 */ + /* Second part of this stage computes the MAC operations less than or equal to 4 */ + + /* The first part of the stage starts here */ + j = blockSize3 >> 2U; + + while ((j > 0U) && (blockSize3 > 0U)) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = blockSize3 >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + py++; + + while (k > 0U) + { + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + /* Decrement the loop counter */ + k--; + } + + /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = blockSize3 % 0x4U; + + while (k > 0U) + { + /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the loop counter */ + blockSize3--; + + j--; + } + + /* The second part of the stage starts here */ + /* SIMD is not used for the next MAC operations, + * so pointer py is updated to read only one sample at a time */ + py = py + 1U; + + while (blockSize3 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = blockSize3; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen-1] * y[srcBLen-1] */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the loop counter */ + blockSize3--; + } + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ +} + +/** + * @} end of Conv group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_q31.c new file mode 100644 index 0000000..bc57221 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_q31.c @@ -0,0 +1,565 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_conv_fast_q31.c + * Description: Fast Q31 Convolution + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Conv + * @{ + */ + +/** + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * This function is optimized for speed at the expense of fixed-point precision and overflow protection. + * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format. + * These intermediate results are accumulated in a 32-bit register in 2.30 format. + * Finally, the accumulator is saturated and converted to a 1.31 result. + * + * \par + * The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result. + * In order to avoid overflows completely the input signals must be scaled down. + * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows, + * as maximum of min(srcALen, srcBLen) number of additions are carried internally. + * + * \par + * See arm_conv_q31() for a slower implementation of this function which uses 64-bit accumulation to provide higher precision. + */ + +void arm_conv_fast_q31( + q31_t * pSrcA, + uint32_t srcALen, + q31_t * pSrcB, + uint32_t srcBLen, + q31_t * pDst) +{ + q31_t *pIn1; /* inputA pointer */ + q31_t *pIn2; /* inputB pointer */ + q31_t *pOut = pDst; /* output pointer */ + q31_t *px; /* Intermediate inputA pointer */ + q31_t *py; /* Intermediate inputB pointer */ + q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ + q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* The algorithm is implemented in three stages. + The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1U; + blockSize2 = srcALen - (srcBLen - 1U); + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1U; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first stage starts here */ + while (blockSize1 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* x[0] * y[srcBLen - 1] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* x[1] * y[srcBLen - 2] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* x[2] * y[srcBLen - 3] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* x[3] * y[srcBLen - 4] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum << 1; + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pIn2 + count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + py = pSrc2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0U; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if (srcBLen >= 4U) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2U; + + while (blkCnt > 0U) + { + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* read x[0], x[1], x[2] samples */ + x0 = *(px++); + x1 = *(px++); + x2 = *(px++); + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read y[srcBLen - 1] sample */ + c0 = *(py--); + + /* Read x[3] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[0] * y[srcBLen - 1] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); + + /* acc1 += x[1] * y[srcBLen - 1] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); + + /* acc2 += x[2] * y[srcBLen - 1] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); + + /* acc3 += x[3] * y[srcBLen - 1] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); + + /* Read y[srcBLen - 2] sample */ + c0 = *(py--); + + /* Read x[4] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + /* acc0 += x[1] * y[srcBLen - 2] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32); + /* acc1 += x[2] * y[srcBLen - 2] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32); + /* acc2 += x[3] * y[srcBLen - 2] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32); + /* acc3 += x[4] * y[srcBLen - 2] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32); + + /* Read y[srcBLen - 3] sample */ + c0 = *(py--); + + /* Read x[5] sample */ + x1 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[2] * y[srcBLen - 3] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32); + /* acc1 += x[3] * y[srcBLen - 3] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32); + /* acc2 += x[4] * y[srcBLen - 3] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32); + /* acc3 += x[5] * y[srcBLen - 3] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32); + + /* Read y[srcBLen - 4] sample */ + c0 = *(py--); + + /* Read x[6] sample */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[3] * y[srcBLen - 4] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32); + /* acc1 += x[4] * y[srcBLen - 4] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32); + /* acc2 += x[5] * y[srcBLen - 4] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32); + /* acc3 += x[6] * y[srcBLen - 4] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32); + + + } while (--k); + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Read y[srcBLen - 5] sample */ + c0 = *(py--); + + /* Read x[7] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[srcBLen - 5] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); + /* acc1 += x[5] * y[srcBLen - 5] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); + /* acc2 += x[6] * y[srcBLen - 5] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); + /* acc3 += x[7] * y[srcBLen - 5] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + k--; + } + + /* Store the results in the accumulators in the destination buffer. */ + *pOut++ = (q31_t) (acc0 << 1); + *pOut++ = (q31_t) (acc1 << 1); + *pOut++ = (q31_t) (acc2 << 1); + *pOut++ = (q31_t) (acc3 << 1); + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4U; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4U; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum << 1; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum << 1; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The blockSize3 variable holds the number of MAC operations performed */ + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + py = pSrc2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while (blockSize3 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = blockSize3 >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = blockSize3 % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum << 1; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the loop counter */ + blockSize3--; + } + +} + +/** + * @} end of Conv group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_opt_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_opt_q15.c new file mode 100644 index 0000000..47f6f84 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_opt_q15.c @@ -0,0 +1,533 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_conv_opt_q15.c + * Description: Convolution of Q15 sequences + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Conv + * @{ + */ + +/** + * @brief Convolution of Q15 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. + * @param[in] *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] *pScratch2 points to scratch buffer of size min(srcALen, srcBLen). + * @return none. + * + * \par Restrictions + * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE + * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit + * + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * The function is implemented using a 64-bit internal accumulator. + * Both inputs are in 1.15 format and multiplications yield a 2.30 result. + * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. + * This approach provides 33 guard bits and there is no risk of overflow. + * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format. + * + * + * \par + * Refer to arm_conv_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. + * + * + */ + +void arm_conv_opt_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + q15_t * pScratch1, + q15_t * pScratch2) +{ + q63_t acc0, acc1, acc2, acc3; /* Accumulator */ + q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */ + q31_t y1, y2; /* State variables */ + q15_t *pOut = pDst; /* output pointer */ + q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */ + q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */ + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + uint32_t j, k, blkCnt; /* loop counter */ + uint32_t tapCnt; /* loop count */ +#ifdef UNALIGNED_SUPPORT_DISABLE + + q15_t a, b; + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* pointer to take end of scratch2 buffer */ + pScr2 = pScratch2 + srcBLen - 1; + + /* points to smaller length sequence */ + px = pIn2; + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + /* Copy smaller length input sequence in reverse order into second scratch buffer */ + while (k > 0U) + { + /* copy second buffer in reversal manner */ + *pScr2-- = *px++; + *pScr2-- = *px++; + *pScr2-- = *px++; + *pScr2-- = *px++; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr2-- = *px++; + + /* Decrement the loop counter */ + k--; + } + + /* Initialze temporary scratch pointer */ + pScr1 = pScratch1; + + /* Assuming scratch1 buffer is aligned by 32-bit */ + /* Fill (srcBLen - 1U) zeros in scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1U)); + + /* Update temporary scratch pointer */ + pScr1 += (srcBLen - 1U); + + /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */ + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Copy (srcALen) samples in scratch buffer */ + arm_copy_q15(pIn1, pScr1, srcALen); + + /* Update pointers */ + pScr1 += srcALen; + +#else + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcALen >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* copy second buffer in reversal manner */ + *pScr1++ = *pIn1++; + *pScr1++ = *pIn1++; + *pScr1++ = *pIn1++; + *pScr1++ = *pIn1++; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcALen % 0x4U; + + while (k > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr1++ = *pIn1++; + + /* Decrement the loop counter */ + k--; + } + +#endif + + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Fill (srcBLen - 1U) zeros at end of scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1U)); + + /* Update pointer */ + pScr1 += (srcBLen - 1U); + +#else + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = (srcBLen - 1U) >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* copy second buffer in reversal manner */ + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = (srcBLen - 1U) % 0x4U; + + while (k > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + +#endif + + /* Temporary pointer for scratch2 */ + py = pScratch2; + + + /* Initialization of pIn2 pointer */ + pIn2 = py; + + /* First part of the processing with loop unrolling process 4 data points at a time. + ** a second loop below process for the remaining 1 to 3 samples. */ + + /* Actual convolution process starts here */ + blkCnt = (srcALen + srcBLen - 1U) >> 2; + + while (blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Read two samples from scratch1 buffer */ + x1 = *__SIMD32(pScr1)++; + + /* Read next two samples from scratch1 buffer */ + x2 = *__SIMD32(pScr1)++; + + tapCnt = (srcBLen) >> 2U; + + while (tapCnt > 0U) + { + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Read four samples from smaller buffer */ + y1 = _SIMD32_OFFSET(pIn2); + y2 = _SIMD32_OFFSET(pIn2 + 2U); + + /* multiply and accumlate */ + acc0 = __SMLALD(x1, y1, acc0); + acc2 = __SMLALD(x2, y1, acc2); + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + /* multiply and accumlate */ + acc1 = __SMLALDX(x3, y1, acc1); + + /* Read next two samples from scratch1 buffer */ + x1 = _SIMD32_OFFSET(pScr1); + + /* multiply and accumlate */ + acc0 = __SMLALD(x2, y2, acc0); + acc2 = __SMLALD(x1, y2, acc2); + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x1, x2, 0); +#else + x3 = __PKHBT(x2, x1, 0); +#endif + + acc3 = __SMLALDX(x3, y1, acc3); + acc1 = __SMLALDX(x3, y2, acc1); + + x2 = _SIMD32_OFFSET(pScr1 + 2U); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc3 = __SMLALDX(x3, y2, acc3); + +#else + + /* Read four samples from smaller buffer */ + a = *pIn2; + b = *(pIn2 + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + y1 = __PKHBT(a, b, 16); +#else + y1 = __PKHBT(b, a, 16); +#endif + + a = *(pIn2 + 2); + b = *(pIn2 + 3); +#ifndef ARM_MATH_BIG_ENDIAN + y2 = __PKHBT(a, b, 16); +#else + y2 = __PKHBT(b, a, 16); +#endif + + acc0 = __SMLALD(x1, y1, acc0); + + acc2 = __SMLALD(x2, y1, acc2); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc1 = __SMLALDX(x3, y1, acc1); + + a = *pScr1; + b = *(pScr1 + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(a, b, 16); +#else + x1 = __PKHBT(b, a, 16); +#endif + + acc0 = __SMLALD(x2, y2, acc0); + + acc2 = __SMLALD(x1, y2, acc2); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x1, x2, 0); +#else + x3 = __PKHBT(x2, x1, 0); +#endif + + acc3 = __SMLALDX(x3, y1, acc3); + + acc1 = __SMLALDX(x3, y2, acc1); + + a = *(pScr1 + 2); + b = *(pScr1 + 3); + +#ifndef ARM_MATH_BIG_ENDIAN + x2 = __PKHBT(a, b, 16); +#else + x2 = __PKHBT(b, a, 16); +#endif + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc3 = __SMLALDX(x3, y2, acc3); + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + pIn2 += 4U; + pScr1 += 4U; + + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Update scratch pointer for remaining samples of smaller length sequence */ + pScr1 -= 4U; + + /* apply same above for remaining samples of smaller length sequence */ + tapCnt = (srcBLen) & 3U; + + while (tapCnt > 0U) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pIn2); + acc1 += (*pScr1++ * *pIn2); + acc2 += (*pScr1++ * *pIn2); + acc3 += (*pScr1++ * *pIn2++); + + pScr1 -= 3U; + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + + /* Store the results in the accumulators in the destination buffer. */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); + +#else + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); + + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch1 += 4U; + + } + + + blkCnt = (srcALen + srcBLen - 1U) & 0x3; + + /* Calculate convolution for remaining samples of Bigger length sequence */ + while (blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + + tapCnt = (srcBLen) >> 1U; + + while (tapCnt > 0U) + { + + /* Read next two samples from scratch1 buffer */ + acc0 += (*pScr1++ * *pIn2++); + acc0 += (*pScr1++ * *pIn2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (srcBLen) & 1U; + + /* apply same above for remaining samples of smaller length sequence */ + while (tapCnt > 0U) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pIn2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* The result is in 2.30 format. Convert to 1.15 with saturation. + ** Then store the output in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch1 += 1U; + + } + +} + + +/** + * @} end of Conv group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_opt_q7.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_opt_q7.c new file mode 100644 index 0000000..1dc2e49 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_opt_q7.c @@ -0,0 +1,423 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_conv_opt_q7.c + * Description: Convolution of Q7 sequences + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Conv + * @{ + */ + +/** + * @brief Convolution of Q7 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. + * @param[in] *pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen). + * @return none. + * + * \par Restrictions + * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE + * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * The function is implemented using a 32-bit internal accumulator. + * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result. + * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. + * This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072. + * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format. + * + */ + +void arm_conv_opt_q7( + q7_t * pSrcA, + uint32_t srcALen, + q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst, + q15_t * pScratch1, + q15_t * pScratch2) +{ + + q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */ + q15_t x4; /* Temporary input variable */ + q7_t *pIn1, *pIn2; /* inputA and inputB pointer */ + uint32_t j, k, blkCnt, tapCnt; /* loop counter */ + q7_t *px; /* Temporary input1 pointer */ + q15_t *py; /* Temporary input2 pointer */ + q31_t acc0, acc1, acc2, acc3; /* Accumulator */ + q31_t x1, x2, x3, y1; /* Temporary input variables */ + q7_t *pOut = pDst; /* output pointer */ + q7_t out0, out1, out2, out3; /* temporary variables */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* pointer to take end of scratch2 buffer */ + pScr2 = pScratch2; + + /* points to smaller length sequence */ + px = pIn2 + srcBLen - 1; + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* copy second buffer in reversal manner */ + x4 = (q15_t) * px--; + *pScr2++ = x4; + x4 = (q15_t) * px--; + *pScr2++ = x4; + x4 = (q15_t) * px--; + *pScr2++ = x4; + x4 = (q15_t) * px--; + *pScr2++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + x4 = (q15_t) * px--; + *pScr2++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* Initialze temporary scratch pointer */ + pScr1 = pScratch1; + + /* Fill (srcBLen - 1U) zeros in scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1U)); + + /* Update temporary scratch pointer */ + pScr1 += (srcBLen - 1U); + + /* Copy (srcALen) samples in scratch buffer */ + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcALen >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* copy second buffer in reversal manner */ + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcALen % 0x4U; + + while (k > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + + /* Decrement the loop counter */ + k--; + } + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Fill (srcBLen - 1U) zeros at end of scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1U)); + + /* Update pointer */ + pScr1 += (srcBLen - 1U); + +#else + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = (srcBLen - 1U) >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* copy second buffer in reversal manner */ + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = (srcBLen - 1U) % 0x4U; + + while (k > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + +#endif + + /* Temporary pointer for scratch2 */ + py = pScratch2; + + /* Initialization of pIn2 pointer */ + pIn2 = (q7_t *) py; + + pScr2 = py; + + /* Actual convolution process starts here */ + blkCnt = (srcALen + srcBLen - 1U) >> 2; + + while (blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Read two samples from scratch1 buffer */ + x1 = *__SIMD32(pScr1)++; + + /* Read next two samples from scratch1 buffer */ + x2 = *__SIMD32(pScr1)++; + + tapCnt = (srcBLen) >> 2U; + + while (tapCnt > 0U) + { + + /* Read four samples from smaller buffer */ + y1 = _SIMD32_OFFSET(pScr2); + + /* multiply and accumlate */ + acc0 = __SMLAD(x1, y1, acc0); + acc2 = __SMLAD(x2, y1, acc2); + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + /* multiply and accumlate */ + acc1 = __SMLADX(x3, y1, acc1); + + /* Read next two samples from scratch1 buffer */ + x1 = *__SIMD32(pScr1)++; + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x1, x2, 0); +#else + x3 = __PKHBT(x2, x1, 0); +#endif + + acc3 = __SMLADX(x3, y1, acc3); + + /* Read four samples from smaller buffer */ + y1 = _SIMD32_OFFSET(pScr2 + 2U); + + acc0 = __SMLAD(x2, y1, acc0); + + acc2 = __SMLAD(x1, y1, acc2); + + acc1 = __SMLADX(x3, y1, acc1); + + x2 = *__SIMD32(pScr1)++; + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc3 = __SMLADX(x3, y1, acc3); + + pScr2 += 4U; + + + /* Decrement the loop counter */ + tapCnt--; + } + + + + /* Update scratch pointer for remaining samples of smaller length sequence */ + pScr1 -= 4U; + + + /* apply same above for remaining samples of smaller length sequence */ + tapCnt = (srcBLen) & 3U; + + while (tapCnt > 0U) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pScr2); + acc1 += (*pScr1++ * *pScr2); + acc2 += (*pScr1++ * *pScr2); + acc3 += (*pScr1++ * *pScr2++); + + pScr1 -= 3U; + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* Store the result in the accumulator in the destination buffer. */ + out0 = (q7_t) (__SSAT(acc0 >> 7U, 8)); + out1 = (q7_t) (__SSAT(acc1 >> 7U, 8)); + out2 = (q7_t) (__SSAT(acc2 >> 7U, 8)); + out3 = (q7_t) (__SSAT(acc3 >> 7U, 8)); + + *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3); + + /* Initialization of inputB pointer */ + pScr2 = py; + + pScratch1 += 4U; + + } + + + blkCnt = (srcALen + srcBLen - 1U) & 0x3; + + /* Calculate convolution for remaining samples of Bigger length sequence */ + while (blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + + tapCnt = (srcBLen) >> 1U; + + while (tapCnt > 0U) + { + acc0 += (*pScr1++ * *pScr2++); + acc0 += (*pScr1++ * *pScr2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (srcBLen) & 1U; + + /* apply same above for remaining samples of smaller length sequence */ + while (tapCnt > 0U) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pScr2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8)); + + /* Initialization of inputB pointer */ + pScr2 = py; + + pScratch1 += 1U; + + } + +} + + +/** + * @} end of Conv group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_f32.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_f32.c new file mode 100644 index 0000000..9eae124 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_f32.c @@ -0,0 +1,678 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_conv_partial_f32.c + * Description: Partial convolution of floating-point sequences + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup PartialConv Partial Convolution + * + * Partial Convolution is equivalent to Convolution except that a subset of the output samples is generated. + * Each function has two additional arguments. + * firstIndex specifies the starting index of the subset of output samples. + * numPoints is the number of output samples to compute. + * The function computes the output in the range + * [firstIndex, ..., firstIndex+numPoints-1]. + * The output array pDst contains numPoints values. + * + * The allowable range of output indices is [0 srcALen+srcBLen-2]. + * If the requested subset does not fall in this range then the functions return ARM_MATH_ARGUMENT_ERROR. + * Otherwise the functions return ARM_MATH_SUCCESS. + * \note Refer arm_conv_f32() for details on fixed point behavior. + * + * + * Fast Versions + * + * \par + * Fast versions are supported for Q31 and Q15 of partial convolution. Cycles for Fast versions are less compared to Q31 and Q15 of partial conv and the design requires + * the input signals should be scaled down to avoid intermediate overflows. + * + * + * Opt Versions + * + * \par + * Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation. + * These versions are optimised in cycles and consumes more memory(Scratch memory) compared to Q15 and Q7 versions of partial convolution + */ + +/** + * @addtogroup PartialConv + * @{ + */ + +/** + * @brief Partial convolution of floating-point sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + */ + +arm_status arm_conv_partial_f32( + float32_t * pSrcA, + uint32_t srcALen, + float32_t * pSrcB, + uint32_t srcBLen, + float32_t * pDst, + uint32_t firstIndex, + uint32_t numPoints) +{ + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + float32_t *pIn1 = pSrcA; /* inputA pointer */ + float32_t *pIn2 = pSrcB; /* inputB pointer */ + float32_t *pOut = pDst; /* output pointer */ + float32_t *px; /* Intermediate inputA pointer */ + float32_t *py; /* Intermediate inputB pointer */ + float32_t *pSrc1, *pSrc2; /* Intermediate pointers */ + float32_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ + float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t j, k, count = 0U, blkCnt, check; + int32_t blockSize1, blockSize2, blockSize3; /* loop counters */ + arm_status status; /* status of Partial convolution */ + + + /* Check for range of output samples to be calculated */ + if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) + { + /* Set status as ARM_MATH_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* Conditions to check which loopCounter holds + * the first and last indices of the output samples to be calculated. */ + check = firstIndex + numPoints; + blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0; + blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3; + blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex; + blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : + (int32_t) numPoints) : 0; + blockSize2 = ((int32_t) check - blockSize3) - + (blockSize1 + (int32_t) firstIndex); + blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* Set the output pointer to point to the firstIndex + * of the output sample to be calculated. */ + pOut = pDst + firstIndex; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed. + Since the partial convolution starts from from firstIndex + Number of Macs to be performed is firstIndex + 1 */ + count = 1U + firstIndex; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc1 = pIn2 + firstIndex; + py = pSrc1; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first stage starts here */ + while (blockSize1 > 0) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* x[0] * y[srcBLen - 1] */ + sum += *px++ * *py--; + + /* x[1] * y[srcBLen - 2] */ + sum += *px++ * *py--; + + /* x[2] * y[srcBLen - 3] */ + sum += *px++ * *py--; + + /* x[3] * y[srcBLen - 4] */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum; + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = ++pSrc1; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) + { + px = pIn1 + firstIndex - srcBLen + 1; + } + else + { + px = pIn1; + } + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + py = pSrc2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0U; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if (srcBLen >= 4U) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = ((uint32_t) blockSize2 >> 2U); + + while (blkCnt > 0U) + { + /* Set all accumulators to zero */ + acc0 = 0.0f; + acc1 = 0.0f; + acc2 = 0.0f; + acc3 = 0.0f; + + /* read x[0], x[1], x[2] samples */ + x0 = *(px++); + x1 = *(px++); + x2 = *(px++); + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read y[srcBLen - 1] sample */ + c0 = *(py--); + + /* Read x[3] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulate */ + /* acc0 += x[0] * y[srcBLen - 1] */ + acc0 += x0 * c0; + + /* acc1 += x[1] * y[srcBLen - 1] */ + acc1 += x1 * c0; + + /* acc2 += x[2] * y[srcBLen - 1] */ + acc2 += x2 * c0; + + /* acc3 += x[3] * y[srcBLen - 1] */ + acc3 += x3 * c0; + + /* Read y[srcBLen - 2] sample */ + c0 = *(py--); + + /* Read x[4] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + /* acc0 += x[1] * y[srcBLen - 2] */ + acc0 += x1 * c0; + /* acc1 += x[2] * y[srcBLen - 2] */ + acc1 += x2 * c0; + /* acc2 += x[3] * y[srcBLen - 2] */ + acc2 += x3 * c0; + /* acc3 += x[4] * y[srcBLen - 2] */ + acc3 += x0 * c0; + + /* Read y[srcBLen - 3] sample */ + c0 = *(py--); + + /* Read x[5] sample */ + x1 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[2] * y[srcBLen - 3] */ + acc0 += x2 * c0; + /* acc1 += x[3] * y[srcBLen - 2] */ + acc1 += x3 * c0; + /* acc2 += x[4] * y[srcBLen - 2] */ + acc2 += x0 * c0; + /* acc3 += x[5] * y[srcBLen - 2] */ + acc3 += x1 * c0; + + /* Read y[srcBLen - 4] sample */ + c0 = *(py--); + + /* Read x[6] sample */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[3] * y[srcBLen - 4] */ + acc0 += x3 * c0; + /* acc1 += x[4] * y[srcBLen - 4] */ + acc1 += x0 * c0; + /* acc2 += x[5] * y[srcBLen - 4] */ + acc2 += x1 * c0; + /* acc3 += x[6] * y[srcBLen - 4] */ + acc3 += x2 * c0; + + + } while (--k); + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Read y[srcBLen - 5] sample */ + c0 = *(py--); + + /* Read x[7] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[srcBLen - 5] */ + acc0 += x0 * c0; + /* acc1 += x[5] * y[srcBLen - 5] */ + acc1 += x1 * c0; + /* acc2 += x[6] * y[srcBLen - 5] */ + acc2 += x2 * c0; + /* acc3 += x[7] * y[srcBLen - 5] */ + acc3 += x3 * c0; + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = acc0; + *pOut++ = acc1; + *pOut++ = acc2; + *pOut++ = acc3; + + /* Increment the pointer pIn1 index, count by 1 */ + count += 4U; + + /* Update the inputA and inputB pointers for next MAC calculation */ + if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) + { + px = pIn1 + firstIndex - srcBLen + 1 + count; + } + else + { + px = pIn1 + count; + } + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = (uint32_t) blockSize2 % 0x4U; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += *px++ * *py--; + sum += *px++ * *py--; + sum += *px++ * *py--; + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) + { + px = pIn1 + firstIndex - srcBLen + 1 + count; + } + else + { + px = pIn1 + count; + } + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = (uint32_t) blockSize2; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) + { + px = pIn1 + firstIndex - srcBLen + 1 + count; + } + else + { + px = pIn1 + count; + } + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1U; + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + py = pSrc2; + + while (blockSize3 > 0) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ + sum += *px++ * *py--; + + /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ + sum += *px++ * *py--; + + /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ + sum += *px++ * *py--; + + /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen-1] * y[srcBLen-1] */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + + } + + /* set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + } + + /* Return to application */ + return (status); + +#else + + /* Run the below code for Cortex-M0 */ + + float32_t *pIn1 = pSrcA; /* inputA pointer */ + float32_t *pIn2 = pSrcB; /* inputB pointer */ + float32_t sum; /* Accumulator */ + uint32_t i, j; /* loop counters */ + arm_status status; /* status of Partial convolution */ + + /* Check for range of output samples to be calculated */ + if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) + { + /* Set status as ARM_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + /* Loop to calculate convolution for output length number of values */ + for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++) + { + /* Initialize sum with zero to carry on MAC operations */ + sum = 0.0f; + + /* Loop to perform MAC operations according to convolution equation */ + for (j = 0U; j <= i; j++) + { + /* Check the array limitations for inputs */ + if ((((i - j) < srcBLen) && (j < srcALen))) + { + /* z[i] += x[i-j] * y[j] */ + sum += pIn1[j] * pIn2[i - j]; + } + } + /* Store the output in the destination buffer */ + pDst[i] = sum; + } + /* set status as ARM_SUCCESS as there are no argument errors */ + status = ARM_MATH_SUCCESS; + } + return (status); + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + +/** + * @} end of PartialConv group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_opt_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_opt_q15.c new file mode 100644 index 0000000..f469d1f --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_opt_q15.c @@ -0,0 +1,756 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_conv_partial_fast_opt_q15.c + * Description: Fast Q15 Partial convolution + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup PartialConv + * @{ + */ + +/** + * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @param[in] *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] *pScratch2 points to scratch buffer of size min(srcALen, srcBLen). + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + * + * See arm_conv_partial_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion. + * + * \par Restrictions + * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE + * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit + * + */ + +#ifndef UNALIGNED_SUPPORT_DISABLE + +arm_status arm_conv_partial_fast_opt_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + uint32_t firstIndex, + uint32_t numPoints, + q15_t * pScratch1, + q15_t * pScratch2) +{ + + q15_t *pOut = pDst; /* output pointer */ + q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */ + q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */ + q31_t acc0, acc1, acc2, acc3; /* Accumulator */ + q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */ + q31_t y1, y2; /* State variables */ + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + uint32_t j, k, blkCnt; /* loop counter */ + arm_status status; + + uint32_t tapCnt; /* loop count */ + + /* Check for range of output samples to be calculated */ + if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) + { + /* Set status as ARM_MATH_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* Temporary pointer for scratch2 */ + py = pScratch2; + + /* pointer to take end of scratch2 buffer */ + pScr2 = pScratch2 + srcBLen - 1; + + /* points to smaller length sequence */ + px = pIn2; + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + + /* Copy smaller length input sequence in reverse order into second scratch buffer */ + while (k > 0U) + { + /* copy second buffer in reversal manner */ + *pScr2-- = *px++; + *pScr2-- = *px++; + *pScr2-- = *px++; + *pScr2-- = *px++; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr2-- = *px++; + + /* Decrement the loop counter */ + k--; + } + + /* Initialze temporary scratch pointer */ + pScr1 = pScratch1; + + /* Assuming scratch1 buffer is aligned by 32-bit */ + /* Fill (srcBLen - 1U) zeros in scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1U)); + + /* Update temporary scratch pointer */ + pScr1 += (srcBLen - 1U); + + /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */ + + /* Copy (srcALen) samples in scratch buffer */ + arm_copy_q15(pIn1, pScr1, srcALen); + + /* Update pointers */ + pScr1 += srcALen; + + /* Fill (srcBLen - 1U) zeros at end of scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1U)); + + /* Update pointer */ + pScr1 += (srcBLen - 1U); + + /* Initialization of pIn2 pointer */ + pIn2 = py; + + pScratch1 += firstIndex; + + pOut = pDst + firstIndex; + + /* First part of the processing with loop unrolling process 4 data points at a time. + ** a second loop below process for the remaining 1 to 3 samples. */ + + /* Actual convolution process starts here */ + blkCnt = (numPoints) >> 2; + + while (blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Read two samples from scratch1 buffer */ + x1 = *__SIMD32(pScr1)++; + + /* Read next two samples from scratch1 buffer */ + x2 = *__SIMD32(pScr1)++; + + tapCnt = (srcBLen) >> 2U; + + while (tapCnt > 0U) + { + + /* Read four samples from smaller buffer */ + y1 = _SIMD32_OFFSET(pIn2); + y2 = _SIMD32_OFFSET(pIn2 + 2U); + + /* multiply and accumlate */ + acc0 = __SMLAD(x1, y1, acc0); + acc2 = __SMLAD(x2, y1, acc2); + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + /* multiply and accumlate */ + acc1 = __SMLADX(x3, y1, acc1); + + /* Read next two samples from scratch1 buffer */ + x1 = _SIMD32_OFFSET(pScr1); + + /* multiply and accumlate */ + acc0 = __SMLAD(x2, y2, acc0); + + acc2 = __SMLAD(x1, y2, acc2); + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x1, x2, 0); +#else + x3 = __PKHBT(x2, x1, 0); +#endif + + acc3 = __SMLADX(x3, y1, acc3); + acc1 = __SMLADX(x3, y2, acc1); + + x2 = _SIMD32_OFFSET(pScr1 + 2U); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc3 = __SMLADX(x3, y2, acc3); + + /* update scratch pointers */ + pIn2 += 4U; + pScr1 += 4U; + + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Update scratch pointer for remaining samples of smaller length sequence */ + pScr1 -= 4U; + + /* apply same above for remaining samples of smaller length sequence */ + tapCnt = (srcBLen) & 3U; + + while (tapCnt > 0U) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pIn2); + acc1 += (*pScr1++ * *pIn2); + acc2 += (*pScr1++ * *pIn2); + acc3 += (*pScr1++ * *pIn2++); + + pScr1 -= 3U; + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + + /* Store the results in the accumulators in the destination buffer. */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); + +#else + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch1 += 4U; + + } + + + blkCnt = numPoints & 0x3; + + /* Calculate convolution for remaining samples of Bigger length sequence */ + while (blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + + tapCnt = (srcBLen) >> 1U; + + while (tapCnt > 0U) + { + + /* Read next two samples from scratch1 buffer */ + x1 = *__SIMD32(pScr1)++; + + /* Read two samples from smaller buffer */ + y1 = *__SIMD32(pIn2)++; + + acc0 = __SMLAD(x1, y1, acc0); + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (srcBLen) & 1U; + + /* apply same above for remaining samples of smaller length sequence */ + while (tapCnt > 0U) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pIn2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* The result is in 2.30 format. Convert to 1.15 with saturation. + ** Then store the output in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch1 += 1U; + + } + /* set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + } + /* Return to application */ + return (status); +} + +#else + +arm_status arm_conv_partial_fast_opt_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + uint32_t firstIndex, + uint32_t numPoints, + q15_t * pScratch1, + q15_t * pScratch2) +{ + + q15_t *pOut = pDst; /* output pointer */ + q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */ + q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */ + q31_t acc0, acc1, acc2, acc3; /* Accumulator */ + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + uint32_t j, k, blkCnt; /* loop counter */ + arm_status status; /* Status variable */ + uint32_t tapCnt; /* loop count */ + q15_t x10, x11, x20, x21; /* Temporary variables to hold srcA buffer */ + q15_t y10, y11; /* Temporary variables to hold srcB buffer */ + + + /* Check for range of output samples to be calculated */ + if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) + { + /* Set status as ARM_MATH_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* Temporary pointer for scratch2 */ + py = pScratch2; + + /* pointer to take end of scratch2 buffer */ + pScr2 = pScratch2 + srcBLen - 1; + + /* points to smaller length sequence */ + px = pIn2; + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* copy second buffer in reversal manner */ + *pScr2-- = *px++; + *pScr2-- = *px++; + *pScr2-- = *px++; + *pScr2-- = *px++; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr2-- = *px++; + + /* Decrement the loop counter */ + k--; + } + + /* Initialze temporary scratch pointer */ + pScr1 = pScratch1; + + /* Fill (srcBLen - 1U) zeros in scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1U)); + + /* Update temporary scratch pointer */ + pScr1 += (srcBLen - 1U); + + /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */ + + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcALen >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* copy second buffer in reversal manner */ + *pScr1++ = *pIn1++; + *pScr1++ = *pIn1++; + *pScr1++ = *pIn1++; + *pScr1++ = *pIn1++; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcALen % 0x4U; + + while (k > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr1++ = *pIn1++; + + /* Decrement the loop counter */ + k--; + } + + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = (srcBLen - 1U) >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* copy second buffer in reversal manner */ + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = (srcBLen - 1U) % 0x4U; + + while (k > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + + + /* Initialization of pIn2 pointer */ + pIn2 = py; + + pScratch1 += firstIndex; + + pOut = pDst + firstIndex; + + /* Actual convolution process starts here */ + blkCnt = (numPoints) >> 2; + + while (blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Read two samples from scratch1 buffer */ + x10 = *pScr1++; + x11 = *pScr1++; + + /* Read next two samples from scratch1 buffer */ + x20 = *pScr1++; + x21 = *pScr1++; + + tapCnt = (srcBLen) >> 2U; + + while (tapCnt > 0U) + { + + /* Read two samples from smaller buffer */ + y10 = *pIn2; + y11 = *(pIn2 + 1U); + + /* multiply and accumlate */ + acc0 += (q31_t) x10 *y10; + acc0 += (q31_t) x11 *y11; + acc2 += (q31_t) x20 *y10; + acc2 += (q31_t) x21 *y11; + + /* multiply and accumlate */ + acc1 += (q31_t) x11 *y10; + acc1 += (q31_t) x20 *y11; + + /* Read next two samples from scratch1 buffer */ + x10 = *pScr1; + x11 = *(pScr1 + 1U); + + /* multiply and accumlate */ + acc3 += (q31_t) x21 *y10; + acc3 += (q31_t) x10 *y11; + + /* Read next two samples from scratch2 buffer */ + y10 = *(pIn2 + 2U); + y11 = *(pIn2 + 3U); + + /* multiply and accumlate */ + acc0 += (q31_t) x20 *y10; + acc0 += (q31_t) x21 *y11; + acc2 += (q31_t) x10 *y10; + acc2 += (q31_t) x11 *y11; + acc1 += (q31_t) x21 *y10; + acc1 += (q31_t) x10 *y11; + + /* Read next two samples from scratch1 buffer */ + x20 = *(pScr1 + 2); + x21 = *(pScr1 + 3); + + /* multiply and accumlate */ + acc3 += (q31_t) x11 *y10; + acc3 += (q31_t) x20 *y11; + + /* update scratch pointers */ + pIn2 += 4U; + pScr1 += 4U; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Update scratch pointer for remaining samples of smaller length sequence */ + pScr1 -= 4U; + + /* apply same above for remaining samples of smaller length sequence */ + tapCnt = (srcBLen) & 3U; + + while (tapCnt > 0U) + { + /* accumlate the results */ + acc0 += (*pScr1++ * *pIn2); + acc1 += (*pScr1++ * *pIn2); + acc2 += (*pScr1++ * *pIn2); + acc3 += (*pScr1++ * *pIn2++); + + pScr1 -= 3U; + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + + /* Store the results in the accumulators in the destination buffer. */ + *pOut++ = __SSAT((acc0 >> 15), 16); + *pOut++ = __SSAT((acc1 >> 15), 16); + *pOut++ = __SSAT((acc2 >> 15), 16); + *pOut++ = __SSAT((acc3 >> 15), 16); + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch1 += 4U; + + } + + + blkCnt = numPoints & 0x3; + + /* Calculate convolution for remaining samples of Bigger length sequence */ + while (blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + + tapCnt = (srcBLen) >> 1U; + + while (tapCnt > 0U) + { + + /* Read next two samples from scratch1 buffer */ + x10 = *pScr1++; + x11 = *pScr1++; + + /* Read two samples from smaller buffer */ + y10 = *pIn2++; + y11 = *pIn2++; + + /* multiply and accumlate */ + acc0 += (q31_t) x10 *y10; + acc0 += (q31_t) x11 *y11; + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (srcBLen) & 1U; + + /* apply same above for remaining samples of smaller length sequence */ + while (tapCnt > 0U) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pIn2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch1 += 1U; + + } + + /* set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + + } + + /* Return to application */ + return (status); +} + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + +/** + * @} end of PartialConv group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q15.c new file mode 100644 index 0000000..0d4486a --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q15.c @@ -0,0 +1,1494 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_conv_partial_fast_q15.c + * Description: Fast Q15 Partial convolution + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup PartialConv + * @{ + */ + +/** + * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + * + * See arm_conv_partial_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion. + */ + + +arm_status arm_conv_partial_fast_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + uint32_t firstIndex, + uint32_t numPoints) +{ +#ifndef UNALIGNED_SUPPORT_DISABLE + + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *pOut = pDst; /* output pointer */ + q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q31_t x0, x1, x2, x3, c0; + uint32_t j, k, count, check, blkCnt; + int32_t blockSize1, blockSize2, blockSize3; /* loop counters */ + arm_status status; /* status of Partial convolution */ + + /* Check for range of output samples to be calculated */ + if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) + { + /* Set status as ARM_MATH_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if (srcALen >=srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* Conditions to check which loopCounter holds + * the first and last indices of the output samples to be calculated. */ + check = firstIndex + numPoints; + blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0; + blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3; + blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); + blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : + (int32_t) numPoints) : 0; + blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + + (int32_t) firstIndex); + blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* Set the output pointer to point to the firstIndex + * of the output sample to be calculated. */ + pOut = pDst + firstIndex; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed. + Since the partial convolution starts from firstIndex + Number of Macs to be performed is firstIndex + 1 */ + count = 1U + firstIndex; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + firstIndex; + py = pSrc2; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* For loop unrolling by 4, this stage is divided into two. */ + /* First part of this stage computes the MAC operations less than 4 */ + /* Second part of this stage computes the MAC operations greater than or equal to 4 */ + + /* The first part of the stage starts here */ + while ((count < 4U) && (blockSize1 > 0)) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Loop over number of MAC operations between + * inputA samples and inputB samples */ + k = count; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum = __SMLAD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = ++pSrc2; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* The second part of the stage starts here */ + /* The internal loop, over count, is unrolled by 4 */ + /* To, read the last two inputB samples using SIMD: + * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ + py = py - 1; + + while (blockSize1 > 0) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ + sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ + sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* For the next MAC operations, the pointer py is used without SIMD + * So, py is incremented by 1 */ + py = py + 1U; + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum = __SMLAD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = ++pSrc2 - 1U; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) + { + px = pIn1 + firstIndex - srcBLen + 1; + } + else + { + px = pIn1; + } + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + py = pSrc2; + + /* count is the index by which the pointer pIn1 to be incremented */ + count = 0U; + + + /* -------------------- + * Stage2 process + * -------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if (srcBLen >= 4U) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = ((uint32_t) blockSize2 >> 2U); + + while (blkCnt > 0U) + { + py = py - 1U; + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + + /* read x[0], x[1] samples */ + x0 = *__SIMD32(px); + /* read x[1], x[2] samples */ + x1 = _SIMD32_OFFSET(px+1); + px+= 2U; + + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read the last two inputB samples using SIMD: + * y[srcBLen - 1] and y[srcBLen - 2] */ + c0 = *__SIMD32(py)--; + + /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ + acc0 = __SMLADX(x0, c0, acc0); + + /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ + acc1 = __SMLADX(x1, c0, acc1); + + /* Read x[2], x[3] */ + x2 = *__SIMD32(px); + + /* Read x[3], x[4] */ + x3 = _SIMD32_OFFSET(px+1); + + /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ + acc2 = __SMLADX(x2, c0, acc2); + + /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ + acc3 = __SMLADX(x3, c0, acc3); + + /* Read y[srcBLen - 3] and y[srcBLen - 4] */ + c0 = *__SIMD32(py)--; + + /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ + acc0 = __SMLADX(x2, c0, acc0); + + /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ + acc1 = __SMLADX(x3, c0, acc1); + + /* Read x[4], x[5] */ + x0 = _SIMD32_OFFSET(px+2); + + /* Read x[5], x[6] */ + x1 = _SIMD32_OFFSET(px+3); + px += 4U; + + /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ + acc2 = __SMLADX(x0, c0, acc2); + + /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ + acc3 = __SMLADX(x1, c0, acc3); + + } while (--k); + + /* For the next MAC operations, SIMD is not used + * So, the 16 bit pointer if inputB, py is updated */ + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + if (k == 1U) + { + /* Read y[srcBLen - 5] */ + c0 = *(py+1); +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16U; + +#else + + c0 = c0 & 0x0000FFFF; + +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[7] */ + x3 = *__SIMD32(px); + px++; + + /* Perform the multiply-accumulates */ + acc0 = __SMLAD(x0, c0, acc0); + acc1 = __SMLAD(x1, c0, acc1); + acc2 = __SMLADX(x1, c0, acc2); + acc3 = __SMLADX(x3, c0, acc3); + } + + if (k == 2U) + { + /* Read y[srcBLen - 5], y[srcBLen - 6] */ + c0 = _SIMD32_OFFSET(py); + + /* Read x[7], x[8] */ + x3 = *__SIMD32(px); + + /* Read x[9] */ + x2 = _SIMD32_OFFSET(px+1); + px += 2U; + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x0, c0, acc0); + acc1 = __SMLADX(x1, c0, acc1); + acc2 = __SMLADX(x3, c0, acc2); + acc3 = __SMLADX(x2, c0, acc3); + } + + if (k == 3U) + { + /* Read y[srcBLen - 5], y[srcBLen - 6] */ + c0 = _SIMD32_OFFSET(py); + + /* Read x[7], x[8] */ + x3 = *__SIMD32(px); + + /* Read x[9] */ + x2 = _SIMD32_OFFSET(px+1); + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x0, c0, acc0); + acc1 = __SMLADX(x1, c0, acc1); + acc2 = __SMLADX(x3, c0, acc2); + acc3 = __SMLADX(x2, c0, acc3); + + c0 = *(py-1); +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16U; +#else + + c0 = c0 & 0x0000FFFF; +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[10] */ + x3 = _SIMD32_OFFSET(px+2); + px += 3U; + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x1, c0, acc0); + acc1 = __SMLAD(x2, c0, acc1); + acc2 = __SMLADX(x2, c0, acc2); + acc3 = __SMLADX(x3, c0, acc3); + } + + /* Store the results in the accumulators in the destination buffer. */ +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pOut)++ = __PKHBT(acc0 >> 15, acc1 >> 15, 16); + *__SIMD32(pOut)++ = __PKHBT(acc2 >> 15, acc3 >> 15, 16); + +#else + + *__SIMD32(pOut)++ = __PKHBT(acc1 >> 15, acc0 >> 15, 16); + *__SIMD32(pOut)++ = __PKHBT(acc3 >> 15, acc2 >> 15, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4U; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = (uint32_t) blockSize2 % 0x4U; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) + { + px = pIn1 + firstIndex - srcBLen + 1 + count; + } + else + { + px = pIn1 + count; + } + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = (uint32_t) blockSize2; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) + { + px = pIn1 + firstIndex - srcBLen + 1 + count; + } + else + { + px = pIn1 + count; + } + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1U; + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + pIn2 = pSrc2 - 1U; + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + /* For loop unrolling by 4, this stage is divided into two. */ + /* First part of this stage computes the MAC operations greater than 4 */ + /* Second part of this stage computes the MAC operations less than or equal to 4 */ + + /* The first part of the stage starts here */ + j = count >> 2U; + + while ((j > 0U) && (blockSize3 > 0)) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied + * with y[srcBLen - 1], y[srcBLen - 2] respectively */ + sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied + * with y[srcBLen - 3], y[srcBLen - 4] respectively */ + sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* For the next MAC operations, the pointer py is used without SIMD + * So, py is incremented by 1 */ + py = py + 1U; + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ + sum = __SMLAD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + + j--; + } + + /* The second part of the stage starts here */ + /* SIMD is not used for the next MAC operations, + * so pointer py is updated to read only one sample at a time */ + py = py + 1U; + + while (blockSize3 > 0) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen-1] * y[srcBLen-1] */ + sum = __SMLAD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + } + + /* set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + } + + /* Return to application */ + return (status); + +#else + + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *pOut = pDst; /* output pointer */ + q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q31_t x0, x1, x2, x3, c0; + uint32_t j, k, count, check, blkCnt; + int32_t blockSize1, blockSize2, blockSize3; /* loop counters */ + arm_status status; /* status of Partial convolution */ + q15_t a, b; + + /* Check for range of output samples to be calculated */ + if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) + { + /* Set status as ARM_MATH_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if (srcALen >=srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* Conditions to check which loopCounter holds + * the first and last indices of the output samples to be calculated. */ + check = firstIndex + numPoints; + blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0; + blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3; + blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex; + blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : + (int32_t) numPoints) : 0; + blockSize2 = ((int32_t) check - blockSize3) - + (blockSize1 + (int32_t) firstIndex); + blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* Set the output pointer to point to the firstIndex + * of the output sample to be calculated. */ + pOut = pDst + firstIndex; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed. + Since the partial convolution starts from firstIndex + Number of Macs to be performed is firstIndex + 1 */ + count = 1U + firstIndex; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + firstIndex; + py = pSrc2; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* For loop unrolling by 4, this stage is divided into two. */ + /* First part of this stage computes the MAC operations less than 4 */ + /* Second part of this stage computes the MAC operations greater than or equal to 4 */ + + /* The first part of the stage starts here */ + while ((count < 4U) && (blockSize1 > 0)) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Loop over number of MAC operations between + * inputA samples and inputB samples */ + k = count; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = ++pSrc2; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* The second part of the stage starts here */ + /* The internal loop, over count, is unrolled by 4 */ + /* To, read the last two inputB samples using SIMD: + * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ + py = py - 1; + + while (blockSize1 > 0) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + py++; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = ++pSrc2 - 1U; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) + { + px = pIn1 + firstIndex - srcBLen + 1; + } + else + { + px = pIn1; + } + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + py = pSrc2; + + /* count is the index by which the pointer pIn1 to be incremented */ + count = 0U; + + + /* -------------------- + * Stage2 process + * -------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if (srcBLen >= 4U) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = ((uint32_t) blockSize2 >> 2U); + + while (blkCnt > 0U) + { + py = py - 1U; + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* read x[0], x[1] samples */ + a = *px++; + b = *px++; + +#ifndef ARM_MATH_BIG_ENDIAN + + x0 = __PKHBT(a, b, 16); + a = *px; + x1 = __PKHBT(b, a, 16); + +#else + + x0 = __PKHBT(b, a, 16); + a = *px; + x1 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read the last two inputB samples using SIMD: + * y[srcBLen - 1] and y[srcBLen - 2] */ + a = *py; + b = *(py+1); + py -= 2; + +#ifndef ARM_MATH_BIG_ENDIAN + + c0 = __PKHBT(a, b, 16); + +#else + + c0 = __PKHBT(b, a, 16);; + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ + acc0 = __SMLADX(x0, c0, acc0); + + /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ + acc1 = __SMLADX(x1, c0, acc1); + + a = *px; + b = *(px + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + + x2 = __PKHBT(a, b, 16); + a = *(px + 2); + x3 = __PKHBT(b, a, 16); + +#else + + x2 = __PKHBT(b, a, 16); + a = *(px + 2); + x3 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ + acc2 = __SMLADX(x2, c0, acc2); + + /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ + acc3 = __SMLADX(x3, c0, acc3); + + /* Read y[srcBLen - 3] and y[srcBLen - 4] */ + a = *py; + b = *(py+1); + py -= 2; + +#ifndef ARM_MATH_BIG_ENDIAN + + c0 = __PKHBT(a, b, 16); + +#else + + c0 = __PKHBT(b, a, 16);; + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ + acc0 = __SMLADX(x2, c0, acc0); + + /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ + acc1 = __SMLADX(x3, c0, acc1); + + /* Read x[4], x[5], x[6] */ + a = *(px + 2); + b = *(px + 3); + +#ifndef ARM_MATH_BIG_ENDIAN + + x0 = __PKHBT(a, b, 16); + a = *(px + 4); + x1 = __PKHBT(b, a, 16); + +#else + + x0 = __PKHBT(b, a, 16); + a = *(px + 4); + x1 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + px += 4U; + + /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ + acc2 = __SMLADX(x0, c0, acc2); + + /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ + acc3 = __SMLADX(x1, c0, acc3); + + } while (--k); + + /* For the next MAC operations, SIMD is not used + * So, the 16 bit pointer if inputB, py is updated */ + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + if (k == 1U) + { + /* Read y[srcBLen - 5] */ + c0 = *(py+1); + +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16U; + +#else + + c0 = c0 & 0x0000FFFF; + +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[7] */ + a = *px; + b = *(px+1); + px++; + +#ifndef ARM_MATH_BIG_ENDIAN + + x3 = __PKHBT(a, b, 16); + +#else + + x3 = __PKHBT(b, a, 16);; + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + + /* Perform the multiply-accumulates */ + acc0 = __SMLAD(x0, c0, acc0); + acc1 = __SMLAD(x1, c0, acc1); + acc2 = __SMLADX(x1, c0, acc2); + acc3 = __SMLADX(x3, c0, acc3); + } + + if (k == 2U) + { + /* Read y[srcBLen - 5], y[srcBLen - 6] */ + a = *py; + b = *(py+1); + +#ifndef ARM_MATH_BIG_ENDIAN + + c0 = __PKHBT(a, b, 16); + +#else + + c0 = __PKHBT(b, a, 16);; + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Read x[7], x[8], x[9] */ + a = *px; + b = *(px + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + + x3 = __PKHBT(a, b, 16); + a = *(px + 2); + x2 = __PKHBT(b, a, 16); + +#else + + x3 = __PKHBT(b, a, 16); + a = *(px + 2); + x2 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + px += 2U; + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x0, c0, acc0); + acc1 = __SMLADX(x1, c0, acc1); + acc2 = __SMLADX(x3, c0, acc2); + acc3 = __SMLADX(x2, c0, acc3); + } + + if (k == 3U) + { + /* Read y[srcBLen - 5], y[srcBLen - 6] */ + a = *py; + b = *(py+1); + +#ifndef ARM_MATH_BIG_ENDIAN + + c0 = __PKHBT(a, b, 16); + +#else + + c0 = __PKHBT(b, a, 16);; + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Read x[7], x[8], x[9] */ + a = *px; + b = *(px + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + + x3 = __PKHBT(a, b, 16); + a = *(px + 2); + x2 = __PKHBT(b, a, 16); + +#else + + x3 = __PKHBT(b, a, 16); + a = *(px + 2); + x2 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x0, c0, acc0); + acc1 = __SMLADX(x1, c0, acc1); + acc2 = __SMLADX(x3, c0, acc2); + acc3 = __SMLADX(x2, c0, acc3); + + /* Read y[srcBLen - 7] */ + c0 = *(py-1); +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16U; +#else + + c0 = c0 & 0x0000FFFF; +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[10] */ + a = *(px+2); + b = *(px+3); + +#ifndef ARM_MATH_BIG_ENDIAN + + x3 = __PKHBT(a, b, 16); + +#else + + x3 = __PKHBT(b, a, 16);; + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + px += 3U; + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x1, c0, acc0); + acc1 = __SMLAD(x2, c0, acc1); + acc2 = __SMLADX(x2, c0, acc2); + acc3 = __SMLADX(x3, c0, acc3); + } + + /* Store the results in the accumulators in the destination buffer. */ + *pOut++ = (q15_t)(acc0 >> 15); + *pOut++ = (q15_t)(acc1 >> 15); + *pOut++ = (q15_t)(acc2 >> 15); + *pOut++ = (q15_t)(acc3 >> 15); + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4U; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = (uint32_t) blockSize2 % 0x4U; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = (uint32_t) blockSize2; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1U; + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + pIn2 = pSrc2 - 1U; + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + /* For loop unrolling by 4, this stage is divided into two. */ + /* First part of this stage computes the MAC operations greater than 4 */ + /* Second part of this stage computes the MAC operations less than or equal to 4 */ + + /* The first part of the stage starts here */ + j = count >> 2U; + + while ((j > 0U) && (blockSize3 > 0)) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + py++; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + sum += ((q31_t) * px++ * *py--); + /* Decrement the loop counter */ + k--; + } + + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + + j--; + } + + /* The second part of the stage starts here */ + /* SIMD is not used for the next MAC operations, + * so pointer py is updated to read only one sample at a time */ + py = py + 1U; + + while (blockSize3 > 0) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen-1] * y[srcBLen-1] */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (sum >> 15); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + } + + /* set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + } + + /* Return to application */ + return (status); + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ +} + +/** + * @} end of PartialConv group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q31.c new file mode 100644 index 0000000..e845947 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q31.c @@ -0,0 +1,620 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_conv_partial_fast_q31.c + * Description: Fast Q31 Partial convolution + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup PartialConv + * @{ + */ + +/** + * @brief Partial convolution of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + * + * \par + * See arm_conv_partial_q31() for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision. + */ + +arm_status arm_conv_partial_fast_q31( + q31_t * pSrcA, + uint32_t srcALen, + q31_t * pSrcB, + uint32_t srcBLen, + q31_t * pDst, + uint32_t firstIndex, + uint32_t numPoints) +{ + q31_t *pIn1; /* inputA pointer */ + q31_t *pIn2; /* inputB pointer */ + q31_t *pOut = pDst; /* output pointer */ + q31_t *px; /* Intermediate inputA pointer */ + q31_t *py; /* Intermediate inputB pointer */ + q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ + q31_t x0, x1, x2, x3, c0; + uint32_t j, k, count, check, blkCnt; + int32_t blockSize1, blockSize2, blockSize3; /* loop counters */ + arm_status status; /* status of Partial convolution */ + + + /* Check for range of output samples to be calculated */ + if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) + { + /* Set status as ARM_MATH_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* Conditions to check which loopCounter holds + * the first and last indices of the output samples to be calculated. */ + check = firstIndex + numPoints; + blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0; + blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3; + blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); + blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : + (int32_t) numPoints) : 0; + blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + + (int32_t) firstIndex); + blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* Set the output pointer to point to the firstIndex + * of the output sample to be calculated. */ + pOut = pDst + firstIndex; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed. + Since the partial convolution starts from firstIndex + Number of Macs to be performed is firstIndex + 1 */ + count = 1U + firstIndex; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + firstIndex; + py = pSrc2; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first loop starts here */ + while (blockSize1 > 0) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* x[0] * y[srcBLen - 1] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* x[1] * y[srcBLen - 2] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* x[2] * y[srcBLen - 3] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* x[3] * y[srcBLen - 4] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum << 1; + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = ++pSrc2; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) + { + px = pIn1 + firstIndex - srcBLen + 1; + } + else + { + px = pIn1; + } + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + py = pSrc2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0U; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if (srcBLen >= 4U) + { + /* Loop unroll over blockSize2 */ + blkCnt = ((uint32_t) blockSize2 >> 2U); + + while (blkCnt > 0U) + { + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* read x[0], x[1], x[2] samples */ + x0 = *(px++); + x1 = *(px++); + x2 = *(px++); + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read y[srcBLen - 1] sample */ + c0 = *(py--); + + /* Read x[3] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulate */ + /* acc0 += x[0] * y[srcBLen - 1] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); + + /* acc1 += x[1] * y[srcBLen - 1] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); + + /* acc2 += x[2] * y[srcBLen - 1] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); + + /* acc3 += x[3] * y[srcBLen - 1] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); + + /* Read y[srcBLen - 2] sample */ + c0 = *(py--); + + /* Read x[4] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + /* acc0 += x[1] * y[srcBLen - 2] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32); + /* acc1 += x[2] * y[srcBLen - 2] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32); + /* acc2 += x[3] * y[srcBLen - 2] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32); + /* acc3 += x[4] * y[srcBLen - 2] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32); + + /* Read y[srcBLen - 3] sample */ + c0 = *(py--); + + /* Read x[5] sample */ + x1 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[2] * y[srcBLen - 3] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32); + /* acc1 += x[3] * y[srcBLen - 2] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32); + /* acc2 += x[4] * y[srcBLen - 2] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32); + /* acc3 += x[5] * y[srcBLen - 2] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32); + + /* Read y[srcBLen - 4] sample */ + c0 = *(py--); + + /* Read x[6] sample */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[3] * y[srcBLen - 4] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32); + /* acc1 += x[4] * y[srcBLen - 4] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32); + /* acc2 += x[5] * y[srcBLen - 4] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32); + /* acc3 += x[6] * y[srcBLen - 4] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32); + + + } while (--k); + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Read y[srcBLen - 5] sample */ + c0 = *(py--); + + /* Read x[7] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[srcBLen - 5] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); + /* acc1 += x[5] * y[srcBLen - 5] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); + /* acc2 += x[6] * y[srcBLen - 5] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); + /* acc3 += x[7] * y[srcBLen - 5] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q31_t) (acc0 << 1); + *pOut++ = (q31_t) (acc1 << 1); + *pOut++ = (q31_t) (acc2 << 1); + *pOut++ = (q31_t) (acc3 << 1); + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4U; + + /* Update the inputA and inputB pointers for next MAC calculation */ + if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) + { + px = pIn1 + firstIndex - srcBLen + 1 + count; + } + else + { + px = pIn1 + count; + } + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = (uint32_t) blockSize2 % 0x4U; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum << 1; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) + { + px = pIn1 + firstIndex - srcBLen + 1 + count; + } + else + { + px = pIn1 + count; + } + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = (uint32_t) blockSize2; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum << 1; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) + { + px = pIn1 + firstIndex - srcBLen + 1 + count; + } + else + { + px = pIn1 + count; + } + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1U; + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + py = pSrc2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while (blockSize3 > 0) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen-1] * y[srcBLen-1] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py--))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum << 1; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + + } + + /* set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + } + + /* Return to application */ + return (status); + +} + +/** + * @} end of PartialConv group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q15.c new file mode 100644 index 0000000..78dd548 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q15.c @@ -0,0 +1,753 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_conv_partial_opt_q15.c + * Description: Partial convolution of Q15 sequences + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup PartialConv + * @{ + */ + +/** + * @brief Partial convolution of Q15 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @param[in] *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] *pScratch2 points to scratch buffer of size min(srcALen, srcBLen). + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + * + * \par Restrictions + * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE + * In this case input, output, state buffers should be aligned by 32-bit + * + * Refer to arm_conv_partial_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. + * + * + */ + +#ifndef UNALIGNED_SUPPORT_DISABLE + +arm_status arm_conv_partial_opt_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + uint32_t firstIndex, + uint32_t numPoints, + q15_t * pScratch1, + q15_t * pScratch2) +{ + + q15_t *pOut = pDst; /* output pointer */ + q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */ + q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */ + q63_t acc0, acc1, acc2, acc3; /* Accumulator */ + q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */ + q31_t y1, y2; /* State variables */ + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + uint32_t j, k, blkCnt; /* loop counter */ + arm_status status; /* Status variable */ + uint32_t tapCnt; /* loop count */ + + /* Check for range of output samples to be calculated */ + if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) + { + /* Set status as ARM_MATH_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* Temporary pointer for scratch2 */ + py = pScratch2; + + /* pointer to take end of scratch2 buffer */ + pScr2 = pScratch2 + srcBLen - 1; + + /* points to smaller length sequence */ + px = pIn2; + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* copy second buffer in reversal manner */ + *pScr2-- = *px++; + *pScr2-- = *px++; + *pScr2-- = *px++; + *pScr2-- = *px++; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr2-- = *px++; + + /* Decrement the loop counter */ + k--; + } + + /* Initialze temporary scratch pointer */ + pScr1 = pScratch1; + + /* Fill (srcBLen - 1U) zeros in scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1U)); + + /* Update temporary scratch pointer */ + pScr1 += (srcBLen - 1U); + + /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */ + + /* Copy (srcALen) samples in scratch buffer */ + arm_copy_q15(pIn1, pScr1, srcALen); + + /* Update pointers */ + pScr1 += srcALen; + + /* Fill (srcBLen - 1U) zeros at end of scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1U)); + + /* Update pointer */ + pScr1 += (srcBLen - 1U); + + /* Initialization of pIn2 pointer */ + pIn2 = py; + + pScratch1 += firstIndex; + + pOut = pDst + firstIndex; + + /* Actual convolution process starts here */ + blkCnt = (numPoints) >> 2; + + while (blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Read two samples from scratch1 buffer */ + x1 = *__SIMD32(pScr1)++; + + /* Read next two samples from scratch1 buffer */ + x2 = *__SIMD32(pScr1)++; + + tapCnt = (srcBLen) >> 2U; + + while (tapCnt > 0U) + { + + /* Read four samples from smaller buffer */ + y1 = _SIMD32_OFFSET(pIn2); + y2 = _SIMD32_OFFSET(pIn2 + 2U); + + /* multiply and accumlate */ + acc0 = __SMLALD(x1, y1, acc0); + acc2 = __SMLALD(x2, y1, acc2); + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + /* multiply and accumlate */ + acc1 = __SMLALDX(x3, y1, acc1); + + /* Read next two samples from scratch1 buffer */ + x1 = _SIMD32_OFFSET(pScr1); + + /* multiply and accumlate */ + acc0 = __SMLALD(x2, y2, acc0); + acc2 = __SMLALD(x1, y2, acc2); + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x1, x2, 0); +#else + x3 = __PKHBT(x2, x1, 0); +#endif + + acc3 = __SMLALDX(x3, y1, acc3); + acc1 = __SMLALDX(x3, y2, acc1); + + x2 = _SIMD32_OFFSET(pScr1 + 2U); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc3 = __SMLALDX(x3, y2, acc3); + + /* update scratch pointers */ + pIn2 += 4U; + pScr1 += 4U; + + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Update scratch pointer for remaining samples of smaller length sequence */ + pScr1 -= 4U; + + /* apply same above for remaining samples of smaller length sequence */ + tapCnt = (srcBLen) & 3U; + + while (tapCnt > 0U) + { + /* accumlate the results */ + acc0 += (*pScr1++ * *pIn2); + acc1 += (*pScr1++ * *pIn2); + acc2 += (*pScr1++ * *pIn2); + acc3 += (*pScr1++ * *pIn2++); + + pScr1 -= 3U; + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + + /* Store the results in the accumulators in the destination buffer. */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); + +#else + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch1 += 4U; + + } + + + blkCnt = numPoints & 0x3; + + /* Calculate convolution for remaining samples of Bigger length sequence */ + while (blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + + tapCnt = (srcBLen) >> 1U; + + while (tapCnt > 0U) + { + + /* Read next two samples from scratch1 buffer */ + x1 = *__SIMD32(pScr1)++; + + /* Read two samples from smaller buffer */ + y1 = *__SIMD32(pIn2)++; + + acc0 = __SMLALD(x1, y1, acc0); + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (srcBLen) & 1U; + + /* apply same above for remaining samples of smaller length sequence */ + while (tapCnt > 0U) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pIn2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch1 += 1U; + + } + + /* set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + + } + + /* Return to application */ + return (status); +} + +#else + +arm_status arm_conv_partial_opt_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + uint32_t firstIndex, + uint32_t numPoints, + q15_t * pScratch1, + q15_t * pScratch2) +{ + + q15_t *pOut = pDst; /* output pointer */ + q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */ + q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */ + q63_t acc0, acc1, acc2, acc3; /* Accumulator */ + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + uint32_t j, k, blkCnt; /* loop counter */ + arm_status status; /* Status variable */ + uint32_t tapCnt; /* loop count */ + q15_t x10, x11, x20, x21; /* Temporary variables to hold srcA buffer */ + q15_t y10, y11; /* Temporary variables to hold srcB buffer */ + + + /* Check for range of output samples to be calculated */ + if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) + { + /* Set status as ARM_MATH_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* Temporary pointer for scratch2 */ + py = pScratch2; + + /* pointer to take end of scratch2 buffer */ + pScr2 = pScratch2 + srcBLen - 1; + + /* points to smaller length sequence */ + px = pIn2; + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* copy second buffer in reversal manner */ + *pScr2-- = *px++; + *pScr2-- = *px++; + *pScr2-- = *px++; + *pScr2-- = *px++; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr2-- = *px++; + + /* Decrement the loop counter */ + k--; + } + + /* Initialze temporary scratch pointer */ + pScr1 = pScratch1; + + /* Fill (srcBLen - 1U) zeros in scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1U)); + + /* Update temporary scratch pointer */ + pScr1 += (srcBLen - 1U); + + /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */ + + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcALen >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* copy second buffer in reversal manner */ + *pScr1++ = *pIn1++; + *pScr1++ = *pIn1++; + *pScr1++ = *pIn1++; + *pScr1++ = *pIn1++; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcALen % 0x4U; + + while (k > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr1++ = *pIn1++; + + /* Decrement the loop counter */ + k--; + } + + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = (srcBLen - 1U) >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* copy second buffer in reversal manner */ + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = (srcBLen - 1U) % 0x4U; + + while (k > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + + + /* Initialization of pIn2 pointer */ + pIn2 = py; + + pScratch1 += firstIndex; + + pOut = pDst + firstIndex; + + /* Actual convolution process starts here */ + blkCnt = (numPoints) >> 2; + + while (blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Read two samples from scratch1 buffer */ + x10 = *pScr1++; + x11 = *pScr1++; + + /* Read next two samples from scratch1 buffer */ + x20 = *pScr1++; + x21 = *pScr1++; + + tapCnt = (srcBLen) >> 2U; + + while (tapCnt > 0U) + { + + /* Read two samples from smaller buffer */ + y10 = *pIn2; + y11 = *(pIn2 + 1U); + + /* multiply and accumlate */ + acc0 += (q63_t) x10 *y10; + acc0 += (q63_t) x11 *y11; + acc2 += (q63_t) x20 *y10; + acc2 += (q63_t) x21 *y11; + + /* multiply and accumlate */ + acc1 += (q63_t) x11 *y10; + acc1 += (q63_t) x20 *y11; + + /* Read next two samples from scratch1 buffer */ + x10 = *pScr1; + x11 = *(pScr1 + 1U); + + /* multiply and accumlate */ + acc3 += (q63_t) x21 *y10; + acc3 += (q63_t) x10 *y11; + + /* Read next two samples from scratch2 buffer */ + y10 = *(pIn2 + 2U); + y11 = *(pIn2 + 3U); + + /* multiply and accumlate */ + acc0 += (q63_t) x20 *y10; + acc0 += (q63_t) x21 *y11; + acc2 += (q63_t) x10 *y10; + acc2 += (q63_t) x11 *y11; + acc1 += (q63_t) x21 *y10; + acc1 += (q63_t) x10 *y11; + + /* Read next two samples from scratch1 buffer */ + x20 = *(pScr1 + 2); + x21 = *(pScr1 + 3); + + /* multiply and accumlate */ + acc3 += (q63_t) x11 *y10; + acc3 += (q63_t) x20 *y11; + + /* update scratch pointers */ + pIn2 += 4U; + pScr1 += 4U; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Update scratch pointer for remaining samples of smaller length sequence */ + pScr1 -= 4U; + + /* apply same above for remaining samples of smaller length sequence */ + tapCnt = (srcBLen) & 3U; + + while (tapCnt > 0U) + { + /* accumlate the results */ + acc0 += (*pScr1++ * *pIn2); + acc1 += (*pScr1++ * *pIn2); + acc2 += (*pScr1++ * *pIn2); + acc3 += (*pScr1++ * *pIn2++); + + pScr1 -= 3U; + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + + /* Store the results in the accumulators in the destination buffer. */ + *pOut++ = __SSAT((acc0 >> 15), 16); + *pOut++ = __SSAT((acc1 >> 15), 16); + *pOut++ = __SSAT((acc2 >> 15), 16); + *pOut++ = __SSAT((acc3 >> 15), 16); + + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch1 += 4U; + + } + + + blkCnt = numPoints & 0x3; + + /* Calculate convolution for remaining samples of Bigger length sequence */ + while (blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + + tapCnt = (srcBLen) >> 1U; + + while (tapCnt > 0U) + { + + /* Read next two samples from scratch1 buffer */ + x10 = *pScr1++; + x11 = *pScr1++; + + /* Read two samples from smaller buffer */ + y10 = *pIn2++; + y11 = *pIn2++; + + /* multiply and accumlate */ + acc0 += (q63_t) x10 *y10; + acc0 += (q63_t) x11 *y11; + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (srcBLen) & 1U; + + /* apply same above for remaining samples of smaller length sequence */ + while (tapCnt > 0U) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pIn2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch1 += 1U; + + } + + /* set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + + } + + /* Return to application */ + return (status); +} + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + +/** + * @} end of PartialConv group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q7.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q7.c new file mode 100644 index 0000000..351c290 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q7.c @@ -0,0 +1,791 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_conv_partial_opt_q7.c + * Description: Partial convolution of Q7 sequences + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup PartialConv + * @{ + */ + +/** + * @brief Partial convolution of Q7 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @param[in] *pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen). + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + * + * \par Restrictions + * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE + * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit + * + * + * + */ + + +#ifndef UNALIGNED_SUPPORT_DISABLE + +arm_status arm_conv_partial_opt_q7( + q7_t * pSrcA, + uint32_t srcALen, + q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst, + uint32_t firstIndex, + uint32_t numPoints, + q15_t * pScratch1, + q15_t * pScratch2) +{ + + q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */ + q15_t x4; /* Temporary input variable */ + q7_t *pIn1, *pIn2; /* inputA and inputB pointer */ + uint32_t j, k, blkCnt, tapCnt; /* loop counter */ + q7_t *px; /* Temporary input1 pointer */ + q15_t *py; /* Temporary input2 pointer */ + q31_t acc0, acc1, acc2, acc3; /* Accumulator */ + q31_t x1, x2, x3, y1; /* Temporary input variables */ + arm_status status; + q7_t *pOut = pDst; /* output pointer */ + q7_t out0, out1, out2, out3; /* temporary variables */ + + /* Check for range of output samples to be calculated */ + if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) + { + /* Set status as ARM_MATH_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* pointer to take end of scratch2 buffer */ + pScr2 = pScratch2; + + /* points to smaller length sequence */ + px = pIn2 + srcBLen - 1; + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* copy second buffer in reversal manner */ + x4 = (q15_t) * px--; + *pScr2++ = x4; + x4 = (q15_t) * px--; + *pScr2++ = x4; + x4 = (q15_t) * px--; + *pScr2++ = x4; + x4 = (q15_t) * px--; + *pScr2++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + x4 = (q15_t) * px--; + *pScr2++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* Initialze temporary scratch pointer */ + pScr1 = pScratch1; + + /* Fill (srcBLen - 1U) zeros in scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1U)); + + /* Update temporary scratch pointer */ + pScr1 += (srcBLen - 1U); + + /* Copy (srcALen) samples in scratch buffer */ + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcALen >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* copy second buffer in reversal manner */ + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcALen % 0x4U; + + while (k > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* Fill (srcBLen - 1U) zeros at end of scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1U)); + + /* Update pointer */ + pScr1 += (srcBLen - 1U); + + + /* Temporary pointer for scratch2 */ + py = pScratch2; + + /* Initialization of pIn2 pointer */ + pIn2 = (q7_t *) py; + + pScr2 = py; + + pOut = pDst + firstIndex; + + pScratch1 += firstIndex; + + /* Actual convolution process starts here */ + blkCnt = (numPoints) >> 2; + + + while (blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Read two samples from scratch1 buffer */ + x1 = *__SIMD32(pScr1)++; + + /* Read next two samples from scratch1 buffer */ + x2 = *__SIMD32(pScr1)++; + + tapCnt = (srcBLen) >> 2U; + + while (tapCnt > 0U) + { + + /* Read four samples from smaller buffer */ + y1 = _SIMD32_OFFSET(pScr2); + + /* multiply and accumlate */ + acc0 = __SMLAD(x1, y1, acc0); + acc2 = __SMLAD(x2, y1, acc2); + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + /* multiply and accumlate */ + acc1 = __SMLADX(x3, y1, acc1); + + /* Read next two samples from scratch1 buffer */ + x1 = *__SIMD32(pScr1)++; + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x1, x2, 0); +#else + x3 = __PKHBT(x2, x1, 0); +#endif + + acc3 = __SMLADX(x3, y1, acc3); + + /* Read four samples from smaller buffer */ + y1 = _SIMD32_OFFSET(pScr2 + 2U); + + acc0 = __SMLAD(x2, y1, acc0); + + acc2 = __SMLAD(x1, y1, acc2); + + acc1 = __SMLADX(x3, y1, acc1); + + x2 = *__SIMD32(pScr1)++; + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc3 = __SMLADX(x3, y1, acc3); + + pScr2 += 4U; + + + /* Decrement the loop counter */ + tapCnt--; + } + + + + /* Update scratch pointer for remaining samples of smaller length sequence */ + pScr1 -= 4U; + + + /* apply same above for remaining samples of smaller length sequence */ + tapCnt = (srcBLen) & 3U; + + while (tapCnt > 0U) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pScr2); + acc1 += (*pScr1++ * *pScr2); + acc2 += (*pScr1++ * *pScr2); + acc3 += (*pScr1++ * *pScr2++); + + pScr1 -= 3U; + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* Store the result in the accumulator in the destination buffer. */ + out0 = (q7_t) (__SSAT(acc0 >> 7U, 8)); + out1 = (q7_t) (__SSAT(acc1 >> 7U, 8)); + out2 = (q7_t) (__SSAT(acc2 >> 7U, 8)); + out3 = (q7_t) (__SSAT(acc3 >> 7U, 8)); + + *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3); + + /* Initialization of inputB pointer */ + pScr2 = py; + + pScratch1 += 4U; + + } + + blkCnt = (numPoints) & 0x3; + + /* Calculate convolution for remaining samples of Bigger length sequence */ + while (blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + + tapCnt = (srcBLen) >> 1U; + + while (tapCnt > 0U) + { + + /* Read next two samples from scratch1 buffer */ + x1 = *__SIMD32(pScr1)++; + + /* Read two samples from smaller buffer */ + y1 = *__SIMD32(pScr2)++; + + acc0 = __SMLAD(x1, y1, acc0); + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (srcBLen) & 1U; + + /* apply same above for remaining samples of smaller length sequence */ + while (tapCnt > 0U) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pScr2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8)); + + /* Initialization of inputB pointer */ + pScr2 = py; + + pScratch1 += 1U; + + } + + /* set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + + + } + + return (status); + +} + +#else + +arm_status arm_conv_partial_opt_q7( + q7_t * pSrcA, + uint32_t srcALen, + q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst, + uint32_t firstIndex, + uint32_t numPoints, + q15_t * pScratch1, + q15_t * pScratch2) +{ + + q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */ + q15_t x4; /* Temporary input variable */ + q7_t *pIn1, *pIn2; /* inputA and inputB pointer */ + uint32_t j, k, blkCnt, tapCnt; /* loop counter */ + q7_t *px; /* Temporary input1 pointer */ + q15_t *py; /* Temporary input2 pointer */ + q31_t acc0, acc1, acc2, acc3; /* Accumulator */ + arm_status status; + q7_t *pOut = pDst; /* output pointer */ + q15_t x10, x11, x20, x21; /* Temporary input variables */ + q15_t y10, y11; /* Temporary input variables */ + + /* Check for range of output samples to be calculated */ + if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) + { + /* Set status as ARM_MATH_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* pointer to take end of scratch2 buffer */ + pScr2 = pScratch2; + + /* points to smaller length sequence */ + px = pIn2 + srcBLen - 1; + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* copy second buffer in reversal manner */ + x4 = (q15_t) * px--; + *pScr2++ = x4; + x4 = (q15_t) * px--; + *pScr2++ = x4; + x4 = (q15_t) * px--; + *pScr2++ = x4; + x4 = (q15_t) * px--; + *pScr2++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + x4 = (q15_t) * px--; + *pScr2++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* Initialze temporary scratch pointer */ + pScr1 = pScratch1; + + /* Fill (srcBLen - 1U) zeros in scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1U)); + + /* Update temporary scratch pointer */ + pScr1 += (srcBLen - 1U); + + /* Copy (srcALen) samples in scratch buffer */ + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = srcALen >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* copy second buffer in reversal manner */ + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcALen % 0x4U; + + while (k > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + k = (srcBLen - 1U) >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* copy second buffer in reversal manner */ + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = (srcBLen - 1U) % 0x4U; + + while (k > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + + + /* Temporary pointer for scratch2 */ + py = pScratch2; + + /* Initialization of pIn2 pointer */ + pIn2 = (q7_t *) py; + + pScr2 = py; + + pOut = pDst + firstIndex; + + pScratch1 += firstIndex; + + /* Actual convolution process starts here */ + blkCnt = (numPoints) >> 2; + + + while (blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Read two samples from scratch1 buffer */ + x10 = *pScr1++; + x11 = *pScr1++; + + /* Read next two samples from scratch1 buffer */ + x20 = *pScr1++; + x21 = *pScr1++; + + tapCnt = (srcBLen) >> 2U; + + while (tapCnt > 0U) + { + + /* Read four samples from smaller buffer */ + y10 = *pScr2; + y11 = *(pScr2 + 1U); + + /* multiply and accumlate */ + acc0 += (q31_t) x10 *y10; + acc0 += (q31_t) x11 *y11; + acc2 += (q31_t) x20 *y10; + acc2 += (q31_t) x21 *y11; + + + acc1 += (q31_t) x11 *y10; + acc1 += (q31_t) x20 *y11; + + /* Read next two samples from scratch1 buffer */ + x10 = *pScr1; + x11 = *(pScr1 + 1U); + + /* multiply and accumlate */ + acc3 += (q31_t) x21 *y10; + acc3 += (q31_t) x10 *y11; + + /* Read next two samples from scratch2 buffer */ + y10 = *(pScr2 + 2U); + y11 = *(pScr2 + 3U); + + /* multiply and accumlate */ + acc0 += (q31_t) x20 *y10; + acc0 += (q31_t) x21 *y11; + acc2 += (q31_t) x10 *y10; + acc2 += (q31_t) x11 *y11; + acc1 += (q31_t) x21 *y10; + acc1 += (q31_t) x10 *y11; + + /* Read next two samples from scratch1 buffer */ + x20 = *(pScr1 + 2); + x21 = *(pScr1 + 3); + + /* multiply and accumlate */ + acc3 += (q31_t) x11 *y10; + acc3 += (q31_t) x20 *y11; + + /* update scratch pointers */ + + pScr1 += 4U; + pScr2 += 4U; + + /* Decrement the loop counter */ + tapCnt--; + } + + + + /* Update scratch pointer for remaining samples of smaller length sequence */ + pScr1 -= 4U; + + + /* apply same above for remaining samples of smaller length sequence */ + tapCnt = (srcBLen) & 3U; + + while (tapCnt > 0U) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pScr2); + acc1 += (*pScr1++ * *pScr2); + acc2 += (*pScr1++ * *pScr2); + acc3 += (*pScr1++ * *pScr2++); + + pScr1 -= 3U; + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8)); + *pOut++ = (q7_t) (__SSAT(acc1 >> 7U, 8)); + *pOut++ = (q7_t) (__SSAT(acc2 >> 7U, 8)); + *pOut++ = (q7_t) (__SSAT(acc3 >> 7U, 8)); + + /* Initialization of inputB pointer */ + pScr2 = py; + + pScratch1 += 4U; + + } + + blkCnt = (numPoints) & 0x3; + + /* Calculate convolution for remaining samples of Bigger length sequence */ + while (blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + + tapCnt = (srcBLen) >> 1U; + + while (tapCnt > 0U) + { + + /* Read next two samples from scratch1 buffer */ + x10 = *pScr1++; + x11 = *pScr1++; + + /* Read two samples from smaller buffer */ + y10 = *pScr2++; + y11 = *pScr2++; + + /* multiply and accumlate */ + acc0 += (q31_t) x10 *y10; + acc0 += (q31_t) x11 *y11; + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (srcBLen) & 1U; + + /* apply same above for remaining samples of smaller length sequence */ + while (tapCnt > 0U) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pScr2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8)); + + /* Initialization of inputB pointer */ + pScr2 = py; + + pScratch1 += 1U; + + } + + /* set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + + } + + return (status); + +} + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + + +/** + * @} end of PartialConv group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q15.c new file mode 100644 index 0000000..43d2b35 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q15.c @@ -0,0 +1,795 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_conv_partial_q15.c + * Description: Partial convolution of Q15 sequences + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup PartialConv + * @{ + */ + +/** + * @brief Partial convolution of Q15 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + * + * Refer to arm_conv_partial_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. + * + * \par + * Refer the function arm_conv_partial_opt_q15() for a faster implementation of this function using scratch buffers. + * + */ + +arm_status arm_conv_partial_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + uint32_t firstIndex, + uint32_t numPoints) +{ + + +#if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *pOut = pDst; /* output pointer */ + q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q31_t x0, x1, x2, x3, c0; /* Temporary input variables */ + uint32_t j, k, count, check, blkCnt; + int32_t blockSize1, blockSize2, blockSize3; /* loop counter */ + arm_status status; /* status of Partial convolution */ + + /* Check for range of output samples to be calculated */ + if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) + { + /* Set status as ARM_MATH_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* Conditions to check which loopCounter holds + * the first and last indices of the output samples to be calculated. */ + check = firstIndex + numPoints; + blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0; + blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3; + blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); + blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : + (int32_t) numPoints) : 0; + blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + + (int32_t) firstIndex); + blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* Set the output pointer to point to the firstIndex + * of the output sample to be calculated. */ + pOut = pDst + firstIndex; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed. + Since the partial convolution starts from firstIndex + Number of Macs to be performed is firstIndex + 1 */ + count = 1U + firstIndex; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + firstIndex; + py = pSrc2; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* For loop unrolling by 4, this stage is divided into two. */ + /* First part of this stage computes the MAC operations less than 4 */ + /* Second part of this stage computes the MAC operations greater than or equal to 4 */ + + /* The first part of the stage starts here */ + while ((count < 4U) && (blockSize1 > 0)) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Loop over number of MAC operations between + * inputA samples and inputB samples */ + k = count; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum = __SMLALD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = ++pSrc2; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* The second part of the stage starts here */ + /* The internal loop, over count, is unrolled by 4 */ + /* To, read the last two inputB samples using SIMD: + * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ + py = py - 1; + + while (blockSize1 > 0) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ + sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ + sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* For the next MAC operations, the pointer py is used without SIMD + * So, py is incremented by 1 */ + py = py + 1U; + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum = __SMLALD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = ++pSrc2 - 1U; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) + { + px = pIn1 + firstIndex - srcBLen + 1; + } + else + { + px = pIn1; + } + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + py = pSrc2; + + /* count is the index by which the pointer pIn1 to be incremented */ + count = 0U; + + + /* -------------------- + * Stage2 process + * -------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if (srcBLen >= 4U) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2U; + + while (blkCnt > 0U) + { + py = py - 1U; + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + + /* read x[0], x[1] samples */ + x0 = *__SIMD32(px); + /* read x[1], x[2] samples */ + x1 = _SIMD32_OFFSET(px+1); + px+= 2U; + + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read the last two inputB samples using SIMD: + * y[srcBLen - 1] and y[srcBLen - 2] */ + c0 = *__SIMD32(py)--; + + /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ + acc0 = __SMLALDX(x0, c0, acc0); + + /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ + acc1 = __SMLALDX(x1, c0, acc1); + + /* Read x[2], x[3] */ + x2 = *__SIMD32(px); + + /* Read x[3], x[4] */ + x3 = _SIMD32_OFFSET(px+1); + + /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ + acc2 = __SMLALDX(x2, c0, acc2); + + /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ + acc3 = __SMLALDX(x3, c0, acc3); + + /* Read y[srcBLen - 3] and y[srcBLen - 4] */ + c0 = *__SIMD32(py)--; + + /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ + acc0 = __SMLALDX(x2, c0, acc0); + + /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ + acc1 = __SMLALDX(x3, c0, acc1); + + /* Read x[4], x[5] */ + x0 = _SIMD32_OFFSET(px+2); + + /* Read x[5], x[6] */ + x1 = _SIMD32_OFFSET(px+3); + px += 4U; + + /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ + acc2 = __SMLALDX(x0, c0, acc2); + + /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ + acc3 = __SMLALDX(x1, c0, acc3); + + } while (--k); + + /* For the next MAC operations, SIMD is not used + * So, the 16 bit pointer if inputB, py is updated */ + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + if (k == 1U) + { + /* Read y[srcBLen - 5] */ + c0 = *(py+1); + +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16U; + +#else + + c0 = c0 & 0x0000FFFF; + +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[7] */ + x3 = *__SIMD32(px); + px++; + + /* Perform the multiply-accumulates */ + acc0 = __SMLALD(x0, c0, acc0); + acc1 = __SMLALD(x1, c0, acc1); + acc2 = __SMLALDX(x1, c0, acc2); + acc3 = __SMLALDX(x3, c0, acc3); + } + + if (k == 2U) + { + /* Read y[srcBLen - 5], y[srcBLen - 6] */ + c0 = _SIMD32_OFFSET(py); + + /* Read x[7], x[8] */ + x3 = *__SIMD32(px); + + /* Read x[9] */ + x2 = _SIMD32_OFFSET(px+1); + px += 2U; + + /* Perform the multiply-accumulates */ + acc0 = __SMLALDX(x0, c0, acc0); + acc1 = __SMLALDX(x1, c0, acc1); + acc2 = __SMLALDX(x3, c0, acc2); + acc3 = __SMLALDX(x2, c0, acc3); + } + + if (k == 3U) + { + /* Read y[srcBLen - 5], y[srcBLen - 6] */ + c0 = _SIMD32_OFFSET(py); + + /* Read x[7], x[8] */ + x3 = *__SIMD32(px); + + /* Read x[9] */ + x2 = _SIMD32_OFFSET(px+1); + + /* Perform the multiply-accumulates */ + acc0 = __SMLALDX(x0, c0, acc0); + acc1 = __SMLALDX(x1, c0, acc1); + acc2 = __SMLALDX(x3, c0, acc2); + acc3 = __SMLALDX(x2, c0, acc3); + + c0 = *(py-1); + +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16U; +#else + + c0 = c0 & 0x0000FFFF; +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[10] */ + x3 = _SIMD32_OFFSET(px+2); + px += 3U; + + /* Perform the multiply-accumulates */ + acc0 = __SMLALDX(x1, c0, acc0); + acc1 = __SMLALD(x2, c0, acc1); + acc2 = __SMLALDX(x2, c0, acc2); + acc3 = __SMLALDX(x3, c0, acc3); + } + + + /* Store the results in the accumulators in the destination buffer. */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); + +#else + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4U; + + /* Update the inputA and inputB pointers for next MAC calculation */ + if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) + { + px = pIn1 + firstIndex - srcBLen + 1 + count; + } + else + { + px = pIn1 + count; + } + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = (uint32_t) blockSize2 % 0x4U; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += (q63_t) ((q31_t) * px++ * *py--); + sum += (q63_t) ((q31_t) * px++ * *py--); + sum += (q63_t) ((q31_t) * px++ * *py--); + sum += (q63_t) ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += (q63_t) ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) + { + px = pIn1 + firstIndex - srcBLen + 1 + count; + } + else + { + px = pIn1 + count; + } + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = (uint32_t) blockSize2; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) + { + px = pIn1 + firstIndex - srcBLen + 1 + count; + } + else + { + px = pIn1 + count; + } + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1U; + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + pIn2 = pSrc2 - 1U; + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + /* For loop unrolling by 4, this stage is divided into two. */ + /* First part of this stage computes the MAC operations greater than 4 */ + /* Second part of this stage computes the MAC operations less than or equal to 4 */ + + /* The first part of the stage starts here */ + j = count >> 2U; + + while ((j > 0U) && (blockSize3 > 0)) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied + * with y[srcBLen - 1], y[srcBLen - 2] respectively */ + sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied + * with y[srcBLen - 3], y[srcBLen - 4] respectively */ + sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* For the next MAC operations, the pointer py is used without SIMD + * So, py is incremented by 1 */ + py = py + 1U; + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ + sum = __SMLALD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + + j--; + } + + /* The second part of the stage starts here */ + /* SIMD is not used for the next MAC operations, + * so pointer py is updated to read only one sample at a time */ + py = py + 1U; + + while (blockSize3 > 0) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen-1] * y[srcBLen-1] */ + sum = __SMLALD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + } + + /* set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + } + + /* Return to application */ + return (status); + +#else + + /* Run the below code for Cortex-M0 */ + + q15_t *pIn1 = pSrcA; /* inputA pointer */ + q15_t *pIn2 = pSrcB; /* inputB pointer */ + q63_t sum; /* Accumulator */ + uint32_t i, j; /* loop counters */ + arm_status status; /* status of Partial convolution */ + + /* Check for range of output samples to be calculated */ + if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) + { + /* Set status as ARM_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + /* Loop to calculate convolution for output length number of values */ + for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++) + { + /* Initialize sum with zero to carry on MAC operations */ + sum = 0; + + /* Loop to perform MAC operations according to convolution equation */ + for (j = 0; j <= i; j++) + { + /* Check the array limitations */ + if (((i - j) < srcBLen) && (j < srcALen)) + { + /* z[i] += x[i-j] * y[j] */ + sum += ((q31_t) pIn1[j] * (pIn2[i - j])); + } + } + + /* Store the output in the destination buffer */ + pDst[i] = (q15_t) __SSAT((sum >> 15U), 16U); + } + /* set status as ARM_SUCCESS as there are no argument errors */ + status = ARM_MATH_SUCCESS; + } + return (status); + +#endif /* #if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */ + +} + +/** + * @} end of PartialConv group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q31.c new file mode 100644 index 0000000..3a108e0 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q31.c @@ -0,0 +1,616 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_conv_partial_q31.c + * Description: Partial convolution of Q31 sequences + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup PartialConv + * @{ + */ + +/** + * @brief Partial convolution of Q31 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + * + * See arm_conv_partial_fast_q31() for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4. + */ + +arm_status arm_conv_partial_q31( + q31_t * pSrcA, + uint32_t srcALen, + q31_t * pSrcB, + uint32_t srcBLen, + q31_t * pDst, + uint32_t firstIndex, + uint32_t numPoints) +{ + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q31_t *pIn1; /* inputA pointer */ + q31_t *pIn2; /* inputB pointer */ + q31_t *pOut = pDst; /* output pointer */ + q31_t *px; /* Intermediate inputA pointer */ + q31_t *py; /* Intermediate inputB pointer */ + q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q63_t sum, acc0, acc1, acc2; /* Accumulator */ + q31_t x0, x1, x2, c0; + uint32_t j, k, count, check, blkCnt; + int32_t blockSize1, blockSize2, blockSize3; /* loop counter */ + arm_status status; /* status of Partial convolution */ + + + /* Check for range of output samples to be calculated */ + if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) + { + /* Set status as ARM_MATH_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* Conditions to check which loopCounter holds + * the first and last indices of the output samples to be calculated. */ + check = firstIndex + numPoints; + blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0; + blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3; + blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); + blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : + (int32_t) numPoints) : 0; + blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + + (int32_t) firstIndex); + blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* Set the output pointer to point to the firstIndex + * of the output sample to be calculated. */ + pOut = pDst + firstIndex; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed. + Since the partial convolution starts from firstIndex + Number of Macs to be performed is firstIndex + 1 */ + count = 1U + firstIndex; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + firstIndex; + py = pSrc2; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first loop starts here */ + while (blockSize1 > 0) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* x[0] * y[srcBLen - 1] */ + sum += (q63_t) * px++ * (*py--); + /* x[1] * y[srcBLen - 2] */ + sum += (q63_t) * px++ * (*py--); + /* x[2] * y[srcBLen - 3] */ + sum += (q63_t) * px++ * (*py--); + /* x[3] * y[srcBLen - 4] */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q31_t) (sum >> 31); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = ++pSrc2; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) + { + px = pIn1 + firstIndex - srcBLen + 1; + } + else + { + px = pIn1; + } + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + py = pSrc2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0U; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if (srcBLen >= 4U) + { + /* Loop unroll over blkCnt */ + + blkCnt = blockSize2 / 3; + while (blkCnt > 0U) + { + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + + /* read x[0], x[1] samples */ + x0 = *(px++); + x1 = *(px++); + + /* Apply loop unrolling and compute 3 MACs simultaneously. */ + k = srcBLen / 3; + + /* First part of the processing with loop unrolling. Compute 3 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 2 samples. */ + do + { + /* Read y[srcBLen - 1] sample */ + c0 = *(py); + + /* Read x[2] sample */ + x2 = *(px); + + /* Perform the multiply-accumulates */ + /* acc0 += x[0] * y[srcBLen - 1] */ + acc0 += (q63_t) x0 *c0; + /* acc1 += x[1] * y[srcBLen - 1] */ + acc1 += (q63_t) x1 *c0; + /* acc2 += x[2] * y[srcBLen - 1] */ + acc2 += (q63_t) x2 *c0; + + /* Read y[srcBLen - 2] sample */ + c0 = *(py - 1U); + + /* Read x[3] sample */ + x0 = *(px + 1U); + + /* Perform the multiply-accumulate */ + /* acc0 += x[1] * y[srcBLen - 2] */ + acc0 += (q63_t) x1 *c0; + /* acc1 += x[2] * y[srcBLen - 2] */ + acc1 += (q63_t) x2 *c0; + /* acc2 += x[3] * y[srcBLen - 2] */ + acc2 += (q63_t) x0 *c0; + + /* Read y[srcBLen - 3] sample */ + c0 = *(py - 2U); + + /* Read x[4] sample */ + x1 = *(px + 2U); + + /* Perform the multiply-accumulates */ + /* acc0 += x[2] * y[srcBLen - 3] */ + acc0 += (q63_t) x2 *c0; + /* acc1 += x[3] * y[srcBLen - 2] */ + acc1 += (q63_t) x0 *c0; + /* acc2 += x[4] * y[srcBLen - 2] */ + acc2 += (q63_t) x1 *c0; + + + px += 3U; + + py -= 3U; + + } while (--k); + + /* If the srcBLen is not a multiple of 3, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen - (3 * (srcBLen / 3)); + + while (k > 0U) + { + /* Read y[srcBLen - 5] sample */ + c0 = *(py--); + + /* Read x[7] sample */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[srcBLen - 5] */ + acc0 += (q63_t) x0 *c0; + /* acc1 += x[5] * y[srcBLen - 5] */ + acc1 += (q63_t) x1 *c0; + /* acc2 += x[6] * y[srcBLen - 5] */ + acc2 += (q63_t) x2 *c0; + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q31_t) (acc0 >> 31); + *pOut++ = (q31_t) (acc1 >> 31); + *pOut++ = (q31_t) (acc2 >> 31); + + /* Increment the pointer pIn1 index, count by 3 */ + count += 3U; + + /* Update the inputA and inputB pointers for next MAC calculation */ + if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) + { + px = pIn1 + firstIndex - srcBLen + 1 + count; + } + else + { + px = pIn1 + count; + } + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 - 3 * (blockSize2 / 3); + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += (q63_t) * px++ * (*py--); + sum += (q63_t) * px++ * (*py--); + sum += (q63_t) * px++ * (*py--); + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q31_t) (sum >> 31); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) + { + px = pIn1 + firstIndex - srcBLen + 1 + count; + } + else + { + px = pIn1 + count; + } + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = (uint32_t) blockSize2; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q31_t) (sum >> 31); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) + { + px = pIn1 + firstIndex - srcBLen + 1 + count; + } + else + { + px = pIn1 + count; + } + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The blockSize3 variable holds the number of MAC operations performed */ + count = srcBLen - 1U; + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + py = pSrc2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while (blockSize3 > 0) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + sum += (q63_t) * px++ * (*py--); + sum += (q63_t) * px++ * (*py--); + sum += (q63_t) * px++ * (*py--); + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q31_t) (sum >> 31); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + + } + + /* set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + } + + /* Return to application */ + return (status); + +#else + + /* Run the below code for Cortex-M0 */ + + q31_t *pIn1 = pSrcA; /* inputA pointer */ + q31_t *pIn2 = pSrcB; /* inputB pointer */ + q63_t sum; /* Accumulator */ + uint32_t i, j; /* loop counters */ + arm_status status; /* status of Partial convolution */ + + /* Check for range of output samples to be calculated */ + if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) + { + /* Set status as ARM_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + /* Loop to calculate convolution for output length number of values */ + for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++) + { + /* Initialize sum with zero to carry on MAC operations */ + sum = 0; + + /* Loop to perform MAC operations according to convolution equation */ + for (j = 0; j <= i; j++) + { + /* Check the array limitations */ + if (((i - j) < srcBLen) && (j < srcALen)) + { + /* z[i] += x[i-j] * y[j] */ + sum += ((q63_t) pIn1[j] * (pIn2[i - j])); + } + } + + /* Store the output in the destination buffer */ + pDst[i] = (q31_t) (sum >> 31U); + } + /* set status as ARM_SUCCESS as there are no argument errors */ + status = ARM_MATH_SUCCESS; + } + return (status); + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + +/** + * @} end of PartialConv group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q7.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q7.c new file mode 100644 index 0000000..cb4c562 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q7.c @@ -0,0 +1,750 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_conv_partial_q7.c + * Description: Partial convolution of Q7 sequences + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup PartialConv + * @{ + */ + +/** + * @brief Partial convolution of Q7 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + * + * \par + * Refer the function arm_conv_partial_opt_q7() for a faster implementation of this function. + * + */ + +arm_status arm_conv_partial_q7( + q7_t * pSrcA, + uint32_t srcALen, + q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst, + uint32_t firstIndex, + uint32_t numPoints) +{ + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q7_t *pIn1; /* inputA pointer */ + q7_t *pIn2; /* inputB pointer */ + q7_t *pOut = pDst; /* output pointer */ + q7_t *px; /* Intermediate inputA pointer */ + q7_t *py; /* Intermediate inputB pointer */ + q7_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ + q31_t input1, input2; + q15_t in1, in2; + q7_t x0, x1, x2, x3, c0, c1; + uint32_t j, k, count, check, blkCnt; + int32_t blockSize1, blockSize2, blockSize3; /* loop counter */ + arm_status status; + + + /* Check for range of output samples to be calculated */ + if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) + { + /* Set status as ARM_MATH_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* Conditions to check which loopCounter holds + * the first and last indices of the output samples to be calculated. */ + check = firstIndex + numPoints; + blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0; + blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3; + blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); + blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : + (int32_t) numPoints) : 0; + blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + + (int32_t) firstIndex); + blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* Set the output pointer to point to the firstIndex + * of the output sample to be calculated. */ + pOut = pDst + firstIndex; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed. + Since the partial convolution starts from from firstIndex + Number of Macs to be performed is firstIndex + 1 */ + count = 1U + firstIndex; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + firstIndex; + py = pSrc2; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first stage starts here */ + while (blockSize1 > 0) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* x[0] , x[1] */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* y[srcBLen - 1] , y[srcBLen - 2] */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* x[0] * y[srcBLen - 1] */ + /* x[1] * y[srcBLen - 2] */ + sum = __SMLAD(input1, input2, sum); + + /* x[2] , x[3] */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* y[srcBLen - 3] , y[srcBLen - 4] */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* x[2] * y[srcBLen - 3] */ + /* x[3] * y[srcBLen - 4] */ + sum = __SMLAD(input1, input2, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = ++pSrc2; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) + { + px = pIn1 + firstIndex - srcBLen + 1; + } + else + { + px = pIn1; + } + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + py = pSrc2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0U; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if (srcBLen >= 4U) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = ((uint32_t) blockSize2 >> 2U); + + while (blkCnt > 0U) + { + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* read x[0], x[1], x[2] samples */ + x0 = *(px++); + x1 = *(px++); + x2 = *(px++); + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read y[srcBLen - 1] sample */ + c0 = *(py--); + /* Read y[srcBLen - 2] sample */ + c1 = *(py--); + + /* Read x[3] sample */ + x3 = *(px++); + + /* x[0] and x[1] are packed */ + in1 = (q15_t) x0; + in2 = (q15_t) x1; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* y[srcBLen - 1] and y[srcBLen - 2] are packed */ + in1 = (q15_t) c0; + in2 = (q15_t) c1; + + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ + acc0 = __SMLAD(input1, input2, acc0); + + /* x[1] and x[2] are packed */ + in1 = (q15_t) x1; + in2 = (q15_t) x2; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ + acc1 = __SMLAD(input1, input2, acc1); + + /* x[2] and x[3] are packed */ + in1 = (q15_t) x2; + in2 = (q15_t) x3; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ + acc2 = __SMLAD(input1, input2, acc2); + + /* Read x[4] sample */ + x0 = *(px++); + + /* x[3] and x[4] are packed */ + in1 = (q15_t) x3; + in2 = (q15_t) x0; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ + acc3 = __SMLAD(input1, input2, acc3); + + /* Read y[srcBLen - 3] sample */ + c0 = *(py--); + /* Read y[srcBLen - 4] sample */ + c1 = *(py--); + + /* Read x[5] sample */ + x1 = *(px++); + + /* x[2] and x[3] are packed */ + in1 = (q15_t) x2; + in2 = (q15_t) x3; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* y[srcBLen - 3] and y[srcBLen - 4] are packed */ + in1 = (q15_t) c0; + in2 = (q15_t) c1; + + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ + acc0 = __SMLAD(input1, input2, acc0); + + /* x[3] and x[4] are packed */ + in1 = (q15_t) x3; + in2 = (q15_t) x0; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ + acc1 = __SMLAD(input1, input2, acc1); + + /* x[4] and x[5] are packed */ + in1 = (q15_t) x0; + in2 = (q15_t) x1; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ + acc2 = __SMLAD(input1, input2, acc2); + + /* Read x[6] sample */ + x2 = *(px++); + + /* x[5] and x[6] are packed */ + in1 = (q15_t) x1; + in2 = (q15_t) x2; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ + acc3 = __SMLAD(input1, input2, acc3); + + } while (--k); + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Read y[srcBLen - 5] sample */ + c0 = *(py--); + + /* Read x[7] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[srcBLen - 5] */ + acc0 += ((q31_t) x0 * c0); + /* acc1 += x[5] * y[srcBLen - 5] */ + acc1 += ((q31_t) x1 * c0); + /* acc2 += x[6] * y[srcBLen - 5] */ + acc2 += ((q31_t) x2 * c0); + /* acc3 += x[7] * y[srcBLen - 5] */ + acc3 += ((q31_t) x3 * c0); + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(acc0 >> 7, 8)); + *pOut++ = (q7_t) (__SSAT(acc1 >> 7, 8)); + *pOut++ = (q7_t) (__SSAT(acc2 >> 7, 8)); + *pOut++ = (q7_t) (__SSAT(acc3 >> 7, 8)); + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4U; + + /* Update the inputA and inputB pointers for next MAC calculation */ + if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) + { + px = pIn1 + firstIndex - srcBLen + 1 + count; + } + else + { + px = pIn1 + count; + } + py = pSrc2; + + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = (uint32_t) blockSize2 % 0x4U; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + + /* Reading two inputs of SrcA buffer and packing */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* Reading two inputs of SrcB buffer and packing */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* Perform the multiply-accumulates */ + sum = __SMLAD(input1, input2, sum); + + /* Reading two inputs of SrcA buffer and packing */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* Reading two inputs of SrcB buffer and packing */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* Perform the multiply-accumulates */ + sum = __SMLAD(input1, input2, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) + { + px = pIn1 + firstIndex - srcBLen + 1 + count; + } + else + { + px = pIn1 + count; + } + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = (uint32_t) blockSize2; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) + { + px = pIn1 + firstIndex - srcBLen + 1 + count; + } + else + { + px = pIn1 + count; + } + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1U; + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + py = pSrc2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while (blockSize3 > 0) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ + /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ + sum = __SMLAD(input1, input2, sum); + + /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ + /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ + sum = __SMLAD(input1, input2, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen-1] * y[srcBLen-1] */ + sum += ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + + } + + /* set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + } + + /* Return to application */ + return (status); + +#else + + /* Run the below code for Cortex-M0 */ + + q7_t *pIn1 = pSrcA; /* inputA pointer */ + q7_t *pIn2 = pSrcB; /* inputB pointer */ + q31_t sum; /* Accumulator */ + uint32_t i, j; /* loop counters */ + arm_status status; /* status of Partial convolution */ + + /* Check for range of output samples to be calculated */ + if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) + { + /* Set status as ARM_ARGUMENT_ERROR */ + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + /* Loop to calculate convolution for output length number of values */ + for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++) + { + /* Initialize sum with zero to carry on MAC operations */ + sum = 0; + + /* Loop to perform MAC operations according to convolution equation */ + for (j = 0; j <= i; j++) + { + /* Check the array limitations */ + if (((i - j) < srcBLen) && (j < srcALen)) + { + /* z[i] += x[i-j] * y[j] */ + sum += ((q15_t) pIn1[j] * (pIn2[i - j])); + } + } + + /* Store the output in the destination buffer */ + pDst[i] = (q7_t) __SSAT((sum >> 7U), 8U); + } + /* set status as ARM_SUCCESS as there are no argument errors */ + status = ARM_MATH_SUCCESS; + } + return (status); + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + +/** + * @} end of PartialConv group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q15.c new file mode 100644 index 0000000..c6721e0 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q15.c @@ -0,0 +1,722 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_conv_q15.c + * Description: Convolution of Q15 sequences + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Conv + * @{ + */ + +/** + * @brief Convolution of Q15 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * The function is implemented using a 64-bit internal accumulator. + * Both inputs are in 1.15 format and multiplications yield a 2.30 result. + * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. + * This approach provides 33 guard bits and there is no risk of overflow. + * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format. + * + * \par + * Refer to arm_conv_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. + * + * \par + * Refer the function arm_conv_opt_q15() for a faster implementation of this function using scratch buffers. + * + */ + +void arm_conv_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst) +{ + +#if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *pOut = pDst; /* output pointer */ + q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* The algorithm is implemented in three stages. + The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1U; + blockSize2 = srcALen - (srcBLen - 1U); + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1U; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* For loop unrolling by 4, this stage is divided into two. */ + /* First part of this stage computes the MAC operations less than 4 */ + /* Second part of this stage computes the MAC operations greater than or equal to 4 */ + + /* The first part of the stage starts here */ + while ((count < 4U) && (blockSize1 > 0U)) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Loop over number of MAC operations between + * inputA samples and inputB samples */ + k = count; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum = __SMLALD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pIn2 + count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* The second part of the stage starts here */ + /* The internal loop, over count, is unrolled by 4 */ + /* To, read the last two inputB samples using SIMD: + * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ + py = py - 1; + + while (blockSize1 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ + sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ + sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* For the next MAC operations, the pointer py is used without SIMD + * So, py is incremented by 1 */ + py = py + 1U; + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum = __SMLALD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pIn2 + (count - 1U); + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + py = pSrc2; + + /* count is the index by which the pointer pIn1 to be incremented */ + count = 0U; + + + /* -------------------- + * Stage2 process + * -------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if (srcBLen >= 4U) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2U; + + while (blkCnt > 0U) + { + py = py - 1U; + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + + /* read x[0], x[1] samples */ + x0 = *__SIMD32(px); + /* read x[1], x[2] samples */ + x1 = _SIMD32_OFFSET(px+1); + px+= 2U; + + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read the last two inputB samples using SIMD: + * y[srcBLen - 1] and y[srcBLen - 2] */ + c0 = *__SIMD32(py)--; + + /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ + acc0 = __SMLALDX(x0, c0, acc0); + + /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ + acc1 = __SMLALDX(x1, c0, acc1); + + /* Read x[2], x[3] */ + x2 = *__SIMD32(px); + + /* Read x[3], x[4] */ + x3 = _SIMD32_OFFSET(px+1); + + /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ + acc2 = __SMLALDX(x2, c0, acc2); + + /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ + acc3 = __SMLALDX(x3, c0, acc3); + + /* Read y[srcBLen - 3] and y[srcBLen - 4] */ + c0 = *__SIMD32(py)--; + + /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ + acc0 = __SMLALDX(x2, c0, acc0); + + /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ + acc1 = __SMLALDX(x3, c0, acc1); + + /* Read x[4], x[5] */ + x0 = _SIMD32_OFFSET(px+2); + + /* Read x[5], x[6] */ + x1 = _SIMD32_OFFSET(px+3); + px += 4U; + + /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ + acc2 = __SMLALDX(x0, c0, acc2); + + /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ + acc3 = __SMLALDX(x1, c0, acc3); + + } while (--k); + + /* For the next MAC operations, SIMD is not used + * So, the 16 bit pointer if inputB, py is updated */ + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + if (k == 1U) + { + /* Read y[srcBLen - 5] */ + c0 = *(py+1); + +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16U; + +#else + + c0 = c0 & 0x0000FFFF; + +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + /* Read x[7] */ + x3 = *__SIMD32(px); + px++; + + /* Perform the multiply-accumulates */ + acc0 = __SMLALD(x0, c0, acc0); + acc1 = __SMLALD(x1, c0, acc1); + acc2 = __SMLALDX(x1, c0, acc2); + acc3 = __SMLALDX(x3, c0, acc3); + } + + if (k == 2U) + { + /* Read y[srcBLen - 5], y[srcBLen - 6] */ + c0 = _SIMD32_OFFSET(py); + + /* Read x[7], x[8] */ + x3 = *__SIMD32(px); + + /* Read x[9] */ + x2 = _SIMD32_OFFSET(px+1); + px += 2U; + + /* Perform the multiply-accumulates */ + acc0 = __SMLALDX(x0, c0, acc0); + acc1 = __SMLALDX(x1, c0, acc1); + acc2 = __SMLALDX(x3, c0, acc2); + acc3 = __SMLALDX(x2, c0, acc3); + } + + if (k == 3U) + { + /* Read y[srcBLen - 5], y[srcBLen - 6] */ + c0 = _SIMD32_OFFSET(py); + + /* Read x[7], x[8] */ + x3 = *__SIMD32(px); + + /* Read x[9] */ + x2 = _SIMD32_OFFSET(px+1); + + /* Perform the multiply-accumulates */ + acc0 = __SMLALDX(x0, c0, acc0); + acc1 = __SMLALDX(x1, c0, acc1); + acc2 = __SMLALDX(x3, c0, acc2); + acc3 = __SMLALDX(x2, c0, acc3); + + c0 = *(py-1); + +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16U; +#else + + c0 = c0 & 0x0000FFFF; +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + /* Read x[10] */ + x3 = _SIMD32_OFFSET(px+2); + px += 3U; + + /* Perform the multiply-accumulates */ + acc0 = __SMLALDX(x1, c0, acc0); + acc1 = __SMLALD(x2, c0, acc1); + acc2 = __SMLALDX(x2, c0, acc2); + acc3 = __SMLALDX(x3, c0, acc3); + } + + + /* Store the results in the accumulators in the destination buffer. */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); + +#else + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4U; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4U; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += (q63_t) ((q31_t) * px++ * *py--); + sum += (q63_t) ((q31_t) * px++ * *py--); + sum += (q63_t) ((q31_t) * px++ * *py--); + sum += (q63_t) ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += (q63_t) ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The blockSize3 variable holds the number of MAC operations performed */ + + blockSize3 = srcBLen - 1U; + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + pIn2 = pSrc2 - 1U; + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + /* For loop unrolling by 4, this stage is divided into two. */ + /* First part of this stage computes the MAC operations greater than 4 */ + /* Second part of this stage computes the MAC operations less than or equal to 4 */ + + /* The first part of the stage starts here */ + j = blockSize3 >> 2U; + + while ((j > 0U) && (blockSize3 > 0U)) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = blockSize3 >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied + * with y[srcBLen - 1], y[srcBLen - 2] respectively */ + sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied + * with y[srcBLen - 3], y[srcBLen - 4] respectively */ + sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* For the next MAC operations, the pointer py is used without SIMD + * So, py is incremented by 1 */ + py = py + 1U; + + /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = blockSize3 % 0x4U; + + while (k > 0U) + { + /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ + sum = __SMLALD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the loop counter */ + blockSize3--; + + j--; + } + + /* The second part of the stage starts here */ + /* SIMD is not used for the next MAC operations, + * so pointer py is updated to read only one sample at a time */ + py = py + 1U; + + while (blockSize3 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = blockSize3; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen-1] * y[srcBLen-1] */ + sum = __SMLALD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the loop counter */ + blockSize3--; + } + +#else + +/* Run the below code for Cortex-M0 */ + + q15_t *pIn1 = pSrcA; /* input pointer */ + q15_t *pIn2 = pSrcB; /* coefficient pointer */ + q63_t sum; /* Accumulator */ + uint32_t i, j; /* loop counter */ + + /* Loop to calculate output of convolution for output length number of times */ + for (i = 0; i < (srcALen + srcBLen - 1); i++) + { + /* Initialize sum with zero to carry on MAC operations */ + sum = 0; + + /* Loop to perform MAC operations according to convolution equation */ + for (j = 0; j <= i; j++) + { + /* Check the array limitations */ + if (((i - j) < srcBLen) && (j < srcALen)) + { + /* z[i] += x[i-j] * y[j] */ + sum += (q31_t) pIn1[j] * (pIn2[i - j]); + } + } + + /* Store the output in the destination buffer */ + pDst[i] = (q15_t) __SSAT((sum >> 15U), 16U); + } + +#endif /* #if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */ + +} + +/** + * @} end of Conv group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q31.c new file mode 100644 index 0000000..14e5f86 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q31.c @@ -0,0 +1,553 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_conv_q31.c + * Description: Convolution of Q31 sequences + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Conv + * @{ + */ + +/** + * @brief Convolution of Q31 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * The function is implemented using an internal 64-bit accumulator. + * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * There is no saturation on intermediate additions. + * Thus, if the accumulator overflows it wraps around and distorts the result. + * The input signals should be scaled down to avoid intermediate overflows. + * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows, + * as maximum of min(srcALen, srcBLen) number of additions are carried internally. + * The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result. + * + * \par + * See arm_conv_fast_q31() for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4. + */ + +void arm_conv_q31( + q31_t * pSrcA, + uint32_t srcALen, + q31_t * pSrcB, + uint32_t srcBLen, + q31_t * pDst) +{ + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q31_t *pIn1; /* inputA pointer */ + q31_t *pIn2; /* inputB pointer */ + q31_t *pOut = pDst; /* output pointer */ + q31_t *px; /* Intermediate inputA pointer */ + q31_t *py; /* Intermediate inputB pointer */ + q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q63_t sum; /* Accumulator */ + q63_t acc0, acc1, acc2; /* Accumulator */ + q31_t x0, x1, x2, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = (q31_t *) pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = (q31_t *) pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* The algorithm is implemented in three stages. + The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1U; + blockSize2 = srcALen - (srcBLen - 1U); + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1U; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first stage starts here */ + while (blockSize1 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* x[0] * y[srcBLen - 1] */ + sum += (q63_t) * px++ * (*py--); + /* x[1] * y[srcBLen - 2] */ + sum += (q63_t) * px++ * (*py--); + /* x[2] * y[srcBLen - 3] */ + sum += (q63_t) * px++ * (*py--); + /* x[3] * y[srcBLen - 4] */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q31_t) (sum >> 31); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pIn2 + count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + py = pSrc2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0U; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if (srcBLen >= 4U) + { + /* Loop unroll by 3 */ + blkCnt = blockSize2 / 3; + + while (blkCnt > 0U) + { + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + + /* read x[0], x[1], x[2] samples */ + x0 = *(px++); + x1 = *(px++); + + /* Apply loop unrolling and compute 3 MACs simultaneously. */ + k = srcBLen / 3; + + /* First part of the processing with loop unrolling. Compute 3 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 2 samples. */ + do + { + /* Read y[srcBLen - 1] sample */ + c0 = *(py); + + /* Read x[3] sample */ + x2 = *(px); + + /* Perform the multiply-accumulates */ + /* acc0 += x[0] * y[srcBLen - 1] */ + acc0 += ((q63_t) x0 * c0); + /* acc1 += x[1] * y[srcBLen - 1] */ + acc1 += ((q63_t) x1 * c0); + /* acc2 += x[2] * y[srcBLen - 1] */ + acc2 += ((q63_t) x2 * c0); + + /* Read y[srcBLen - 2] sample */ + c0 = *(py - 1U); + + /* Read x[4] sample */ + x0 = *(px + 1U); + + /* Perform the multiply-accumulate */ + /* acc0 += x[1] * y[srcBLen - 2] */ + acc0 += ((q63_t) x1 * c0); + /* acc1 += x[2] * y[srcBLen - 2] */ + acc1 += ((q63_t) x2 * c0); + /* acc2 += x[3] * y[srcBLen - 2] */ + acc2 += ((q63_t) x0 * c0); + + /* Read y[srcBLen - 3] sample */ + c0 = *(py - 2U); + + /* Read x[5] sample */ + x1 = *(px + 2U); + + /* Perform the multiply-accumulates */ + /* acc0 += x[2] * y[srcBLen - 3] */ + acc0 += ((q63_t) x2 * c0); + /* acc1 += x[3] * y[srcBLen - 2] */ + acc1 += ((q63_t) x0 * c0); + /* acc2 += x[4] * y[srcBLen - 2] */ + acc2 += ((q63_t) x1 * c0); + + /* update scratch pointers */ + px += 3U; + py -= 3U; + + } while (--k); + + /* If the srcBLen is not a multiple of 3, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen - (3 * (srcBLen / 3)); + + while (k > 0U) + { + /* Read y[srcBLen - 5] sample */ + c0 = *(py--); + + /* Read x[7] sample */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[srcBLen - 5] */ + acc0 += ((q63_t) x0 * c0); + /* acc1 += x[5] * y[srcBLen - 5] */ + acc1 += ((q63_t) x1 * c0); + /* acc2 += x[6] * y[srcBLen - 5] */ + acc2 += ((q63_t) x2 * c0); + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + + /* Decrement the loop counter */ + k--; + } + + /* Store the results in the accumulators in the destination buffer. */ + *pOut++ = (q31_t) (acc0 >> 31); + *pOut++ = (q31_t) (acc1 >> 31); + *pOut++ = (q31_t) (acc2 >> 31); + + /* Increment the pointer pIn1 index, count by 3 */ + count += 3U; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 - 3 * (blockSize2 / 3); + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += (q63_t) * px++ * (*py--); + sum += (q63_t) * px++ * (*py--); + sum += (q63_t) * px++ * (*py--); + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q31_t) (sum >> 31); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q31_t) (sum >> 31); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The blockSize3 variable holds the number of MAC operations performed */ + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + py = pSrc2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while (blockSize3 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = blockSize3 >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ + sum += (q63_t) * px++ * (*py--); + /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ + sum += (q63_t) * px++ * (*py--); + /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ + sum += (q63_t) * px++ * (*py--); + /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = blockSize3 % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q31_t) (sum >> 31); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the loop counter */ + blockSize3--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + q31_t *pIn1 = pSrcA; /* input pointer */ + q31_t *pIn2 = pSrcB; /* coefficient pointer */ + q63_t sum; /* Accumulator */ + uint32_t i, j; /* loop counter */ + + /* Loop to calculate output of convolution for output length number of times */ + for (i = 0; i < (srcALen + srcBLen - 1); i++) + { + /* Initialize sum with zero to carry on MAC operations */ + sum = 0; + + /* Loop to perform MAC operations according to convolution equation */ + for (j = 0; j <= i; j++) + { + /* Check the array limitations */ + if (((i - j) < srcBLen) && (j < srcALen)) + { + /* z[i] += x[i-j] * y[j] */ + sum += ((q63_t) pIn1[j] * (pIn2[i - j])); + } + } + + /* Store the output in the destination buffer */ + pDst[i] = (q31_t) (sum >> 31U); + } + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + +/** + * @} end of Conv group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q7.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q7.c new file mode 100644 index 0000000..6c4dd3c --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q7.c @@ -0,0 +1,678 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_conv_q7.c + * Description: Convolution of Q7 sequences + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Conv + * @{ + */ + +/** + * @brief Convolution of Q7 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * The function is implemented using a 32-bit internal accumulator. + * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result. + * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. + * This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072. + * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format. + * + * \par + * Refer the function arm_conv_opt_q7() for a faster implementation of this function. + * + */ + +void arm_conv_q7( + q7_t * pSrcA, + uint32_t srcALen, + q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst) +{ + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q7_t *pIn1; /* inputA pointer */ + q7_t *pIn2; /* inputB pointer */ + q7_t *pOut = pDst; /* output pointer */ + q7_t *px; /* Intermediate inputA pointer */ + q7_t *py; /* Intermediate inputB pointer */ + q7_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q7_t x0, x1, x2, x3, c0, c1; /* Temporary variables to hold state and coefficient values */ + q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ + q31_t input1, input2; /* Temporary input variables */ + q15_t in1, in2; /* Temporary input variables */ + uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* The algorithm is implemented in three stages. + The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1U; + blockSize2 = (srcALen - srcBLen) + 1U; + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1U; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first stage starts here */ + while (blockSize1 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* x[0] , x[1] */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); + + /* y[srcBLen - 1] , y[srcBLen - 2] */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); + + /* x[0] * y[srcBLen - 1] */ + /* x[1] * y[srcBLen - 2] */ + sum = __SMLAD(input1, input2, sum); + + /* x[2] , x[3] */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); + + /* y[srcBLen - 3] , y[srcBLen - 4] */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); + + /* x[2] * y[srcBLen - 3] */ + /* x[3] * y[srcBLen - 4] */ + sum = __SMLAD(input1, input2, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q15_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pIn2 + count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + py = pSrc2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0U; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if (srcBLen >= 4U) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2U; + + while (blkCnt > 0U) + { + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* read x[0], x[1], x[2] samples */ + x0 = *(px++); + x1 = *(px++); + x2 = *(px++); + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read y[srcBLen - 1] sample */ + c0 = *(py--); + /* Read y[srcBLen - 2] sample */ + c1 = *(py--); + + /* Read x[3] sample */ + x3 = *(px++); + + /* x[0] and x[1] are packed */ + in1 = (q15_t) x0; + in2 = (q15_t) x1; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); + + /* y[srcBLen - 1] and y[srcBLen - 2] are packed */ + in1 = (q15_t) c0; + in2 = (q15_t) c1; + + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); + + /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ + acc0 = __SMLAD(input1, input2, acc0); + + /* x[1] and x[2] are packed */ + in1 = (q15_t) x1; + in2 = (q15_t) x2; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); + + /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ + acc1 = __SMLAD(input1, input2, acc1); + + /* x[2] and x[3] are packed */ + in1 = (q15_t) x2; + in2 = (q15_t) x3; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); + + /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ + acc2 = __SMLAD(input1, input2, acc2); + + /* Read x[4] sample */ + x0 = *(px++); + + /* x[3] and x[4] are packed */ + in1 = (q15_t) x3; + in2 = (q15_t) x0; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); + + /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ + acc3 = __SMLAD(input1, input2, acc3); + + /* Read y[srcBLen - 3] sample */ + c0 = *(py--); + /* Read y[srcBLen - 4] sample */ + c1 = *(py--); + + /* Read x[5] sample */ + x1 = *(px++); + + /* x[2] and x[3] are packed */ + in1 = (q15_t) x2; + in2 = (q15_t) x3; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); + + /* y[srcBLen - 3] and y[srcBLen - 4] are packed */ + in1 = (q15_t) c0; + in2 = (q15_t) c1; + + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); + + /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ + acc0 = __SMLAD(input1, input2, acc0); + + /* x[3] and x[4] are packed */ + in1 = (q15_t) x3; + in2 = (q15_t) x0; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); + + /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ + acc1 = __SMLAD(input1, input2, acc1); + + /* x[4] and x[5] are packed */ + in1 = (q15_t) x0; + in2 = (q15_t) x1; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); + + /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ + acc2 = __SMLAD(input1, input2, acc2); + + /* Read x[6] sample */ + x2 = *(px++); + + /* x[5] and x[6] are packed */ + in1 = (q15_t) x1; + in2 = (q15_t) x2; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); + + /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ + acc3 = __SMLAD(input1, input2, acc3); + + } while (--k); + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Read y[srcBLen - 5] sample */ + c0 = *(py--); + + /* Read x[7] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[srcBLen - 5] */ + acc0 += ((q15_t) x0 * c0); + /* acc1 += x[5] * y[srcBLen - 5] */ + acc1 += ((q15_t) x1 * c0); + /* acc2 += x[6] * y[srcBLen - 5] */ + acc2 += ((q15_t) x2 * c0); + /* acc3 += x[7] * y[srcBLen - 5] */ + acc3 += ((q15_t) x3 * c0); + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + k--; + } + + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8)); + *pOut++ = (q7_t) (__SSAT(acc1 >> 7U, 8)); + *pOut++ = (q7_t) (__SSAT(acc2 >> 7U, 8)); + *pOut++ = (q7_t) (__SSAT(acc3 >> 7U, 8)); + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4U; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4U; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + + /* Reading two inputs of SrcA buffer and packing */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); + + /* Reading two inputs of SrcB buffer and packing */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); + + /* Perform the multiply-accumulates */ + sum = __SMLAD(input1, input2, sum); + + /* Reading two inputs of SrcA buffer and packing */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); + + /* Reading two inputs of SrcB buffer and packing */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); + + /* Perform the multiply-accumulates */ + sum = __SMLAD(input1, input2, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q15_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8)); + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += ((q15_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8)); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The blockSize3 variable holds the number of MAC operations performed */ + + /* Working pointer of inputA */ + pSrc1 = pIn1 + (srcALen - (srcBLen - 1U)); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1U); + py = pSrc2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while (blockSize3 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = blockSize3 >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); + + /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); + + /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ + /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ + sum = __SMLAD(input1, input2, sum); + + /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); + + /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); + + /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ + /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ + sum = __SMLAD(input1, input2, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = blockSize3 % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q15_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the loop counter */ + blockSize3--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + q7_t *pIn1 = pSrcA; /* input pointer */ + q7_t *pIn2 = pSrcB; /* coefficient pointer */ + q31_t sum; /* Accumulator */ + uint32_t i, j; /* loop counter */ + + /* Loop to calculate output of convolution for output length number of times */ + for (i = 0; i < (srcALen + srcBLen - 1); i++) + { + /* Initialize sum with zero to carry on MAC operations */ + sum = 0; + + /* Loop to perform MAC operations according to convolution equation */ + for (j = 0; j <= i; j++) + { + /* Check the array limitations */ + if (((i - j) < srcBLen) && (j < srcALen)) + { + /* z[i] += x[i-j] * y[j] */ + sum += (q15_t) pIn1[j] * (pIn2[i - j]); + } + } + + /* Store the output in the destination buffer */ + pDst[i] = (q7_t) __SSAT((sum >> 7U), 8U); + } + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + +/** + * @} end of Conv group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_f32.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_f32.c new file mode 100644 index 0000000..9451887 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_f32.c @@ -0,0 +1,727 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_correlate_f32.c + * Description: Correlation of floating-point sequences + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup Corr Correlation + * + * Correlation is a mathematical operation that is similar to convolution. + * As with convolution, correlation uses two signals to produce a third signal. + * The underlying algorithms in correlation and convolution are identical except that one of the inputs is flipped in convolution. + * Correlation is commonly used to measure the similarity between two signals. + * It has applications in pattern recognition, cryptanalysis, and searching. + * The CMSIS library provides correlation functions for Q7, Q15, Q31 and floating-point data types. + * Fast versions of the Q15 and Q31 functions are also provided. + * + * \par Algorithm + * Let a[n] and b[n] be sequences of length srcALen and srcBLen samples respectively. + * The convolution of the two signals is denoted by + *
+ *                   c[n] = a[n] * b[n]
+ * 
+ * In correlation, one of the signals is flipped in time + *
+ *                   c[n] = a[n] * b[-n]
+ * 
+ * + * \par + * and this is mathematically defined as + * \image html CorrelateEquation.gif + * \par + * The pSrcA points to the first input vector of length srcALen and pSrcB points to the second input vector of length srcBLen. + * The result c[n] is of length 2 * max(srcALen, srcBLen) - 1 and is defined over the interval n=0, 1, 2, ..., (2 * max(srcALen, srcBLen) - 2). + * The output result is written to pDst and the calling function must allocate 2 * max(srcALen, srcBLen) - 1 words for the result. + * + * Note + * \par + * The pDst should be initialized to all zeros before being used. + * + * Fixed-Point Behavior + * \par + * Correlation requires summing up a large number of intermediate products. + * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation. + * Refer to the function specific documentation below for further details of the particular algorithm used. + * + * + * Fast Versions + * + * \par + * Fast versions are supported for Q31 and Q15. Cycles for Fast versions are less compared to Q31 and Q15 of correlate and the design requires + * the input signals should be scaled down to avoid intermediate overflows. + * + * + * Opt Versions + * + * \par + * Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation. + * These versions are optimised in cycles and consumes more memory(Scratch memory) compared to Q15 and Q7 versions of correlate + */ + +/** + * @addtogroup Corr + * @{ + */ +/** + * @brief Correlation of floating-point sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. + * @return none. + */ + +void arm_correlate_f32( + float32_t * pSrcA, + uint32_t srcALen, + float32_t * pSrcB, + uint32_t srcBLen, + float32_t * pDst) +{ + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + float32_t *pIn1; /* inputA pointer */ + float32_t *pIn2; /* inputB pointer */ + float32_t *pOut = pDst; /* output pointer */ + float32_t *px; /* Intermediate inputA pointer */ + float32_t *py; /* Intermediate inputB pointer */ + float32_t *pSrc1; /* Intermediate pointers */ + float32_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ + float32_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */ + uint32_t j, k = 0U, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counters */ + int32_t inc = 1; /* Destination address modifier */ + + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and the destination pointer modifier, inc is set to -1 */ + /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ + /* But to improve the performance, + * we assume zeroes in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, + * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ + /* If srcALen < srcBLen, + * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + + /* Number of output samples is calculated */ + outBlockSize = (2U * srcALen) - 1U; + + /* When srcALen > srcBLen, zero padding has to be done to srcB + * to make their lengths equal. + * Instead, (outBlockSize - (srcALen + srcBLen - 1)) + * number of output samples are made zero */ + j = outBlockSize - (srcALen + (srcBLen - 1U)); + + /* Updating the pointer position to non zero value */ + pOut += j; + + //while (j > 0U) + //{ + // /* Zero is stored in the destination buffer */ + // *pOut++ = 0.0f; + + // /* Decrement the loop counter */ + // j--; + //} + + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + + /* CORR(x, y) = Reverse order(CORR(y, x)) */ + /* Hence set the destination pointer to point to the last output sample */ + pOut = pDst + ((srcALen + srcBLen) - 2U); + + /* Destination address modifier is set to -1 */ + inc = -1; + + } + + /* The function is internally + * divided into three parts according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first part of the + * algorithm, the multiplications increase by one for every iteration. + * In the second part of the algorithm, srcBLen number of multiplications are done. + * In the third part of the algorithm, the multiplications decrease by one + * for every iteration.*/ + /* The algorithm is implemented in three stages. + * The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1U; + blockSize2 = srcALen - (srcBLen - 1U); + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[srcBlen - 1] + * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1] + * .... + * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1U; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc1 = pIn2 + (srcBLen - 1U); + py = pSrc1; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first stage starts here */ + while (blockSize1 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* x[0] * y[srcBLen - 4] */ + sum += *px++ * *py++; + /* x[1] * y[srcBLen - 3] */ + sum += *px++ * *py++; + /* x[2] * y[srcBLen - 2] */ + sum += *px++ * *py++; + /* x[3] * y[srcBLen - 1] */ + sum += *px++ * *py++; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + /* x[0] * y[srcBLen - 1] */ + sum += *px++ * *py++; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = sum; + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pSrc1 - count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] + * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] + * .... + * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0U; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */ + if (srcBLen >= 4U) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2U; + + while (blkCnt > 0U) + { + /* Set all accumulators to zero */ + acc0 = 0.0f; + acc1 = 0.0f; + acc2 = 0.0f; + acc3 = 0.0f; + + /* read x[0], x[1], x[2] samples */ + x0 = *(px++); + x1 = *(px++); + x2 = *(px++); + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read y[0] sample */ + c0 = *(py++); + + /* Read x[3] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulate */ + /* acc0 += x[0] * y[0] */ + acc0 += x0 * c0; + /* acc1 += x[1] * y[0] */ + acc1 += x1 * c0; + /* acc2 += x[2] * y[0] */ + acc2 += x2 * c0; + /* acc3 += x[3] * y[0] */ + acc3 += x3 * c0; + + /* Read y[1] sample */ + c0 = *(py++); + + /* Read x[4] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + /* acc0 += x[1] * y[1] */ + acc0 += x1 * c0; + /* acc1 += x[2] * y[1] */ + acc1 += x2 * c0; + /* acc2 += x[3] * y[1] */ + acc2 += x3 * c0; + /* acc3 += x[4] * y[1] */ + acc3 += x0 * c0; + + /* Read y[2] sample */ + c0 = *(py++); + + /* Read x[5] sample */ + x1 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[2] * y[2] */ + acc0 += x2 * c0; + /* acc1 += x[3] * y[2] */ + acc1 += x3 * c0; + /* acc2 += x[4] * y[2] */ + acc2 += x0 * c0; + /* acc3 += x[5] * y[2] */ + acc3 += x1 * c0; + + /* Read y[3] sample */ + c0 = *(py++); + + /* Read x[6] sample */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[3] * y[3] */ + acc0 += x3 * c0; + /* acc1 += x[4] * y[3] */ + acc1 += x0 * c0; + /* acc2 += x[5] * y[3] */ + acc2 += x1 * c0; + /* acc3 += x[6] * y[3] */ + acc3 += x2 * c0; + + + } while (--k); + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Read y[4] sample */ + c0 = *(py++); + + /* Read x[7] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[4] */ + acc0 += x0 * c0; + /* acc1 += x[5] * y[4] */ + acc1 += x1 * c0; + /* acc2 += x[6] * y[4] */ + acc2 += x2 * c0; + /* acc3 += x[7] * y[4] */ + acc3 += x3 * c0; + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = acc0; + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + *pOut = acc1; + pOut += inc; + + *pOut = acc2; + pOut += inc; + + *pOut = acc3; + pOut += inc; + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4U; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4U; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += *px++ * *py++; + sum += *px++ * *py++; + sum += *px++ * *py++; + sum += *px++ * *py++; + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += *px++ * *py++; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = sum; + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* Loop over srcBLen */ + k = srcBLen; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += *px++ * *py++; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = sum; + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * .... + * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] + * sum += x[srcALen-1] * y[0] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1U; + + /* Working pointer of inputA */ + pSrc1 = pIn1 + (srcALen - (srcBLen - 1U)); + px = pSrc1; + + /* Working pointer of inputB */ + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while (blockSize3 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen - srcBLen + 4] * y[3] */ + sum += *px++ * *py++; + /* sum += x[srcALen - srcBLen + 3] * y[2] */ + sum += *px++ * *py++; + /* sum += x[srcALen - srcBLen + 2] * y[1] */ + sum += *px++ * *py++; + /* sum += x[srcALen - srcBLen + 1] * y[0] */ + sum += *px++ * *py++; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += *px++ * *py++; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = sum; + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + float32_t *pIn1 = pSrcA; /* inputA pointer */ + float32_t *pIn2 = pSrcB + (srcBLen - 1U); /* inputB pointer */ + float32_t sum; /* Accumulator */ + uint32_t i = 0U, j; /* loop counters */ + uint32_t inv = 0U; /* Reverse order flag */ + uint32_t tot = 0U; /* Length */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and a varaible, inv is set to 1 */ + /* If lengths are not equal then zero pad has to be done to make the two + * inputs of same length. But to improve the performance, we assume zeroes + * in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the + * starting of the output buffer */ + /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the + * ending of the output buffer */ + /* Once the zero padding is done the remaining of the output is calcualted + * using convolution but with the shorter signal time shifted. */ + + /* Calculate the length of the remaining sequence */ + tot = ((srcALen + srcBLen) - 2U); + + if (srcALen > srcBLen) + { + /* Calculating the number of zeros to be padded to the output */ + j = srcALen - srcBLen; + + /* Initialise the pointer after zero padding */ + pDst += j; + } + + else if (srcALen < srcBLen) + { + /* Initialization to inputB pointer */ + pIn1 = pSrcB; + + /* Initialization to the end of inputA pointer */ + pIn2 = pSrcA + (srcALen - 1U); + + /* Initialisation of the pointer after zero padding */ + pDst = pDst + tot; + + /* Swapping the lengths */ + j = srcALen; + srcALen = srcBLen; + srcBLen = j; + + /* Setting the reverse flag */ + inv = 1; + + } + + /* Loop to calculate convolution for output length number of times */ + for (i = 0U; i <= tot; i++) + { + /* Initialize sum with zero to carry on MAC operations */ + sum = 0.0f; + + /* Loop to perform MAC operations according to convolution equation */ + for (j = 0U; j <= i; j++) + { + /* Check the array limitations */ + if ((((i - j) < srcBLen) && (j < srcALen))) + { + /* z[i] += x[i-j] * y[j] */ + sum += pIn1[j] * pIn2[-((int32_t) i - j)]; + } + } + /* Store the output in the destination buffer */ + if (inv == 1) + *pDst-- = sum; + else + *pDst++ = sum; + } + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + +/** + * @} end of Corr group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_opt_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_opt_q15.c new file mode 100644 index 0000000..baebc49 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_opt_q15.c @@ -0,0 +1,500 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_correlate_fast_opt_q15.c + * Description: Fast Q15 Correlation + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Corr + * @{ + */ + +/** + * @brief Correlation of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. + * @param[in] *pScratch points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @return none. + * + * + * \par Restrictions + * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE + * In this case input, output, scratch buffers should be aligned by 32-bit + * + * + * Scaling and Overflow Behavior: + * + * \par + * This fast version uses a 32-bit accumulator with 2.30 format. + * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * There is no saturation on intermediate additions. + * Thus, if the accumulator overflows it wraps around and distorts the result. + * The input signals should be scaled down to avoid intermediate overflows. + * Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow since a + * maximum of min(srcALen, srcBLen) number of additions is carried internally. + * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result. + * + * \par + * See arm_correlate_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion. + */ + +void arm_correlate_fast_opt_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + q15_t * pScratch) +{ + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q31_t acc0, acc1, acc2, acc3; /* Accumulators */ + q15_t *py; /* Intermediate inputB pointer */ + q31_t x1, x2, x3; /* temporary variables for holding input and coefficient values */ + uint32_t j, blkCnt, outBlockSize; /* loop counter */ + int32_t inc = 1; /* Destination address modifier */ + uint32_t tapCnt; + q31_t y1, y2; + q15_t *pScr; /* Intermediate pointers */ + q15_t *pOut = pDst; /* output pointer */ +#ifdef UNALIGNED_SUPPORT_DISABLE + + q15_t a, b; + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and the destination pointer modifier, inc is set to -1 */ + /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ + /* But to improve the performance, + * we include zeroes in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, + * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ + /* If srcALen < srcBLen, + * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcA); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcB); + + /* Number of output samples is calculated */ + outBlockSize = (2U * srcALen) - 1U; + + /* When srcALen > srcBLen, zero padding is done to srcB + * to make their lengths equal. + * Instead, (outBlockSize - (srcALen + srcBLen - 1)) + * number of output samples are made zero */ + j = outBlockSize - (srcALen + (srcBLen - 1U)); + + /* Updating the pointer position to non zero value */ + pOut += j; + + } + else + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcB); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcA); + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + + /* CORR(x, y) = Reverse order(CORR(y, x)) */ + /* Hence set the destination pointer to point to the last output sample */ + pOut = pDst + ((srcALen + srcBLen) - 2U); + + /* Destination address modifier is set to -1 */ + inc = -1; + + } + + pScr = pScratch; + + /* Fill (srcBLen - 1U) zeros in scratch buffer */ + arm_fill_q15(0, pScr, (srcBLen - 1U)); + + /* Update temporary scratch pointer */ + pScr += (srcBLen - 1U); + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Copy (srcALen) samples in scratch buffer */ + arm_copy_q15(pIn1, pScr, srcALen); + + /* Update pointers */ + pScr += srcALen; + +#else + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + j = srcALen >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while (j > 0U) + { + /* copy second buffer in reversal manner */ + *pScr++ = *pIn1++; + *pScr++ = *pIn1++; + *pScr++ = *pIn1++; + *pScr++ = *pIn1++; + + /* Decrement the loop counter */ + j--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + j = srcALen % 0x4U; + + while (j > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr++ = *pIn1++; + + /* Decrement the loop counter */ + j--; + } + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Fill (srcBLen - 1U) zeros at end of scratch buffer */ + arm_fill_q15(0, pScr, (srcBLen - 1U)); + + /* Update pointer */ + pScr += (srcBLen - 1U); + +#else + +/* Apply loop unrolling and do 4 Copies simultaneously. */ + j = (srcBLen - 1U) >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while (j > 0U) + { + /* copy second buffer in reversal manner */ + *pScr++ = 0; + *pScr++ = 0; + *pScr++ = 0; + *pScr++ = 0; + + /* Decrement the loop counter */ + j--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + j = (srcBLen - 1U) % 0x4U; + + while (j > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr++ = 0; + + /* Decrement the loop counter */ + j--; + } + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + /* Temporary pointer for scratch2 */ + py = pIn2; + + + /* Actual correlation process starts here */ + blkCnt = (srcALen + srcBLen - 1U) >> 2; + + while (blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr = pScratch; + + /* Clear Accumlators */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Read four samples from scratch1 buffer */ + x1 = *__SIMD32(pScr)++; + + /* Read next four samples from scratch1 buffer */ + x2 = *__SIMD32(pScr)++; + + tapCnt = (srcBLen) >> 2U; + + while (tapCnt > 0U) + { + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Read four samples from smaller buffer */ + y1 = _SIMD32_OFFSET(pIn2); + y2 = _SIMD32_OFFSET(pIn2 + 2U); + + acc0 = __SMLAD(x1, y1, acc0); + + acc2 = __SMLAD(x2, y1, acc2); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc1 = __SMLADX(x3, y1, acc1); + + x1 = _SIMD32_OFFSET(pScr); + + acc0 = __SMLAD(x2, y2, acc0); + + acc2 = __SMLAD(x1, y2, acc2); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x1, x2, 0); +#else + x3 = __PKHBT(x2, x1, 0); +#endif + + acc3 = __SMLADX(x3, y1, acc3); + + acc1 = __SMLADX(x3, y2, acc1); + + x2 = _SIMD32_OFFSET(pScr + 2U); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc3 = __SMLADX(x3, y2, acc3); +#else + + /* Read four samples from smaller buffer */ + a = *pIn2; + b = *(pIn2 + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + y1 = __PKHBT(a, b, 16); +#else + y1 = __PKHBT(b, a, 16); +#endif + + a = *(pIn2 + 2); + b = *(pIn2 + 3); +#ifndef ARM_MATH_BIG_ENDIAN + y2 = __PKHBT(a, b, 16); +#else + y2 = __PKHBT(b, a, 16); +#endif + + acc0 = __SMLAD(x1, y1, acc0); + + acc2 = __SMLAD(x2, y1, acc2); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc1 = __SMLADX(x3, y1, acc1); + + a = *pScr; + b = *(pScr + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(a, b, 16); +#else + x1 = __PKHBT(b, a, 16); +#endif + + acc0 = __SMLAD(x2, y2, acc0); + + acc2 = __SMLAD(x1, y2, acc2); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x1, x2, 0); +#else + x3 = __PKHBT(x2, x1, 0); +#endif + + acc3 = __SMLADX(x3, y1, acc3); + + acc1 = __SMLADX(x3, y2, acc1); + + a = *(pScr + 2); + b = *(pScr + 3); + +#ifndef ARM_MATH_BIG_ENDIAN + x2 = __PKHBT(a, b, 16); +#else + x2 = __PKHBT(b, a, 16); +#endif + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc3 = __SMLADX(x3, y2, acc3); + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + pIn2 += 4U; + + pScr += 4U; + + + /* Decrement the loop counter */ + tapCnt--; + } + + + + /* Update scratch pointer for remaining samples of smaller length sequence */ + pScr -= 4U; + + + /* apply same above for remaining samples of smaller length sequence */ + tapCnt = (srcBLen) & 3U; + + while (tapCnt > 0U) + { + + /* accumlate the results */ + acc0 += (*pScr++ * *pIn2); + acc1 += (*pScr++ * *pIn2); + acc2 += (*pScr++ * *pIn2); + acc3 += (*pScr++ * *pIn2++); + + pScr -= 3U; + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + + /* Store the results in the accumulators in the destination buffer. */ + *pOut = (__SSAT(acc0 >> 15U, 16)); + pOut += inc; + *pOut = (__SSAT(acc1 >> 15U, 16)); + pOut += inc; + *pOut = (__SSAT(acc2 >> 15U, 16)); + pOut += inc; + *pOut = (__SSAT(acc3 >> 15U, 16)); + pOut += inc; + + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch += 4U; + + } + + + blkCnt = (srcALen + srcBLen - 1U) & 0x3; + + /* Calculate correlation for remaining samples of Bigger length sequence */ + while (blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr = pScratch; + + /* Clear Accumlators */ + acc0 = 0; + + tapCnt = (srcBLen) >> 1U; + + while (tapCnt > 0U) + { + + acc0 += (*pScr++ * *pIn2++); + acc0 += (*pScr++ * *pIn2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (srcBLen) & 1U; + + /* apply same above for remaining samples of smaller length sequence */ + while (tapCnt > 0U) + { + + /* accumlate the results */ + acc0 += (*pScr++ * *pIn2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* Store the result in the accumulator in the destination buffer. */ + + *pOut = (q15_t) (__SSAT((acc0 >> 15), 16)); + + pOut += inc; + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch += 1U; + + } +} + +/** + * @} end of Corr group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_q15.c new file mode 100644 index 0000000..7b676d0 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_q15.c @@ -0,0 +1,1307 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_correlate_fast_q15.c + * Description: Fast Q15 Correlation + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Corr + * @{ + */ + +/** + * @brief Correlation of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. + * @return none. + * + * Scaling and Overflow Behavior: + * + * \par + * This fast version uses a 32-bit accumulator with 2.30 format. + * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * There is no saturation on intermediate additions. + * Thus, if the accumulator overflows it wraps around and distorts the result. + * The input signals should be scaled down to avoid intermediate overflows. + * Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow since a + * maximum of min(srcALen, srcBLen) number of additions is carried internally. + * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result. + * + * \par + * See arm_correlate_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion. + */ + +void arm_correlate_fast_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst) +{ +#ifndef UNALIGNED_SUPPORT_DISABLE + + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *pOut = pDst; /* output pointer */ + q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + q15_t *pSrc1; /* Intermediate pointers */ + q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */ + uint32_t j, k = 0U, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */ + int32_t inc = 1; /* Destination address modifier */ + + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and the destination pointer modifier, inc is set to -1 */ + /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ + /* But to improve the performance, + * we include zeroes in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, + * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ + /* If srcALen < srcBLen, + * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcA); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcB); + + /* Number of output samples is calculated */ + outBlockSize = (2U * srcALen) - 1U; + + /* When srcALen > srcBLen, zero padding is done to srcB + * to make their lengths equal. + * Instead, (outBlockSize - (srcALen + srcBLen - 1)) + * number of output samples are made zero */ + j = outBlockSize - (srcALen + (srcBLen - 1U)); + + /* Updating the pointer position to non zero value */ + pOut += j; + + } + else + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcB); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcA); + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + + /* CORR(x, y) = Reverse order(CORR(y, x)) */ + /* Hence set the destination pointer to point to the last output sample */ + pOut = pDst + ((srcALen + srcBLen) - 2U); + + /* Destination address modifier is set to -1 */ + inc = -1; + + } + + /* The function is internally + * divided into three parts according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first part of the + * algorithm, the multiplications increase by one for every iteration. + * In the second part of the algorithm, srcBLen number of multiplications are done. + * In the third part of the algorithm, the multiplications decrease by one + * for every iteration.*/ + /* The algorithm is implemented in three stages. + * The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1U; + blockSize2 = srcALen - (srcBLen - 1U); + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[srcBlen - 1] + * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1] + * .... + * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1U; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc1 = pIn2 + (srcBLen - 1U); + py = pSrc1; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first loop starts here */ + while (blockSize1 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */ + sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum); + /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */ + sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + /* x[0] * y[srcBLen - 1] */ + sum = __SMLAD(*px++, *py++, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (sum >> 15); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pSrc1 - count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] + * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] + * .... + * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0U; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */ + if (srcBLen >= 4U) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2U; + + while (blkCnt > 0U) + { + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* read x[0], x[1] samples */ + x0 = *__SIMD32(px); + /* read x[1], x[2] samples */ + x1 = _SIMD32_OFFSET(px + 1); + px += 2U; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read the first two inputB samples using SIMD: + * y[0] and y[1] */ + c0 = *__SIMD32(py)++; + + /* acc0 += x[0] * y[0] + x[1] * y[1] */ + acc0 = __SMLAD(x0, c0, acc0); + + /* acc1 += x[1] * y[0] + x[2] * y[1] */ + acc1 = __SMLAD(x1, c0, acc1); + + /* Read x[2], x[3] */ + x2 = *__SIMD32(px); + + /* Read x[3], x[4] */ + x3 = _SIMD32_OFFSET(px + 1); + + /* acc2 += x[2] * y[0] + x[3] * y[1] */ + acc2 = __SMLAD(x2, c0, acc2); + + /* acc3 += x[3] * y[0] + x[4] * y[1] */ + acc3 = __SMLAD(x3, c0, acc3); + + /* Read y[2] and y[3] */ + c0 = *__SIMD32(py)++; + + /* acc0 += x[2] * y[2] + x[3] * y[3] */ + acc0 = __SMLAD(x2, c0, acc0); + + /* acc1 += x[3] * y[2] + x[4] * y[3] */ + acc1 = __SMLAD(x3, c0, acc1); + + /* Read x[4], x[5] */ + x0 = _SIMD32_OFFSET(px + 2); + + /* Read x[5], x[6] */ + x1 = _SIMD32_OFFSET(px + 3); + px += 4U; + + /* acc2 += x[4] * y[2] + x[5] * y[3] */ + acc2 = __SMLAD(x0, c0, acc2); + + /* acc3 += x[5] * y[2] + x[6] * y[3] */ + acc3 = __SMLAD(x1, c0, acc3); + + } while (--k); + + /* For the next MAC operations, SIMD is not used + * So, the 16 bit pointer if inputB, py is updated */ + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + if (k == 1U) + { + /* Read y[4] */ + c0 = *py; +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16U; + +#else + + c0 = c0 & 0x0000FFFF; + +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[7] */ + x3 = *__SIMD32(px); + px++; + + /* Perform the multiply-accumulates */ + acc0 = __SMLAD(x0, c0, acc0); + acc1 = __SMLAD(x1, c0, acc1); + acc2 = __SMLADX(x1, c0, acc2); + acc3 = __SMLADX(x3, c0, acc3); + } + + if (k == 2U) + { + /* Read y[4], y[5] */ + c0 = *__SIMD32(py); + + /* Read x[7], x[8] */ + x3 = *__SIMD32(px); + + /* Read x[9] */ + x2 = _SIMD32_OFFSET(px + 1); + px += 2U; + + /* Perform the multiply-accumulates */ + acc0 = __SMLAD(x0, c0, acc0); + acc1 = __SMLAD(x1, c0, acc1); + acc2 = __SMLAD(x3, c0, acc2); + acc3 = __SMLAD(x2, c0, acc3); + } + + if (k == 3U) + { + /* Read y[4], y[5] */ + c0 = *__SIMD32(py)++; + + /* Read x[7], x[8] */ + x3 = *__SIMD32(px); + + /* Read x[9] */ + x2 = _SIMD32_OFFSET(px + 1); + + /* Perform the multiply-accumulates */ + acc0 = __SMLAD(x0, c0, acc0); + acc1 = __SMLAD(x1, c0, acc1); + acc2 = __SMLAD(x3, c0, acc2); + acc3 = __SMLAD(x2, c0, acc3); + + c0 = (*py); + /* Read y[6] */ +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16U; +#else + + c0 = c0 & 0x0000FFFF; +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[10] */ + x3 = _SIMD32_OFFSET(px + 2); + px += 3U; + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x1, c0, acc0); + acc1 = __SMLAD(x2, c0, acc1); + acc2 = __SMLADX(x2, c0, acc2); + acc3 = __SMLADX(x3, c0, acc3); + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (acc0 >> 15); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + *pOut = (q15_t) (acc1 >> 15); + pOut += inc; + + *pOut = (q15_t) (acc2 >> 15); + pOut += inc; + + *pOut = (q15_t) (acc3 >> 15); + pOut += inc; + + /* Increment the pointer pIn1 index, count by 1 */ + count += 4U; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4U; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py++); + sum += ((q31_t) * px++ * *py++); + sum += ((q31_t) * px++ * *py++); + sum += ((q31_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (sum >> 15); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Loop over srcBLen */ + k = srcBLen; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += ((q31_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (sum >> 15); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * .... + * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] + * sum += x[srcALen-1] * y[0] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1U; + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U); + px = pSrc1; + + /* Working pointer of inputB */ + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while (blockSize3 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */ + sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum); + /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */ + sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum = __SMLAD(*px++, *py++, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (sum >> 15); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + } + +#else + + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *pOut = pDst; /* output pointer */ + q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + q15_t *pSrc1; /* Intermediate pointers */ + q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */ + uint32_t j, k = 0U, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */ + int32_t inc = 1; /* Destination address modifier */ + q15_t a, b; + + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and the destination pointer modifier, inc is set to -1 */ + /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ + /* But to improve the performance, + * we include zeroes in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, + * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ + /* If srcALen < srcBLen, + * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcA); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcB); + + /* Number of output samples is calculated */ + outBlockSize = (2U * srcALen) - 1U; + + /* When srcALen > srcBLen, zero padding is done to srcB + * to make their lengths equal. + * Instead, (outBlockSize - (srcALen + srcBLen - 1)) + * number of output samples are made zero */ + j = outBlockSize - (srcALen + (srcBLen - 1U)); + + /* Updating the pointer position to non zero value */ + pOut += j; + + } + else + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcB); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcA); + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + + /* CORR(x, y) = Reverse order(CORR(y, x)) */ + /* Hence set the destination pointer to point to the last output sample */ + pOut = pDst + ((srcALen + srcBLen) - 2U); + + /* Destination address modifier is set to -1 */ + inc = -1; + + } + + /* The function is internally + * divided into three parts according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first part of the + * algorithm, the multiplications increase by one for every iteration. + * In the second part of the algorithm, srcBLen number of multiplications are done. + * In the third part of the algorithm, the multiplications decrease by one + * for every iteration.*/ + /* The algorithm is implemented in three stages. + * The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1U; + blockSize2 = srcALen - (srcBLen - 1U); + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[srcBlen - 1] + * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1] + * .... + * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1U; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc1 = pIn2 + (srcBLen - 1U); + py = pSrc1; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first loop starts here */ + while (blockSize1 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */ + sum += ((q31_t) * px++ * *py++); + sum += ((q31_t) * px++ * *py++); + sum += ((q31_t) * px++ * *py++); + sum += ((q31_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + /* x[0] * y[srcBLen - 1] */ + sum += ((q31_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (sum >> 15); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pSrc1 - count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] + * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] + * .... + * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0U; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */ + if (srcBLen >= 4U) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2U; + + while (blkCnt > 0U) + { + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* read x[0], x[1], x[2] samples */ + a = *px; + b = *(px + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + + x0 = __PKHBT(a, b, 16); + a = *(px + 2); + x1 = __PKHBT(b, a, 16); + +#else + + x0 = __PKHBT(b, a, 16); + a = *(px + 2); + x1 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + px += 2U; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read the first two inputB samples using SIMD: + * y[0] and y[1] */ + a = *py; + b = *(py + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + + c0 = __PKHBT(a, b, 16); + +#else + + c0 = __PKHBT(b, a, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* acc0 += x[0] * y[0] + x[1] * y[1] */ + acc0 = __SMLAD(x0, c0, acc0); + + /* acc1 += x[1] * y[0] + x[2] * y[1] */ + acc1 = __SMLAD(x1, c0, acc1); + + /* Read x[2], x[3], x[4] */ + a = *px; + b = *(px + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + + x2 = __PKHBT(a, b, 16); + a = *(px + 2); + x3 = __PKHBT(b, a, 16); + +#else + + x2 = __PKHBT(b, a, 16); + a = *(px + 2); + x3 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* acc2 += x[2] * y[0] + x[3] * y[1] */ + acc2 = __SMLAD(x2, c0, acc2); + + /* acc3 += x[3] * y[0] + x[4] * y[1] */ + acc3 = __SMLAD(x3, c0, acc3); + + /* Read y[2] and y[3] */ + a = *(py + 2); + b = *(py + 3); + + py += 4U; + +#ifndef ARM_MATH_BIG_ENDIAN + + c0 = __PKHBT(a, b, 16); + +#else + + c0 = __PKHBT(b, a, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* acc0 += x[2] * y[2] + x[3] * y[3] */ + acc0 = __SMLAD(x2, c0, acc0); + + /* acc1 += x[3] * y[2] + x[4] * y[3] */ + acc1 = __SMLAD(x3, c0, acc1); + + /* Read x[4], x[5], x[6] */ + a = *(px + 2); + b = *(px + 3); + +#ifndef ARM_MATH_BIG_ENDIAN + + x0 = __PKHBT(a, b, 16); + a = *(px + 4); + x1 = __PKHBT(b, a, 16); + +#else + + x0 = __PKHBT(b, a, 16); + a = *(px + 4); + x1 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + px += 4U; + + /* acc2 += x[4] * y[2] + x[5] * y[3] */ + acc2 = __SMLAD(x0, c0, acc2); + + /* acc3 += x[5] * y[2] + x[6] * y[3] */ + acc3 = __SMLAD(x1, c0, acc3); + + } while (--k); + + /* For the next MAC operations, SIMD is not used + * So, the 16 bit pointer if inputB, py is updated */ + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + if (k == 1U) + { + /* Read y[4] */ + c0 = *py; +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16U; + +#else + + c0 = c0 & 0x0000FFFF; + +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[7] */ + a = *px; + b = *(px + 1); + + px++;; + +#ifndef ARM_MATH_BIG_ENDIAN + + x3 = __PKHBT(a, b, 16); + +#else + + x3 = __PKHBT(b, a, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + px++; + + /* Perform the multiply-accumulates */ + acc0 = __SMLAD(x0, c0, acc0); + acc1 = __SMLAD(x1, c0, acc1); + acc2 = __SMLADX(x1, c0, acc2); + acc3 = __SMLADX(x3, c0, acc3); + } + + if (k == 2U) + { + /* Read y[4], y[5] */ + a = *py; + b = *(py + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + + c0 = __PKHBT(a, b, 16); + +#else + + c0 = __PKHBT(b, a, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Read x[7], x[8], x[9] */ + a = *px; + b = *(px + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + + x3 = __PKHBT(a, b, 16); + a = *(px + 2); + x2 = __PKHBT(b, a, 16); + +#else + + x3 = __PKHBT(b, a, 16); + a = *(px + 2); + x2 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + px += 2U; + + /* Perform the multiply-accumulates */ + acc0 = __SMLAD(x0, c0, acc0); + acc1 = __SMLAD(x1, c0, acc1); + acc2 = __SMLAD(x3, c0, acc2); + acc3 = __SMLAD(x2, c0, acc3); + } + + if (k == 3U) + { + /* Read y[4], y[5] */ + a = *py; + b = *(py + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + + c0 = __PKHBT(a, b, 16); + +#else + + c0 = __PKHBT(b, a, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + py += 2U; + + /* Read x[7], x[8], x[9] */ + a = *px; + b = *(px + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + + x3 = __PKHBT(a, b, 16); + a = *(px + 2); + x2 = __PKHBT(b, a, 16); + +#else + + x3 = __PKHBT(b, a, 16); + a = *(px + 2); + x2 = __PKHBT(a, b, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Perform the multiply-accumulates */ + acc0 = __SMLAD(x0, c0, acc0); + acc1 = __SMLAD(x1, c0, acc1); + acc2 = __SMLAD(x3, c0, acc2); + acc3 = __SMLAD(x2, c0, acc3); + + c0 = (*py); + /* Read y[6] */ +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16U; +#else + + c0 = c0 & 0x0000FFFF; +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + /* Read x[10] */ + b = *(px + 3); + +#ifndef ARM_MATH_BIG_ENDIAN + + x3 = __PKHBT(a, b, 16); + +#else + + x3 = __PKHBT(b, a, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + px += 3U; + + /* Perform the multiply-accumulates */ + acc0 = __SMLADX(x1, c0, acc0); + acc1 = __SMLAD(x2, c0, acc1); + acc2 = __SMLADX(x2, c0, acc2); + acc3 = __SMLADX(x3, c0, acc3); + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (acc0 >> 15); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + *pOut = (q15_t) (acc1 >> 15); + pOut += inc; + + *pOut = (q15_t) (acc2 >> 15); + pOut += inc; + + *pOut = (q15_t) (acc3 >> 15); + pOut += inc; + + /* Increment the pointer pIn1 index, count by 1 */ + count += 4U; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4U; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py++); + sum += ((q31_t) * px++ * *py++); + sum += ((q31_t) * px++ * *py++); + sum += ((q31_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (sum >> 15); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Loop over srcBLen */ + k = srcBLen; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += ((q31_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (sum >> 15); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * .... + * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] + * sum += x[srcALen-1] * y[0] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1U; + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U); + px = pSrc1; + + /* Working pointer of inputB */ + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while (blockSize3 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py++); + sum += ((q31_t) * px++ * *py++); + sum += ((q31_t) * px++ * *py++); + sum += ((q31_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q31_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (sum >> 15); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + } + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + +} + +/** + * @} end of Corr group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_q31.c new file mode 100644 index 0000000..53373ac --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_q31.c @@ -0,0 +1,600 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_correlate_fast_q31.c + * Description: Fast Q31 Correlation + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Corr + * @{ + */ + +/** + * @brief Correlation of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * This function is optimized for speed at the expense of fixed-point precision and overflow protection. + * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format. + * These intermediate results are accumulated in a 32-bit register in 2.30 format. + * Finally, the accumulator is saturated and converted to a 1.31 result. + * + * \par + * The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result. + * In order to avoid overflows completely the input signals must be scaled down. + * The input signals should be scaled down to avoid intermediate overflows. + * Scale down one of the inputs by 1/min(srcALen, srcBLen)to avoid overflows since a + * maximum of min(srcALen, srcBLen) number of additions is carried internally. + * + * \par + * See arm_correlate_q31() for a slower implementation of this function which uses 64-bit accumulation to provide higher precision. + */ + +void arm_correlate_fast_q31( + q31_t * pSrcA, + uint32_t srcALen, + q31_t * pSrcB, + uint32_t srcBLen, + q31_t * pDst) +{ + q31_t *pIn1; /* inputA pointer */ + q31_t *pIn2; /* inputB pointer */ + q31_t *pOut = pDst; /* output pointer */ + q31_t *px; /* Intermediate inputA pointer */ + q31_t *py; /* Intermediate inputB pointer */ + q31_t *pSrc1; /* Intermediate pointers */ + q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ + q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */ + uint32_t j, k = 0U, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */ + int32_t inc = 1; /* Destination address modifier */ + + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcA); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcB); + + /* Number of output samples is calculated */ + outBlockSize = (2U * srcALen) - 1U; + + /* When srcALen > srcBLen, zero padding is done to srcB + * to make their lengths equal. + * Instead, (outBlockSize - (srcALen + srcBLen - 1)) + * number of output samples are made zero */ + j = outBlockSize - (srcALen + (srcBLen - 1U)); + + /* Updating the pointer position to non zero value */ + pOut += j; + + } + else + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcB); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcA); + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + + /* CORR(x, y) = Reverse order(CORR(y, x)) */ + /* Hence set the destination pointer to point to the last output sample */ + pOut = pDst + ((srcALen + srcBLen) - 2U); + + /* Destination address modifier is set to -1 */ + inc = -1; + + } + + /* The function is internally + * divided into three parts according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first part of the + * algorithm, the multiplications increase by one for every iteration. + * In the second part of the algorithm, srcBLen number of multiplications are done. + * In the third part of the algorithm, the multiplications decrease by one + * for every iteration.*/ + /* The algorithm is implemented in three stages. + * The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1U; + blockSize2 = srcALen - (srcBLen - 1U); + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[srcBlen - 1] + * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1] + * .... + * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1U; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc1 = pIn2 + (srcBLen - 1U); + py = pSrc1; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first stage starts here */ + while (blockSize1 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* x[0] * y[srcBLen - 4] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + /* x[1] * y[srcBLen - 3] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + /* x[2] * y[srcBLen - 2] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + /* x[3] * y[srcBLen - 1] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + /* x[0] * y[srcBLen - 1] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = sum << 1; + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pSrc1 - count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] + * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] + * .... + * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0U; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if (srcBLen >= 4U) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2U; + + while (blkCnt > 0U) + { + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* read x[0], x[1], x[2] samples */ + x0 = *(px++); + x1 = *(px++); + x2 = *(px++); + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read y[0] sample */ + c0 = *(py++); + + /* Read x[3] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulate */ + /* acc0 += x[0] * y[0] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); + /* acc1 += x[1] * y[0] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); + /* acc2 += x[2] * y[0] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); + /* acc3 += x[3] * y[0] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); + + /* Read y[1] sample */ + c0 = *(py++); + + /* Read x[4] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[1] * y[1] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32); + /* acc1 += x[2] * y[1] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32); + /* acc2 += x[3] * y[1] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32); + /* acc3 += x[4] * y[1] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32); + + /* Read y[2] sample */ + c0 = *(py++); + + /* Read x[5] sample */ + x1 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[2] * y[2] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32); + /* acc1 += x[3] * y[2] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32); + /* acc2 += x[4] * y[2] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32); + /* acc3 += x[5] * y[2] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32); + + /* Read y[3] sample */ + c0 = *(py++); + + /* Read x[6] sample */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[3] * y[3] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32); + /* acc1 += x[4] * y[3] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32); + /* acc2 += x[5] * y[3] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32); + /* acc3 += x[6] * y[3] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32); + + + } while (--k); + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Read y[4] sample */ + c0 = *(py++); + + /* Read x[7] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[4] */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); + /* acc1 += x[5] * y[4] */ + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); + /* acc2 += x[6] * y[4] */ + acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); + /* acc3 += x[7] * y[4] */ + acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q31_t) (acc0 << 1); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + *pOut = (q31_t) (acc1 << 1); + pOut += inc; + + *pOut = (q31_t) (acc2 << 1); + pOut += inc; + + *pOut = (q31_t) (acc3 << 1); + pOut += inc; + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4U; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4U; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = sum << 1; + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Loop over srcBLen */ + k = srcBLen; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = sum << 1; + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * .... + * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] + * sum += x[srcALen-1] * y[0] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1U; + + /* Working pointer of inputA */ + pSrc1 = ((pIn1 + srcALen) - srcBLen) + 1U; + px = pSrc1; + + /* Working pointer of inputB */ + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while (blockSize3 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen - srcBLen + 4] * y[3] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + /* sum += x[srcALen - srcBLen + 3] * y[2] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + /* sum += x[srcALen - srcBLen + 2] * y[1] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + /* sum += x[srcALen - srcBLen + 1] * y[0] */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum = (q31_t) ((((q63_t) sum << 32) + + ((q63_t) * px++ * (*py++))) >> 32); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = sum << 1; + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + } + +} + +/** + * @} end of Corr group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_opt_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_opt_q15.c new file mode 100644 index 0000000..c021b05 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_opt_q15.c @@ -0,0 +1,501 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_correlate_opt_q15.c + * Description: Correlation of Q15 sequences + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Corr + * @{ + */ + +/** + * @brief Correlation of Q15 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. + * @param[in] *pScratch points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @return none. + * + * \par Restrictions + * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE + * In this case input, output, scratch buffers should be aligned by 32-bit + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * The function is implemented using a 64-bit internal accumulator. + * Both inputs are in 1.15 format and multiplications yield a 2.30 result. + * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. + * This approach provides 33 guard bits and there is no risk of overflow. + * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format. + * + * \par + * Refer to arm_correlate_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. + * + * + */ + + +void arm_correlate_opt_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + q15_t * pScratch) +{ + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q63_t acc0, acc1, acc2, acc3; /* Accumulators */ + q15_t *py; /* Intermediate inputB pointer */ + q31_t x1, x2, x3; /* temporary variables for holding input1 and input2 values */ + uint32_t j, blkCnt, outBlockSize; /* loop counter */ + int32_t inc = 1; /* output pointer increment */ + uint32_t tapCnt; + q31_t y1, y2; + q15_t *pScr; /* Intermediate pointers */ + q15_t *pOut = pDst; /* output pointer */ +#ifdef UNALIGNED_SUPPORT_DISABLE + + q15_t a, b; + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and the destination pointer modifier, inc is set to -1 */ + /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ + /* But to improve the performance, + * we include zeroes in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, + * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ + /* If srcALen < srcBLen, + * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcA); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcB); + + /* Number of output samples is calculated */ + outBlockSize = (2U * srcALen) - 1U; + + /* When srcALen > srcBLen, zero padding is done to srcB + * to make their lengths equal. + * Instead, (outBlockSize - (srcALen + srcBLen - 1)) + * number of output samples are made zero */ + j = outBlockSize - (srcALen + (srcBLen - 1U)); + + /* Updating the pointer position to non zero value */ + pOut += j; + + } + else + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcB); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcA); + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + + /* CORR(x, y) = Reverse order(CORR(y, x)) */ + /* Hence set the destination pointer to point to the last output sample */ + pOut = pDst + ((srcALen + srcBLen) - 2U); + + /* Destination address modifier is set to -1 */ + inc = -1; + + } + + pScr = pScratch; + + /* Fill (srcBLen - 1U) zeros in scratch buffer */ + arm_fill_q15(0, pScr, (srcBLen - 1U)); + + /* Update temporary scratch pointer */ + pScr += (srcBLen - 1U); + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Copy (srcALen) samples in scratch buffer */ + arm_copy_q15(pIn1, pScr, srcALen); + + /* Update pointers */ + //pIn1 += srcALen; + pScr += srcALen; + +#else + + /* Apply loop unrolling and do 4 Copies simultaneously. */ + j = srcALen >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while (j > 0U) + { + /* copy second buffer in reversal manner */ + *pScr++ = *pIn1++; + *pScr++ = *pIn1++; + *pScr++ = *pIn1++; + *pScr++ = *pIn1++; + + /* Decrement the loop counter */ + j--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + j = srcALen % 0x4U; + + while (j > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr++ = *pIn1++; + + /* Decrement the loop counter */ + j--; + } + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Fill (srcBLen - 1U) zeros at end of scratch buffer */ + arm_fill_q15(0, pScr, (srcBLen - 1U)); + + /* Update pointer */ + pScr += (srcBLen - 1U); + +#else + +/* Apply loop unrolling and do 4 Copies simultaneously. */ + j = (srcBLen - 1U) >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while (j > 0U) + { + /* copy second buffer in reversal manner */ + *pScr++ = 0; + *pScr++ = 0; + *pScr++ = 0; + *pScr++ = 0; + + /* Decrement the loop counter */ + j--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + j = (srcBLen - 1U) % 0x4U; + + while (j > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr++ = 0; + + /* Decrement the loop counter */ + j--; + } + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + /* Temporary pointer for scratch2 */ + py = pIn2; + + + /* Actual correlation process starts here */ + blkCnt = (srcALen + srcBLen - 1U) >> 2; + + while (blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr = pScratch; + + /* Clear Accumlators */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Read four samples from scratch1 buffer */ + x1 = *__SIMD32(pScr)++; + + /* Read next four samples from scratch1 buffer */ + x2 = *__SIMD32(pScr)++; + + tapCnt = (srcBLen) >> 2U; + + while (tapCnt > 0U) + { + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Read four samples from smaller buffer */ + y1 = _SIMD32_OFFSET(pIn2); + y2 = _SIMD32_OFFSET(pIn2 + 2U); + + acc0 = __SMLALD(x1, y1, acc0); + + acc2 = __SMLALD(x2, y1, acc2); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc1 = __SMLALDX(x3, y1, acc1); + + x1 = _SIMD32_OFFSET(pScr); + + acc0 = __SMLALD(x2, y2, acc0); + + acc2 = __SMLALD(x1, y2, acc2); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x1, x2, 0); +#else + x3 = __PKHBT(x2, x1, 0); +#endif + + acc3 = __SMLALDX(x3, y1, acc3); + + acc1 = __SMLALDX(x3, y2, acc1); + + x2 = _SIMD32_OFFSET(pScr + 2U); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc3 = __SMLALDX(x3, y2, acc3); + +#else + + /* Read four samples from smaller buffer */ + a = *pIn2; + b = *(pIn2 + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + y1 = __PKHBT(a, b, 16); +#else + y1 = __PKHBT(b, a, 16); +#endif + + a = *(pIn2 + 2); + b = *(pIn2 + 3); +#ifndef ARM_MATH_BIG_ENDIAN + y2 = __PKHBT(a, b, 16); +#else + y2 = __PKHBT(b, a, 16); +#endif + + acc0 = __SMLALD(x1, y1, acc0); + + acc2 = __SMLALD(x2, y1, acc2); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc1 = __SMLALDX(x3, y1, acc1); + + a = *pScr; + b = *(pScr + 1); + +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(a, b, 16); +#else + x1 = __PKHBT(b, a, 16); +#endif + + acc0 = __SMLALD(x2, y2, acc0); + + acc2 = __SMLALD(x1, y2, acc2); + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x1, x2, 0); +#else + x3 = __PKHBT(x2, x1, 0); +#endif + + acc3 = __SMLALDX(x3, y1, acc3); + + acc1 = __SMLALDX(x3, y2, acc1); + + a = *(pScr + 2); + b = *(pScr + 3); + +#ifndef ARM_MATH_BIG_ENDIAN + x2 = __PKHBT(a, b, 16); +#else + x2 = __PKHBT(b, a, 16); +#endif + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc3 = __SMLALDX(x3, y2, acc3); + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + pIn2 += 4U; + + pScr += 4U; + + + /* Decrement the loop counter */ + tapCnt--; + } + + + + /* Update scratch pointer for remaining samples of smaller length sequence */ + pScr -= 4U; + + + /* apply same above for remaining samples of smaller length sequence */ + tapCnt = (srcBLen) & 3U; + + while (tapCnt > 0U) + { + + /* accumlate the results */ + acc0 += (*pScr++ * *pIn2); + acc1 += (*pScr++ * *pIn2); + acc2 += (*pScr++ * *pIn2); + acc3 += (*pScr++ * *pIn2++); + + pScr -= 3U; + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + + /* Store the results in the accumulators in the destination buffer. */ + *pOut = (__SSAT(acc0 >> 15U, 16)); + pOut += inc; + *pOut = (__SSAT(acc1 >> 15U, 16)); + pOut += inc; + *pOut = (__SSAT(acc2 >> 15U, 16)); + pOut += inc; + *pOut = (__SSAT(acc3 >> 15U, 16)); + pOut += inc; + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch += 4U; + + } + + + blkCnt = (srcALen + srcBLen - 1U) & 0x3; + + /* Calculate correlation for remaining samples of Bigger length sequence */ + while (blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr = pScratch; + + /* Clear Accumlators */ + acc0 = 0; + + tapCnt = (srcBLen) >> 1U; + + while (tapCnt > 0U) + { + + acc0 += (*pScr++ * *pIn2++); + acc0 += (*pScr++ * *pIn2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (srcBLen) & 1U; + + /* apply same above for remaining samples of smaller length sequence */ + while (tapCnt > 0U) + { + + /* accumlate the results */ + acc0 += (*pScr++ * *pIn2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (__SSAT((acc0 >> 15), 16)); + + pOut += inc; + + /* Initialization of inputB pointer */ + pIn2 = py; + + pScratch += 1U; + + } + + +} + +/** + * @} end of Corr group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_opt_q7.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_opt_q7.c new file mode 100644 index 0000000..dbffd5d --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_opt_q7.c @@ -0,0 +1,452 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_correlate_opt_q7.c + * Description: Correlation of Q7 sequences + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Corr + * @{ + */ + +/** + * @brief Correlation of Q7 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. + * @param[in] *pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen). + * @return none. + * + * + * \par Restrictions + * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE + * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * The function is implemented using a 32-bit internal accumulator. + * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result. + * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. + * This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072. + * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and saturated to 1.7 format. + * + * + */ + + + +void arm_correlate_opt_q7( + q7_t * pSrcA, + uint32_t srcALen, + q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst, + q15_t * pScratch1, + q15_t * pScratch2) +{ + q7_t *pOut = pDst; /* output pointer */ + q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch */ + q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch */ + q7_t *pIn1; /* inputA pointer */ + q7_t *pIn2; /* inputB pointer */ + q15_t *py; /* Intermediate inputB pointer */ + q31_t acc0, acc1, acc2, acc3; /* Accumulators */ + uint32_t j, k = 0U, blkCnt; /* loop counter */ + int32_t inc = 1; /* output pointer increment */ + uint32_t outBlockSize; /* loop counter */ + q15_t x4; /* Temporary input variable */ + uint32_t tapCnt; /* loop counter */ + q31_t x1, x2, x3, y1; /* Temporary input variables */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and the destination pointer modifier, inc is set to -1 */ + /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ + /* But to improve the performance, + * we include zeroes in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, + * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ + /* If srcALen < srcBLen, + * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcA); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcB); + + /* Number of output samples is calculated */ + outBlockSize = (2U * srcALen) - 1U; + + /* When srcALen > srcBLen, zero padding is done to srcB + * to make their lengths equal. + * Instead, (outBlockSize - (srcALen + srcBLen - 1)) + * number of output samples are made zero */ + j = outBlockSize - (srcALen + (srcBLen - 1U)); + + /* Updating the pointer position to non zero value */ + pOut += j; + + } + else + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcB); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcA); + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + + /* CORR(x, y) = Reverse order(CORR(y, x)) */ + /* Hence set the destination pointer to point to the last output sample */ + pOut = pDst + ((srcALen + srcBLen) - 2U); + + /* Destination address modifier is set to -1 */ + inc = -1; + + } + + + /* Copy (srcBLen) samples in scratch buffer */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* copy second buffer in reversal manner */ + x4 = (q15_t) * pIn2++; + *pScr2++ = x4; + x4 = (q15_t) * pIn2++; + *pScr2++ = x4; + x4 = (q15_t) * pIn2++; + *pScr2++ = x4; + x4 = (q15_t) * pIn2++; + *pScr2++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + x4 = (q15_t) * pIn2++; + *pScr2++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* Fill (srcBLen - 1U) zeros in scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1U)); + + /* Update temporary scratch pointer */ + pScr1 += (srcBLen - 1U); + + /* Copy (srcALen) samples in scratch buffer */ + k = srcALen >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* copy second buffer in reversal manner */ + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = srcALen % 0x4U; + + while (k > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + x4 = (q15_t) * pIn1++; + *pScr1++ = x4; + + /* Decrement the loop counter */ + k--; + } + +#ifndef UNALIGNED_SUPPORT_DISABLE + + /* Fill (srcBLen - 1U) zeros at end of scratch buffer */ + arm_fill_q15(0, pScr1, (srcBLen - 1U)); + + /* Update pointer */ + pScr1 += (srcBLen - 1U); + +#else + +/* Apply loop unrolling and do 4 Copies simultaneously. */ + k = (srcBLen - 1U) >> 2U; + + /* First part of the processing with loop unrolling copies 4 data points at a time. + ** a second loop below copies for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* copy second buffer in reversal manner */ + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, copy remaining samples here. + ** No loop unrolling is used. */ + k = (srcBLen - 1U) % 0x4U; + + while (k > 0U) + { + /* copy second buffer in reversal manner for remaining samples */ + *pScr1++ = 0; + + /* Decrement the loop counter */ + k--; + } + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + /* Temporary pointer for second sequence */ + py = pScratch2; + + /* Initialization of pScr2 pointer */ + pScr2 = pScratch2; + + /* Actual correlation process starts here */ + blkCnt = (srcALen + srcBLen - 1U) >> 2; + + while (blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Read two samples from scratch1 buffer */ + x1 = *__SIMD32(pScr1)++; + + /* Read next two samples from scratch1 buffer */ + x2 = *__SIMD32(pScr1)++; + + tapCnt = (srcBLen) >> 2U; + + while (tapCnt > 0U) + { + + /* Read four samples from smaller buffer */ + y1 = _SIMD32_OFFSET(pScr2); + + /* multiply and accumlate */ + acc0 = __SMLAD(x1, y1, acc0); + acc2 = __SMLAD(x2, y1, acc2); + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + /* multiply and accumlate */ + acc1 = __SMLADX(x3, y1, acc1); + + /* Read next two samples from scratch1 buffer */ + x1 = *__SIMD32(pScr1)++; + + /* pack input data */ +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x1, x2, 0); +#else + x3 = __PKHBT(x2, x1, 0); +#endif + + acc3 = __SMLADX(x3, y1, acc3); + + /* Read four samples from smaller buffer */ + y1 = _SIMD32_OFFSET(pScr2 + 2U); + + acc0 = __SMLAD(x2, y1, acc0); + + acc2 = __SMLAD(x1, y1, acc2); + + acc1 = __SMLADX(x3, y1, acc1); + + x2 = *__SIMD32(pScr1)++; + +#ifndef ARM_MATH_BIG_ENDIAN + x3 = __PKHBT(x2, x1, 0); +#else + x3 = __PKHBT(x1, x2, 0); +#endif + + acc3 = __SMLADX(x3, y1, acc3); + + pScr2 += 4U; + + + /* Decrement the loop counter */ + tapCnt--; + } + + + + /* Update scratch pointer for remaining samples of smaller length sequence */ + pScr1 -= 4U; + + + /* apply same above for remaining samples of smaller length sequence */ + tapCnt = (srcBLen) & 3U; + + while (tapCnt > 0U) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pScr2); + acc1 += (*pScr1++ * *pScr2); + acc2 += (*pScr1++ * *pScr2); + acc3 += (*pScr1++ * *pScr2++); + + pScr1 -= 3U; + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q7_t) (__SSAT(acc0 >> 7U, 8)); + pOut += inc; + *pOut = (q7_t) (__SSAT(acc1 >> 7U, 8)); + pOut += inc; + *pOut = (q7_t) (__SSAT(acc2 >> 7U, 8)); + pOut += inc; + *pOut = (q7_t) (__SSAT(acc3 >> 7U, 8)); + pOut += inc; + + /* Initialization of inputB pointer */ + pScr2 = py; + + pScratch1 += 4U; + + } + + + blkCnt = (srcALen + srcBLen - 1U) & 0x3; + + /* Calculate correlation for remaining samples of Bigger length sequence */ + while (blkCnt > 0) + { + /* Initialze temporary scratch pointer as scratch1 */ + pScr1 = pScratch1; + + /* Clear Accumlators */ + acc0 = 0; + + tapCnt = (srcBLen) >> 1U; + + while (tapCnt > 0U) + { + acc0 += (*pScr1++ * *pScr2++); + acc0 += (*pScr1++ * *pScr2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (srcBLen) & 1U; + + /* apply same above for remaining samples of smaller length sequence */ + while (tapCnt > 0U) + { + + /* accumlate the results */ + acc0 += (*pScr1++ * *pScr2++); + + /* Decrement the loop counter */ + tapCnt--; + } + + blkCnt--; + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q7_t) (__SSAT(acc0 >> 7U, 8)); + + pOut += inc; + + /* Initialization of inputB pointer */ + pScr2 = py; + + pScratch1 += 1U; + + } + +} + +/** + * @} end of Corr group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q15.c new file mode 100644 index 0000000..fdff6db --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q15.c @@ -0,0 +1,707 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_correlate_q15.c + * Description: Correlation of Q15 sequences + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Corr + * @{ + */ + +/** + * @brief Correlation of Q15 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * The function is implemented using a 64-bit internal accumulator. + * Both inputs are in 1.15 format and multiplications yield a 2.30 result. + * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. + * This approach provides 33 guard bits and there is no risk of overflow. + * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format. + * + * \par + * Refer to arm_correlate_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. + * + * \par + * Refer the function arm_correlate_opt_q15() for a faster implementation of this function using scratch buffers. + * + */ + +void arm_correlate_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst) +{ + +#if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *pOut = pDst; /* output pointer */ + q63_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + q15_t *pSrc1; /* Intermediate pointers */ + q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */ + uint32_t j, k = 0U, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */ + int32_t inc = 1; /* Destination address modifier */ + + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and the destination pointer modifier, inc is set to -1 */ + /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ + /* But to improve the performance, + * we include zeroes in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, + * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ + /* If srcALen < srcBLen, + * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcA); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcB); + + /* Number of output samples is calculated */ + outBlockSize = (2U * srcALen) - 1U; + + /* When srcALen > srcBLen, zero padding is done to srcB + * to make their lengths equal. + * Instead, (outBlockSize - (srcALen + srcBLen - 1)) + * number of output samples are made zero */ + j = outBlockSize - (srcALen + (srcBLen - 1U)); + + /* Updating the pointer position to non zero value */ + pOut += j; + + } + else + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcB); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcA); + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + + /* CORR(x, y) = Reverse order(CORR(y, x)) */ + /* Hence set the destination pointer to point to the last output sample */ + pOut = pDst + ((srcALen + srcBLen) - 2U); + + /* Destination address modifier is set to -1 */ + inc = -1; + + } + + /* The function is internally + * divided into three parts according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first part of the + * algorithm, the multiplications increase by one for every iteration. + * In the second part of the algorithm, srcBLen number of multiplications are done. + * In the third part of the algorithm, the multiplications decrease by one + * for every iteration.*/ + /* The algorithm is implemented in three stages. + * The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1U; + blockSize2 = srcALen - (srcBLen - 1U); + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[srcBlen - 1] + * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1] + * .... + * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1U; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc1 = pIn2 + (srcBLen - 1U); + py = pSrc1; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first loop starts here */ + while (blockSize1 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */ + sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum); + /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */ + sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + /* x[0] * y[srcBLen - 1] */ + sum = __SMLALD(*px++, *py++, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (__SSAT((sum >> 15), 16)); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pSrc1 - count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] + * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] + * .... + * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0U; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */ + if (srcBLen >= 4U) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2U; + + while (blkCnt > 0U) + { + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* read x[0], x[1] samples */ + x0 = *__SIMD32(px); + /* read x[1], x[2] samples */ + x1 = _SIMD32_OFFSET(px + 1); + px += 2U; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read the first two inputB samples using SIMD: + * y[0] and y[1] */ + c0 = *__SIMD32(py)++; + + /* acc0 += x[0] * y[0] + x[1] * y[1] */ + acc0 = __SMLALD(x0, c0, acc0); + + /* acc1 += x[1] * y[0] + x[2] * y[1] */ + acc1 = __SMLALD(x1, c0, acc1); + + /* Read x[2], x[3] */ + x2 = *__SIMD32(px); + + /* Read x[3], x[4] */ + x3 = _SIMD32_OFFSET(px + 1); + + /* acc2 += x[2] * y[0] + x[3] * y[1] */ + acc2 = __SMLALD(x2, c0, acc2); + + /* acc3 += x[3] * y[0] + x[4] * y[1] */ + acc3 = __SMLALD(x3, c0, acc3); + + /* Read y[2] and y[3] */ + c0 = *__SIMD32(py)++; + + /* acc0 += x[2] * y[2] + x[3] * y[3] */ + acc0 = __SMLALD(x2, c0, acc0); + + /* acc1 += x[3] * y[2] + x[4] * y[3] */ + acc1 = __SMLALD(x3, c0, acc1); + + /* Read x[4], x[5] */ + x0 = _SIMD32_OFFSET(px + 2); + + /* Read x[5], x[6] */ + x1 = _SIMD32_OFFSET(px + 3); + + px += 4U; + + /* acc2 += x[4] * y[2] + x[5] * y[3] */ + acc2 = __SMLALD(x0, c0, acc2); + + /* acc3 += x[5] * y[2] + x[6] * y[3] */ + acc3 = __SMLALD(x1, c0, acc3); + + } while (--k); + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + if (k == 1U) + { + /* Read y[4] */ + c0 = *py; +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16U; + +#else + + c0 = c0 & 0x0000FFFF; + +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + /* Read x[7] */ + x3 = *__SIMD32(px); + px++; + + /* Perform the multiply-accumulates */ + acc0 = __SMLALD(x0, c0, acc0); + acc1 = __SMLALD(x1, c0, acc1); + acc2 = __SMLALDX(x1, c0, acc2); + acc3 = __SMLALDX(x3, c0, acc3); + } + + if (k == 2U) + { + /* Read y[4], y[5] */ + c0 = *__SIMD32(py); + + /* Read x[7], x[8] */ + x3 = *__SIMD32(px); + + /* Read x[9] */ + x2 = _SIMD32_OFFSET(px + 1); + px += 2U; + + /* Perform the multiply-accumulates */ + acc0 = __SMLALD(x0, c0, acc0); + acc1 = __SMLALD(x1, c0, acc1); + acc2 = __SMLALD(x3, c0, acc2); + acc3 = __SMLALD(x2, c0, acc3); + } + + if (k == 3U) + { + /* Read y[4], y[5] */ + c0 = *__SIMD32(py)++; + + /* Read x[7], x[8] */ + x3 = *__SIMD32(px); + + /* Read x[9] */ + x2 = _SIMD32_OFFSET(px + 1); + + /* Perform the multiply-accumulates */ + acc0 = __SMLALD(x0, c0, acc0); + acc1 = __SMLALD(x1, c0, acc1); + acc2 = __SMLALD(x3, c0, acc2); + acc3 = __SMLALD(x2, c0, acc3); + + c0 = (*py); + + /* Read y[6] */ +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16U; +#else + + c0 = c0 & 0x0000FFFF; +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + /* Read x[10] */ + x3 = _SIMD32_OFFSET(px + 2); + px += 3U; + + /* Perform the multiply-accumulates */ + acc0 = __SMLALDX(x1, c0, acc0); + acc1 = __SMLALD(x2, c0, acc1); + acc2 = __SMLALDX(x2, c0, acc2); + acc3 = __SMLALDX(x3, c0, acc3); + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (__SSAT(acc0 >> 15, 16)); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + *pOut = (q15_t) (__SSAT(acc1 >> 15, 16)); + pOut += inc; + + *pOut = (q15_t) (__SSAT(acc2 >> 15, 16)); + pOut += inc; + + *pOut = (q15_t) (__SSAT(acc3 >> 15, 16)); + pOut += inc; + + /* Increment the count by 4 as 4 output values are computed */ + count += 4U; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4U; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q63_t) * px++ * *py++); + sum += ((q63_t) * px++ * *py++); + sum += ((q63_t) * px++ * *py++); + sum += ((q63_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q63_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (__SSAT(sum >> 15, 16)); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment count by 1, as one output value is computed */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Loop over srcBLen */ + k = srcBLen; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += ((q63_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (__SSAT(sum >> 15, 16)); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * .... + * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] + * sum += x[srcALen-1] * y[0] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1U; + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U); + px = pSrc1; + + /* Working pointer of inputB */ + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while (blockSize3 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */ + sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum); + /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */ + sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum = __SMLALD(*px++, *py++, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q15_t) (__SSAT((sum >> 15), 16)); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + } + +#else + +/* Run the below code for Cortex-M0 */ + + q15_t *pIn1 = pSrcA; /* inputA pointer */ + q15_t *pIn2 = pSrcB + (srcBLen - 1U); /* inputB pointer */ + q63_t sum; /* Accumulators */ + uint32_t i = 0U, j; /* loop counters */ + uint32_t inv = 0U; /* Reverse order flag */ + uint32_t tot = 0U; /* Length */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and a varaible, inv is set to 1 */ + /* If lengths are not equal then zero pad has to be done to make the two + * inputs of same length. But to improve the performance, we include zeroes + * in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the + * starting of the output buffer */ + /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the + * ending of the output buffer */ + /* Once the zero padding is done the remaining of the output is calcualted + * using convolution but with the shorter signal time shifted. */ + + /* Calculate the length of the remaining sequence */ + tot = ((srcALen + srcBLen) - 2U); + + if (srcALen > srcBLen) + { + /* Calculating the number of zeros to be padded to the output */ + j = srcALen - srcBLen; + + /* Initialise the pointer after zero padding */ + pDst += j; + } + + else if (srcALen < srcBLen) + { + /* Initialization to inputB pointer */ + pIn1 = pSrcB; + + /* Initialization to the end of inputA pointer */ + pIn2 = pSrcA + (srcALen - 1U); + + /* Initialisation of the pointer after zero padding */ + pDst = pDst + tot; + + /* Swapping the lengths */ + j = srcALen; + srcALen = srcBLen; + srcBLen = j; + + /* Setting the reverse flag */ + inv = 1; + + } + + /* Loop to calculate convolution for output length number of times */ + for (i = 0U; i <= tot; i++) + { + /* Initialize sum with zero to carry on MAC operations */ + sum = 0; + + /* Loop to perform MAC operations according to convolution equation */ + for (j = 0U; j <= i; j++) + { + /* Check the array limitations */ + if ((((i - j) < srcBLen) && (j < srcALen))) + { + /* z[i] += x[i-j] * y[j] */ + sum += ((q31_t) pIn1[j] * pIn2[-((int32_t) i - j)]); + } + } + /* Store the output in the destination buffer */ + if (inv == 1) + *pDst-- = (q15_t) __SSAT((sum >> 15U), 16U); + else + *pDst++ = (q15_t) __SSAT((sum >> 15U), 16U); + } + +#endif /* #if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */ + +} + +/** + * @} end of Corr group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q31.c new file mode 100644 index 0000000..f2e946a --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q31.c @@ -0,0 +1,653 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_correlate_q31.c + * Description: Correlation of Q31 sequences + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Corr + * @{ + */ + +/** + * @brief Correlation of Q31 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * The function is implemented using an internal 64-bit accumulator. + * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * There is no saturation on intermediate additions. + * Thus, if the accumulator overflows it wraps around and distorts the result. + * The input signals should be scaled down to avoid intermediate overflows. + * Scale down one of the inputs by 1/min(srcALen, srcBLen)to avoid overflows since a + * maximum of min(srcALen, srcBLen) number of additions is carried internally. + * The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result. + * + * \par + * See arm_correlate_fast_q31() for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4. + */ + +void arm_correlate_q31( + q31_t * pSrcA, + uint32_t srcALen, + q31_t * pSrcB, + uint32_t srcBLen, + q31_t * pDst) +{ + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q31_t *pIn1; /* inputA pointer */ + q31_t *pIn2; /* inputB pointer */ + q31_t *pOut = pDst; /* output pointer */ + q31_t *px; /* Intermediate inputA pointer */ + q31_t *py; /* Intermediate inputB pointer */ + q31_t *pSrc1; /* Intermediate pointers */ + q63_t sum, acc0, acc1, acc2; /* Accumulators */ + q31_t x0, x1, x2, c0; /* temporary variables for holding input and coefficient values */ + uint32_t j, k = 0U, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */ + int32_t inc = 1; /* Destination address modifier */ + + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and the destination pointer modifier, inc is set to -1 */ + /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ + /* But to improve the performance, + * we include zeroes in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, + * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ + /* If srcALen < srcBLen, + * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcA); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcB); + + /* Number of output samples is calculated */ + outBlockSize = (2U * srcALen) - 1U; + + /* When srcALen > srcBLen, zero padding is done to srcB + * to make their lengths equal. + * Instead, (outBlockSize - (srcALen + srcBLen - 1)) + * number of output samples are made zero */ + j = outBlockSize - (srcALen + (srcBLen - 1U)); + + /* Updating the pointer position to non zero value */ + pOut += j; + + } + else + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcB); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcA); + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + + /* CORR(x, y) = Reverse order(CORR(y, x)) */ + /* Hence set the destination pointer to point to the last output sample */ + pOut = pDst + ((srcALen + srcBLen) - 2U); + + /* Destination address modifier is set to -1 */ + inc = -1; + + } + + /* The function is internally + * divided into three parts according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first part of the + * algorithm, the multiplications increase by one for every iteration. + * In the second part of the algorithm, srcBLen number of multiplications are done. + * In the third part of the algorithm, the multiplications decrease by one + * for every iteration.*/ + /* The algorithm is implemented in three stages. + * The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1U; + blockSize2 = srcALen - (srcBLen - 1U); + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[srcBlen - 1] + * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1] + * .... + * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1U; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc1 = pIn2 + (srcBLen - 1U); + py = pSrc1; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first stage starts here */ + while (blockSize1 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* x[0] * y[srcBLen - 4] */ + sum += (q63_t) * px++ * (*py++); + /* x[1] * y[srcBLen - 3] */ + sum += (q63_t) * px++ * (*py++); + /* x[2] * y[srcBLen - 2] */ + sum += (q63_t) * px++ * (*py++); + /* x[3] * y[srcBLen - 1] */ + sum += (q63_t) * px++ * (*py++); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + /* x[0] * y[srcBLen - 1] */ + sum += (q63_t) * px++ * (*py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q31_t) (sum >> 31); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pSrc1 - count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] + * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] + * .... + * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0U; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if (srcBLen >= 4U) + { + /* Loop unroll by 3 */ + blkCnt = blockSize2 / 3; + + while (blkCnt > 0U) + { + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + + /* read x[0], x[1] samples */ + x0 = *(px++); + x1 = *(px++); + + /* Apply loop unrolling and compute 3 MACs simultaneously. */ + k = srcBLen / 3; + + /* First part of the processing with loop unrolling. Compute 3 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 2 samples. */ + do + { + /* Read y[0] sample */ + c0 = *(py); + + /* Read x[2] sample */ + x2 = *(px); + + /* Perform the multiply-accumulate */ + /* acc0 += x[0] * y[0] */ + acc0 += ((q63_t) x0 * c0); + /* acc1 += x[1] * y[0] */ + acc1 += ((q63_t) x1 * c0); + /* acc2 += x[2] * y[0] */ + acc2 += ((q63_t) x2 * c0); + + /* Read y[1] sample */ + c0 = *(py + 1U); + + /* Read x[3] sample */ + x0 = *(px + 1U); + + /* Perform the multiply-accumulates */ + /* acc0 += x[1] * y[1] */ + acc0 += ((q63_t) x1 * c0); + /* acc1 += x[2] * y[1] */ + acc1 += ((q63_t) x2 * c0); + /* acc2 += x[3] * y[1] */ + acc2 += ((q63_t) x0 * c0); + + /* Read y[2] sample */ + c0 = *(py + 2U); + + /* Read x[4] sample */ + x1 = *(px + 2U); + + /* Perform the multiply-accumulates */ + /* acc0 += x[2] * y[2] */ + acc0 += ((q63_t) x2 * c0); + /* acc1 += x[3] * y[2] */ + acc1 += ((q63_t) x0 * c0); + /* acc2 += x[4] * y[2] */ + acc2 += ((q63_t) x1 * c0); + + /* update scratch pointers */ + px += 3U; + py += 3U; + + } while (--k); + + /* If the srcBLen is not a multiple of 3, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen - (3 * (srcBLen / 3)); + + while (k > 0U) + { + /* Read y[4] sample */ + c0 = *(py++); + + /* Read x[7] sample */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[4] */ + acc0 += ((q63_t) x0 * c0); + /* acc1 += x[5] * y[4] */ + acc1 += ((q63_t) x1 * c0); + /* acc2 += x[6] * y[4] */ + acc2 += ((q63_t) x2 * c0); + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q31_t) (acc0 >> 31); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + *pOut = (q31_t) (acc1 >> 31); + pOut += inc; + + *pOut = (q31_t) (acc2 >> 31); + pOut += inc; + + /* Increment the pointer pIn1 index, count by 3 */ + count += 3U; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 - 3 * (blockSize2 / 3); + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += (q63_t) * px++ * (*py++); + sum += (q63_t) * px++ * (*py++); + sum += (q63_t) * px++ * (*py++); + sum += (q63_t) * px++ * (*py++); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) * px++ * (*py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q31_t) (sum >> 31); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Loop over srcBLen */ + k = srcBLen; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) * px++ * (*py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q31_t) (sum >> 31); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * .... + * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] + * sum += x[srcALen-1] * y[0] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1U; + + /* Working pointer of inputA */ + pSrc1 = pIn1 + (srcALen - (srcBLen - 1U)); + px = pSrc1; + + /* Working pointer of inputB */ + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while (blockSize3 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen - srcBLen + 4] * y[3] */ + sum += (q63_t) * px++ * (*py++); + /* sum += x[srcALen - srcBLen + 3] * y[2] */ + sum += (q63_t) * px++ * (*py++); + /* sum += x[srcALen - srcBLen + 2] * y[1] */ + sum += (q63_t) * px++ * (*py++); + /* sum += x[srcALen - srcBLen + 1] * y[0] */ + sum += (q63_t) * px++ * (*py++); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += (q63_t) * px++ * (*py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q31_t) (sum >> 31); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + q31_t *pIn1 = pSrcA; /* inputA pointer */ + q31_t *pIn2 = pSrcB + (srcBLen - 1U); /* inputB pointer */ + q63_t sum; /* Accumulators */ + uint32_t i = 0U, j; /* loop counters */ + uint32_t inv = 0U; /* Reverse order flag */ + uint32_t tot = 0U; /* Length */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and a varaible, inv is set to 1 */ + /* If lengths are not equal then zero pad has to be done to make the two + * inputs of same length. But to improve the performance, we include zeroes + * in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the + * starting of the output buffer */ + /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the + * ending of the output buffer */ + /* Once the zero padding is done the remaining of the output is calcualted + * using correlation but with the shorter signal time shifted. */ + + /* Calculate the length of the remaining sequence */ + tot = ((srcALen + srcBLen) - 2U); + + if (srcALen > srcBLen) + { + /* Calculating the number of zeros to be padded to the output */ + j = srcALen - srcBLen; + + /* Initialise the pointer after zero padding */ + pDst += j; + } + + else if (srcALen < srcBLen) + { + /* Initialization to inputB pointer */ + pIn1 = pSrcB; + + /* Initialization to the end of inputA pointer */ + pIn2 = pSrcA + (srcALen - 1U); + + /* Initialisation of the pointer after zero padding */ + pDst = pDst + tot; + + /* Swapping the lengths */ + j = srcALen; + srcALen = srcBLen; + srcBLen = j; + + /* Setting the reverse flag */ + inv = 1; + + } + + /* Loop to calculate correlation for output length number of times */ + for (i = 0U; i <= tot; i++) + { + /* Initialize sum with zero to carry on MAC operations */ + sum = 0; + + /* Loop to perform MAC operations according to correlation equation */ + for (j = 0U; j <= i; j++) + { + /* Check the array limitations */ + if ((((i - j) < srcBLen) && (j < srcALen))) + { + /* z[i] += x[i-j] * y[j] */ + sum += ((q63_t) pIn1[j] * pIn2[-((int32_t) i - j)]); + } + } + /* Store the output in the destination buffer */ + if (inv == 1) + *pDst-- = (q31_t) (sum >> 31U); + else + *pDst++ = (q31_t) (sum >> 31U); + } + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + +/** + * @} end of Corr group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q7.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q7.c new file mode 100644 index 0000000..f8b1df5 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q7.c @@ -0,0 +1,778 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_correlate_q7.c + * Description: Correlation of Q7 sequences + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Corr + * @{ + */ + +/** + * @brief Correlation of Q7 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * The function is implemented using a 32-bit internal accumulator. + * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result. + * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. + * This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072. + * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and saturated to 1.7 format. + * + * \par + * Refer the function arm_correlate_opt_q7() for a faster implementation of this function. + * + */ + +void arm_correlate_q7( + q7_t * pSrcA, + uint32_t srcALen, + q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst) +{ + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q7_t *pIn1; /* inputA pointer */ + q7_t *pIn2; /* inputB pointer */ + q7_t *pOut = pDst; /* output pointer */ + q7_t *px; /* Intermediate inputA pointer */ + q7_t *py; /* Intermediate inputB pointer */ + q7_t *pSrc1; /* Intermediate pointers */ + q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ + q31_t input1, input2; /* temporary variables */ + q15_t in1, in2; /* temporary variables */ + q7_t x0, x1, x2, x3, c0, c1; /* temporary variables for holding input and coefficient values */ + uint32_t j, k = 0U, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */ + int32_t inc = 1; + + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and the destination pointer modifier, inc is set to -1 */ + /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ + /* But to improve the performance, + * we include zeroes in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, + * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ + /* If srcALen < srcBLen, + * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcA); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcB); + + /* Number of output samples is calculated */ + outBlockSize = (2U * srcALen) - 1U; + + /* When srcALen > srcBLen, zero padding is done to srcB + * to make their lengths equal. + * Instead, (outBlockSize - (srcALen + srcBLen - 1)) + * number of output samples are made zero */ + j = outBlockSize - (srcALen + (srcBLen - 1U)); + + /* Updating the pointer position to non zero value */ + pOut += j; + + } + else + { + /* Initialization of inputA pointer */ + pIn1 = (pSrcB); + + /* Initialization of inputB pointer */ + pIn2 = (pSrcA); + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + + /* CORR(x, y) = Reverse order(CORR(y, x)) */ + /* Hence set the destination pointer to point to the last output sample */ + pOut = pDst + ((srcALen + srcBLen) - 2U); + + /* Destination address modifier is set to -1 */ + inc = -1; + + } + + /* The function is internally + * divided into three parts according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first part of the + * algorithm, the multiplications increase by one for every iteration. + * In the second part of the algorithm, srcBLen number of multiplications are done. + * In the third part of the algorithm, the multiplications decrease by one + * for every iteration.*/ + /* The algorithm is implemented in three stages. + * The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1U; + blockSize2 = srcALen - (srcBLen - 1U); + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[srcBlen - 1] + * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1] + * .... + * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1U; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc1 = pIn2 + (srcBLen - 1U); + py = pSrc1; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first stage starts here */ + while (blockSize1 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* x[0] , x[1] */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* y[srcBLen - 4] , y[srcBLen - 3] */ + in1 = (q15_t) * py++; + in2 = (q15_t) * py++; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* x[0] * y[srcBLen - 4] */ + /* x[1] * y[srcBLen - 3] */ + sum = __SMLAD(input1, input2, sum); + + /* x[2] , x[3] */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* y[srcBLen - 2] , y[srcBLen - 1] */ + in1 = (q15_t) * py++; + in2 = (q15_t) * py++; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* x[2] * y[srcBLen - 2] */ + /* x[3] * y[srcBLen - 1] */ + sum = __SMLAD(input1, input2, sum); + + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + /* x[0] * y[srcBLen - 1] */ + sum += (q31_t) ((q15_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q7_t) (__SSAT(sum >> 7, 8)); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pSrc1 - count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] + * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] + * .... + * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0U; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if (srcBLen >= 4U) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2U; + + while (blkCnt > 0U) + { + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* read x[0], x[1], x[2] samples */ + x0 = *px++; + x1 = *px++; + x2 = *px++; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read y[0] sample */ + c0 = *py++; + /* Read y[1] sample */ + c1 = *py++; + + /* Read x[3] sample */ + x3 = *px++; + + /* x[0] and x[1] are packed */ + in1 = (q15_t) x0; + in2 = (q15_t) x1; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* y[0] and y[1] are packed */ + in1 = (q15_t) c0; + in2 = (q15_t) c1; + + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc0 += x[0] * y[0] + x[1] * y[1] */ + acc0 = __SMLAD(input1, input2, acc0); + + /* x[1] and x[2] are packed */ + in1 = (q15_t) x1; + in2 = (q15_t) x2; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc1 += x[1] * y[0] + x[2] * y[1] */ + acc1 = __SMLAD(input1, input2, acc1); + + /* x[2] and x[3] are packed */ + in1 = (q15_t) x2; + in2 = (q15_t) x3; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc2 += x[2] * y[0] + x[3] * y[1] */ + acc2 = __SMLAD(input1, input2, acc2); + + /* Read x[4] sample */ + x0 = *(px++); + + /* x[3] and x[4] are packed */ + in1 = (q15_t) x3; + in2 = (q15_t) x0; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc3 += x[3] * y[0] + x[4] * y[1] */ + acc3 = __SMLAD(input1, input2, acc3); + + /* Read y[2] sample */ + c0 = *py++; + /* Read y[3] sample */ + c1 = *py++; + + /* Read x[5] sample */ + x1 = *px++; + + /* x[2] and x[3] are packed */ + in1 = (q15_t) x2; + in2 = (q15_t) x3; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* y[2] and y[3] are packed */ + in1 = (q15_t) c0; + in2 = (q15_t) c1; + + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc0 += x[2] * y[2] + x[3] * y[3] */ + acc0 = __SMLAD(input1, input2, acc0); + + /* x[3] and x[4] are packed */ + in1 = (q15_t) x3; + in2 = (q15_t) x0; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc1 += x[3] * y[2] + x[4] * y[3] */ + acc1 = __SMLAD(input1, input2, acc1); + + /* x[4] and x[5] are packed */ + in1 = (q15_t) x0; + in2 = (q15_t) x1; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc2 += x[4] * y[2] + x[5] * y[3] */ + acc2 = __SMLAD(input1, input2, acc2); + + /* Read x[6] sample */ + x2 = *px++; + + /* x[5] and x[6] are packed */ + in1 = (q15_t) x1; + in2 = (q15_t) x2; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* acc3 += x[5] * y[2] + x[6] * y[3] */ + acc3 = __SMLAD(input1, input2, acc3); + + } while (--k); + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Read y[4] sample */ + c0 = *py++; + + /* Read x[7] sample */ + x3 = *px++; + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[4] */ + acc0 += ((q15_t) x0 * c0); + /* acc1 += x[5] * y[4] */ + acc1 += ((q15_t) x1 * c0); + /* acc2 += x[6] * y[4] */ + acc2 += ((q15_t) x2 * c0); + /* acc3 += x[7] * y[4] */ + acc3 += ((q15_t) x3 * c0); + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q7_t) (__SSAT(acc0 >> 7, 8)); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + *pOut = (q7_t) (__SSAT(acc1 >> 7, 8)); + pOut += inc; + + *pOut = (q7_t) (__SSAT(acc2 >> 7, 8)); + pOut += inc; + + *pOut = (q7_t) (__SSAT(acc3 >> 7, 8)); + pOut += inc; + + count += 4U; + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4U; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* Reading two inputs of SrcA buffer and packing */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* Reading two inputs of SrcB buffer and packing */ + in1 = (q15_t) * py++; + in2 = (q15_t) * py++; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* Perform the multiply-accumulates */ + sum = __SMLAD(input1, input2, sum); + + /* Reading two inputs of SrcA buffer and packing */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* Reading two inputs of SrcB buffer and packing */ + in1 = (q15_t) * py++; + in2 = (q15_t) * py++; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* Perform the multiply-accumulates */ + sum = __SMLAD(input1, input2, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q15_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q7_t) (__SSAT(sum >> 7, 8)); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Loop over srcBLen */ + k = srcBLen; + + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += ((q15_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q7_t) (__SSAT(sum >> 7, 8)); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + + /* Decrement the loop counter */ + blkCnt--; + } + } + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * .... + * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] + * sum += x[srcALen-1] * y[0] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1U; + + /* Working pointer of inputA */ + pSrc1 = pIn1 + (srcALen - (srcBLen - 1U)); + px = pSrc1; + + /* Working pointer of inputB */ + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while (blockSize3 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while (k > 0U) + { + /* x[srcALen - srcBLen + 1] , x[srcALen - srcBLen + 2] */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* y[0] , y[1] */ + in1 = (q15_t) * py++; + in2 = (q15_t) * py++; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* sum += x[srcALen - srcBLen + 1] * y[0] */ + /* sum += x[srcALen - srcBLen + 2] * y[1] */ + sum = __SMLAD(input1, input2, sum); + + /* x[srcALen - srcBLen + 3] , x[srcALen - srcBLen + 4] */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* y[2] , y[3] */ + in1 = (q15_t) * py++; + in2 = (q15_t) * py++; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); + + /* sum += x[srcALen - srcBLen + 3] * y[2] */ + /* sum += x[srcALen - srcBLen + 4] * y[3] */ + sum = __SMLAD(input1, input2, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4U; + + while (k > 0U) + { + /* Perform the multiply-accumulates */ + sum += ((q15_t) * px++ * *py++); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = (q7_t) (__SSAT(sum >> 7, 8)); + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + } + +#else + +/* Run the below code for Cortex-M0 */ + + q7_t *pIn1 = pSrcA; /* inputA pointer */ + q7_t *pIn2 = pSrcB + (srcBLen - 1U); /* inputB pointer */ + q31_t sum; /* Accumulator */ + uint32_t i = 0U, j; /* loop counters */ + uint32_t inv = 0U; /* Reverse order flag */ + uint32_t tot = 0U; /* Length */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and a varaible, inv is set to 1 */ + /* If lengths are not equal then zero pad has to be done to make the two + * inputs of same length. But to improve the performance, we include zeroes + * in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the + * starting of the output buffer */ + /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the + * ending of the output buffer */ + /* Once the zero padding is done the remaining of the output is calcualted + * using convolution but with the shorter signal time shifted. */ + + /* Calculate the length of the remaining sequence */ + tot = ((srcALen + srcBLen) - 2U); + + if (srcALen > srcBLen) + { + /* Calculating the number of zeros to be padded to the output */ + j = srcALen - srcBLen; + + /* Initialise the pointer after zero padding */ + pDst += j; + } + + else if (srcALen < srcBLen) + { + /* Initialization to inputB pointer */ + pIn1 = pSrcB; + + /* Initialization to the end of inputA pointer */ + pIn2 = pSrcA + (srcALen - 1U); + + /* Initialisation of the pointer after zero padding */ + pDst = pDst + tot; + + /* Swapping the lengths */ + j = srcALen; + srcALen = srcBLen; + srcBLen = j; + + /* Setting the reverse flag */ + inv = 1; + + } + + /* Loop to calculate convolution for output length number of times */ + for (i = 0U; i <= tot; i++) + { + /* Initialize sum with zero to carry on MAC operations */ + sum = 0; + + /* Loop to perform MAC operations according to convolution equation */ + for (j = 0U; j <= i; j++) + { + /* Check the array limitations */ + if ((((i - j) < srcBLen) && (j < srcALen))) + { + /* z[i] += x[i-j] * y[j] */ + sum += ((q15_t) pIn1[j] * pIn2[-((int32_t) i - j)]); + } + } + /* Store the output in the destination buffer */ + if (inv == 1) + *pDst-- = (q7_t) __SSAT((sum >> 7U), 8U); + else + *pDst++ = (q7_t) __SSAT((sum >> 7U), 8U); + } + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + +/** + * @} end of Corr group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_f32.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_f32.c new file mode 100644 index 0000000..fd8e237 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_f32.c @@ -0,0 +1,512 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_decimate_f32.c + * Description: FIR decimation for floating-point sequences + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup FIR_decimate Finite Impulse Response (FIR) Decimator + * + * These functions combine an FIR filter together with a decimator. + * They are used in multirate systems for reducing the sample rate of a signal without introducing aliasing distortion. + * Conceptually, the functions are equivalent to the block diagram below: + * \image html FIRDecimator.gif "Components included in the FIR Decimator functions" + * When decimating by a factor of M, the signal should be prefiltered by a lowpass filter with a normalized + * cutoff frequency of 1/M in order to prevent aliasing distortion. + * The user of the function is responsible for providing the filter coefficients. + * + * The FIR decimator functions provided in the CMSIS DSP Library combine the FIR filter and the decimator in an efficient manner. + * Instead of calculating all of the FIR filter outputs and discarding M-1 out of every M, only the + * samples output by the decimator are computed. + * The functions operate on blocks of input and output data. + * pSrc points to an array of blockSize input values and + * pDst points to an array of blockSize/M output values. + * In order to have an integer number of output samples blockSize + * must always be a multiple of the decimation factor M. + * + * The library provides separate functions for Q15, Q31 and floating-point data types. + * + * \par Algorithm: + * The FIR portion of the algorithm uses the standard form filter: + *
+ *    y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
+ * 
+ * where, b[n] are the filter coefficients. + * \par + * The pCoeffs points to a coefficient array of size numTaps. + * Coefficients are stored in time reversed order. + * \par + *
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+ * 
+ * \par + * pState points to a state array of size numTaps + blockSize - 1. + * Samples in the state buffer are stored in the order: + * \par + *
+ *    {x[n-numTaps+1], x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2]....x[0], x[1], ..., x[blockSize-1]}
+ * 
+ * The state variables are updated after each block of data is processed, the coefficients are untouched. + * + * \par Instance Structure + * The coefficients and state variables for a filter are stored together in an instance data structure. + * A separate instance structure must be defined for each filter. + * Coefficient arrays may be shared among several instances while state variable array should be allocated separately. + * There are separate instance structure declarations for each of the 3 supported data types. + * + * \par Initialization Functions + * There is also an associated initialization function for each data type. + * The initialization function performs the following operations: + * - Sets the values of the internal structure fields. + * - Zeros out the values in the state buffer. + * - Checks to make sure that the size of the input is a multiple of the decimation factor. + * To do this manually without calling the init function, assign the follow subfields of the instance structure: + * numTaps, pCoeffs, M (decimation factor), pState. Also set all of the values in pState to zero. + * + * \par + * Use of the initialization function is optional. + * However, if the initialization function is used, then the instance structure cannot be placed into a const data section. + * To place an instance structure into a const data section, the instance structure must be manually initialized. + * The code below statically initializes each of the 3 different data type filter instance structures + *
+ *arm_fir_decimate_instance_f32 S = {M, numTaps, pCoeffs, pState};
+ *arm_fir_decimate_instance_q31 S = {M, numTaps, pCoeffs, pState};
+ *arm_fir_decimate_instance_q15 S = {M, numTaps, pCoeffs, pState};
+ * 
+ * where M is the decimation factor; numTaps is the number of filter coefficients in the filter; + * pCoeffs is the address of the coefficient buffer; + * pState is the address of the state buffer. + * Be sure to set the values in the state buffer to zeros when doing static initialization. + * + * \par Fixed-Point Behavior + * Care must be taken when using the fixed-point versions of the FIR decimate filter functions. + * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered. + * Refer to the function specific documentation below for usage guidelines. + */ + +/** + * @addtogroup FIR_decimate + * @{ + */ + + /** + * @brief Processing function for the floating-point FIR decimator. + * @param[in] *S points to an instance of the floating-point FIR decimator structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of input samples to process per call. + * @return none. + */ + +void arm_fir_decimate_f32( + const arm_fir_decimate_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + float32_t *pState = S->pState; /* State pointer */ + float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float32_t *pStateCurnt; /* Points to the current sample of the state */ + float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */ + float32_t sum0; /* Accumulator */ + float32_t x0, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */ + +#if defined (ARM_MATH_DSP) + + uint32_t blkCntN4; + float32_t *px0, *px1, *px2, *px3; + float32_t acc0, acc1, acc2, acc3; + float32_t x1, x2, x3; + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (numTaps - 1U); + + /* Total number of output samples to be computed */ + blkCnt = outBlockSize / 4; + blkCntN4 = outBlockSize - (4 * blkCnt); + + while (blkCnt > 0U) + { + /* Copy 4 * decimation factor number of new input samples into the state buffer */ + i = 4 * S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while (--i); + + /* Set accumulators to zero */ + acc0 = 0.0f; + acc1 = 0.0f; + acc2 = 0.0f; + acc3 = 0.0f; + + /* Initialize state pointer for all the samples */ + px0 = pState; + px1 = pState + S->M; + px2 = pState + 2 * S->M; + px3 = pState + 3 * S->M; + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + + while (tapCnt > 0U) + { + /* Read the b[numTaps-1] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-1] sample for acc0 */ + x0 = *(px0++); + /* Read x[n-numTaps-1] sample for acc1 */ + x1 = *(px1++); + /* Read x[n-numTaps-1] sample for acc2 */ + x2 = *(px2++); + /* Read x[n-numTaps-1] sample for acc3 */ + x3 = *(px3++); + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + acc2 += x2 * c0; + acc3 += x3 * c0; + + /* Read the b[numTaps-2] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-2] sample for acc0, acc1, acc2, acc3 */ + x0 = *(px0++); + x1 = *(px1++); + x2 = *(px2++); + x3 = *(px3++); + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + acc2 += x2 * c0; + acc3 += x3 * c0; + + /* Read the b[numTaps-3] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-3] sample acc0, acc1, acc2, acc3 */ + x0 = *(px0++); + x1 = *(px1++); + x2 = *(px2++); + x3 = *(px3++); + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + acc2 += x2 * c0; + acc3 += x3 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-4] sample acc0, acc1, acc2, acc3 */ + x0 = *(px0++); + x1 = *(px1++); + x2 = *(px2++); + x3 = *(px3++); + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + acc2 += x2 * c0; + acc3 += x3 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4U; + + while (tapCnt > 0U) + { + /* Read coefficients */ + c0 = *(pb++); + + /* Fetch state variables for acc0, acc1, acc2, acc3 */ + x0 = *(px0++); + x1 = *(px1++); + x2 = *(px2++); + x3 = *(px3++); + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + acc2 += x2 * c0; + acc3 += x3 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + 4 * S->M; + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst++ = acc0; + *pDst++ = acc1; + *pDst++ = acc2; + *pDst++ = acc3; + + /* Decrement the loop counter */ + blkCnt--; + } + + while (blkCntN4 > 0U) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while (--i); + + /* Set accumulator to zero */ + sum0 = 0.0f; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while (tapCnt > 0U) + { + /* Read the b[numTaps-1] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-1] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the b[numTaps-2] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-2] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the b[numTaps-3] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-3] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-4] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4U; + + while (tapCnt > 0U) + { + /* Read coefficients */ + c0 = *(pb++); + + /* Fetch 1 state variable */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M; + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst++ = sum0; + + /* Decrement the loop counter */ + blkCntN4--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + i = (numTaps - 1U) >> 2; + + /* copy data */ + while (i > 0U) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } + + i = (numTaps - 1U) % 0x04U; + + /* copy data */ + while (i > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } + +#else + +/* Run the below code for Cortex-M0 */ + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (numTaps - 1U); + + /* Total number of output samples to be computed */ + blkCnt = outBlockSize; + + while (blkCnt > 0U) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while (--i); + + /* Set accumulator to zero */ + sum0 = 0.0f; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = pCoeffs; + + tapCnt = numTaps; + + while (tapCnt > 0U) + { + /* Read coefficients */ + c0 = *pb++; + + /* Fetch 1 state variable */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M; + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst++ = sum0; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the start of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + /* Copy numTaps number of values */ + i = (numTaps - 1U); + + /* copy data */ + while (i > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + +/** + * @} end of FIR_decimate group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q15.c new file mode 100644 index 0000000..684640e --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q15.c @@ -0,0 +1,586 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_decimate_fast_q15.c + * Description: Fast Q15 FIR Decimator + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_decimate + * @{ + */ + +/** + * @brief Processing function for the Q15 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4. + * @param[in] *S points to an instance of the Q15 FIR decimator structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data + * @param[in] blockSize number of input samples to process per call. + * @return none + * + * \par Restrictions + * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE + * In this case input, output, state buffers should be aligned by 32-bit + * + * Scaling and Overflow Behavior: + * \par + * This fast version uses a 32-bit accumulator with 2.30 format. + * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * Thus, if the accumulator result overflows it wraps around and distorts the result. + * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (log2 is read as log to the base 2). + * The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result. + * + * \par + * Refer to the function arm_fir_decimate_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. + * Both the slow and the fast versions use the same instance structure. + * Use the function arm_fir_decimate_init_q15() to initialize the filter structure. + */ + +#ifndef UNALIGNED_SUPPORT_DISABLE + +void arm_fir_decimate_fast_q15( + const arm_fir_decimate_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + q15_t *px; /* Temporary pointer for state buffer */ + q15_t *pb; /* Temporary pointer coefficient buffer */ + q31_t x0, x1, c0, c1; /* Temporary variables to hold state and coefficient values */ + q31_t sum0; /* Accumulators */ + q31_t acc0, acc1; + q15_t *px0, *px1; + uint32_t blkCntN3; + uint32_t numTaps = S->numTaps; /* Number of taps */ + uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */ + + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (numTaps - 1U); + + + /* Total number of output samples to be computed */ + blkCnt = outBlockSize / 2; + blkCntN3 = outBlockSize - (2 * blkCnt); + + + while (blkCnt > 0U) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = 2 * S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while (--i); + + /* Set accumulator to zero */ + acc0 = 0; + acc1 = 0; + + /* Initialize state pointer */ + px0 = pState; + + px1 = pState + S->M; + + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while (tapCnt > 0U) + { + /* Read the Read b[numTaps-1] and b[numTaps-2] coefficients */ + c0 = *__SIMD32(pb)++; + + /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */ + x0 = *__SIMD32(px0)++; + + x1 = *__SIMD32(px1)++; + + /* Perform the multiply-accumulate */ + acc0 = __SMLAD(x0, c0, acc0); + + acc1 = __SMLAD(x1, c0, acc1); + + /* Read the b[numTaps-3] and b[numTaps-4] coefficient */ + c0 = *__SIMD32(pb)++; + + /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */ + x0 = *__SIMD32(px0)++; + + x1 = *__SIMD32(px1)++; + + /* Perform the multiply-accumulate */ + acc0 = __SMLAD(x0, c0, acc0); + + acc1 = __SMLAD(x1, c0, acc1); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4U; + + while (tapCnt > 0U) + { + /* Read coefficients */ + c0 = *pb++; + + /* Fetch 1 state variable */ + x0 = *px0++; + + x1 = *px1++; + + /* Perform the multiply-accumulate */ + acc0 = __SMLAD(x0, c0, acc0); + acc1 = __SMLAD(x1, c0, acc1); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M * 2; + + /* Store filter output, smlad returns the values in 2.14 format */ + /* so downsacle by 15 to get output in 1.15 */ + *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16)); + + /* Decrement the loop counter */ + blkCnt--; + } + + + + while (blkCntN3 > 0U) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while (--i); + + /*Set sum to zero */ + sum0 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while (tapCnt > 0U) + { + /* Read the Read b[numTaps-1] and b[numTaps-2] coefficients */ + c0 = *__SIMD32(pb)++; + + /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */ + x0 = *__SIMD32(px)++; + + /* Read the b[numTaps-3] and b[numTaps-4] coefficient */ + c1 = *__SIMD32(pb)++; + + /* Perform the multiply-accumulate */ + sum0 = __SMLAD(x0, c0, sum0); + + /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */ + x0 = *__SIMD32(px)++; + + /* Perform the multiply-accumulate */ + sum0 = __SMLAD(x0, c1, sum0); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4U; + + while (tapCnt > 0U) + { + /* Read coefficients */ + c0 = *pb++; + + /* Fetch 1 state variable */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 = __SMLAD(x0, c0, sum0); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M; + + /* Store filter output, smlad returns the values in 2.14 format */ + /* so downsacle by 15 to get output in 1.15 */ + *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16)); + + /* Decrement the loop counter */ + blkCntN3--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + i = (numTaps - 1U) >> 2U; + + /* copy data */ + while (i > 0U) + { + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + + /* Decrement the loop counter */ + i--; + } + + i = (numTaps - 1U) % 0x04U; + + /* copy data */ + while (i > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } +} + +#else + + +void arm_fir_decimate_fast_q15( + const arm_fir_decimate_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + q15_t *px; /* Temporary pointer for state buffer */ + q15_t *pb; /* Temporary pointer coefficient buffer */ + q15_t x0, x1, c0; /* Temporary variables to hold state and coefficient values */ + q31_t sum0; /* Accumulators */ + q31_t acc0, acc1; + q15_t *px0, *px1; + uint32_t blkCntN3; + uint32_t numTaps = S->numTaps; /* Number of taps */ + uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */ + + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (numTaps - 1U); + + + /* Total number of output samples to be computed */ + blkCnt = outBlockSize / 2; + blkCntN3 = outBlockSize - (2 * blkCnt); + + while (blkCnt > 0U) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = 2 * S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while (--i); + + /* Set accumulator to zero */ + acc0 = 0; + acc1 = 0; + + /* Initialize state pointer */ + px0 = pState; + + px1 = pState + S->M; + + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while (tapCnt > 0U) + { + /* Read the Read b[numTaps-1] coefficients */ + c0 = *pb++; + + /* Read x[n-numTaps-1] for sample 0 and for sample 1 */ + x0 = *px0++; + x1 = *px1++; + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + + /* Read the b[numTaps-2] coefficient */ + c0 = *pb++; + + /* Read x[n-numTaps-2] for sample 0 and sample 1 */ + x0 = *px0++; + x1 = *px1++; + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + + /* Read the b[numTaps-3] coefficients */ + c0 = *pb++; + + /* Read x[n-numTaps-3] for sample 0 and sample 1 */ + x0 = *px0++; + x1 = *px1++; + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *pb++; + + /* Read x[n-numTaps-4] for sample 0 and sample 1 */ + x0 = *px0++; + x1 = *px1++; + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4U; + + while (tapCnt > 0U) + { + /* Read coefficients */ + c0 = *pb++; + + /* Fetch 1 state variable */ + x0 = *px0++; + x1 = *px1++; + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M * 2; + + /* Store filter output, smlad returns the values in 2.14 format */ + /* so downsacle by 15 to get output in 1.15 */ + + *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16)); + + + /* Decrement the loop counter */ + blkCnt--; + } + + while (blkCntN3 > 0U) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while (--i); + + /*Set sum to zero */ + sum0 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while (tapCnt > 0U) + { + /* Read the Read b[numTaps-1] coefficients */ + c0 = *pb++; + + /* Read x[n-numTaps-1] and sample */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the b[numTaps-2] coefficient */ + c0 = *pb++; + + /* Read x[n-numTaps-2] and sample */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the b[numTaps-3] coefficients */ + c0 = *pb++; + + /* Read x[n-numTaps-3] sample */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *pb++; + + /* Read x[n-numTaps-4] sample */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4U; + + while (tapCnt > 0U) + { + /* Read coefficients */ + c0 = *pb++; + + /* Fetch 1 state variable */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M; + + /* Store filter output, smlad returns the values in 2.14 format */ + /* so downsacle by 15 to get output in 1.15 */ + *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16)); + + /* Decrement the loop counter */ + blkCntN3--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + i = (numTaps - 1U) >> 2U; + + /* copy data */ + while (i > 0U) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } + + i = (numTaps - 1U) % 0x04U; + + /* copy data */ + while (i > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } +} + + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + +/** + * @} end of FIR_decimate group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q31.c new file mode 100644 index 0000000..46b7d3d --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q31.c @@ -0,0 +1,339 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_decimate_fast_q31.c + * Description: Fast Q31 FIR Decimator + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_decimate + * @{ + */ + +/** + * @brief Processing function for the Q31 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4. + * @param[in] *S points to an instance of the Q31 FIR decimator structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data + * @param[in] blockSize number of input samples to process per call. + * @return none + * + * Scaling and Overflow Behavior: + * + * \par + * This function is optimized for speed at the expense of fixed-point precision and overflow protection. + * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format. + * These intermediate results are added to a 2.30 accumulator. + * Finally, the accumulator is saturated and converted to a 1.31 result. + * The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result. + * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (where log2 is read as log to the base 2). + * + * \par + * Refer to the function arm_fir_decimate_q31() for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision. + * Both the slow and the fast versions use the same instance structure. + * Use the function arm_fir_decimate_init_q31() to initialize the filter structure. + */ + +void arm_fir_decimate_fast_q31( + arm_fir_decimate_instance_q31 * S, + q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + q31_t *pState = S->pState; /* State pointer */ + q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q31_t *pStateCurnt; /* Points to the current sample of the state */ + q31_t x0, c0; /* Temporary variables to hold state and coefficient values */ + q31_t *px; /* Temporary pointers for state buffer */ + q31_t *pb; /* Temporary pointers for coefficient buffer */ + q31_t sum0; /* Accumulator */ + uint32_t numTaps = S->numTaps; /* Number of taps */ + uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */ + uint32_t blkCntN2; + q31_t x1; + q31_t acc0, acc1; + q31_t *px0, *px1; + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (numTaps - 1U); + + /* Total number of output samples to be computed */ + + blkCnt = outBlockSize / 2; + blkCntN2 = outBlockSize - (2 * blkCnt); + + while (blkCnt > 0U) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = 2 * S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while (--i); + + /* Set accumulator to zero */ + acc0 = 0; + acc1 = 0; + + /* Initialize state pointer */ + px0 = pState; + px1 = pState + S->M; + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while (tapCnt > 0U) + { + /* Read the b[numTaps-1] coefficient */ + c0 = *(pb); + + /* Read x[n-numTaps-1] for sample 0 sample 1 */ + x0 = *(px0); + x1 = *(px1); + + /* Perform the multiply-accumulate */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); + + /* Read the b[numTaps-2] coefficient */ + c0 = *(pb + 1U); + + /* Read x[n-numTaps-2] for sample 0 sample 1 */ + x0 = *(px0 + 1U); + x1 = *(px1 + 1U); + + /* Perform the multiply-accumulate */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); + + /* Read the b[numTaps-3] coefficient */ + c0 = *(pb + 2U); + + /* Read x[n-numTaps-3] for sample 0 sample 1 */ + x0 = *(px0 + 2U); + x1 = *(px1 + 2U); + pb += 4U; + + /* Perform the multiply-accumulate */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb - 1U); + + /* Read x[n-numTaps-4] for sample 0 sample 1 */ + x0 = *(px0 + 3U); + x1 = *(px1 + 3U); + + + /* Perform the multiply-accumulate */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); + + /* update state pointers */ + px0 += 4U; + px1 += 4U; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4U; + + while (tapCnt > 0U) + { + /* Read coefficients */ + c0 = *(pb++); + + /* Fetch 1 state variable */ + x0 = *(px0++); + x1 = *(px1++); + + /* Perform the multiply-accumulate */ + acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); + acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M * 2; + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst++ = (q31_t) (acc0 << 1); + *pDst++ = (q31_t) (acc1 << 1); + + /* Decrement the loop counter */ + blkCnt--; + } + + while (blkCntN2 > 0U) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while (--i); + + /* Set accumulator to zero */ + sum0 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while (tapCnt > 0U) + { + /* Read the b[numTaps-1] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-1] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32); + + /* Read the b[numTaps-2] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-2] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32); + + /* Read the b[numTaps-3] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-3] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32); + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-4] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4U; + + while (tapCnt > 0U) + { + /* Read coefficients */ + c0 = *(pb++); + + /* Fetch 1 state variable */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M; + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst++ = (q31_t) (sum0 << 1); + + /* Decrement the loop counter */ + blkCntN2--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + i = (numTaps - 1U) >> 2U; + + /* copy data */ + while (i > 0U) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } + + i = (numTaps - 1U) % 0x04U; + + /* copy data */ + while (i > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } +} + +/** + * @} end of FIR_decimate group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_f32.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_f32.c new file mode 100644 index 0000000..45797dc --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_f32.c @@ -0,0 +1,105 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_decimate_init_f32.c + * Description: Floating-point FIR Decimator initialization function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_decimate + * @{ + */ + +/** + * @brief Initialization function for the floating-point FIR decimator. + * @param[in,out] *S points to an instance of the floating-point FIR decimator structure. + * @param[in] numTaps number of coefficients in the filter. + * @param[in] M decimation factor. + * @param[in] *pCoeffs points to the filter coefficients. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize number of input samples to process per call. + * @return The function returns ARM_MATH_SUCCESS if initialization was successful or ARM_MATH_LENGTH_ERROR if + * blockSize is not a multiple of M. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+ * 
+ * \par + * pState points to the array of state variables. + * pState is of length numTaps+blockSize-1 words where blockSize is the number of input samples passed to arm_fir_decimate_f32(). + * M is the decimation factor. + */ + +arm_status arm_fir_decimate_init_f32( + arm_fir_decimate_instance_f32 * S, + uint16_t numTaps, + uint8_t M, + float32_t * pCoeffs, + float32_t * pState, + uint32_t blockSize) +{ + arm_status status; + + /* The size of the input block must be a multiple of the decimation factor */ + if ((blockSize % M) != 0U) + { + /* Set status as ARM_MATH_LENGTH_ERROR */ + status = ARM_MATH_LENGTH_ERROR; + } + else + { + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always (blockSize + numTaps - 1) */ + memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float32_t)); + + /* Assign state pointer */ + S->pState = pState; + + /* Assign Decimation Factor */ + S->M = M; + + status = ARM_MATH_SUCCESS; + } + + return (status); + +} + +/** + * @} end of FIR_decimate group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q15.c new file mode 100644 index 0000000..7314711 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q15.c @@ -0,0 +1,107 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_decimate_init_q15.c + * Description: Initialization function for the Q15 FIR Decimator + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_decimate + * @{ + */ + +/** + * @brief Initialization function for the Q15 FIR decimator. + * @param[in,out] *S points to an instance of the Q15 FIR decimator structure. + * @param[in] numTaps number of coefficients in the filter. + * @param[in] M decimation factor. + * @param[in] *pCoeffs points to the filter coefficients. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize number of input samples to process per call. + * @return The function returns ARM_MATH_SUCCESS if initialization was successful or ARM_MATH_LENGTH_ERROR if + * blockSize is not a multiple of M. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+ * 
+ * \par + * pState points to the array of state variables. + * pState is of length numTaps+blockSize-1 words where blockSize is the number of input samples + * to the call arm_fir_decimate_q15(). + * M is the decimation factor. + */ + +arm_status arm_fir_decimate_init_q15( + arm_fir_decimate_instance_q15 * S, + uint16_t numTaps, + uint8_t M, + q15_t * pCoeffs, + q15_t * pState, + uint32_t blockSize) +{ + + arm_status status; + + /* The size of the input block must be a multiple of the decimation factor */ + if ((blockSize % M) != 0U) + { + /* Set status as ARM_MATH_LENGTH_ERROR */ + status = ARM_MATH_LENGTH_ERROR; + } + else + { + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear the state buffer. The size of buffer is always (blockSize + numTaps - 1) */ + memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(q15_t)); + + /* Assign state pointer */ + S->pState = pState; + + /* Assign Decimation factor */ + S->M = M; + + status = ARM_MATH_SUCCESS; + } + + return (status); + +} + +/** + * @} end of FIR_decimate group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q31.c new file mode 100644 index 0000000..f6f3fb2 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q31.c @@ -0,0 +1,105 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_decimate_init_q31.c + * Description: Initialization function for Q31 FIR Decimation filter + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_decimate + * @{ + */ + +/** + * @brief Initialization function for the Q31 FIR decimator. + * @param[in,out] *S points to an instance of the Q31 FIR decimator structure. + * @param[in] numTaps number of coefficients in the filter. + * @param[in] M decimation factor. + * @param[in] *pCoeffs points to the filter coefficients. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize number of input samples to process per call. + * @return The function returns ARM_MATH_SUCCESS if initialization was successful or ARM_MATH_LENGTH_ERROR if + * blockSize is not a multiple of M. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+ * 
+ * \par + * pState points to the array of state variables. + * pState is of length numTaps+blockSize-1 words where blockSize is the number of input samples passed to arm_fir_decimate_q31(). + * M is the decimation factor. + */ + +arm_status arm_fir_decimate_init_q31( + arm_fir_decimate_instance_q31 * S, + uint16_t numTaps, + uint8_t M, + q31_t * pCoeffs, + q31_t * pState, + uint32_t blockSize) +{ + arm_status status; + + /* The size of the input block must be a multiple of the decimation factor */ + if ((blockSize % M) != 0U) + { + /* Set status as ARM_MATH_LENGTH_ERROR */ + status = ARM_MATH_LENGTH_ERROR; + } + else + { + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear the state buffer. The size is always (blockSize + numTaps - 1) */ + memset(pState, 0, (numTaps + (blockSize - 1)) * sizeof(q31_t)); + + /* Assign state pointer */ + S->pState = pState; + + /* Assign Decimation factor */ + S->M = M; + + status = ARM_MATH_SUCCESS; + } + + return (status); + +} + +/** + * @} end of FIR_decimate group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q15.c new file mode 100644 index 0000000..56f12fb --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q15.c @@ -0,0 +1,684 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_decimate_q15.c + * Description: Q15 FIR Decimator + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_decimate + * @{ + */ + +/** + * @brief Processing function for the Q15 FIR decimator. + * @param[in] *S points to an instance of the Q15 FIR decimator structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the location where the output result is written. + * @param[in] blockSize number of input samples to process per call. + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * The function is implemented using a 64-bit internal accumulator. + * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result. + * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. + * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved. + * After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits. + * Lastly, the accumulator is saturated to yield a result in 1.15 format. + * + * \par + * Refer to the function arm_fir_decimate_fast_q15() for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4. + */ + +#if defined (ARM_MATH_DSP) + +#ifndef UNALIGNED_SUPPORT_DISABLE + +void arm_fir_decimate_q15( + const arm_fir_decimate_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + q15_t *px; /* Temporary pointer for state buffer */ + q15_t *pb; /* Temporary pointer coefficient buffer */ + q31_t x0, x1, c0, c1; /* Temporary variables to hold state and coefficient values */ + q63_t sum0; /* Accumulators */ + q63_t acc0, acc1; + q15_t *px0, *px1; + uint32_t blkCntN3; + uint32_t numTaps = S->numTaps; /* Number of taps */ + uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */ + + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (numTaps - 1U); + + + /* Total number of output samples to be computed */ + blkCnt = outBlockSize / 2; + blkCntN3 = outBlockSize - (2 * blkCnt); + + + while (blkCnt > 0U) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = 2 * S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while (--i); + + /* Set accumulator to zero */ + acc0 = 0; + acc1 = 0; + + /* Initialize state pointer */ + px0 = pState; + + px1 = pState + S->M; + + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while (tapCnt > 0U) + { + /* Read the Read b[numTaps-1] and b[numTaps-2] coefficients */ + c0 = *__SIMD32(pb)++; + + /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */ + x0 = *__SIMD32(px0)++; + + x1 = *__SIMD32(px1)++; + + /* Perform the multiply-accumulate */ + acc0 = __SMLALD(x0, c0, acc0); + + acc1 = __SMLALD(x1, c0, acc1); + + /* Read the b[numTaps-3] and b[numTaps-4] coefficient */ + c0 = *__SIMD32(pb)++; + + /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */ + x0 = *__SIMD32(px0)++; + + x1 = *__SIMD32(px1)++; + + /* Perform the multiply-accumulate */ + acc0 = __SMLALD(x0, c0, acc0); + + acc1 = __SMLALD(x1, c0, acc1); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4U; + + while (tapCnt > 0U) + { + /* Read coefficients */ + c0 = *pb++; + + /* Fetch 1 state variable */ + x0 = *px0++; + + x1 = *px1++; + + /* Perform the multiply-accumulate */ + acc0 = __SMLALD(x0, c0, acc0); + acc1 = __SMLALD(x1, c0, acc1); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M * 2; + + /* Store filter output, smlad returns the values in 2.14 format */ + /* so downsacle by 15 to get output in 1.15 */ + *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16)); + + /* Decrement the loop counter */ + blkCnt--; + } + + + + while (blkCntN3 > 0U) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while (--i); + + /*Set sum to zero */ + sum0 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while (tapCnt > 0U) + { + /* Read the Read b[numTaps-1] and b[numTaps-2] coefficients */ + c0 = *__SIMD32(pb)++; + + /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */ + x0 = *__SIMD32(px)++; + + /* Read the b[numTaps-3] and b[numTaps-4] coefficient */ + c1 = *__SIMD32(pb)++; + + /* Perform the multiply-accumulate */ + sum0 = __SMLALD(x0, c0, sum0); + + /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */ + x0 = *__SIMD32(px)++; + + /* Perform the multiply-accumulate */ + sum0 = __SMLALD(x0, c1, sum0); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4U; + + while (tapCnt > 0U) + { + /* Read coefficients */ + c0 = *pb++; + + /* Fetch 1 state variable */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 = __SMLALD(x0, c0, sum0); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M; + + /* Store filter output, smlad returns the values in 2.14 format */ + /* so downsacle by 15 to get output in 1.15 */ + *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16)); + + /* Decrement the loop counter */ + blkCntN3--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + i = (numTaps - 1U) >> 2U; + + /* copy data */ + while (i > 0U) + { + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + + /* Decrement the loop counter */ + i--; + } + + i = (numTaps - 1U) % 0x04U; + + /* copy data */ + while (i > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } +} + +#else + + +void arm_fir_decimate_q15( + const arm_fir_decimate_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + q15_t *px; /* Temporary pointer for state buffer */ + q15_t *pb; /* Temporary pointer coefficient buffer */ + q15_t x0, x1, c0; /* Temporary variables to hold state and coefficient values */ + q63_t sum0; /* Accumulators */ + q63_t acc0, acc1; + q15_t *px0, *px1; + uint32_t blkCntN3; + uint32_t numTaps = S->numTaps; /* Number of taps */ + uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */ + + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (numTaps - 1U); + + + /* Total number of output samples to be computed */ + blkCnt = outBlockSize / 2; + blkCntN3 = outBlockSize - (2 * blkCnt); + + while (blkCnt > 0U) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = 2 * S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while (--i); + + /* Set accumulator to zero */ + acc0 = 0; + acc1 = 0; + + /* Initialize state pointer */ + px0 = pState; + + px1 = pState + S->M; + + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while (tapCnt > 0U) + { + /* Read the Read b[numTaps-1] coefficients */ + c0 = *pb++; + + /* Read x[n-numTaps-1] for sample 0 and for sample 1 */ + x0 = *px0++; + x1 = *px1++; + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + + /* Read the b[numTaps-2] coefficient */ + c0 = *pb++; + + /* Read x[n-numTaps-2] for sample 0 and sample 1 */ + x0 = *px0++; + x1 = *px1++; + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + + /* Read the b[numTaps-3] coefficients */ + c0 = *pb++; + + /* Read x[n-numTaps-3] for sample 0 and sample 1 */ + x0 = *px0++; + x1 = *px1++; + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *pb++; + + /* Read x[n-numTaps-4] for sample 0 and sample 1 */ + x0 = *px0++; + x1 = *px1++; + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4U; + + while (tapCnt > 0U) + { + /* Read coefficients */ + c0 = *pb++; + + /* Fetch 1 state variable */ + x0 = *px0++; + x1 = *px1++; + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M * 2; + + /* Store filter output, smlad returns the values in 2.14 format */ + /* so downsacle by 15 to get output in 1.15 */ + + *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16)); + + /* Decrement the loop counter */ + blkCnt--; + } + + while (blkCntN3 > 0U) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while (--i); + + /*Set sum to zero */ + sum0 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while (tapCnt > 0U) + { + /* Read the Read b[numTaps-1] coefficients */ + c0 = *pb++; + + /* Read x[n-numTaps-1] and sample */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the b[numTaps-2] coefficient */ + c0 = *pb++; + + /* Read x[n-numTaps-2] and sample */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the b[numTaps-3] coefficients */ + c0 = *pb++; + + /* Read x[n-numTaps-3] sample */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *pb++; + + /* Read x[n-numTaps-4] sample */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4U; + + while (tapCnt > 0U) + { + /* Read coefficients */ + c0 = *pb++; + + /* Fetch 1 state variable */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M; + + /* Store filter output, smlad returns the values in 2.14 format */ + /* so downsacle by 15 to get output in 1.15 */ + *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16)); + + /* Decrement the loop counter */ + blkCntN3--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + i = (numTaps - 1U) >> 2U; + + /* copy data */ + while (i > 0U) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } + + i = (numTaps - 1U) % 0x04U; + + /* copy data */ + while (i > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } +} + + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + +#else + + +void arm_fir_decimate_q15( + const arm_fir_decimate_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + q15_t *px; /* Temporary pointer for state buffer */ + q15_t *pb; /* Temporary pointer coefficient buffer */ + q31_t x0, c0; /* Temporary variables to hold state and coefficient values */ + q63_t sum0; /* Accumulators */ + uint32_t numTaps = S->numTaps; /* Number of taps */ + uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */ + + + +/* Run the below code for Cortex-M0 */ + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (numTaps - 1U); + + /* Total number of output samples to be computed */ + blkCnt = outBlockSize; + + while (blkCnt > 0U) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while (--i); + + /*Set sum to zero */ + sum0 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = pCoeffs; + + tapCnt = numTaps; + + while (tapCnt > 0U) + { + /* Read coefficients */ + c0 = *pb++; + + /* Fetch 1 state variable */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 += (q31_t) x0 *c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M; + + /*Store filter output , smlad will return the values in 2.14 format */ + /* so downsacle by 15 to get output in 1.15 */ + *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16)); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the start of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + i = numTaps - 1U; + + /* copy data */ + while (i > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } + + +} +#endif /* #if defined (ARM_MATH_DSP) */ + + +/** + * @} end of FIR_decimate group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q31.c new file mode 100644 index 0000000..6a13cb5 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q31.c @@ -0,0 +1,299 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_decimate_q31.c + * Description: Q31 FIR Decimator + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_decimate + * @{ + */ + +/** + * @brief Processing function for the Q31 FIR decimator. + * @param[in] *S points to an instance of the Q31 FIR decimator structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data + * @param[in] blockSize number of input samples to process per call. + * @return none + * + * Scaling and Overflow Behavior: + * \par + * The function is implemented using an internal 64-bit accumulator. + * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * Thus, if the accumulator result overflows it wraps around rather than clip. + * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (where log2 is read as log to the base 2). + * After all multiply-accumulates are performed, the 2.62 accumulator is truncated to 1.32 format and then saturated to 1.31 format. + * + * \par + * Refer to the function arm_fir_decimate_fast_q31() for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4. + */ + +void arm_fir_decimate_q31( + const arm_fir_decimate_instance_q31 * S, + q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + q31_t *pState = S->pState; /* State pointer */ + q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q31_t *pStateCurnt; /* Points to the current sample of the state */ + q31_t x0, c0; /* Temporary variables to hold state and coefficient values */ + q31_t *px; /* Temporary pointers for state buffer */ + q31_t *pb; /* Temporary pointers for coefficient buffer */ + q63_t sum0; /* Accumulator */ + uint32_t numTaps = S->numTaps; /* Number of taps */ + uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */ + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (numTaps - 1U); + + /* Total number of output samples to be computed */ + blkCnt = outBlockSize; + + while (blkCnt > 0U) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while (--i); + + /* Set accumulator to zero */ + sum0 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while (tapCnt > 0U) + { + /* Read the b[numTaps-1] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-1] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Read the b[numTaps-2] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-2] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Read the b[numTaps-3] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-3] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-4] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4U; + + while (tapCnt > 0U) + { + /* Read coefficients */ + c0 = *(pb++); + + /* Fetch 1 state variable */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M; + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst++ = (q31_t) (sum0 >> 31); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + i = (numTaps - 1U) >> 2U; + + /* copy data */ + while (i > 0U) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } + + i = (numTaps - 1U) % 0x04U; + + /* copy data */ + while (i > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } + +#else + +/* Run the below code for Cortex-M0 */ + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (numTaps - 1U); + + /* Total number of output samples to be computed */ + blkCnt = outBlockSize; + + while (blkCnt > 0U) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } while (--i); + + /* Set accumulator to zero */ + sum0 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = pCoeffs; + + tapCnt = numTaps; + + while (tapCnt > 0U) + { + /* Read coefficients */ + c0 = *pb++; + + /* Fetch 1 state variable */ + x0 = *px++; + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M; + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst++ = (q31_t) (sum0 >> 31); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the start of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + i = numTaps - 1U; + + /* copy data */ + while (i > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + +/** + * @} end of FIR_decimate group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_f32.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_f32.c new file mode 100644 index 0000000..812f9df --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_f32.c @@ -0,0 +1,985 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_f32.c + * Description: Floating-point FIR filter processing function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** +* @ingroup groupFilters +*/ + +/** +* @defgroup FIR Finite Impulse Response (FIR) Filters +* +* This set of functions implements Finite Impulse Response (FIR) filters +* for Q7, Q15, Q31, and floating-point data types. Fast versions of Q15 and Q31 are also provided. +* The functions operate on blocks of input and output data and each call to the function processes +* blockSize samples through the filter. pSrc and +* pDst points to input and output arrays containing blockSize values. +* +* \par Algorithm: +* The FIR filter algorithm is based upon a sequence of multiply-accumulate (MAC) operations. +* Each filter coefficient b[n] is multiplied by a state variable which equals a previous input sample x[n]. +*
+*    y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
+* 
+* \par +* \image html FIR.gif "Finite Impulse Response filter" +* \par +* pCoeffs points to a coefficient array of size numTaps. +* Coefficients are stored in time reversed order. +* \par +*
+*    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+* 
+* \par +* pState points to a state array of size numTaps + blockSize - 1. +* Samples in the state buffer are stored in the following order. +* \par +*
+*    {x[n-numTaps+1], x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2]....x[0], x[1], ..., x[blockSize-1]}
+* 
+* \par +* Note that the length of the state buffer exceeds the length of the coefficient array by blockSize-1. +* The increased state buffer length allows circular addressing, which is traditionally used in the FIR filters, +* to be avoided and yields a significant speed improvement. +* The state variables are updated after each block of data is processed; the coefficients are untouched. +* \par Instance Structure +* The coefficients and state variables for a filter are stored together in an instance data structure. +* A separate instance structure must be defined for each filter. +* Coefficient arrays may be shared among several instances while state variable arrays cannot be shared. +* There are separate instance structure declarations for each of the 4 supported data types. +* +* \par Initialization Functions +* There is also an associated initialization function for each data type. +* The initialization function performs the following operations: +* - Sets the values of the internal structure fields. +* - Zeros out the values in the state buffer. +* To do this manually without calling the init function, assign the follow subfields of the instance structure: +* numTaps, pCoeffs, pState. Also set all of the values in pState to zero. +* +* \par +* Use of the initialization function is optional. +* However, if the initialization function is used, then the instance structure cannot be placed into a const data section. +* To place an instance structure into a const data section, the instance structure must be manually initialized. +* Set the values in the state buffer to zeros before static initialization. +* The code below statically initializes each of the 4 different data type filter instance structures +*
+*arm_fir_instance_f32 S = {numTaps, pState, pCoeffs};
+*arm_fir_instance_q31 S = {numTaps, pState, pCoeffs};
+*arm_fir_instance_q15 S = {numTaps, pState, pCoeffs};
+*arm_fir_instance_q7 S =  {numTaps, pState, pCoeffs};
+* 
+* +* where numTaps is the number of filter coefficients in the filter; pState is the address of the state buffer; +* pCoeffs is the address of the coefficient buffer. +* +* \par Fixed-Point Behavior +* Care must be taken when using the fixed-point versions of the FIR filter functions. +* In particular, the overflow and saturation behavior of the accumulator used in each function must be considered. +* Refer to the function specific documentation below for usage guidelines. +*/ + +/** +* @addtogroup FIR +* @{ +*/ + +/** +* +* @param[in] *S points to an instance of the floating-point FIR filter structure. +* @param[in] *pSrc points to the block of input data. +* @param[out] *pDst points to the block of output data. +* @param[in] blockSize number of samples to process per call. +* @return none. +* +*/ + +#if defined(ARM_MATH_CM7) + +void arm_fir_f32( +const arm_fir_instance_f32 * S, +float32_t * pSrc, +float32_t * pDst, +uint32_t blockSize) +{ + float32_t *pState = S->pState; /* State pointer */ + float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float32_t *pStateCurnt; /* Points to the current sample of the state */ + float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */ + float32_t acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; /* Accumulators */ + float32_t x0, x1, x2, x3, x4, x5, x6, x7, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t i, tapCnt, blkCnt; /* Loop counters */ + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1U)]); + + /* Apply loop unrolling and compute 8 output values simultaneously. + * The variables acc0 ... acc7 hold output values that are being computed: + * + * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] + * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] + * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] + * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] + */ + blkCnt = blockSize >> 3; + + /* First part of the processing with loop unrolling. Compute 8 outputs at a time. + ** a second loop below computes the remaining 1 to 7 samples. */ + while (blkCnt > 0U) + { + /* Copy four new input samples into the state buffer */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + /* Set all accumulators to zero */ + acc0 = 0.0f; + acc1 = 0.0f; + acc2 = 0.0f; + acc3 = 0.0f; + acc4 = 0.0f; + acc5 = 0.0f; + acc6 = 0.0f; + acc7 = 0.0f; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = (pCoeffs); + + /* This is separated from the others to avoid + * a call to __aeabi_memmove which would be slower + */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + /* Read the first seven samples from the state buffer: x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */ + x0 = *px++; + x1 = *px++; + x2 = *px++; + x3 = *px++; + x4 = *px++; + x5 = *px++; + x6 = *px++; + + /* Loop unrolling. Process 8 taps at a time. */ + tapCnt = numTaps >> 3U; + + /* Loop over the number of taps. Unroll by a factor of 8. + ** Repeat until we've computed numTaps-8 coefficients. */ + while (tapCnt > 0U) + { + /* Read the b[numTaps-1] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-3] sample */ + x7 = *(px++); + + /* acc0 += b[numTaps-1] * x[n-numTaps] */ + acc0 += x0 * c0; + + /* acc1 += b[numTaps-1] * x[n-numTaps-1] */ + acc1 += x1 * c0; + + /* acc2 += b[numTaps-1] * x[n-numTaps-2] */ + acc2 += x2 * c0; + + /* acc3 += b[numTaps-1] * x[n-numTaps-3] */ + acc3 += x3 * c0; + + /* acc4 += b[numTaps-1] * x[n-numTaps-4] */ + acc4 += x4 * c0; + + /* acc1 += b[numTaps-1] * x[n-numTaps-5] */ + acc5 += x5 * c0; + + /* acc2 += b[numTaps-1] * x[n-numTaps-6] */ + acc6 += x6 * c0; + + /* acc3 += b[numTaps-1] * x[n-numTaps-7] */ + acc7 += x7 * c0; + + /* Read the b[numTaps-2] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-4] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + acc0 += x1 * c0; + acc1 += x2 * c0; + acc2 += x3 * c0; + acc3 += x4 * c0; + acc4 += x5 * c0; + acc5 += x6 * c0; + acc6 += x7 * c0; + acc7 += x0 * c0; + + /* Read the b[numTaps-3] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-5] sample */ + x1 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += x2 * c0; + acc1 += x3 * c0; + acc2 += x4 * c0; + acc3 += x5 * c0; + acc4 += x6 * c0; + acc5 += x7 * c0; + acc6 += x0 * c0; + acc7 += x1 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += x3 * c0; + acc1 += x4 * c0; + acc2 += x5 * c0; + acc3 += x6 * c0; + acc4 += x7 * c0; + acc5 += x0 * c0; + acc6 += x1 * c0; + acc7 += x2 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x3 = *(px++); + /* Perform the multiply-accumulates */ + acc0 += x4 * c0; + acc1 += x5 * c0; + acc2 += x6 * c0; + acc3 += x7 * c0; + acc4 += x0 * c0; + acc5 += x1 * c0; + acc6 += x2 * c0; + acc7 += x3 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x4 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += x5 * c0; + acc1 += x6 * c0; + acc2 += x7 * c0; + acc3 += x0 * c0; + acc4 += x1 * c0; + acc5 += x2 * c0; + acc6 += x3 * c0; + acc7 += x4 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x5 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += x6 * c0; + acc1 += x7 * c0; + acc2 += x0 * c0; + acc3 += x1 * c0; + acc4 += x2 * c0; + acc5 += x3 * c0; + acc6 += x4 * c0; + acc7 += x5 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x6 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += x7 * c0; + acc1 += x0 * c0; + acc2 += x1 * c0; + acc3 += x2 * c0; + acc4 += x3 * c0; + acc5 += x4 * c0; + acc6 += x5 * c0; + acc7 += x6 * c0; + + tapCnt--; + } + + /* If the filter length is not a multiple of 8, compute the remaining filter taps */ + tapCnt = numTaps % 0x8U; + + while (tapCnt > 0U) + { + /* Read coefficients */ + c0 = *(pb++); + + /* Fetch 1 state variable */ + x7 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += x0 * c0; + acc1 += x1 * c0; + acc2 += x2 * c0; + acc3 += x3 * c0; + acc4 += x4 * c0; + acc5 += x5 * c0; + acc6 += x6 * c0; + acc7 += x7 * c0; + + /* Reuse the present sample states for next sample */ + x0 = x1; + x1 = x2; + x2 = x3; + x3 = x4; + x4 = x5; + x5 = x6; + x6 = x7; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by 8 to process the next group of 8 samples */ + pState = pState + 8; + + /* The results in the 8 accumulators, store in the destination buffer. */ + *pDst++ = acc0; + *pDst++ = acc1; + *pDst++ = acc2; + *pDst++ = acc3; + *pDst++ = acc4; + *pDst++ = acc5; + *pDst++ = acc6; + *pDst++ = acc7; + + blkCnt--; + } + + /* If the blockSize is not a multiple of 8, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x8U; + + while (blkCnt > 0U) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc0 = 0.0f; + + /* Initialize state pointer */ + px = pState; + + /* Initialize Coefficient pointer */ + pb = (pCoeffs); + + i = numTaps; + + /* Perform the multiply-accumulates */ + do + { + acc0 += *px++ * *pb++; + i--; + + } while (i > 0U); + + /* The result is store in the destination buffer. */ + *pDst++ = acc0; + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the start of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + tapCnt = (numTaps - 1U) >> 2U; + + /* copy data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calculate remaining number of copies */ + tapCnt = (numTaps - 1U) % 0x4U; + + /* Copy the remaining q31_t data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } +} + +#elif defined(ARM_MATH_CM0_FAMILY) + +void arm_fir_f32( +const arm_fir_instance_f32 * S, +float32_t * pSrc, +float32_t * pDst, +uint32_t blockSize) +{ + float32_t *pState = S->pState; /* State pointer */ + float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float32_t *pStateCurnt; /* Points to the current sample of the state */ + float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t i, tapCnt, blkCnt; /* Loop counters */ + + /* Run the below code for Cortex-M0 */ + + float32_t acc; + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1U)]); + + /* Initialize blkCnt with blockSize */ + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc = 0.0f; + + /* Initialize state pointer */ + px = pState; + + /* Initialize Coefficient pointer */ + pb = pCoeffs; + + i = numTaps; + + /* Perform the multiply-accumulates */ + do + { + /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ + acc += *px++ * *pb++; + i--; + + } while (i > 0U); + + /* The result is store in the destination buffer. */ + *pDst++ = acc; + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the starting of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + /* Copy numTaps number of values */ + tapCnt = numTaps - 1U; + + /* Copy data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +} + +#else + +/* Run the below code for Cortex-M4 and Cortex-M3 */ + +void arm_fir_f32( +const arm_fir_instance_f32 * S, +float32_t * pSrc, +float32_t * pDst, +uint32_t blockSize) +{ + float32_t *pState = S->pState; /* State pointer */ + float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float32_t *pStateCurnt; /* Points to the current sample of the state */ + float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */ + float32_t acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; /* Accumulators */ + float32_t x0, x1, x2, x3, x4, x5, x6, x7, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t i, tapCnt, blkCnt; /* Loop counters */ + float32_t p0,p1,p2,p3,p4,p5,p6,p7; /* Temporary product values */ + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1U)]); + + /* Apply loop unrolling and compute 8 output values simultaneously. + * The variables acc0 ... acc7 hold output values that are being computed: + * + * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] + * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] + * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] + * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] + */ + blkCnt = blockSize >> 3; + + /* First part of the processing with loop unrolling. Compute 8 outputs at a time. + ** a second loop below computes the remaining 1 to 7 samples. */ + while (blkCnt > 0U) + { + /* Copy four new input samples into the state buffer */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + /* Set all accumulators to zero */ + acc0 = 0.0f; + acc1 = 0.0f; + acc2 = 0.0f; + acc3 = 0.0f; + acc4 = 0.0f; + acc5 = 0.0f; + acc6 = 0.0f; + acc7 = 0.0f; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = (pCoeffs); + + /* This is separated from the others to avoid + * a call to __aeabi_memmove which would be slower + */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + /* Read the first seven samples from the state buffer: x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */ + x0 = *px++; + x1 = *px++; + x2 = *px++; + x3 = *px++; + x4 = *px++; + x5 = *px++; + x6 = *px++; + + /* Loop unrolling. Process 8 taps at a time. */ + tapCnt = numTaps >> 3U; + + /* Loop over the number of taps. Unroll by a factor of 8. + ** Repeat until we've computed numTaps-8 coefficients. */ + while (tapCnt > 0U) + { + /* Read the b[numTaps-1] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-3] sample */ + x7 = *(px++); + + /* acc0 += b[numTaps-1] * x[n-numTaps] */ + p0 = x0 * c0; + + /* acc1 += b[numTaps-1] * x[n-numTaps-1] */ + p1 = x1 * c0; + + /* acc2 += b[numTaps-1] * x[n-numTaps-2] */ + p2 = x2 * c0; + + /* acc3 += b[numTaps-1] * x[n-numTaps-3] */ + p3 = x3 * c0; + + /* acc4 += b[numTaps-1] * x[n-numTaps-4] */ + p4 = x4 * c0; + + /* acc1 += b[numTaps-1] * x[n-numTaps-5] */ + p5 = x5 * c0; + + /* acc2 += b[numTaps-1] * x[n-numTaps-6] */ + p6 = x6 * c0; + + /* acc3 += b[numTaps-1] * x[n-numTaps-7] */ + p7 = x7 * c0; + + /* Read the b[numTaps-2] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-4] sample */ + x0 = *(px++); + + acc0 += p0; + acc1 += p1; + acc2 += p2; + acc3 += p3; + acc4 += p4; + acc5 += p5; + acc6 += p6; + acc7 += p7; + + + /* Perform the multiply-accumulate */ + p0 = x1 * c0; + p1 = x2 * c0; + p2 = x3 * c0; + p3 = x4 * c0; + p4 = x5 * c0; + p5 = x6 * c0; + p6 = x7 * c0; + p7 = x0 * c0; + + /* Read the b[numTaps-3] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-5] sample */ + x1 = *(px++); + + acc0 += p0; + acc1 += p1; + acc2 += p2; + acc3 += p3; + acc4 += p4; + acc5 += p5; + acc6 += p6; + acc7 += p7; + + /* Perform the multiply-accumulates */ + p0 = x2 * c0; + p1 = x3 * c0; + p2 = x4 * c0; + p3 = x5 * c0; + p4 = x6 * c0; + p5 = x7 * c0; + p6 = x0 * c0; + p7 = x1 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x2 = *(px++); + + acc0 += p0; + acc1 += p1; + acc2 += p2; + acc3 += p3; + acc4 += p4; + acc5 += p5; + acc6 += p6; + acc7 += p7; + + /* Perform the multiply-accumulates */ + p0 = x3 * c0; + p1 = x4 * c0; + p2 = x5 * c0; + p3 = x6 * c0; + p4 = x7 * c0; + p5 = x0 * c0; + p6 = x1 * c0; + p7 = x2 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x3 = *(px++); + + acc0 += p0; + acc1 += p1; + acc2 += p2; + acc3 += p3; + acc4 += p4; + acc5 += p5; + acc6 += p6; + acc7 += p7; + + /* Perform the multiply-accumulates */ + p0 = x4 * c0; + p1 = x5 * c0; + p2 = x6 * c0; + p3 = x7 * c0; + p4 = x0 * c0; + p5 = x1 * c0; + p6 = x2 * c0; + p7 = x3 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x4 = *(px++); + + acc0 += p0; + acc1 += p1; + acc2 += p2; + acc3 += p3; + acc4 += p4; + acc5 += p5; + acc6 += p6; + acc7 += p7; + + /* Perform the multiply-accumulates */ + p0 = x5 * c0; + p1 = x6 * c0; + p2 = x7 * c0; + p3 = x0 * c0; + p4 = x1 * c0; + p5 = x2 * c0; + p6 = x3 * c0; + p7 = x4 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x5 = *(px++); + + acc0 += p0; + acc1 += p1; + acc2 += p2; + acc3 += p3; + acc4 += p4; + acc5 += p5; + acc6 += p6; + acc7 += p7; + + /* Perform the multiply-accumulates */ + p0 = x6 * c0; + p1 = x7 * c0; + p2 = x0 * c0; + p3 = x1 * c0; + p4 = x2 * c0; + p5 = x3 * c0; + p6 = x4 * c0; + p7 = x5 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x6 = *(px++); + + acc0 += p0; + acc1 += p1; + acc2 += p2; + acc3 += p3; + acc4 += p4; + acc5 += p5; + acc6 += p6; + acc7 += p7; + + /* Perform the multiply-accumulates */ + p0 = x7 * c0; + p1 = x0 * c0; + p2 = x1 * c0; + p3 = x2 * c0; + p4 = x3 * c0; + p5 = x4 * c0; + p6 = x5 * c0; + p7 = x6 * c0; + + tapCnt--; + + acc0 += p0; + acc1 += p1; + acc2 += p2; + acc3 += p3; + acc4 += p4; + acc5 += p5; + acc6 += p6; + acc7 += p7; + } + + /* If the filter length is not a multiple of 8, compute the remaining filter taps */ + tapCnt = numTaps % 0x8U; + + while (tapCnt > 0U) + { + /* Read coefficients */ + c0 = *(pb++); + + /* Fetch 1 state variable */ + x7 = *(px++); + + /* Perform the multiply-accumulates */ + p0 = x0 * c0; + p1 = x1 * c0; + p2 = x2 * c0; + p3 = x3 * c0; + p4 = x4 * c0; + p5 = x5 * c0; + p6 = x6 * c0; + p7 = x7 * c0; + + /* Reuse the present sample states for next sample */ + x0 = x1; + x1 = x2; + x2 = x3; + x3 = x4; + x4 = x5; + x5 = x6; + x6 = x7; + + acc0 += p0; + acc1 += p1; + acc2 += p2; + acc3 += p3; + acc4 += p4; + acc5 += p5; + acc6 += p6; + acc7 += p7; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by 8 to process the next group of 8 samples */ + pState = pState + 8; + + /* The results in the 8 accumulators, store in the destination buffer. */ + *pDst++ = acc0; + *pDst++ = acc1; + *pDst++ = acc2; + *pDst++ = acc3; + *pDst++ = acc4; + *pDst++ = acc5; + *pDst++ = acc6; + *pDst++ = acc7; + + blkCnt--; + } + + /* If the blockSize is not a multiple of 8, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x8U; + + while (blkCnt > 0U) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc0 = 0.0f; + + /* Initialize state pointer */ + px = pState; + + /* Initialize Coefficient pointer */ + pb = (pCoeffs); + + i = numTaps; + + /* Perform the multiply-accumulates */ + do + { + acc0 += *px++ * *pb++; + i--; + + } while (i > 0U); + + /* The result is store in the destination buffer. */ + *pDst++ = acc0; + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the start of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + tapCnt = (numTaps - 1U) >> 2U; + + /* copy data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calculate remaining number of copies */ + tapCnt = (numTaps - 1U) % 0x4U; + + /* Copy the remaining q31_t data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } +} + +#endif + +/** +* @} end of FIR group +*/ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_fast_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_fast_q15.c new file mode 100644 index 0000000..35e431b --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_fast_q15.c @@ -0,0 +1,333 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_fast_q15.c + * Description: Q15 Fast FIR filter processing function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR + * @{ + */ + +/** + * @param[in] *S points to an instance of the Q15 FIR filter structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process per call. + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * This fast version uses a 32-bit accumulator with 2.30 format. + * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * Thus, if the accumulator result overflows it wraps around and distorts the result. + * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits. + * The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result. + * + * \par + * Refer to the function arm_fir_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. Both the slow and the fast versions use the same instance structure. + * Use the function arm_fir_init_q15() to initialize the filter structure. + */ + +void arm_fir_fast_q15( + const arm_fir_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + q31_t acc0, acc1, acc2, acc3; /* Accumulators */ + q15_t *pb; /* Temporary pointer for coefficient buffer */ + q15_t *px; /* Temporary q31 pointer for SIMD state buffer accesses */ + q31_t x0, x1, x2, c0; /* Temporary variables to hold SIMD state and coefficient values */ + uint32_t numTaps = S->numTaps; /* Number of taps in the filter */ + uint32_t tapCnt, blkCnt; /* Loop counters */ + + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1U)]); + + /* Apply loop unrolling and compute 4 output values simultaneously. + * The variables acc0 ... acc3 hold output values that are being computed: + * + * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] + * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] + * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] + * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] + */ + + blkCnt = blockSize >> 2; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (blkCnt > 0U) + { + /* Copy four new input samples into the state buffer. + ** Use 32-bit SIMD to move the 16-bit data. Only requires two copies. */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Typecast q15_t pointer to q31_t pointer for state reading in q31_t */ + px = pState; + + /* Typecast q15_t pointer to q31_t pointer for coefficient reading in q31_t */ + pb = pCoeffs; + + /* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */ + x0 = *__SIMD32(px)++; + + /* Read the third and forth samples from the state buffer: x[n-N-2], x[n-N-3] */ + x2 = *__SIMD32(px)++; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-(numTaps%4) coefficients. */ + tapCnt = numTaps >> 2; + + while (tapCnt > 0) + { + /* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */ + c0 = *__SIMD32(pb)++; + + /* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */ + acc0 = __SMLAD(x0, c0, acc0); + + /* acc2 += b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */ + acc2 = __SMLAD(x2, c0, acc2); + + /* pack x[n-N-1] and x[n-N-2] */ +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(x2, x0, 0); +#else + x1 = __PKHBT(x0, x2, 0); +#endif + + /* Read state x[n-N-4], x[n-N-5] */ + x0 = _SIMD32_OFFSET(px); + + /* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */ + acc1 = __SMLADX(x1, c0, acc1); + + /* pack x[n-N-3] and x[n-N-4] */ +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(x0, x2, 0); +#else + x1 = __PKHBT(x2, x0, 0); +#endif + + /* acc3 += b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */ + acc3 = __SMLADX(x1, c0, acc3); + + /* Read coefficients b[N-2], b[N-3] */ + c0 = *__SIMD32(pb)++; + + /* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */ + acc0 = __SMLAD(x2, c0, acc0); + + /* Read state x[n-N-6], x[n-N-7] with offset */ + x2 = _SIMD32_OFFSET(px + 2U); + + /* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */ + acc2 = __SMLAD(x0, c0, acc2); + + /* acc1 += b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */ + acc1 = __SMLADX(x1, c0, acc1); + + /* pack x[n-N-5] and x[n-N-6] */ +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(x2, x0, 0); +#else + x1 = __PKHBT(x0, x2, 0); +#endif + + /* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */ + acc3 = __SMLADX(x1, c0, acc3); + + /* Update state pointer for next state reading */ + px += 4U; + + /* Decrement tap count */ + tapCnt--; + + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps. + ** This is always be 2 taps since the filter length is even. */ + if ((numTaps & 0x3U) != 0U) + { + + /* Read last two coefficients */ + c0 = *__SIMD32(pb)++; + + /* Perform the multiply-accumulates */ + acc0 = __SMLAD(x0, c0, acc0); + acc2 = __SMLAD(x2, c0, acc2); + + /* pack state variables */ +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(x2, x0, 0); +#else + x1 = __PKHBT(x0, x2, 0); +#endif + + /* Read last state variables */ + x0 = *__SIMD32(px); + + /* Perform the multiply-accumulates */ + acc1 = __SMLADX(x1, c0, acc1); + + /* pack state variables */ +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(x0, x2, 0); +#else + x1 = __PKHBT(x2, x0, 0); +#endif + + /* Perform the multiply-accumulates */ + acc3 = __SMLADX(x1, c0, acc3); + } + + /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation. + ** Then store the 4 outputs in the destination buffer. */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); + + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); + +#else + + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); + + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); + + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Advance the state pointer by 4 to process the next group of 4 samples */ + pState = pState + 4U; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x4U; + while (blkCnt > 0U) + { + /* Copy two samples into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc0 = 0; + + /* Use SIMD to hold states and coefficients */ + px = pState; + pb = pCoeffs; + + tapCnt = numTaps >> 1U; + + do + { + + acc0 += (q31_t) * px++ * *pb++; + acc0 += (q31_t) * px++ * *pb++; + + tapCnt--; + } + while (tapCnt > 0U); + + /* The result is in 2.30 format. Convert to 1.15 with saturation. + ** Then store the output in the destination buffer. */ + *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1U; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + /* Calculation of count for copying integer writes */ + tapCnt = (numTaps - 1U) >> 2; + + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + tapCnt--; + + } + + /* Calculation of count for remaining q15_t data */ + tapCnt = (numTaps - 1U) % 0x4U; + + /* copy remaining data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +} + +/** + * @} end of FIR group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_fast_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_fast_q31.c new file mode 100644 index 0000000..bd9c686 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_fast_q31.c @@ -0,0 +1,293 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_fast_q31.c + * Description: Processing function for the Q31 Fast FIR filter + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR + * @{ + */ + +/** + * @param[in] *S points to an instance of the Q31 structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block output data. + * @param[in] blockSize number of samples to process per call. + * @return none. + * + * Scaling and Overflow Behavior: + * + * \par + * This function is optimized for speed at the expense of fixed-point precision and overflow protection. + * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format. + * These intermediate results are added to a 2.30 accumulator. + * Finally, the accumulator is saturated and converted to a 1.31 result. + * The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result. + * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits. + * + * \par + * Refer to the function arm_fir_q31() for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision. Both the slow and the fast versions use the same instance structure. + * Use the function arm_fir_init_q31() to initialize the filter structure. + */ + +IAR_ONLY_LOW_OPTIMIZATION_ENTER +void arm_fir_fast_q31( + const arm_fir_instance_q31 * S, + q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + q31_t *pState = S->pState; /* State pointer */ + q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q31_t *pStateCurnt; /* Points to the current sample of the state */ + q31_t x0, x1, x2, x3; /* Temporary variables to hold state */ + q31_t c0; /* Temporary variable to hold coefficient value */ + q31_t *px; /* Temporary pointer for state */ + q31_t *pb; /* Temporary pointer for coefficient buffer */ + q31_t acc0, acc1, acc2, acc3; /* Accumulators */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t i, tapCnt, blkCnt; /* Loop counters */ + + /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1U)]); + + /* Apply loop unrolling and compute 4 output values simultaneously. + * The variables acc0 ... acc3 hold output values that are being computed: + * + * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] + * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] + * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] + * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] + */ + blkCnt = blockSize >> 2; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (blkCnt > 0U) + { + /* Copy four new input samples into the state buffer */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coefficient pointer */ + pb = pCoeffs; + + /* Read the first three samples from the state buffer: + * x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */ + x0 = *(px++); + x1 = *(px++); + x2 = *(px++); + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + i = tapCnt; + + while (i > 0U) + { + /* Read the b[numTaps] coefficient */ + c0 = *pb; + + /* Read x[n-numTaps-3] sample */ + x3 = *px; + + /* acc0 += b[numTaps] * x[n-numTaps] */ + multAcc_32x32_keep32_R(acc0, x0, c0); + + /* acc1 += b[numTaps] * x[n-numTaps-1] */ + multAcc_32x32_keep32_R(acc1, x1, c0); + + /* acc2 += b[numTaps] * x[n-numTaps-2] */ + multAcc_32x32_keep32_R(acc2, x2, c0); + + /* acc3 += b[numTaps] * x[n-numTaps-3] */ + multAcc_32x32_keep32_R(acc3, x3, c0); + + /* Read the b[numTaps-1] coefficient */ + c0 = *(pb + 1U); + + /* Read x[n-numTaps-4] sample */ + x0 = *(px + 1U); + + /* Perform the multiply-accumulates */ + multAcc_32x32_keep32_R(acc0, x1, c0); + multAcc_32x32_keep32_R(acc1, x2, c0); + multAcc_32x32_keep32_R(acc2, x3, c0); + multAcc_32x32_keep32_R(acc3, x0, c0); + + /* Read the b[numTaps-2] coefficient */ + c0 = *(pb + 2U); + + /* Read x[n-numTaps-5] sample */ + x1 = *(px + 2U); + + /* Perform the multiply-accumulates */ + multAcc_32x32_keep32_R(acc0, x2, c0); + multAcc_32x32_keep32_R(acc1, x3, c0); + multAcc_32x32_keep32_R(acc2, x0, c0); + multAcc_32x32_keep32_R(acc3, x1, c0); + + /* Read the b[numTaps-3] coefficients */ + c0 = *(pb + 3U); + + /* Read x[n-numTaps-6] sample */ + x2 = *(px + 3U); + + /* Perform the multiply-accumulates */ + multAcc_32x32_keep32_R(acc0, x3, c0); + multAcc_32x32_keep32_R(acc1, x0, c0); + multAcc_32x32_keep32_R(acc2, x1, c0); + multAcc_32x32_keep32_R(acc3, x2, c0); + + /* update coefficient pointer */ + pb += 4U; + px += 4U; + + /* Decrement the loop counter */ + i--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + + i = numTaps - (tapCnt * 4U); + while (i > 0U) + { + /* Read coefficients */ + c0 = *(pb++); + + /* Fetch 1 state variable */ + x3 = *(px++); + + /* Perform the multiply-accumulates */ + multAcc_32x32_keep32_R(acc0, x0, c0); + multAcc_32x32_keep32_R(acc1, x1, c0); + multAcc_32x32_keep32_R(acc2, x2, c0); + multAcc_32x32_keep32_R(acc3, x3, c0); + + /* Reuse the present sample states for next sample */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 4 to process the next group of 4 samples */ + pState = pState + 4; + + /* The results in the 4 accumulators are in 2.30 format. Convert to 1.31 + ** Then store the 4 outputs in the destination buffer. */ + *pDst++ = (q31_t) (acc0 << 1); + *pDst++ = (q31_t) (acc1 << 1); + *pDst++ = (q31_t) (acc2 << 1); + *pDst++ = (q31_t) (acc3 << 1); + + /* Decrement the samples loop counter */ + blkCnt--; + } + + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 4U; + + while (blkCnt > 0U) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc0 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize Coefficient pointer */ + pb = (pCoeffs); + + i = numTaps; + + /* Perform the multiply-accumulates */ + do + { + multAcc_32x32_keep32_R(acc0, (*px++), (*(pb++))); + i--; + } while (i > 0U); + + /* The result is in 2.30 format. Convert to 1.31 + ** Then store the output in the destination buffer. */ + *pDst++ = (q31_t) (acc0 << 1); + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the samples loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the start of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + /* Calculate remaining number of copies */ + tapCnt = (numTaps - 1U); + + /* Copy the remaining q31_t data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + +} +IAR_ONLY_LOW_OPTIMIZATION_EXIT +/** + * @} end of FIR group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_f32.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_f32.c new file mode 100644 index 0000000..25fcb01 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_f32.c @@ -0,0 +1,84 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_init_f32.c + * Description: Floating-point FIR filter initialization function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR + * @{ + */ + +/** + * @details + * + * @param[in,out] *S points to an instance of the floating-point FIR filter structure. + * @param[in] numTaps Number of filter coefficients in the filter. + * @param[in] *pCoeffs points to the filter coefficients buffer. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize number of samples that are processed per call. + * @return none. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+ * 
+ * \par + * pState points to the array of state variables. + * pState is of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_fir_f32(). + */ + +void arm_fir_init_f32( + arm_fir_instance_f32 * S, + uint16_t numTaps, + float32_t * pCoeffs, + float32_t * pState, + uint32_t blockSize) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and the size of state buffer is (blockSize + numTaps - 1) */ + memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float32_t)); + + /* Assign state pointer */ + S->pState = pState; + +} + +/** + * @} end of FIR group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q15.c new file mode 100644 index 0000000..a5638d5 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q15.c @@ -0,0 +1,142 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_init_q15.c + * Description: Q15 FIR filter initialization function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR + * @{ + */ + +/** + * @param[in,out] *S points to an instance of the Q15 FIR filter structure. + * @param[in] numTaps Number of filter coefficients in the filter. Must be even and greater than or equal to 4. + * @param[in] *pCoeffs points to the filter coefficients buffer. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize is number of samples processed per call. + * @return The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if + * numTaps is not greater than or equal to 4 and even. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+ * 
+ * Note that numTaps must be even and greater than or equal to 4. + * To implement an odd length filter simply increase numTaps by 1 and set the last coefficient to zero. + * For example, to implement a filter with numTaps=3 and coefficients + *
+ *     {0.3, -0.8, 0.3}
+ * 
+ * set numTaps=4 and use the coefficients: + *
+ *     {0.3, -0.8, 0.3, 0}.
+ * 
+ * Similarly, to implement a two point filter + *
+ *     {0.3, -0.3}
+ * 
+ * set numTaps=4 and use the coefficients: + *
+ *     {0.3, -0.3, 0, 0}.
+ * 
+ * \par + * pState points to the array of state variables. + * pState is of length numTaps+blockSize, when running on Cortex-M4 and Cortex-M3 and is of length numTaps+blockSize-1, when running on Cortex-M0 where blockSize is the number of input samples processed by each call to arm_fir_q15(). + */ + +arm_status arm_fir_init_q15( + arm_fir_instance_q15 * S, + uint16_t numTaps, + q15_t * pCoeffs, + q15_t * pState, + uint32_t blockSize) +{ + arm_status status; + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + /* The Number of filter coefficients in the filter must be even and at least 4 */ + if (numTaps & 0x1U) + { + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear the state buffer. The size is always (blockSize + numTaps ) */ + memset(pState, 0, (numTaps + (blockSize)) * sizeof(q15_t)); + + /* Assign state pointer */ + S->pState = pState; + + status = ARM_MATH_SUCCESS; + } + + return (status); + +#else + + /* Run the below code for Cortex-M0 */ + + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear the state buffer. The size is always (blockSize + numTaps - 1) */ + memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(q15_t)); + + /* Assign state pointer */ + S->pState = pState; + + status = ARM_MATH_SUCCESS; + + return (status); + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + +/** + * @} end of FIR group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q31.c new file mode 100644 index 0000000..2367a65 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q31.c @@ -0,0 +1,84 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_init_q31.c + * Description: Q31 FIR filter initialization function. + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR + * @{ + */ + +/** + * @details + * + * @param[in,out] *S points to an instance of the Q31 FIR filter structure. + * @param[in] numTaps Number of filter coefficients in the filter. + * @param[in] *pCoeffs points to the filter coefficients buffer. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize number of samples that are processed per call. + * @return none. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+ * 
+ * \par + * pState points to the array of state variables. + * pState is of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_fir_q31(). + */ + +void arm_fir_init_q31( + arm_fir_instance_q31 * S, + uint16_t numTaps, + q31_t * pCoeffs, + q31_t * pState, + uint32_t blockSize) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and state array size is (blockSize + numTaps - 1) */ + memset(pState, 0, (blockSize + ((uint32_t) numTaps - 1U)) * sizeof(q31_t)); + + /* Assign state pointer */ + S->pState = pState; + +} + +/** + * @} end of FIR group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q7.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q7.c new file mode 100644 index 0000000..5a91fb8 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q7.c @@ -0,0 +1,82 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_init_q7.c + * Description: Q7 FIR filter initialization function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR + * @{ + */ +/** + * @param[in,out] *S points to an instance of the Q7 FIR filter structure. + * @param[in] numTaps Number of filter coefficients in the filter. + * @param[in] *pCoeffs points to the filter coefficients buffer. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize number of samples that are processed per call. + * @return none + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+ * 
+ * \par + * pState points to the array of state variables. + * pState is of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_fir_q7(). + */ + +void arm_fir_init_q7( + arm_fir_instance_q7 * S, + uint16_t numTaps, + q7_t * pCoeffs, + q7_t * pState, + uint32_t blockSize) +{ + + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear the state buffer. The size is always (blockSize + numTaps - 1) */ + memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(q7_t)); + + /* Assign state pointer */ + S->pState = pState; + +} + +/** + * @} end of FIR group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_f32.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_f32.c new file mode 100644 index 0000000..5f9d19c --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_f32.c @@ -0,0 +1,569 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_interpolate_f32.c + * Description: Floating-point FIR interpolation sequences + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @defgroup FIR_Interpolate Finite Impulse Response (FIR) Interpolator + * + * These functions combine an upsampler (zero stuffer) and an FIR filter. + * They are used in multirate systems for increasing the sample rate of a signal without introducing high frequency images. + * Conceptually, the functions are equivalent to the block diagram below: + * \image html FIRInterpolator.gif "Components included in the FIR Interpolator functions" + * After upsampling by a factor of L, the signal should be filtered by a lowpass filter with a normalized + * cutoff frequency of 1/L in order to eliminate high frequency copies of the spectrum. + * The user of the function is responsible for providing the filter coefficients. + * + * The FIR interpolator functions provided in the CMSIS DSP Library combine the upsampler and FIR filter in an efficient manner. + * The upsampler inserts L-1 zeros between each sample. + * Instead of multiplying by these zero values, the FIR filter is designed to skip them. + * This leads to an efficient implementation without any wasted effort. + * The functions operate on blocks of input and output data. + * pSrc points to an array of blockSize input values and + * pDst points to an array of blockSize*L output values. + * + * The library provides separate functions for Q15, Q31, and floating-point data types. + * + * \par Algorithm: + * The functions use a polyphase filter structure: + *
+ *    y[n] = b[0] * x[n] + b[L]   * x[n-1] + ... + b[L*(phaseLength-1)] * x[n-phaseLength+1]
+ *    y[n+1] = b[1] * x[n] + b[L+1] * x[n-1] + ... + b[L*(phaseLength-1)+1] * x[n-phaseLength+1]
+ *    ...
+ *    y[n+(L-1)] = b[L-1] * x[n] + b[2*L-1] * x[n-1] + ....+ b[L*(phaseLength-1)+(L-1)] * x[n-phaseLength+1]
+ * 
+ * This approach is more efficient than straightforward upsample-then-filter algorithms. + * With this method the computation is reduced by a factor of 1/L when compared to using a standard FIR filter. + * \par + * pCoeffs points to a coefficient array of size numTaps. + * numTaps must be a multiple of the interpolation factor L and this is checked by the + * initialization functions. + * Internally, the function divides the FIR filter's impulse response into shorter filters of length + * phaseLength=numTaps/L. + * Coefficients are stored in time reversed order. + * \par + *
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+ * 
+ * \par + * pState points to a state array of size blockSize + phaseLength - 1. + * Samples in the state buffer are stored in the order: + * \par + *
+ *    {x[n-phaseLength+1], x[n-phaseLength], x[n-phaseLength-1], x[n-phaseLength-2]....x[0], x[1], ..., x[blockSize-1]}
+ * 
+ * The state variables are updated after each block of data is processed, the coefficients are untouched. + * + * \par Instance Structure + * The coefficients and state variables for a filter are stored together in an instance data structure. + * A separate instance structure must be defined for each filter. + * Coefficient arrays may be shared among several instances while state variable array should be allocated separately. + * There are separate instance structure declarations for each of the 3 supported data types. + * + * \par Initialization Functions + * There is also an associated initialization function for each data type. + * The initialization function performs the following operations: + * - Sets the values of the internal structure fields. + * - Zeros out the values in the state buffer. + * - Checks to make sure that the length of the filter is a multiple of the interpolation factor. + * To do this manually without calling the init function, assign the follow subfields of the instance structure: + * L (interpolation factor), pCoeffs, phaseLength (numTaps / L), pState. Also set all of the values in pState to zero. + * + * \par + * Use of the initialization function is optional. + * However, if the initialization function is used, then the instance structure cannot be placed into a const data section. + * To place an instance structure into a const data section, the instance structure must be manually initialized. + * The code below statically initializes each of the 3 different data type filter instance structures + *
+ * arm_fir_interpolate_instance_f32 S = {L, phaseLength, pCoeffs, pState};
+ * arm_fir_interpolate_instance_q31 S = {L, phaseLength, pCoeffs, pState};
+ * arm_fir_interpolate_instance_q15 S = {L, phaseLength, pCoeffs, pState};
+ * 
+ * where L is the interpolation factor; phaseLength=numTaps/L is the + * length of each of the shorter FIR filters used internally, + * pCoeffs is the address of the coefficient buffer; + * pState is the address of the state buffer. + * Be sure to set the values in the state buffer to zeros when doing static initialization. + * + * \par Fixed-Point Behavior + * Care must be taken when using the fixed-point versions of the FIR interpolate filter functions. + * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered. + * Refer to the function specific documentation below for usage guidelines. + */ + +/** + * @addtogroup FIR_Interpolate + * @{ + */ + +/** + * @brief Processing function for the floating-point FIR interpolator. + * @param[in] *S points to an instance of the floating-point FIR interpolator structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of input samples to process per call. + * @return none. + */ +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + +void arm_fir_interpolate_f32( + const arm_fir_interpolate_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + float32_t *pState = S->pState; /* State pointer */ + float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float32_t *pStateCurnt; /* Points to the current sample of the state */ + float32_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */ + float32_t sum0; /* Accumulators */ + float32_t x0, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t i, blkCnt, j; /* Loop counters */ + uint16_t phaseLen = S->phaseLength, tapCnt; /* Length of each polyphase filter component */ + float32_t acc0, acc1, acc2, acc3; + float32_t x1, x2, x3; + uint32_t blkCntN4; + float32_t c1, c2, c3; + + /* S->pState buffer contains previous frame (phaseLen - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (phaseLen - 1U); + + /* Initialise blkCnt */ + blkCnt = blockSize / 4; + blkCntN4 = blockSize - (4 * blkCnt); + + /* Samples loop unrolled by 4 */ + while (blkCnt > 0U) + { + /* Copy new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + /* Address modifier index of coefficient buffer */ + j = 1U; + + /* Loop over the Interpolation factor. */ + i = (S->L); + + while (i > 0U) + { + /* Set accumulator to zero */ + acc0 = 0.0f; + acc1 = 0.0f; + acc2 = 0.0f; + acc3 = 0.0f; + + /* Initialize state pointer */ + ptr1 = pState; + + /* Initialize coefficient pointer */ + ptr2 = pCoeffs + (S->L - j); + + /* Loop over the polyPhase length. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-(4*S->L) coefficients. */ + tapCnt = phaseLen >> 2U; + + x0 = *(ptr1++); + x1 = *(ptr1++); + x2 = *(ptr1++); + + while (tapCnt > 0U) + { + + /* Read the input sample */ + x3 = *(ptr1++); + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + acc2 += x2 * c0; + acc3 += x3 * c0; + + /* Read the coefficient */ + c1 = *(ptr2 + S->L); + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + acc0 += x1 * c1; + acc1 += x2 * c1; + acc2 += x3 * c1; + acc3 += x0 * c1; + + /* Read the coefficient */ + c2 = *(ptr2 + S->L * 2); + + /* Read the input sample */ + x1 = *(ptr1++); + + /* Perform the multiply-accumulate */ + acc0 += x2 * c2; + acc1 += x3 * c2; + acc2 += x0 * c2; + acc3 += x1 * c2; + + /* Read the coefficient */ + c3 = *(ptr2 + S->L * 3); + + /* Read the input sample */ + x2 = *(ptr1++); + + /* Perform the multiply-accumulate */ + acc0 += x3 * c3; + acc1 += x0 * c3; + acc2 += x1 * c3; + acc3 += x2 * c3; + + + /* Upsampling is done by stuffing L-1 zeros between each sample. + * So instead of multiplying zeros with coefficients, + * Increment the coefficient pointer by interpolation factor times. */ + ptr2 += 4 * S->L; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = phaseLen % 0x4U; + + while (tapCnt > 0U) + { + + /* Read the input sample */ + x3 = *(ptr1++); + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Perform the multiply-accumulate */ + acc0 += x0 * c0; + acc1 += x1 * c0; + acc2 += x2 * c0; + acc3 += x3 * c0; + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* update states for next sample processing */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst = acc0; + *(pDst + S->L) = acc1; + *(pDst + 2 * S->L) = acc2; + *(pDst + 3 * S->L) = acc3; + + pDst++; + + /* Increment the address modifier index of coefficient buffer */ + j++; + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 1 + * to process the next group of interpolation factor number samples */ + pState = pState + 4; + + pDst += S->L * 3; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + + while (blkCntN4 > 0U) + { + /* Copy new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Address modifier index of coefficient buffer */ + j = 1U; + + /* Loop over the Interpolation factor. */ + i = S->L; + while (i > 0U) + { + /* Set accumulator to zero */ + sum0 = 0.0f; + + /* Initialize state pointer */ + ptr1 = pState; + + /* Initialize coefficient pointer */ + ptr2 = pCoeffs + (S->L - j); + + /* Loop over the polyPhase length. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-(4*S->L) coefficients. */ + tapCnt = phaseLen >> 2U; + while (tapCnt > 0U) + { + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Upsampling is done by stuffing L-1 zeros between each sample. + * So instead of multiplying zeros with coefficients, + * Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = phaseLen % 0x4U; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + sum0 += *(ptr1++) * (*ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst++ = sum0; + + /* Increment the address modifier index of coefficient buffer */ + j++; + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 1 + * to process the next group of interpolation factor number samples */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCntN4--; + } + + /* Processing is complete. + ** Now copy the last phaseLen - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + tapCnt = (phaseLen - 1U) >> 2U; + + /* copy data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (phaseLen - 1U) % 0x04U; + + /* copy data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } +} + +#else + + /* Run the below code for Cortex-M0 */ + +void arm_fir_interpolate_f32( + const arm_fir_interpolate_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + float32_t *pState = S->pState; /* State pointer */ + float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float32_t *pStateCurnt; /* Points to the current sample of the state */ + float32_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */ + + + float32_t sum; /* Accumulator */ + uint32_t i, blkCnt; /* Loop counters */ + uint16_t phaseLen = S->phaseLength, tapCnt; /* Length of each polyphase filter component */ + + + /* S->pState buffer contains previous frame (phaseLen - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (phaseLen - 1U); + + /* Total number of intput samples */ + blkCnt = blockSize; + + /* Loop over the blockSize. */ + while (blkCnt > 0U) + { + /* Copy new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Loop over the Interpolation factor. */ + i = S->L; + + while (i > 0U) + { + /* Set accumulator to zero */ + sum = 0.0f; + + /* Initialize state pointer */ + ptr1 = pState; + + /* Initialize coefficient pointer */ + ptr2 = pCoeffs + (i - 1U); + + /* Loop over the polyPhase length */ + tapCnt = phaseLen; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + sum += *ptr1++ * *ptr2; + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst++ = sum; + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 1 + * to process the next group of interpolation factor number samples */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last phaseLen - 1 samples to the start of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + tapCnt = phaseLen - 1U; + + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +} + +#endif /* #if defined (ARM_MATH_DSP) */ + + + + /** + * @} end of FIR_Interpolate group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_f32.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_f32.c new file mode 100644 index 0000000..415c8da --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_f32.c @@ -0,0 +1,109 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_interpolate_init_f32.c + * Description: Floating-point FIR interpolator initialization function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Interpolate + * @{ + */ + +/** + * @brief Initialization function for the floating-point FIR interpolator. + * @param[in,out] *S points to an instance of the floating-point FIR interpolator structure. + * @param[in] L upsample factor. + * @param[in] numTaps number of filter coefficients in the filter. + * @param[in] *pCoeffs points to the filter coefficient buffer. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize number of input samples to process per call. + * @return The function returns ARM_MATH_SUCCESS if initialization was successful or ARM_MATH_LENGTH_ERROR if + * the filter length numTaps is not a multiple of the interpolation factor L. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
+ *    {b[numTaps-1], b[numTaps-2], b[numTaps-2], ..., b[1], b[0]}
+ * 
+ * The length of the filter numTaps must be a multiple of the interpolation factor L. + * \par + * pState points to the array of state variables. + * pState is of length (numTaps/L)+blockSize-1 words + * where blockSize is the number of input samples processed by each call to arm_fir_interpolate_f32(). + */ + +arm_status arm_fir_interpolate_init_f32( + arm_fir_interpolate_instance_f32 * S, + uint8_t L, + uint16_t numTaps, + float32_t * pCoeffs, + float32_t * pState, + uint32_t blockSize) +{ + arm_status status; + + /* The filter length must be a multiple of the interpolation factor */ + if ((numTaps % L) != 0U) + { + /* Set status as ARM_MATH_LENGTH_ERROR */ + status = ARM_MATH_LENGTH_ERROR; + } + else + { + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Assign Interpolation factor */ + S->L = L; + + /* Assign polyPhaseLength */ + S->phaseLength = numTaps / L; + + /* Clear state buffer and size of state array is always phaseLength + blockSize - 1 */ + memset(pState, 0, + (blockSize + + ((uint32_t) S->phaseLength - 1U)) * sizeof(float32_t)); + + /* Assign state pointer */ + S->pState = pState; + + status = ARM_MATH_SUCCESS; + } + + return (status); + +} + + /** + * @} end of FIR_Interpolate group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q15.c new file mode 100644 index 0000000..6dce943 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q15.c @@ -0,0 +1,108 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_interpolate_init_q15.c + * Description: Q15 FIR interpolator initialization function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Interpolate + * @{ + */ + +/** + * @brief Initialization function for the Q15 FIR interpolator. + * @param[in,out] *S points to an instance of the Q15 FIR interpolator structure. + * @param[in] L upsample factor. + * @param[in] numTaps number of filter coefficients in the filter. + * @param[in] *pCoeffs points to the filter coefficient buffer. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize number of input samples to process per call. + * @return The function returns ARM_MATH_SUCCESS if initialization was successful or ARM_MATH_LENGTH_ERROR if + * the filter length numTaps is not a multiple of the interpolation factor L. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
+ *    {b[numTaps-1], b[numTaps-2], b[numTaps-2], ..., b[1], b[0]}
+ * 
+ * The length of the filter numTaps must be a multiple of the interpolation factor L. + * \par + * pState points to the array of state variables. + * pState is of length (numTaps/L)+blockSize-1 words + * where blockSize is the number of input samples processed by each call to arm_fir_interpolate_q15(). + */ + +arm_status arm_fir_interpolate_init_q15( + arm_fir_interpolate_instance_q15 * S, + uint8_t L, + uint16_t numTaps, + q15_t * pCoeffs, + q15_t * pState, + uint32_t blockSize) +{ + arm_status status; + + /* The filter length must be a multiple of the interpolation factor */ + if ((numTaps % L) != 0U) + { + /* Set status as ARM_MATH_LENGTH_ERROR */ + status = ARM_MATH_LENGTH_ERROR; + } + else + { + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Assign Interpolation factor */ + S->L = L; + + /* Assign polyPhaseLength */ + S->phaseLength = numTaps / L; + + /* Clear state buffer and size of buffer is always phaseLength + blockSize - 1 */ + memset(pState, 0, + (blockSize + ((uint32_t) S->phaseLength - 1U)) * sizeof(q15_t)); + + /* Assign state pointer */ + S->pState = pState; + + status = ARM_MATH_SUCCESS; + } + + return (status); + +} + + /** + * @} end of FIR_Interpolate group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q31.c new file mode 100644 index 0000000..9875aa8 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q31.c @@ -0,0 +1,109 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_interpolate_init_q31.c + * Description: Q31 FIR interpolator initialization function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Interpolate + * @{ + */ + + +/** + * @brief Initialization function for the Q31 FIR interpolator. + * @param[in,out] *S points to an instance of the Q31 FIR interpolator structure. + * @param[in] L upsample factor. + * @param[in] numTaps number of filter coefficients in the filter. + * @param[in] *pCoeffs points to the filter coefficient buffer. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize number of input samples to process per call. + * @return The function returns ARM_MATH_SUCCESS if initialization was successful or ARM_MATH_LENGTH_ERROR if + * the filter length numTaps is not a multiple of the interpolation factor L. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
+ *    {b[numTaps-1], b[numTaps-2], b[numTaps-2], ..., b[1], b[0]}
+ * 
+ * The length of the filter numTaps must be a multiple of the interpolation factor L. + * \par + * pState points to the array of state variables. + * pState is of length (numTaps/L)+blockSize-1 words + * where blockSize is the number of input samples processed by each call to arm_fir_interpolate_q31(). + */ + +arm_status arm_fir_interpolate_init_q31( + arm_fir_interpolate_instance_q31 * S, + uint8_t L, + uint16_t numTaps, + q31_t * pCoeffs, + q31_t * pState, + uint32_t blockSize) +{ + arm_status status; + + /* The filter length must be a multiple of the interpolation factor */ + if ((numTaps % L) != 0U) + { + /* Set status as ARM_MATH_LENGTH_ERROR */ + status = ARM_MATH_LENGTH_ERROR; + } + else + { + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Assign Interpolation factor */ + S->L = L; + + /* Assign polyPhaseLength */ + S->phaseLength = numTaps / L; + + /* Clear state buffer and size of buffer is always phaseLength + blockSize - 1 */ + memset(pState, 0, + (blockSize + ((uint32_t) S->phaseLength - 1U)) * sizeof(q31_t)); + + /* Assign state pointer */ + S->pState = pState; + + status = ARM_MATH_SUCCESS; + } + + return (status); + +} + + /** + * @} end of FIR_Interpolate group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_q15.c new file mode 100644 index 0000000..1cedd25 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_q15.c @@ -0,0 +1,496 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_interpolate_q15.c + * Description: Q15 FIR interpolation + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Interpolate + * @{ + */ + +/** + * @brief Processing function for the Q15 FIR interpolator. + * @param[in] *S points to an instance of the Q15 FIR interpolator structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of input samples to process per call. + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * The function is implemented using a 64-bit internal accumulator. + * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result. + * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. + * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved. + * After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits. + * Lastly, the accumulator is saturated to yield a result in 1.15 format. + */ + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + +void arm_fir_interpolate_q15( + const arm_fir_interpolate_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + q15_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */ + q63_t sum0; /* Accumulators */ + q15_t x0, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t i, blkCnt, j, tapCnt; /* Loop counters */ + uint16_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */ + uint32_t blkCntN2; + q63_t acc0, acc1; + q15_t x1; + + /* S->pState buffer contains previous frame (phaseLen - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + ((q31_t) phaseLen - 1); + + /* Initialise blkCnt */ + blkCnt = blockSize / 2; + blkCntN2 = blockSize - (2 * blkCnt); + + /* Samples loop unrolled by 2 */ + while (blkCnt > 0U) + { + /* Copy new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + /* Address modifier index of coefficient buffer */ + j = 1U; + + /* Loop over the Interpolation factor. */ + i = (S->L); + + while (i > 0U) + { + /* Set accumulator to zero */ + acc0 = 0; + acc1 = 0; + + /* Initialize state pointer */ + ptr1 = pState; + + /* Initialize coefficient pointer */ + ptr2 = pCoeffs + (S->L - j); + + /* Loop over the polyPhase length. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-(4*S->L) coefficients. */ + tapCnt = phaseLen >> 2U; + + x0 = *(ptr1++); + + while (tapCnt > 0U) + { + + /* Read the input sample */ + x1 = *(ptr1++); + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Perform the multiply-accumulate */ + acc0 += (q63_t) x0 *c0; + acc1 += (q63_t) x1 *c0; + + + /* Read the coefficient */ + c0 = *(ptr2 + S->L); + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + acc0 += (q63_t) x1 *c0; + acc1 += (q63_t) x0 *c0; + + + /* Read the coefficient */ + c0 = *(ptr2 + S->L * 2); + + /* Read the input sample */ + x1 = *(ptr1++); + + /* Perform the multiply-accumulate */ + acc0 += (q63_t) x0 *c0; + acc1 += (q63_t) x1 *c0; + + /* Read the coefficient */ + c0 = *(ptr2 + S->L * 3); + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + acc0 += (q63_t) x1 *c0; + acc1 += (q63_t) x0 *c0; + + + /* Upsampling is done by stuffing L-1 zeros between each sample. + * So instead of multiplying zeros with coefficients, + * Increment the coefficient pointer by interpolation factor times. */ + ptr2 += 4 * S->L; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = phaseLen % 0x4U; + + while (tapCnt > 0U) + { + + /* Read the input sample */ + x1 = *(ptr1++); + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Perform the multiply-accumulate */ + acc0 += (q63_t) x0 *c0; + acc1 += (q63_t) x1 *c0; + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* update states for next sample processing */ + x0 = x1; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst = (q15_t) (__SSAT((acc0 >> 15), 16)); + *(pDst + S->L) = (q15_t) (__SSAT((acc1 >> 15), 16)); + + pDst++; + + /* Increment the address modifier index of coefficient buffer */ + j++; + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 1 + * to process the next group of interpolation factor number samples */ + pState = pState + 2; + + pDst += S->L; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 2, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blkCntN2; + + /* Loop over the blockSize. */ + while (blkCnt > 0U) + { + /* Copy new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Address modifier index of coefficient buffer */ + j = 1U; + + /* Loop over the Interpolation factor. */ + i = S->L; + while (i > 0U) + { + /* Set accumulator to zero */ + sum0 = 0; + + /* Initialize state pointer */ + ptr1 = pState; + + /* Initialize coefficient pointer */ + ptr2 = pCoeffs + (S->L - j); + + /* Loop over the polyPhase length. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-(4*S->L) coefficients. */ + tapCnt = phaseLen >> 2; + while (tapCnt > 0U) + { + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Upsampling is done by stuffing L-1 zeros between each sample. + * So instead of multiplying zeros with coefficients, + * Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = phaseLen & 0x3U; + + while (tapCnt > 0U) + { + /* Read the coefficient */ + c0 = *(ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16)); + + j++; + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 1 + * to process the next group of interpolation factor number samples */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + + /* Processing is complete. + ** Now copy the last phaseLen - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + i = ((uint32_t) phaseLen - 1U) >> 2U; + + /* copy data */ + while (i > 0U) + { +#ifndef UNALIGNED_SUPPORT_DISABLE + + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + +#else + + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + /* Decrement the loop counter */ + i--; + } + + i = ((uint32_t) phaseLen - 1U) % 0x04U; + + while (i > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } +} + +#else + + /* Run the below code for Cortex-M0 */ + +void arm_fir_interpolate_q15( + const arm_fir_interpolate_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + q15_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */ + q63_t sum; /* Accumulator */ + q15_t x0, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t i, blkCnt, tapCnt; /* Loop counters */ + uint16_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */ + + + /* S->pState buffer contains previous frame (phaseLen - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (phaseLen - 1U); + + /* Total number of intput samples */ + blkCnt = blockSize; + + /* Loop over the blockSize. */ + while (blkCnt > 0U) + { + /* Copy new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Loop over the Interpolation factor. */ + i = S->L; + + while (i > 0U) + { + /* Set accumulator to zero */ + sum = 0; + + /* Initialize state pointer */ + ptr1 = pState; + + /* Initialize coefficient pointer */ + ptr2 = pCoeffs + (i - 1U); + + /* Loop over the polyPhase length */ + tapCnt = (uint32_t) phaseLen; + + while (tapCnt > 0U) + { + /* Read the coefficient */ + c0 = *ptr2; + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *ptr1++; + + /* Perform the multiply-accumulate */ + sum += ((q31_t) x0 * c0); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Store the result after converting to 1.15 format in the destination buffer */ + *pDst++ = (q15_t) (__SSAT((sum >> 15), 16)); + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 1 + * to process the next group of interpolation factor number samples */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last phaseLen - 1 samples to the start of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + i = (uint32_t) phaseLen - 1U; + + while (i > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } + +} + +#endif /* #if defined (ARM_MATH_DSP) */ + + + /** + * @} end of FIR_Interpolate group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_q31.c new file mode 100644 index 0000000..2c0f522 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_q31.c @@ -0,0 +1,492 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_interpolate_q31.c + * Description: Q31 FIR interpolation + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Interpolate + * @{ + */ + +/** + * @brief Processing function for the Q31 FIR interpolator. + * @param[in] *S points to an instance of the Q31 FIR interpolator structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of input samples to process per call. + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * The function is implemented using an internal 64-bit accumulator. + * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * Thus, if the accumulator result overflows it wraps around rather than clip. + * In order to avoid overflows completely the input signal must be scaled down by 1/(numTaps/L). + * since numTaps/L additions occur per output sample. + * After all multiply-accumulates are performed, the 2.62 accumulator is truncated to 1.32 format and then saturated to 1.31 format. + */ + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + +void arm_fir_interpolate_q31( + const arm_fir_interpolate_instance_q31 * S, + q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + q31_t *pState = S->pState; /* State pointer */ + q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q31_t *pStateCurnt; /* Points to the current sample of the state */ + q31_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */ + q63_t sum0; /* Accumulators */ + q31_t x0, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t i, blkCnt, j; /* Loop counters */ + uint16_t phaseLen = S->phaseLength, tapCnt; /* Length of each polyphase filter component */ + + uint32_t blkCntN2; + q63_t acc0, acc1; + q31_t x1; + + /* S->pState buffer contains previous frame (phaseLen - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + ((q31_t) phaseLen - 1); + + /* Initialise blkCnt */ + blkCnt = blockSize / 2; + blkCntN2 = blockSize - (2 * blkCnt); + + /* Samples loop unrolled by 2 */ + while (blkCnt > 0U) + { + /* Copy new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + /* Address modifier index of coefficient buffer */ + j = 1U; + + /* Loop over the Interpolation factor. */ + i = (S->L); + + while (i > 0U) + { + /* Set accumulator to zero */ + acc0 = 0; + acc1 = 0; + + /* Initialize state pointer */ + ptr1 = pState; + + /* Initialize coefficient pointer */ + ptr2 = pCoeffs + (S->L - j); + + /* Loop over the polyPhase length. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-(4*S->L) coefficients. */ + tapCnt = phaseLen >> 2U; + + x0 = *(ptr1++); + + while (tapCnt > 0U) + { + + /* Read the input sample */ + x1 = *(ptr1++); + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Perform the multiply-accumulate */ + acc0 += (q63_t) x0 *c0; + acc1 += (q63_t) x1 *c0; + + + /* Read the coefficient */ + c0 = *(ptr2 + S->L); + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + acc0 += (q63_t) x1 *c0; + acc1 += (q63_t) x0 *c0; + + + /* Read the coefficient */ + c0 = *(ptr2 + S->L * 2); + + /* Read the input sample */ + x1 = *(ptr1++); + + /* Perform the multiply-accumulate */ + acc0 += (q63_t) x0 *c0; + acc1 += (q63_t) x1 *c0; + + /* Read the coefficient */ + c0 = *(ptr2 + S->L * 3); + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + acc0 += (q63_t) x1 *c0; + acc1 += (q63_t) x0 *c0; + + + /* Upsampling is done by stuffing L-1 zeros between each sample. + * So instead of multiplying zeros with coefficients, + * Increment the coefficient pointer by interpolation factor times. */ + ptr2 += 4 * S->L; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = phaseLen % 0x4U; + + while (tapCnt > 0U) + { + + /* Read the input sample */ + x1 = *(ptr1++); + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Perform the multiply-accumulate */ + acc0 += (q63_t) x0 *c0; + acc1 += (q63_t) x1 *c0; + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* update states for next sample processing */ + x0 = x1; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst = (q31_t) (acc0 >> 31); + *(pDst + S->L) = (q31_t) (acc1 >> 31); + + + pDst++; + + /* Increment the address modifier index of coefficient buffer */ + j++; + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 1 + * to process the next group of interpolation factor number samples */ + pState = pState + 2; + + pDst += S->L; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 2, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blkCntN2; + + /* Loop over the blockSize. */ + while (blkCnt > 0U) + { + /* Copy new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Address modifier index of coefficient buffer */ + j = 1U; + + /* Loop over the Interpolation factor. */ + i = S->L; + while (i > 0U) + { + /* Set accumulator to zero */ + sum0 = 0; + + /* Initialize state pointer */ + ptr1 = pState; + + /* Initialize coefficient pointer */ + ptr2 = pCoeffs + (S->L - j); + + /* Loop over the polyPhase length. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-(4*S->L) coefficients. */ + tapCnt = phaseLen >> 2; + while (tapCnt > 0U) + { + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Upsampling is done by stuffing L-1 zeros between each sample. + * So instead of multiplying zeros with coefficients, + * Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Read the coefficient */ + c0 = *(ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = phaseLen & 0x3U; + + while (tapCnt > 0U) + { + /* Read the coefficient */ + c0 = *(ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *(ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += (q63_t) x0 *c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst++ = (q31_t) (sum0 >> 31); + + /* Increment the address modifier index of coefficient buffer */ + j++; + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 1 + * to process the next group of interpolation factor number samples */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last phaseLen - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + tapCnt = (phaseLen - 1U) >> 2U; + + /* copy data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (phaseLen - 1U) % 0x04U; + + /* copy data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +} + + +#else + +void arm_fir_interpolate_q31( + const arm_fir_interpolate_instance_q31 * S, + q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + q31_t *pState = S->pState; /* State pointer */ + q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q31_t *pStateCurnt; /* Points to the current sample of the state */ + q31_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */ + + /* Run the below code for Cortex-M0 */ + + q63_t sum; /* Accumulator */ + q31_t x0, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t i, blkCnt; /* Loop counters */ + uint16_t phaseLen = S->phaseLength, tapCnt; /* Length of each polyphase filter component */ + + + /* S->pState buffer contains previous frame (phaseLen - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + ((q31_t) phaseLen - 1); + + /* Total number of intput samples */ + blkCnt = blockSize; + + /* Loop over the blockSize. */ + while (blkCnt > 0U) + { + /* Copy new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Loop over the Interpolation factor. */ + i = S->L; + + while (i > 0U) + { + /* Set accumulator to zero */ + sum = 0; + + /* Initialize state pointer */ + ptr1 = pState; + + /* Initialize coefficient pointer */ + ptr2 = pCoeffs + (i - 1U); + + tapCnt = phaseLen; + + while (tapCnt > 0U) + { + /* Read the coefficient */ + c0 = *(ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = *ptr1++; + + /* Perform the multiply-accumulate */ + sum += (q63_t) x0 *c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst++ = (q31_t) (sum >> 31); + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 1 + * to process the next group of interpolation factor number samples */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last phaseLen - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + tapCnt = phaseLen - 1U; + + /* copy data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +} + +#endif /* #if defined (ARM_MATH_DSP) */ + + /** + * @} end of FIR_Interpolate group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_f32.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_f32.c new file mode 100644 index 0000000..1b6d0fb --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_f32.c @@ -0,0 +1,494 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_lattice_f32.c + * Description: Processing function for the floating-point FIR Lattice filter + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup FIR_Lattice Finite Impulse Response (FIR) Lattice Filters + * + * This set of functions implements Finite Impulse Response (FIR) lattice filters + * for Q15, Q31 and floating-point data types. Lattice filters are used in a + * variety of adaptive filter applications. The filter structure is feedforward and + * the net impulse response is finite length. + * The functions operate on blocks + * of input and output data and each call to the function processes + * blockSize samples through the filter. pSrc and + * pDst point to input and output arrays containing blockSize values. + * + * \par Algorithm: + * \image html FIRLattice.gif "Finite Impulse Response Lattice filter" + * The following difference equation is implemented: + *
+ *    f0[n] = g0[n] = x[n]
+ *    fm[n] = fm-1[n] + km * gm-1[n-1] for m = 1, 2, ...M
+ *    gm[n] = km * fm-1[n] + gm-1[n-1] for m = 1, 2, ...M
+ *    y[n] = fM[n]
+ * 
+ * \par + * pCoeffs points to tha array of reflection coefficients of size numStages. + * Reflection Coefficients are stored in the following order. + * \par + *
+ *    {k1, k2, ..., kM}
+ * 
+ * where M is number of stages + * \par + * pState points to a state array of size numStages. + * The state variables (g values) hold previous inputs and are stored in the following order. + *
+ *    {g0[n], g1[n], g2[n] ...gM-1[n]}
+ * 
+ * The state variables are updated after each block of data is processed; the coefficients are untouched. + * \par Instance Structure + * The coefficients and state variables for a filter are stored together in an instance data structure. + * A separate instance structure must be defined for each filter. + * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared. + * There are separate instance structure declarations for each of the 3 supported data types. + * + * \par Initialization Functions + * There is also an associated initialization function for each data type. + * The initialization function performs the following operations: + * - Sets the values of the internal structure fields. + * - Zeros out the values in the state buffer. + * To do this manually without calling the init function, assign the follow subfields of the instance structure: + * numStages, pCoeffs, pState. Also set all of the values in pState to zero. + * + * \par + * Use of the initialization function is optional. + * However, if the initialization function is used, then the instance structure cannot be placed into a const data section. + * To place an instance structure into a const data section, the instance structure must be manually initialized. + * Set the values in the state buffer to zeros and then manually initialize the instance structure as follows: + *
+ *arm_fir_lattice_instance_f32 S = {numStages, pState, pCoeffs};
+ *arm_fir_lattice_instance_q31 S = {numStages, pState, pCoeffs};
+ *arm_fir_lattice_instance_q15 S = {numStages, pState, pCoeffs};
+ * 
+ * \par + * where numStages is the number of stages in the filter; pState is the address of the state buffer; + * pCoeffs is the address of the coefficient buffer. + * \par Fixed-Point Behavior + * Care must be taken when using the fixed-point versions of the FIR Lattice filter functions. + * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered. + * Refer to the function specific documentation below for usage guidelines. + */ + +/** + * @addtogroup FIR_Lattice + * @{ + */ + + + /** + * @brief Processing function for the floating-point FIR lattice filter. + * @param[in] *S points to an instance of the floating-point FIR lattice structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data + * @param[in] blockSize number of samples to process. + * @return none. + */ + +void arm_fir_lattice_f32( + const arm_fir_lattice_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + float32_t *pState; /* State pointer */ + float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float32_t *px; /* temporary state pointer */ + float32_t *pk; /* temporary coefficient pointer */ + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + float32_t fcurr1, fnext1, gcurr1, gnext1; /* temporary variables for first sample in loop unrolling */ + float32_t fcurr2, fnext2, gnext2; /* temporary variables for second sample in loop unrolling */ + float32_t fcurr3, fnext3, gnext3; /* temporary variables for third sample in loop unrolling */ + float32_t fcurr4, fnext4, gnext4; /* temporary variables for fourth sample in loop unrolling */ + uint32_t numStages = S->numStages; /* Number of stages in the filter */ + uint32_t blkCnt, stageCnt; /* temporary variables for counts */ + + gcurr1 = 0.0f; + pState = &S->pState[0]; + + blkCnt = blockSize >> 2; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + a second loop below computes the remaining 1 to 3 samples. */ + while (blkCnt > 0U) + { + + /* Read two samples from input buffer */ + /* f0(n) = x(n) */ + fcurr1 = *pSrc++; + fcurr2 = *pSrc++; + + /* Initialize coeff pointer */ + pk = (pCoeffs); + + /* Initialize state pointer */ + px = pState; + + /* Read g0(n-1) from state */ + gcurr1 = *px; + + /* Process first sample for first tap */ + /* f1(n) = f0(n) + K1 * g0(n-1) */ + fnext1 = fcurr1 + ((*pk) * gcurr1); + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext1 = (fcurr1 * (*pk)) + gcurr1; + + /* Process second sample for first tap */ + /* for sample 2 processing */ + fnext2 = fcurr2 + ((*pk) * fcurr1); + gnext2 = (fcurr2 * (*pk)) + fcurr1; + + /* Read next two samples from input buffer */ + /* f0(n+2) = x(n+2) */ + fcurr3 = *pSrc++; + fcurr4 = *pSrc++; + + /* Copy only last input samples into the state buffer + which will be used for next four samples processing */ + *px++ = fcurr4; + + /* Process third sample for first tap */ + fnext3 = fcurr3 + ((*pk) * fcurr2); + gnext3 = (fcurr3 * (*pk)) + fcurr2; + + /* Process fourth sample for first tap */ + fnext4 = fcurr4 + ((*pk) * fcurr3); + gnext4 = (fcurr4 * (*pk++)) + fcurr3; + + /* Update of f values for next coefficient set processing */ + fcurr1 = fnext1; + fcurr2 = fnext2; + fcurr3 = fnext3; + fcurr4 = fnext4; + + /* Loop unrolling. Process 4 taps at a time . */ + stageCnt = (numStages - 1U) >> 2U; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numStages-3 coefficients. */ + + /* Process 2nd, 3rd, 4th and 5th taps ... here */ + while (stageCnt > 0U) + { + /* Read g1(n-1), g3(n-1) .... from state */ + gcurr1 = *px; + + /* save g1(n) in state buffer */ + *px++ = gnext4; + + /* Process first sample for 2nd, 6th .. tap */ + /* Sample processing for K2, K6.... */ + /* f2(n) = f1(n) + K2 * g1(n-1) */ + fnext1 = fcurr1 + ((*pk) * gcurr1); + /* Process second sample for 2nd, 6th .. tap */ + /* for sample 2 processing */ + fnext2 = fcurr2 + ((*pk) * gnext1); + /* Process third sample for 2nd, 6th .. tap */ + fnext3 = fcurr3 + ((*pk) * gnext2); + /* Process fourth sample for 2nd, 6th .. tap */ + fnext4 = fcurr4 + ((*pk) * gnext3); + + /* g2(n) = f1(n) * K2 + g1(n-1) */ + /* Calculation of state values for next stage */ + gnext4 = (fcurr4 * (*pk)) + gnext3; + gnext3 = (fcurr3 * (*pk)) + gnext2; + gnext2 = (fcurr2 * (*pk)) + gnext1; + gnext1 = (fcurr1 * (*pk++)) + gcurr1; + + + /* Read g2(n-1), g4(n-1) .... from state */ + gcurr1 = *px; + + /* save g2(n) in state buffer */ + *px++ = gnext4; + + /* Sample processing for K3, K7.... */ + /* Process first sample for 3rd, 7th .. tap */ + /* f3(n) = f2(n) + K3 * g2(n-1) */ + fcurr1 = fnext1 + ((*pk) * gcurr1); + /* Process second sample for 3rd, 7th .. tap */ + fcurr2 = fnext2 + ((*pk) * gnext1); + /* Process third sample for 3rd, 7th .. tap */ + fcurr3 = fnext3 + ((*pk) * gnext2); + /* Process fourth sample for 3rd, 7th .. tap */ + fcurr4 = fnext4 + ((*pk) * gnext3); + + /* Calculation of state values for next stage */ + /* g3(n) = f2(n) * K3 + g2(n-1) */ + gnext4 = (fnext4 * (*pk)) + gnext3; + gnext3 = (fnext3 * (*pk)) + gnext2; + gnext2 = (fnext2 * (*pk)) + gnext1; + gnext1 = (fnext1 * (*pk++)) + gcurr1; + + + /* Read g1(n-1), g3(n-1) .... from state */ + gcurr1 = *px; + + /* save g3(n) in state buffer */ + *px++ = gnext4; + + /* Sample processing for K4, K8.... */ + /* Process first sample for 4th, 8th .. tap */ + /* f4(n) = f3(n) + K4 * g3(n-1) */ + fnext1 = fcurr1 + ((*pk) * gcurr1); + /* Process second sample for 4th, 8th .. tap */ + /* for sample 2 processing */ + fnext2 = fcurr2 + ((*pk) * gnext1); + /* Process third sample for 4th, 8th .. tap */ + fnext3 = fcurr3 + ((*pk) * gnext2); + /* Process fourth sample for 4th, 8th .. tap */ + fnext4 = fcurr4 + ((*pk) * gnext3); + + /* g4(n) = f3(n) * K4 + g3(n-1) */ + /* Calculation of state values for next stage */ + gnext4 = (fcurr4 * (*pk)) + gnext3; + gnext3 = (fcurr3 * (*pk)) + gnext2; + gnext2 = (fcurr2 * (*pk)) + gnext1; + gnext1 = (fcurr1 * (*pk++)) + gcurr1; + + /* Read g2(n-1), g4(n-1) .... from state */ + gcurr1 = *px; + + /* save g4(n) in state buffer */ + *px++ = gnext4; + + /* Sample processing for K5, K9.... */ + /* Process first sample for 5th, 9th .. tap */ + /* f5(n) = f4(n) + K5 * g4(n-1) */ + fcurr1 = fnext1 + ((*pk) * gcurr1); + /* Process second sample for 5th, 9th .. tap */ + fcurr2 = fnext2 + ((*pk) * gnext1); + /* Process third sample for 5th, 9th .. tap */ + fcurr3 = fnext3 + ((*pk) * gnext2); + /* Process fourth sample for 5th, 9th .. tap */ + fcurr4 = fnext4 + ((*pk) * gnext3); + + /* Calculation of state values for next stage */ + /* g5(n) = f4(n) * K5 + g4(n-1) */ + gnext4 = (fnext4 * (*pk)) + gnext3; + gnext3 = (fnext3 * (*pk)) + gnext2; + gnext2 = (fnext2 * (*pk)) + gnext1; + gnext1 = (fnext1 * (*pk++)) + gcurr1; + + stageCnt--; + } + + /* If the (filter length -1) is not a multiple of 4, compute the remaining filter taps */ + stageCnt = (numStages - 1U) % 0x4U; + + while (stageCnt > 0U) + { + gcurr1 = *px; + + /* save g value in state buffer */ + *px++ = gnext4; + + /* Process four samples for last three taps here */ + fnext1 = fcurr1 + ((*pk) * gcurr1); + fnext2 = fcurr2 + ((*pk) * gnext1); + fnext3 = fcurr3 + ((*pk) * gnext2); + fnext4 = fcurr4 + ((*pk) * gnext3); + + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext4 = (fcurr4 * (*pk)) + gnext3; + gnext3 = (fcurr3 * (*pk)) + gnext2; + gnext2 = (fcurr2 * (*pk)) + gnext1; + gnext1 = (fcurr1 * (*pk++)) + gcurr1; + + /* Update of f values for next coefficient set processing */ + fcurr1 = fnext1; + fcurr2 = fnext2; + fcurr3 = fnext3; + fcurr4 = fnext4; + + stageCnt--; + + } + + /* The results in the 4 accumulators, store in the destination buffer. */ + /* y(n) = fN(n) */ + *pDst++ = fcurr1; + *pDst++ = fcurr2; + *pDst++ = fcurr3; + *pDst++ = fcurr4; + + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x4U; + + while (blkCnt > 0U) + { + /* f0(n) = x(n) */ + fcurr1 = *pSrc++; + + /* Initialize coeff pointer */ + pk = (pCoeffs); + + /* Initialize state pointer */ + px = pState; + + /* read g2(n) from state buffer */ + gcurr1 = *px; + + /* for sample 1 processing */ + /* f1(n) = f0(n) + K1 * g0(n-1) */ + fnext1 = fcurr1 + ((*pk) * gcurr1); + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext1 = (fcurr1 * (*pk++)) + gcurr1; + + /* save g1(n) in state buffer */ + *px++ = fcurr1; + + /* f1(n) is saved in fcurr1 + for next stage processing */ + fcurr1 = fnext1; + + stageCnt = (numStages - 1U); + + /* stage loop */ + while (stageCnt > 0U) + { + /* read g2(n) from state buffer */ + gcurr1 = *px; + + /* save g1(n) in state buffer */ + *px++ = gnext1; + + /* Sample processing for K2, K3.... */ + /* f2(n) = f1(n) + K2 * g1(n-1) */ + fnext1 = fcurr1 + ((*pk) * gcurr1); + /* g2(n) = f1(n) * K2 + g1(n-1) */ + gnext1 = (fcurr1 * (*pk++)) + gcurr1; + + /* f1(n) is saved in fcurr1 + for next stage processing */ + fcurr1 = fnext1; + + stageCnt--; + + } + + /* y(n) = fN(n) */ + *pDst++ = fcurr1; + + blkCnt--; + + } + +#else + + /* Run the below code for Cortex-M0 */ + + float32_t fcurr, fnext, gcurr, gnext; /* temporary variables */ + uint32_t numStages = S->numStages; /* Length of the filter */ + uint32_t blkCnt, stageCnt; /* temporary variables for counts */ + + pState = &S->pState[0]; + + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* f0(n) = x(n) */ + fcurr = *pSrc++; + + /* Initialize coeff pointer */ + pk = pCoeffs; + + /* Initialize state pointer */ + px = pState; + + /* read g0(n-1) from state buffer */ + gcurr = *px; + + /* for sample 1 processing */ + /* f1(n) = f0(n) + K1 * g0(n-1) */ + fnext = fcurr + ((*pk) * gcurr); + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext = (fcurr * (*pk++)) + gcurr; + + /* save f0(n) in state buffer */ + *px++ = fcurr; + + /* f1(n) is saved in fcurr + for next stage processing */ + fcurr = fnext; + + stageCnt = (numStages - 1U); + + /* stage loop */ + while (stageCnt > 0U) + { + /* read g2(n) from state buffer */ + gcurr = *px; + + /* save g1(n) in state buffer */ + *px++ = gnext; + + /* Sample processing for K2, K3.... */ + /* f2(n) = f1(n) + K2 * g1(n-1) */ + fnext = fcurr + ((*pk) * gcurr); + /* g2(n) = f1(n) * K2 + g1(n-1) */ + gnext = (fcurr * (*pk++)) + gcurr; + + /* f1(n) is saved in fcurr1 + for next stage processing */ + fcurr = fnext; + + stageCnt--; + + } + + /* y(n) = fN(n) */ + *pDst++ = fcurr; + + blkCnt--; + + } + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + +/** + * @} end of FIR_Lattice group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_f32.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_f32.c new file mode 100644 index 0000000..55520eb --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_f32.c @@ -0,0 +1,71 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_lattice_init_f32.c + * Description: Floating-point FIR Lattice filter initialization function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Lattice + * @{ + */ + +/** + * @brief Initialization function for the floating-point FIR lattice filter. + * @param[in] *S points to an instance of the floating-point FIR lattice structure. + * @param[in] numStages number of filter stages. + * @param[in] *pCoeffs points to the coefficient buffer. The array is of length numStages. + * @param[in] *pState points to the state buffer. The array is of length numStages. + * @return none. + */ + +void arm_fir_lattice_init_f32( + arm_fir_lattice_instance_f32 * S, + uint16_t numStages, + float32_t * pCoeffs, + float32_t * pState) +{ + /* Assign filter taps */ + S->numStages = numStages; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always numStages */ + memset(pState, 0, (numStages) * sizeof(float32_t)); + + /* Assign state pointer */ + S->pState = pState; + +} + +/** + * @} end of FIR_Lattice group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q15.c new file mode 100644 index 0000000..59cf496 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q15.c @@ -0,0 +1,71 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_lattice_init_q15.c + * Description: Q15 FIR Lattice filter initialization function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Lattice + * @{ + */ + + /** + * @brief Initialization function for the Q15 FIR lattice filter. + * @param[in] *S points to an instance of the Q15 FIR lattice structure. + * @param[in] numStages number of filter stages. + * @param[in] *pCoeffs points to the coefficient buffer. The array is of length numStages. + * @param[in] *pState points to the state buffer. The array is of length numStages. + * @return none. + */ + +void arm_fir_lattice_init_q15( + arm_fir_lattice_instance_q15 * S, + uint16_t numStages, + q15_t * pCoeffs, + q15_t * pState) +{ + /* Assign filter taps */ + S->numStages = numStages; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always numStages */ + memset(pState, 0, (numStages) * sizeof(q15_t)); + + /* Assign state pointer */ + S->pState = pState; + +} + +/** + * @} end of FIR_Lattice group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q31.c new file mode 100644 index 0000000..abdd76f --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q31.c @@ -0,0 +1,71 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_lattice_init_q31.c + * Description: Q31 FIR lattice filter initialization function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Lattice + * @{ + */ + + /** + * @brief Initialization function for the Q31 FIR lattice filter. + * @param[in] *S points to an instance of the Q31 FIR lattice structure. + * @param[in] numStages number of filter stages. + * @param[in] *pCoeffs points to the coefficient buffer. The array is of length numStages. + * @param[in] *pState points to the state buffer. The array is of length numStages. + * @return none. + */ + +void arm_fir_lattice_init_q31( + arm_fir_lattice_instance_q31 * S, + uint16_t numStages, + q31_t * pCoeffs, + q31_t * pState) +{ + /* Assign filter taps */ + S->numStages = numStages; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always numStages */ + memset(pState, 0, (numStages) * sizeof(q31_t)); + + /* Assign state pointer */ + S->pState = pState; + +} + +/** + * @} end of FIR_Lattice group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_q15.c new file mode 100644 index 0000000..fb95ab6 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_q15.c @@ -0,0 +1,524 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_lattice_q15.c + * Description: Q15 FIR lattice filter processing function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Lattice + * @{ + */ + + +/** + * @brief Processing function for the Q15 FIR lattice filter. + * @param[in] *S points to an instance of the Q15 FIR lattice structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data + * @param[in] blockSize number of samples to process. + * @return none. + */ + +void arm_fir_lattice_q15( + const arm_fir_lattice_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *px; /* temporary state pointer */ + q15_t *pk; /* temporary coefficient pointer */ + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q31_t fcurnt1, fnext1, gcurnt1 = 0, gnext1; /* temporary variables for first sample in loop unrolling */ + q31_t fcurnt2, fnext2, gnext2; /* temporary variables for second sample in loop unrolling */ + q31_t fcurnt3, fnext3, gnext3; /* temporary variables for third sample in loop unrolling */ + q31_t fcurnt4, fnext4, gnext4; /* temporary variables for fourth sample in loop unrolling */ + uint32_t numStages = S->numStages; /* Number of stages in the filter */ + uint32_t blkCnt, stageCnt; /* temporary variables for counts */ + + pState = &S->pState[0]; + + blkCnt = blockSize >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (blkCnt > 0U) + { + + /* Read two samples from input buffer */ + /* f0(n) = x(n) */ + fcurnt1 = *pSrc++; + fcurnt2 = *pSrc++; + + /* Initialize coeff pointer */ + pk = (pCoeffs); + + /* Initialize state pointer */ + px = pState; + + /* Read g0(n-1) from state */ + gcurnt1 = *px; + + /* Process first sample for first tap */ + /* f1(n) = f0(n) + K1 * g0(n-1) */ + fnext1 = (q31_t) ((gcurnt1 * (*pk)) >> 15U) + fcurnt1; + fnext1 = __SSAT(fnext1, 16); + + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext1 = (q31_t) ((fcurnt1 * (*pk)) >> 15U) + gcurnt1; + gnext1 = __SSAT(gnext1, 16); + + /* Process second sample for first tap */ + /* for sample 2 processing */ + fnext2 = (q31_t) ((fcurnt1 * (*pk)) >> 15U) + fcurnt2; + fnext2 = __SSAT(fnext2, 16); + + gnext2 = (q31_t) ((fcurnt2 * (*pk)) >> 15U) + fcurnt1; + gnext2 = __SSAT(gnext2, 16); + + + /* Read next two samples from input buffer */ + /* f0(n+2) = x(n+2) */ + fcurnt3 = *pSrc++; + fcurnt4 = *pSrc++; + + /* Copy only last input samples into the state buffer + which is used for next four samples processing */ + *px++ = (q15_t) fcurnt4; + + /* Process third sample for first tap */ + fnext3 = (q31_t) ((fcurnt2 * (*pk)) >> 15U) + fcurnt3; + fnext3 = __SSAT(fnext3, 16); + gnext3 = (q31_t) ((fcurnt3 * (*pk)) >> 15U) + fcurnt2; + gnext3 = __SSAT(gnext3, 16); + + /* Process fourth sample for first tap */ + fnext4 = (q31_t) ((fcurnt3 * (*pk)) >> 15U) + fcurnt4; + fnext4 = __SSAT(fnext4, 16); + gnext4 = (q31_t) ((fcurnt4 * (*pk++)) >> 15U) + fcurnt3; + gnext4 = __SSAT(gnext4, 16); + + /* Update of f values for next coefficient set processing */ + fcurnt1 = fnext1; + fcurnt2 = fnext2; + fcurnt3 = fnext3; + fcurnt4 = fnext4; + + + /* Loop unrolling. Process 4 taps at a time . */ + stageCnt = (numStages - 1U) >> 2; + + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numStages-3 coefficients. */ + + /* Process 2nd, 3rd, 4th and 5th taps ... here */ + while (stageCnt > 0U) + { + /* Read g1(n-1), g3(n-1) .... from state */ + gcurnt1 = *px; + + /* save g1(n) in state buffer */ + *px++ = (q15_t) gnext4; + + /* Process first sample for 2nd, 6th .. tap */ + /* Sample processing for K2, K6.... */ + /* f1(n) = f0(n) + K1 * g0(n-1) */ + fnext1 = (q31_t) ((gcurnt1 * (*pk)) >> 15U) + fcurnt1; + fnext1 = __SSAT(fnext1, 16); + + + /* Process second sample for 2nd, 6th .. tap */ + /* for sample 2 processing */ + fnext2 = (q31_t) ((gnext1 * (*pk)) >> 15U) + fcurnt2; + fnext2 = __SSAT(fnext2, 16); + /* Process third sample for 2nd, 6th .. tap */ + fnext3 = (q31_t) ((gnext2 * (*pk)) >> 15U) + fcurnt3; + fnext3 = __SSAT(fnext3, 16); + /* Process fourth sample for 2nd, 6th .. tap */ + /* fnext4 = fcurnt4 + (*pk) * gnext3; */ + fnext4 = (q31_t) ((gnext3 * (*pk)) >> 15U) + fcurnt4; + fnext4 = __SSAT(fnext4, 16); + + /* g1(n) = f0(n) * K1 + g0(n-1) */ + /* Calculation of state values for next stage */ + gnext4 = (q31_t) ((fcurnt4 * (*pk)) >> 15U) + gnext3; + gnext4 = __SSAT(gnext4, 16); + gnext3 = (q31_t) ((fcurnt3 * (*pk)) >> 15U) + gnext2; + gnext3 = __SSAT(gnext3, 16); + + gnext2 = (q31_t) ((fcurnt2 * (*pk)) >> 15U) + gnext1; + gnext2 = __SSAT(gnext2, 16); + + gnext1 = (q31_t) ((fcurnt1 * (*pk++)) >> 15U) + gcurnt1; + gnext1 = __SSAT(gnext1, 16); + + + /* Read g2(n-1), g4(n-1) .... from state */ + gcurnt1 = *px; + + /* save g1(n) in state buffer */ + *px++ = (q15_t) gnext4; + + /* Sample processing for K3, K7.... */ + /* Process first sample for 3rd, 7th .. tap */ + /* f3(n) = f2(n) + K3 * g2(n-1) */ + fcurnt1 = (q31_t) ((gcurnt1 * (*pk)) >> 15U) + fnext1; + fcurnt1 = __SSAT(fcurnt1, 16); + + /* Process second sample for 3rd, 7th .. tap */ + fcurnt2 = (q31_t) ((gnext1 * (*pk)) >> 15U) + fnext2; + fcurnt2 = __SSAT(fcurnt2, 16); + + /* Process third sample for 3rd, 7th .. tap */ + fcurnt3 = (q31_t) ((gnext2 * (*pk)) >> 15U) + fnext3; + fcurnt3 = __SSAT(fcurnt3, 16); + + /* Process fourth sample for 3rd, 7th .. tap */ + fcurnt4 = (q31_t) ((gnext3 * (*pk)) >> 15U) + fnext4; + fcurnt4 = __SSAT(fcurnt4, 16); + + /* Calculation of state values for next stage */ + /* g3(n) = f2(n) * K3 + g2(n-1) */ + gnext4 = (q31_t) ((fnext4 * (*pk)) >> 15U) + gnext3; + gnext4 = __SSAT(gnext4, 16); + + gnext3 = (q31_t) ((fnext3 * (*pk)) >> 15U) + gnext2; + gnext3 = __SSAT(gnext3, 16); + + gnext2 = (q31_t) ((fnext2 * (*pk)) >> 15U) + gnext1; + gnext2 = __SSAT(gnext2, 16); + + gnext1 = (q31_t) ((fnext1 * (*pk++)) >> 15U) + gcurnt1; + gnext1 = __SSAT(gnext1, 16); + + /* Read g1(n-1), g3(n-1) .... from state */ + gcurnt1 = *px; + + /* save g1(n) in state buffer */ + *px++ = (q15_t) gnext4; + + /* Sample processing for K4, K8.... */ + /* Process first sample for 4th, 8th .. tap */ + /* f4(n) = f3(n) + K4 * g3(n-1) */ + fnext1 = (q31_t) ((gcurnt1 * (*pk)) >> 15U) + fcurnt1; + fnext1 = __SSAT(fnext1, 16); + + /* Process second sample for 4th, 8th .. tap */ + /* for sample 2 processing */ + fnext2 = (q31_t) ((gnext1 * (*pk)) >> 15U) + fcurnt2; + fnext2 = __SSAT(fnext2, 16); + + /* Process third sample for 4th, 8th .. tap */ + fnext3 = (q31_t) ((gnext2 * (*pk)) >> 15U) + fcurnt3; + fnext3 = __SSAT(fnext3, 16); + + /* Process fourth sample for 4th, 8th .. tap */ + fnext4 = (q31_t) ((gnext3 * (*pk)) >> 15U) + fcurnt4; + fnext4 = __SSAT(fnext4, 16); + + /* g4(n) = f3(n) * K4 + g3(n-1) */ + /* Calculation of state values for next stage */ + gnext4 = (q31_t) ((fcurnt4 * (*pk)) >> 15U) + gnext3; + gnext4 = __SSAT(gnext4, 16); + + gnext3 = (q31_t) ((fcurnt3 * (*pk)) >> 15U) + gnext2; + gnext3 = __SSAT(gnext3, 16); + + gnext2 = (q31_t) ((fcurnt2 * (*pk)) >> 15U) + gnext1; + gnext2 = __SSAT(gnext2, 16); + gnext1 = (q31_t) ((fcurnt1 * (*pk++)) >> 15U) + gcurnt1; + gnext1 = __SSAT(gnext1, 16); + + + /* Read g2(n-1), g4(n-1) .... from state */ + gcurnt1 = *px; + + /* save g4(n) in state buffer */ + *px++ = (q15_t) gnext4; + + /* Sample processing for K5, K9.... */ + /* Process first sample for 5th, 9th .. tap */ + /* f5(n) = f4(n) + K5 * g4(n-1) */ + fcurnt1 = (q31_t) ((gcurnt1 * (*pk)) >> 15U) + fnext1; + fcurnt1 = __SSAT(fcurnt1, 16); + + /* Process second sample for 5th, 9th .. tap */ + fcurnt2 = (q31_t) ((gnext1 * (*pk)) >> 15U) + fnext2; + fcurnt2 = __SSAT(fcurnt2, 16); + + /* Process third sample for 5th, 9th .. tap */ + fcurnt3 = (q31_t) ((gnext2 * (*pk)) >> 15U) + fnext3; + fcurnt3 = __SSAT(fcurnt3, 16); + + /* Process fourth sample for 5th, 9th .. tap */ + fcurnt4 = (q31_t) ((gnext3 * (*pk)) >> 15U) + fnext4; + fcurnt4 = __SSAT(fcurnt4, 16); + + /* Calculation of state values for next stage */ + /* g5(n) = f4(n) * K5 + g4(n-1) */ + gnext4 = (q31_t) ((fnext4 * (*pk)) >> 15U) + gnext3; + gnext4 = __SSAT(gnext4, 16); + gnext3 = (q31_t) ((fnext3 * (*pk)) >> 15U) + gnext2; + gnext3 = __SSAT(gnext3, 16); + gnext2 = (q31_t) ((fnext2 * (*pk)) >> 15U) + gnext1; + gnext2 = __SSAT(gnext2, 16); + gnext1 = (q31_t) ((fnext1 * (*pk++)) >> 15U) + gcurnt1; + gnext1 = __SSAT(gnext1, 16); + + stageCnt--; + } + + /* If the (filter length -1) is not a multiple of 4, compute the remaining filter taps */ + stageCnt = (numStages - 1U) % 0x4U; + + while (stageCnt > 0U) + { + gcurnt1 = *px; + + /* save g value in state buffer */ + *px++ = (q15_t) gnext4; + + /* Process four samples for last three taps here */ + fnext1 = (q31_t) ((gcurnt1 * (*pk)) >> 15U) + fcurnt1; + fnext1 = __SSAT(fnext1, 16); + fnext2 = (q31_t) ((gnext1 * (*pk)) >> 15U) + fcurnt2; + fnext2 = __SSAT(fnext2, 16); + + fnext3 = (q31_t) ((gnext2 * (*pk)) >> 15U) + fcurnt3; + fnext3 = __SSAT(fnext3, 16); + + fnext4 = (q31_t) ((gnext3 * (*pk)) >> 15U) + fcurnt4; + fnext4 = __SSAT(fnext4, 16); + + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext4 = (q31_t) ((fcurnt4 * (*pk)) >> 15U) + gnext3; + gnext4 = __SSAT(gnext4, 16); + gnext3 = (q31_t) ((fcurnt3 * (*pk)) >> 15U) + gnext2; + gnext3 = __SSAT(gnext3, 16); + gnext2 = (q31_t) ((fcurnt2 * (*pk)) >> 15U) + gnext1; + gnext2 = __SSAT(gnext2, 16); + gnext1 = (q31_t) ((fcurnt1 * (*pk++)) >> 15U) + gcurnt1; + gnext1 = __SSAT(gnext1, 16); + + /* Update of f values for next coefficient set processing */ + fcurnt1 = fnext1; + fcurnt2 = fnext2; + fcurnt3 = fnext3; + fcurnt4 = fnext4; + + stageCnt--; + + } + + /* The results in the 4 accumulators, store in the destination buffer. */ + /* y(n) = fN(n) */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pDst)++ = __PKHBT(fcurnt1, fcurnt2, 16); + *__SIMD32(pDst)++ = __PKHBT(fcurnt3, fcurnt4, 16); + +#else + + *__SIMD32(pDst)++ = __PKHBT(fcurnt2, fcurnt1, 16); + *__SIMD32(pDst)++ = __PKHBT(fcurnt4, fcurnt3, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x4U; + + while (blkCnt > 0U) + { + /* f0(n) = x(n) */ + fcurnt1 = *pSrc++; + + /* Initialize coeff pointer */ + pk = (pCoeffs); + + /* Initialize state pointer */ + px = pState; + + /* read g2(n) from state buffer */ + gcurnt1 = *px; + + /* for sample 1 processing */ + /* f1(n) = f0(n) + K1 * g0(n-1) */ + fnext1 = (((q31_t) gcurnt1 * (*pk)) >> 15U) + fcurnt1; + fnext1 = __SSAT(fnext1, 16); + + + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext1 = (((q31_t) fcurnt1 * (*pk++)) >> 15U) + gcurnt1; + gnext1 = __SSAT(gnext1, 16); + + /* save g1(n) in state buffer */ + *px++ = (q15_t) fcurnt1; + + /* f1(n) is saved in fcurnt1 + for next stage processing */ + fcurnt1 = fnext1; + + stageCnt = (numStages - 1U); + + /* stage loop */ + while (stageCnt > 0U) + { + /* read g2(n) from state buffer */ + gcurnt1 = *px; + + /* save g1(n) in state buffer */ + *px++ = (q15_t) gnext1; + + /* Sample processing for K2, K3.... */ + /* f2(n) = f1(n) + K2 * g1(n-1) */ + fnext1 = (((q31_t) gcurnt1 * (*pk)) >> 15U) + fcurnt1; + fnext1 = __SSAT(fnext1, 16); + + /* g2(n) = f1(n) * K2 + g1(n-1) */ + gnext1 = (((q31_t) fcurnt1 * (*pk++)) >> 15U) + gcurnt1; + gnext1 = __SSAT(gnext1, 16); + + + /* f1(n) is saved in fcurnt1 + for next stage processing */ + fcurnt1 = fnext1; + + stageCnt--; + + } + + /* y(n) = fN(n) */ + *pDst++ = __SSAT(fcurnt1, 16); + + + blkCnt--; + + } + +#else + + /* Run the below code for Cortex-M0 */ + + q31_t fcurnt, fnext, gcurnt, gnext; /* temporary variables */ + uint32_t numStages = S->numStages; /* Length of the filter */ + uint32_t blkCnt, stageCnt; /* temporary variables for counts */ + + pState = &S->pState[0]; + + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* f0(n) = x(n) */ + fcurnt = *pSrc++; + + /* Initialize coeff pointer */ + pk = (pCoeffs); + + /* Initialize state pointer */ + px = pState; + + /* read g0(n-1) from state buffer */ + gcurnt = *px; + + /* for sample 1 processing */ + /* f1(n) = f0(n) + K1 * g0(n-1) */ + fnext = ((gcurnt * (*pk)) >> 15U) + fcurnt; + fnext = __SSAT(fnext, 16); + + + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext = ((fcurnt * (*pk++)) >> 15U) + gcurnt; + gnext = __SSAT(gnext, 16); + + /* save f0(n) in state buffer */ + *px++ = (q15_t) fcurnt; + + /* f1(n) is saved in fcurnt + for next stage processing */ + fcurnt = fnext; + + stageCnt = (numStages - 1U); + + /* stage loop */ + while (stageCnt > 0U) + { + /* read g1(n-1) from state buffer */ + gcurnt = *px; + + /* save g0(n-1) in state buffer */ + *px++ = (q15_t) gnext; + + /* Sample processing for K2, K3.... */ + /* f2(n) = f1(n) + K2 * g1(n-1) */ + fnext = ((gcurnt * (*pk)) >> 15U) + fcurnt; + fnext = __SSAT(fnext, 16); + + /* g2(n) = f1(n) * K2 + g1(n-1) */ + gnext = ((fcurnt * (*pk++)) >> 15U) + gcurnt; + gnext = __SSAT(gnext, 16); + + + /* f1(n) is saved in fcurnt + for next stage processing */ + fcurnt = fnext; + + stageCnt--; + + } + + /* y(n) = fN(n) */ + *pDst++ = __SSAT(fcurnt, 16); + + + blkCnt--; + + } + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + +/** + * @} end of FIR_Lattice group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_q31.c new file mode 100644 index 0000000..9d52bbc --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_q31.c @@ -0,0 +1,341 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_lattice_q31.c + * Description: Q31 FIR lattice filter processing function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Lattice + * @{ + */ + + +/** + * @brief Processing function for the Q31 FIR lattice filter. + * @param[in] *S points to an instance of the Q31 FIR lattice structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data + * @param[in] blockSize number of samples to process. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * In order to avoid overflows the input signal must be scaled down by 2*log2(numStages) bits. + */ + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + +void arm_fir_lattice_q31( + const arm_fir_lattice_instance_q31 * S, + q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + q31_t *pState; /* State pointer */ + q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q31_t *px; /* temporary state pointer */ + q31_t *pk; /* temporary coefficient pointer */ + q31_t fcurr1, fnext1, gcurr1 = 0, gnext1; /* temporary variables for first sample in loop unrolling */ + q31_t fcurr2, fnext2, gnext2; /* temporary variables for second sample in loop unrolling */ + uint32_t numStages = S->numStages; /* Length of the filter */ + uint32_t blkCnt, stageCnt; /* temporary variables for counts */ + q31_t k; + + pState = &S->pState[0]; + + blkCnt = blockSize >> 1U; + + /* First part of the processing with loop unrolling. Compute 2 outputs at a time. + a second loop below computes the remaining 1 sample. */ + while (blkCnt > 0U) + { + /* f0(n) = x(n) */ + fcurr1 = *pSrc++; + + /* f0(n) = x(n) */ + fcurr2 = *pSrc++; + + /* Initialize coeff pointer */ + pk = (pCoeffs); + + /* Initialize state pointer */ + px = pState; + + /* read g0(n - 1) from state buffer */ + gcurr1 = *px; + + /* Read the reflection coefficient */ + k = *pk++; + + /* for sample 1 processing */ + /* f1(n) = f0(n) + K1 * g0(n-1) */ + fnext1 = (q31_t) (((q63_t) gcurr1 * k) >> 32); + + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext1 = (q31_t) (((q63_t) fcurr1 * (k)) >> 32); + fnext1 = fcurr1 + (fnext1 << 1U); + gnext1 = gcurr1 + (gnext1 << 1U); + + /* for sample 1 processing */ + /* f1(n) = f0(n) + K1 * g0(n-1) */ + fnext2 = (q31_t) (((q63_t) fcurr1 * k) >> 32); + + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext2 = (q31_t) (((q63_t) fcurr2 * (k)) >> 32); + fnext2 = fcurr2 + (fnext2 << 1U); + gnext2 = fcurr1 + (gnext2 << 1U); + + /* save g1(n) in state buffer */ + *px++ = fcurr2; + + /* f1(n) is saved in fcurr1 + for next stage processing */ + fcurr1 = fnext1; + fcurr2 = fnext2; + + stageCnt = (numStages - 1U); + + /* stage loop */ + while (stageCnt > 0U) + { + + /* Read the reflection coefficient */ + k = *pk++; + + /* read g2(n) from state buffer */ + gcurr1 = *px; + + /* save g1(n) in state buffer */ + *px++ = gnext2; + + /* Sample processing for K2, K3.... */ + /* f2(n) = f1(n) + K2 * g1(n-1) */ + fnext1 = (q31_t) (((q63_t) gcurr1 * k) >> 32); + fnext2 = (q31_t) (((q63_t) gnext1 * k) >> 32); + + fnext1 = fcurr1 + (fnext1 << 1U); + fnext2 = fcurr2 + (fnext2 << 1U); + + /* g2(n) = f1(n) * K2 + g1(n-1) */ + gnext2 = (q31_t) (((q63_t) fcurr2 * (k)) >> 32); + gnext2 = gnext1 + (gnext2 << 1U); + + /* g2(n) = f1(n) * K2 + g1(n-1) */ + gnext1 = (q31_t) (((q63_t) fcurr1 * (k)) >> 32); + gnext1 = gcurr1 + (gnext1 << 1U); + + /* f1(n) is saved in fcurr1 + for next stage processing */ + fcurr1 = fnext1; + fcurr2 = fnext2; + + stageCnt--; + + } + + /* y(n) = fN(n) */ + *pDst++ = fcurr1; + *pDst++ = fcurr2; + + blkCnt--; + + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x2U; + + while (blkCnt > 0U) + { + /* f0(n) = x(n) */ + fcurr1 = *pSrc++; + + /* Initialize coeff pointer */ + pk = (pCoeffs); + + /* Initialize state pointer */ + px = pState; + + /* read g0(n - 1) from state buffer */ + gcurr1 = *px; + + /* Read the reflection coefficient */ + k = *pk++; + + /* for sample 1 processing */ + /* f1(n) = f0(n) + K1 * g0(n-1) */ + fnext1 = (q31_t) (((q63_t) gcurr1 * k) >> 32); + fnext1 = fcurr1 + (fnext1 << 1U); + + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext1 = (q31_t) (((q63_t) fcurr1 * (k)) >> 32); + gnext1 = gcurr1 + (gnext1 << 1U); + + /* save g1(n) in state buffer */ + *px++ = fcurr1; + + /* f1(n) is saved in fcurr1 + for next stage processing */ + fcurr1 = fnext1; + + stageCnt = (numStages - 1U); + + /* stage loop */ + while (stageCnt > 0U) + { + /* Read the reflection coefficient */ + k = *pk++; + + /* read g2(n) from state buffer */ + gcurr1 = *px; + + /* save g1(n) in state buffer */ + *px++ = gnext1; + + /* Sample processing for K2, K3.... */ + /* f2(n) = f1(n) + K2 * g1(n-1) */ + fnext1 = (q31_t) (((q63_t) gcurr1 * k) >> 32); + fnext1 = fcurr1 + (fnext1 << 1U); + + /* g2(n) = f1(n) * K2 + g1(n-1) */ + gnext1 = (q31_t) (((q63_t) fcurr1 * (k)) >> 32); + gnext1 = gcurr1 + (gnext1 << 1U); + + /* f1(n) is saved in fcurr1 + for next stage processing */ + fcurr1 = fnext1; + + stageCnt--; + + } + + + /* y(n) = fN(n) */ + *pDst++ = fcurr1; + + blkCnt--; + + } + + +} + + +#else + +/* Run the below code for Cortex-M0 */ + +void arm_fir_lattice_q31( + const arm_fir_lattice_instance_q31 * S, + q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + q31_t *pState; /* State pointer */ + q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q31_t *px; /* temporary state pointer */ + q31_t *pk; /* temporary coefficient pointer */ + q31_t fcurr, fnext, gcurr, gnext; /* temporary variables */ + uint32_t numStages = S->numStages; /* Length of the filter */ + uint32_t blkCnt, stageCnt; /* temporary variables for counts */ + + pState = &S->pState[0]; + + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* f0(n) = x(n) */ + fcurr = *pSrc++; + + /* Initialize coeff pointer */ + pk = (pCoeffs); + + /* Initialize state pointer */ + px = pState; + + /* read g0(n-1) from state buffer */ + gcurr = *px; + + /* for sample 1 processing */ + /* f1(n) = f0(n) + K1 * g0(n-1) */ + fnext = (q31_t) (((q63_t) gcurr * (*pk)) >> 31) + fcurr; + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext = (q31_t) (((q63_t) fcurr * (*pk++)) >> 31) + gcurr; + /* save g1(n) in state buffer */ + *px++ = fcurr; + + /* f1(n) is saved in fcurr1 + for next stage processing */ + fcurr = fnext; + + stageCnt = (numStages - 1U); + + /* stage loop */ + while (stageCnt > 0U) + { + /* read g2(n) from state buffer */ + gcurr = *px; + + /* save g1(n) in state buffer */ + *px++ = gnext; + + /* Sample processing for K2, K3.... */ + /* f2(n) = f1(n) + K2 * g1(n-1) */ + fnext = (q31_t) (((q63_t) gcurr * (*pk)) >> 31) + fcurr; + /* g2(n) = f1(n) * K2 + g1(n-1) */ + gnext = (q31_t) (((q63_t) fcurr * (*pk++)) >> 31) + gcurr; + + /* f1(n) is saved in fcurr1 + for next stage processing */ + fcurr = fnext; + + stageCnt--; + + } + + /* y(n) = fN(n) */ + *pDst++ = fcurr; + + blkCnt--; + + } + +} + +#endif /* #if defined (ARM_MATH_DSP) */ + + +/** + * @} end of FIR_Lattice group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q15.c new file mode 100644 index 0000000..a979783 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q15.c @@ -0,0 +1,679 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_q15.c + * Description: Q15 FIR filter processing function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR + * @{ + */ + +/** + * @brief Processing function for the Q15 FIR filter. + * @param[in] *S points to an instance of the Q15 FIR structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process per call. + * @return none. + * + * + * \par Restrictions + * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE + * In this case input, output, state buffers should be aligned by 32-bit + * + * Scaling and Overflow Behavior: + * \par + * The function is implemented using a 64-bit internal accumulator. + * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result. + * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. + * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved. + * After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits. + * Lastly, the accumulator is saturated to yield a result in 1.15 format. + * + * \par + * Refer to the function arm_fir_fast_q15() for a faster but less precise implementation of this function. + */ + +#if defined (ARM_MATH_DSP) + +/* Run the below code for Cortex-M4 and Cortex-M3 */ + +#ifndef UNALIGNED_SUPPORT_DISABLE + + +void arm_fir_q15( + const arm_fir_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + q15_t *px1; /* Temporary q15 pointer for state buffer */ + q15_t *pb; /* Temporary pointer for coefficient buffer */ + q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold SIMD state and coefficient values */ + q63_t acc0, acc1, acc2, acc3; /* Accumulators */ + uint32_t numTaps = S->numTaps; /* Number of taps in the filter */ + uint32_t tapCnt, blkCnt; /* Loop counters */ + + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1U)]); + + /* Apply loop unrolling and compute 4 output values simultaneously. + * The variables acc0 ... acc3 hold output values that are being computed: + * + * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] + * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] + * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] + * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] + */ + + blkCnt = blockSize >> 2; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (blkCnt > 0U) + { + /* Copy four new input samples into the state buffer. + ** Use 32-bit SIMD to move the 16-bit data. Only requires two copies. */ + *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++; + *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++; + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Initialize state pointer of type q15 */ + px1 = pState; + + /* Initialize coeff pointer of type q31 */ + pb = pCoeffs; + + /* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */ + x0 = _SIMD32_OFFSET(px1); + + /* Read the third and forth samples from the state buffer: x[n-N-1], x[n-N-2] */ + x1 = _SIMD32_OFFSET(px1 + 1U); + + px1 += 2U; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + tapCnt = numTaps >> 2; + + while (tapCnt > 0U) + { + /* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */ + c0 = *__SIMD32(pb)++; + + /* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */ + acc0 = __SMLALD(x0, c0, acc0); + + /* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */ + acc1 = __SMLALD(x1, c0, acc1); + + /* Read state x[n-N-2], x[n-N-3] */ + x2 = _SIMD32_OFFSET(px1); + + /* Read state x[n-N-3], x[n-N-4] */ + x3 = _SIMD32_OFFSET(px1 + 1U); + + /* acc2 += b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */ + acc2 = __SMLALD(x2, c0, acc2); + + /* acc3 += b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */ + acc3 = __SMLALD(x3, c0, acc3); + + /* Read coefficients b[N-2], b[N-3] */ + c0 = *__SIMD32(pb)++; + + /* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */ + acc0 = __SMLALD(x2, c0, acc0); + + /* acc1 += b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */ + acc1 = __SMLALD(x3, c0, acc1); + + /* Read state x[n-N-4], x[n-N-5] */ + x0 = _SIMD32_OFFSET(px1 + 2U); + + /* Read state x[n-N-5], x[n-N-6] */ + x1 = _SIMD32_OFFSET(px1 + 3U); + + /* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */ + acc2 = __SMLALD(x0, c0, acc2); + + /* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */ + acc3 = __SMLALD(x1, c0, acc3); + + px1 += 4U; + + tapCnt--; + + } + + + /* If the filter length is not a multiple of 4, compute the remaining filter taps. + ** This is always be 2 taps since the filter length is even. */ + if ((numTaps & 0x3U) != 0U) + { + /* Read 2 coefficients */ + c0 = *__SIMD32(pb)++; + + /* Fetch 4 state variables */ + x2 = _SIMD32_OFFSET(px1); + + x3 = _SIMD32_OFFSET(px1 + 1U); + + /* Perform the multiply-accumulates */ + acc0 = __SMLALD(x0, c0, acc0); + + px1 += 2U; + + acc1 = __SMLALD(x1, c0, acc1); + acc2 = __SMLALD(x2, c0, acc2); + acc3 = __SMLALD(x3, c0, acc3); + } + + /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation. + ** Then store the 4 outputs in the destination buffer. */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); + +#else + + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + + + /* Advance the state pointer by 4 to process the next group of 4 samples */ + pState = pState + 4; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x4U; + while (blkCnt > 0U) + { + /* Copy two samples into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc0 = 0; + + /* Initialize state pointer of type q15 */ + px1 = pState; + + /* Initialize coeff pointer of type q31 */ + pb = pCoeffs; + + tapCnt = numTaps >> 1; + + do + { + + c0 = *__SIMD32(pb)++; + x0 = *__SIMD32(px1)++; + + acc0 = __SMLALD(x0, c0, acc0); + tapCnt--; + } + while (tapCnt > 0U); + + /* The result is in 2.30 format. Convert to 1.15 with saturation. + ** Then store the output in the destination buffer. */ + *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + /* Calculation of count for copying integer writes */ + tapCnt = (numTaps - 1U) >> 2; + + while (tapCnt > 0U) + { + + /* Copy state values to start of state buffer */ + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + + tapCnt--; + + } + + /* Calculation of count for remaining q15_t data */ + tapCnt = (numTaps - 1U) % 0x4U; + + /* copy remaining data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } +} + +#else /* UNALIGNED_SUPPORT_DISABLE */ + +void arm_fir_q15( + const arm_fir_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + q63_t acc0, acc1, acc2, acc3; /* Accumulators */ + q15_t *pb; /* Temporary pointer for coefficient buffer */ + q15_t *px; /* Temporary q31 pointer for SIMD state buffer accesses */ + q31_t x0, x1, x2, c0; /* Temporary variables to hold SIMD state and coefficient values */ + uint32_t numTaps = S->numTaps; /* Number of taps in the filter */ + uint32_t tapCnt, blkCnt; /* Loop counters */ + + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1U)]); + + /* Apply loop unrolling and compute 4 output values simultaneously. + * The variables acc0 ... acc3 hold output values that are being computed: + * + * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] + * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] + * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] + * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] + */ + + blkCnt = blockSize >> 2; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (blkCnt > 0U) + { + /* Copy four new input samples into the state buffer. + ** Use 32-bit SIMD to move the 16-bit data. Only requires two copies. */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Typecast q15_t pointer to q31_t pointer for state reading in q31_t */ + px = pState; + + /* Typecast q15_t pointer to q31_t pointer for coefficient reading in q31_t */ + pb = pCoeffs; + + /* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */ + x0 = *__SIMD32(px)++; + + /* Read the third and forth samples from the state buffer: x[n-N-2], x[n-N-3] */ + x2 = *__SIMD32(px)++; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-(numTaps%4) coefficients. */ + tapCnt = numTaps >> 2; + + while (tapCnt > 0) + { + /* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */ + c0 = *__SIMD32(pb)++; + + /* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */ + acc0 = __SMLALD(x0, c0, acc0); + + /* acc2 += b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */ + acc2 = __SMLALD(x2, c0, acc2); + + /* pack x[n-N-1] and x[n-N-2] */ +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(x2, x0, 0); +#else + x1 = __PKHBT(x0, x2, 0); +#endif + + /* Read state x[n-N-4], x[n-N-5] */ + x0 = _SIMD32_OFFSET(px); + + /* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */ + acc1 = __SMLALDX(x1, c0, acc1); + + /* pack x[n-N-3] and x[n-N-4] */ +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(x0, x2, 0); +#else + x1 = __PKHBT(x2, x0, 0); +#endif + + /* acc3 += b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */ + acc3 = __SMLALDX(x1, c0, acc3); + + /* Read coefficients b[N-2], b[N-3] */ + c0 = *__SIMD32(pb)++; + + /* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */ + acc0 = __SMLALD(x2, c0, acc0); + + /* Read state x[n-N-6], x[n-N-7] with offset */ + x2 = _SIMD32_OFFSET(px + 2U); + + /* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */ + acc2 = __SMLALD(x0, c0, acc2); + + /* acc1 += b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */ + acc1 = __SMLALDX(x1, c0, acc1); + + /* pack x[n-N-5] and x[n-N-6] */ +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(x2, x0, 0); +#else + x1 = __PKHBT(x0, x2, 0); +#endif + + /* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */ + acc3 = __SMLALDX(x1, c0, acc3); + + /* Update state pointer for next state reading */ + px += 4U; + + /* Decrement tap count */ + tapCnt--; + + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps. + ** This is always be 2 taps since the filter length is even. */ + if ((numTaps & 0x3U) != 0U) + { + + /* Read last two coefficients */ + c0 = *__SIMD32(pb)++; + + /* Perform the multiply-accumulates */ + acc0 = __SMLALD(x0, c0, acc0); + acc2 = __SMLALD(x2, c0, acc2); + + /* pack state variables */ +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(x2, x0, 0); +#else + x1 = __PKHBT(x0, x2, 0); +#endif + + /* Read last state variables */ + x0 = *__SIMD32(px); + + /* Perform the multiply-accumulates */ + acc1 = __SMLALDX(x1, c0, acc1); + + /* pack state variables */ +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(x0, x2, 0); +#else + x1 = __PKHBT(x2, x0, 0); +#endif + + /* Perform the multiply-accumulates */ + acc3 = __SMLALDX(x1, c0, acc3); + } + + /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation. + ** Then store the 4 outputs in the destination buffer. */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); + + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); + +#else + + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); + + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Advance the state pointer by 4 to process the next group of 4 samples */ + pState = pState + 4; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x4U; + while (blkCnt > 0U) + { + /* Copy two samples into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc0 = 0; + + /* Use SIMD to hold states and coefficients */ + px = pState; + pb = pCoeffs; + + tapCnt = numTaps >> 1U; + + do + { + acc0 += (q31_t) * px++ * *pb++; + acc0 += (q31_t) * px++ * *pb++; + tapCnt--; + } + while (tapCnt > 0U); + + /* The result is in 2.30 format. Convert to 1.15 with saturation. + ** Then store the output in the destination buffer. */ + *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1U; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + /* Calculation of count for copying integer writes */ + tapCnt = (numTaps - 1U) >> 2; + + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + tapCnt--; + + } + + /* Calculation of count for remaining q15_t data */ + tapCnt = (numTaps - 1U) % 0x4U; + + /* copy remaining data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } +} + + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + +#else /* ARM_MATH_CM0_FAMILY */ + + +/* Run the below code for Cortex-M0 */ + +void arm_fir_q15( + const arm_fir_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + + + + q15_t *px; /* Temporary pointer for state buffer */ + q15_t *pb; /* Temporary pointer for coefficient buffer */ + q63_t acc; /* Accumulator */ + uint32_t numTaps = S->numTaps; /* Number of nTaps in the filter */ + uint32_t tapCnt, blkCnt; /* Loop counters */ + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1U)]); + + /* Initialize blkCnt with blockSize */ + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize Coefficient pointer */ + pb = pCoeffs; + + tapCnt = numTaps; + + /* Perform the multiply-accumulates */ + do + { + /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ + acc += (q31_t) * px++ * *pb++; + tapCnt--; + } while (tapCnt > 0U); + + /* The result is in 2.30 format. Convert to 1.15 + ** Then store the output in the destination buffer. */ + *pDst++ = (q15_t) __SSAT((acc >> 15U), 16); + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the samples loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + /* Copy numTaps number of values */ + tapCnt = (numTaps - 1U); + + /* copy data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +} + +#endif /* #if defined (ARM_MATH_DSP) */ + + + + +/** + * @} end of FIR group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q31.c new file mode 100644 index 0000000..b0a2723 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q31.c @@ -0,0 +1,353 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_q31.c + * Description: Q31 FIR filter processing function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR + * @{ + */ + +/** + * @param[in] *S points to an instance of the Q31 FIR filter structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process per call. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * \par + * The function is implemented using an internal 64-bit accumulator. + * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * Thus, if the accumulator result overflows it wraps around rather than clip. + * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits. + * After all multiply-accumulates are performed, the 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result. + * + * \par + * Refer to the function arm_fir_fast_q31() for a faster but less precise implementation of this filter for Cortex-M3 and Cortex-M4. + */ + +void arm_fir_q31( + const arm_fir_instance_q31 * S, + q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + q31_t *pState = S->pState; /* State pointer */ + q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q31_t *pStateCurnt; /* Points to the current sample of the state */ + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q31_t x0, x1, x2; /* Temporary variables to hold state */ + q31_t c0; /* Temporary variable to hold coefficient value */ + q31_t *px; /* Temporary pointer for state */ + q31_t *pb; /* Temporary pointer for coefficient buffer */ + q63_t acc0, acc1, acc2; /* Accumulators */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t i, tapCnt, blkCnt, tapCntN3; /* Loop counters */ + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1U)]); + + /* Apply loop unrolling and compute 4 output values simultaneously. + * The variables acc0 ... acc3 hold output values that are being computed: + * + * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] + * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] + * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] + * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] + */ + blkCnt = blockSize / 3; + blockSize = blockSize - (3 * blkCnt); + + tapCnt = numTaps / 3; + tapCntN3 = numTaps - (3 * tapCnt); + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (blkCnt > 0U) + { + /* Copy three new input samples into the state buffer */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coefficient pointer */ + pb = pCoeffs; + + /* Read the first two samples from the state buffer: + * x[n-numTaps], x[n-numTaps-1] */ + x0 = *(px++); + x1 = *(px++); + + /* Loop unrolling. Process 3 taps at a time. */ + i = tapCnt; + + while (i > 0U) + { + /* Read the b[numTaps] coefficient */ + c0 = *pb; + + /* Read x[n-numTaps-2] sample */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += ((q63_t) x0 * c0); + acc1 += ((q63_t) x1 * c0); + acc2 += ((q63_t) x2 * c0); + + /* Read the coefficient and state */ + c0 = *(pb + 1U); + x0 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += ((q63_t) x1 * c0); + acc1 += ((q63_t) x2 * c0); + acc2 += ((q63_t) x0 * c0); + + /* Read the coefficient and state */ + c0 = *(pb + 2U); + x1 = *(px++); + + /* update coefficient pointer */ + pb += 3U; + + /* Perform the multiply-accumulates */ + acc0 += ((q63_t) x2 * c0); + acc1 += ((q63_t) x0 * c0); + acc2 += ((q63_t) x1 * c0); + + /* Decrement the loop counter */ + i--; + } + + /* If the filter length is not a multiple of 3, compute the remaining filter taps */ + + i = tapCntN3; + + while (i > 0U) + { + /* Read coefficients */ + c0 = *(pb++); + + /* Fetch 1 state variable */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += ((q63_t) x0 * c0); + acc1 += ((q63_t) x1 * c0); + acc2 += ((q63_t) x2 * c0); + + /* Reuse the present sample states for next sample */ + x0 = x1; + x1 = x2; + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 3 to process the next group of 3 samples */ + pState = pState + 3; + + /* The results in the 3 accumulators are in 2.30 format. Convert to 1.31 + ** Then store the 3 outputs in the destination buffer. */ + *pDst++ = (q31_t) (acc0 >> 31U); + *pDst++ = (q31_t) (acc1 >> 31U); + *pDst++ = (q31_t) (acc2 >> 31U); + + /* Decrement the samples loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 3, compute any remaining output samples here. + ** No loop unrolling is used. */ + + while (blockSize > 0U) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc0 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize Coefficient pointer */ + pb = (pCoeffs); + + i = numTaps; + + /* Perform the multiply-accumulates */ + do + { + acc0 += (q63_t) * (px++) * (*(pb++)); + i--; + } while (i > 0U); + + /* The result is in 2.62 format. Convert to 1.31 + ** Then store the output in the destination buffer. */ + *pDst++ = (q31_t) (acc0 >> 31U); + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the samples loop counter */ + blockSize--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + tapCnt = (numTaps - 1U) >> 2U; + + /* copy data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calculate remaining number of copies */ + tapCnt = (numTaps - 1U) % 0x4U; + + /* Copy the remaining q31_t data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#else + +/* Run the below code for Cortex-M0 */ + + q31_t *px; /* Temporary pointer for state */ + q31_t *pb; /* Temporary pointer for coefficient buffer */ + q63_t acc; /* Accumulator */ + uint32_t numTaps = S->numTaps; /* Length of the filter */ + uint32_t i, tapCnt, blkCnt; /* Loop counters */ + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1U)]); + + /* Initialize blkCnt with blockSize */ + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize Coefficient pointer */ + pb = pCoeffs; + + i = numTaps; + + /* Perform the multiply-accumulates */ + do + { + /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ + acc += (q63_t) * px++ * *pb++; + i--; + } while (i > 0U); + + /* The result is in 2.62 format. Convert to 1.31 + ** Then store the output in the destination buffer. */ + *pDst++ = (q31_t) (acc >> 31U); + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the samples loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the starting of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + /* Copy numTaps number of values */ + tapCnt = numTaps - 1U; + + /* Copy the data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + +/** + * @} end of FIR group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q7.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q7.c new file mode 100644 index 0000000..4f795d7 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q7.c @@ -0,0 +1,385 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_q7.c + * Description: Q7 FIR filter processing function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR + * @{ + */ + +/** + * @param[in] *S points to an instance of the Q7 FIR filter structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process per call. + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * The function is implemented using a 32-bit internal accumulator. + * Both coefficients and state variables are represented in 1.7 format and multiplications yield a 2.14 result. + * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. + * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved. + * The accumulator is converted to 18.7 format by discarding the low 7 bits. + * Finally, the result is truncated to 1.7 format. + */ + +void arm_fir_q7( + const arm_fir_instance_q7 * S, + q7_t * pSrc, + q7_t * pDst, + uint32_t blockSize) +{ + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q7_t *pState = S->pState; /* State pointer */ + q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q7_t *pStateCurnt; /* Points to the current sample of the state */ + q7_t x0, x1, x2, x3; /* Temporary variables to hold state */ + q7_t c0; /* Temporary variable to hold coefficient value */ + q7_t *px; /* Temporary pointer for state */ + q7_t *pb; /* Temporary pointer for coefficient buffer */ + q31_t acc0, acc1, acc2, acc3; /* Accumulators */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t i, tapCnt, blkCnt; /* Loop counters */ + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1U)]); + + /* Apply loop unrolling and compute 4 output values simultaneously. + * The variables acc0 ... acc3 hold output values that are being computed: + * + * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] + * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] + * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] + * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] + */ + blkCnt = blockSize >> 2; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (blkCnt > 0U) + { + /* Copy four new input samples into the state buffer */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coefficient pointer */ + pb = pCoeffs; + + /* Read the first three samples from the state buffer: + * x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */ + x0 = *(px++); + x1 = *(px++); + x2 = *(px++); + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + i = tapCnt; + + while (i > 0U) + { + /* Read the b[numTaps] coefficient */ + c0 = *pb; + + /* Read x[n-numTaps-3] sample */ + x3 = *px; + + /* acc0 += b[numTaps] * x[n-numTaps] */ + acc0 += ((q15_t) x0 * c0); + + /* acc1 += b[numTaps] * x[n-numTaps-1] */ + acc1 += ((q15_t) x1 * c0); + + /* acc2 += b[numTaps] * x[n-numTaps-2] */ + acc2 += ((q15_t) x2 * c0); + + /* acc3 += b[numTaps] * x[n-numTaps-3] */ + acc3 += ((q15_t) x3 * c0); + + /* Read the b[numTaps-1] coefficient */ + c0 = *(pb + 1U); + + /* Read x[n-numTaps-4] sample */ + x0 = *(px + 1U); + + /* Perform the multiply-accumulates */ + acc0 += ((q15_t) x1 * c0); + acc1 += ((q15_t) x2 * c0); + acc2 += ((q15_t) x3 * c0); + acc3 += ((q15_t) x0 * c0); + + /* Read the b[numTaps-2] coefficient */ + c0 = *(pb + 2U); + + /* Read x[n-numTaps-5] sample */ + x1 = *(px + 2U); + + /* Perform the multiply-accumulates */ + acc0 += ((q15_t) x2 * c0); + acc1 += ((q15_t) x3 * c0); + acc2 += ((q15_t) x0 * c0); + acc3 += ((q15_t) x1 * c0); + + /* Read the b[numTaps-3] coefficients */ + c0 = *(pb + 3U); + + /* Read x[n-numTaps-6] sample */ + x2 = *(px + 3U); + + /* Perform the multiply-accumulates */ + acc0 += ((q15_t) x3 * c0); + acc1 += ((q15_t) x0 * c0); + acc2 += ((q15_t) x1 * c0); + acc3 += ((q15_t) x2 * c0); + + /* update coefficient pointer */ + pb += 4U; + px += 4U; + + /* Decrement the loop counter */ + i--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + + i = numTaps - (tapCnt * 4U); + while (i > 0U) + { + /* Read coefficients */ + c0 = *(pb++); + + /* Fetch 1 state variable */ + x3 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += ((q15_t) x0 * c0); + acc1 += ((q15_t) x1 * c0); + acc2 += ((q15_t) x2 * c0); + acc3 += ((q15_t) x3 * c0); + + /* Reuse the present sample states for next sample */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 4 to process the next group of 4 samples */ + pState = pState + 4; + + /* The results in the 4 accumulators are in 2.62 format. Convert to 1.31 + ** Then store the 4 outputs in the destination buffer. */ + acc0 = __SSAT((acc0 >> 7U), 8); + *pDst++ = acc0; + acc1 = __SSAT((acc1 >> 7U), 8); + *pDst++ = acc1; + acc2 = __SSAT((acc2 >> 7U), 8); + *pDst++ = acc2; + acc3 = __SSAT((acc3 >> 7U), 8); + *pDst++ = acc3; + + /* Decrement the samples loop counter */ + blkCnt--; + } + + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 4U; + + while (blkCnt > 0U) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc0 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize Coefficient pointer */ + pb = (pCoeffs); + + i = numTaps; + + /* Perform the multiply-accumulates */ + do + { + acc0 += (q15_t) * (px++) * (*(pb++)); + i--; + } while (i > 0U); + + /* The result is in 2.14 format. Convert to 1.7 + ** Then store the output in the destination buffer. */ + *pDst++ = __SSAT((acc0 >> 7U), 8); + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the samples loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + tapCnt = (numTaps - 1U) >> 2U; + + /* copy data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calculate remaining number of copies */ + tapCnt = (numTaps - 1U) % 0x4U; + + /* Copy the remaining q31_t data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#else + +/* Run the below code for Cortex-M0 */ + + uint32_t numTaps = S->numTaps; /* Number of taps in the filter */ + uint32_t i, blkCnt; /* Loop counters */ + q7_t *pState = S->pState; /* State pointer */ + q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q7_t *px, *pb; /* Temporary pointers to state and coeff */ + q31_t acc = 0; /* Accumlator */ + q7_t *pStateCurnt; /* Points to the current sample of the state */ + + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (numTaps - 1U); + + /* Initialize blkCnt with blockSize */ + blkCnt = blockSize; + + /* Perform filtering upto BlockSize - BlockSize%4 */ + while (blkCnt > 0U) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set accumulator to zero */ + acc = 0; + + /* Initialize state pointer of type q7 */ + px = pState; + + /* Initialize coeff pointer of type q7 */ + pb = pCoeffs; + + + i = numTaps; + + while (i > 0U) + { + /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ + acc += (q15_t) * px++ * *pb++; + i--; + } + + /* Store the 1.7 format filter output in destination buffer */ + *pDst++ = (q7_t) __SSAT((acc >> 7), 8); + + /* Advance the state pointer by 1 to process the next sample */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + + /* Copy numTaps number of values */ + i = (numTaps - 1U); + + /* Copy q7_t data */ + while (i > 0U) + { + *pStateCurnt++ = *pState++; + i--; + } + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + +/** + * @} end of FIR group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_f32.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_f32.c new file mode 100644 index 0000000..fe9aacd --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_f32.c @@ -0,0 +1,433 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_sparse_f32.c + * Description: Floating-point sparse FIR filter processing function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup FIR_Sparse Finite Impulse Response (FIR) Sparse Filters + * + * This group of functions implements sparse FIR filters. + * Sparse FIR filters are equivalent to standard FIR filters except that most of the coefficients are equal to zero. + * Sparse filters are used for simulating reflections in communications and audio applications. + * + * There are separate functions for Q7, Q15, Q31, and floating-point data types. + * The functions operate on blocks of input and output data and each call to the function processes + * blockSize samples through the filter. pSrc and + * pDst points to input and output arrays respectively containing blockSize values. + * + * \par Algorithm: + * The sparse filter instant structure contains an array of tap indices pTapDelay which specifies the locations of the non-zero coefficients. + * This is in addition to the coefficient array b. + * The implementation essentially skips the multiplications by zero and leads to an efficient realization. + *
+ *     y[n] = b[0] * x[n-pTapDelay[0]] + b[1] * x[n-pTapDelay[1]] + b[2] * x[n-pTapDelay[2]] + ...+ b[numTaps-1] * x[n-pTapDelay[numTaps-1]]
+ * 
+ * \par + * \image html FIRSparse.gif "Sparse FIR filter. b[n] represents the filter coefficients" + * \par + * pCoeffs points to a coefficient array of size numTaps; + * pTapDelay points to an array of nonzero indices and is also of size numTaps; + * pState points to a state array of size maxDelay + blockSize, where + * maxDelay is the largest offset value that is ever used in the pTapDelay array. + * Some of the processing functions also require temporary working buffers. + * + * \par Instance Structure + * The coefficients and state variables for a filter are stored together in an instance data structure. + * A separate instance structure must be defined for each filter. + * Coefficient and offset arrays may be shared among several instances while state variable arrays cannot be shared. + * There are separate instance structure declarations for each of the 4 supported data types. + * + * \par Initialization Functions + * There is also an associated initialization function for each data type. + * The initialization function performs the following operations: + * - Sets the values of the internal structure fields. + * - Zeros out the values in the state buffer. + * To do this manually without calling the init function, assign the follow subfields of the instance structure: + * numTaps, pCoeffs, pTapDelay, maxDelay, stateIndex, pState. Also set all of the values in pState to zero. + * + * \par + * Use of the initialization function is optional. + * However, if the initialization function is used, then the instance structure cannot be placed into a const data section. + * To place an instance structure into a const data section, the instance structure must be manually initialized. + * Set the values in the state buffer to zeros before static initialization. + * The code below statically initializes each of the 4 different data type filter instance structures + *
+ *arm_fir_sparse_instance_f32 S = {numTaps, 0, pState, pCoeffs, maxDelay, pTapDelay};
+ *arm_fir_sparse_instance_q31 S = {numTaps, 0, pState, pCoeffs, maxDelay, pTapDelay};
+ *arm_fir_sparse_instance_q15 S = {numTaps, 0, pState, pCoeffs, maxDelay, pTapDelay};
+ *arm_fir_sparse_instance_q7 S =  {numTaps, 0, pState, pCoeffs, maxDelay, pTapDelay};
+ * 
+ * \par + * + * \par Fixed-Point Behavior + * Care must be taken when using the fixed-point versions of the sparse FIR filter functions. + * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered. + * Refer to the function specific documentation below for usage guidelines. + */ + +/** + * @addtogroup FIR_Sparse + * @{ + */ + +/** + * @brief Processing function for the floating-point sparse FIR filter. + * @param[in] *S points to an instance of the floating-point sparse FIR structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data + * @param[in] *pScratchIn points to a temporary buffer of size blockSize. + * @param[in] blockSize number of input samples to process per call. + * @return none. + */ + +void arm_fir_sparse_f32( + arm_fir_sparse_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst, + float32_t * pScratchIn, + uint32_t blockSize) +{ + + float32_t *pState = S->pState; /* State pointer */ + float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float32_t *px; /* Scratch buffer pointer */ + float32_t *py = pState; /* Temporary pointers for state buffer */ + float32_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */ + float32_t *pOut; /* Destination pointer */ + int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */ + uint32_t delaySize = S->maxDelay + blockSize; /* state length */ + uint16_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + int32_t readIndex; /* Read index of the state buffer */ + uint32_t tapCnt, blkCnt; /* loop counters */ + float32_t coeff = *pCoeffs++; /* Read the first coefficient value */ + + + + /* BlockSize of Input samples are copied into the state buffer */ + /* StateIndex points to the starting position to write in the state buffer */ + arm_circularWrite_f32((int32_t *) py, delaySize, &S->stateIndex, 1, + (int32_t *) pSrc, 1, blockSize); + + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if (readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, + (int32_t *) pb, (int32_t *) pb, blockSize, 1, + blockSize); + + /* Working pointer for the scratch buffer */ + px = pb; + + /* Working pointer for destination buffer */ + pOut = pDst; + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + /* Loop over the blockSize. Unroll by a factor of 4. + * Compute 4 Multiplications at a time. */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* Perform Multiplications and store in destination buffer */ + *pOut++ = *px++ * coeff; + *pOut++ = *px++ * coeff; + *pOut++ = *px++ * coeff; + *pOut++ = *px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, + * compute the remaining samples */ + blkCnt = blockSize % 0x4U; + + while (blkCnt > 0U) + { + /* Perform Multiplications and store in destination buffer */ + *pOut++ = *px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if (readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Loop over the number of taps. */ + tapCnt = (uint32_t) numTaps - 2U; + + while (tapCnt > 0U) + { + + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, + (int32_t *) pb, (int32_t *) pb, blockSize, 1, + blockSize); + + /* Working pointer for the scratch buffer */ + px = pb; + + /* Working pointer for destination buffer */ + pOut = pDst; + + /* Loop over the blockSize. Unroll by a factor of 4. + * Compute 4 MACS at a time. */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* Perform Multiply-Accumulate */ + *pOut++ += *px++ * coeff; + *pOut++ += *px++ * coeff; + *pOut++ += *px++ * coeff; + *pOut++ += *px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, + * compute the remaining samples */ + blkCnt = blockSize % 0x4U; + + while (blkCnt > 0U) + { + /* Perform Multiply-Accumulate */ + *pOut++ += *px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = ((int32_t) S->stateIndex - + (int32_t) blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if (readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Decrement the tap loop counter */ + tapCnt--; + } + + /* Compute last tap without the final read of pTapDelay */ + + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, + (int32_t *) pb, (int32_t *) pb, blockSize, 1, + blockSize); + + /* Working pointer for the scratch buffer */ + px = pb; + + /* Working pointer for destination buffer */ + pOut = pDst; + + /* Loop over the blockSize. Unroll by a factor of 4. + * Compute 4 MACS at a time. */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* Perform Multiply-Accumulate */ + *pOut++ += *px++ * coeff; + *pOut++ += *px++ * coeff; + *pOut++ += *px++ * coeff; + *pOut++ += *px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, + * compute the remaining samples */ + blkCnt = blockSize % 0x4U; + + while (blkCnt > 0U) + { + /* Perform Multiply-Accumulate */ + *pOut++ += *px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + +#else + +/* Run the below code for Cortex-M0 */ + + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* Perform Multiplications and store in destination buffer */ + *pOut++ = *px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if (readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Loop over the number of taps. */ + tapCnt = (uint32_t) numTaps - 2U; + + while (tapCnt > 0U) + { + + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, + (int32_t *) pb, (int32_t *) pb, blockSize, 1, + blockSize); + + /* Working pointer for the scratch buffer */ + px = pb; + + /* Working pointer for destination buffer */ + pOut = pDst; + + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* Perform Multiply-Accumulate */ + *pOut++ += *px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = + ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if (readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Decrement the tap loop counter */ + tapCnt--; + } + + /* Compute last tap without the final read of pTapDelay */ + + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, + (int32_t *) pb, (int32_t *) pb, blockSize, 1, + blockSize); + + /* Working pointer for the scratch buffer */ + px = pb; + + /* Working pointer for destination buffer */ + pOut = pDst; + + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* Perform Multiply-Accumulate */ + *pOut++ += *px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + +/** + * @} end of FIR_Sparse group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_f32.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_f32.c new file mode 100644 index 0000000..191f8bb --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_f32.c @@ -0,0 +1,95 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_sparse_init_f32.c + * Description: Floating-point sparse FIR filter initialization function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Sparse + * @{ + */ + +/** + * @brief Initialization function for the floating-point sparse FIR filter. + * @param[in,out] *S points to an instance of the floating-point sparse FIR structure. + * @param[in] numTaps number of nonzero coefficients in the filter. + * @param[in] *pCoeffs points to the array of filter coefficients. + * @param[in] *pState points to the state buffer. + * @param[in] *pTapDelay points to the array of offset times. + * @param[in] maxDelay maximum offset time supported. + * @param[in] blockSize number of samples that will be processed per block. + * @return none + * + * Description: + * \par + * pCoeffs holds the filter coefficients and has length numTaps. + * pState holds the filter's state variables and must be of length + * maxDelay + blockSize, where maxDelay + * is the maximum number of delay line values. + * blockSize is the + * number of samples processed by the arm_fir_sparse_f32() function. + */ + +void arm_fir_sparse_init_f32( + arm_fir_sparse_instance_f32 * S, + uint16_t numTaps, + float32_t * pCoeffs, + float32_t * pState, + int32_t * pTapDelay, + uint16_t maxDelay, + uint32_t blockSize) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Assign TapDelay pointer */ + S->pTapDelay = pTapDelay; + + /* Assign MaxDelay */ + S->maxDelay = maxDelay; + + /* reset the stateIndex to 0 */ + S->stateIndex = 0U; + + /* Clear state buffer and size is always maxDelay + blockSize */ + memset(pState, 0, (maxDelay + blockSize) * sizeof(float32_t)); + + /* Assign state pointer */ + S->pState = pState; + +} + +/** + * @} end of FIR_Sparse group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q15.c new file mode 100644 index 0000000..297c5fa --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q15.c @@ -0,0 +1,95 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_sparse_init_q15.c + * Description: Q15 sparse FIR filter initialization function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Sparse + * @{ + */ + +/** + * @brief Initialization function for the Q15 sparse FIR filter. + * @param[in,out] *S points to an instance of the Q15 sparse FIR structure. + * @param[in] numTaps number of nonzero coefficients in the filter. + * @param[in] *pCoeffs points to the array of filter coefficients. + * @param[in] *pState points to the state buffer. + * @param[in] *pTapDelay points to the array of offset times. + * @param[in] maxDelay maximum offset time supported. + * @param[in] blockSize number of samples that will be processed per block. + * @return none + * + * Description: + * \par + * pCoeffs holds the filter coefficients and has length numTaps. + * pState holds the filter's state variables and must be of length + * maxDelay + blockSize, where maxDelay + * is the maximum number of delay line values. + * blockSize is the + * number of words processed by arm_fir_sparse_q15() function. + */ + +void arm_fir_sparse_init_q15( + arm_fir_sparse_instance_q15 * S, + uint16_t numTaps, + q15_t * pCoeffs, + q15_t * pState, + int32_t * pTapDelay, + uint16_t maxDelay, + uint32_t blockSize) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Assign TapDelay pointer */ + S->pTapDelay = pTapDelay; + + /* Assign MaxDelay */ + S->maxDelay = maxDelay; + + /* reset the stateIndex to 0 */ + S->stateIndex = 0U; + + /* Clear state buffer and size is always maxDelay + blockSize */ + memset(pState, 0, (maxDelay + blockSize) * sizeof(q15_t)); + + /* Assign state pointer */ + S->pState = pState; + +} + +/** + * @} end of FIR_Sparse group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q31.c new file mode 100644 index 0000000..3eb8d47 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q31.c @@ -0,0 +1,94 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_sparse_init_q31.c + * Description: Q31 sparse FIR filter initialization function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Sparse + * @{ + */ + +/** + * @brief Initialization function for the Q31 sparse FIR filter. + * @param[in,out] *S points to an instance of the Q31 sparse FIR structure. + * @param[in] numTaps number of nonzero coefficients in the filter. + * @param[in] *pCoeffs points to the array of filter coefficients. + * @param[in] *pState points to the state buffer. + * @param[in] *pTapDelay points to the array of offset times. + * @param[in] maxDelay maximum offset time supported. + * @param[in] blockSize number of samples that will be processed per block. + * @return none + * + * Description: + * \par + * pCoeffs holds the filter coefficients and has length numTaps. + * pState holds the filter's state variables and must be of length + * maxDelay + blockSize, where maxDelay + * is the maximum number of delay line values. + * blockSize is the number of words processed by arm_fir_sparse_q31() function. + */ + +void arm_fir_sparse_init_q31( + arm_fir_sparse_instance_q31 * S, + uint16_t numTaps, + q31_t * pCoeffs, + q31_t * pState, + int32_t * pTapDelay, + uint16_t maxDelay, + uint32_t blockSize) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Assign TapDelay pointer */ + S->pTapDelay = pTapDelay; + + /* Assign MaxDelay */ + S->maxDelay = maxDelay; + + /* reset the stateIndex to 0 */ + S->stateIndex = 0U; + + /* Clear state buffer and size is always maxDelay + blockSize */ + memset(pState, 0, (maxDelay + blockSize) * sizeof(q31_t)); + + /* Assign state pointer */ + S->pState = pState; + +} + +/** + * @} end of FIR_Sparse group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q7.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q7.c new file mode 100644 index 0000000..c2cb7b0 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q7.c @@ -0,0 +1,95 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_sparse_init_q7.c + * Description: Q7 sparse FIR filter initialization function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Sparse + * @{ + */ + +/** + * @brief Initialization function for the Q7 sparse FIR filter. + * @param[in,out] *S points to an instance of the Q7 sparse FIR structure. + * @param[in] numTaps number of nonzero coefficients in the filter. + * @param[in] *pCoeffs points to the array of filter coefficients. + * @param[in] *pState points to the state buffer. + * @param[in] *pTapDelay points to the array of offset times. + * @param[in] maxDelay maximum offset time supported. + * @param[in] blockSize number of samples that will be processed per block. + * @return none + * + * Description: + * \par + * pCoeffs holds the filter coefficients and has length numTaps. + * pState holds the filter's state variables and must be of length + * maxDelay + blockSize, where maxDelay + * is the maximum number of delay line values. + * blockSize is the + * number of samples processed by the arm_fir_sparse_q7() function. + */ + +void arm_fir_sparse_init_q7( + arm_fir_sparse_instance_q7 * S, + uint16_t numTaps, + q7_t * pCoeffs, + q7_t * pState, + int32_t * pTapDelay, + uint16_t maxDelay, + uint32_t blockSize) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Assign TapDelay pointer */ + S->pTapDelay = pTapDelay; + + /* Assign MaxDelay */ + S->maxDelay = maxDelay; + + /* reset the stateIndex to 0 */ + S->stateIndex = 0U; + + /* Clear state buffer and size is always maxDelay + blockSize */ + memset(pState, 0, (maxDelay + blockSize) * sizeof(q7_t)); + + /* Assign state pointer */ + S->pState = pState; + +} + +/** + * @} end of FIR_Sparse group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q15.c new file mode 100644 index 0000000..663b6e0 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q15.c @@ -0,0 +1,470 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_sparse_q15.c + * Description: Q15 sparse FIR filter processing function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @addtogroup FIR_Sparse + * @{ + */ + +/** + * @brief Processing function for the Q15 sparse FIR filter. + * @param[in] *S points to an instance of the Q15 sparse FIR structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data + * @param[in] *pScratchIn points to a temporary buffer of size blockSize. + * @param[in] *pScratchOut points to a temporary buffer of size blockSize. + * @param[in] blockSize number of input samples to process per call. + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * The function is implemented using an internal 32-bit accumulator. + * The 1.15 x 1.15 multiplications yield a 2.30 result and these are added to a 2.30 accumulator. + * Thus the full precision of the multiplications is maintained but there is only a single guard bit in the accumulator. + * If the accumulator result overflows it will wrap around rather than saturate. + * After all multiply-accumulates are performed, the 2.30 accumulator is truncated to 2.15 format and then saturated to 1.15 format. + * In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits. + */ + + +void arm_fir_sparse_q15( + arm_fir_sparse_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + q15_t * pScratchIn, + q31_t * pScratchOut, + uint32_t blockSize) +{ + + q15_t *pState = S->pState; /* State pointer */ + q15_t *pIn = pSrc; /* Working pointer for input */ + q15_t *pOut = pDst; /* Working pointer for output */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *px; /* Temporary pointers for scratch buffer */ + q15_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */ + q15_t *py = pState; /* Temporary pointers for state buffer */ + int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */ + uint32_t delaySize = S->maxDelay + blockSize; /* state length */ + uint16_t numTaps = S->numTaps; /* Filter order */ + int32_t readIndex; /* Read index of the state buffer */ + uint32_t tapCnt, blkCnt; /* loop counters */ + q15_t coeff = *pCoeffs++; /* Read the first coefficient value */ + q31_t *pScr2 = pScratchOut; /* Working pointer for pScratchOut */ + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q31_t in1, in2; /* Temporary variables */ + + + /* BlockSize of Input samples are copied into the state buffer */ + /* StateIndex points to the starting position to write in the state buffer */ + arm_circularWrite_q15(py, delaySize, &S->stateIndex, 1, pIn, 1, blockSize); + + /* Loop over the number of taps. */ + tapCnt = numTaps; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = (S->stateIndex - blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if (readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_q15(py, delaySize, &readIndex, 1, + pb, pb, blockSize, 1, blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pScratchOut = pScr2; + + /* Loop over the blockSize. Unroll by a factor of 4. + * Compute 4 multiplications at a time. */ + blkCnt = blockSize >> 2; + + while (blkCnt > 0U) + { + /* Perform multiplication and store in the scratch buffer */ + *pScratchOut++ = ((q31_t) * px++ * coeff); + *pScratchOut++ = ((q31_t) * px++ * coeff); + *pScratchOut++ = ((q31_t) * px++ * coeff); + *pScratchOut++ = ((q31_t) * px++ * coeff); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, + * compute the remaining samples */ + blkCnt = blockSize % 0x4U; + + while (blkCnt > 0U) + { + /* Perform multiplication and store in the scratch buffer */ + *pScratchOut++ = ((q31_t) * px++ * coeff); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = (S->stateIndex - blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if (readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Loop over the number of taps. */ + tapCnt = (uint32_t) numTaps - 2U; + + while (tapCnt > 0U) + { + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_q15(py, delaySize, &readIndex, 1, + pb, pb, blockSize, 1, blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pScratchOut = pScr2; + + /* Loop over the blockSize. Unroll by a factor of 4. + * Compute 4 MACS at a time. */ + blkCnt = blockSize >> 2; + + while (blkCnt > 0U) + { + /* Perform Multiply-Accumulate */ + *pScratchOut++ += (q31_t) * px++ * coeff; + *pScratchOut++ += (q31_t) * px++ * coeff; + *pScratchOut++ += (q31_t) * px++ * coeff; + *pScratchOut++ += (q31_t) * px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, + * compute the remaining samples */ + blkCnt = blockSize % 0x4U; + + while (blkCnt > 0U) + { + /* Perform Multiply-Accumulate */ + *pScratchOut++ += (q31_t) * px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = (S->stateIndex - blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if (readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Decrement the tap loop counter */ + tapCnt--; + } + + /* Compute last tap without the final read of pTapDelay */ + + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_q15(py, delaySize, &readIndex, 1, + pb, pb, blockSize, 1, blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pScratchOut = pScr2; + + /* Loop over the blockSize. Unroll by a factor of 4. + * Compute 4 MACS at a time. */ + blkCnt = blockSize >> 2; + + while (blkCnt > 0U) + { + /* Perform Multiply-Accumulate */ + *pScratchOut++ += (q31_t) * px++ * coeff; + *pScratchOut++ += (q31_t) * px++ * coeff; + *pScratchOut++ += (q31_t) * px++ * coeff; + *pScratchOut++ += (q31_t) * px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, + * compute the remaining samples */ + blkCnt = blockSize % 0x4U; + + while (blkCnt > 0U) + { + /* Perform Multiply-Accumulate */ + *pScratchOut++ += (q31_t) * px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* All the output values are in pScratchOut buffer. + Convert them into 1.15 format, saturate and store in the destination buffer. */ + /* Loop over the blockSize. */ + blkCnt = blockSize >> 2; + + while (blkCnt > 0U) + { + in1 = *pScr2++; + in2 = *pScr2++; + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pOut)++ = + __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16), + 16); + +#else + *__SIMD32(pOut)++ = + __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16), + 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + in1 = *pScr2++; + + in2 = *pScr2++; + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pOut)++ = + __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16), + 16); + +#else + + *__SIMD32(pOut)++ = + __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16), + 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + + blkCnt--; + + } + + /* If the blockSize is not a multiple of 4, + remaining samples are processed in the below loop */ + blkCnt = blockSize % 0x4U; + + while (blkCnt > 0U) + { + *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16); + blkCnt--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + /* BlockSize of Input samples are copied into the state buffer */ + /* StateIndex points to the starting position to write in the state buffer */ + arm_circularWrite_q15(py, delaySize, &S->stateIndex, 1, pIn, 1, blockSize); + + /* Loop over the number of taps. */ + tapCnt = numTaps; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = (S->stateIndex - blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if (readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_q15(py, delaySize, &readIndex, 1, + pb, pb, blockSize, 1, blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pScratchOut = pScr2; + + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* Perform multiplication and store in the scratch buffer */ + *pScratchOut++ = ((q31_t) * px++ * coeff); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = (S->stateIndex - blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if (readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Loop over the number of taps. */ + tapCnt = (uint32_t) numTaps - 2U; + + while (tapCnt > 0U) + { + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_q15(py, delaySize, &readIndex, 1, + pb, pb, blockSize, 1, blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pScratchOut = pScr2; + + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* Perform Multiply-Accumulate */ + *pScratchOut++ += (q31_t) * px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = (S->stateIndex - blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if (readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Decrement the tap loop counter */ + tapCnt--; + } + + /* Compute last tap without the final read of pTapDelay */ + + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_q15(py, delaySize, &readIndex, 1, + pb, pb, blockSize, 1, blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pScratchOut = pScr2; + + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* Perform Multiply-Accumulate */ + *pScratchOut++ += (q31_t) * px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* All the output values are in pScratchOut buffer. + Convert them into 1.15 format, saturate and store in the destination buffer. */ + /* Loop over the blockSize. */ + blkCnt = blockSize; + + while (blkCnt > 0U) + { + *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16); + blkCnt--; + } + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + +/** + * @} end of FIR_Sparse group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q31.c new file mode 100644 index 0000000..3fd3da0 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q31.c @@ -0,0 +1,450 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_sparse_q31.c + * Description: Q31 sparse FIR filter processing function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + + +/** + * @addtogroup FIR_Sparse + * @{ + */ + +/** + * @brief Processing function for the Q31 sparse FIR filter. + * @param[in] *S points to an instance of the Q31 sparse FIR structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data + * @param[in] *pScratchIn points to a temporary buffer of size blockSize. + * @param[in] blockSize number of input samples to process per call. + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * The function is implemented using an internal 32-bit accumulator. + * The 1.31 x 1.31 multiplications are truncated to 2.30 format. + * This leads to loss of precision on the intermediate multiplications and provides only a single guard bit. + * If the accumulator result overflows, it wraps around rather than saturate. + * In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits. + */ + +void arm_fir_sparse_q31( + arm_fir_sparse_instance_q31 * S, + q31_t * pSrc, + q31_t * pDst, + q31_t * pScratchIn, + uint32_t blockSize) +{ + + q31_t *pState = S->pState; /* State pointer */ + q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q31_t *px; /* Scratch buffer pointer */ + q31_t *py = pState; /* Temporary pointers for state buffer */ + q31_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */ + q31_t *pOut; /* Destination pointer */ + q63_t out; /* Temporary output variable */ + int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */ + uint32_t delaySize = S->maxDelay + blockSize; /* state length */ + uint16_t numTaps = S->numTaps; /* Filter order */ + int32_t readIndex; /* Read index of the state buffer */ + uint32_t tapCnt, blkCnt; /* loop counters */ + q31_t coeff = *pCoeffs++; /* Read the first coefficient value */ + q31_t in; + + + /* BlockSize of Input samples are copied into the state buffer */ + /* StateIndex points to the starting position to write in the state buffer */ + arm_circularWrite_f32((int32_t *) py, delaySize, &S->stateIndex, 1, + (int32_t *) pSrc, 1, blockSize); + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if (readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, + (int32_t *) pb, (int32_t *) pb, blockSize, 1, + blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pOut = pDst; + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + /* Loop over the blockSize. Unroll by a factor of 4. + * Compute 4 Multiplications at a time. */ + blkCnt = blockSize >> 2; + + while (blkCnt > 0U) + { + /* Perform Multiplications and store in the destination buffer */ + *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); + *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); + *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); + *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, + * compute the remaining samples */ + blkCnt = blockSize % 0x4U; + + while (blkCnt > 0U) + { + /* Perform Multiplications and store in the destination buffer */ + *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if (readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Loop over the number of taps. */ + tapCnt = (uint32_t) numTaps - 2U; + + while (tapCnt > 0U) + { + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, + (int32_t *) pb, (int32_t *) pb, blockSize, 1, + blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pOut = pDst; + + /* Loop over the blockSize. Unroll by a factor of 4. + * Compute 4 MACS at a time. */ + blkCnt = blockSize >> 2; + + while (blkCnt > 0U) + { + out = *pOut; + out += ((q63_t) * px++ * coeff) >> 32; + *pOut++ = (q31_t) (out); + + out = *pOut; + out += ((q63_t) * px++ * coeff) >> 32; + *pOut++ = (q31_t) (out); + + out = *pOut; + out += ((q63_t) * px++ * coeff) >> 32; + *pOut++ = (q31_t) (out); + + out = *pOut; + out += ((q63_t) * px++ * coeff) >> 32; + *pOut++ = (q31_t) (out); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, + * compute the remaining samples */ + blkCnt = blockSize % 0x4U; + + while (blkCnt > 0U) + { + /* Perform Multiply-Accumulate */ + out = *pOut; + out += ((q63_t) * px++ * coeff) >> 32; + *pOut++ = (q31_t) (out); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if (readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Decrement the tap loop counter */ + tapCnt--; + } + + /* Compute last tap without the final read of pTapDelay */ + + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, + (int32_t *) pb, (int32_t *) pb, blockSize, 1, + blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pOut = pDst; + + /* Loop over the blockSize. Unroll by a factor of 4. + * Compute 4 MACS at a time. */ + blkCnt = blockSize >> 2; + + while (blkCnt > 0U) + { + out = *pOut; + out += ((q63_t) * px++ * coeff) >> 32; + *pOut++ = (q31_t) (out); + + out = *pOut; + out += ((q63_t) * px++ * coeff) >> 32; + *pOut++ = (q31_t) (out); + + out = *pOut; + out += ((q63_t) * px++ * coeff) >> 32; + *pOut++ = (q31_t) (out); + + out = *pOut; + out += ((q63_t) * px++ * coeff) >> 32; + *pOut++ = (q31_t) (out); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, + * compute the remaining samples */ + blkCnt = blockSize % 0x4U; + + while (blkCnt > 0U) + { + /* Perform Multiply-Accumulate */ + out = *pOut; + out += ((q63_t) * px++ * coeff) >> 32; + *pOut++ = (q31_t) (out); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Working output pointer is updated */ + pOut = pDst; + + /* Output is converted into 1.31 format. */ + /* Loop over the blockSize. Unroll by a factor of 4. + * process 4 output samples at a time. */ + blkCnt = blockSize >> 2; + + while (blkCnt > 0U) + { + in = *pOut << 1; + *pOut++ = in; + in = *pOut << 1; + *pOut++ = in; + in = *pOut << 1; + *pOut++ = in; + in = *pOut << 1; + *pOut++ = in; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, + * process the remaining output samples */ + blkCnt = blockSize % 0x4U; + + while (blkCnt > 0U) + { + in = *pOut << 1; + *pOut++ = in; + + /* Decrement the loop counter */ + blkCnt--; + } + +#else + + /* Run the below code for Cortex-M0 */ + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* Perform Multiplications and store in the destination buffer */ + *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if (readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Loop over the number of taps. */ + tapCnt = (uint32_t) numTaps - 2U; + + while (tapCnt > 0U) + { + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, + (int32_t *) pb, (int32_t *) pb, blockSize, 1, + blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pOut = pDst; + + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* Perform Multiply-Accumulate */ + out = *pOut; + out += ((q63_t) * px++ * coeff) >> 32; + *pOut++ = (q31_t) (out); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if (readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Decrement the tap loop counter */ + tapCnt--; + } + + /* Compute last tap without the final read of pTapDelay */ + + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, + (int32_t *) pb, (int32_t *) pb, blockSize, 1, + blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pOut = pDst; + + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* Perform Multiply-Accumulate */ + out = *pOut; + out += ((q63_t) * px++ * coeff) >> 32; + *pOut++ = (q31_t) (out); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Working output pointer is updated */ + pOut = pDst; + + /* Output is converted into 1.31 format. */ + blkCnt = blockSize; + + while (blkCnt > 0U) + { + in = *pOut << 1; + *pOut++ = in; + + /* Decrement the loop counter */ + blkCnt--; + } + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + +/** + * @} end of FIR_Sparse group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q7.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q7.c new file mode 100644 index 0000000..252ba95 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q7.c @@ -0,0 +1,469 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fir_sparse_q7.c + * Description: Q7 sparse FIR filter processing function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR_Sparse + * @{ + */ + + +/** + * @brief Processing function for the Q7 sparse FIR filter. + * @param[in] *S points to an instance of the Q7 sparse FIR structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data + * @param[in] *pScratchIn points to a temporary buffer of size blockSize. + * @param[in] *pScratchOut points to a temporary buffer of size blockSize. + * @param[in] blockSize number of input samples to process per call. + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * The function is implemented using a 32-bit internal accumulator. + * Both coefficients and state variables are represented in 1.7 format and multiplications yield a 2.14 result. + * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. + * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved. + * The accumulator is then converted to 18.7 format by discarding the low 7 bits. + * Finally, the result is truncated to 1.7 format. + */ + +void arm_fir_sparse_q7( + arm_fir_sparse_instance_q7 * S, + q7_t * pSrc, + q7_t * pDst, + q7_t * pScratchIn, + q31_t * pScratchOut, + uint32_t blockSize) +{ + + q7_t *pState = S->pState; /* State pointer */ + q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q7_t *px; /* Scratch buffer pointer */ + q7_t *py = pState; /* Temporary pointers for state buffer */ + q7_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */ + q7_t *pOut = pDst; /* Destination pointer */ + int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */ + uint32_t delaySize = S->maxDelay + blockSize; /* state length */ + uint16_t numTaps = S->numTaps; /* Filter order */ + int32_t readIndex; /* Read index of the state buffer */ + uint32_t tapCnt, blkCnt; /* loop counters */ + q7_t coeff = *pCoeffs++; /* Read the coefficient value */ + q31_t *pScr2 = pScratchOut; /* Working pointer for scratch buffer of output values */ + q31_t in; + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q7_t in1, in2, in3, in4; + + /* BlockSize of Input samples are copied into the state buffer */ + /* StateIndex points to the starting position to write in the state buffer */ + arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1, + blockSize); + + /* Loop over the number of taps. */ + tapCnt = numTaps; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if (readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb, + (int32_t) blockSize, 1, blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pScratchOut = pScr2; + + /* Loop over the blockSize. Unroll by a factor of 4. + * Compute 4 multiplications at a time. */ + blkCnt = blockSize >> 2; + + while (blkCnt > 0U) + { + /* Perform multiplication and store in the scratch buffer */ + *pScratchOut++ = ((q31_t) * px++ * coeff); + *pScratchOut++ = ((q31_t) * px++ * coeff); + *pScratchOut++ = ((q31_t) * px++ * coeff); + *pScratchOut++ = ((q31_t) * px++ * coeff); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, + * compute the remaining samples */ + blkCnt = blockSize % 0x4U; + + while (blkCnt > 0U) + { + /* Perform multiplication and store in the scratch buffer */ + *pScratchOut++ = ((q31_t) * px++ * coeff); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if (readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Loop over the number of taps. */ + tapCnt = (uint32_t) numTaps - 2U; + + while (tapCnt > 0U) + { + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb, + (int32_t) blockSize, 1, blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pScratchOut = pScr2; + + /* Loop over the blockSize. Unroll by a factor of 4. + * Compute 4 MACS at a time. */ + blkCnt = blockSize >> 2; + + while (blkCnt > 0U) + { + /* Perform Multiply-Accumulate */ + in = *pScratchOut + ((q31_t) * px++ * coeff); + *pScratchOut++ = in; + in = *pScratchOut + ((q31_t) * px++ * coeff); + *pScratchOut++ = in; + in = *pScratchOut + ((q31_t) * px++ * coeff); + *pScratchOut++ = in; + in = *pScratchOut + ((q31_t) * px++ * coeff); + *pScratchOut++ = in; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, + * compute the remaining samples */ + blkCnt = blockSize % 0x4U; + + while (blkCnt > 0U) + { + /* Perform Multiply-Accumulate */ + in = *pScratchOut + ((q31_t) * px++ * coeff); + *pScratchOut++ = in; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = ((int32_t) S->stateIndex - + (int32_t) blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if (readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Decrement the tap loop counter */ + tapCnt--; + } + + /* Compute last tap without the final read of pTapDelay */ + + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb, + (int32_t) blockSize, 1, blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pScratchOut = pScr2; + + /* Loop over the blockSize. Unroll by a factor of 4. + * Compute 4 MACS at a time. */ + blkCnt = blockSize >> 2; + + while (blkCnt > 0U) + { + /* Perform Multiply-Accumulate */ + in = *pScratchOut + ((q31_t) * px++ * coeff); + *pScratchOut++ = in; + in = *pScratchOut + ((q31_t) * px++ * coeff); + *pScratchOut++ = in; + in = *pScratchOut + ((q31_t) * px++ * coeff); + *pScratchOut++ = in; + in = *pScratchOut + ((q31_t) * px++ * coeff); + *pScratchOut++ = in; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, + * compute the remaining samples */ + blkCnt = blockSize % 0x4U; + + while (blkCnt > 0U) + { + /* Perform Multiply-Accumulate */ + in = *pScratchOut + ((q31_t) * px++ * coeff); + *pScratchOut++ = in; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* All the output values are in pScratchOut buffer. + Convert them into 1.15 format, saturate and store in the destination buffer. */ + /* Loop over the blockSize. */ + blkCnt = blockSize >> 2; + + while (blkCnt > 0U) + { + in1 = (q7_t) __SSAT(*pScr2++ >> 7, 8); + in2 = (q7_t) __SSAT(*pScr2++ >> 7, 8); + in3 = (q7_t) __SSAT(*pScr2++ >> 7, 8); + in4 = (q7_t) __SSAT(*pScr2++ >> 7, 8); + + *__SIMD32(pOut)++ = __PACKq7(in1, in2, in3, in4); + + /* Decrement the blockSize loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, + remaining samples are processed in the below loop */ + blkCnt = blockSize % 0x4U; + + while (blkCnt > 0U) + { + *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8); + + /* Decrement the blockSize loop counter */ + blkCnt--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + /* BlockSize of Input samples are copied into the state buffer */ + /* StateIndex points to the starting position to write in the state buffer */ + arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1, + blockSize); + + /* Loop over the number of taps. */ + tapCnt = numTaps; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if (readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb, + (int32_t) blockSize, 1, blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pScratchOut = pScr2; + + /* Loop over the blockSize */ + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* Perform multiplication and store in the scratch buffer */ + *pScratchOut++ = ((q31_t) * px++ * coeff); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if (readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Loop over the number of taps. */ + tapCnt = (uint32_t) numTaps - 2U; + + while (tapCnt > 0U) + { + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb, + (int32_t) blockSize, 1, blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pScratchOut = pScr2; + + /* Loop over the blockSize */ + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* Perform Multiply-Accumulate */ + in = *pScratchOut + ((q31_t) * px++ * coeff); + *pScratchOut++ = in; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = + ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if (readIndex < 0) + { + readIndex += (int32_t) delaySize; + } + + /* Decrement the tap loop counter */ + tapCnt--; + } + + /* Compute last tap without the final read of pTapDelay */ + + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb, + (int32_t) blockSize, 1, blockSize); + + /* Working pointer for the scratch buffer of state values */ + px = pb; + + /* Working pointer for scratch buffer of output values */ + pScratchOut = pScr2; + + /* Loop over the blockSize */ + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* Perform Multiply-Accumulate */ + in = *pScratchOut + ((q31_t) * px++ * coeff); + *pScratchOut++ = in; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* All the output values are in pScratchOut buffer. + Convert them into 1.15 format, saturate and store in the destination buffer. */ + /* Loop over the blockSize. */ + blkCnt = blockSize; + + while (blkCnt > 0U) + { + *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8); + + /* Decrement the blockSize loop counter */ + blkCnt--; + } + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + +/** + * @} end of FIR_Sparse group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_f32.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_f32.c new file mode 100644 index 0000000..7cccd4a --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_f32.c @@ -0,0 +1,435 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_iir_lattice_f32.c + * Description: Floating-point IIR Lattice filter processing function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup IIR_Lattice Infinite Impulse Response (IIR) Lattice Filters + * + * This set of functions implements lattice filters + * for Q15, Q31 and floating-point data types. Lattice filters are used in a + * variety of adaptive filter applications. The filter structure has feedforward and + * feedback components and the net impulse response is infinite length. + * The functions operate on blocks + * of input and output data and each call to the function processes + * blockSize samples through the filter. pSrc and + * pDst point to input and output arrays containing blockSize values. + + * \par Algorithm: + * \image html IIRLattice.gif "Infinite Impulse Response Lattice filter" + *
+ *    fN(n)   =  x(n)
+ *    fm-1(n) = fm(n) - km * gm-1(n-1)   for m = N, N-1, ...1
+ *    gm(n)   = km * fm-1(n) + gm-1(n-1) for m = N, N-1, ...1
+ *    y(n)    = vN * gN(n) + vN-1 * gN-1(n) + ...+ v0 * g0(n)
+ * 
+ * \par + * pkCoeffs points to array of reflection coefficients of size numStages. + * Reflection coefficients are stored in time-reversed order. + * \par + *
+ *    {kN, kN-1, ....k1}
+ * 
+ * pvCoeffs points to the array of ladder coefficients of size (numStages+1). + * Ladder coefficients are stored in time-reversed order. + * \par + *
+ *    {vN, vN-1, ...v0}
+ * 
+ * pState points to a state array of size numStages + blockSize. + * The state variables shown in the figure above (the g values) are stored in the pState array. + * The state variables are updated after each block of data is processed; the coefficients are untouched. + * \par Instance Structure + * The coefficients and state variables for a filter are stored together in an instance data structure. + * A separate instance structure must be defined for each filter. + * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared. + * There are separate instance structure declarations for each of the 3 supported data types. + * + * \par Initialization Functions + * There is also an associated initialization function for each data type. + * The initialization function performs the following operations: + * - Sets the values of the internal structure fields. + * - Zeros out the values in the state buffer. + * To do this manually without calling the init function, assign the follow subfields of the instance structure: + * numStages, pkCoeffs, pvCoeffs, pState. Also set all of the values in pState to zero. + * + * \par + * Use of the initialization function is optional. + * However, if the initialization function is used, then the instance structure cannot be placed into a const data section. + * To place an instance structure into a const data section, the instance structure must be manually initialized. + * Set the values in the state buffer to zeros and then manually initialize the instance structure as follows: + *
+ *arm_iir_lattice_instance_f32 S = {numStages, pState, pkCoeffs, pvCoeffs};
+ *arm_iir_lattice_instance_q31 S = {numStages, pState, pkCoeffs, pvCoeffs};
+ *arm_iir_lattice_instance_q15 S = {numStages, pState, pkCoeffs, pvCoeffs};
+ * 
+ * \par + * where numStages is the number of stages in the filter; pState points to the state buffer array; + * pkCoeffs points to array of the reflection coefficients; pvCoeffs points to the array of ladder coefficients. + * \par Fixed-Point Behavior + * Care must be taken when using the fixed-point versions of the IIR lattice filter functions. + * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered. + * Refer to the function specific documentation below for usage guidelines. + */ + +/** + * @addtogroup IIR_Lattice + * @{ + */ + +/** + * @brief Processing function for the floating-point IIR lattice filter. + * @param[in] *S points to an instance of the floating-point IIR lattice structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + * @return none. + */ + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + +void arm_iir_lattice_f32( + const arm_iir_lattice_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + float32_t fnext1, gcurr1, gnext; /* Temporary variables for lattice stages */ + float32_t acc; /* Accumlator */ + uint32_t blkCnt, tapCnt; /* temporary variables for counts */ + float32_t *px1, *px2, *pk, *pv; /* temporary pointers for state and coef */ + uint32_t numStages = S->numStages; /* number of stages */ + float32_t *pState; /* State pointer */ + float32_t *pStateCurnt; /* State current pointer */ + float32_t k1, k2; + float32_t v1, v2, v3, v4; + float32_t gcurr2; + float32_t fnext2; + + /* initialise loop count */ + blkCnt = blockSize; + + /* initialise state pointer */ + pState = &S->pState[0]; + + /* Sample processing */ + while (blkCnt > 0U) + { + /* Read Sample from input buffer */ + /* fN(n) = x(n) */ + fnext2 = *pSrc++; + + /* Initialize Ladder coeff pointer */ + pv = &S->pvCoeffs[0]; + /* Initialize Reflection coeff pointer */ + pk = &S->pkCoeffs[0]; + + /* Initialize state read pointer */ + px1 = pState; + /* Initialize state write pointer */ + px2 = pState; + + /* Set accumulator to zero */ + acc = 0.0; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = (numStages) >> 2; + + while (tapCnt > 0U) + { + /* Read gN-1(n-1) from state buffer */ + gcurr1 = *px1; + + /* read reflection coefficient kN */ + k1 = *pk; + + /* fN-1(n) = fN(n) - kN * gN-1(n-1) */ + fnext1 = fnext2 - (k1 * gcurr1); + + /* read ladder coefficient vN */ + v1 = *pv; + + /* read next reflection coefficient kN-1 */ + k2 = *(pk + 1U); + + /* Read gN-2(n-1) from state buffer */ + gcurr2 = *(px1 + 1U); + + /* read next ladder coefficient vN-1 */ + v2 = *(pv + 1U); + + /* fN-2(n) = fN-1(n) - kN-1 * gN-2(n-1) */ + fnext2 = fnext1 - (k2 * gcurr2); + + /* gN(n) = kN * fN-1(n) + gN-1(n-1) */ + gnext = gcurr1 + (k1 * fnext1); + + /* read reflection coefficient kN-2 */ + k1 = *(pk + 2U); + + /* write gN(n) into state for next sample processing */ + *px2++ = gnext; + + /* Read gN-3(n-1) from state buffer */ + gcurr1 = *(px1 + 2U); + + /* y(n) += gN(n) * vN */ + acc += (gnext * v1); + + /* fN-3(n) = fN-2(n) - kN-2 * gN-3(n-1) */ + fnext1 = fnext2 - (k1 * gcurr1); + + /* gN-1(n) = kN-1 * fN-2(n) + gN-2(n-1) */ + gnext = gcurr2 + (k2 * fnext2); + + /* Read gN-4(n-1) from state buffer */ + gcurr2 = *(px1 + 3U); + + /* y(n) += gN-1(n) * vN-1 */ + acc += (gnext * v2); + + /* read reflection coefficient kN-3 */ + k2 = *(pk + 3U); + + /* write gN-1(n) into state for next sample processing */ + *px2++ = gnext; + + /* fN-4(n) = fN-3(n) - kN-3 * gN-4(n-1) */ + fnext2 = fnext1 - (k2 * gcurr2); + + /* gN-2(n) = kN-2 * fN-3(n) + gN-3(n-1) */ + gnext = gcurr1 + (k1 * fnext1); + + /* read ladder coefficient vN-2 */ + v3 = *(pv + 2U); + + /* y(n) += gN-2(n) * vN-2 */ + acc += (gnext * v3); + + /* write gN-2(n) into state for next sample processing */ + *px2++ = gnext; + + /* update pointer */ + pk += 4U; + + /* gN-3(n) = kN-3 * fN-4(n) + gN-4(n-1) */ + gnext = (fnext2 * k2) + gcurr2; + + /* read next ladder coefficient vN-3 */ + v4 = *(pv + 3U); + + /* y(n) += gN-4(n) * vN-4 */ + acc += (gnext * v4); + + /* write gN-3(n) into state for next sample processing */ + *px2++ = gnext; + + /* update pointers */ + px1 += 4U; + pv += 4U; + + tapCnt--; + + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = (numStages) % 0x4U; + + while (tapCnt > 0U) + { + gcurr1 = *px1++; + /* Process sample for last taps */ + fnext1 = fnext2 - ((*pk) * gcurr1); + gnext = (fnext1 * (*pk++)) + gcurr1; + /* Output samples for last taps */ + acc += (gnext * (*pv++)); + *px2++ = gnext; + fnext2 = fnext1; + + tapCnt--; + + } + + /* y(n) += g0(n) * v0 */ + acc += (fnext2 * (*pv)); + + *px2++ = fnext2; + + /* write out into pDst */ + *pDst++ = acc; + + /* Advance the state pointer by 4 to process the next group of 4 samples */ + pState = pState + 1U; + + blkCnt--; + + } + + /* Processing is complete. Now copy last S->numStages samples to start of the buffer + for the preperation of next frame process */ + + /* Points to the start of the state buffer */ + pStateCurnt = &S->pState[0]; + pState = &S->pState[blockSize]; + + tapCnt = numStages >> 2U; + + /* copy data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + + } + + /* Calculate remaining number of copies */ + tapCnt = (numStages) % 0x4U; + + /* Copy the remaining q31_t data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } +} + +#else + +void arm_iir_lattice_f32( + const arm_iir_lattice_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + float32_t fcurr, fnext = 0, gcurr, gnext; /* Temporary variables for lattice stages */ + float32_t acc; /* Accumlator */ + uint32_t blkCnt, tapCnt; /* temporary variables for counts */ + float32_t *px1, *px2, *pk, *pv; /* temporary pointers for state and coef */ + uint32_t numStages = S->numStages; /* number of stages */ + float32_t *pState; /* State pointer */ + float32_t *pStateCurnt; /* State current pointer */ + + + /* Run the below code for Cortex-M0 */ + + blkCnt = blockSize; + + pState = &S->pState[0]; + + /* Sample processing */ + while (blkCnt > 0U) + { + /* Read Sample from input buffer */ + /* fN(n) = x(n) */ + fcurr = *pSrc++; + + /* Initialize state read pointer */ + px1 = pState; + /* Initialize state write pointer */ + px2 = pState; + /* Set accumulator to zero */ + acc = 0.0f; + /* Initialize Ladder coeff pointer */ + pv = &S->pvCoeffs[0]; + /* Initialize Reflection coeff pointer */ + pk = &S->pkCoeffs[0]; + + + /* Process sample for numStages */ + tapCnt = numStages; + + while (tapCnt > 0U) + { + gcurr = *px1++; + /* Process sample for last taps */ + fnext = fcurr - ((*pk) * gcurr); + gnext = (fnext * (*pk++)) + gcurr; + + /* Output samples for last taps */ + acc += (gnext * (*pv++)); + *px2++ = gnext; + fcurr = fnext; + + /* Decrementing loop counter */ + tapCnt--; + + } + + /* y(n) += g0(n) * v0 */ + acc += (fnext * (*pv)); + + *px2++ = fnext; + + /* write out into pDst */ + *pDst++ = acc; + + /* Advance the state pointer by 1 to process the next group of samples */ + pState = pState + 1U; + blkCnt--; + + } + + /* Processing is complete. Now copy last S->numStages samples to start of the buffer + for the preperation of next frame process */ + + /* Points to the start of the state buffer */ + pStateCurnt = &S->pState[0]; + pState = &S->pState[blockSize]; + + tapCnt = numStages; + + /* Copy the data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +} + +#endif /* #if defined (ARM_MATH_DSP) */ + + +/** + * @} end of IIR_Lattice group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_f32.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_f32.c new file mode 100644 index 0000000..f20a21b --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_f32.c @@ -0,0 +1,79 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_iir_lattice_init_f32.c + * Description: Floating-point IIR lattice filter initialization function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup IIR_Lattice + * @{ + */ + +/** + * @brief Initialization function for the floating-point IIR lattice filter. + * @param[in] *S points to an instance of the floating-point IIR lattice structure. + * @param[in] numStages number of stages in the filter. + * @param[in] *pkCoeffs points to the reflection coefficient buffer. The array is of length numStages. + * @param[in] *pvCoeffs points to the ladder coefficient buffer. The array is of length numStages+1. + * @param[in] *pState points to the state buffer. The array is of length numStages+blockSize. + * @param[in] blockSize number of samples to process. + * @return none. + */ + +void arm_iir_lattice_init_f32( + arm_iir_lattice_instance_f32 * S, + uint16_t numStages, + float32_t * pkCoeffs, + float32_t * pvCoeffs, + float32_t * pState, + uint32_t blockSize) +{ + /* Assign filter taps */ + S->numStages = numStages; + + /* Assign reflection coefficient pointer */ + S->pkCoeffs = pkCoeffs; + + /* Assign ladder coefficient pointer */ + S->pvCoeffs = pvCoeffs; + + /* Clear state buffer and size is always blockSize + numStages */ + memset(pState, 0, (numStages + blockSize) * sizeof(float32_t)); + + /* Assign state pointer */ + S->pState = pState; + + +} + + /** + * @} end of IIR_Lattice group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q15.c new file mode 100644 index 0000000..6cae944 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q15.c @@ -0,0 +1,79 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_iir_lattice_init_q15.c + * Description: Q15 IIR lattice filter initialization function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup IIR_Lattice + * @{ + */ + + /** + * @brief Initialization function for the Q15 IIR lattice filter. + * @param[in] *S points to an instance of the Q15 IIR lattice structure. + * @param[in] numStages number of stages in the filter. + * @param[in] *pkCoeffs points to reflection coefficient buffer. The array is of length numStages. + * @param[in] *pvCoeffs points to ladder coefficient buffer. The array is of length numStages+1. + * @param[in] *pState points to state buffer. The array is of length numStages+blockSize. + * @param[in] blockSize number of samples to process per call. + * @return none. + */ + +void arm_iir_lattice_init_q15( + arm_iir_lattice_instance_q15 * S, + uint16_t numStages, + q15_t * pkCoeffs, + q15_t * pvCoeffs, + q15_t * pState, + uint32_t blockSize) +{ + /* Assign filter taps */ + S->numStages = numStages; + + /* Assign reflection coefficient pointer */ + S->pkCoeffs = pkCoeffs; + + /* Assign ladder coefficient pointer */ + S->pvCoeffs = pvCoeffs; + + /* Clear state buffer and size is always blockSize + numStages */ + memset(pState, 0, (numStages + blockSize) * sizeof(q15_t)); + + /* Assign state pointer */ + S->pState = pState; + + +} + +/** + * @} end of IIR_Lattice group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q31.c new file mode 100644 index 0000000..fe9869e --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q31.c @@ -0,0 +1,79 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_iir_lattice_init_q31.c + * Description: Initialization function for the Q31 IIR lattice filter + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup IIR_Lattice + * @{ + */ + + /** + * @brief Initialization function for the Q31 IIR lattice filter. + * @param[in] *S points to an instance of the Q31 IIR lattice structure. + * @param[in] numStages number of stages in the filter. + * @param[in] *pkCoeffs points to the reflection coefficient buffer. The array is of length numStages. + * @param[in] *pvCoeffs points to the ladder coefficient buffer. The array is of length numStages+1. + * @param[in] *pState points to the state buffer. The array is of length numStages+blockSize. + * @param[in] blockSize number of samples to process. + * @return none. + */ + +void arm_iir_lattice_init_q31( + arm_iir_lattice_instance_q31 * S, + uint16_t numStages, + q31_t * pkCoeffs, + q31_t * pvCoeffs, + q31_t * pState, + uint32_t blockSize) +{ + /* Assign filter taps */ + S->numStages = numStages; + + /* Assign reflection coefficient pointer */ + S->pkCoeffs = pkCoeffs; + + /* Assign ladder coefficient pointer */ + S->pvCoeffs = pvCoeffs; + + /* Clear state buffer and size is always blockSize + numStages */ + memset(pState, 0, (numStages + blockSize) * sizeof(q31_t)); + + /* Assign state pointer */ + S->pState = pState; + + +} + +/** + * @} end of IIR_Lattice group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_q15.c new file mode 100644 index 0000000..9c70b68 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_q15.c @@ -0,0 +1,452 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_iir_lattice_q15.c + * Description: Q15 IIR lattice filter processing function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup IIR_Lattice + * @{ + */ + +/** + * @brief Processing function for the Q15 IIR lattice filter. + * @param[in] *S points to an instance of the Q15 IIR lattice structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * \par + * The function is implemented using a 64-bit internal accumulator. + * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result. + * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. + * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved. + * After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits. + * Lastly, the accumulator is saturated to yield a result in 1.15 format. + */ + +void arm_iir_lattice_q15( + const arm_iir_lattice_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q31_t fcurr, fnext, gcurr = 0, gnext; /* Temporary variables for lattice stages */ + q15_t gnext1, gnext2; /* Temporary variables for lattice stages */ + uint32_t stgCnt; /* Temporary variables for counts */ + q63_t acc; /* Accumlator */ + uint32_t blkCnt, tapCnt; /* Temporary variables for counts */ + q15_t *px1, *px2, *pk, *pv; /* temporary pointers for state and coef */ + uint32_t numStages = S->numStages; /* number of stages */ + q15_t *pState; /* State pointer */ + q15_t *pStateCurnt; /* State current pointer */ + q15_t out; /* Temporary variable for output */ + q31_t v; /* Temporary variable for ladder coefficient */ +#ifdef UNALIGNED_SUPPORT_DISABLE + q15_t v1, v2; +#endif + + + blkCnt = blockSize; + + pState = &S->pState[0]; + + /* Sample processing */ + while (blkCnt > 0U) + { + /* Read Sample from input buffer */ + /* fN(n) = x(n) */ + fcurr = *pSrc++; + + /* Initialize state read pointer */ + px1 = pState; + /* Initialize state write pointer */ + px2 = pState; + /* Set accumulator to zero */ + acc = 0; + /* Initialize Ladder coeff pointer */ + pv = &S->pvCoeffs[0]; + /* Initialize Reflection coeff pointer */ + pk = &S->pkCoeffs[0]; + + + /* Process sample for first tap */ + gcurr = *px1++; + /* fN-1(n) = fN(n) - kN * gN-1(n-1) */ + fnext = fcurr - (((q31_t) gcurr * (*pk)) >> 15); + fnext = __SSAT(fnext, 16); + /* gN(n) = kN * fN-1(n) + gN-1(n-1) */ + gnext = (((q31_t) fnext * (*pk++)) >> 15) + gcurr; + gnext = __SSAT(gnext, 16); + /* write gN(n) into state for next sample processing */ + *px2++ = (q15_t) gnext; + /* y(n) += gN(n) * vN */ + acc += (q31_t) ((gnext * (*pv++))); + + + /* Update f values for next coefficient processing */ + fcurr = fnext; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = (numStages - 1U) >> 2; + + while (tapCnt > 0U) + { + + /* Process sample for 2nd, 6th ...taps */ + /* Read gN-2(n-1) from state buffer */ + gcurr = *px1++; + /* Process sample for 2nd, 6th .. taps */ + /* fN-2(n) = fN-1(n) - kN-1 * gN-2(n-1) */ + fnext = fcurr - (((q31_t) gcurr * (*pk)) >> 15); + fnext = __SSAT(fnext, 16); + /* gN-1(n) = kN-1 * fN-2(n) + gN-2(n-1) */ + gnext = (((q31_t) fnext * (*pk++)) >> 15) + gcurr; + gnext1 = (q15_t) __SSAT(gnext, 16); + /* write gN-1(n) into state */ + *px2++ = (q15_t) gnext1; + + + /* Process sample for 3nd, 7th ...taps */ + /* Read gN-3(n-1) from state */ + gcurr = *px1++; + /* Process sample for 3rd, 7th .. taps */ + /* fN-3(n) = fN-2(n) - kN-2 * gN-3(n-1) */ + fcurr = fnext - (((q31_t) gcurr * (*pk)) >> 15); + fcurr = __SSAT(fcurr, 16); + /* gN-2(n) = kN-2 * fN-3(n) + gN-3(n-1) */ + gnext = (((q31_t) fcurr * (*pk++)) >> 15) + gcurr; + gnext2 = (q15_t) __SSAT(gnext, 16); + /* write gN-2(n) into state */ + *px2++ = (q15_t) gnext2; + + /* Read vN-1 and vN-2 at a time */ +#ifndef UNALIGNED_SUPPORT_DISABLE + + v = *__SIMD32(pv)++; + +#else + + v1 = *pv++; + v2 = *pv++; + +#ifndef ARM_MATH_BIG_ENDIAN + + v = __PKHBT(v1, v2, 16); + +#else + + v = __PKHBT(v2, v1, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + + /* Pack gN-1(n) and gN-2(n) */ + +#ifndef ARM_MATH_BIG_ENDIAN + + gnext = __PKHBT(gnext1, gnext2, 16); + +#else + + gnext = __PKHBT(gnext2, gnext1, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* y(n) += gN-1(n) * vN-1 */ + /* process for gN-5(n) * vN-5, gN-9(n) * vN-9 ... */ + /* y(n) += gN-2(n) * vN-2 */ + /* process for gN-6(n) * vN-6, gN-10(n) * vN-10 ... */ + acc = __SMLALD(gnext, v, acc); + + + /* Process sample for 4th, 8th ...taps */ + /* Read gN-4(n-1) from state */ + gcurr = *px1++; + /* Process sample for 4th, 8th .. taps */ + /* fN-4(n) = fN-3(n) - kN-3 * gN-4(n-1) */ + fnext = fcurr - (((q31_t) gcurr * (*pk)) >> 15); + fnext = __SSAT(fnext, 16); + /* gN-3(n) = kN-3 * fN-1(n) + gN-1(n-1) */ + gnext = (((q31_t) fnext * (*pk++)) >> 15) + gcurr; + gnext1 = (q15_t) __SSAT(gnext, 16); + /* write gN-3(n) for the next sample process */ + *px2++ = (q15_t) gnext1; + + + /* Process sample for 5th, 9th ...taps */ + /* Read gN-5(n-1) from state */ + gcurr = *px1++; + /* Process sample for 5th, 9th .. taps */ + /* fN-5(n) = fN-4(n) - kN-4 * gN-5(n-1) */ + fcurr = fnext - (((q31_t) gcurr * (*pk)) >> 15); + fcurr = __SSAT(fcurr, 16); + /* gN-4(n) = kN-4 * fN-5(n) + gN-5(n-1) */ + gnext = (((q31_t) fcurr * (*pk++)) >> 15) + gcurr; + gnext2 = (q15_t) __SSAT(gnext, 16); + /* write gN-4(n) for the next sample process */ + *px2++ = (q15_t) gnext2; + + /* Read vN-3 and vN-4 at a time */ +#ifndef UNALIGNED_SUPPORT_DISABLE + + v = *__SIMD32(pv)++; + +#else + + v1 = *pv++; + v2 = *pv++; + +#ifndef ARM_MATH_BIG_ENDIAN + + v = __PKHBT(v1, v2, 16); + +#else + + v = __PKHBT(v2, v1, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + + /* Pack gN-3(n) and gN-4(n) */ +#ifndef ARM_MATH_BIG_ENDIAN + + gnext = __PKHBT(gnext1, gnext2, 16); + +#else + + gnext = __PKHBT(gnext2, gnext1, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* y(n) += gN-4(n) * vN-4 */ + /* process for gN-8(n) * vN-8, gN-12(n) * vN-12 ... */ + /* y(n) += gN-3(n) * vN-3 */ + /* process for gN-7(n) * vN-7, gN-11(n) * vN-11 ... */ + acc = __SMLALD(gnext, v, acc); + + tapCnt--; + + } + + fnext = fcurr; + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = (numStages - 1U) % 0x4U; + + while (tapCnt > 0U) + { + gcurr = *px1++; + /* Process sample for last taps */ + fnext = fcurr - (((q31_t) gcurr * (*pk)) >> 15); + fnext = __SSAT(fnext, 16); + gnext = (((q31_t) fnext * (*pk++)) >> 15) + gcurr; + gnext = __SSAT(gnext, 16); + /* Output samples for last taps */ + acc += (q31_t) (((q31_t) gnext * (*pv++))); + *px2++ = (q15_t) gnext; + fcurr = fnext; + + tapCnt--; + } + + /* y(n) += g0(n) * v0 */ + acc += (q31_t) (((q31_t) fnext * (*pv++))); + + out = (q15_t) __SSAT(acc >> 15, 16); + *px2++ = (q15_t) fnext; + + /* write out into pDst */ + *pDst++ = out; + + /* Advance the state pointer by 4 to process the next group of 4 samples */ + pState = pState + 1U; + blkCnt--; + + } + + /* Processing is complete. Now copy last S->numStages samples to start of the buffer + for the preperation of next frame process */ + /* Points to the start of the state buffer */ + pStateCurnt = &S->pState[0]; + pState = &S->pState[blockSize]; + + stgCnt = (numStages >> 2U); + + /* copy data */ + while (stgCnt > 0U) + { +#ifndef UNALIGNED_SUPPORT_DISABLE + + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + +#else + + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + /* Decrement the loop counter */ + stgCnt--; + + } + + /* Calculation of count for remaining q15_t data */ + stgCnt = (numStages) % 0x4U; + + /* copy data */ + while (stgCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + stgCnt--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + q31_t fcurr, fnext = 0, gcurr = 0, gnext; /* Temporary variables for lattice stages */ + uint32_t stgCnt; /* Temporary variables for counts */ + q63_t acc; /* Accumlator */ + uint32_t blkCnt, tapCnt; /* Temporary variables for counts */ + q15_t *px1, *px2, *pk, *pv; /* temporary pointers for state and coef */ + uint32_t numStages = S->numStages; /* number of stages */ + q15_t *pState; /* State pointer */ + q15_t *pStateCurnt; /* State current pointer */ + q15_t out; /* Temporary variable for output */ + + + blkCnt = blockSize; + + pState = &S->pState[0]; + + /* Sample processing */ + while (blkCnt > 0U) + { + /* Read Sample from input buffer */ + /* fN(n) = x(n) */ + fcurr = *pSrc++; + + /* Initialize state read pointer */ + px1 = pState; + /* Initialize state write pointer */ + px2 = pState; + /* Set accumulator to zero */ + acc = 0; + /* Initialize Ladder coeff pointer */ + pv = &S->pvCoeffs[0]; + /* Initialize Reflection coeff pointer */ + pk = &S->pkCoeffs[0]; + + tapCnt = numStages; + + while (tapCnt > 0U) + { + gcurr = *px1++; + /* Process sample */ + /* fN-1(n) = fN(n) - kN * gN-1(n-1) */ + fnext = fcurr - ((gcurr * (*pk)) >> 15); + fnext = __SSAT(fnext, 16); + /* gN(n) = kN * fN-1(n) + gN-1(n-1) */ + gnext = ((fnext * (*pk++)) >> 15) + gcurr; + gnext = __SSAT(gnext, 16); + /* Output samples */ + /* y(n) += gN(n) * vN */ + acc += (q31_t) ((gnext * (*pv++))); + /* write gN(n) into state for next sample processing */ + *px2++ = (q15_t) gnext; + /* Update f values for next coefficient processing */ + fcurr = fnext; + + tapCnt--; + } + + /* y(n) += g0(n) * v0 */ + acc += (q31_t) ((fnext * (*pv++))); + + out = (q15_t) __SSAT(acc >> 15, 16); + *px2++ = (q15_t) fnext; + + /* write out into pDst */ + *pDst++ = out; + + /* Advance the state pointer by 1 to process the next group of samples */ + pState = pState + 1U; + blkCnt--; + + } + + /* Processing is complete. Now copy last S->numStages samples to start of the buffer + for the preperation of next frame process */ + /* Points to the start of the state buffer */ + pStateCurnt = &S->pState[0]; + pState = &S->pState[blockSize]; + + stgCnt = numStages; + + /* copy data */ + while (stgCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + stgCnt--; + } + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + + + + +/** + * @} end of IIR_Lattice group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_q31.c new file mode 100644 index 0000000..736cbc0 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_q31.c @@ -0,0 +1,338 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_iir_lattice_q31.c + * Description: Q31 IIR lattice filter processing function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup IIR_Lattice + * @{ + */ + +/** + * @brief Processing function for the Q31 IIR lattice filter. + * @param[in] *S points to an instance of the Q31 IIR lattice structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * \par + * The function is implemented using an internal 64-bit accumulator. + * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * Thus, if the accumulator result overflows it wraps around rather than clip. + * In order to avoid overflows completely the input signal must be scaled down by 2*log2(numStages) bits. + * After all multiply-accumulates are performed, the 2.62 accumulator is saturated to 1.32 format and then truncated to 1.31 format. + */ + +void arm_iir_lattice_q31( + const arm_iir_lattice_instance_q31 * S, + q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + q31_t fcurr, fnext = 0, gcurr = 0, gnext; /* Temporary variables for lattice stages */ + q63_t acc; /* Accumlator */ + uint32_t blkCnt, tapCnt; /* Temporary variables for counts */ + q31_t *px1, *px2, *pk, *pv; /* Temporary pointers for state and coef */ + uint32_t numStages = S->numStages; /* number of stages */ + q31_t *pState; /* State pointer */ + q31_t *pStateCurnt; /* State current pointer */ + + blkCnt = blockSize; + + pState = &S->pState[0]; + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + /* Sample processing */ + while (blkCnt > 0U) + { + /* Read Sample from input buffer */ + /* fN(n) = x(n) */ + fcurr = *pSrc++; + + /* Initialize state read pointer */ + px1 = pState; + /* Initialize state write pointer */ + px2 = pState; + /* Set accumulator to zero */ + acc = 0; + /* Initialize Ladder coeff pointer */ + pv = &S->pvCoeffs[0]; + /* Initialize Reflection coeff pointer */ + pk = &S->pkCoeffs[0]; + + + /* Process sample for first tap */ + gcurr = *px1++; + /* fN-1(n) = fN(n) - kN * gN-1(n-1) */ + fnext = __QSUB(fcurr, (q31_t) (((q63_t) gcurr * (*pk)) >> 31)); + /* gN(n) = kN * fN-1(n) + gN-1(n-1) */ + gnext = __QADD(gcurr, (q31_t) (((q63_t) fnext * (*pk++)) >> 31)); + /* write gN-1(n-1) into state for next sample processing */ + *px2++ = gnext; + /* y(n) += gN(n) * vN */ + acc += ((q63_t) gnext * *pv++); + + /* Update f values for next coefficient processing */ + fcurr = fnext; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = (numStages - 1U) >> 2; + + while (tapCnt > 0U) + { + + /* Process sample for 2nd, 6th .. taps */ + /* Read gN-2(n-1) from state buffer */ + gcurr = *px1++; + /* fN-2(n) = fN-1(n) - kN-1 * gN-2(n-1) */ + fnext = __QSUB(fcurr, (q31_t) (((q63_t) gcurr * (*pk)) >> 31)); + /* gN-1(n) = kN-1 * fN-2(n) + gN-2(n-1) */ + gnext = __QADD(gcurr, (q31_t) (((q63_t) fnext * (*pk++)) >> 31)); + /* y(n) += gN-1(n) * vN-1 */ + /* process for gN-5(n) * vN-5, gN-9(n) * vN-9 ... */ + acc += ((q63_t) gnext * *pv++); + /* write gN-1(n) into state for next sample processing */ + *px2++ = gnext; + + /* Process sample for 3nd, 7th ...taps */ + /* Read gN-3(n-1) from state buffer */ + gcurr = *px1++; + /* Process sample for 3rd, 7th .. taps */ + /* fN-3(n) = fN-2(n) - kN-2 * gN-3(n-1) */ + fcurr = __QSUB(fnext, (q31_t) (((q63_t) gcurr * (*pk)) >> 31)); + /* gN-2(n) = kN-2 * fN-3(n) + gN-3(n-1) */ + gnext = __QADD(gcurr, (q31_t) (((q63_t) fcurr * (*pk++)) >> 31)); + /* y(n) += gN-2(n) * vN-2 */ + /* process for gN-6(n) * vN-6, gN-10(n) * vN-10 ... */ + acc += ((q63_t) gnext * *pv++); + /* write gN-2(n) into state for next sample processing */ + *px2++ = gnext; + + + /* Process sample for 4th, 8th ...taps */ + /* Read gN-4(n-1) from state buffer */ + gcurr = *px1++; + /* Process sample for 4th, 8th .. taps */ + /* fN-4(n) = fN-3(n) - kN-3 * gN-4(n-1) */ + fnext = __QSUB(fcurr, (q31_t) (((q63_t) gcurr * (*pk)) >> 31)); + /* gN-3(n) = kN-3 * fN-4(n) + gN-4(n-1) */ + gnext = __QADD(gcurr, (q31_t) (((q63_t) fnext * (*pk++)) >> 31)); + /* y(n) += gN-3(n) * vN-3 */ + /* process for gN-7(n) * vN-7, gN-11(n) * vN-11 ... */ + acc += ((q63_t) gnext * *pv++); + /* write gN-3(n) into state for next sample processing */ + *px2++ = gnext; + + + /* Process sample for 5th, 9th ...taps */ + /* Read gN-5(n-1) from state buffer */ + gcurr = *px1++; + /* Process sample for 5th, 9th .. taps */ + /* fN-5(n) = fN-4(n) - kN-4 * gN-1(n-1) */ + fcurr = __QSUB(fnext, (q31_t) (((q63_t) gcurr * (*pk)) >> 31)); + /* gN-4(n) = kN-4 * fN-5(n) + gN-5(n-1) */ + gnext = __QADD(gcurr, (q31_t) (((q63_t) fcurr * (*pk++)) >> 31)); + /* y(n) += gN-4(n) * vN-4 */ + /* process for gN-8(n) * vN-8, gN-12(n) * vN-12 ... */ + acc += ((q63_t) gnext * *pv++); + /* write gN-4(n) into state for next sample processing */ + *px2++ = gnext; + + tapCnt--; + + } + + fnext = fcurr; + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = (numStages - 1U) % 0x4U; + + while (tapCnt > 0U) + { + gcurr = *px1++; + /* Process sample for last taps */ + fnext = __QSUB(fcurr, (q31_t) (((q63_t) gcurr * (*pk)) >> 31)); + gnext = __QADD(gcurr, (q31_t) (((q63_t) fnext * (*pk++)) >> 31)); + /* Output samples for last taps */ + acc += ((q63_t) gnext * *pv++); + *px2++ = gnext; + fcurr = fnext; + + tapCnt--; + + } + + /* y(n) += g0(n) * v0 */ + acc += (q63_t) fnext *( + *pv++); + + *px2++ = fnext; + + /* write out into pDst */ + *pDst++ = (q31_t) (acc >> 31U); + + /* Advance the state pointer by 4 to process the next group of 4 samples */ + pState = pState + 1U; + blkCnt--; + + } + + /* Processing is complete. Now copy last S->numStages samples to start of the buffer + for the preperation of next frame process */ + + /* Points to the start of the state buffer */ + pStateCurnt = &S->pState[0]; + pState = &S->pState[blockSize]; + + tapCnt = numStages >> 2U; + + /* copy data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + + } + + /* Calculate remaining number of copies */ + tapCnt = (numStages) % 0x4U; + + /* Copy the remaining q31_t data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + }; + +#else + + /* Run the below code for Cortex-M0 */ + /* Sample processing */ + while (blkCnt > 0U) + { + /* Read Sample from input buffer */ + /* fN(n) = x(n) */ + fcurr = *pSrc++; + + /* Initialize state read pointer */ + px1 = pState; + /* Initialize state write pointer */ + px2 = pState; + /* Set accumulator to zero */ + acc = 0; + /* Initialize Ladder coeff pointer */ + pv = &S->pvCoeffs[0]; + /* Initialize Reflection coeff pointer */ + pk = &S->pkCoeffs[0]; + + tapCnt = numStages; + + while (tapCnt > 0U) + { + gcurr = *px1++; + /* Process sample */ + /* fN-1(n) = fN(n) - kN * gN-1(n-1) */ + fnext = + clip_q63_to_q31(((q63_t) fcurr - + ((q31_t) (((q63_t) gcurr * (*pk)) >> 31)))); + /* gN(n) = kN * fN-1(n) + gN-1(n-1) */ + gnext = + clip_q63_to_q31(((q63_t) gcurr + + ((q31_t) (((q63_t) fnext * (*pk++)) >> 31)))); + /* Output samples */ + /* y(n) += gN(n) * vN */ + acc += ((q63_t) gnext * *pv++); + /* write gN-1(n-1) into state for next sample processing */ + *px2++ = gnext; + /* Update f values for next coefficient processing */ + fcurr = fnext; + + tapCnt--; + } + + /* y(n) += g0(n) * v0 */ + acc += (q63_t) fnext *( + *pv++); + + *px2++ = fnext; + + /* write out into pDst */ + *pDst++ = (q31_t) (acc >> 31U); + + /* Advance the state pointer by 1 to process the next group of samples */ + pState = pState + 1U; + blkCnt--; + + } + + /* Processing is complete. Now copy last S->numStages samples to start of the buffer + for the preperation of next frame process */ + + /* Points to the start of the state buffer */ + pStateCurnt = &S->pState[0]; + pState = &S->pState[blockSize]; + + tapCnt = numStages; + + /* Copy the remaining q31_t data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + + + + +/** + * @} end of IIR_Lattice group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_f32.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_f32.c new file mode 100644 index 0000000..3975f00 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_f32.c @@ -0,0 +1,430 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_lms_f32.c + * Description: Processing function for the floating-point LMS filter + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup LMS Least Mean Square (LMS) Filters + * + * LMS filters are a class of adaptive filters that are able to "learn" an unknown transfer functions. + * LMS filters use a gradient descent method in which the filter coefficients are updated based on the instantaneous error signal. + * Adaptive filters are often used in communication systems, equalizers, and noise removal. + * The CMSIS DSP Library contains LMS filter functions that operate on Q15, Q31, and floating-point data types. + * The library also contains normalized LMS filters in which the filter coefficient adaptation is indepedent of the level of the input signal. + * + * An LMS filter consists of two components as shown below. + * The first component is a standard transversal or FIR filter. + * The second component is a coefficient update mechanism. + * The LMS filter has two input signals. + * The "input" feeds the FIR filter while the "reference input" corresponds to the desired output of the FIR filter. + * That is, the FIR filter coefficients are updated so that the output of the FIR filter matches the reference input. + * The filter coefficient update mechanism is based on the difference between the FIR filter output and the reference input. + * This "error signal" tends towards zero as the filter adapts. + * The LMS processing functions accept the input and reference input signals and generate the filter output and error signal. + * \image html LMS.gif "Internal structure of the Least Mean Square filter" + * + * The functions operate on blocks of data and each call to the function processes + * blockSize samples through the filter. + * pSrc points to input signal, pRef points to reference signal, + * pOut points to output signal and pErr points to error signal. + * All arrays contain blockSize values. + * + * The functions operate on a block-by-block basis. + * Internally, the filter coefficients b[n] are updated on a sample-by-sample basis. + * The convergence of the LMS filter is slower compared to the normalized LMS algorithm. + * + * \par Algorithm: + * The output signal y[n] is computed by a standard FIR filter: + *
+ *     y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
+ * 
+ * + * \par + * The error signal equals the difference between the reference signal d[n] and the filter output: + *
+ *     e[n] = d[n] - y[n].
+ * 
+ * + * \par + * After each sample of the error signal is computed, the filter coefficients b[k] are updated on a sample-by-sample basis: + *
+ *     b[k] = b[k] + e[n] * mu * x[n-k],  for k=0, 1, ..., numTaps-1
+ * 
+ * where mu is the step size and controls the rate of coefficient convergence. + *\par + * In the APIs, pCoeffs points to a coefficient array of size numTaps. + * Coefficients are stored in time reversed order. + * \par + *
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+ * 
+ * \par + * pState points to a state array of size numTaps + blockSize - 1. + * Samples in the state buffer are stored in the order: + * \par + *
+ *    {x[n-numTaps+1], x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2]....x[0], x[1], ..., x[blockSize-1]}
+ * 
+ * \par + * Note that the length of the state buffer exceeds the length of the coefficient array by blockSize-1 samples. + * The increased state buffer length allows circular addressing, which is traditionally used in FIR filters, + * to be avoided and yields a significant speed improvement. + * The state variables are updated after each block of data is processed. + * \par Instance Structure + * The coefficients and state variables for a filter are stored together in an instance data structure. + * A separate instance structure must be defined for each filter and + * coefficient and state arrays cannot be shared among instances. + * There are separate instance structure declarations for each of the 3 supported data types. + * + * \par Initialization Functions + * There is also an associated initialization function for each data type. + * The initialization function performs the following operations: + * - Sets the values of the internal structure fields. + * - Zeros out the values in the state buffer. + * To do this manually without calling the init function, assign the follow subfields of the instance structure: + * numTaps, pCoeffs, mu, postShift (not for f32), pState. Also set all of the values in pState to zero. + * + * \par + * Use of the initialization function is optional. + * However, if the initialization function is used, then the instance structure cannot be placed into a const data section. + * To place an instance structure into a const data section, the instance structure must be manually initialized. + * Set the values in the state buffer to zeros before static initialization. + * The code below statically initializes each of the 3 different data type filter instance structures + *
+ *    arm_lms_instance_f32 S = {numTaps, pState, pCoeffs, mu};
+ *    arm_lms_instance_q31 S = {numTaps, pState, pCoeffs, mu, postShift};
+ *    arm_lms_instance_q15 S = {numTaps, pState, pCoeffs, mu, postShift};
+ * 
+ * where numTaps is the number of filter coefficients in the filter; pState is the address of the state buffer; + * pCoeffs is the address of the coefficient buffer; mu is the step size parameter; and postShift is the shift applied to coefficients. + * + * \par Fixed-Point Behavior: + * Care must be taken when using the Q15 and Q31 versions of the LMS filter. + * The following issues must be considered: + * - Scaling of coefficients + * - Overflow and saturation + * + * \par Scaling of Coefficients: + * Filter coefficients are represented as fractional values and + * coefficients are restricted to lie in the range [-1 +1). + * The fixed-point functions have an additional scaling parameter postShift. + * At the output of the filter's accumulator is a shift register which shifts the result by postShift bits. + * This essentially scales the filter coefficients by 2^postShift and + * allows the filter coefficients to exceed the range [+1 -1). + * The value of postShift is set by the user based on the expected gain through the system being modeled. + * + * \par Overflow and Saturation: + * Overflow and saturation behavior of the fixed-point Q15 and Q31 versions are + * described separately as part of the function specific documentation below. + */ + +/** + * @addtogroup LMS + * @{ + */ + +/** + * @details + * This function operates on floating-point data types. + * + * @brief Processing function for floating-point LMS filter. + * @param[in] *S points to an instance of the floating-point LMS filter structure. + * @param[in] *pSrc points to the block of input data. + * @param[in] *pRef points to the block of reference data. + * @param[out] *pOut points to the block of output data. + * @param[out] *pErr points to the block of error data. + * @param[in] blockSize number of samples to process. + * @return none. + */ + +void arm_lms_f32( + const arm_lms_instance_f32 * S, + float32_t * pSrc, + float32_t * pRef, + float32_t * pOut, + float32_t * pErr, + uint32_t blockSize) +{ + float32_t *pState = S->pState; /* State pointer */ + float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float32_t *pStateCurnt; /* Points to the current sample of the state */ + float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */ + float32_t mu = S->mu; /* Adaptive factor */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t tapCnt, blkCnt; /* Loop counters */ + float32_t sum, e, d; /* accumulator, error, reference data sample */ + float32_t w = 0.0f; /* weight factor */ + + e = 0.0f; + d = 0.0f; + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1U)]); + + blkCnt = blockSize; + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + while (blkCnt > 0U) + { + /* Copy the new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Initialize pState pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = (pCoeffs); + + /* Set the accumulator to zero */ + sum = 0.0f; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + sum += (*px++) * (*pb++); + sum += (*px++) * (*pb++); + sum += (*px++) * (*pb++); + sum += (*px++) * (*pb++); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4U; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + sum += (*px++) * (*pb++); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* The result in the accumulator, store in the destination buffer. */ + *pOut++ = sum; + + /* Compute and store error */ + d = (float32_t) (*pRef++); + e = d - sum; + *pErr++ = e; + + /* Calculation of Weighting factor for the updating filter coefficients */ + w = e * mu; + + /* Initialize pState pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = (pCoeffs); + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Update filter coefficients */ + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + *pb = *pb + (w * (*px++)); + pb++; + + *pb = *pb + (w * (*px++)); + pb++; + + *pb = *pb + (w * (*px++)); + pb++; + + *pb = *pb + (w * (*px++)); + pb++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4U; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + *pb = *pb + (w * (*px++)); + pb++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + + /* Processing is complete. Now copy the last numTaps - 1 samples to the + satrt of the state buffer. This prepares the state buffer for the + next function call. */ + + /* Points to the start of the pState buffer */ + pStateCurnt = S->pState; + + /* Loop unrolling for (numTaps - 1U) samples copy */ + tapCnt = (numTaps - 1U) >> 2U; + + /* copy data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calculate remaining number of copies */ + tapCnt = (numTaps - 1U) % 0x4U; + + /* Copy the remaining q31_t data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + while (blkCnt > 0U) + { + /* Copy the new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Initialize pState pointer */ + px = pState; + + /* Initialize pCoeffs pointer */ + pb = pCoeffs; + + /* Set the accumulator to zero */ + sum = 0.0f; + + /* Loop over numTaps number of values */ + tapCnt = numTaps; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + sum += (*px++) * (*pb++); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* The result is stored in the destination buffer. */ + *pOut++ = sum; + + /* Compute and store error */ + d = (float32_t) (*pRef++); + e = d - sum; + *pErr++ = e; + + /* Weighting factor for the LMS version */ + w = e * mu; + + /* Initialize pState pointer */ + px = pState; + + /* Initialize pCoeffs pointer */ + pb = pCoeffs; + + /* Loop over numTaps number of values */ + tapCnt = numTaps; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + *pb = *pb + (w * (*px++)); + pb++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + + /* Processing is complete. Now copy the last numTaps - 1 samples to the + * start of the state buffer. This prepares the state buffer for the + * next function call. */ + + /* Points to the start of the pState buffer */ + pStateCurnt = S->pState; + + /* Copy (numTaps - 1U) samples */ + tapCnt = (numTaps - 1U); + + /* Copy the data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + +/** + * @} end of LMS group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_f32.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_f32.c new file mode 100644 index 0000000..73158bb --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_f32.c @@ -0,0 +1,83 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_lms_init_f32.c + * Description: Floating-point LMS filter initialization function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @addtogroup LMS + * @{ + */ + + /** + * @brief Initialization function for floating-point LMS filter. + * @param[in] *S points to an instance of the floating-point LMS filter structure. + * @param[in] numTaps number of filter coefficients. + * @param[in] *pCoeffs points to the coefficient buffer. + * @param[in] *pState points to state buffer. + * @param[in] mu step size that controls filter coefficient updates. + * @param[in] blockSize number of samples to process. + * @return none. + */ + +/** + * \par Description: + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+ * 
+ * The initial filter coefficients serve as a starting point for the adaptive filter. + * pState points to an array of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_lms_f32(). + */ + +void arm_lms_init_f32( + arm_lms_instance_f32 * S, + uint16_t numTaps, + float32_t * pCoeffs, + float32_t * pState, + float32_t mu, + uint32_t blockSize) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always blockSize + numTaps */ + memset(pState, 0, (numTaps + (blockSize - 1)) * sizeof(float32_t)); + + /* Assign state pointer */ + S->pState = pState; + + /* Assign Step size value */ + S->mu = mu; +} + +/** + * @} end of LMS group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_q15.c new file mode 100644 index 0000000..001287d --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_q15.c @@ -0,0 +1,93 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_lms_init_q15.c + * Description: Q15 LMS filter initialization function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup LMS + * @{ + */ + +/** +* @brief Initialization function for the Q15 LMS filter. +* @param[in] *S points to an instance of the Q15 LMS filter structure. +* @param[in] numTaps number of filter coefficients. +* @param[in] *pCoeffs points to the coefficient buffer. +* @param[in] *pState points to the state buffer. +* @param[in] mu step size that controls filter coefficient updates. +* @param[in] blockSize number of samples to process. +* @param[in] postShift bit shift applied to coefficients. +* @return none. +* +* \par Description: +* pCoeffs points to the array of filter coefficients stored in time reversed order: +*
+*    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+* 
+* The initial filter coefficients serve as a starting point for the adaptive filter. +* pState points to the array of state variables and size of array is +* numTaps+blockSize-1 samples, where blockSize is the number of +* input samples processed by each call to arm_lms_q15(). +*/ + +void arm_lms_init_q15( + arm_lms_instance_q15 * S, + uint16_t numTaps, + q15_t * pCoeffs, + q15_t * pState, + q15_t mu, + uint32_t blockSize, + uint32_t postShift) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always blockSize + numTaps - 1 */ + memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(q15_t)); + + /* Assign state pointer */ + S->pState = pState; + + /* Assign Step size value */ + S->mu = mu; + + /* Assign postShift value to be applied */ + S->postShift = postShift; + +} + +/** + * @} end of LMS group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_q31.c new file mode 100644 index 0000000..7d95d97 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_q31.c @@ -0,0 +1,93 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_lms_init_q31.c + * Description: Q31 LMS filter initialization function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup LMS + * @{ + */ + + /** + * @brief Initialization function for Q31 LMS filter. + * @param[in] *S points to an instance of the Q31 LMS filter structure. + * @param[in] numTaps number of filter coefficients. + * @param[in] *pCoeffs points to coefficient buffer. + * @param[in] *pState points to state buffer. + * @param[in] mu step size that controls filter coefficient updates. + * @param[in] blockSize number of samples to process. + * @param[in] postShift bit shift applied to coefficients. + * @return none. + * + * \par Description: + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+ * 
+ * The initial filter coefficients serve as a starting point for the adaptive filter. + * pState points to an array of length numTaps+blockSize-1 samples, + * where blockSize is the number of input samples processed by each call to + * arm_lms_q31(). + */ + +void arm_lms_init_q31( + arm_lms_instance_q31 * S, + uint16_t numTaps, + q31_t * pCoeffs, + q31_t * pState, + q31_t mu, + uint32_t blockSize, + uint32_t postShift) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always blockSize + numTaps - 1 */ + memset(pState, 0, ((uint32_t) numTaps + (blockSize - 1U)) * sizeof(q31_t)); + + /* Assign state pointer */ + S->pState = pState; + + /* Assign Step size value */ + S->mu = mu; + + /* Assign postShift value to be applied */ + S->postShift = postShift; + +} + +/** + * @} end of LMS group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_f32.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_f32.c new file mode 100644 index 0000000..a365b33 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_f32.c @@ -0,0 +1,454 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_lms_norm_f32.c + * Description: Processing function for the floating-point Normalised LMS + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup LMS_NORM Normalized LMS Filters + * + * This set of functions implements a commonly used adaptive filter. + * It is related to the Least Mean Square (LMS) adaptive filter and includes an additional normalization + * factor which increases the adaptation rate of the filter. + * The CMSIS DSP Library contains normalized LMS filter functions that operate on Q15, Q31, and floating-point data types. + * + * A normalized least mean square (NLMS) filter consists of two components as shown below. + * The first component is a standard transversal or FIR filter. + * The second component is a coefficient update mechanism. + * The NLMS filter has two input signals. + * The "input" feeds the FIR filter while the "reference input" corresponds to the desired output of the FIR filter. + * That is, the FIR filter coefficients are updated so that the output of the FIR filter matches the reference input. + * The filter coefficient update mechanism is based on the difference between the FIR filter output and the reference input. + * This "error signal" tends towards zero as the filter adapts. + * The NLMS processing functions accept the input and reference input signals and generate the filter output and error signal. + * \image html LMS.gif "Internal structure of the NLMS adaptive filter" + * + * The functions operate on blocks of data and each call to the function processes + * blockSize samples through the filter. + * pSrc points to input signal, pRef points to reference signal, + * pOut points to output signal and pErr points to error signal. + * All arrays contain blockSize values. + * + * The functions operate on a block-by-block basis. + * Internally, the filter coefficients b[n] are updated on a sample-by-sample basis. + * The convergence of the LMS filter is slower compared to the normalized LMS algorithm. + * + * \par Algorithm: + * The output signal y[n] is computed by a standard FIR filter: + *
+ *     y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
+ * 
+ * + * \par + * The error signal equals the difference between the reference signal d[n] and the filter output: + *
+ *     e[n] = d[n] - y[n].
+ * 
+ * + * \par + * After each sample of the error signal is computed the instanteous energy of the filter state variables is calculated: + *
+ *    E = x[n]^2 + x[n-1]^2 + ... + x[n-numTaps+1]^2.
+ * 
+ * The filter coefficients b[k] are then updated on a sample-by-sample basis: + *
+ *     b[k] = b[k] + e[n] * (mu/E) * x[n-k],  for k=0, 1, ..., numTaps-1
+ * 
+ * where mu is the step size and controls the rate of coefficient convergence. + *\par + * In the APIs, pCoeffs points to a coefficient array of size numTaps. + * Coefficients are stored in time reversed order. + * \par + *
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+ * 
+ * \par + * pState points to a state array of size numTaps + blockSize - 1. + * Samples in the state buffer are stored in the order: + * \par + *
+ *    {x[n-numTaps+1], x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2]....x[0], x[1], ..., x[blockSize-1]}
+ * 
+ * \par + * Note that the length of the state buffer exceeds the length of the coefficient array by blockSize-1 samples. + * The increased state buffer length allows circular addressing, which is traditionally used in FIR filters, + * to be avoided and yields a significant speed improvement. + * The state variables are updated after each block of data is processed. + * \par Instance Structure + * The coefficients and state variables for a filter are stored together in an instance data structure. + * A separate instance structure must be defined for each filter and + * coefficient and state arrays cannot be shared among instances. + * There are separate instance structure declarations for each of the 3 supported data types. + * + * \par Initialization Functions + * There is also an associated initialization function for each data type. + * The initialization function performs the following operations: + * - Sets the values of the internal structure fields. + * - Zeros out the values in the state buffer. + * To do this manually without calling the init function, assign the follow subfields of the instance structure: + * numTaps, pCoeffs, mu, energy, x0, pState. Also set all of the values in pState to zero. + * For Q7, Q15, and Q31 the following fields must also be initialized; + * recipTable, postShift + * + * \par + * Instance structure cannot be placed into a const data section and it is recommended to use the initialization function. + * \par Fixed-Point Behavior: + * Care must be taken when using the Q15 and Q31 versions of the normalised LMS filter. + * The following issues must be considered: + * - Scaling of coefficients + * - Overflow and saturation + * + * \par Scaling of Coefficients: + * Filter coefficients are represented as fractional values and + * coefficients are restricted to lie in the range [-1 +1). + * The fixed-point functions have an additional scaling parameter postShift. + * At the output of the filter's accumulator is a shift register which shifts the result by postShift bits. + * This essentially scales the filter coefficients by 2^postShift and + * allows the filter coefficients to exceed the range [+1 -1). + * The value of postShift is set by the user based on the expected gain through the system being modeled. + * + * \par Overflow and Saturation: + * Overflow and saturation behavior of the fixed-point Q15 and Q31 versions are + * described separately as part of the function specific documentation below. + */ + + +/** + * @addtogroup LMS_NORM + * @{ + */ + + + /** + * @brief Processing function for floating-point normalized LMS filter. + * @param[in] *S points to an instance of the floating-point normalized LMS filter structure. + * @param[in] *pSrc points to the block of input data. + * @param[in] *pRef points to the block of reference data. + * @param[out] *pOut points to the block of output data. + * @param[out] *pErr points to the block of error data. + * @param[in] blockSize number of samples to process. + * @return none. + */ + +void arm_lms_norm_f32( + arm_lms_norm_instance_f32 * S, + float32_t * pSrc, + float32_t * pRef, + float32_t * pOut, + float32_t * pErr, + uint32_t blockSize) +{ + float32_t *pState = S->pState; /* State pointer */ + float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float32_t *pStateCurnt; /* Points to the current sample of the state */ + float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */ + float32_t mu = S->mu; /* Adaptive factor */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t tapCnt, blkCnt; /* Loop counters */ + float32_t energy; /* Energy of the input */ + float32_t sum, e, d; /* accumulator, error, reference data sample */ + float32_t w, x0, in; /* weight factor, temporary variable to hold input sample and state */ + + /* Initializations of error, difference, Coefficient update */ + e = 0.0f; + d = 0.0f; + w = 0.0f; + + energy = S->energy; + x0 = S->x0; + + /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1U)]); + + /* Loop over blockSize number of values */ + blkCnt = blockSize; + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + while (blkCnt > 0U) + { + /* Copy the new input sample into the state buffer */ + *pStateCurnt++ = *pSrc; + + /* Initialize pState pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = (pCoeffs); + + /* Read the sample from input buffer */ + in = *pSrc++; + + /* Update the energy calculation */ + energy -= x0 * x0; + energy += in * in; + + /* Set the accumulator to zero */ + sum = 0.0f; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + sum += (*px++) * (*pb++); + sum += (*px++) * (*pb++); + sum += (*px++) * (*pb++); + sum += (*px++) * (*pb++); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4U; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + sum += (*px++) * (*pb++); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* The result in the accumulator, store in the destination buffer. */ + *pOut++ = sum; + + /* Compute and store error */ + d = (float32_t) (*pRef++); + e = d - sum; + *pErr++ = e; + + /* Calculation of Weighting factor for updating filter coefficients */ + /* epsilon value 0.000000119209289f */ + w = (e * mu) / (energy + 0.000000119209289f); + + /* Initialize pState pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = (pCoeffs); + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Update filter coefficients */ + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + *pb += w * (*px++); + pb++; + + *pb += w * (*px++); + pb++; + + *pb += w * (*px++); + pb++; + + *pb += w * (*px++); + pb++; + + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4U; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + *pb += w * (*px++); + pb++; + + /* Decrement the loop counter */ + tapCnt--; + } + + x0 = *pState; + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + S->energy = energy; + S->x0 = x0; + + /* Processing is complete. Now copy the last numTaps - 1 samples to the + satrt of the state buffer. This prepares the state buffer for the + next function call. */ + + /* Points to the start of the pState buffer */ + pStateCurnt = S->pState; + + /* Loop unrolling for (numTaps - 1U)/4 samples copy */ + tapCnt = (numTaps - 1U) >> 2U; + + /* copy data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calculate remaining number of copies */ + tapCnt = (numTaps - 1U) % 0x4U; + + /* Copy the remaining q31_t data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + while (blkCnt > 0U) + { + /* Copy the new input sample into the state buffer */ + *pStateCurnt++ = *pSrc; + + /* Initialize pState pointer */ + px = pState; + + /* Initialize pCoeffs pointer */ + pb = pCoeffs; + + /* Read the sample from input buffer */ + in = *pSrc++; + + /* Update the energy calculation */ + energy -= x0 * x0; + energy += in * in; + + /* Set the accumulator to zero */ + sum = 0.0f; + + /* Loop over numTaps number of values */ + tapCnt = numTaps; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + sum += (*px++) * (*pb++); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* The result in the accumulator is stored in the destination buffer. */ + *pOut++ = sum; + + /* Compute and store error */ + d = (float32_t) (*pRef++); + e = d - sum; + *pErr++ = e; + + /* Calculation of Weighting factor for updating filter coefficients */ + /* epsilon value 0.000000119209289f */ + w = (e * mu) / (energy + 0.000000119209289f); + + /* Initialize pState pointer */ + px = pState; + + /* Initialize pCcoeffs pointer */ + pb = pCoeffs; + + /* Loop over numTaps number of values */ + tapCnt = numTaps; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + *pb += w * (*px++); + pb++; + + /* Decrement the loop counter */ + tapCnt--; + } + + x0 = *pState; + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + S->energy = energy; + S->x0 = x0; + + /* Processing is complete. Now copy the last numTaps - 1 samples to the + satrt of the state buffer. This prepares the state buffer for the + next function call. */ + + /* Points to the start of the pState buffer */ + pStateCurnt = S->pState; + + /* Copy (numTaps - 1U) samples */ + tapCnt = (numTaps - 1U); + + /* Copy the remaining q31_t data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + +/** + * @} end of LMS_NORM group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_f32.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_f32.c new file mode 100644 index 0000000..49272f8 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_f32.c @@ -0,0 +1,93 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_lms_norm_init_f32.c + * Description: Floating-point NLMS filter initialization function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup LMS_NORM + * @{ + */ + + /** + * @brief Initialization function for floating-point normalized LMS filter. + * @param[in] *S points to an instance of the floating-point LMS filter structure. + * @param[in] numTaps number of filter coefficients. + * @param[in] *pCoeffs points to coefficient buffer. + * @param[in] *pState points to state buffer. + * @param[in] mu step size that controls filter coefficient updates. + * @param[in] blockSize number of samples to process. + * @return none. + * + * \par Description: + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+ * 
+ * The initial filter coefficients serve as a starting point for the adaptive filter. + * pState points to an array of length numTaps+blockSize-1 samples, + * where blockSize is the number of input samples processed by each call to arm_lms_norm_f32(). + */ + +void arm_lms_norm_init_f32( + arm_lms_norm_instance_f32 * S, + uint16_t numTaps, + float32_t * pCoeffs, + float32_t * pState, + float32_t mu, + uint32_t blockSize) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always blockSize + numTaps - 1 */ + memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float32_t)); + + /* Assign state pointer */ + S->pState = pState; + + /* Assign Step size value */ + S->mu = mu; + + /* Initialise Energy to zero */ + S->energy = 0.0f; + + /* Initialise x0 to zero */ + S->x0 = 0.0f; + +} + +/** + * @} end of LMS_NORM group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_q15.c new file mode 100644 index 0000000..0624222 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_q15.c @@ -0,0 +1,100 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_lms_norm_init_q15.c + * Description: Q15 NLMS initialization function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" +#include "arm_common_tables.h" + +/** + * @addtogroup LMS_NORM + * @{ + */ + + /** + * @brief Initialization function for Q15 normalized LMS filter. + * @param[in] *S points to an instance of the Q15 normalized LMS filter structure. + * @param[in] numTaps number of filter coefficients. + * @param[in] *pCoeffs points to coefficient buffer. + * @param[in] *pState points to state buffer. + * @param[in] mu step size that controls filter coefficient updates. + * @param[in] blockSize number of samples to process. + * @param[in] postShift bit shift applied to coefficients. + * @return none. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+ * 
+ * The initial filter coefficients serve as a starting point for the adaptive filter. + * pState points to the array of state variables and size of array is + * numTaps+blockSize-1 samples, where blockSize is the number of input samples processed + * by each call to arm_lms_norm_q15(). + */ + +void arm_lms_norm_init_q15( + arm_lms_norm_instance_q15 * S, + uint16_t numTaps, + q15_t * pCoeffs, + q15_t * pState, + q15_t mu, + uint32_t blockSize, + uint8_t postShift) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always blockSize + numTaps - 1 */ + memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(q15_t)); + + /* Assign post Shift value applied to coefficients */ + S->postShift = postShift; + + /* Assign state pointer */ + S->pState = pState; + + /* Assign Step size value */ + S->mu = mu; + + /* Initialize reciprocal pointer table */ + S->recipTable = (q15_t *) armRecipTableQ15; + + /* Initialise Energy to zero */ + S->energy = 0; + + /* Initialise x0 to zero */ + S->x0 = 0; + +} + +/** + * @} end of LMS_NORM group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_q31.c new file mode 100644 index 0000000..4f70408 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_q31.c @@ -0,0 +1,99 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_lms_norm_init_q31.c + * Description: Q31 NLMS initialization function + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" +#include "arm_common_tables.h" + +/** + * @addtogroup LMS_NORM + * @{ + */ + + /** + * @brief Initialization function for Q31 normalized LMS filter. + * @param[in] *S points to an instance of the Q31 normalized LMS filter structure. + * @param[in] numTaps number of filter coefficients. + * @param[in] *pCoeffs points to coefficient buffer. + * @param[in] *pState points to state buffer. + * @param[in] mu step size that controls filter coefficient updates. + * @param[in] blockSize number of samples to process. + * @param[in] postShift bit shift applied to coefficients. + * @return none. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+ * 
+ * The initial filter coefficients serve as a starting point for the adaptive filter. + * pState points to an array of length numTaps+blockSize-1 samples, + * where blockSize is the number of input samples processed by each call to arm_lms_norm_q31(). + */ + +void arm_lms_norm_init_q31( + arm_lms_norm_instance_q31 * S, + uint16_t numTaps, + q31_t * pCoeffs, + q31_t * pState, + q31_t mu, + uint32_t blockSize, + uint8_t postShift) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always blockSize + numTaps - 1 */ + memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(q31_t)); + + /* Assign post Shift value applied to coefficients */ + S->postShift = postShift; + + /* Assign state pointer */ + S->pState = pState; + + /* Assign Step size value */ + S->mu = mu; + + /* Initialize reciprocal pointer table */ + S->recipTable = (q31_t *) armRecipTableQ31; + + /* Initialise Energy to zero */ + S->energy = 0; + + /* Initialise x0 to zero */ + S->x0 = 0; + +} + +/** + * @} end of LMS_NORM group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_q15.c new file mode 100644 index 0000000..00bde39 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_q15.c @@ -0,0 +1,428 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_lms_norm_q15.c + * Description: Q15 NLMS filter + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup LMS_NORM + * @{ + */ + +/** +* @brief Processing function for Q15 normalized LMS filter. +* @param[in] *S points to an instance of the Q15 normalized LMS filter structure. +* @param[in] *pSrc points to the block of input data. +* @param[in] *pRef points to the block of reference data. +* @param[out] *pOut points to the block of output data. +* @param[out] *pErr points to the block of error data. +* @param[in] blockSize number of samples to process. +* @return none. +* +* Scaling and Overflow Behavior: +* \par +* The function is implemented using a 64-bit internal accumulator. +* Both coefficients and state variables are represented in 1.15 format and +* multiplications yield a 2.30 result. The 2.30 intermediate results are +* accumulated in a 64-bit accumulator in 34.30 format. +* There is no risk of internal overflow with this approach and the full +* precision of intermediate multiplications is preserved. After all additions +* have been performed, the accumulator is truncated to 34.15 format by +* discarding low 15 bits. Lastly, the accumulator is saturated to yield a +* result in 1.15 format. +* +* \par +* In this filter, filter coefficients are updated for each sample and the updation of filter cofficients are saturted. +* + */ + +void arm_lms_norm_q15( + arm_lms_norm_instance_q15 * S, + q15_t * pSrc, + q15_t * pRef, + q15_t * pOut, + q15_t * pErr, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + q15_t *px, *pb; /* Temporary pointers for state and coefficient buffers */ + q15_t mu = S->mu; /* Adaptive factor */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t tapCnt, blkCnt; /* Loop counters */ + q31_t energy; /* Energy of the input */ + q63_t acc; /* Accumulator */ + q15_t e = 0, d = 0; /* error, reference data sample */ + q15_t w = 0, in; /* weight factor and state */ + q15_t x0; /* temporary variable to hold input sample */ + //uint32_t shift = (uint32_t) S->postShift + 1U; /* Shift to be applied to the output */ + q15_t errorXmu, oneByEnergy; /* Temporary variables to store error and mu product and reciprocal of energy */ + q15_t postShift; /* Post shift to be applied to weight after reciprocal calculation */ + q31_t coef; /* Teporary variable for coefficient */ + q31_t acc_l, acc_h; + int32_t lShift = (15 - (int32_t) S->postShift); /* Post shift */ + int32_t uShift = (32 - lShift); + + energy = S->energy; + x0 = S->x0; + + /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1U)]); + + /* Loop over blockSize number of values */ + blkCnt = blockSize; + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + while (blkCnt > 0U) + { + /* Copy the new input sample into the state buffer */ + *pStateCurnt++ = *pSrc; + + /* Initialize pState pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = (pCoeffs); + + /* Read the sample from input buffer */ + in = *pSrc++; + + /* Update the energy calculation */ + energy -= (((q31_t) x0 * (x0)) >> 15); + energy += (((q31_t) in * (in)) >> 15); + + /* Set the accumulator to zero */ + acc = 0; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + while (tapCnt > 0U) + { + + /* Perform the multiply-accumulate */ +#ifndef UNALIGNED_SUPPORT_DISABLE + + acc = __SMLALD(*__SIMD32(px)++, (*__SIMD32(pb)++), acc); + acc = __SMLALD(*__SIMD32(px)++, (*__SIMD32(pb)++), acc); + +#else + + acc += (((q31_t) * px++ * (*pb++))); + acc += (((q31_t) * px++ * (*pb++))); + acc += (((q31_t) * px++ * (*pb++))); + acc += (((q31_t) * px++ * (*pb++))); + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4U; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + acc += (((q31_t) * px++ * (*pb++))); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + acc = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Converting the result to 1.15 format and saturate the output */ + acc = __SSAT(acc, 16U); + + /* Store the result from accumulator into the destination buffer. */ + *pOut++ = (q15_t) acc; + + /* Compute and store error */ + d = *pRef++; + e = d - (q15_t) acc; + *pErr++ = e; + + /* Calculation of 1/energy */ + postShift = arm_recip_q15((q15_t) energy + DELTA_Q15, + &oneByEnergy, S->recipTable); + + /* Calculation of e * mu value */ + errorXmu = (q15_t) (((q31_t) e * mu) >> 15); + + /* Calculation of (e * mu) * (1/energy) value */ + acc = (((q31_t) errorXmu * oneByEnergy) >> (15 - postShift)); + + /* Weighting factor for the normalized version */ + w = (q15_t) __SSAT((q31_t) acc, 16); + + /* Initialize pState pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = (pCoeffs); + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Update filter coefficients */ + while (tapCnt > 0U) + { + coef = *pb + (((q31_t) w * (*px++)) >> 15); + *pb++ = (q15_t) __SSAT((coef), 16); + coef = *pb + (((q31_t) w * (*px++)) >> 15); + *pb++ = (q15_t) __SSAT((coef), 16); + coef = *pb + (((q31_t) w * (*px++)) >> 15); + *pb++ = (q15_t) __SSAT((coef), 16); + coef = *pb + (((q31_t) w * (*px++)) >> 15); + *pb++ = (q15_t) __SSAT((coef), 16); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4U; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + coef = *pb + (((q31_t) w * (*px++)) >> 15); + *pb++ = (q15_t) __SSAT((coef), 16); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Read the sample from state buffer */ + x0 = *pState; + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1U; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Save energy and x0 values for the next frame */ + S->energy = (q15_t) energy; + S->x0 = x0; + + /* Processing is complete. Now copy the last numTaps - 1 samples to the + satrt of the state buffer. This prepares the state buffer for the + next function call. */ + + /* Points to the start of the pState buffer */ + pStateCurnt = S->pState; + + /* Calculation of count for copying integer writes */ + tapCnt = (numTaps - 1U) >> 2; + + while (tapCnt > 0U) + { + +#ifndef UNALIGNED_SUPPORT_DISABLE + + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + +#else + + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + +#endif + + tapCnt--; + + } + + /* Calculation of count for remaining q15_t data */ + tapCnt = (numTaps - 1U) % 0x4U; + + /* copy data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + while (blkCnt > 0U) + { + /* Copy the new input sample into the state buffer */ + *pStateCurnt++ = *pSrc; + + /* Initialize pState pointer */ + px = pState; + + /* Initialize pCoeffs pointer */ + pb = pCoeffs; + + /* Read the sample from input buffer */ + in = *pSrc++; + + /* Update the energy calculation */ + energy -= (((q31_t) x0 * (x0)) >> 15); + energy += (((q31_t) in * (in)) >> 15); + + /* Set the accumulator to zero */ + acc = 0; + + /* Loop over numTaps number of values */ + tapCnt = numTaps; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + acc += (((q31_t) * px++ * (*pb++))); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + acc = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Converting the result to 1.15 format and saturate the output */ + acc = __SSAT(acc, 16U); + + /* Converting the result to 1.15 format */ + //acc = __SSAT((acc >> (16U - shift)), 16U); + + /* Store the result from accumulator into the destination buffer. */ + *pOut++ = (q15_t) acc; + + /* Compute and store error */ + d = *pRef++; + e = d - (q15_t) acc; + *pErr++ = e; + + /* Calculation of 1/energy */ + postShift = arm_recip_q15((q15_t) energy + DELTA_Q15, + &oneByEnergy, S->recipTable); + + /* Calculation of e * mu value */ + errorXmu = (q15_t) (((q31_t) e * mu) >> 15); + + /* Calculation of (e * mu) * (1/energy) value */ + acc = (((q31_t) errorXmu * oneByEnergy) >> (15 - postShift)); + + /* Weighting factor for the normalized version */ + w = (q15_t) __SSAT((q31_t) acc, 16); + + /* Initialize pState pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = (pCoeffs); + + /* Loop over numTaps number of values */ + tapCnt = numTaps; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + coef = *pb + (((q31_t) w * (*px++)) >> 15); + *pb++ = (q15_t) __SSAT((coef), 16); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Read the sample from state buffer */ + x0 = *pState; + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1U; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Save energy and x0 values for the next frame */ + S->energy = (q15_t) energy; + S->x0 = x0; + + /* Processing is complete. Now copy the last numTaps - 1 samples to the + satrt of the state buffer. This prepares the state buffer for the + next function call. */ + + /* Points to the start of the pState buffer */ + pStateCurnt = S->pState; + + /* copy (numTaps - 1U) data */ + tapCnt = (numTaps - 1U); + + /* copy data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + + +/** + * @} end of LMS_NORM group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_q31.c new file mode 100644 index 0000000..bc65fa6 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_q31.c @@ -0,0 +1,419 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_lms_norm_q31.c + * Description: Processing function for the Q31 NLMS filter + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup LMS_NORM + * @{ + */ + +/** +* @brief Processing function for Q31 normalized LMS filter. +* @param[in] *S points to an instance of the Q31 normalized LMS filter structure. +* @param[in] *pSrc points to the block of input data. +* @param[in] *pRef points to the block of reference data. +* @param[out] *pOut points to the block of output data. +* @param[out] *pErr points to the block of error data. +* @param[in] blockSize number of samples to process. +* @return none. +* +* Scaling and Overflow Behavior: +* \par +* The function is implemented using an internal 64-bit accumulator. +* The accumulator has a 2.62 format and maintains full precision of the intermediate +* multiplication results but provides only a single guard bit. +* Thus, if the accumulator result overflows it wraps around rather than clip. +* In order to avoid overflows completely the input signal must be scaled down by +* log2(numTaps) bits. The reference signal should not be scaled down. +* After all multiply-accumulates are performed, the 2.62 accumulator is shifted +* and saturated to 1.31 format to yield the final result. +* The output signal and error signal are in 1.31 format. +* +* \par +* In this filter, filter coefficients are updated for each sample and the +* updation of filter cofficients are saturted. +* +*/ + +void arm_lms_norm_q31( + arm_lms_norm_instance_q31 * S, + q31_t * pSrc, + q31_t * pRef, + q31_t * pOut, + q31_t * pErr, + uint32_t blockSize) +{ + q31_t *pState = S->pState; /* State pointer */ + q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q31_t *pStateCurnt; /* Points to the current sample of the state */ + q31_t *px, *pb; /* Temporary pointers for state and coefficient buffers */ + q31_t mu = S->mu; /* Adaptive factor */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t tapCnt, blkCnt; /* Loop counters */ + q63_t energy; /* Energy of the input */ + q63_t acc; /* Accumulator */ + q31_t e = 0, d = 0; /* error, reference data sample */ + q31_t w = 0, in; /* weight factor and state */ + q31_t x0; /* temporary variable to hold input sample */ +// uint32_t shift = 32U - ((uint32_t) S->postShift + 1U); /* Shift to be applied to the output */ + q31_t errorXmu, oneByEnergy; /* Temporary variables to store error and mu product and reciprocal of energy */ + q31_t postShift; /* Post shift to be applied to weight after reciprocal calculation */ + q31_t coef; /* Temporary variable for coef */ + q31_t acc_l, acc_h; /* temporary input */ + uint32_t uShift = ((uint32_t) S->postShift + 1U); + uint32_t lShift = 32U - uShift; /* Shift to be applied to the output */ + + energy = S->energy; + x0 = S->x0; + + /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1U)]); + + /* Loop over blockSize number of values */ + blkCnt = blockSize; + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + while (blkCnt > 0U) + { + + /* Copy the new input sample into the state buffer */ + *pStateCurnt++ = *pSrc; + + /* Initialize pState pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = (pCoeffs); + + /* Read the sample from input buffer */ + in = *pSrc++; + + /* Update the energy calculation */ + energy = (q31_t) ((((q63_t) energy << 32) - + (((q63_t) x0 * x0) << 1)) >> 32); + energy = (q31_t) (((((q63_t) in * in) << 1) + (energy << 32)) >> 32); + + /* Set the accumulator to zero */ + acc = 0; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + acc += ((q63_t) (*px++)) * (*pb++); + acc += ((q63_t) (*px++)) * (*pb++); + acc += ((q63_t) (*px++)) * (*pb++); + acc += ((q63_t) (*px++)) * (*pb++); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4U; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + acc += ((q63_t) (*px++)) * (*pb++); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Converting the result to 1.31 format */ + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + acc = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Store the result from accumulator into the destination buffer. */ + *pOut++ = (q31_t) acc; + + /* Compute and store error */ + d = *pRef++; + e = d - (q31_t) acc; + *pErr++ = e; + + /* Calculates the reciprocal of energy */ + postShift = arm_recip_q31(energy + DELTA_Q31, + &oneByEnergy, &S->recipTable[0]); + + /* Calculation of product of (e * mu) */ + errorXmu = (q31_t) (((q63_t) e * mu) >> 31); + + /* Weighting factor for the normalized version */ + w = clip_q63_to_q31(((q63_t) errorXmu * oneByEnergy) >> (31 - postShift)); + + /* Initialize pState pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = (pCoeffs); + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Update filter coefficients */ + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + + /* coef is in 2.30 format */ + coef = (q31_t) (((q63_t) w * (*px++)) >> (32)); + /* get coef in 1.31 format by left shifting */ + *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U)); + /* update coefficient buffer to next coefficient */ + pb++; + + coef = (q31_t) (((q63_t) w * (*px++)) >> (32)); + *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U)); + pb++; + + coef = (q31_t) (((q63_t) w * (*px++)) >> (32)); + *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U)); + pb++; + + coef = (q31_t) (((q63_t) w * (*px++)) >> (32)); + *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U)); + pb++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4U; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + coef = (q31_t) (((q63_t) w * (*px++)) >> (32)); + *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U)); + pb++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Read the sample from state buffer */ + x0 = *pState; + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Save energy and x0 values for the next frame */ + S->energy = (q31_t) energy; + S->x0 = x0; + + /* Processing is complete. Now copy the last numTaps - 1 samples to the + satrt of the state buffer. This prepares the state buffer for the + next function call. */ + + /* Points to the start of the pState buffer */ + pStateCurnt = S->pState; + + /* Loop unrolling for (numTaps - 1U) samples copy */ + tapCnt = (numTaps - 1U) >> 2U; + + /* copy data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calculate remaining number of copies */ + tapCnt = (numTaps - 1U) % 0x4U; + + /* Copy the remaining q31_t data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + while (blkCnt > 0U) + { + + /* Copy the new input sample into the state buffer */ + *pStateCurnt++ = *pSrc; + + /* Initialize pState pointer */ + px = pState; + + /* Initialize pCoeffs pointer */ + pb = pCoeffs; + + /* Read the sample from input buffer */ + in = *pSrc++; + + /* Update the energy calculation */ + energy = + (q31_t) ((((q63_t) energy << 32) - (((q63_t) x0 * x0) << 1)) >> 32); + energy = (q31_t) (((((q63_t) in * in) << 1) + (energy << 32)) >> 32); + + /* Set the accumulator to zero */ + acc = 0; + + /* Loop over numTaps number of values */ + tapCnt = numTaps; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + acc += ((q63_t) (*px++)) * (*pb++); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Converting the result to 1.31 format */ + /* Converting the result to 1.31 format */ + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + acc = (uint32_t) acc_l >> lShift | acc_h << uShift; + + + //acc = (q31_t) (acc >> shift); + + /* Store the result from accumulator into the destination buffer. */ + *pOut++ = (q31_t) acc; + + /* Compute and store error */ + d = *pRef++; + e = d - (q31_t) acc; + *pErr++ = e; + + /* Calculates the reciprocal of energy */ + postShift = + arm_recip_q31(energy + DELTA_Q31, &oneByEnergy, &S->recipTable[0]); + + /* Calculation of product of (e * mu) */ + errorXmu = (q31_t) (((q63_t) e * mu) >> 31); + + /* Weighting factor for the normalized version */ + w = clip_q63_to_q31(((q63_t) errorXmu * oneByEnergy) >> (31 - postShift)); + + /* Initialize pState pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = (pCoeffs); + + /* Loop over numTaps number of values */ + tapCnt = numTaps; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + /* coef is in 2.30 format */ + coef = (q31_t) (((q63_t) w * (*px++)) >> (32)); + /* get coef in 1.31 format by left shifting */ + *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U)); + /* update coefficient buffer to next coefficient */ + pb++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Read the sample from state buffer */ + x0 = *pState; + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Save energy and x0 values for the next frame */ + S->energy = (q31_t) energy; + S->x0 = x0; + + /* Processing is complete. Now copy the last numTaps - 1 samples to the + start of the state buffer. This prepares the state buffer for the + next function call. */ + + /* Points to the start of the pState buffer */ + pStateCurnt = S->pState; + + /* Loop for (numTaps - 1U) samples copy */ + tapCnt = (numTaps - 1U); + + /* Copy the remaining q31_t data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + +/** + * @} end of LMS_NORM group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_q15.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_q15.c new file mode 100644 index 0000000..8d5226e --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_q15.c @@ -0,0 +1,368 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_lms_q15.c + * Description: Processing function for the Q15 LMS filter + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup LMS + * @{ + */ + + /** + * @brief Processing function for Q15 LMS filter. + * @param[in] *S points to an instance of the Q15 LMS filter structure. + * @param[in] *pSrc points to the block of input data. + * @param[in] *pRef points to the block of reference data. + * @param[out] *pOut points to the block of output data. + * @param[out] *pErr points to the block of error data. + * @param[in] blockSize number of samples to process. + * @return none. + * + * \par Scaling and Overflow Behavior: + * The function is implemented using a 64-bit internal accumulator. + * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result. + * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. + * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved. + * After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits. + * Lastly, the accumulator is saturated to yield a result in 1.15 format. + * + * \par + * In this filter, filter coefficients are updated for each sample and the updation of filter cofficients are saturted. + * + */ + +void arm_lms_q15( + const arm_lms_instance_q15 * S, + q15_t * pSrc, + q15_t * pRef, + q15_t * pOut, + q15_t * pErr, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + q15_t mu = S->mu; /* Adaptive factor */ + q15_t *px; /* Temporary pointer for state */ + q15_t *pb; /* Temporary pointer for coefficient buffer */ + uint32_t tapCnt, blkCnt; /* Loop counters */ + q63_t acc; /* Accumulator */ + q15_t e = 0; /* error of data sample */ + q15_t alpha; /* Intermediate constant for taps update */ + q31_t coef; /* Teporary variable for coefficient */ + q31_t acc_l, acc_h; + int32_t lShift = (15 - (int32_t) S->postShift); /* Post shift */ + int32_t uShift = (32 - lShift); + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + + /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1U)]); + + /* Initializing blkCnt with blockSize */ + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* Copy the new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coefficient pointer */ + pb = pCoeffs; + + /* Set the accumulator to zero */ + acc = 0; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2U; + + while (tapCnt > 0U) + { + /* acc += b[N] * x[n-N] + b[N-1] * x[n-N-1] */ + /* Perform the multiply-accumulate */ +#ifndef UNALIGNED_SUPPORT_DISABLE + + acc = __SMLALD(*__SIMD32(px)++, (*__SIMD32(pb)++), acc); + acc = __SMLALD(*__SIMD32(px)++, (*__SIMD32(pb)++), acc); + +#else + + acc += (q63_t) (((q31_t) (*px++) * (*pb++))); + acc += (q63_t) (((q31_t) (*px++) * (*pb++))); + acc += (q63_t) (((q31_t) (*px++) * (*pb++))); + acc += (q63_t) (((q31_t) (*px++) * (*pb++))); + + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4U; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + acc += (q63_t) (((q31_t) (*px++) * (*pb++))); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + acc = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Converting the result to 1.15 format and saturate the output */ + acc = __SSAT(acc, 16); + + /* Store the result from accumulator into the destination buffer. */ + *pOut++ = (q15_t) acc; + + /* Compute and store error */ + e = *pRef++ - (q15_t) acc; + + *pErr++ = (q15_t) e; + + /* Compute alpha i.e. intermediate constant for taps update */ + alpha = (q15_t) (((q31_t) e * (mu)) >> 15); + + /* Initialize state pointer */ + /* Advance state pointer by 1 for the next sample */ + px = pState++; + + /* Initialize coefficient pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2U; + + /* Update filter coefficients */ + while (tapCnt > 0U) + { + coef = (q31_t) * pb + (((q31_t) alpha * (*px++)) >> 15); + *pb++ = (q15_t) __SSAT((coef), 16); + coef = (q31_t) * pb + (((q31_t) alpha * (*px++)) >> 15); + *pb++ = (q15_t) __SSAT((coef), 16); + coef = (q31_t) * pb + (((q31_t) alpha * (*px++)) >> 15); + *pb++ = (q15_t) __SSAT((coef), 16); + coef = (q31_t) * pb + (((q31_t) alpha * (*px++)) >> 15); + *pb++ = (q15_t) __SSAT((coef), 16); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4U; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + coef = (q31_t) * pb + (((q31_t) alpha * (*px++)) >> 15); + *pb++ = (q15_t) __SSAT((coef), 16); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Decrement the loop counter */ + blkCnt--; + + } + + /* Processing is complete. Now copy the last numTaps - 1 samples to the + satrt of the state buffer. This prepares the state buffer for the + next function call. */ + + /* Points to the start of the pState buffer */ + pStateCurnt = S->pState; + + /* Calculation of count for copying integer writes */ + tapCnt = (numTaps - 1U) >> 2; + + while (tapCnt > 0U) + { + +#ifndef UNALIGNED_SUPPORT_DISABLE + + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; +#else + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; +#endif + + tapCnt--; + + } + + /* Calculation of count for remaining q15_t data */ + tapCnt = (numTaps - 1U) % 0x4U; + + /* copy data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1U)]); + + /* Loop over blockSize number of values */ + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* Copy the new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Initialize pState pointer */ + px = pState; + + /* Initialize pCoeffs pointer */ + pb = pCoeffs; + + /* Set the accumulator to zero */ + acc = 0; + + /* Loop over numTaps number of values */ + tapCnt = numTaps; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + acc += (q63_t) ((q31_t) (*px++) * (*pb++)); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + /* Apply shift for lower part of acc and upper part of acc */ + acc = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Converting the result to 1.15 format and saturate the output */ + acc = __SSAT(acc, 16); + + /* Store the result from accumulator into the destination buffer. */ + *pOut++ = (q15_t) acc; + + /* Compute and store error */ + e = *pRef++ - (q15_t) acc; + + *pErr++ = (q15_t) e; + + /* Compute alpha i.e. intermediate constant for taps update */ + alpha = (q15_t) (((q31_t) e * (mu)) >> 15); + + /* Initialize pState pointer */ + /* Advance state pointer by 1 for the next sample */ + px = pState++; + + /* Initialize pCoeffs pointer */ + pb = pCoeffs; + + /* Loop over numTaps number of values */ + tapCnt = numTaps; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + coef = (q31_t) * pb + (((q31_t) alpha * (*px++)) >> 15); + *pb++ = (q15_t) __SSAT((coef), 16); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Decrement the loop counter */ + blkCnt--; + + } + + /* Processing is complete. Now copy the last numTaps - 1 samples to the + start of the state buffer. This prepares the state buffer for the + next function call. */ + + /* Points to the start of the pState buffer */ + pStateCurnt = S->pState; + + /* Copy (numTaps - 1U) samples */ + tapCnt = (numTaps - 1U); + + /* Copy the data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + +/** + * @} end of LMS group + */ diff --git a/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_q31.c b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_q31.c new file mode 100644 index 0000000..66b2a91 --- /dev/null +++ b/fw/midi-dials/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_lms_q31.c @@ -0,0 +1,357 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_lms_q31.c + * Description: Processing function for the Q31 LMS filter + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_math.h" +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup LMS + * @{ + */ + + /** + * @brief Processing function for Q31 LMS filter. + * @param[in] *S points to an instance of the Q15 LMS filter structure. + * @param[in] *pSrc points to the block of input data. + * @param[in] *pRef points to the block of reference data. + * @param[out] *pOut points to the block of output data. + * @param[out] *pErr points to the block of error data. + * @param[in] blockSize number of samples to process. + * @return none. + * + * \par Scaling and Overflow Behavior: + * The function is implemented using an internal 64-bit accumulator. + * The accumulator has a 2.62 format and maintains full precision of the intermediate + * multiplication results but provides only a single guard bit. + * Thus, if the accumulator result overflows it wraps around rather than clips. + * In order to avoid overflows completely the input signal must be scaled down by + * log2(numTaps) bits. + * The reference signal should not be scaled down. + * After all multiply-accumulates are performed, the 2.62 accumulator is shifted + * and saturated to 1.31 format to yield the final result. + * The output signal and error signal are in 1.31 format. + * + * \par + * In this filter, filter coefficients are updated for each sample and the updation of filter cofficients are saturted. + */ + +void arm_lms_q31( + const arm_lms_instance_q31 * S, + q31_t * pSrc, + q31_t * pRef, + q31_t * pOut, + q31_t * pErr, + uint32_t blockSize) +{ + q31_t *pState = S->pState; /* State pointer */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q31_t *pStateCurnt; /* Points to the current sample of the state */ + q31_t mu = S->mu; /* Adaptive factor */ + q31_t *px; /* Temporary pointer for state */ + q31_t *pb; /* Temporary pointer for coefficient buffer */ + uint32_t tapCnt, blkCnt; /* Loop counters */ + q63_t acc; /* Accumulator */ + q31_t e = 0; /* error of data sample */ + q31_t alpha; /* Intermediate constant for taps update */ + q31_t coef; /* Temporary variable for coef */ + q31_t acc_l, acc_h; /* temporary input */ + uint32_t uShift = ((uint32_t) S->postShift + 1U); + uint32_t lShift = 32U - uShift; /* Shift to be applied to the output */ + + /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1U)]); + + /* Initializing blkCnt with blockSize */ + blkCnt = blockSize; + + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + while (blkCnt > 0U) + { + /* Copy the new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coefficient pointer */ + pb = pCoeffs; + + /* Set the accumulator to zero */ + acc = 0; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + /* acc += b[N] * x[n-N] */ + acc += ((q63_t) (*px++)) * (*pb++); + + /* acc += b[N-1] * x[n-N-1] */ + acc += ((q63_t) (*px++)) * (*pb++); + + /* acc += b[N-2] * x[n-N-2] */ + acc += ((q63_t) (*px++)) * (*pb++); + + /* acc += b[N-3] * x[n-N-3] */ + acc += ((q63_t) (*px++)) * (*pb++); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4U; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + acc += ((q63_t) (*px++)) * (*pb++); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Converting the result to 1.31 format */ + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + acc = (uint32_t) acc_l >> lShift | acc_h << uShift; + + /* Store the result from accumulator into the destination buffer. */ + *pOut++ = (q31_t) acc; + + /* Compute and store error */ + e = *pRef++ - (q31_t) acc; + + *pErr++ = (q31_t) e; + + /* Compute alpha i.e. intermediate constant for taps update */ + alpha = (q31_t) (((q63_t) e * mu) >> 31); + + /* Initialize state pointer */ + /* Advance state pointer by 1 for the next sample */ + px = pState++; + + /* Initialize coefficient pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Update filter coefficients */ + while (tapCnt > 0U) + { + /* coef is in 2.30 format */ + coef = (q31_t) (((q63_t) alpha * (*px++)) >> (32)); + /* get coef in 1.31 format by left shifting */ + *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U)); + /* update coefficient buffer to next coefficient */ + pb++; + + coef = (q31_t) (((q63_t) alpha * (*px++)) >> (32)); + *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U)); + pb++; + + coef = (q31_t) (((q63_t) alpha * (*px++)) >> (32)); + *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U)); + pb++; + + coef = (q31_t) (((q63_t) alpha * (*px++)) >> (32)); + *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U)); + pb++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4U; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + coef = (q31_t) (((q63_t) alpha * (*px++)) >> (32)); + *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U)); + pb++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. Now copy the last numTaps - 1 samples to the + satrt of the state buffer. This prepares the state buffer for the + next function call. */ + + /* Points to the start of the pState buffer */ + pStateCurnt = S->pState; + + /* Loop unrolling for (numTaps - 1U) samples copy */ + tapCnt = (numTaps - 1U) >> 2U; + + /* copy data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calculate remaining number of copies */ + tapCnt = (numTaps - 1U) % 0x4U; + + /* Copy the remaining q31_t data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + while (blkCnt > 0U) + { + /* Copy the new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Initialize pState pointer */ + px = pState; + + /* Initialize pCoeffs pointer */ + pb = pCoeffs; + + /* Set the accumulator to zero */ + acc = 0; + + /* Loop over numTaps number of values */ + tapCnt = numTaps; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + acc += ((q63_t) (*px++)) * (*pb++); + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Converting the result to 1.31 format */ + /* Store the result from accumulator into the destination buffer. */ + /* Calc lower part of acc */ + acc_l = acc & 0xffffffff; + + /* Calc upper part of acc */ + acc_h = (acc >> 32) & 0xffffffff; + + acc = (uint32_t) acc_l >> lShift | acc_h << uShift; + + *pOut++ = (q31_t) acc; + + /* Compute and store error */ + e = *pRef++ - (q31_t) acc; + + *pErr++ = (q31_t) e; + + /* Weighting factor for the LMS version */ + alpha = (q31_t) (((q63_t) e * mu) >> 31); + + /* Initialize pState pointer */ + /* Advance state pointer by 1 for the next sample */ + px = pState++; + + /* Initialize pCoeffs pointer */ + pb = pCoeffs; + + /* Loop over numTaps number of values */ + tapCnt = numTaps; + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + coef = (q31_t) (((q63_t) alpha * (*px++)) >> (32)); + *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U)); + pb++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. Now copy the last numTaps - 1 samples to the + start of the state buffer. This prepares the state buffer for the + next function call. */ + + /* Points to the start of the pState buffer */ + pStateCurnt = S->pState; + + /* Copy (numTaps - 1U) samples */ + tapCnt = (numTaps - 1U); + + /* Copy the data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + +/** + * @} end of LMS group + */ -- cgit