10 files changed, 4270 insertions, 0 deletions
diff --git a/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/biquad.c b/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/biquad.c
new file mode 100644
index 0000000..1fe7c54
--- /dev/null
+++ b/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/biquad.c
@@ -0,0 +1,713 @@
+#include "ref.h"
+
+void ref_biquad_cascade_df2T_f32(
+	const arm_biquad_cascade_df2T_instance_f32 * S,
+	float32_t * pSrc,
+	float32_t * pDst,
+	uint32_t blockSize)
+{
+   float32_t *pIn = pSrc;                         /*  source pointer            */
+   float32_t *pOut = pDst;                        /*  destination pointer       */
+   float32_t *pState = S->pState;                 /*  State pointer             */
+   float32_t *pCoeffs = S->pCoeffs;               /*  coefficient pointer       */
+   float32_t acc;                                 /*  accumulator               */
+   float32_t b0, b1, b2, a1, a2;                  /*  Filter coefficients       */
+   float32_t Xn;                                  /*  temporary input           */
+   float32_t d1, d2;                              /*  state variables           */
+   uint32_t sample, stage = S->numStages;         /*  loop counters             */
+
+   do
+   {
+      /* Reading the coefficients */
+      b0 = *pCoeffs++;
+      b1 = *pCoeffs++;
+      b2 = *pCoeffs++;
+      a1 = *pCoeffs++;
+      a2 = *pCoeffs++;
+
+      /*Reading the state values */
+      d1 = pState[0];
+      d2 = pState[1];
+
+      sample = blockSize;
+
+      while (sample > 0U)
+      {
+         /* Read the input */
+         Xn = *pIn++;
+
+         /* y[n] = b0 * x[n] + d1 */
+         acc = (b0 * Xn) + d1;
+
+         /* Store the result in the accumulator in the destination buffer. */
+         *pOut++ = acc;
+
+         /* Every time after the output is computed state should be updated. */
+         /* d1 = b1 * x[n] + a1 * y[n] + d2 */
+         d1 = (b1 * Xn + a1 * acc) + d2;
+
+         /* d2 = b2 * x[n] + a2 * y[n] */
+         d2 = (b2 * Xn) + (a2 * acc);
+
+         /* decrement the loop counter */
+         sample--;
+      }
+
+      /* Store the updated state variables back into the state array */
+      *pState++ = d1;
+      *pState++ = d2;
+
+      /* The current stage input is given as the output to the next stage */
+      pIn = pDst;
+
+      /*Reset the output working pointer */
+      pOut = pDst;
+
+      /* decrement the loop counter */
+      stage--;
+
+   } while (stage > 0U);
+}
+
+
+void ref_biquad_cascade_stereo_df2T_f32(
+	const arm_biquad_cascade_stereo_df2T_instance_f32 * S,
+	float32_t * pSrc,
+	float32_t * pDst,
+	uint32_t blockSize)
+{
+    float32_t *pIn = pSrc;                         /*  source pointer            */
+    float32_t *pOut = pDst;                        /*  destination pointer       */
+    float32_t *pState = S->pState;                 /*  State pointer             */
+    float32_t *pCoeffs = S->pCoeffs;               /*  coefficient pointer       */
+    float32_t acc1a, acc1b;                        /*  accumulator               */
+    float32_t b0, b1, b2, a1, a2;                  /*  Filter coefficients       */
+    float32_t Xn1a, Xn1b;                          /*  temporary input           */
+    float32_t d1a, d2a, d1b, d2b;                  /*  state variables           */
+    uint32_t sample, stage = S->numStages;         /*  loop counters             */
+
+    do
+    {
+        /* Reading the coefficients */
+        b0 = *pCoeffs++;
+        b1 = *pCoeffs++;
+        b2 = *pCoeffs++;
+        a1 = *pCoeffs++;
+        a2 = *pCoeffs++;
+
+        /*Reading the state values */
+        d1a = pState[0];
+        d2a = pState[1];
+        d1b = pState[2];
+        d2b = pState[3];
+
+        sample = blockSize;
+
+        while (sample > 0U)
+        {
+            /* Read the input */
+            Xn1a = *pIn++; //Channel a
+            Xn1b = *pIn++; //Channel b
+
+            /* y[n] = b0 * x[n] + d1 */
+            acc1a = (b0 * Xn1a) + d1a;
+            acc1b = (b0 * Xn1b) + d1b;
+
+            /* Store the result in the accumulator in the destination buffer. */
+            *pOut++ = acc1a;
+            *pOut++ = acc1b;
+
+            /* Every time after the output is computed state should be updated. */
+            /* d1 = b1 * x[n] + a1 * y[n] + d2 */
+            d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+            d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
+
+            /* d2 = b2 * x[n] + a2 * y[n] */
+            d2a = (b2 * Xn1a) + (a2 * acc1a);
+            d2b = (b2 * Xn1b) + (a2 * acc1b);
+
+            /* decrement the loop counter */
+            sample--;
+        }
+
+        /* Store the updated state variables back into the state array */
+        *pState++ = d1a;
+        *pState++ = d2a;
+        *pState++ = d1b;
+        *pState++ = d2b;
+
+        /* The current stage input is given as the output to the next stage */
+        pIn = pDst;
+
+        /*Reset the output working pointer */
+        pOut = pDst;
+
+        /* decrement the loop counter */
+        stage--;
+
+    } while (stage > 0U);
+	
+}
+
+void ref_biquad_cascade_df2T_f64(
+	const arm_biquad_cascade_df2T_instance_f64 * S,
+	float64_t * pSrc,
+	float64_t * pDst,
+	uint32_t blockSize)
+{
+   float64_t *pIn = pSrc;                         /*  source pointer            */
+   float64_t *pOut = pDst;                        /*  destination pointer       */
+   float64_t *pState = S->pState;                 /*  State pointer             */
+   float64_t *pCoeffs = S->pCoeffs;               /*  coefficient pointer       */
+   float64_t acc;                                 /*  accumulator               */
+   float64_t b0, b1, b2, a1, a2;                  /*  Filter coefficients       */
+   float64_t Xn;                                  /*  temporary input           */
+   float64_t d1, d2;                              /*  state variables           */
+   uint32_t sample, stage = S->numStages;         /*  loop counters             */
+
+   do
+   {
+      /* Reading the coefficients */
+      b0 = *pCoeffs++;
+      b1 = *pCoeffs++;
+      b2 = *pCoeffs++;
+      a1 = *pCoeffs++;
+      a2 = *pCoeffs++;
+
+      /*Reading the state values */
+      d1 = pState[0];
+      d2 = pState[1];
+
+      sample = blockSize;
+
+      while (sample > 0U)
+      {
+         /* Read the input */
+         Xn = *pIn++;
+
+         /* y[n] = b0 * x[n] + d1 */
+         acc = (b0 * Xn) + d1;
+
+         /* Store the result in the accumulator in the destination buffer. */
+         *pOut++ = acc;
+
+         /* Every time after the output is computed state should be updated. */
+         /* d1 = b1 * x[n] + a1 * y[n] + d2 */
+         d1 = (b1 * Xn + a1 * acc) + d2;
+
+         /* d2 = b2 * x[n] + a2 * y[n] */
+         d2 = (b2 * Xn) + (a2 * acc);
+
+         /* decrement the loop counter */
+         sample--;
+      }
+
+      /* Store the updated state variables back into the state array */
+      *pState++ = d1;
+      *pState++ = d2;
+
+      /* The current stage input is given as the output to the next stage */
+      pIn = pDst;
+
+      /*Reset the output working pointer */
+      pOut = pDst;
+
+      /* decrement the loop counter */
+      stage--;
+
+   } while (stage > 0U);
+}
+
+void ref_biquad_cascade_df1_f32(
+  const arm_biquad_casd_df1_inst_f32 * S,
+  float32_t * pSrc,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+  float32_t *pIn = pSrc;                         /*  source pointer            */
+  float32_t *pOut = pDst;                        /*  destination pointer       */
+  float32_t *pState = S->pState;                 /*  pState pointer            */
+  float32_t *pCoeffs = S->pCoeffs;               /*  coefficient pointer       */
+  float32_t acc;                                 /*  Simulates the accumulator */
+  float32_t b0, b1, b2, a1, a2;                  /*  Filter coefficients       */
+  float32_t Xn1, Xn2, Yn1, Yn2;                  /*  Filter pState variables   */
+  float32_t Xn;                                  /*  temporary input           */
+  uint32_t sample, stage = S->numStages;         /*  loop counters             */
+
+  do
+  {
+    /* Reading the coefficients */
+    b0 = *pCoeffs++;
+    b1 = *pCoeffs++;
+    b2 = *pCoeffs++;
+    a1 = *pCoeffs++;
+    a2 = *pCoeffs++;
+
+    /* Reading the pState values */
+    Xn1 = pState[0];
+    Xn2 = pState[1];
+    Yn1 = pState[2];
+    Yn2 = pState[3];
+
+    /*      The variables acc holds the output value that is computed:        
+     *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1]   + a2 * y[n-2]        
+     */
+
+    sample = blockSize;
+
+    while (sample > 0U)
+    {
+      /* Read the input */
+      Xn = *pIn++;
+
+      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+      acc = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2);
+
+      /* Store the result in the accumulator in the destination buffer. */
+      *pOut++ = acc;
+
+      /* Every time after the output is computed state should be updated. */
+      /* The states should be updated as:    */
+      /* Xn2 = Xn1    */
+      /* Xn1 = Xn     */
+      /* Yn2 = Yn1    */
+      /* Yn1 = acc   */
+      Xn2 = Xn1;
+      Xn1 = Xn;
+      Yn2 = Yn1;
+      Yn1 = acc;
+
+      /* decrement the loop counter */
+      sample--;
+    }
+
+    /*  Store the updated state variables back into the pState array */
+    *pState++ = Xn1;
+    *pState++ = Xn2;
+    *pState++ = Yn1;
+    *pState++ = Yn2;
+
+    /*  The first stage goes from the input buffer to the output buffer. */
+    /*  Subsequent numStages  occur in-place in the output buffer */
+    pIn = pDst;
+
+    /* Reset the output pointer */
+    pOut = pDst;
+
+    /* decrement the loop counter */
+    stage--;
+
+  } while (stage > 0U);
+}
+
+void ref_biquad_cas_df1_32x64_q31(
+  const arm_biquad_cas_df1_32x64_ins_q31 * S,
+  q31_t * pSrc,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+  q31_t *pIn = pSrc;                             /*  input pointer initialization  			*/
+  q31_t *pOut = pDst;                            /*  output pointer initialization 			*/
+  q63_t *pState = S->pState;                     /*  state pointer initialization  			*/
+  q31_t *pCoeffs = S->pCoeffs;                   /*  coeff pointer initialization  			*/
+  q63_t acc;                                     /*  accumulator                   			*/
+  q31_t Xn1, Xn2;                                /*  Input Filter state variables  			*/
+  q63_t Yn1, Yn2;                                /*  Output Filter state variables 			*/
+  q31_t b0, b1, b2, a1, a2;                      /*  Filter coefficients           			*/
+  q31_t Xn;                                      /*  temporary input               			*/
+  int32_t shift = (int32_t) S->postShift + 1;    /*  Shift to be applied to the output 	*/
+  uint32_t sample, stage = S->numStages;         /*  loop counters                     	*/
+  q31_t acc_l, acc_h;                            /*  temporary output               		*/
+  uint32_t uShift = ((uint32_t) S->postShift + 1U);
+  uint32_t lShift = 32U - uShift;                /*  Shift to be applied to the output 	*/
+
+  do
+  {
+    /* Reading the coefficients */
+    b0 = *pCoeffs++;
+    b1 = *pCoeffs++;
+    b2 = *pCoeffs++;
+    a1 = *pCoeffs++;
+    a2 = *pCoeffs++;
+
+    /* Reading the state values */
+    Xn1 = pState[0];
+    Xn2 = pState[1];
+    Yn1 = pState[2];
+    Yn2 = pState[3];
+
+    sample = blockSize;
+
+    while (sample > 0U)
+    {
+      /* Read the input */
+      Xn = *pIn++;
+
+      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+      acc = (q63_t)Xn*b0 + (q63_t)Xn1*b1 + (q63_t)Xn2*b2;
+      /* acc +=  a1 * y[n-1] */
+      acc += mult32x64(Yn1, a1);
+      /* acc +=  a2 * y[n-2] */
+      acc += mult32x64(Yn2, a2);
+
+      /* Every time after the output is computed state should be updated. */
+      Xn2 = Xn1;
+      Xn1 = Xn;
+      Yn2 = Yn1;
+
+      /* The result is converted to 1.63, Yn1 variable is reused  */
+      Yn1 = acc << shift;
+
+      /* Calc lower part of acc */
+      acc_l = acc & 0xffffffff;
+
+      /* Calc upper part of acc */
+      acc_h = (acc >> 32) & 0xffffffff;
+
+      /* Apply shift for lower part of acc and upper part of acc */
+      acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+      /* Store the output in the destination buffer in 1.31 format. */
+      *pOut++ = acc_h;
+
+      /* decrement the loop counter */
+      sample--;
+    }
+
+    /*  The first stage output is given as input to the second stage. */
+    pIn = pDst;
+
+    /* Reset to destination buffer working pointer */
+    pOut = pDst;
+
+    /*  Store the updated state variables back into the pState array */
+    *pState++ = (q63_t) Xn1;
+    *pState++ = (q63_t) Xn2;
+    *pState++ = Yn1;
+    *pState++ = Yn2;
+
+  } while (--stage);
+}
+
+void ref_biquad_cascade_df1_q31(
+  const arm_biquad_casd_df1_inst_q31 * S,
+  q31_t * pSrc,
+  q31_t * pDst,
+  uint32_t blockSize)
+{	
+  q63_t acc;                                     /*  accumulator                   */
+  uint32_t uShift = ((uint32_t) S->postShift + 1U);
+  uint32_t lShift = 32U - uShift;                /*  Shift to be applied to the output */
+  q31_t *pIn = pSrc;                             /*  input pointer initialization  */
+  q31_t *pOut = pDst;                            /*  output pointer initialization */
+  q31_t *pState = S->pState;                     /*  pState pointer initialization */
+  q31_t *pCoeffs = S->pCoeffs;                   /*  coeff pointer initialization  */
+  q31_t Xn1, Xn2, Yn1, Yn2;                      /*  Filter state variables        */
+  q31_t b0, b1, b2, a1, a2;                      /*  Filter coefficients           */
+  q31_t Xn;                                      /*  temporary input               */
+  uint32_t sample, stage = S->numStages;         /*  loop counters                 */
+
+  do
+  {
+    /* Reading the coefficients */
+    b0 = *pCoeffs++;
+    b1 = *pCoeffs++;
+    b2 = *pCoeffs++;
+    a1 = *pCoeffs++;
+    a2 = *pCoeffs++;
+
+    /* Reading the state values */
+    Xn1 = pState[0];
+    Xn2 = pState[1];
+    Yn1 = pState[2];
+    Yn2 = pState[3];
+
+    /*      The variables acc holds the output value that is computed:         
+     *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]         
+     */
+
+    sample = blockSize;
+
+    while (sample > 0U)
+    {
+      /* Read the input */
+      Xn = *pIn++;
+
+      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+      /* acc =  b0 * x[n] */
+      acc = (q63_t) b0 *Xn;
+
+      /* acc +=  b1 * x[n-1] */
+      acc += (q63_t) b1 *Xn1;
+      /* acc +=  b[2] * x[n-2] */
+      acc += (q63_t) b2 *Xn2;
+      /* acc +=  a1 * y[n-1] */
+      acc += (q63_t) a1 *Yn1;
+      /* acc +=  a2 * y[n-2] */
+      acc += (q63_t) a2 *Yn2;
+
+      /* The result is converted to 1.31  */
+      acc = acc >> lShift;
+
+      /* Every time after the output is computed state should be updated. */
+      /* The states should be updated as:  */
+      /* Xn2 = Xn1    */
+      /* Xn1 = Xn     */
+      /* Yn2 = Yn1    */
+      /* Yn1 = acc    */
+      Xn2 = Xn1;
+      Xn1 = Xn;
+      Yn2 = Yn1;
+      Yn1 = (q31_t) acc;
+
+      /* Store the output in the destination buffer. */
+      *pOut++ = (q31_t) acc;
+
+      /* decrement the loop counter */
+      sample--;
+    }
+
+    /*  The first stage goes from the input buffer to the output buffer. */
+    /*  Subsequent stages occur in-place in the output buffer */
+    pIn = pDst;
+
+    /* Reset to destination pointer */
+    pOut = pDst;
+
+    /*  Store the updated state variables back into the pState array */
+    *pState++ = Xn1;
+    *pState++ = Xn2;
+    *pState++ = Yn1;
+    *pState++ = Yn2;
+
+  } while (--stage);
+}
+
+
+void ref_biquad_cascade_df1_fast_q31(
+  const arm_biquad_casd_df1_inst_q31 * S,
+  q31_t * pSrc,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+  q31_t acc = 0;                                 /*  accumulator                   */
+  q31_t Xn1, Xn2, Yn1, Yn2;                      /*  Filter state variables        */
+  q31_t b0, b1, b2, a1, a2;                      /*  Filter coefficients           */
+  q31_t *pIn = pSrc;                             /*  input pointer initialization  */
+  q31_t *pOut = pDst;                            /*  output pointer initialization */
+  q31_t *pState = S->pState;                     /*  pState pointer initialization */
+  q31_t *pCoeffs = S->pCoeffs;                   /*  coeff pointer initialization  */
+  q31_t Xn;                                      /*  temporary input               */
+  int32_t shift = (int32_t) S->postShift + 1;    /*  Shift to be applied to the output */
+  uint32_t sample, stage = S->numStages;         /*  loop counters                     */
+
+  do
+  {
+    /* Reading the coefficients */
+    b0 = *pCoeffs++;
+    b1 = *pCoeffs++;
+    b2 = *pCoeffs++;
+    a1 = *pCoeffs++;
+    a2 = *pCoeffs++;
+
+    /* Reading the state values */
+    Xn1 = pState[0];
+    Xn2 = pState[1];
+    Yn1 = pState[2];
+    Yn2 = pState[3];
+		
+    sample = blockSize;
+
+   while (sample > 0U)
+   {
+      /* Read the input */
+      Xn = *pIn++;
+
+      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */    
+      mult_32x32_keep32_R(acc, b0, Xn);
+      multAcc_32x32_keep32_R(acc, b1, Xn1);
+      multAcc_32x32_keep32_R(acc, b2, Xn2);
+      multAcc_32x32_keep32_R(acc, a1, Yn1);
+      multAcc_32x32_keep32_R(acc, a2, Yn2);
+
+      /* The result is converted to 1.31  */
+      acc <<= shift;
+
+      /* Every time after the output is computed state should be updated. */
+      Xn2 = Xn1;
+      Xn1 = Xn;
+      Yn2 = Yn1;
+      Yn1 = acc;
+
+      /* Store the output in the destination buffer. */
+      *pOut++ = acc;
+
+      /* decrement the loop counter */
+      sample--;
+   }
+
+    /*  The first stage goes from the input buffer to the output buffer. */
+    /*  Subsequent stages occur in-place in the output buffer */
+    pIn = pDst;
+
+    /* Reset to destination pointer */
+    pOut = pDst;
+
+    /*  Store the updated state variables back into the pState array */
+    *pState++ = Xn1;
+    *pState++ = Xn2;
+    *pState++ = Yn1;
+    *pState++ = Yn2;
+
+  } while (--stage);
+}
+
+void ref_biquad_cascade_df1_fast_q15(
+  const arm_biquad_casd_df1_inst_q15 * S,
+  q15_t * pSrc,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+	q15_t *pIn = pSrc;                             			/*  Source pointer                           */
+	q15_t *pOut = pDst;                            			/*  Destination pointer                      */
+	q15_t b0, b1, b2, a1, a2;                      			/*  Filter coefficients           				*/
+	q15_t Xn1, Xn2, Yn1, Yn2;                      			/*  Filter state variables        				*/
+	q15_t Xn;                                      			/*  temporary input               				*/
+	q31_t acc;                                     			/*  Accumulator                              */
+	int32_t shift = (15 - (int32_t) S->postShift); 			/*  Post shift                               */
+	q15_t *pState = S->pState;                     			/*  State pointer                            */
+	q15_t *pCoeffs = S->pCoeffs;                   			/*  Coefficient pointer                      */
+  uint32_t sample, stage = (uint32_t) S->numStages;   /*  Stage loop counter                          */
+
+  do
+  {
+    /* Reading the coefficients */
+    b0 = *pCoeffs++;
+    pCoeffs++;  // skip the 0 coefficient
+    b1 = *pCoeffs++;
+    b2 = *pCoeffs++;
+    a1 = *pCoeffs++;
+    a2 = *pCoeffs++;
+
+    /* Reading the state values */
+    Xn1 = pState[0];
+    Xn2 = pState[1];
+    Yn1 = pState[2];
+    Yn2 = pState[3];
+
+    sample = blockSize;
+
+    while (sample > 0U)
+    {
+      /* Read the input */
+      Xn = *pIn++;
+
+      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+      acc = (q31_t)b0*Xn + (q31_t)b1*Xn1 + (q31_t)b2*Xn2 + (q31_t)a1*Yn1 + (q31_t)a2*Yn2;
+
+      /* The result is converted to 1.15  */
+      acc = ref_sat_q15(acc >> shift);
+
+      /* Every time after the output is computed state should be updated. */
+      Xn2 = Xn1;
+      Xn1 = Xn;
+      Yn2 = Yn1;
+      Yn1 = (q15_t) acc;
+
+      /* Store the output in the destination buffer. */
+      *pOut++ = (q15_t) acc;
+
+      /* decrement the loop counter */
+      sample--;
+    }
+
+    /*  The first stage goes from the input buffer to the output buffer. */
+    /*  Subsequent stages occur in-place in the output buffer */
+    pIn = pDst;
+
+    /* Reset to destination pointer */
+    pOut = pDst;
+
+    /*  Store the updated state variables back into the pState array */
+    *pState++ = Xn1;
+    *pState++ = Xn2;
+    *pState++ = Yn1;
+    *pState++ = Yn2;
+
+  } while (--stage);
+}
+
+void ref_biquad_cascade_df1_q15(
+  const arm_biquad_casd_df1_inst_q15 * S,
+  q15_t * pSrc,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+	q15_t *pIn = pSrc;                             			/*  Source pointer                           */
+	q15_t *pOut = pDst;                            			/*  Destination pointer                      */
+	q15_t b0, b1, b2, a1, a2;                      			/*  Filter coefficients           				*/
+	q15_t Xn1, Xn2, Yn1, Yn2;                      			/*  Filter state variables        				*/
+	q15_t Xn;                                      			/*  temporary input               				*/
+	q63_t acc;                                     			/*  Accumulator                              */
+	int32_t shift = (15 - (int32_t) S->postShift); 			/*  Post shift                               */
+	q15_t *pState = S->pState;                     			/*  State pointer                            */
+	q15_t *pCoeffs = S->pCoeffs;                   			/*  Coefficient pointer                      */
+  uint32_t sample, stage = (uint32_t) S->numStages;   /*  Stage loop counter                          */
+
+  do
+  {
+    /* Reading the coefficients */
+    b0 = *pCoeffs++;
+    pCoeffs++;  // skip the 0 coefficient
+    b1 = *pCoeffs++;
+    b2 = *pCoeffs++;
+    a1 = *pCoeffs++;
+    a2 = *pCoeffs++;
+
+    /* Reading the state values */
+    Xn1 = pState[0];
+    Xn2 = pState[1];
+    Yn1 = pState[2];
+    Yn2 = pState[3];
+
+    sample = blockSize;
+
+    while (sample > 0U)
+    {
+      /* Read the input */
+      Xn = *pIn++;
+
+      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+      acc = (q31_t)b0*Xn + (q31_t)b1*Xn1 + (q31_t)b2*Xn2 + (q31_t)a1*Yn1 + (q31_t)a2*Yn2;
+
+      /* The result is converted to 1.15  */
+      acc = ref_sat_q15(acc >> shift);
+
+      /* Every time after the output is computed state should be updated. */
+      Xn2 = Xn1;
+      Xn1 = Xn;
+      Yn2 = Yn1;
+      Yn1 = (q15_t) acc;
+
+      /* Store the output in the destination buffer. */
+      *pOut++ = (q15_t) acc;
+
+      /* decrement the loop counter */
+      sample--;
+    }
+
+    /*  The first stage goes from the input buffer to the output buffer. */
+    /*  Subsequent stages occur in-place in the output buffer */
+    pIn = pDst;
+
+    /* Reset to destination pointer */
+    pOut = pDst;
+
+    /*  Store the updated state variables back into the pState array */
+    *pState++ = Xn1;
+    *pState++ = Xn2;
+    *pState++ = Yn1;
+    *pState++ = Yn2;
+
+  } while (--stage);
+}
diff --git a/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/conv.c b/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/conv.c
new file mode 100644
index 0000000..dc1b103
--- /dev/null
+++ b/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/conv.c
@@ -0,0 +1,350 @@
+#include "ref.h"
+
+void ref_conv_f32(
+  float32_t * pSrcA,
+  uint32_t 		srcALen,
+  float32_t * pSrcB,
+  uint32_t 		srcBLen,
+  float32_t * pDst)
+{
+  float32_t sum;                                 /* Accumulator */
+  uint32_t i, j;                                 /* loop counters */
+
+  /* Loop to calculate convolution for output length number of times */
+  for (i = 0; i < srcALen + srcBLen - 1; i++)
+  {
+    /* Initialize sum with zero to carry out MAC operations */
+    sum = 0.0f;
+
+    /* Loop to perform MAC operations according to convolution equation */
+    for (j = 0; j <= i; j++)
+    {
+      /* Check the array limitations */
+      if ((i - j < srcBLen) && (j < srcALen))
+      {
+        /* z[i] += x[i-j] * y[j] */
+        sum += pSrcB[i - j] * pSrcA[j];
+      }
+    }
+    /* Store the output in the destination buffer */
+    pDst[i] = sum;
+  }
+}
+
+arm_status ref_conv_partial_f32(
+  float32_t * pSrcA,
+  uint32_t srcALen,
+  float32_t * pSrcB,
+  uint32_t srcBLen,
+  float32_t * pDst,
+  uint32_t firstIndex,
+  uint32_t numPoints)
+{
+	ref_conv_f32(pSrcA,srcALen,pSrcB,srcBLen,pDst);
+	
+	return ARM_MATH_SUCCESS;
+}
+
+void ref_conv_q31(
+  q31_t * pSrcA,
+  uint32_t srcALen,
+  q31_t * pSrcB,
+  uint32_t srcBLen,
+  q31_t * pDst)
+{
+  q63_t sum;                                     /* Accumulator */
+  uint32_t i, j;                                 /* loop counter */
+
+  /* Loop to calculate output of convolution for output length number of times */
+  for (i = 0; i < srcALen + srcBLen - 1; i++)
+  {
+    /* Initialize sum with zero to carry on MAC operations */
+    sum = 0;
+
+    /* Loop to perform MAC operations according to convolution equation */
+    for (j = 0; j <= i; j++)
+    {
+      /* Check the array limitations */
+      if ((i - j < srcBLen) && (j < srcALen))
+      {
+        /* z[i] += x[i-j] * y[j] */
+        sum += (q63_t) pSrcA[j] * (pSrcB[i - j]);
+      }
+    }
+
+    /* Store the output in the destination buffer */
+    pDst[i] = (q31_t)(sum >> 31U);
+  }
+}
+
+void ref_conv_fast_q31(
+  q31_t * pSrcA,
+  uint32_t srcALen,
+  q31_t * pSrcB,
+  uint32_t srcBLen,
+  q31_t * pDst)
+{
+  q31_t sum;                                     /* Accumulator */
+  uint32_t i, j;                                 /* loop counter */
+
+  /* Loop to calculate output of convolution for output length number of times */
+  for (i = 0; i < srcALen + srcBLen - 1; i++)
+  {
+    /* Initialize sum with zero to carry on MAC operations */
+    sum = 0;
+
+    /* Loop to perform MAC operations according to convolution equation */
+    for (j = 0; j <= i; j++)
+    {
+      /* Check the array limitations */
+      if ((i - j < srcBLen) && (j < srcALen))
+      {
+        /* z[i] += x[i-j] * y[j] */
+				sum = (q31_t) ((((q63_t)sum << 32) +
+                      ((q63_t)pSrcA[j] * pSrcB[i - j])) >> 32);
+      }
+    }
+
+    /* Store the output in the destination buffer */
+    pDst[i] = (q31_t)(sum << 1U);
+  }
+}
+
+arm_status ref_conv_partial_q31(
+  q31_t * pSrcA,
+  uint32_t srcALen,
+  q31_t * pSrcB,
+  uint32_t srcBLen,
+  q31_t * pDst,
+  uint32_t firstIndex,
+  uint32_t numPoints)
+{
+	ref_conv_q31(pSrcA,srcALen,pSrcB,srcBLen,pDst);
+	
+	return ARM_MATH_SUCCESS;
+}
+
+arm_status ref_conv_partial_fast_q31(
+  q31_t * pSrcA,
+  uint32_t srcALen,
+  q31_t * pSrcB,
+  uint32_t srcBLen,
+  q31_t * pDst,
+  uint32_t firstIndex,
+  uint32_t numPoints)
+{
+	ref_conv_fast_q31(pSrcA,srcALen,pSrcB,srcBLen,pDst);
+   
+	return ARM_MATH_SUCCESS;
+}
+
+void ref_conv_q15(
+  q15_t * pSrcA,
+  uint32_t srcALen,
+  q15_t * pSrcB,
+  uint32_t srcBLen,
+  q15_t * pDst)
+{
+  q63_t sum;                                     /* Accumulator */
+  uint32_t i, j;                                 /* loop counter */
+
+  /* Loop to calculate output of convolution for output length number of times */
+  for (i = 0; i < srcALen + srcBLen - 1; i++)
+  {
+    /* Initialize sum with zero to carry on MAC operations */
+    sum = 0;
+
+    /* Loop to perform MAC operations according to convolution equation */
+    for (j = 0; j <= i; j++)
+    {
+      /* Check the array limitations */
+      if ((i - j < srcBLen) && (j < srcALen))
+      {
+        /* z[i] += x[i-j] * y[j] */
+        sum += (q31_t)pSrcA[j] * pSrcB[i - j];
+      }
+    }
+
+    /* Store the output in the destination buffer */
+    pDst[i] = ref_sat_q15(sum >> 15U);
+  }
+}
+
+arm_status ref_conv_partial_fast_opt_q15(
+  q15_t * pSrcA,
+  uint32_t srcALen,
+  q15_t * pSrcB,
+  uint32_t srcBLen,
+  q15_t * pDst,
+  uint32_t firstIndex,
+  uint32_t numPoints,
+  q15_t * pScratch1,
+  q15_t * pScratch2)
+{
+  q31_t sum;                                     /* Accumulator */
+  uint32_t i, j;                                 /* loop counter */
+
+  /* Loop to calculate output of convolution for output length number of times */
+  for (i = 0; i < srcALen + srcBLen - 1; i++)
+  {
+    /* Initialize sum with zero to carry on MAC operations */
+    sum = 0;
+
+    /* Loop to perform MAC operations according to convolution equation */
+    for (j = 0; j <= i; j++)
+    {
+      /* Check the array limitations */
+      if ((i - j < srcBLen) && (j < srcALen))
+      {
+        /* z[i] += x[i-j] * y[j] */
+        sum += (q31_t)pSrcA[j] * pSrcB[i - j];
+      }
+    }
+
+    /* Store the output in the destination buffer */
+    pDst[i] = ref_sat_q15(sum >> 15U);
+  }
+	
+  return ARM_MATH_SUCCESS;
+}
+
+void ref_conv_fast_q15(
+  q15_t * pSrcA,
+  uint32_t srcALen,
+  q15_t * pSrcB,
+  uint32_t srcBLen,
+  q15_t * pDst)
+{
+  q31_t sum;                                     /* Accumulator */
+  uint32_t i, j;                                 /* loop counter */
+
+  /* Loop to calculate output of convolution for output length number of times */
+  for (i = 0; i < srcALen + srcBLen - 1; i++)
+  {
+    /* Initialize sum with zero to carry on MAC operations */
+    sum = 0;
+
+    /* Loop to perform MAC operations according to convolution equation */
+    for (j = 0; j <= i; j++)
+    {
+      /* Check the array limitations */
+      if ((i - j < srcBLen) && (j < srcALen))
+      {
+        /* z[i] += x[i-j] * y[j] */
+        sum += (q31_t)pSrcA[j] * pSrcB[i - j];
+      }
+    }
+
+    /* Store the output in the destination buffer */
+    pDst[i] = sum >> 15U;
+  }
+}
+
+void ref_conv_fast_opt_q15(
+  q15_t * pSrcA,
+  uint32_t srcALen,
+  q15_t * pSrcB,
+  uint32_t srcBLen,
+  q15_t * pDst,
+  q15_t * pScratch1,
+  q15_t * pScratch2)
+{
+  q31_t sum;                                     /* Accumulator */
+  uint32_t i, j;                                 /* loop counter */
+
+  /* Loop to calculate output of convolution for output length number of times */
+  for (i = 0; i < srcALen + srcBLen - 1; i++)
+  {
+    /* Initialize sum with zero to carry on MAC operations */
+    sum = 0;
+
+    /* Loop to perform MAC operations according to convolution equation */
+    for (j = 0; j <= i; j++)
+    {
+      /* Check the array limitations */
+      if ((i - j < srcBLen) && (j < srcALen))
+      {
+        /* z[i] += x[i-j] * y[j] */
+        sum += (q31_t)pSrcA[j] * pSrcB[i - j];
+      }
+    }
+
+    /* Store the output in the destination buffer */
+    pDst[i] = ref_sat_q15(sum >> 15U);
+  }
+}
+
+arm_status ref_conv_partial_q15(
+  q15_t * pSrcA,
+  uint32_t srcALen,
+  q15_t * pSrcB,
+  uint32_t srcBLen,
+  q15_t * pDst,
+  uint32_t firstIndex,
+  uint32_t numPoints)
+{
+	ref_conv_q15(pSrcA,srcALen,pSrcB,srcBLen,pDst);
+	
+	return ARM_MATH_SUCCESS;
+}
+
+arm_status ref_conv_partial_fast_q15(
+  q15_t * pSrcA,
+  uint32_t srcALen,
+  q15_t * pSrcB,
+  uint32_t srcBLen,
+  q15_t * pDst,
+  uint32_t firstIndex,
+  uint32_t numPoints)
+{
+	ref_conv_fast_q15(pSrcA,srcALen,pSrcB,srcBLen,pDst);
+	
+	return ARM_MATH_SUCCESS;
+}
+
+
+void ref_conv_q7(
+  q7_t * pSrcA,
+  uint32_t srcALen,
+  q7_t * pSrcB,
+  uint32_t srcBLen,
+  q7_t * pDst)
+{
+  q31_t sum;                                     /* Accumulator */
+  uint32_t i, j;                                 /* loop counter */
+
+  /* Loop to calculate output of convolution for output length number of times */
+  for (i = 0; i < srcALen + srcBLen - 1; i++)
+  {
+    /* Initialize sum with zero to carry on MAC operations */
+    sum = 0;
+
+    /* Loop to perform MAC operations according to convolution equation */
+    for (j = 0; j <= i; j++)
+    {
+      /* Check the array limitations */
+      if ((i - j < srcBLen) && (j < srcALen))
+      {
+        /* z[i] += x[i-j] * y[j] */
+        sum += (q15_t)pSrcA[j] * pSrcB[i - j];
+      }
+    }
+
+    /* Store the output in the destination buffer */
+    pDst[i] = (q7_t)ref_sat_q7(sum >> 7);
+  }
+}
+
+arm_status ref_conv_partial_q7(
+  q7_t * pSrcA,
+  uint32_t srcALen,
+  q7_t * pSrcB,
+  uint32_t srcBLen,
+  q7_t * pDst,
+  uint32_t firstIndex,
+  uint32_t numPoints)
+{	
+	ref_conv_q7(pSrcA,srcALen,pSrcB,srcBLen,pDst);
+	
+	return ARM_MATH_SUCCESS;
+}
diff --git a/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/correlate.c b/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/correlate.c
new file mode 100644
index 0000000..ff1d95b
--- /dev/null
+++ b/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/correlate.c
@@ -0,0 +1,513 @@
+#include "ref.h"
+
+void ref_correlate_f32(
+  float32_t * pSrcA,
+  uint32_t srcALen,
+  float32_t * pSrcB,
+  uint32_t srcBLen,
+  float32_t * pDst)
+{
+  float32_t *pIn1 = pSrcA;                       /* inputA pointer 			*/
+  float32_t *pIn2 = pSrcB + (srcBLen - 1U);      /* inputB pointer 			*/
+  float32_t sum;                                 /* Accumulator 				*/
+  uint32_t i = 0U, j;                            /* loop counters 			*/
+  uint32_t inv = 0U;                             /* Reverse order flag 	*/
+  uint32_t tot = 0U;                             /* Length 							*/
+
+  /* The algorithm implementation is based on the lengths of the inputs. 
+   * srcB is always made to slide across srcA. 
+   * So srcBLen is always considered as shorter or equal to srcALen 
+   * But CORR(x, y) is reverse of CORR(y, x) 
+   * So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer 
+   * and a variable, inv is set to 1 
+   * If lengths are not equal then zero pad has to be done to make the two    
+   * inputs of same length. But to improve the performance, we include zeroes    
+   * in the output instead of zero padding either of the the inputs
+   * If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the    
+   * starting of the output buffer 
+   * If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the   
+   * ending of the output buffer 
+   * Once the zero padding is done the remaining of the output is calcualted   
+   * using convolution but with the shorter signal time shifted. 
+	 */
+
+  /* Calculate the length of the remaining sequence */
+  tot = srcALen + srcBLen - 2U;
+
+  if (srcALen > srcBLen)
+  {
+    /* Calculating the number of zeros to be padded to the output */
+    /* Initialise the pointer after zero padding */
+    pDst += srcALen - srcBLen;
+  }
+  else if (srcALen < srcBLen)
+  {
+    /* Initialization to inputB pointer */
+    pIn1 = pSrcB;
+
+    /* Initialization to the end of inputA pointer */
+    pIn2 = pSrcA + srcALen - 1U;
+
+    /* Initialisation of the pointer after zero padding */
+    pDst += tot;
+
+    /* Swapping the lengths */
+    j = srcALen;
+    srcALen = srcBLen;
+    srcBLen = j;
+
+    /* Setting the reverse flag */
+    inv = 1;
+  }
+
+  /* Loop to calculate convolution for output length number of times */
+  for (i = 0U; i <= tot; i++)
+  {
+    /* Initialize sum with zero to carry on MAC operations */
+    sum = 0.0f;
+
+    /* Loop to perform MAC operations according to convolution equation */
+    for (j = 0U; j <= i; j++)
+    {
+      /* Check the array limitations */
+      if ((i - j < srcBLen) && (j < srcALen))
+      {
+        /* z[i] += x[i-j] * y[j] */
+        sum += pIn1[j] * pIn2[-((int32_t)i - j)];
+      }
+    }
+    /* Store the output in the destination buffer */
+    if (inv == 1)
+      *pDst-- = sum;
+    else
+      *pDst++ = sum;
+  }
+}
+
+void ref_correlate_q31(
+  q31_t * pSrcA,
+  uint32_t srcALen,
+  q31_t * pSrcB,
+  uint32_t srcBLen,
+  q31_t * pDst)
+{
+  q31_t *pIn1 = pSrcA;                           /* inputA pointer               */
+  q31_t *pIn2 = pSrcB + (srcBLen - 1U);          /* inputB pointer               */
+  q63_t sum;                                     /* Accumulators                  */
+  uint32_t i = 0U, j;                            /* loop counters */
+  uint32_t inv = 0U;                             /* Reverse order flag */
+  uint32_t tot = 0U;                             /* Length */
+
+  /* Calculate the length of the remaining sequence */
+  tot = ((srcALen + srcBLen) - 2U);
+
+  if (srcALen > srcBLen)
+  {
+    /* Calculating the number of zeros to be padded to the output */
+    j = srcALen - srcBLen;
+
+    /* Initialise the pointer after zero padding */
+    pDst += j;
+  }
+
+  else if (srcALen < srcBLen)
+  {
+    /* Initialization to inputB pointer */
+    pIn1 = pSrcB;
+
+    /* Initialization to the end of inputA pointer */
+    pIn2 = pSrcA + (srcALen - 1U);
+
+    /* Initialisation of the pointer after zero padding */
+    pDst = pDst + tot;
+
+    /* Swapping the lengths */
+    j = srcALen;
+    srcALen = srcBLen;
+    srcBLen = j;
+
+    /* Setting the reverse flag */
+    inv = 1;
+
+  }
+
+  /* Loop to calculate correlation for output length number of times */
+  for (i = 0U; i <= tot; i++)
+  {
+    /* Initialize sum with zero to carry on MAC operations */
+    sum = 0;
+
+    /* Loop to perform MAC operations according to correlation equation */
+    for (j = 0U; j <= i; j++)
+    {
+      /* Check the array limitations */
+      if ((((i - j) < srcBLen) && (j < srcALen)))
+      {
+        /* z[i] += x[i-j] * y[j] */
+        sum += ((q63_t) pIn1[j] * pIn2[-((int32_t) i - j)]);
+      }
+    }
+    /* Store the output in the destination buffer */
+    if (inv == 1)
+      *pDst-- = (q31_t)(sum >> 31U);
+    else
+      *pDst++ = (q31_t)(sum >> 31U);
+  }
+}
+
+void ref_correlate_fast_q31(
+  q31_t * pSrcA,
+  uint32_t srcALen,
+  q31_t * pSrcB,
+  uint32_t srcBLen,
+  q31_t * pDst)
+{
+  q31_t *pIn1 = pSrcA;                           /* inputA pointer               */
+  q31_t *pIn2 = pSrcB + (srcBLen - 1U);          /* inputB pointer               */
+  q63_t sum;                                     /* Accumulators                  */
+  uint32_t i = 0U, j;                            /* loop counters */
+  uint32_t inv = 0U;                             /* Reverse order flag */
+  uint32_t tot = 0U;                             /* Length */
+
+  /* Calculate the length of the remaining sequence */
+  tot = ((srcALen + srcBLen) - 2U);
+
+  if (srcALen > srcBLen)
+  {
+    /* Calculating the number of zeros to be padded to the output */
+    j = srcALen - srcBLen;
+
+    /* Initialise the pointer after zero padding */
+    pDst += j;
+  }
+
+  else if (srcALen < srcBLen)
+  {
+    /* Initialization to inputB pointer */
+    pIn1 = pSrcB;
+
+    /* Initialization to the end of inputA pointer */
+    pIn2 = pSrcA + (srcALen - 1U);
+
+    /* Initialisation of the pointer after zero padding */
+    pDst = pDst + tot;
+
+    /* Swapping the lengths */
+    j = srcALen;
+    srcALen = srcBLen;
+    srcBLen = j;
+
+    /* Setting the reverse flag */
+    inv = 1;
+
+  }
+
+  /* Loop to calculate correlation for output length number of times */
+  for (i = 0U; i <= tot; i++)
+  {
+    /* Initialize sum with zero to carry on MAC operations */
+    sum = 0;
+
+    /* Loop to perform MAC operations according to correlation equation */
+    for (j = 0U; j <= i; j++)
+    {
+      /* Check the array limitations */
+      if ((((i - j) < srcBLen) && (j < srcALen)))
+      {
+        /* z[i] += x[i-j] * y[j] */
+        sum = (q31_t) ((((q63_t) sum << 32) +
+												((q63_t) pIn1[j] * pIn2[-((int32_t) i - j)])) >> 32);
+      }
+    }
+    /* Store the output in the destination buffer */
+    if (inv == 1)
+      *pDst-- = (q31_t)(sum << 1U);
+    else
+      *pDst++ = (q31_t)(sum << 1U);
+  }          
+}
+
+void ref_correlate_q15(
+  q15_t * pSrcA,
+  uint32_t srcALen,
+  q15_t * pSrcB,
+  uint32_t srcBLen,
+  q15_t * pDst)
+{
+  q15_t *pIn1 = pSrcA;                           /* inputA pointer               */
+  q15_t *pIn2 = pSrcB + (srcBLen - 1U);          /* inputB pointer               */
+  q63_t sum;                                     /* Accumulators                  */
+  uint32_t i = 0U, j;                            /* loop counters */
+  uint32_t inv = 0U;                             /* Reverse order flag */
+  uint32_t tot = 0U;                             /* Length */
+
+  /* Calculate the length of the remaining sequence */
+  tot = ((srcALen + srcBLen) - 2U);
+
+  if (srcALen > srcBLen)
+  {
+    /* Calculating the number of zeros to be padded to the output */
+    j = srcALen - srcBLen;
+
+    /* Initialise the pointer after zero padding */
+    pDst += j;
+  }
+
+  else if (srcALen < srcBLen)
+  {
+    /* Initialization to inputB pointer */
+    pIn1 = pSrcB;
+
+    /* Initialization to the end of inputA pointer */
+    pIn2 = pSrcA + (srcALen - 1U);
+
+    /* Initialisation of the pointer after zero padding */
+    pDst = pDst + tot;
+
+    /* Swapping the lengths */
+    j = srcALen;
+    srcALen = srcBLen;
+    srcBLen = j;
+
+    /* Setting the reverse flag */
+    inv = 1;
+
+  }
+
+  /* Loop to calculate convolution for output length number of times */
+  for (i = 0U; i <= tot; i++)
+  {
+    /* Initialize sum with zero to carry on MAC operations */
+    sum = 0;
+
+    /* Loop to perform MAC operations according to convolution equation */
+    for (j = 0U; j <= i; j++)
+    {
+      /* Check the array limitations */
+      if ((((i - j) < srcBLen) && (j < srcALen)))
+      {
+        /* z[i] += x[i-j] * y[j] */
+        sum += ((q31_t) pIn1[j] * pIn2[-((int32_t) i - j)]);
+      }
+    }
+    /* Store the output in the destination buffer */
+    if (inv == 1)
+      *pDst-- = (q15_t) ref_sat_q15(sum >> 15U);
+    else
+      *pDst++ = (q15_t) ref_sat_q15(sum >> 15U);
+  }
+}
+
+void ref_correlate_fast_q15(
+  q15_t * pSrcA,
+  uint32_t srcALen,
+  q15_t * pSrcB,
+  uint32_t srcBLen,
+  q15_t * pDst)
+{
+  q15_t *pIn1 = pSrcA;                           /* inputA pointer               */
+  q15_t *pIn2 = pSrcB + (srcBLen - 1U);          /* inputB pointer               */
+  q63_t sum;                                     /* Accumulators                  */
+  uint32_t i = 0U, j;                            /* loop counters */
+  uint32_t inv = 0U;                             /* Reverse order flag */
+  uint32_t tot = 0U;                             /* Length */
+
+  /* Calculate the length of the remaining sequence */
+  tot = ((srcALen + srcBLen) - 2U);
+
+  if (srcALen > srcBLen)
+  {
+    /* Calculating the number of zeros to be padded to the output */
+    j = srcALen - srcBLen;
+
+    /* Initialise the pointer after zero padding */
+    pDst += j;
+  }
+
+  else if (srcALen < srcBLen)
+  {
+    /* Initialization to inputB pointer */
+    pIn1 = pSrcB;
+
+    /* Initialization to the end of inputA pointer */
+    pIn2 = pSrcA + (srcALen - 1U);
+
+    /* Initialisation of the pointer after zero padding */
+    pDst = pDst + tot;
+
+    /* Swapping the lengths */
+    j = srcALen;
+    srcALen = srcBLen;
+    srcBLen = j;
+
+    /* Setting the reverse flag */
+    inv = 1;
+
+  }
+
+  /* Loop to calculate convolution for output length number of times */
+  for (i = 0U; i <= tot; i++)
+  {
+    /* Initialize sum with zero to carry on MAC operations */
+    sum = 0;
+
+    /* Loop to perform MAC operations according to convolution equation */
+    for (j = 0U; j <= i; j++)
+    {
+      /* Check the array limitations */
+      if ((((i - j) < srcBLen) && (j < srcALen)))
+      {
+        /* z[i] += x[i-j] * y[j] */
+        sum += ((q31_t) pIn1[j] * pIn2[-((int32_t) i - j)]);
+      }
+    }
+    /* Store the output in the destination buffer */
+    if (inv == 1)
+      *pDst-- = (q15_t)(sum >> 15U);
+    else
+      *pDst++ = (q15_t)(sum >> 15U);
+  }
+}
+
+void ref_correlate_fast_opt_q15(
+  q15_t * pSrcA,
+  uint32_t srcALen,
+  q15_t * pSrcB,
+  uint32_t srcBLen,
+  q15_t * pDst,
+  q15_t * pScratch)
+{
+  q15_t *pIn1 = pSrcA;                           /* inputA pointer               */
+  q15_t *pIn2 = pSrcB + (srcBLen - 1U);          /* inputB pointer               */
+  q31_t sum;                                     /* Accumulators                  */
+  uint32_t i = 0U, j;                            /* loop counters */
+  uint32_t inv = 0U;                             /* Reverse order flag */
+  uint32_t tot = 0U;                             /* Length */
+
+  /* Calculate the length of the remaining sequence */
+  tot = ((srcALen + srcBLen) - 2U);
+
+  if (srcALen > srcBLen)
+  {
+    /* Calculating the number of zeros to be padded to the output */
+    j = srcALen - srcBLen;
+
+    /* Initialise the pointer after zero padding */
+    pDst += j;
+  }
+
+  else if (srcALen < srcBLen)
+  {
+    /* Initialization to inputB pointer */
+    pIn1 = pSrcB;
+
+    /* Initialization to the end of inputA pointer */
+    pIn2 = pSrcA + (srcALen - 1U);
+
+    /* Initialisation of the pointer after zero padding */
+    pDst = pDst + tot;
+
+    /* Swapping the lengths */
+    j = srcALen;
+    srcALen = srcBLen;
+    srcBLen = j;
+
+    /* Setting the reverse flag */
+    inv = 1;
+
+  }
+
+  /* Loop to calculate convolution for output length number of times */
+  for (i = 0U; i <= tot; i++)
+  {
+    /* Initialize sum with zero to carry on MAC operations */
+    sum = 0;
+
+    /* Loop to perform MAC operations according to convolution equation */
+    for (j = 0U; j <= i; j++)
+    {
+      /* Check the array limitations */
+      if ((((i - j) < srcBLen) && (j < srcALen)))
+      {
+        /* z[i] += x[i-j] * y[j] */
+        sum += ((q31_t) pIn1[j] * pIn2[-((int32_t) i - j)]);
+      }
+    }
+    /* Store the output in the destination buffer */
+    if (inv == 1)
+      *pDst-- = (q15_t) ref_sat_q15(sum >> 15U);
+    else
+      *pDst++ = (q15_t) ref_sat_q15(sum >> 15U);
+  }
+}
+
+void ref_correlate_q7(
+  q7_t * pSrcA,
+  uint32_t srcALen,
+  q7_t * pSrcB,
+  uint32_t srcBLen,
+  q7_t * pDst)
+{
+  q7_t *pIn1 = pSrcA;                            /* inputA pointer */
+  q7_t *pIn2 = pSrcB + (srcBLen - 1U);           /* inputB pointer */
+  q31_t sum;                                     /* Accumulator */
+  uint32_t i = 0U, j;                            /* loop counters */
+  uint32_t inv = 0U;                             /* Reverse order flag */
+  uint32_t tot = 0U;                             /* Length */
+
+  /* Calculate the length of the remaining sequence */
+  tot = ((srcALen + srcBLen) - 2U);
+
+  if (srcALen > srcBLen)
+  {
+    /* Calculating the number of zeros to be padded to the output */
+    j = srcALen - srcBLen;
+
+    /* Initialise the pointer after zero padding */
+    pDst += j;
+  }
+
+  else if (srcALen < srcBLen)
+  {
+    /* Initialization to inputB pointer */
+    pIn1 = pSrcB;
+
+    /* Initialization to the end of inputA pointer */
+    pIn2 = pSrcA + (srcALen - 1U);
+
+    /* Initialisation of the pointer after zero padding */
+    pDst = pDst + tot;
+
+    /* Swapping the lengths */
+    j = srcALen;
+    srcALen = srcBLen;
+    srcBLen = j;
+
+    /* Setting the reverse flag */
+    inv = 1;
+
+  }
+
+  /* Loop to calculate convolution for output length number of times */
+  for (i = 0U; i <= tot; i++)
+  {
+    /* Initialize sum with zero to carry on MAC operations */
+    sum = 0;
+
+    /* Loop to perform MAC operations according to convolution equation */
+    for (j = 0U; j <= i; j++)
+    {
+      /* Check the array limitations */
+      if ((((i - j) < srcBLen) && (j < srcALen)))
+      {
+        /* z[i] += x[i-j] * y[j] */
+        sum += ((q15_t) pIn1[j] * pIn2[-((int32_t) i - j)]);
+      }
+    }
+    /* Store the output in the destination buffer */
+    if (inv == 1)
+      *pDst-- = (q7_t) __SSAT((sum >> 7U), 8U);
+    else
+      *pDst++ = (q7_t) __SSAT((sum >> 7U), 8U);
+  }
+}
diff --git a/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir.c b/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir.c
new file mode 100644
index 0000000..3e72b87
--- /dev/null
+++ b/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir.c
@@ -0,0 +1,325 @@
+#include "ref.h"
+
+void ref_fir_f32(
+	const arm_fir_instance_f32 * S,
+	float32_t * pSrc,
+	float32_t * pDst,
+	uint32_t blockSize)
+{
+   float32_t *pState = S->pState;                 /* State pointer */
+   float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+   float32_t *pStateCurnt;                        /* Points to the current sample of the state */
+   uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
+   uint32_t i;                    								/* Loop counters */
+   float32_t acc;
+
+   /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+   /* pStateCurnt points to the location where the new input data should be written */
+   pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+   while (blockSize > 0U)
+   {
+      /* Copy one sample at a time into state buffer */
+      *pStateCurnt++ = *pSrc++;
+
+      /* Set the accumulator to zero */
+      acc = 0.0f;
+
+			for(i=0;i<numTaps;i++)
+			{
+				/* Perform the multiply-accumulates */
+				acc += pState[i] * pCoeffs[i];
+			}
+
+      /* The result is store in the destination buffer. */
+      *pDst++ = acc;
+
+      /* Advance state pointer by 1 for the next sample */
+      pState++;
+
+      blockSize--;
+   }
+
+   /* Processing is complete.         
+   ** Now copy the last numTaps - 1 samples to the starting of the state buffer.       
+   ** This prepares the state buffer for the next function call. */
+
+   /* Points to the start of the state buffer */
+   pStateCurnt = S->pState;
+
+   /* Copy data */
+	 for(i=0;i<numTaps-1;i++)
+	 {
+      pStateCurnt[i] = pState[i];
+	 }
+}
+
+void ref_fir_q31(
+  const arm_fir_instance_q31 * S,
+  q31_t * pSrc,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+   q31_t *pState = S->pState;                 /* State pointer */
+   q31_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+   q31_t *pStateCurnt;                        /* Points to the current sample of the state */
+   uint32_t numTaps = S->numTaps;             /* Number of filter coefficients in the filter */
+   uint32_t i;                    						/* Loop counters */
+   q63_t acc;
+
+   /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+   /* pStateCurnt points to the location where the new input data should be written */
+   pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+   while (blockSize > 0U)
+   {
+      /* Copy one sample at a time into state buffer */
+      *pStateCurnt++ = *pSrc++;
+
+      /* Set the accumulator to zero */
+      acc = 0.0f;
+
+			for(i=0;i<numTaps;i++)
+			{
+				/* Perform the multiply-accumulates */
+				acc += (q63_t)pState[i] * pCoeffs[i];
+			}
+
+      /* The result is store in the destination buffer. */
+      *pDst++ = (q31_t)(acc >> 31);
+
+      /* Advance state pointer by 1 for the next sample */
+      pState++;
+
+      blockSize--;
+   }
+
+   /* Processing is complete.         
+   ** Now copy the last numTaps - 1 samples to the starting of the state buffer.       
+   ** This prepares the state buffer for the next function call. */
+
+   /* Points to the start of the state buffer */
+   pStateCurnt = S->pState;
+
+   /* Copy data */
+	 for(i=0;i<numTaps-1;i++)
+	 {
+      pStateCurnt[i] = pState[i];
+	 }
+}
+
+void ref_fir_fast_q31(
+  const arm_fir_instance_q31 * S,
+  q31_t * pSrc,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+   q31_t *pState = S->pState;                 /* State pointer */
+   q31_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+   q31_t *pStateCurnt;                        /* Points to the current sample of the state */
+   uint32_t numTaps = S->numTaps;             /* Number of filter coefficients in the filter */
+   uint32_t i;                    						/* Loop counters */
+   q31_t acc;
+
+   /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+   /* pStateCurnt points to the location where the new input data should be written */
+   pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+   while (blockSize > 0U)
+   {
+      /* Copy one sample at a time into state buffer */
+      *pStateCurnt++ = *pSrc++;
+
+      /* Set the accumulator to zero */
+      acc = 0.0f;
+
+			for(i=0;i<numTaps;i++)
+			{
+				/* Perform the multiply-accumulates */
+				acc = (q31_t) (((((q63_t) acc) << 32) + ((q63_t) pState[i] * pCoeffs[i]) + 0x80000000LL ) >> 32);
+			}
+
+      /* The result is store in the destination buffer. */
+      *pDst++ = (q31_t)(acc << 1);
+
+      /* Advance state pointer by 1 for the next sample */
+      pState++;
+
+      blockSize--;
+   }
+
+   /* Processing is complete.         
+   ** Now copy the last numTaps - 1 samples to the starting of the state buffer.       
+   ** This prepares the state buffer for the next function call. */
+
+   /* Points to the start of the state buffer */
+   pStateCurnt = S->pState;
+
+   /* Copy data */
+	 for(i=0;i<numTaps-1;i++)
+	 {
+      pStateCurnt[i] = pState[i];
+	 }
+}
+
+void ref_fir_q15(
+  const arm_fir_instance_q15 * S,
+  q15_t * pSrc,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+   q15_t *pState = S->pState;                 /* State pointer */
+   q15_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+   q15_t *pStateCurnt;                        /* Points to the current sample of the state */
+   uint32_t numTaps = S->numTaps;             /* Number of filter coefficients in the filter */
+   uint32_t i;                    						/* Loop counters */
+   q63_t acc;
+
+   /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+   /* pStateCurnt points to the location where the new input data should be written */
+   pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+   while (blockSize > 0U)
+   {
+      /* Copy one sample at a time into state buffer */
+      *pStateCurnt++ = *pSrc++;
+
+      /* Set the accumulator to zero */
+      acc = 0.0f;
+
+			for(i=0;i<numTaps;i++)
+			{
+				/* Perform the multiply-accumulates */
+				acc += (q31_t)pState[i] * pCoeffs[i];
+			}
+
+      /* The result is store in the destination buffer. */
+      *pDst++ = ref_sat_q15(acc >> 15);
+
+      /* Advance state pointer by 1 for the next sample */
+      pState++;
+
+      blockSize--;
+   }
+
+   /* Processing is complete.         
+   ** Now copy the last numTaps - 1 samples to the starting of the state buffer.       
+   ** This prepares the state buffer for the next function call. */
+
+   /* Points to the start of the state buffer */
+   pStateCurnt = S->pState;
+
+   /* Copy data */
+	 for(i=0;i<numTaps;i++)
+	 {
+      pStateCurnt[i] = pState[i];
+	 }
+}
+
+void ref_fir_fast_q15(
+  const arm_fir_instance_q15 * S,
+  q15_t * pSrc,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+   q15_t *pState = S->pState;                 /* State pointer */
+   q15_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+   q15_t *pStateCurnt;                        /* Points to the current sample of the state */
+   uint32_t numTaps = S->numTaps;             /* Number of filter coefficients in the filter */
+   uint32_t i;                    						/* Loop counters */
+   q31_t acc;
+
+   /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+   /* pStateCurnt points to the location where the new input data should be written */
+   pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+   while (blockSize > 0U)
+   {
+      /* Copy one sample at a time into state buffer */
+      *pStateCurnt++ = *pSrc++;
+
+      /* Set the accumulator to zero */
+      acc = 0.0f;
+
+			for(i=0;i<numTaps;i++)
+			{
+				/* Perform the multiply-accumulates */
+				acc += (q31_t)pState[i] * pCoeffs[i];
+			}
+
+      /* The result is store in the destination buffer. */
+      *pDst++ = ref_sat_q15(acc >> 15);
+
+      /* Advance state pointer by 1 for the next sample */
+      pState++;
+
+      blockSize--;
+   }
+
+   /* Processing is complete.         
+   ** Now copy the last numTaps - 1 samples to the starting of the state buffer.       
+   ** This prepares the state buffer for the next function call. */
+
+   /* Points to the start of the state buffer */
+   pStateCurnt = S->pState;
+
+   /* Copy data */
+	 for(i=0;i<numTaps-1;i++)
+	 {
+      pStateCurnt[i] = pState[i];
+	 }
+}
+
+void ref_fir_q7(
+  const arm_fir_instance_q7 * S,
+  q7_t * pSrc,
+  q7_t * pDst,
+  uint32_t blockSize)
+{
+   q7_t *pState = S->pState;                 /* State pointer */
+   q7_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+   q7_t *pStateCurnt;                        /* Points to the current sample of the state */
+   uint32_t numTaps = S->numTaps;            /* Number of filter coefficients in the filter */
+   uint32_t i;                    					 /* Loop counters */
+   q31_t acc;
+
+   /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+   /* pStateCurnt points to the location where the new input data should be written */
+   pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+   while (blockSize > 0U)
+   {
+      /* Copy one sample at a time into state buffer */
+      *pStateCurnt++ = *pSrc++;
+
+      /* Set the accumulator to zero */
+      acc = 0.0f;
+
+			for(i=0;i<numTaps;i++)
+			{
+				/* Perform the multiply-accumulates */
+				acc += (q31_t)pState[i] * pCoeffs[i];
+			}
+
+      /* The result is store in the destination buffer. */
+      *pDst++ = ref_sat_q7(acc >> 7);
+
+      /* Advance state pointer by 1 for the next sample */
+      pState++;
+
+      blockSize--;
+   }
+
+   /* Processing is complete.         
+   ** Now copy the last numTaps - 1 samples to the starting of the state buffer.       
+   ** This prepares the state buffer for the next function call. */
+
+   /* Points to the start of the state buffer */
+   pStateCurnt = S->pState;
+
+   /* Copy data */
+	 for(i=0;i<numTaps-1;i++)
+	 {
+      pStateCurnt[i] = pState[i];
+	 }
+}
diff --git a/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir_decimate.c b/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir_decimate.c
new file mode 100644
index 0000000..7cbf127
--- /dev/null
+++ b/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir_decimate.c
@@ -0,0 +1,386 @@
+#include "ref.h"
+
+void ref_fir_decimate_f32(
+  const arm_fir_decimate_instance_f32 * S,
+  float32_t * pSrc,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+  float32_t *pState = S->pState;                 /* State pointer */
+  float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+  float32_t *pStateCurnt;                        /* Points to the current sample of the state */
+  float32_t sum0;                                /* Accumulator */
+  float32_t x0, c0;                              /* Temporary variables to hold state and coefficient values */
+  uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
+  uint32_t i, blkCnt;  									 				 /* Loop counters */
+
+  /* S->pState buffer contains previous frame (numTaps - 1) samples */
+  /* pStateCurnt points to the location where the new input data should be written */
+  pStateCurnt = S->pState + numTaps - 1U;
+
+  /* Total number of output samples to be computed */
+  blkCnt = blockSize / S->M;
+
+  while (blkCnt > 0U)
+  {
+    /* Copy decimation factor number of new input samples into the state buffer */
+    i = S->M;
+
+    do
+    {
+      *pStateCurnt++ = *pSrc++;
+    } while (--i);
+
+    /* Set accumulator to zero */
+    sum0 = 0.0f;
+
+		for(i=0;i<numTaps;i++)
+		{
+      /* Read coefficients */
+      c0 = pCoeffs[i];
+
+      /* Fetch 1 state variable */
+      x0 = pState[i];
+
+      /* Perform the multiply-accumulate */
+      sum0 += x0 * c0;
+		}
+
+    /* Advance the state pointer by the decimation factor           
+     * to process the next group of decimation factor number samples */
+    pState += S->M;
+
+    /* The result is in the accumulator, store in the destination buffer. */
+    *pDst++ = sum0;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+  /* Processing is complete.         
+   ** Now copy the last numTaps - 1 samples to the start of the state buffer.       
+   ** This prepares the state buffer for the next function call. */
+
+  /* Points to the start of the state buffer */
+  pStateCurnt = S->pState;
+
+  /* Copy numTaps number of values */
+  i = numTaps - 1U;
+
+  /* copy data */
+  while (i > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement the loop counter */
+    i--;
+  }
+}
+
+void ref_fir_decimate_q31(
+  const arm_fir_decimate_instance_q31 * S,
+  q31_t * pSrc,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+  q31_t *pState = S->pState;                     /* State pointer */
+  q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+  q31_t *pStateCurnt;                            /* Points to the current sample of the state */
+  q31_t x0, c0;                                  /* Temporary variables to hold state and coefficient values */
+  q63_t sum0;                                    /* Accumulator */
+  uint32_t numTaps = S->numTaps;                 /* Number of taps */
+  uint32_t i, blkCnt;  													 /* Loop counters */
+
+  /* S->pState buffer contains previous frame (numTaps - 1) samples */
+  /* pStateCurnt points to the location where the new input data should be written */
+  pStateCurnt = S->pState + numTaps - 1U;
+
+  /* Total number of output samples to be computed */
+  blkCnt = blockSize / S->M;
+
+  while (blkCnt > 0U)
+  {
+    /* Copy decimation factor number of new input samples into the state buffer */
+    i = S->M;
+
+    do
+    {
+      *pStateCurnt++ = *pSrc++;
+
+    } while (--i);
+
+    /* Set accumulator to zero */
+    sum0 = 0;
+
+		for(i=0;i<numTaps;i++)
+		{
+      /* Read coefficients */
+      c0 = pCoeffs[i];
+
+      /* Fetch 1 state variable */
+      x0 = pState[i];
+
+      /* Perform the multiply-accumulate */
+      sum0 += (q63_t)x0 * c0;
+		}
+
+    /* Advance the state pointer by the decimation factor           
+     * to process the next group of decimation factor number samples */
+    pState = pState + S->M;
+
+    /* The result is in the accumulator, store in the destination buffer. */
+    *pDst++ = (q31_t) (sum0 >> 31);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Processing is complete.         
+   ** Now copy the last numTaps - 1 samples to the start of the state buffer.       
+   ** This prepares the state buffer for the next function call. */
+
+  /* Points to the start of the state buffer */
+  pStateCurnt = S->pState;
+
+  i = numTaps - 1U;
+
+  /* copy data */
+  while (i > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement the loop counter */
+    i--;
+  }
+}
+
+void ref_fir_decimate_fast_q31(
+  const arm_fir_decimate_instance_q31 * S,
+  q31_t * pSrc,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+  q31_t *pState = S->pState;                     /* State pointer */
+  q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+  q31_t *pStateCurnt;                            /* Points to the current sample of the state */
+  q31_t x0, c0;                                  /* Temporary variables to hold state and coefficient values */
+  q31_t sum0;                                    /* Accumulator */
+  uint32_t numTaps = S->numTaps;                 /* Number of taps */
+  uint32_t i, blkCnt;  													 /* Loop counters */
+
+  /* S->pState buffer contains previous frame (numTaps - 1) samples */
+  /* pStateCurnt points to the location where the new input data should be written */
+  pStateCurnt = S->pState + numTaps - 1U;
+
+  /* Total number of output samples to be computed */
+  blkCnt = blockSize / S->M;
+
+  while (blkCnt > 0U)
+  {
+    /* Copy decimation factor number of new input samples into the state buffer */
+    i = S->M;
+
+    do
+    {
+      *pStateCurnt++ = *pSrc++;
+
+    } while (--i);
+
+    /* Set accumulator to zero */
+    sum0 = 0;
+
+		for(i=0;i<numTaps;i++)
+		{
+      /* Read coefficients */
+      c0 = pCoeffs[i];
+
+      /* Fetch 1 state variable */
+      x0 = pState[i];
+
+      /* Perform the multiply-accumulate */
+			sum0 = (q31_t)((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32);
+		}
+
+    /* Advance the state pointer by the decimation factor           
+     * to process the next group of decimation factor number samples */
+    pState = pState + S->M;
+
+    /* The result is in the accumulator, store in the destination buffer. */
+    *pDst++ = (q31_t) (sum0 << 1);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Processing is complete.         
+   ** Now copy the last numTaps - 1 samples to the start of the state buffer.       
+   ** This prepares the state buffer for the next function call. */
+
+  /* Points to the start of the state buffer */
+  pStateCurnt = S->pState;
+
+  i = numTaps - 1U;
+
+  /* copy data */
+  while (i > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement the loop counter */
+    i--;
+  }
+}
+
+void ref_fir_decimate_q15(
+  const arm_fir_decimate_instance_q15 * S,
+  q15_t * pSrc,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+  q15_t *pState = S->pState;                     /* State pointer */
+  q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+  q15_t *pStateCurnt;                            /* Points to the current sample of the state */
+  q31_t x0, c0;                                  /* Temporary variables to hold state and coefficient values */
+  q63_t sum0;                                    /* Accumulator */
+  uint32_t numTaps = S->numTaps;                 /* Number of taps */
+  uint32_t i, blkCnt;  													 /* Loop counters */
+
+  /* S->pState buffer contains previous frame (numTaps - 1) samples */
+  /* pStateCurnt points to the location where the new input data should be written */
+  pStateCurnt = S->pState + numTaps - 1U;
+
+  /* Total number of output samples to be computed */
+  blkCnt = blockSize / S->M;
+
+  while (blkCnt > 0U)
+  {
+    /* Copy decimation factor number of new input samples into the state buffer */
+    i = S->M;
+
+    do
+    {
+      *pStateCurnt++ = *pSrc++;
+
+    } while (--i);
+
+    /* Set accumulator to zero */
+    sum0 = 0;
+
+		for(i=0;i<numTaps;i++)
+		{
+      /* Read coefficients */
+      c0 = pCoeffs[i];
+
+      /* Fetch 1 state variable */
+      x0 = pState[i];
+
+      /* Perform the multiply-accumulate */
+      sum0 += (q31_t)x0 * c0;
+		}
+
+    /* Advance the state pointer by the decimation factor           
+     * to process the next group of decimation factor number samples */
+    pState = pState + S->M;
+
+    /* The result is in the accumulator, store in the destination buffer. */
+    *pDst++ = ref_sat_q15(sum0 >> 15);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Processing is complete.         
+   ** Now copy the last numTaps - 1 samples to the start of the state buffer.       
+   ** This prepares the state buffer for the next function call. */
+
+  /* Points to the start of the state buffer */
+  pStateCurnt = S->pState;
+
+  i = numTaps - 1U;
+
+  /* copy data */
+  while (i > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement the loop counter */
+    i--;
+  }
+}
+
+void ref_fir_decimate_fast_q15(
+  const arm_fir_decimate_instance_q15 * S,
+  q15_t * pSrc,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+  q15_t *pState = S->pState;                     /* State pointer */
+  q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+  q15_t *pStateCurnt;                            /* Points to the current sample of the state */
+  q15_t x0, c0;                                  /* Temporary variables to hold state and coefficient values */
+  q31_t sum0;                                    /* Accumulator */
+  uint32_t numTaps = S->numTaps;                 /* Number of taps */
+  uint32_t i, blkCnt;  													 /* Loop counters */
+
+  /* S->pState buffer contains previous frame (numTaps - 1) samples */
+  /* pStateCurnt points to the location where the new input data should be written */
+  pStateCurnt = S->pState + numTaps - 1U;
+
+  /* Total number of output samples to be computed */
+  blkCnt = blockSize / S->M;
+
+  while (blkCnt > 0U)
+  {
+    /* Copy decimation factor number of new input samples into the state buffer */
+    i = S->M;
+
+    do
+    {
+      *pStateCurnt++ = *pSrc++;
+
+    } while (--i);
+
+    /* Set accumulator to zero */
+    sum0 = 0;
+
+		for(i=0;i<numTaps;i++)
+		{
+      /* Read coefficients */
+      c0 = pCoeffs[i];
+
+      /* Fetch 1 state variable */
+      x0 = pState[i];
+
+      /* Perform the multiply-accumulate */
+      sum0 += x0 * c0;
+		}
+
+    /* Advance the state pointer by the decimation factor           
+     * to process the next group of decimation factor number samples */
+    pState = pState + S->M;
+
+    /* The result is in the accumulator, store in the destination buffer. */
+    *pDst++ = ref_sat_q15(sum0 >> 15);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Processing is complete.         
+   ** Now copy the last numTaps - 1 samples to the start of the state buffer.       
+   ** This prepares the state buffer for the next function call. */
+
+  /* Points to the start of the state buffer */
+  pStateCurnt = S->pState;
+
+  i = numTaps - 1U;
+
+  /* copy data */
+  while (i > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement the loop counter */
+    i--;
+  }
+}
+
diff --git a/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir_interpolate.c b/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir_interpolate.c
new file mode 100644
index 0000000..8abb089
--- /dev/null
+++ b/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir_interpolate.c
@@ -0,0 +1,291 @@
+#include "ref.h"
+
+void ref_fir_interpolate_f32(
+  const arm_fir_interpolate_instance_f32 * S,
+  float32_t * pSrc,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+  float32_t *pState = S->pState;                 /* State pointer */
+  float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+  float32_t *pStateCurnt;                        /* Points to the current sample of the state */
+  float32_t *ptr1, *ptr2;                        /* Temporary pointers for state and coefficient buffers */
+  float32_t sum;                                 /* Accumulator */
+  uint32_t i, blkCnt;                            /* Loop counters */
+  uint16_t phaseLen = S->phaseLength, tapCnt;    /* Length of each polyphase filter component */
+
+
+  /* S->pState buffer contains previous frame (phaseLen - 1) samples */
+  /* pStateCurnt points to the location where the new input data should be written */
+  pStateCurnt = S->pState + phaseLen - 1;
+
+  /* Total number of intput samples */
+  blkCnt = blockSize;
+
+  /* Loop over the blockSize. */
+  while (blkCnt > 0U)
+  {
+    /* Copy new input sample into the state buffer */
+    *pStateCurnt++ = *pSrc++;
+
+    /* Loop over the Interpolation factor. */
+    i = S->L;
+
+    while (i > 0U)
+    {
+      /* Set accumulator to zero */
+      sum = 0.0f;
+
+      /* Initialize state pointer */
+      ptr1 = pState;
+
+      /* Initialize coefficient pointer */
+      ptr2 = pCoeffs + i - 1;
+
+      /* Loop over the polyPhase length */
+      tapCnt = phaseLen;
+
+      while (tapCnt > 0U)
+      {
+        /* Perform the multiply-accumulate */
+        sum += *ptr1++ * *ptr2;
+
+        /* Increment the coefficient pointer by interpolation factor times. */
+        ptr2 += S->L;
+
+        /* Decrement the loop counter */
+        tapCnt--;
+      }
+
+      /* The result is in the accumulator, store in the destination buffer. */
+      *pDst++ = sum;
+
+      /* Decrement the loop counter */
+      i--;
+    }
+
+    /* Advance the state pointer by 1           
+     * to process the next group of interpolation factor number samples */
+    pState = pState + 1;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Processing is complete.         
+   ** Now copy the last phaseLen - 1 samples to the start of the state buffer.       
+   ** This prepares the state buffer for the next function call. */
+
+  /* Points to the start of the state buffer */
+  pStateCurnt = S->pState;
+
+  tapCnt = phaseLen - 1U;
+
+  while (tapCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement the loop counter */
+    tapCnt--;
+  }
+
+}
+
+void ref_fir_interpolate_q31(
+  const arm_fir_interpolate_instance_q31 * S,
+  q31_t * pSrc,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+  q31_t *pState = S->pState;                     /* State pointer */
+  q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+  q31_t *pStateCurnt;                            /* Points to the current sample of the state */
+  q31_t *ptr1, *ptr2;                            /* Temporary pointers for state and coefficient buffers */
+
+  /* Run the below code for Cortex-M0 */
+
+  q63_t sum;                                     /* Accumulator */
+  q31_t x0, c0;                                  /* Temporary variables to hold state and coefficient values */
+  uint32_t i, blkCnt;                            /* Loop counters */
+  uint16_t phaseLen = S->phaseLength, tapCnt;    /* Length of each polyphase filter component */
+
+
+  /* S->pState buffer contains previous frame (phaseLen - 1) samples */
+  /* pStateCurnt points to the location where the new input data should be written */
+  pStateCurnt = S->pState + (q31_t)phaseLen - 1;
+
+  /* Total number of intput samples */
+  blkCnt = blockSize;
+
+  /* Loop over the blockSize. */
+  while (blkCnt > 0U)
+  {
+    /* Copy new input sample into the state buffer */
+    *pStateCurnt++ = *pSrc++;
+
+    /* Loop over the Interpolation factor. */
+    i = S->L;
+
+    while (i > 0U)
+    {
+      /* Set accumulator to zero */
+      sum = 0;
+
+      /* Initialize state pointer */
+      ptr1 = pState;
+
+      /* Initialize coefficient pointer */
+      ptr2 = pCoeffs + i - 1;
+
+      tapCnt = phaseLen;
+
+      while (tapCnt > 0U)
+      {
+        /* Read the coefficient */
+        c0 = *(ptr2);
+
+        /* Increment the coefficient pointer by interpolation factor times. */
+        ptr2 += S->L;
+
+        /* Read the input sample */
+        x0 = *ptr1++;
+
+        /* Perform the multiply-accumulate */
+        sum += (q63_t) x0 *c0;
+
+        /* Decrement the loop counter */
+        tapCnt--;
+      }
+
+      /* The result is in the accumulator, store in the destination buffer. */
+      *pDst++ = (q31_t)(sum >> 31);
+
+      /* Decrement the loop counter */
+      i--;
+    }
+
+    /* Advance the state pointer by 1           
+     * to process the next group of interpolation factor number samples */
+    pState = pState + 1;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Processing is complete.         
+   ** Now copy the last phaseLen - 1 samples to the satrt of the state buffer.       
+   ** This prepares the state buffer for the next function call. */
+
+  /* Points to the start of the state buffer */
+  pStateCurnt = S->pState;
+
+  tapCnt = phaseLen - 1U;
+
+  /* copy data */
+  while (tapCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement the loop counter */
+    tapCnt--;
+  }
+
+}
+
+void ref_fir_interpolate_q15(
+  const arm_fir_interpolate_instance_q15 * S,
+  q15_t * pSrc,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+  q15_t *pState = S->pState;                     /* State pointer                                            */
+  q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer                                      */
+  q15_t *pStateCurnt;                            /* Points to the current sample of the state                */
+  q15_t *ptr1, *ptr2;                            /* Temporary pointers for state and coefficient buffers     */
+  q63_t sum;                                     /* Accumulator */
+  q15_t x0, c0;                                  /* Temporary variables to hold state and coefficient values */
+  uint32_t i, blkCnt, tapCnt;                    /* Loop counters                                            */
+  uint16_t phaseLen = S->phaseLength;            /* Length of each polyphase filter component */
+
+
+  /* S->pState buffer contains previous frame (phaseLen - 1) samples */
+  /* pStateCurnt points to the location where the new input data should be written */
+  pStateCurnt = S->pState + phaseLen - 1;
+
+  /* Total number of intput samples */
+  blkCnt = blockSize;
+
+  /* Loop over the blockSize. */
+  while (blkCnt > 0U)
+  {
+    /* Copy new input sample into the state buffer */
+    *pStateCurnt++ = *pSrc++;
+
+    /* Loop over the Interpolation factor. */
+    i = S->L;
+
+    while (i > 0U)
+    {
+      /* Set accumulator to zero */
+      sum = 0;
+
+      /* Initialize state pointer */
+      ptr1 = pState;
+
+      /* Initialize coefficient pointer */
+      ptr2 = pCoeffs + i - 1;
+
+      /* Loop over the polyPhase length */
+      tapCnt = (uint32_t)phaseLen;
+
+      while (tapCnt > 0U)
+      {
+        /* Read the coefficient */
+        c0 = *ptr2;
+
+        /* Increment the coefficient pointer by interpolation factor times. */
+        ptr2 += S->L;
+
+        /* Read the input sample */
+        x0 = *ptr1++;
+
+        /* Perform the multiply-accumulate */
+        sum += (q31_t) x0 * c0;
+
+        /* Decrement the loop counter */
+        tapCnt--;
+      }
+
+      /* Store the result after converting to 1.15 format in the destination buffer */
+      *pDst++ = ref_sat_q15(sum >> 15);
+
+      /* Decrement the loop counter */
+      i--;
+    }
+
+    /* Advance the state pointer by 1           
+     * to process the next group of interpolation factor number samples */
+    pState = pState + 1;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Processing is complete.         
+   ** Now copy the last phaseLen - 1 samples to the start of the state buffer.       
+   ** This prepares the state buffer for the next function call. */
+
+  /* Points to the start of the state buffer */
+  pStateCurnt = S->pState;
+
+  i = (uint32_t) phaseLen - 1U;
+
+  while (i > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement the loop counter */
+    i--;
+  }
+
+}
diff --git a/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir_lattice.c b/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir_lattice.c
new file mode 100644
index 0000000..6466106
--- /dev/null
+++ b/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir_lattice.c
@@ -0,0 +1,241 @@
+#include "ref.h"
+
+void ref_fir_lattice_f32(
+  const arm_fir_lattice_instance_f32 * S,
+  float32_t * pSrc,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+  float32_t *pState;                             /* State pointer */
+  float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+  float32_t *px;                                 /* temporary state pointer */
+  float32_t *pk;                                 /* temporary coefficient pointer */
+  float32_t fcurr, fnext, gcurr, gnext;          /* temporary variables */
+  uint32_t numStages = S->numStages;             /* Length of the filter */
+  uint32_t blkCnt, stageCnt;                     /* temporary variables for counts */
+
+  pState = &S->pState[0];
+
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    /* f0(n) = x(n) */
+    fcurr = *pSrc++;
+
+    /* Initialize coeff pointer */
+    pk = pCoeffs;
+
+    /* Initialize state pointer */
+    px = pState;
+
+    /* read g0(n-1) from state buffer */
+    gcurr = *px;
+
+    /* for sample 1 processing */
+    /* f1(n) = f0(n) +  K1 * g0(n-1) */
+    fnext = fcurr + ((*pk) * gcurr);
+    /* g1(n) = f0(n) * K1  +  g0(n-1) */
+    gnext = (fcurr * (*pk++)) + gcurr;
+
+    /* save f0(n) in state buffer */
+    *px++ = fcurr;
+
+    /* f1(n) is saved in fcurr            
+       for next stage processing */
+    fcurr = fnext;
+
+    stageCnt = (numStages - 1U);
+
+    /* stage loop */
+    while (stageCnt > 0U)
+    {
+      /* read g2(n) from state buffer */
+      gcurr = *px;
+
+      /* save g1(n) in state buffer */
+      *px++ = gnext;
+
+      /* Sample processing for K2, K3.... */
+      /* f2(n) = f1(n) +  K2 * g1(n-1) */
+      fnext = fcurr + ((*pk) * gcurr);
+      /* g2(n) = f1(n) * K2  +  g1(n-1) */
+      gnext = (fcurr * (*pk++)) + gcurr;
+
+      /* f1(n) is saved in fcurr1            
+         for next stage processing */
+      fcurr = fnext;
+
+      stageCnt--;
+    }
+
+    /* y(n) = fN(n) */
+    *pDst++ = fcurr;
+
+    blkCnt--;
+  }
+}
+
+void ref_fir_lattice_q31(
+  const arm_fir_lattice_instance_q31 * S,
+  q31_t * pSrc,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+  q31_t *pState;                                 /* State pointer */
+  q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+  q31_t *px;                                     /* temporary state pointer */
+  q31_t *pk;                                     /* temporary coefficient pointer */
+  q31_t fcurr, fnext, gcurr, gnext;              /* temporary variables */
+  uint32_t numStages = S->numStages;             /* Length of the filter */
+  uint32_t blkCnt, stageCnt;                     /* temporary variables for counts */
+
+  pState = &S->pState[0];
+
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    /* f0(n) = x(n) */
+    fcurr = *pSrc++;
+
+    /* Initialize coeff pointer */
+    pk = pCoeffs;
+
+    /* Initialize state pointer */
+    px = pState;
+
+    /* read g0(n-1) from state buffer */
+    gcurr = *px;
+
+    /* for sample 1 processing */
+    /* f1(n) = f0(n) +  K1 * g0(n-1) */
+    fnext = (q31_t) (((q63_t) gcurr * (*pk)) >> 31) + fcurr;
+    /* g1(n) = f0(n) * K1  +  g0(n-1) */
+    gnext = (q31_t) (((q63_t) fcurr * (*pk++)) >> 31) + gcurr;
+    /* save g1(n) in state buffer */
+    *px++ = fcurr;
+
+    /* f1(n) is saved in fcurr1            
+       for next stage processing */
+    fcurr = fnext;
+
+    stageCnt = (numStages - 1U);
+
+    /* stage loop */
+    while (stageCnt > 0U)
+    {
+      /* read g2(n) from state buffer */
+      gcurr = *px;
+
+      /* save g1(n) in state buffer */
+      *px++ = gnext;
+
+      /* Sample processing for K2, K3.... */
+      /* f2(n) = f1(n) +  K2 * g1(n-1) */
+      fnext = (q31_t) (((q63_t) gcurr * (*pk)) >> 31) + fcurr;
+      /* g2(n) = f1(n) * K2  +  g1(n-1) */
+      gnext = (q31_t) (((q63_t) fcurr * (*pk++)) >> 31) + gcurr;
+
+      /* f1(n) is saved in fcurr1            
+         for next stage processing */
+      fcurr = fnext;
+
+      stageCnt--;
+
+    }
+
+    /* y(n) = fN(n) */
+    *pDst++ = fcurr;
+
+    blkCnt--;
+
+  }
+}
+
+void ref_fir_lattice_q15(
+  const arm_fir_lattice_instance_q15 * S,
+  q15_t * pSrc,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+  q15_t *pState;                                 /* State pointer */
+  q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+  q15_t *px;                                     /* temporary state pointer */
+  q15_t *pk;                                     /* temporary coefficient pointer */
+  q31_t fcurnt, fnext, gcurnt, gnext;            /* temporary variables */
+  uint32_t numStages = S->numStages;             /* Length of the filter */
+  uint32_t blkCnt, stageCnt;                     /* temporary variables for counts */
+
+  pState = &S->pState[0];
+
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    /* f0(n) = x(n) */
+    fcurnt = *pSrc++;
+
+    /* Initialize coeff pointer */
+    pk = (pCoeffs);
+
+    /* Initialize state pointer */
+    px = pState;
+
+    /* read g0(n-1) from state buffer */
+    gcurnt = *px;
+
+    /* for sample 1 processing */
+    /* f1(n) = f0(n) +  K1 * g0(n-1) */
+    fnext = ((gcurnt * (*pk)) >> 15U) + fcurnt;
+    fnext = ref_sat_q15(fnext);
+
+
+    /* g1(n) = f0(n) * K1  +  g0(n-1) */
+    gnext = ((fcurnt * (*pk++)) >> 15U) + gcurnt;
+    gnext = ref_sat_q15(gnext);
+
+    /* save f0(n) in state buffer */
+    *px++ = (q15_t) fcurnt;
+
+    /* f1(n) is saved in fcurnt            
+       for next stage processing */
+    fcurnt = fnext;
+
+    stageCnt = (numStages - 1U);
+
+    /* stage loop */
+    while (stageCnt > 0U)
+    {
+      /* read g1(n-1) from state buffer */
+      gcurnt = *px;
+
+      /* save g0(n-1) in state buffer */
+      *px++ = (q15_t) gnext;
+
+      /* Sample processing for K2, K3.... */
+      /* f2(n) = f1(n) +  K2 * g1(n-1) */
+      fnext = ((gcurnt * (*pk)) >> 15U) + fcurnt;
+      fnext = ref_sat_q15(fnext);
+
+      /* g2(n) = f1(n) * K2  +  g1(n-1) */
+      gnext = ((fcurnt * (*pk++)) >> 15U) + gcurnt;
+      gnext = ref_sat_q15(gnext);
+
+
+      /* f1(n) is saved in fcurnt            
+         for next stage processing */
+      fcurnt = fnext;
+
+      stageCnt--;
+
+    }
+
+    /* y(n) = fN(n) */
+    *pDst++ = ref_sat_q15(fcurnt);
+
+
+    blkCnt--;
+
+  }
+}
diff --git a/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir_sparse.c b/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir_sparse.c
new file mode 100644
index 0000000..0638313
--- /dev/null
+++ b/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir_sparse.c
@@ -0,0 +1,485 @@
+#include "ref.h"
+
+void ref_fir_sparse_f32(
+  arm_fir_sparse_instance_f32 * S,
+  float32_t * pSrc,
+  float32_t * pDst,
+  float32_t * pScratchIn,
+  uint32_t blockSize)
+{
+  float32_t *pState = S->pState;                 /* State pointer */
+  float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+  float32_t *px;                                 /* Scratch buffer pointer */
+  float32_t *py = pState;                        /* Temporary pointers for state buffer */
+  float32_t *pb = pScratchIn;                    /* Temporary pointers for scratch buffer */
+  float32_t *pOut;                               /* Destination pointer */
+  int32_t *pTapDelay = S->pTapDelay;             /* Pointer to the array containing offset of the non-zero tap values. */
+  uint32_t delaySize = S->maxDelay + blockSize;  /* state length */
+  uint16_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter  */
+  int32_t readIndex;                             /* Read index of the state buffer */
+  uint32_t tapCnt, blkCnt;                       /* loop counters */
+  float32_t coeff = *pCoeffs++;                  /* Read the first coefficient value */
+
+
+  /* BlockSize of Input samples are copied into the state buffer */
+  /* StateIndex points to the starting position to write in the state buffer */
+  arm_circularWrite_f32((int32_t *) py, delaySize, &S->stateIndex, 1,
+                        (int32_t *) pSrc, 1, blockSize);
+
+
+  /* Read Index, from where the state buffer should be read, is calculated. */
+  readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
+
+  /* Wraparound of readIndex */
+  if (readIndex < 0)
+  {
+    readIndex += (int32_t) delaySize;
+  }
+
+  /* Working pointer for state buffer is updated */
+  py = pState;
+
+  /* blockSize samples are read from the state buffer */
+  arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
+                       (int32_t *) pb, (int32_t *) pb, blockSize, 1,
+                       blockSize);
+
+  /* Working pointer for the scratch buffer */
+  px = pb;
+
+  /* Working pointer for destination buffer */
+  pOut = pDst;
+
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    /* Perform Multiplications and store in destination buffer */
+    *pOut++ = *px++ * coeff;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Loop over the number of taps. */
+  tapCnt = (uint32_t) numTaps - 1U;
+
+  while (tapCnt > 0U)
+  {
+    /* Load the coefficient value and
+     * increment the coefficient buffer for the next set of state values */
+    coeff = *pCoeffs++;
+
+    /* Read Index, from where the state buffer should be read, is calculated. */
+    readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
+
+    /* Wraparound of readIndex */
+    if (readIndex < 0)
+    {
+      readIndex += (int32_t) delaySize;
+    }
+
+    /* Working pointer for state buffer is updated */
+    py = pState;
+
+    /* blockSize samples are read from the state buffer */
+    arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
+                         (int32_t *) pb, (int32_t *) pb, blockSize, 1,
+                         blockSize);
+
+    /* Working pointer for the scratch buffer */
+    px = pb;
+
+    /* Working pointer for destination buffer */
+    pOut = pDst;
+
+    blkCnt = blockSize;
+
+    while (blkCnt > 0U)
+    {
+      /* Perform Multiply-Accumulate */
+      *pOut++ += *px++ * coeff;
+
+      /* Decrement the loop counter */
+      blkCnt--;
+    }
+
+    /* Decrement the tap loop counter */
+    tapCnt--;
+  }
+}
+
+void ref_fir_sparse_q31(
+  arm_fir_sparse_instance_q31 * S,
+  q31_t * pSrc,
+  q31_t * pDst,
+  q31_t * pScratchIn,
+  uint32_t blockSize)
+{
+  q31_t *pState = S->pState;                     /* State pointer */
+  q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+  q31_t *px;                                     /* Scratch buffer pointer */
+  q31_t *py = pState;                            /* Temporary pointers for state buffer */
+  q31_t *pb = pScratchIn;                        /* Temporary pointers for scratch buffer */
+  q31_t *pOut;                                   /* Destination pointer */
+  q63_t out;                                     /* Temporary output variable */
+  int32_t *pTapDelay = S->pTapDelay;             /* Pointer to the array containing offset of the non-zero tap values. */
+  uint32_t delaySize = S->maxDelay + blockSize;  /* state length */
+  uint16_t numTaps = S->numTaps;                 /* Filter order */
+  int32_t readIndex;                             /* Read index of the state buffer */
+  uint32_t tapCnt, blkCnt;                       /* loop counters */
+  q31_t coeff = *pCoeffs++;                      /* Read the first coefficient value */
+  q31_t in;
+
+
+  /* BlockSize of Input samples are copied into the state buffer */
+  /* StateIndex points to the starting position to write in the state buffer */
+  arm_circularWrite_f32((int32_t *) py, delaySize, &S->stateIndex, 1,
+                        (int32_t *) pSrc, 1, blockSize);
+
+  /* Read Index, from where the state buffer should be read, is calculated. */
+  readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
+
+  /* Wraparound of readIndex */
+  if (readIndex < 0)
+  {
+    readIndex += (int32_t) delaySize;
+  }
+
+  /* Working pointer for state buffer is updated */
+  py = pState;
+
+  /* blockSize samples are read from the state buffer */
+  arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
+                       (int32_t *) pb, (int32_t *) pb, blockSize, 1,
+                       blockSize);
+
+  /* Working pointer for the scratch buffer of state values */
+  px = pb;
+
+  /* Working pointer for scratch buffer of output values */
+  pOut = pDst;
+  
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    /* Perform Multiplications and store in the destination buffer */
+    *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Loop over the number of taps. */
+  tapCnt = (uint32_t) numTaps - 1U;
+
+  while (tapCnt > 0U)
+  {
+    /* Load the coefficient value and           
+     * increment the coefficient buffer for the next set of state values */
+    coeff = *pCoeffs++;
+
+    /* Read Index, from where the state buffer should be read, is calculated. */
+    readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
+
+    /* Wraparound of readIndex */
+    if (readIndex < 0)
+    {
+      readIndex += (int32_t) delaySize;
+    }
+
+    /* Working pointer for state buffer is updated */
+    py = pState;
+
+    /* blockSize samples are read from the state buffer */
+    arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
+                         (int32_t *) pb, (int32_t *) pb, blockSize, 1,
+                         blockSize);
+
+    /* Working pointer for the scratch buffer of state values */
+    px = pb;
+
+    /* Working pointer for scratch buffer of output values */
+    pOut = pDst;
+
+    blkCnt = blockSize;
+
+    while (blkCnt > 0U)
+    {
+      /* Perform Multiply-Accumulate */
+      out = *pOut;
+      out += ((q63_t) * px++ * coeff) >> 32;
+      *pOut++ = (q31_t) (out);
+
+      /* Decrement the loop counter */
+      blkCnt--;
+    }
+
+    /* Decrement the tap loop counter */
+    tapCnt--;
+  }
+
+  /* Working output pointer is updated */
+  pOut = pDst;
+
+  /* Output is converted into 1.31 format. */
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    in = *pOut << 1;
+    *pOut++ = in;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+}
+
+void ref_fir_sparse_q15(
+  arm_fir_sparse_instance_q15 * S,
+  q15_t * pSrc,
+  q15_t * pDst,
+  q15_t * pScratchIn,
+  q31_t * pScratchOut,
+  uint32_t blockSize)
+{
+  q15_t *pState = S->pState;                     /* State pointer */
+  q15_t *pIn = pSrc;                             /* Working pointer for input */
+  q15_t *pOut = pDst;                            /* Working pointer for output */
+  q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+  q15_t *px;                                     /* Temporary pointers for scratch buffer */
+  q15_t *pb = pScratchIn;                        /* Temporary pointers for scratch buffer */
+  q15_t *py = pState;                            /* Temporary pointers for state buffer */
+  int32_t *pTapDelay = S->pTapDelay;             /* Pointer to the array containing offset of the non-zero tap values. */
+  uint32_t delaySize = S->maxDelay + blockSize;  /* state length */
+  uint16_t numTaps = S->numTaps;                 /* Filter order */
+  int32_t readIndex;                             /* Read index of the state buffer */
+  uint32_t tapCnt, blkCnt;                       /* loop counters */
+  q15_t coeff = *pCoeffs++;                      /* Read the first coefficient value */
+  q31_t *pScr2 = pScratchOut;                    /* Working pointer for pScratchOut */
+
+  /* BlockSize of Input samples are copied into the state buffer */
+  /* StateIndex points to the starting position to write in the state buffer */
+  arm_circularWrite_q15(py, delaySize, &S->stateIndex, 1, pIn, 1, blockSize);
+
+  /* Loop over the number of taps. */
+  tapCnt = numTaps;
+
+  /* Read Index, from where the state buffer should be read, is calculated. */
+  readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
+
+  /* Wraparound of readIndex */
+  if (readIndex < 0)
+  {
+    readIndex += (int32_t) delaySize;
+  }
+
+  /* Working pointer for state buffer is updated */
+  py = pState;
+
+  /* blockSize samples are read from the state buffer */
+  arm_circularRead_q15(py, delaySize, &readIndex, 1,
+                       pb, pb, blockSize, 1, blockSize);
+
+  /* Working pointer for the scratch buffer of state values */
+  px = pb;
+
+  /* Working pointer for scratch buffer of output values */
+  pScratchOut = pScr2;
+
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    /* Perform multiplication and store in the scratch buffer */
+    *pScratchOut++ = ((q31_t) * px++ * coeff);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Loop over the number of taps. */
+  tapCnt = (uint32_t) numTaps - 1U;
+
+  while (tapCnt > 0U)
+  {
+    /* Load the coefficient value and           
+     * increment the coefficient buffer for the next set of state values */
+    coeff = *pCoeffs++;
+
+    /* Read Index, from where the state buffer should be read, is calculated. */
+    readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
+
+    /* Wraparound of readIndex */
+    if (readIndex < 0)
+    {
+      readIndex += (int32_t) delaySize;
+    }
+
+    /* Working pointer for state buffer is updated */
+    py = pState;
+
+    /* blockSize samples are read from the state buffer */
+    arm_circularRead_q15(py, delaySize, &readIndex, 1,
+                         pb, pb, blockSize, 1, blockSize);
+
+    /* Working pointer for the scratch buffer of state values */
+    px = pb;
+
+    /* Working pointer for scratch buffer of output values */
+    pScratchOut = pScr2;
+
+    blkCnt = blockSize;
+
+    while (blkCnt > 0U)
+    {
+      /* Perform Multiply-Accumulate */
+      *pScratchOut++ += (q31_t) * px++ * coeff;
+
+      /* Decrement the loop counter */
+      blkCnt--;
+    }
+
+    /* Decrement the tap loop counter */
+    tapCnt--;
+  }
+
+  /* All the output values are in pScratchOut buffer.       
+     Convert them into 1.15 format, saturate and store in the destination buffer. */
+  /* Loop over the blockSize. */
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16);
+    blkCnt--;
+  }
+}
+
+void ref_fir_sparse_q7(
+  arm_fir_sparse_instance_q7 * S,
+  q7_t *pSrc,
+  q7_t *pDst,
+  q7_t *pScratchIn,
+  q31_t * pScratchOut,
+  uint32_t blockSize)
+{
+  q7_t *pState = S->pState;                      /* State pointer */
+  q7_t *pCoeffs = S->pCoeffs;                    /* Coefficient pointer */
+  q7_t *px;                                      /* Scratch buffer pointer */
+  q7_t *py = pState;                             /* Temporary pointers for state buffer */
+  q7_t *pb = pScratchIn;                         /* Temporary pointers for scratch buffer */
+  q7_t *pOut = pDst;                             /* Destination pointer */
+  int32_t *pTapDelay = S->pTapDelay;             /* Pointer to the array containing offset of the non-zero tap values. */
+  uint32_t delaySize = S->maxDelay + blockSize;  /* state length */
+  uint16_t numTaps = S->numTaps;                 /* Filter order */
+  int32_t readIndex;                             /* Read index of the state buffer */
+  uint32_t tapCnt, blkCnt;                       /* loop counters */
+  q7_t coeff = *pCoeffs++;                       /* Read the coefficient value */
+  q31_t *pScr2 = pScratchOut;                    /* Working pointer for scratch buffer of output values */
+  q31_t in;
+
+  /* BlockSize of Input samples are copied into the state buffer */
+  /* StateIndex points to the starting position to write in the state buffer */
+  arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1,
+                       blockSize);
+
+  /* Loop over the number of taps. */
+  tapCnt = numTaps;
+
+  /* Read Index, from where the state buffer should be read, is calculated. */
+  readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
+
+  /* Wraparound of readIndex */
+  if (readIndex < 0)
+  {
+    readIndex += (int32_t) delaySize;
+  }
+
+  /* Working pointer for state buffer is updated */
+  py = pState;
+
+  /* blockSize samples are read from the state buffer */
+  arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
+                      (int32_t) blockSize, 1, blockSize);
+
+  /* Working pointer for the scratch buffer of state values */
+  px = pb;
+
+  /* Working pointer for scratch buffer of output values */
+  pScratchOut = pScr2;
+
+  /* Loop over the blockSize */
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    /* Perform multiplication and store in the scratch buffer */
+    *pScratchOut++ = ((q31_t) * px++ * coeff);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Loop over the number of taps. */
+  tapCnt = (uint32_t) numTaps - 1U;
+
+  while (tapCnt > 0U)
+  {
+    /* Load the coefficient value and           
+     * increment the coefficient buffer for the next set of state values */
+    coeff = *pCoeffs++;
+
+    /* Read Index, from where the state buffer should be read, is calculated. */
+    readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
+
+    /* Wraparound of readIndex */
+    if (readIndex < 0)
+    {
+      readIndex += (int32_t) delaySize;
+    }
+
+    /* Working pointer for state buffer is updated */
+    py = pState;
+
+    /* blockSize samples are read from the state buffer */
+    arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
+                        (int32_t) blockSize, 1, blockSize);
+
+    /* Working pointer for the scratch buffer of state values */
+    px = pb;
+
+    /* Working pointer for scratch buffer of output values */
+    pScratchOut = pScr2;
+
+    /* Loop over the blockSize */
+    blkCnt = blockSize;
+
+    while (blkCnt > 0U)
+    {
+      /* Perform Multiply-Accumulate */
+      in = *pScratchOut + ((q31_t) * px++ * coeff);
+      *pScratchOut++ = in;
+
+      /* Decrement the loop counter */
+      blkCnt--;
+    }
+
+    /* Decrement the tap loop counter */
+    tapCnt--;
+  }
+
+  /* All the output values are in pScratchOut buffer.       
+     Convert them into 1.15 format, saturate and store in the destination buffer. */
+  /* Loop over the blockSize. */
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8);
+
+    /* Decrement the blockSize loop counter */
+    blkCnt--;
+  }
+}
diff --git a/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/iir_lattice.c b/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/iir_lattice.c
new file mode 100644
index 0000000..ab37d5f
--- /dev/null
+++ b/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/iir_lattice.c
@@ -0,0 +1,271 @@
+#include "ref.h"
+
+void ref_iir_lattice_f32(
+  const arm_iir_lattice_instance_f32 * S,
+  float32_t * pSrc,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+  float32_t fcurr, fnext = 0, gcurr, gnext;      /* Temporary variables for lattice stages */
+  float32_t acc;                                 /* Accumlator */
+  uint32_t blkCnt, tapCnt;                       /* temporary variables for counts */
+  float32_t *px1, *px2, *pk, *pv;                /* temporary pointers for state and coef */
+  uint32_t numStages = S->numStages;             /* number of stages */
+  float32_t *pState;                             /* State pointer */
+  float32_t *pStateCurnt;                        /* State current pointer */
+
+  blkCnt = blockSize;
+  pState = &S->pState[0];
+
+  /* Sample processing */
+  while (blkCnt > 0U)
+  {
+    /* Read Sample from input buffer */
+    /* fN(n) = x(n) */
+    fcurr = *pSrc++;
+
+    /* Initialize state read pointer */
+    px1 = pState;
+    /* Initialize state write pointer */
+    px2 = pState;
+    /* Set accumulator to zero */
+    acc = 0.0f;
+    /* Initialize Ladder coeff pointer */
+    pv = &S->pvCoeffs[0];
+    /* Initialize Reflection coeff pointer */
+    pk = &S->pkCoeffs[0];
+
+    /* Process sample for numStages */
+    tapCnt = numStages;
+
+    while (tapCnt > 0U)
+    {
+      gcurr = *px1++;
+      /* Process sample for last taps */
+      fnext = fcurr - (*pk) * gcurr;
+      gnext = fnext * (*pk++) + gcurr;
+
+      /* Output samples for last taps */
+      acc += gnext * (*pv++);
+      *px2++ = gnext;
+      fcurr = fnext;
+
+      /* Decrementing loop counter */
+      tapCnt--;
+    }
+
+    /* y(n) += g0(n) * v0 */
+    acc += fnext * (*pv);
+
+    *px2++ = fnext;
+
+    /* write out into pDst */
+    *pDst++ = acc;
+
+    /* Advance the state pointer by 1 to process the next group of samples */
+    pState = pState + 1U;
+    blkCnt--;
+  }
+
+  /* Processing is complete. Now copy last S->numStages samples to start of the buffer           
+     for the preperation of next frame process */
+
+  /* Points to the start of the state buffer */
+  pStateCurnt = &S->pState[0];
+  pState = &S->pState[blockSize];
+
+  tapCnt = numStages;
+
+  /* Copy the data */
+  while (tapCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement the loop counter */
+    tapCnt--;
+  }
+}
+
+void ref_iir_lattice_q31(
+  const arm_iir_lattice_instance_q31 * S,
+  q31_t * pSrc,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+  q31_t fcurr, fnext = 0, gcurr = 0, gnext;      /* Temporary variables for lattice stages */
+  q63_t acc;                                     /* Accumlator */
+  uint32_t blkCnt, tapCnt;                       /* Temporary variables for counts */
+  q31_t *px1, *px2, *pk, *pv;                    /* Temporary pointers for state and coef */
+  uint32_t numStages = S->numStages;             /* number of stages */
+  q31_t *pState;                                 /* State pointer */
+  q31_t *pStateCurnt;                            /* State current pointer */
+
+  blkCnt = blockSize;
+  pState = &S->pState[0];
+
+  /* Sample processing */
+  while (blkCnt > 0U)
+  {
+    /* Read Sample from input buffer */
+    /* fN(n) = x(n) */
+    fcurr = *pSrc++;
+
+    /* Initialize state read pointer */
+    px1 = pState;
+    /* Initialize state write pointer */
+    px2 = pState;
+    /* Set accumulator to zero */
+    acc = 0;
+    /* Initialize Ladder coeff pointer */
+    pv = &S->pvCoeffs[0];
+    /* Initialize Reflection coeff pointer */
+    pk = &S->pkCoeffs[0];
+
+    tapCnt = numStages;
+
+    while (tapCnt > 0U)
+    {
+      gcurr = *px1++;
+      /* Process sample */
+      /* fN-1(n) = fN(n) - kN * gN-1(n-1) */
+      fnext =
+        ref_sat_q31(((q63_t) fcurr -
+                         ((q31_t) (((q63_t) gcurr * (*pk)) >> 31))));
+      /* gN(n) = kN * fN-1(n) + gN-1(n-1) */
+      gnext =
+        ref_sat_q31(((q63_t) gcurr +
+                         ((q31_t) (((q63_t) fnext * (*pk++)) >> 31))));
+      /* Output samples */
+      /* y(n) += gN(n) * vN  */
+      acc += ((q63_t) gnext * *pv++);
+      /* write gN-1(n-1) into state for next sample processing */
+      *px2++ = gnext;
+      /* Update f values for next coefficient processing */
+      fcurr = fnext;
+
+      tapCnt--;
+    }
+
+    /* y(n) += g0(n) * v0 */
+    acc += (q63_t) fnext *(*pv++);
+
+    *px2++ = fnext;
+
+    /* write out into pDst */
+    *pDst++ = (q31_t) (acc >> 31U);
+
+    /* Advance the state pointer by 1 to process the next group of samples */
+    pState = pState + 1U;
+    blkCnt--;
+  }
+
+  /* Processing is complete. Now copy last S->numStages samples to start of the buffer           
+     for the preperation of next frame process */
+
+  /* Points to the start of the state buffer */
+  pStateCurnt = &S->pState[0];
+  pState = &S->pState[blockSize];
+
+  tapCnt = numStages;
+
+  /* Copy the remaining q31_t data */
+  while (tapCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement the loop counter */
+    tapCnt--;
+  }
+}
+
+void ref_iir_lattice_q15(
+  const arm_iir_lattice_instance_q15 * S,
+  q15_t * pSrc,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+  q31_t fcurr, fnext = 0, gcurr = 0, gnext;      /* Temporary variables for lattice stages */
+  uint32_t stgCnt;                               /* Temporary variables for counts */
+  q63_t acc;                                     /* Accumlator */
+  uint32_t blkCnt, tapCnt;                       /* Temporary variables for counts */
+  q15_t *px1, *px2, *pk, *pv;                    /* temporary pointers for state and coef */
+  uint32_t numStages = S->numStages;             /* number of stages */
+  q15_t *pState;                                 /* State pointer */
+  q15_t *pStateCurnt;                            /* State current pointer */
+  q15_t out;                                     /* Temporary variable for output */
+
+  blkCnt = blockSize;
+  pState = &S->pState[0];
+
+  /* Sample processing */
+  while (blkCnt > 0U)
+  {
+    /* Read Sample from input buffer */
+    /* fN(n) = x(n) */
+    fcurr = *pSrc++;
+
+    /* Initialize state read pointer */
+    px1 = pState;
+    /* Initialize state write pointer */
+    px2 = pState;
+    /* Set accumulator to zero */
+    acc = 0;
+    /* Initialize Ladder coeff pointer */
+    pv = &S->pvCoeffs[0];
+    /* Initialize Reflection coeff pointer */
+    pk = &S->pkCoeffs[0];
+
+    tapCnt = numStages;
+
+    while (tapCnt > 0U)
+    {
+      gcurr = *px1++;
+      /* Process sample */
+      /* fN-1(n) = fN(n) - kN * gN-1(n-1) */
+      fnext = fcurr - ((gcurr * (*pk)) >> 15);
+      fnext = ref_sat_q15(fnext);
+      /* gN(n) = kN * fN-1(n) + gN-1(n-1) */
+      gnext = ((fnext * (*pk++)) >> 15) + gcurr;
+      gnext = ref_sat_q15(gnext);
+      /* Output samples */
+      /* y(n) += gN(n) * vN */
+      acc += (q31_t) ((gnext * (*pv++)));
+      /* write gN(n) into state for next sample processing */
+      *px2++ = (q15_t) gnext;
+      /* Update f values for next coefficient processing */
+      fcurr = fnext;
+
+      tapCnt--;
+    }
+
+    /* y(n) += g0(n) * v0 */
+    acc += (q31_t) ((fnext * (*pv++)));
+
+    out = ref_sat_q15(acc >> 15);
+    *px2++ = (q15_t) fnext;
+
+    /* write out into pDst */
+    *pDst++ = out;
+
+    /* Advance the state pointer by 1 to process the next group of samples */
+    pState = pState + 1U;
+    blkCnt--;
+  }
+
+  /* Processing is complete. Now copy last S->numStages samples to start of the buffer           
+     for the preperation of next frame process */
+  /* Points to the start of the state buffer */
+  pStateCurnt = &S->pState[0];
+  pState = &S->pState[blockSize];
+
+  stgCnt = numStages;
+
+  /* copy data */
+  while (stgCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement the loop counter */
+    stgCnt--;
+  }
+}
diff --git a/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/lms.c b/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/lms.c
new file mode 100644
index 0000000..fee99f9
--- /dev/null
+++ b/fw/hid-dials/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/lms.c
@@ -0,0 +1,695 @@
+#include "ref.h"
+
+void ref_lms_f32(
+  const arm_lms_instance_f32 * S,
+  float32_t * pSrc,
+  float32_t * pRef,
+  float32_t * pOut,
+  float32_t * pErr,
+  uint32_t blockSize)
+{
+  float32_t *pState = S->pState;                 /* State pointer */
+  float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+  float32_t *pStateCurnt;                        /* Points to the current sample of the state */
+  float32_t mu = S->mu;                          /* Adaptive factor */
+  uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
+  uint32_t i, blkCnt;                     			 /* Loop counters */
+  float32_t sum, e, d;                           /* accumulator, error, reference data sample */
+  float32_t w = 0.0f;                            /* weight factor */
+
+  e = 0.0f;
+  d = 0.0f;
+
+  /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+  /* pStateCurnt points to the location where the new input data should be written */
+  pStateCurnt = &(S->pState[numTaps - 1U]);
+
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    /* Copy the new input sample into the state buffer */
+    *pStateCurnt++ = *pSrc++;
+
+    /* Set the accumulator to zero */
+    sum = 0.0f;
+		
+		for(i=0;i<numTaps;i++)
+		{ /* Perform the multiply-accumulate */
+      sum += pState[i] * pCoeffs[i];
+		}
+
+    /* The result is stored in the destination buffer. */
+    *pOut++ = sum;
+
+    /* Compute and store error */
+    d = *pRef++;
+    e = d - sum;
+    *pErr++ = e;
+
+    /* Weighting factor for the LMS version */
+    w = e * mu;
+		
+		for(i=0;i<numTaps;i++)
+		{ /* Perform the multiply-accumulate */
+      pCoeffs[i] += w * pState[i];
+		}
+
+    /* Advance state pointer by 1 for the next sample */
+    pState++;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Processing is complete. Now copy the last numTaps - 1 samples to the        
+   * start of the state buffer. This prepares the state buffer for the        
+   * next function call. */
+	for(i=0;i<numTaps-1;i++)
+  {
+    S->pState[i] = pState[i];
+  }
+}
+
+void ref_lms_norm_f32(
+  arm_lms_norm_instance_f32 * S,
+  float32_t * pSrc,
+  float32_t * pRef,
+  float32_t * pOut,
+  float32_t * pErr,
+  uint32_t blockSize)
+{
+  float32_t *pState = S->pState;                 /* State pointer */
+  float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+  float32_t *pStateCurnt;                        /* Points to the current sample of the state */
+  float32_t mu = S->mu;                          /* Adaptive factor */
+  uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
+  uint32_t i, blkCnt;                    				 /* Loop counters */
+  float32_t energy;                              /* Energy of the input */
+  float32_t sum, e, d;                           /* accumulator, error, reference data sample */
+  float32_t w, x0, in;                           /* weight factor, temporary variable to hold input sample and state */
+
+  /* Initializations of error,  difference, Coefficient update */
+  e = 0.0f;
+  d = 0.0f;
+  w = 0.0f;
+
+  energy = S->energy;
+  x0 = S->x0;
+
+  /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
+  /* pStateCurnt points to the location where the new input data should be written */
+  pStateCurnt = &(S->pState[numTaps - 1U]);
+
+  for(blkCnt = blockSize; blkCnt > 0U; blkCnt--)
+  {
+    /* Copy the new input sample into the state buffer */
+    *pStateCurnt++ = *pSrc;
+
+    /* Read the sample from input buffer */
+    in = *pSrc++;
+
+    /* Update the energy calculation */
+    energy -= x0 * x0;
+    energy += in * in;
+
+    /* Set the accumulator to zero */
+    sum = 0.0f;
+		
+		for(i=0;i<numTaps;i++)
+		{ /* Perform the multiply-accumulate */
+      sum += pState[i] * pCoeffs[i];
+		}
+
+    /* The result in the accumulator is stored in the destination buffer. */
+    *pOut++ = sum;
+
+    /* Compute and store error */
+    d = *pRef++;
+    e = d - sum;
+    *pErr++ = e;
+
+    /* Calculation of Weighting factor for updating filter coefficients */
+    /* epsilon value 0.000000119209289f */
+    w = e * mu / (energy + 0.000000119209289f);
+
+		for(i=0;i<numTaps;i++)
+    {
+      /* Perform the multiply-accumulate */
+      pCoeffs[i] += w * pState[i];
+    }
+
+    x0 = *pState;
+
+    /* Advance state pointer by 1 for the next sample */
+    pState++;
+  }
+
+  S->energy = energy;
+  S->x0 = x0;
+
+  /* Processing is complete. Now copy the last numTaps - 1 samples to the        
+   * start of the state buffer. This prepares the state buffer for the        
+   * next function call. */
+	for(i=0;i<numTaps-1;i++)
+  {
+    S->pState[i] = pState[i];
+  }
+}
+
+void ref_lms_q31(
+  const arm_lms_instance_q31 * S,
+  q31_t * pSrc,
+  q31_t * pRef,
+  q31_t * pOut,
+  q31_t * pErr,
+  uint32_t blockSize)
+{
+  q31_t *pState = S->pState;                     /* State pointer */
+  uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
+  q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+  q31_t *pStateCurnt;                            /* Points to the current sample of the state */
+  q31_t mu = S->mu;                              /* Adaptive factor */
+  q31_t *px;                                     /* Temporary pointer for state */
+  q31_t *pb;                                     /* Temporary pointer for coefficient buffer */
+  uint32_t tapCnt, blkCnt;                       /* Loop counters */
+  q63_t acc;                                     /* Accumulator */
+  q31_t e = 0;                                   /* error of data sample */
+  q31_t alpha;                                   /* Intermediate constant for taps update */
+  q31_t coef;                                    /* Temporary variable for coef */
+  q31_t acc_l, acc_h;                            /*  temporary input */
+  uint32_t uShift = (uint32_t)S->postShift + 1;
+  uint32_t lShift = 32U - uShift;                /*  Shift to be applied to the output */
+
+  /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
+  /* pStateCurnt points to the location where the new input data should be written */
+  pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+  for(blkCnt = blockSize; blkCnt > 0U; blkCnt--)
+  {
+    /* Copy the new input sample into the state buffer */
+    *pStateCurnt++ = *pSrc++;
+
+    /* Initialize pState pointer */
+    px = pState;
+
+    /* Initialize pCoeffs pointer */
+    pb = pCoeffs;
+
+    /* Set the accumulator to zero */
+    acc = 0;
+
+    /* Loop over numTaps number of values */
+    tapCnt = numTaps;
+
+    while (tapCnt > 0U)
+    {
+      /* Perform the multiply-accumulate */
+      acc += (q63_t)(*px++) * (*pb++);
+
+      /* Decrement the loop counter */
+      tapCnt--;
+    }
+
+    /* Converting the result to 1.31 format */
+    /* Store the result from accumulator into the destination buffer. */
+    /* Calc lower part of acc */
+    acc_l = acc & 0xffffffff;
+
+    /* Calc upper part of acc */
+    acc_h = (acc >> 32) & 0xffffffff;
+
+    acc = (uint32_t)acc_l >> lShift | acc_h << uShift;
+
+    *pOut++ = (q31_t)acc;
+
+    /* Compute and store error */
+    e = *pRef++ - (q31_t)acc;
+
+    *pErr++ = (q31_t)e;
+
+    /* Weighting factor for the LMS version */
+    alpha = (q31_t)(((q63_t)e * mu) >> 31);
+
+    /* Initialize pState pointer */
+    /* Advance state pointer by 1 for the next sample */
+    px = pState++;
+
+    /* Initialize pCoeffs pointer */
+    pb = pCoeffs;
+
+    /* Loop over numTaps number of values */
+    tapCnt = numTaps;
+
+    while (tapCnt > 0U)
+    {
+      /* Perform the multiply-accumulate */
+      coef = (q31_t)(((q63_t) alpha * (*px++)) >> 32);
+      *pb = ref_sat_q31((q63_t)*pb + (coef << 1));
+      pb++;
+
+      /* Decrement the loop counter */
+      tapCnt--;
+    }
+  }
+
+  /* Processing is complete. Now copy the last numTaps - 1 samples to the     
+     start of the state buffer. This prepares the state buffer for the   
+     next function call. */
+
+  /* Points to the start of the pState buffer */
+  pStateCurnt = S->pState;
+
+  /*  Copy (numTaps - 1U) samples  */
+  tapCnt = numTaps - 1;
+
+  /* Copy the data */
+  while (tapCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement the loop counter */
+    tapCnt--;
+  }
+}
+
+void ref_lms_norm_q31(
+  arm_lms_norm_instance_q31 * S,
+  q31_t * pSrc,
+  q31_t * pRef,
+  q31_t * pOut,
+  q31_t * pErr,
+  uint32_t blockSize)
+{
+  q31_t *pState = S->pState;                     /* State pointer */
+  q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+  q31_t *pStateCurnt;                            /* Points to the current sample of the state */
+  q31_t *px, *pb;                                /* Temporary pointers for state and coefficient buffers */
+  q31_t mu = S->mu;                              /* Adaptive factor */
+  uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
+  uint32_t tapCnt, blkCnt;                       /* Loop counters */
+  q63_t energy;                                  /* Energy of the input */
+  q63_t acc;                                     /* Accumulator */
+  q31_t e = 0, d = 0;                            /* error, reference data sample */
+  q31_t w = 0, in;                               /* weight factor and state */
+  q31_t x0;                                      /* temporary variable to hold input sample */   
+  q63_t errorXmu;                   				 /* Temporary variables to store error and mu product and reciprocal of energy */
+  q31_t coef;                                    /* Temporary variable for coef */
+  q31_t acc_l, acc_h;                            /*  temporary input */
+  uint32_t uShift = ((uint32_t) S->postShift + 1U);
+  uint32_t lShift = 32U - uShift;                /*  Shift to be applied to the output */
+
+  energy = S->energy;
+  x0 = S->x0;
+
+  /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
+  /* pStateCurnt points to the location where the new input data should be written */
+  pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+  for(blkCnt = blockSize; blkCnt > 0U; blkCnt--)
+  {
+
+    /* Copy the new input sample into the state buffer */
+    *pStateCurnt++ = *pSrc;
+
+    /* Initialize pState pointer */
+    px = pState;
+
+    /* Initialize pCoeffs pointer */
+    pb = pCoeffs;
+
+    /* Read the sample from input buffer */
+    in = *pSrc++;
+
+    /* Update the energy calculation */
+    energy = (q31_t)((((q63_t)energy << 32) - (((q63_t)x0 * x0) << 1)) >> 32) & 0xffffffff;
+    energy = (q31_t)(((((q63_t)in * in) << 1) + ((q63_t)energy << 32)) >> 32) & 0xffffffff;
+
+    /* Set the accumulator to zero */
+    acc = 0;
+
+    /* Loop over numTaps number of values */
+    tapCnt = numTaps;
+
+    while (tapCnt > 0U)
+    {
+      /* Perform the multiply-accumulate */
+      acc += ((q63_t) (*px++)) * (*pb++);
+
+      /* Decrement the loop counter */
+      tapCnt--;
+    }
+
+    /* Converting the result to 1.31 format */
+    /* Calc lower part of acc */
+    acc_l = acc & 0xffffffff;
+
+    /* Calc upper part of acc */
+    acc_h = (acc >> 32) & 0xffffffff;
+
+    acc = (uint32_t)acc_l >> lShift | acc_h << uShift;
+
+    /* Store the result from accumulator into the destination buffer. */
+    *pOut++ = (q31_t)acc;
+
+    /* Compute and store error */
+    d = *pRef++;
+    e = d - (q31_t)acc;
+    *pErr++ = e;
+
+    /* Calculation of product of (e * mu) */
+    errorXmu = (q63_t)e * mu;
+
+    /* Weighting factor for the normalized version */
+    w = ref_sat_q31(errorXmu / (energy + DELTA_Q31));
+
+    /* Initialize pState pointer */
+    px = pState;
+
+    /* Initialize coeff pointer */
+    pb = pCoeffs;
+
+    /* Loop over numTaps number of values */
+    tapCnt = numTaps;
+
+    while (tapCnt > 0U)
+    {
+      /* Perform the multiply-accumulate */
+      /* coef is in 2.30 format */
+      coef = (q31_t)(((q63_t)w * (*px++)) >> 32);
+      /* get coef in 1.31 format by left shifting */
+      *pb = ref_sat_q31((q63_t)*pb + (coef << 1U));
+      /* update coefficient buffer to next coefficient */
+      pb++;
+
+      /* Decrement the loop counter */
+      tapCnt--;
+    }
+
+    /* Read the sample from state buffer */
+    x0 = *pState;
+
+    /* Advance state pointer by 1 for the next sample */
+    pState++;
+  }
+
+  /* Save energy and x0 values for the next frame */
+  S->energy = (q31_t)energy;
+  S->x0 = x0;
+
+  /* Processing is complete. Now copy the last numTaps - 1 samples to the     
+     start of the state buffer. This prepares the state buffer for the        
+     next function call. */
+
+  /* Points to the start of the pState buffer */
+  pStateCurnt = S->pState;
+
+  /* Loop for (numTaps - 1U) samples copy */
+  tapCnt = numTaps - 1;
+
+  /* Copy the remaining q31_t data */
+  while (tapCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement the loop counter */
+    tapCnt--;
+  }
+}
+
+void ref_lms_q15(
+  const arm_lms_instance_q15 * S,
+  q15_t * pSrc,
+  q15_t * pRef,
+  q15_t * pOut,
+  q15_t * pErr,
+  uint32_t blockSize)
+{
+  q15_t *pState = S->pState;                     /* State pointer */
+  uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
+  q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+  q15_t *pStateCurnt;                            /* Points to the current sample of the state */
+  q15_t mu = S->mu;                              /* Adaptive factor */
+  q15_t *px;                                     /* Temporary pointer for state */
+  q15_t *pb;                                     /* Temporary pointer for coefficient buffer */
+  uint32_t tapCnt, blkCnt;                       /* Loop counters */
+  q63_t acc;                                     /* Accumulator */
+  q15_t e = 0;                                   /* error of data sample */
+  q15_t alpha;                                   /* Intermediate constant for taps update */
+  q31_t coef;                                    /* Teporary variable for coefficient */
+  q31_t acc_l, acc_h;
+  int32_t lShift = 15 - (int32_t)S->postShift;   /*  Post shift  */
+  int32_t uShift = 32 - lShift;
+
+  /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
+  /* pStateCurnt points to the location where the new input data should be written */
+  pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+  for(blkCnt = blockSize; blkCnt > 0U; blkCnt--)
+  {
+    /* Copy the new input sample into the state buffer */
+    *pStateCurnt++ = *pSrc++;
+
+    /* Initialize pState pointer */
+    px = pState;
+
+    /* Initialize pCoeffs pointer */
+    pb = pCoeffs;
+
+    /* Set the accumulator to zero */
+    acc = 0;
+
+    /* Loop over numTaps number of values */
+    tapCnt = numTaps;
+
+    while (tapCnt > 0U)
+    {
+      /* Perform the multiply-accumulate */
+      acc += (q63_t)((q31_t)(*px++) * (*pb++));
+
+      /* Decrement the loop counter */
+      tapCnt--;
+    }
+
+    /* Calc lower part of acc */
+    acc_l = acc & 0xffffffff;
+
+    /* Calc upper part of acc */
+    acc_h = (acc >> 32) & 0xffffffff;
+
+    /* Apply shift for lower part of acc and upper part of acc */
+    acc = (uint32_t)acc_l >> lShift | acc_h << uShift;
+
+    /* Converting the result to 1.15 format and saturate the output */
+    acc = ref_sat_q15(acc);
+
+    /* Store the result from accumulator into the destination buffer. */
+    *pOut++ = (q15_t)acc;
+
+    /* Compute and store error */
+    e = *pRef++ - (q15_t)acc;
+
+    *pErr++ = (q15_t)e;
+
+    /* Compute alpha i.e. intermediate constant for taps update */
+    alpha = (q15_t)(((q31_t)e * mu) >> 15);
+
+    /* Initialize pState pointer */
+    /* Advance state pointer by 1 for the next sample */
+    px = pState++;
+
+    /* Initialize pCoeffs pointer */
+    pb = pCoeffs;
+
+    /* Loop over numTaps number of values */
+    tapCnt = numTaps;
+
+    while (tapCnt > 0U)
+    {
+      /* Perform the multiply-accumulate */
+      coef = (q31_t) * pb + (((q31_t) alpha * (*px++)) >> 15);
+      *pb++ = (q15_t) ref_sat_q15(coef);
+
+      /* Decrement the loop counter */
+      tapCnt--;
+    }
+  }
+
+  /* Processing is complete. Now copy the last numTaps - 1 samples to the        
+     start of the state buffer. This prepares the state buffer for the   
+     next function call. */
+
+  /* Points to the start of the pState buffer */
+  pStateCurnt = S->pState;
+
+  /*  Copy (numTaps - 1U) samples  */
+  tapCnt = numTaps - 1;
+
+  /* Copy the data */
+  while (tapCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement the loop counter */
+    tapCnt--;
+  }
+}
+
+void ref_lms_norm_q15(
+  arm_lms_norm_instance_q15 * S,
+  q15_t * pSrc,
+  q15_t * pRef,
+  q15_t * pOut,
+  q15_t * pErr,
+  uint32_t blockSize)
+{
+  q15_t *pState = S->pState;                     /* State pointer */
+  q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+  q15_t *pStateCurnt;                            /* Points to the current sample of the state */
+  q15_t *px, *pb;                                /* Temporary pointers for state and coefficient buffers */
+  q15_t mu = S->mu;                              /* Adaptive factor */
+  uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
+  uint32_t tapCnt, blkCnt;                       /* Loop counters */
+  q31_t energy;                                  /* Energy of the input */
+  q63_t acc;                                     /* Accumulator */
+  q15_t e = 0, d = 0;                            /* error, reference data sample */
+  q15_t w = 0, in;                               /* weight factor and state */
+  q15_t x0;                                      /* temporary variable to hold input sample */
+  q15_t errorXmu, oneByEnergy;                   /* Temporary variables to store error and mu product and reciprocal of energy */
+  //q31_t errorXmu;                   				 /* Temporary variables to store error and mu product and reciprocal of energy */
+  q15_t postShift;                               /* Post shift to be applied to weight after reciprocal calculation */
+  q31_t coef;                                    /* Teporary variable for coefficient */
+  q31_t acc_l, acc_h;
+  int32_t lShift = 15 - (int32_t)S->postShift;  /*  Post shift  */
+  int32_t uShift = 32 - lShift;
+
+  energy = S->energy;
+  x0 = S->x0;
+
+  /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
+  /* pStateCurnt points to the location where the new input data should be written */
+  pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+  for(blkCnt = blockSize; blkCnt > 0U; blkCnt--)
+  {
+    /* Copy the new input sample into the state buffer */
+    *pStateCurnt++ = *pSrc;
+
+    /* Initialize pState pointer */
+    px = pState;
+
+    /* Initialize pCoeffs pointer */
+    pb = pCoeffs;
+
+    /* Read the sample from input buffer */
+    in = *pSrc++;
+
+    /* Update the energy calculation */
+    energy -= (((q31_t)x0 * x0) >> 15) & 0xffff;
+    energy += (((q31_t)in * in) >> 15) & 0xffff;
+
+    /* Set the accumulator to zero */
+    acc = 0;
+
+    /* Loop over numTaps number of values */
+    tapCnt = numTaps;
+
+    while (tapCnt > 0U)
+    {
+      /* Perform the multiply-accumulate */
+      acc += (q31_t)*px++ * (*pb++);
+
+      /* Decrement the loop counter */
+      tapCnt--;
+    }
+
+    /* Calc lower part of acc */
+    acc_l = acc & 0xffffffff;
+
+    /* Calc upper part of acc */
+    acc_h = (acc >> 32) & 0xffffffff;
+
+    /* Apply shift for lower part of acc and upper part of acc */
+    acc = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+    /* Converting the result to 1.15 format and saturate the output */
+    acc = ref_sat_q15(acc);
+
+    /* Store the result from accumulator into the destination buffer. */
+    *pOut++ = (q15_t) acc;
+
+    /* Compute and store error */
+    d = *pRef++;
+    e = d - (q15_t) acc;
+    *pErr++ = e;
+    
+#if 0
+    /* Calculation of e * mu value */
+    errorXmu = (q31_t) e * mu;
+
+    /* Calculation of (e * mu) /energy value */
+    acc = errorXmu / (energy + DELTA_Q15);
+#endif
+
+    /* Calculation of 1/energy */
+    postShift = arm_recip_q15((q15_t) energy + DELTA_Q15,
+                              &oneByEnergy, S->recipTable);
+
+    /* Calculation of e * mu value */
+    errorXmu = (q15_t) (((q31_t) e * mu) >> 15);
+
+    /* Calculation of (e * mu) * (1/energy) value */
+    acc = (((q31_t) errorXmu * oneByEnergy) >> (15 - postShift));
+    
+    /* Weighting factor for the normalized version */
+    w = ref_sat_q15((q31_t)acc);
+
+    /* Initialize pState pointer */
+    px = pState;
+
+    /* Initialize coeff pointer */
+    pb = pCoeffs;
+
+    /* Loop over numTaps number of values */
+    tapCnt = numTaps;
+
+    while (tapCnt > 0U)
+    {
+      /* Perform the multiply-accumulate */
+      coef = *pb + (((q31_t)w * (*px++)) >> 15);
+      *pb++ = ref_sat_q15(coef);
+
+      /* Decrement the loop counter */
+      tapCnt--;
+    }
+
+    /* Read the sample from state buffer */
+    x0 = *pState;
+
+    /* Advance state pointer by 1 for the next sample */
+    pState = pState + 1U;
+  }
+
+  /* Save energy and x0 values for the next frame */
+  S->energy = (q15_t)energy;
+  S->x0 = x0;
+
+  /* Processing is complete. Now copy the last numTaps - 1 samples to the        
+     satrt of the state buffer. This prepares the state buffer for the        
+     next function call. */
+
+  /* Points to the start of the pState buffer */
+  pStateCurnt = S->pState;
+
+  /* copy (numTaps - 1U) data */
+  tapCnt = numTaps - 1;
+
+  /* copy data */
+  while (tapCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement the loop counter */
+    tapCnt--;
+  }
+}