From 96d6da4e252b06dcfdc041e7df23e86161c33007 Mon Sep 17 00:00:00 2001
From: rihab kouki <rihab.kouki@st.com>
Date: Tue, 28 Jul 2020 11:24:49 +0100
Subject: Official ARM version: v5.6.0

---
 DSP/Source/TransformFunctions/arm_rfft_q15.c | 470 ++++++++++++---------------
 1 file changed, 212 insertions(+), 258 deletions(-)

(limited to 'DSP/Source/TransformFunctions/arm_rfft_q15.c')

diff --git a/DSP/Source/TransformFunctions/arm_rfft_q15.c b/DSP/Source/TransformFunctions/arm_rfft_q15.c
index f85cf30..fdc9bab 100644
--- a/DSP/Source/TransformFunctions/arm_rfft_q15.c
+++ b/DSP/Source/TransformFunctions/arm_rfft_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_rfft_q15.c
  * Description:  RFFT & RIFFT Q15 process function
  *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
  *
  * Target Processor: Cortex-M cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -33,173 +33,161 @@
  * -------------------------------------------------------------------- */
 
 void arm_split_rfft_q15(
-    q15_t * pSrc,
-    uint32_t fftLen,
-    q15_t * pATable,
-    q15_t * pBTable,
-    q15_t * pDst,
-    uint32_t modifier);
+        q15_t * pSrc,
+        uint32_t fftLen,
+  const q15_t * pATable,
+  const q15_t * pBTable,
+        q15_t * pDst,
+        uint32_t modifier);
 
 void arm_split_rifft_q15(
-    q15_t * pSrc,
-    uint32_t fftLen,
-    q15_t * pATable,
-    q15_t * pBTable,
-    q15_t * pDst,
-    uint32_t modifier);
+        q15_t * pSrc,
+        uint32_t fftLen,
+  const q15_t * pATable,
+  const q15_t * pBTable,
+        q15_t * pDst,
+        uint32_t modifier);
 
 /**
-* @addtogroup RealFFT
-* @{
-*/
+  @addtogroup RealFFT
+  @{
+ */
 
 /**
-* @brief Processing function for the Q15 RFFT/RIFFT.
-* @param[in]  *S    points to an instance of the Q15 RFFT/RIFFT structure.
-* @param[in]  *pSrc points to the input buffer.
-* @param[out] *pDst points to the output buffer.
-* @return none.
-*
-* \par Input an output formats:
-* \par
-* Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
-* Hence the output format is different for different RFFT sizes.
-* The input and output formats for different RFFT sizes and number of bits to upscale are mentioned in the tables below for RFFT and RIFFT:
-* \par
-* \image html RFFTQ15.gif "Input and Output Formats for Q15 RFFT"
-* \par
-* \image html RIFFTQ15.gif "Input and Output Formats for Q15 RIFFT"
-*/
+  @brief         Processing function for the Q15 RFFT/RIFFT.
+  @param[in]     S     points to an instance of the Q15 RFFT/RIFFT structure
+  @param[in]     pSrc  points to input buffer
+  @param[out]    pDst  points to output buffer
+  @return        none
+
+  @par           Input and output formats
+                   Internally, the input is downscaled by 2 at every stage to avoid saturation inside the CFFT/CIFFT process.
+                   Hence the output format differs between RFFT sizes.
+                   The input and output formats for the different RFFT sizes, and the number of bits to upscale, are listed in the tables below for RFFT and RIFFT:
+  @par
+                   \image html RFFTQ15.gif "Input and Output Formats for Q15 RFFT"
+  @par
+                   \image html RIFFTQ15.gif "Input and Output Formats for Q15 RIFFT"
+ */
 
 void arm_rfft_q15(
-    const arm_rfft_instance_q15 * S,
-    q15_t * pSrc,
-    q15_t * pDst)
+  const arm_rfft_instance_q15 * S,
+        q15_t * pSrc,
+        q15_t * pDst)
 {
-    const arm_cfft_instance_q15 *S_CFFT = S->pCfft;
-    uint32_t i;
-    uint32_t L2 = S->fftLenReal >> 1;
-
-    /* Calculation of RIFFT of input */
-    if (S->ifftFlagR == 1U)
-    {
-        /*  Real IFFT core process */
-        arm_split_rifft_q15(pSrc, L2, S->pTwiddleAReal,
-                            S->pTwiddleBReal, pDst, S->twidCoefRModifier);
-
-        /* Complex IFFT process */
-        arm_cfft_q15(S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR);
+  const arm_cfft_instance_q15 *S_CFFT = S->pCfft;
+        uint32_t L2 = S->fftLenReal >> 1U;
+        uint32_t i;
+
+  /* Calculation of RIFFT of input */
+  if (S->ifftFlagR == 1U)
+  {
+     /*  Real IFFT core process */
+     arm_split_rifft_q15 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
+
+     /* Complex IFFT process */
+     arm_cfft_q15 (S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR);
+
+     for(i = 0; i < S->fftLenReal; i++)
+     {
+        pDst[i] = pDst[i] << 1U;
+     }
+  }
+  else
+  {
+     /* Calculation of RFFT of input */
+
+     /* Complex FFT process */
+     arm_cfft_q15 (S_CFFT, pSrc, S->ifftFlagR, S->bitReverseFlagR);
+
+     /*  Real FFT core process */
+     arm_split_rfft_q15 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
+  }
 
-        for(i=0;i<S->fftLenReal;i++)
-        {
-            pDst[i] = pDst[i] << 1;
-        }
-    }
-    else
-    {
-        /* Calculation of RFFT of input */
-
-        /* Complex FFT process */
-        arm_cfft_q15(S_CFFT, pSrc, S->ifftFlagR, S->bitReverseFlagR);
-
-        /*  Real FFT core process */
-        arm_split_rfft_q15(pSrc, L2, S->pTwiddleAReal,
-                            S->pTwiddleBReal, pDst, S->twidCoefRModifier);
-    }
 }
 
 /**
-* @} end of RealFFT group
-*/
+  @} end of RealFFT group
+ */
 
 /**
-* @brief  Core Real FFT process
-* @param  *pSrc 				points to the input buffer.
-* @param  fftLen  				length of FFT.
-* @param  *pATable 			points to the A twiddle Coef buffer.
-* @param  *pBTable 			points to the B twiddle Coef buffer.
-* @param  *pDst 				points to the output buffer.
-* @param  modifier 	        twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
-* @return none.
-* The function implements a Real FFT
-*/
+  @brief         Core Real FFT process
+  @param[in]     pSrc      points to input buffer
+  @param[in]     fftLen    length of FFT
+  @param[in]     pATable   points to twiddle Coef A buffer
+  @param[in]     pBTable   points to twiddle Coef B buffer
+  @param[out]    pDst      points to output buffer
+  @param[in]     modifier  twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
+  @return        none
+
+  @par
+                   The function implements a Real FFT
+ */
 
 void arm_split_rfft_q15(
-    q15_t * pSrc,
-    uint32_t fftLen,
-    q15_t * pATable,
-    q15_t * pBTable,
-    q15_t * pDst,
-    uint32_t modifier)
-{
-    uint32_t i;                                    /* Loop Counter */
-    q31_t outR, outI;                              /* Temporary variables for output */
-    q15_t *pCoefA, *pCoefB;                        /* Temporary pointers for twiddle factors */
-    q15_t *pSrc1, *pSrc2;
+        q15_t * pSrc,
+        uint32_t fftLen,
+  const q15_t * pATable,
+  const q15_t * pBTable,
+        q15_t * pDst,
+        uint32_t modifier)
+{       
+        uint32_t i;                                    /* Loop Counter */
+        q31_t outR, outI;                              /* Temporary variables for output */
+  const q15_t *pCoefA, *pCoefB;                        /* Temporary pointers for twiddle factors */
+        q15_t *pSrc1, *pSrc2;
 #if defined (ARM_MATH_DSP)
-    q15_t *pD1, *pD2;
+        q15_t *pD1, *pD2;
 #endif
 
-    //  pSrc[2U * fftLen] = pSrc[0];
-    //  pSrc[(2U * fftLen) + 1U] = pSrc[1];
+  /* Init coefficient pointers */
+  pCoefA = &pATable[modifier * 2];
+  pCoefB = &pBTable[modifier * 2];
 
-    pCoefA = &pATable[modifier * 2U];
-    pCoefB = &pBTable[modifier * 2U];
-
-    pSrc1 = &pSrc[2];
-    pSrc2 = &pSrc[(2U * fftLen) - 2U];
+  pSrc1 = &pSrc[2];
+  pSrc2 = &pSrc[(2U * fftLen) - 2U];
 
 #if defined (ARM_MATH_DSP)
 
-    /* Run the below code for Cortex-M4 and Cortex-M3 */
     i = 1U;
     pD1 = pDst + 2;
     pD2 = pDst + (4U * fftLen) - 2;
 
-    for(i = fftLen - 1; i > 0; i--)
+    for (i = fftLen - 1; i > 0; i--)
     {
         /*
-        outR = (pSrc[2 * i] * pATable[2 * i] - pSrc[2 * i + 1] * pATable[2 * i + 1]
-        + pSrc[2 * n - 2 * i] * pBTable[2 * i] +
-        pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
-        */
+          outR = (  pSrc[2 * i]             * pATable[2 * i]
+                  - pSrc[2 * i + 1]         * pATable[2 * i + 1]
+                  + pSrc[2 * n - 2 * i]     * pBTable[2 * i]
+                  + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
 
-        /* outI = (pIn[2 * i + 1] * pATable[2 * i] + pIn[2 * i] * pATable[2 * i + 1] +
-        pIn[2 * n - 2 * i] * pBTable[2 * i + 1] -
-        pIn[2 * n - 2 * i + 1] * pBTable[2 * i]); */
+          outI = (  pIn[2 * i + 1]         * pATable[2 * i]
+                  + pIn[2 * i]             * pATable[2 * i + 1]
+                  + pIn[2 * n - 2 * i]     * pBTable[2 * i + 1]
+                  - pIn[2 * n - 2 * i + 1] * pBTable[2 * i])
+         */
 
 
 #ifndef ARM_MATH_BIG_ENDIAN
-
         /* pSrc[2 * i] * pATable[2 * i] - pSrc[2 * i + 1] * pATable[2 * i + 1] */
-        outR = __SMUSD(*__SIMD32(pSrc1), *__SIMD32(pCoefA));
-
+        outR = __SMUSD(read_q15x2 (pSrc1), read_q15x2((q15_t *) pCoefA));
 #else
-
         /* -(pSrc[2 * i + 1] * pATable[2 * i + 1] - pSrc[2 * i] * pATable[2 * i]) */
-        outR = -(__SMUSD(*__SIMD32(pSrc1), *__SIMD32(pCoefA)));
-
-#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
+        outR = -(__SMUSD(read_q15x2 (pSrc1), read_q15x2((q15_t *) pCoefA)));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
 
-        /* pSrc[2 * n - 2 * i] * pBTable[2 * i] +
-        pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]) */
-        outR = __SMLAD(*__SIMD32(pSrc2), *__SIMD32(pCoefB), outR) >> 16U;
-
-        /* pIn[2 * n - 2 * i] * pBTable[2 * i + 1] -
-        pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
+        /* pSrc[2 * n - 2 * i] * pBTable[2 * i] + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1] */
+        outR = __SMLAD(read_q15x2 (pSrc2), read_q15x2((q15_t *) pCoefB), outR) >> 16U;
 
+        /* pIn[2 * n - 2 * i] * pBTable[2 * i + 1] - pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
 #ifndef ARM_MATH_BIG_ENDIAN
-
-        outI = __SMUSDX(*__SIMD32(pSrc2)--, *__SIMD32(pCoefB));
-
+        outI = __SMUSDX(read_q15x2_da (&pSrc2), read_q15x2((q15_t *) pCoefB));
 #else
-
-        outI = __SMUSDX(*__SIMD32(pCoefB), *__SIMD32(pSrc2)--);
-
-#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
+        outI = __SMUSDX(read_q15x2 ((q15_t *) pCoefB), read_q15x2_da (&pSrc2));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
 
         /* (pIn[2 * i + 1] * pATable[2 * i] + pIn[2 * i] * pATable[2 * i + 1] */
-        outI = __SMLADX(*__SIMD32(pSrc1)++, *__SIMD32(pCoefA), outI);
+        outI = __SMLADX(read_q15x2_ia (&pSrc1), read_q15x2 ((q15_t *) pCoefA), outI);
 
         /* write output */
         *pD1++ = (q15_t) outR;
@@ -215,23 +203,23 @@ void arm_split_rfft_q15(
         pCoefA = pCoefA + (2U * modifier);
     }
 
-    pDst[2U * fftLen] = (pSrc[0] - pSrc[1]) >> 1;
-    pDst[(2U * fftLen) + 1U] = 0;
+    pDst[2U * fftLen]      = (pSrc[0] - pSrc[1]) >> 1U;
+    pDst[2U * fftLen + 1U] = 0;
 
-    pDst[0] = (pSrc[0] + pSrc[1]) >> 1;
+    pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;
     pDst[1] = 0;
 
 #else
 
-    /* Run the below code for Cortex-M0 */
     i = 1U;
 
     while (i < fftLen)
     {
         /*
-        outR = (pSrc[2 * i] * pATable[2 * i] - pSrc[2 * i + 1] * pATable[2 * i + 1]
-        + pSrc[2 * n - 2 * i] * pBTable[2 * i] +
-        pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
+          outR = (  pSrc[2 * i]             * pATable[2 * i]
+                  - pSrc[2 * i + 1]         * pATable[2 * i + 1]
+                  + pSrc[2 * n - 2 * i]     * pBTable[2 * i]
+                  + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
         */
 
         outR = *pSrc1 * *pCoefA;
@@ -239,10 +227,11 @@ void arm_split_rfft_q15(
         outR = outR + (*pSrc2 * *pCoefB);
         outR = (outR + (*(pSrc2 + 1) * *(pCoefB + 1))) >> 16;
 
-
-        /* outI = (pIn[2 * i + 1] * pATable[2 * i] + pIn[2 * i] * pATable[2 * i + 1] +
-        pIn[2 * n - 2 * i] * pBTable[2 * i + 1] -
-        pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
+        /*
+          outI = (  pIn[2 * i + 1]         * pATable[2 * i]
+                  + pIn[2 * i]             * pATable[2 * i + 1]
+                  + pIn[2 * n - 2 * i]     * pBTable[2 * i + 1]
+                  - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
         */
 
         outI = *pSrc2 * *(pCoefB + 1);
@@ -256,7 +245,7 @@ void arm_split_rfft_q15(
 
         /* write output */
         pDst[2U * i] = (q15_t) outR;
-        pDst[(2U * i) + 1U] = outI >> 16U;
+        pDst[2U * i + 1U] = outI >> 16U;
 
         /* write complex conjugate output */
         pDst[(4U * fftLen) - (2U * i)] = (q15_t) outR;
@@ -270,7 +259,7 @@ void arm_split_rfft_q15(
     }
 
     pDst[2U * fftLen] = (pSrc[0] - pSrc[1]) >> 1;
-    pDst[(2U * fftLen) + 1U] = 0;
+    pDst[2U * fftLen + 1U] = 0;
 
     pDst[0] = (pSrc[0] + pSrc[1]) >> 1;
     pDst[1] = 0;
@@ -280,147 +269,112 @@ void arm_split_rfft_q15(
 
 
 /**
-* @brief  Core Real IFFT process
-* @param[in]   *pSrc 				points to the input buffer.
-* @param[in]   fftLen  		    length of FFT.
-* @param[in]   *pATable 			points to the twiddle Coef A buffer.
-* @param[in]   *pBTable 			points to the twiddle Coef B buffer.
-* @param[out]  *pDst 				points to the output buffer.
-* @param[in]   modifier 	        twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
-* @return none.
-* The function implements a Real IFFT
-*/
+  @brief         Core Real IFFT process
+  @param[in]     pSrc      points to input buffer
+  @param[in]     fftLen    length of FFT
+  @param[in]     pATable   points to twiddle Coef A buffer
+  @param[in]     pBTable   points to twiddle Coef B buffer
+  @param[out]    pDst      points to output buffer
+  @param[in]     modifier  twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
+  @return        none
+
+  @par
+                   The function implements a Real IFFT
+ */
+
 void arm_split_rifft_q15(
-    q15_t * pSrc,
-    uint32_t fftLen,
-    q15_t * pATable,
-    q15_t * pBTable,
-    q15_t * pDst,
-    uint32_t modifier)
+        q15_t * pSrc,
+        uint32_t fftLen,
+  const q15_t * pATable,
+  const q15_t * pBTable,
+        q15_t * pDst,
+        uint32_t modifier)
 {
-    uint32_t i;                                    /* Loop Counter */
-    q31_t outR, outI;                              /* Temporary variables for output */
-    q15_t *pCoefA, *pCoefB;                        /* Temporary pointers for twiddle factors */
-    q15_t *pSrc1, *pSrc2;
-    q15_t *pDst1 = &pDst[0];
-
-    pCoefA = &pATable[0];
-    pCoefB = &pBTable[0];
-
-    pSrc1 = &pSrc[0];
-    pSrc2 = &pSrc[2U * fftLen];
+        uint32_t i;                                    /* Loop Counter */
+        q31_t outR, outI;                              /* Temporary variables for output */
+  const q15_t *pCoefA, *pCoefB;                        /* Temporary pointers for twiddle factors */
+        q15_t *pSrc1, *pSrc2;
+        q15_t *pDst1 = &pDst[0];
+
+  pCoefA = &pATable[0];
+  pCoefB = &pBTable[0];
+
+  pSrc1 = &pSrc[0];
+  pSrc2 = &pSrc[2 * fftLen];
+
+  i = fftLen;
+  while (i > 0U)
+  {
+      /*
+        outR = (  pIn[2 * i]             * pATable[2 * i]
+                + pIn[2 * i + 1]         * pATable[2 * i + 1]
+                + pIn[2 * n - 2 * i]     * pBTable[2 * i]
+                - pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
+
+        outI = (  pIn[2 * i + 1]         * pATable[2 * i]
+                - pIn[2 * i]             * pATable[2 * i + 1]
+                - pIn[2 * n - 2 * i]     * pBTable[2 * i + 1]
+                - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
+       */
 
 #if defined (ARM_MATH_DSP)
 
-    /* Run the below code for Cortex-M4 and Cortex-M3 */
-    i = fftLen;
-
-    while (i > 0U)
-    {
-        /*
-        outR = (pIn[2 * i] * pATable[2 * i] + pIn[2 * i + 1] * pATable[2 * i + 1] +
-        pIn[2 * n - 2 * i] * pBTable[2 * i] -
-        pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
-
-        outI = (pIn[2 * i + 1] * pATable[2 * i] - pIn[2 * i] * pATable[2 * i + 1] -
-        pIn[2 * n - 2 * i] * pBTable[2 * i + 1] -
-        pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
-        */
-
-
 #ifndef ARM_MATH_BIG_ENDIAN
-
-        /* pIn[2 * n - 2 * i] * pBTable[2 * i] -
-        pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]) */
-        outR = __SMUSD(*__SIMD32(pSrc2), *__SIMD32(pCoefB));
-
+      /* pIn[2 * n - 2 * i] * pBTable[2 * i] - pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1] */
+      outR = __SMUSD(read_q15x2(pSrc2), read_q15x2((q15_t *) pCoefB));
 #else
+      /* -(-pIn[2 * n - 2 * i] * pBTable[2 * i] + pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]) */
+      outR = -(__SMUSD(read_q15x2(pSrc2), read_q15x2((q15_t *) pCoefB)));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
 
-        /* -(-pIn[2 * n - 2 * i] * pBTable[2 * i] +
-        pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1])) */
-        outR = -(__SMUSD(*__SIMD32(pSrc2), *__SIMD32(pCoefB)));
-
-#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
+      /* pIn[2 * i] * pATable[2 * i] + pIn[2 * i + 1] * pATable[2 * i + 1] + pIn[2 * n - 2 * i] * pBTable[2 * i] */
+      outR = __SMLAD(read_q15x2(pSrc1), read_q15x2 ((q15_t *) pCoefA), outR) >> 16U;
 
-        /* pIn[2 * i] * pATable[2 * i] + pIn[2 * i + 1] * pATable[2 * i + 1] +
-        pIn[2 * n - 2 * i] * pBTable[2 * i] */
-        outR = __SMLAD(*__SIMD32(pSrc1), *__SIMD32(pCoefA), outR) >> 16U;
-
-        /*
-        -pIn[2 * n - 2 * i] * pBTable[2 * i + 1] +
-        pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
-        outI = __SMUADX(*__SIMD32(pSrc2)--, *__SIMD32(pCoefB));
-
-        /* pIn[2 * i + 1] * pATable[2 * i] - pIn[2 * i] * pATable[2 * i + 1] */
+      /* pIn[2 * n - 2 * i] * pBTable[2 * i + 1] + pIn[2 * n - 2 * i + 1] * pBTable[2 * i]; negated below by accumulating onto -outI */
+      outI = __SMUADX(read_q15x2_da (&pSrc2), read_q15x2((q15_t *) pCoefB));
 
+      /* pIn[2 * i + 1] * pATable[2 * i] - pIn[2 * i] * pATable[2 * i + 1] */
 #ifndef ARM_MATH_BIG_ENDIAN
-
-        outI = __SMLSDX(*__SIMD32(pCoefA), *__SIMD32(pSrc1)++, -outI);
-
+      outI = __SMLSDX(read_q15x2 ((q15_t *) pCoefA), read_q15x2_ia (&pSrc1), -outI);
 #else
+      outI = __SMLSDX(read_q15x2_ia (&pSrc1), read_q15x2 ((q15_t *) pCoefA), -outI);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
 
-        outI = __SMLSDX(*__SIMD32(pSrc1)++, *__SIMD32(pCoefA), -outI);
-
-#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
-        /* write output */
-
+      /* write output */
 #ifndef ARM_MATH_BIG_ENDIAN
-
-        *__SIMD32(pDst1)++ = __PKHBT(outR, (outI >> 16U), 16);
-
+      write_q15x2_ia (&pDst1, __PKHBT(outR, (outI >> 16U), 16));
 #else
+      write_q15x2_ia (&pDst1, __PKHBT((outI >> 16U), outR, 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
 
-        *__SIMD32(pDst1)++ = __PKHBT((outI >> 16U), outR, 16);
 
-#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
+#else  /* #if defined (ARM_MATH_DSP) */
 
-        /* update coefficient pointer */
-        pCoefB = pCoefB + (2U * modifier);
-        pCoefA = pCoefA + (2U * modifier);
+      outR = *pSrc2 * *pCoefB;
+      outR = outR - (*(pSrc2 + 1) * *(pCoefB + 1));
+      outR = outR + (*pSrc1 * *pCoefA);
+      outR = (outR + (*(pSrc1 + 1) * *(pCoefA + 1))) >> 16;
 
-        i--;
-    }
-#else
-    /* Run the below code for Cortex-M0 */
-    i = fftLen;
+      outI = *(pSrc1 + 1) * *pCoefA;
+      outI = outI - (*pSrc1 * *(pCoefA + 1));
+      outI = outI - (*pSrc2 * *(pCoefB + 1));
+      outI = outI - (*(pSrc2 + 1) * *(pCoefB));
 
-    while (i > 0U)
-    {
-        /*
-        outR = (pIn[2 * i] * pATable[2 * i] + pIn[2 * i + 1] * pATable[2 * i + 1] +
-        pIn[2 * n - 2 * i] * pBTable[2 * i] -
-        pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
-        */
+      /* update input pointers */
+      pSrc1 += 2U;
+      pSrc2 -= 2U;
 
-        outR = *pSrc2 * *pCoefB;
-        outR = outR - (*(pSrc2 + 1) * *(pCoefB + 1));
-        outR = outR + (*pSrc1 * *pCoefA);
-        outR = (outR + (*(pSrc1 + 1) * *(pCoefA + 1))) >> 16;
+      /* write output */
+      *pDst1++ = (q15_t) outR;
+      *pDst1++ = (q15_t) (outI >> 16);
 
-        /*
-        outI = (pIn[2 * i + 1] * pATable[2 * i] - pIn[2 * i] * pATable[2 * i + 1] -
-        pIn[2 * n - 2 * i] * pBTable[2 * i + 1] -
-        pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
-        */
-
-        outI = *(pSrc1 + 1) * *pCoefA;
-        outI = outI - (*pSrc1 * *(pCoefA + 1));
-        outI = outI - (*pSrc2 * *(pCoefB + 1));
-        outI = outI - (*(pSrc2 + 1) * *(pCoefB));
-
-        /* update input pointers */
-        pSrc1 += 2U;
-        pSrc2 -= 2U;
+#endif /* #if defined (ARM_MATH_DSP) */
 
-        /* write output */
-        *pDst1++ = (q15_t) outR;
-        *pDst1++ = (q15_t) (outI >> 16);
+      /* update coefficient pointer */
+      pCoefB = pCoefB + (2 * modifier);
+      pCoefA = pCoefA + (2 * modifier);
 
-        /* update coefficient pointer */
-        pCoefB = pCoefB + (2U * modifier);
-        pCoefA = pCoefA + (2U * modifier);
+      i--;
+  }
 
-        i--;
-    }
-#endif /* #if defined (ARM_MATH_DSP) */
 }
-- 
cgit