summaryrefslogtreecommitdiff
path: root/DSP/Source/StatisticsFunctions
diff options
context:
space:
mode:
authorrihab kouki <rihab.kouki@st.com>2020-07-28 11:24:49 +0100
committerrihab kouki <rihab.kouki@st.com>2020-07-28 11:24:49 +0100
commit96d6da4e252b06dcfdc041e7df23e86161c33007 (patch)
treea262f59bb1db7ec7819acae435f5049cbe5e2354 /DSP/Source/StatisticsFunctions
parent9f95ff5b6ba01db09552b84a0ab79607060a2666 (diff)
downloadst-cmsis-core-lowfat-96d6da4e252b06dcfdc041e7df23e86161c33007.tar.gz
st-cmsis-core-lowfat-96d6da4e252b06dcfdc041e7df23e86161c33007.tar.bz2
st-cmsis-core-lowfat-96d6da4e252b06dcfdc041e7df23e86161c33007.zip
Official ARM version: v5.6.0HEADmaster
Diffstat (limited to 'DSP/Source/StatisticsFunctions')
-rw-r--r--DSP/Source/StatisticsFunctions/CMakeLists.txt16
-rw-r--r--DSP/Source/StatisticsFunctions/StatisticsFunctions.c53
-rw-r--r--DSP/Source/StatisticsFunctions/arm_max_f32.c239
-rw-r--r--DSP/Source/StatisticsFunctions/arm_max_q15.c124
-rw-r--r--DSP/Source/StatisticsFunctions/arm_max_q31.c124
-rw-r--r--DSP/Source/StatisticsFunctions/arm_max_q7.c124
-rw-r--r--DSP/Source/StatisticsFunctions/arm_mean_f32.c133
-rw-r--r--DSP/Source/StatisticsFunctions/arm_mean_q15.c84
-rw-r--r--DSP/Source/StatisticsFunctions/arm_mean_q31.c91
-rw-r--r--DSP/Source/StatisticsFunctions/arm_mean_q7.c80
-rw-r--r--DSP/Source/StatisticsFunctions/arm_min_f32.c236
-rw-r--r--DSP/Source/StatisticsFunctions/arm_min_q15.c124
-rw-r--r--DSP/Source/StatisticsFunctions/arm_min_q31.c124
-rw-r--r--DSP/Source/StatisticsFunctions/arm_min_q7.c124
-rw-r--r--DSP/Source/StatisticsFunctions/arm_power_f32.c132
-rw-r--r--DSP/Source/StatisticsFunctions/arm_power_q15.c128
-rw-r--r--DSP/Source/StatisticsFunctions/arm_power_q31.c92
-rw-r--r--DSP/Source/StatisticsFunctions/arm_power_q7.c115
-rw-r--r--DSP/Source/StatisticsFunctions/arm_rms_f32.c141
-rw-r--r--DSP/Source/StatisticsFunctions/arm_rms_q15.c139
-rw-r--r--DSP/Source/StatisticsFunctions/arm_rms_q31.c121
-rw-r--r--DSP/Source/StatisticsFunctions/arm_std_f32.c176
-rw-r--r--DSP/Source/StatisticsFunctions/arm_std_q15.c181
-rw-r--r--DSP/Source/StatisticsFunctions/arm_std_q31.c148
-rw-r--r--DSP/Source/StatisticsFunctions/arm_var_f32.c323
-rw-r--r--DSP/Source/StatisticsFunctions/arm_var_q15.c182
-rw-r--r--DSP/Source/StatisticsFunctions/arm_var_q31.c146
27 files changed, 1980 insertions, 1720 deletions
diff --git a/DSP/Source/StatisticsFunctions/CMakeLists.txt b/DSP/Source/StatisticsFunctions/CMakeLists.txt
new file mode 100644
index 0000000..3f23355
--- /dev/null
+++ b/DSP/Source/StatisticsFunctions/CMakeLists.txt
@@ -0,0 +1,16 @@
+cmake_minimum_required (VERSION 3.6)
+
+project(CMSISDSPStatistics)
+
+
+file(GLOB SRC "./*_*.c")
+
+add_library(CMSISDSPStatistics STATIC ${SRC})
+
+configdsp(CMSISDSPStatistics ..)
+
+### Includes
+target_include_directories(CMSISDSPStatistics PUBLIC "${DSP}/../../Include")
+
+
+
diff --git a/DSP/Source/StatisticsFunctions/StatisticsFunctions.c b/DSP/Source/StatisticsFunctions/StatisticsFunctions.c
new file mode 100644
index 0000000..4f86aa4
--- /dev/null
+++ b/DSP/Source/StatisticsFunctions/StatisticsFunctions.c
@@ -0,0 +1,53 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: StatisticsFunctions.c
+ * Description: Combination of all statistics function source files.
+ *
+ * $Date: 18. March 2019
+ * $Revision: V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_max_f32.c"
+#include "arm_max_q15.c"
+#include "arm_max_q31.c"
+#include "arm_max_q7.c"
+#include "arm_mean_f32.c"
+#include "arm_mean_q15.c"
+#include "arm_mean_q31.c"
+#include "arm_mean_q7.c"
+#include "arm_min_f32.c"
+#include "arm_min_q15.c"
+#include "arm_min_q31.c"
+#include "arm_min_q7.c"
+#include "arm_power_f32.c"
+#include "arm_power_q15.c"
+#include "arm_power_q31.c"
+#include "arm_power_q7.c"
+#include "arm_rms_f32.c"
+#include "arm_rms_q15.c"
+#include "arm_rms_q31.c"
+#include "arm_std_f32.c"
+#include "arm_std_q15.c"
+#include "arm_std_q31.c"
+#include "arm_var_f32.c"
+#include "arm_var_q15.c"
+#include "arm_var_q31.c"
diff --git a/DSP/Source/StatisticsFunctions/arm_max_f32.c b/DSP/Source/StatisticsFunctions/arm_max_f32.c
index a0a68ac..cd54e2a 100644
--- a/DSP/Source/StatisticsFunctions/arm_max_f32.c
+++ b/DSP/Source/StatisticsFunctions/arm_max_f32.c
@@ -3,13 +3,13 @@
* Title: arm_max_f32.c
* Description: Maximum value of a floating-point vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -27,136 +27,237 @@
*/
#include "arm_math.h"
+#if defined(ARM_MATH_NEON)
+#include <limits.h>
+#endif
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @defgroup Max Maximum
- *
- * Computes the maximum value of an array of data.
- * The function returns both the maximum value and its position within the array.
- * There are separate functions for floating-point, Q31, Q15, and Q7 data types.
+ @defgroup Max Maximum
+
+ Computes the maximum value of an array of data.
+ The function returns both the maximum value and its position within the array.
+ There are separate functions for floating-point, Q31, Q15, and Q7 data types.
*/
/**
- * @addtogroup Max
- * @{
+ @addtogroup Max
+ @{
*/
-
/**
- * @brief Maximum value of a floating-point vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult maximum value returned here
- * @param[out] *pIndex index of maximum value returned here
- * @return none.
+ @brief Maximum value of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult maximum value returned here
+ @param[out] pIndex index of maximum value returned here
+ @return none
*/
-
+#if defined(ARM_MATH_NEON)
void arm_max_f32(
- float32_t * pSrc,
+ const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult,
uint32_t * pIndex)
{
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
float32_t maxVal1, maxVal2, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex, count; /* loop counter */
+ float32x4_t outV, srcV;
+ float32x2_t outV2;
+
+ uint32x4_t idxV;
+ uint32x4_t maxIdx={ULONG_MAX,ULONG_MAX,ULONG_MAX,ULONG_MAX};
+ uint32x4_t index={4,5,6,7};
+ uint32x4_t delta={4,4,4,4};
+ uint32x4_t countV={0,1,2,3};
+ uint32x2_t countV2;
+
/* Initialise the count value. */
count = 0U;
+
/* Initialise the index value to zero. */
outIndex = 0U;
+
+ /* Load first input value that acts as the reference value for comparison */
+ if (blockSize <= 3)
+ {
+ out = *pSrc++;
+
+ blkCnt = blockSize - 1;
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ maxVal1 = *pSrc++;
+
+ /* compare for the maximum value */
+ if (out < maxVal1)
+ {
+ /* Update the maximum value and its index */
+ out = maxVal1;
+ outIndex = blockSize - blkCnt;
+ }
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ outV = vld1q_f32(pSrc);
+ pSrc += 4;
+
+ /* Compute 4 outputs at a time */
+ blkCnt = (blockSize - 4 ) >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ srcV = vld1q_f32(pSrc);
+ pSrc += 4;
+
+ idxV = vcgtq_f32(srcV, outV);
+ outV = vbslq_f32(idxV, srcV, outV );
+ countV = vbslq_u32(idxV, index,countV );
+
+ index = vaddq_u32(index,delta);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ outV2 = vpmax_f32(vget_low_f32(outV),vget_high_f32(outV));
+ outV2 = vpmax_f32(outV2,outV2);
+ out = outV2[0];
+
+ idxV = vceqq_f32(outV, vdupq_n_f32(out));
+ countV = vbslq_u32(idxV, countV,maxIdx);
+
+ countV2 = vpmin_u32(vget_low_u32(countV),vget_high_u32(countV));
+ countV2 = vpmin_u32(countV2,countV2);
+ outIndex = countV2[0];
+
+ /* if (blockSize - 4U) is not multiple of 4 */
+ blkCnt = (blockSize - 4 ) % 4U;
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ maxVal1 = *pSrc++;
+
+ /* compare for the maximum value */
+ if (out < maxVal1)
+ {
+ /* Update the maximum value and its index */
+ out = maxVal1;
+ outIndex = blockSize - blkCnt ;
+ }
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+
+ }
+
+ /* Store the maximum value and its index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+}
+#else
+void arm_max_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult,
+ uint32_t * pIndex)
+{
+ float32_t maxVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ uint32_t index; /* index of maximum value */
+#endif
+
+ /* Initialise index value to zero. */
+ outIndex = 0U;
+
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
- /* Loop unrolling */
+#if defined (ARM_MATH_LOOPUNROLL)
+ /* Initialise index of maximum value. */
+ index = 0U;
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
- /* Initialize maxVal to the next consecutive values one by one */
- maxVal1 = *pSrc++;
- maxVal2 = *pSrc++;
+ /* Initialize maxVal to next consecutive values one by one */
+ maxVal = *pSrc++;
/* compare for the maximum value */
- if (out < maxVal1)
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal1;
- outIndex = count + 1U;
+ /* Update the maximum value and its index */
+ out = maxVal;
+ outIndex = index + 1U;
}
- /* compare for the maximum value */
- if (out < maxVal2)
+ maxVal = *pSrc++;
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal2;
- outIndex = count + 2U;
+ out = maxVal;
+ outIndex = index + 2U;
}
- /* Initialize maxVal to the next consecutive values one by one */
- maxVal1 = *pSrc++;
- maxVal2 = *pSrc++;
-
- /* compare for the maximum value */
- if (out < maxVal1)
+ maxVal = *pSrc++;
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal1;
- outIndex = count + 3U;
+ out = maxVal;
+ outIndex = index + 3U;
}
- /* compare for the maximum value */
- if (out < maxVal2)
+ maxVal = *pSrc++;
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal2;
- outIndex = count + 4U;
+ out = maxVal;
+ outIndex = index + 4U;
}
- count += 4U;
+ index += 4U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* if (blockSize - 1U) is not multiple of 4 */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
- /* Run the below code for Cortex-M0 */
-
- float32_t maxVal1, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex; /* loop counter */
-
- /* Initialise the index value to zero. */
- outIndex = 0U;
- /* Load first input value that act as reference value for comparision */
- out = *pSrc++;
+ /* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
- maxVal1 = *pSrc++;
+ maxVal = *pSrc++;
/* compare for the maximum value */
- if (out < maxVal1)
+ if (out < maxVal)
{
/* Update the maximum value and it's index */
- out = maxVal1;
+ out = maxVal;
outIndex = blockSize - blkCnt;
}
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -164,7 +265,7 @@ void arm_max_f32(
*pResult = out;
*pIndex = outIndex;
}
-
+#endif /* #if defined(ARM_MATH_NEON) */
/**
- * @} end of Max group
+ @} end of Max group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_max_q15.c b/DSP/Source/StatisticsFunctions/arm_max_q15.c
index 67d5e34..329b0c8 100644
--- a/DSP/Source/StatisticsFunctions/arm_max_q15.c
+++ b/DSP/Source/StatisticsFunctions/arm_max_q15.c
@@ -3,13 +3,13 @@
* Title: arm_max_q15.c
* Description: Maximum value of a Q15 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,126 +29,112 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup Max
- * @{
+ @addtogroup Max
+ @{
*/
-
/**
- * @brief Maximum value of a Q15 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult maximum value returned here
- * @param[out] *pIndex index of maximum value returned here
- * @return none.
+ @brief Maximum value of a Q15 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult maximum value returned here
+ @param[out] pIndex index of maximum value returned here
+ @return none
*/
void arm_max_q15(
- q15_t * pSrc,
- uint32_t blockSize,
- q15_t * pResult,
- uint32_t * pIndex)
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult,
+ uint32_t * pIndex)
{
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ q15_t maxVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
- q15_t maxVal1, maxVal2, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex, count; /* loop counter */
+#if defined (ARM_MATH_LOOPUNROLL)
+ uint32_t index; /* index of maximum value */
+#endif
- /* Initialise the count value. */
- count = 0U;
- /* Initialise the index value to zero. */
+ /* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
- /* Loop unrolling */
+#if defined (ARM_MATH_LOOPUNROLL)
+ /* Initialise index of maximum value. */
+ index = 0U;
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
- /* Initialize maxVal to the next consecutive values one by one */
- maxVal1 = *pSrc++;
- maxVal2 = *pSrc++;
+ /* Initialize maxVal to next consecutive values one by one */
+ maxVal = *pSrc++;
/* compare for the maximum value */
- if (out < maxVal1)
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal1;
- outIndex = count + 1U;
+ /* Update the maximum value and its index */
+ out = maxVal;
+ outIndex = index + 1U;
}
- /* compare for the maximum value */
- if (out < maxVal2)
+ maxVal = *pSrc++;
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal2;
- outIndex = count + 2U;
+ out = maxVal;
+ outIndex = index + 2U;
}
- /* Initialize maxVal to the next consecutive values one by one */
- maxVal1 = *pSrc++;
- maxVal2 = *pSrc++;
-
- /* compare for the maximum value */
- if (out < maxVal1)
+ maxVal = *pSrc++;
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal1;
- outIndex = count + 3U;
+ out = maxVal;
+ outIndex = index + 3U;
}
- /* compare for the maximum value */
- if (out < maxVal2)
+ maxVal = *pSrc++;
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal2;
- outIndex = count + 4U;
+ out = maxVal;
+ outIndex = index + 4U;
}
- count += 4U;
+ index += 4U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* if (blockSize - 1U) is not multiple of 4 */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
- /* Run the below code for Cortex-M0 */
-
- q15_t maxVal1, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex; /* loop counter */
-
- /* Initialise the index value to zero. */
- outIndex = 0U;
- /* Load first input value that act as reference value for comparision */
- out = *pSrc++;
+ /* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
- maxVal1 = *pSrc++;
+ maxVal = *pSrc++;
/* compare for the maximum value */
- if (out < maxVal1)
+ if (out < maxVal)
{
/* Update the maximum value and it's index */
- out = maxVal1;
+ out = maxVal;
outIndex = blockSize - blkCnt;
}
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -158,5 +144,5 @@ void arm_max_q15(
}
/**
- * @} end of Max group
+ @} end of Max group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_max_q31.c b/DSP/Source/StatisticsFunctions/arm_max_q31.c
index 5d34bbd..99de13e 100644
--- a/DSP/Source/StatisticsFunctions/arm_max_q31.c
+++ b/DSP/Source/StatisticsFunctions/arm_max_q31.c
@@ -3,13 +3,13 @@
* Title: arm_max_q31.c
* Description: Maximum value of a Q31 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,126 +29,112 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup Max
- * @{
+ @addtogroup Max
+ @{
*/
-
/**
- * @brief Maximum value of a Q31 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult maximum value returned here
- * @param[out] *pIndex index of maximum value returned here
- * @return none.
+ @brief Maximum value of a Q31 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult maximum value returned here
+ @param[out] pIndex index of maximum value returned here
+ @return none
*/
void arm_max_q31(
- q31_t * pSrc,
- uint32_t blockSize,
- q31_t * pResult,
- uint32_t * pIndex)
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult,
+ uint32_t * pIndex)
{
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ q31_t maxVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
- q31_t maxVal1, maxVal2, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex, count; /* loop counter */
+#if defined (ARM_MATH_LOOPUNROLL)
+ uint32_t index; /* index of maximum value */
+#endif
- /* Initialise the count value. */
- count = 0U;
- /* Initialise the index value to zero. */
+ /* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
- /* Loop unrolling */
+#if defined (ARM_MATH_LOOPUNROLL)
+ /* Initialise index of maximum value. */
+ index = 0U;
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
- /* Initialize maxVal to the next consecutive values one by one */
- maxVal1 = *pSrc++;
- maxVal2 = *pSrc++;
+ /* Initialize maxVal to next consecutive values one by one */
+ maxVal = *pSrc++;
/* compare for the maximum value */
- if (out < maxVal1)
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal1;
- outIndex = count + 1U;
+ /* Update the maximum value and its index */
+ out = maxVal;
+ outIndex = index + 1U;
}
- /* compare for the maximum value */
- if (out < maxVal2)
+ maxVal = *pSrc++;
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal2;
- outIndex = count + 2U;
+ out = maxVal;
+ outIndex = index + 2U;
}
- /* Initialize maxVal to the next consecutive values one by one */
- maxVal1 = *pSrc++;
- maxVal2 = *pSrc++;
-
- /* compare for the maximum value */
- if (out < maxVal1)
+ maxVal = *pSrc++;
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal1;
- outIndex = count + 3U;
+ out = maxVal;
+ outIndex = index + 3U;
}
- /* compare for the maximum value */
- if (out < maxVal2)
+ maxVal = *pSrc++;
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal2;
- outIndex = count + 4U;
+ out = maxVal;
+ outIndex = index + 4U;
}
- count += 4U;
+ index += 4U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* if (blockSize - 1U) is not multiple of 4 */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
- /* Run the below code for Cortex-M0 */
-
- q31_t maxVal1, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex; /* loop counter */
-
- /* Initialise the index value to zero. */
- outIndex = 0U;
- /* Load first input value that act as reference value for comparision */
- out = *pSrc++;
+ /* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
- maxVal1 = *pSrc++;
+ maxVal = *pSrc++;
/* compare for the maximum value */
- if (out < maxVal1)
+ if (out < maxVal)
{
/* Update the maximum value and it's index */
- out = maxVal1;
+ out = maxVal;
outIndex = blockSize - blkCnt;
}
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -158,5 +144,5 @@ void arm_max_q31(
}
/**
- * @} end of Max group
+ @} end of Max group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_max_q7.c b/DSP/Source/StatisticsFunctions/arm_max_q7.c
index 72f6e5e..9c8b6d3 100644
--- a/DSP/Source/StatisticsFunctions/arm_max_q7.c
+++ b/DSP/Source/StatisticsFunctions/arm_max_q7.c
@@ -3,13 +3,13 @@
* Title: arm_max_q7.c
* Description: Maximum value of a Q7 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,126 +29,112 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup Max
- * @{
+ @addtogroup Max
+ @{
*/
-
/**
- * @brief Maximum value of a Q7 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult maximum value returned here
- * @param[out] *pIndex index of maximum value returned here
- * @return none.
+ @brief Maximum value of a Q7 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult maximum value returned here
+ @param[out] pIndex index of maximum value returned here
+ @return none
*/
void arm_max_q7(
- q7_t * pSrc,
- uint32_t blockSize,
- q7_t * pResult,
- uint32_t * pIndex)
+ const q7_t * pSrc,
+ uint32_t blockSize,
+ q7_t * pResult,
+ uint32_t * pIndex)
{
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ q7_t maxVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
- q7_t maxVal1, maxVal2, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex, count; /* loop counter */
+#if defined (ARM_MATH_LOOPUNROLL)
+ uint32_t index; /* index of maximum value */
+#endif
- /* Initialise the count value. */
- count = 0U;
- /* Initialise the index value to zero. */
+ /* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
- /* Loop unrolling */
+#if defined (ARM_MATH_LOOPUNROLL)
+ /* Initialise index of maximum value. */
+ index = 0U;
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
- /* Initialize maxVal to the next consecutive values one by one */
- maxVal1 = *pSrc++;
- maxVal2 = *pSrc++;
+ /* Initialize maxVal to next consecutive values one by one */
+ maxVal = *pSrc++;
/* compare for the maximum value */
- if (out < maxVal1)
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal1;
- outIndex = count + 1U;
+ /* Update the maximum value and its index */
+ out = maxVal;
+ outIndex = index + 1U;
}
- /* compare for the maximum value */
- if (out < maxVal2)
+ maxVal = *pSrc++;
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal2;
- outIndex = count + 2U;
+ out = maxVal;
+ outIndex = index + 2U;
}
- /* Initialize maxVal to the next consecutive values one by one */
- maxVal1 = *pSrc++;
- maxVal2 = *pSrc++;
-
- /* compare for the maximum value */
- if (out < maxVal1)
+ maxVal = *pSrc++;
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal1;
- outIndex = count + 3U;
+ out = maxVal;
+ outIndex = index + 3U;
}
- /* compare for the maximum value */
- if (out < maxVal2)
+ maxVal = *pSrc++;
+ if (out < maxVal)
{
- /* Update the maximum value and its index */
- out = maxVal2;
- outIndex = count + 4U;
+ out = maxVal;
+ outIndex = index + 4U;
}
- count += 4U;
+ index += 4U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* if (blockSize - 1U) is not multiple of 4 */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
- /* Run the below code for Cortex-M0 */
-
- q7_t maxVal1, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex; /* loop counter */
-
- /* Initialise the index value to zero. */
- outIndex = 0U;
- /* Load first input value that act as reference value for comparision */
- out = *pSrc++;
+ /* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
- maxVal1 = *pSrc++;
+ maxVal = *pSrc++;
/* compare for the maximum value */
- if (out < maxVal1)
+ if (out < maxVal)
{
/* Update the maximum value and it's index */
- out = maxVal1;
+ out = maxVal;
outIndex = blockSize - blkCnt;
}
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -158,5 +144,5 @@ void arm_max_q7(
}
/**
- * @} end of Max group
+ @} end of Max group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_mean_f32.c b/DSP/Source/StatisticsFunctions/arm_mean_f32.c
index 85a3b16..63d9652 100644
--- a/DSP/Source/StatisticsFunctions/arm_mean_f32.c
+++ b/DSP/Source/StatisticsFunctions/arm_mean_f32.c
@@ -3,13 +3,13 @@
* Title: arm_mean_f32.c
* Description: Mean value of a floating-point vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,97 +29,138 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @defgroup mean Mean
- *
- * Calculates the mean of the input vector. Mean is defined as the average of the elements in the vector.
- * The underlying algorithm is used:
- *
- * <pre>
- * Result = (pSrc[0] + pSrc[1] + pSrc[2] + ... + pSrc[blockSize-1]) / blockSize;
- * </pre>
- *
- * There are separate functions for floating-point, Q31, Q15, and Q7 data types.
+ @defgroup mean Mean
+
+ Calculates the mean of the input vector. Mean is defined as the average of the elements in the vector.
+ The underlying algorithm is used:
+
+ <pre>
+ Result = (pSrc[0] + pSrc[1] + pSrc[2] + ... + pSrc[blockSize-1]) / blockSize;
+ </pre>
+
+ There are separate functions for floating-point, Q31, Q15, and Q7 data types.
*/
/**
- * @addtogroup mean
- * @{
+ @addtogroup mean
+ @{
*/
-
/**
- * @brief Mean value of a floating-point vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult mean value returned here
- * @return none.
+ @brief Mean value of a floating-point vector.
+ @param[in] pSrc points to the input vector.
+ @param[in] blockSize number of samples in input vector.
+ @param[out] pResult mean value returned here.
+ @return none
*/
-
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
void arm_mean_f32(
- float32_t * pSrc,
+ const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
float32_t sum = 0.0f; /* Temporary result storage */
- uint32_t blkCnt; /* loop counter */
+ float32x4_t sumV = vdupq_n_f32(0.0f); /* Temporary result storage */
+ float32x2_t sumV2;
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ uint32_t blkCnt; /* Loop counter */
float32_t in1, in2, in3, in4;
+ float32x4_t inV;
- /*loop Unrolling */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ /* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
- in1 = *pSrc++;
- in2 = *pSrc++;
- in3 = *pSrc++;
- in4 = *pSrc++;
-
- sum += in1;
- sum += in2;
- sum += in3;
- sum += in4;
-
+ inV = vld1q_f32(pSrc);
+ sumV = vaddq_f32(sumV, inV);
+
+ pSrc += 4;
/* Decrement the loop counter */
blkCnt--;
}
+ sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
+ sum = sumV2[0] + sumV2[1];
+
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
+ blkCnt = blockSize & 3;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+ sum += *pSrc++;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
+ /* Store the result to the destination */
+ *pResult = sum / (float32_t) blockSize;
+}
+#else
+void arm_mean_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult)
+{
+ uint32_t blkCnt; /* Loop counter */
+ float32_t sum = 0.0f; /* Temporary result storage */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+ sum += *pSrc++;
+
+ sum += *pSrc++;
+
+ sum += *pSrc++;
+
+ sum += *pSrc++;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
- /* Loop over blockSize number of values */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
- /* Store the result to the destination */
- *pResult = sum / (float32_t) blockSize;
+ /* Store result to destination */
+ *pResult = (sum / blockSize);
}
+#endif /* #if defined(ARM_MATH_NEON) */
/**
- * @} end of mean group
+ @} end of mean group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_mean_q15.c b/DSP/Source/StatisticsFunctions/arm_mean_q15.c
index 7bf55c2..463aa84 100644
--- a/DSP/Source/StatisticsFunctions/arm_mean_q15.c
+++ b/DSP/Source/StatisticsFunctions/arm_mean_q15.c
@@ -3,13 +3,13 @@
* Title: arm_mean_q15.c
* Description: Mean value of a Q15 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,59 +29,55 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup mean
- * @{
+ @addtogroup mean
+ @{
*/
-
/**
- * @brief Mean value of a Q15 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult mean value returned here
- * @return none.
- *
- * @details
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function is implemented using a 32-bit internal accumulator.
- * The input is represented in 1.15 format and is accumulated in a 32-bit
- * accumulator in 17.15 format.
- * There is no risk of internal overflow with this approach, and the
- * full precision of intermediate result is preserved.
- * Finally, the accumulator is saturated and truncated to yield a result of 1.15 format.
- *
+ @brief Mean value of a Q15 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult mean value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 32-bit internal accumulator.
+ The input is represented in 1.15 format and is accumulated in a 32-bit
+ accumulator in 17.15 format.
+ There is no risk of internal overflow with this approach, and the
+ full precision of intermediate result is preserved.
+ Finally, the accumulator is truncated to yield a result of 1.15 format.
*/
void arm_mean_q15(
- q15_t * pSrc,
- uint32_t blockSize,
- q15_t * pResult)
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult)
{
- q31_t sum = 0; /* Temporary result storage */
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
+ q31_t sum = 0; /* Temporary result storage */
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t in;
+#endif
- q31_t in;
+#if defined (ARM_MATH_LOOPUNROLL)
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
- in = *__SIMD32(pSrc)++;
+ in = read_q15x2_ia ((q15_t **) &pSrc);
sum += ((in << 16U) >> 16U);
sum += (in >> 16U);
- in = *__SIMD32(pSrc)++;
+
+ in = read_q15x2_ia ((q15_t **) &pSrc);
sum += ((in << 16U) >> 16U);
sum += (in >> 16U);
@@ -89,32 +85,30 @@ void arm_mean_q15(
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
- /* Loop over blockSize number of values */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
- /* Store the result to the destination */
- *pResult = (q15_t) (sum / (q31_t)blockSize);
+ /* Store result to destination */
+ *pResult = (q15_t) (sum / (int32_t) blockSize);
}
/**
- * @} end of mean group
+ @} end of mean group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_mean_q31.c b/DSP/Source/StatisticsFunctions/arm_mean_q31.c
index ea83ced..4b0ed6e 100644
--- a/DSP/Source/StatisticsFunctions/arm_mean_q31.c
+++ b/DSP/Source/StatisticsFunctions/arm_mean_q31.c
@@ -3,13 +3,13 @@
* Title: arm_mean_q31.c
* Description: Mean value of a Q31 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,95 +29,82 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup mean
- * @{
+ @addtogroup mean
+ @{
*/
-
/**
- * @brief Mean value of a Q31 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult mean value returned here
- * @return none.
- *
- * @details
- * <b>Scaling and Overflow Behavior:</b>
- *\par
- * The function is implemented using a 64-bit internal accumulator.
- * The input is represented in 1.31 format and is accumulated in a 64-bit
- * accumulator in 33.31 format.
- * There is no risk of internal overflow with this approach, and the
- * full precision of intermediate result is preserved.
- * Finally, the accumulator is truncated to yield a result of 1.31 format.
- *
+ @brief Mean value of a Q31 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult mean value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ The input is represented in 1.31 format and is accumulated in a 64-bit
+ accumulator in 33.31 format.
+ There is no risk of internal overflow with this approach, and the
+ full precision of intermediate result is preserved.
+ Finally, the accumulator is truncated to yield a result of 1.31 format.
*/
void arm_mean_q31(
- q31_t * pSrc,
- uint32_t blockSize,
- q31_t * pResult)
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult)
{
- q63_t sum = 0; /* Temporary result storage */
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
+ q63_t sum = 0; /* Temporary result storage */
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+#if defined (ARM_MATH_LOOPUNROLL)
- q31_t in1, in2, in3, in4;
-
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
- in1 = *pSrc++;
- in2 = *pSrc++;
- in3 = *pSrc++;
- in4 = *pSrc++;
+ sum += *pSrc++;
- sum += in1;
- sum += in2;
- sum += in3;
- sum += in4;
+ sum += *pSrc++;
+
+ sum += *pSrc++;
+
+ sum += *pSrc++;
/* Decrement the loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
- /* Loop over blockSize number of values */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
- /* Store the result to the destination */
- *pResult = (q31_t) (sum / (int32_t) blockSize);
+ /* Store result to destination */
+ *pResult = (q31_t) (sum / blockSize);
}
/**
- * @} end of mean group
+ @} end of mean group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_mean_q7.c b/DSP/Source/StatisticsFunctions/arm_mean_q7.c
index a7bdfb8..8f52211 100644
--- a/DSP/Source/StatisticsFunctions/arm_mean_q7.c
+++ b/DSP/Source/StatisticsFunctions/arm_mean_q7.c
@@ -3,13 +3,13 @@
* Title: arm_mean_q7.c
* Description: Mean value of a Q7 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,57 +29,51 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup mean
- * @{
+ @addtogroup mean
+ @{
*/
-
/**
- * @brief Mean value of a Q7 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult mean value returned here
- * @return none.
- *
- * @details
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function is implemented using a 32-bit internal accumulator.
- * The input is represented in 1.7 format and is accumulated in a 32-bit
- * accumulator in 25.7 format.
- * There is no risk of internal overflow with this approach, and the
- * full precision of intermediate result is preserved.
- * Finally, the accumulator is truncated to yield a result of 1.7 format.
- *
+ @brief Mean value of a Q7 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult mean value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 32-bit internal accumulator.
+ The input is represented in 1.7 format and is accumulated in a 32-bit
+ accumulator in 25.7 format.
+ There is no risk of internal overflow with this approach, and the
+ full precision of intermediate result is preserved.
+ Finally, the accumulator is truncated to yield a result of 1.7 format.
*/
void arm_mean_q7(
- q7_t * pSrc,
- uint32_t blockSize,
- q7_t * pResult)
+ const q7_t * pSrc,
+ uint32_t blockSize,
+ q7_t * pResult)
{
- q31_t sum = 0; /* Temporary result storage */
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
+ q31_t sum = 0; /* Temporary result storage */
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t in;
+#endif
- q31_t in;
+#if defined (ARM_MATH_LOOPUNROLL)
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
- in = *__SIMD32(pSrc)++;
-
+ in = read_q7x4_ia ((q7_t **) &pSrc);
sum += ((in << 24U) >> 24U);
sum += ((in << 16U) >> 24U);
sum += ((in << 8U) >> 24U);
@@ -89,32 +83,30 @@ void arm_mean_q7(
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
- /* Loop over blockSize number of values */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
- /* Store the result to the destination */
+ /* Store result to destination */
*pResult = (q7_t) (sum / (int32_t) blockSize);
}
/**
- * @} end of mean group
+ @} end of mean group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_min_f32.c b/DSP/Source/StatisticsFunctions/arm_min_f32.c
index 858b0a2..6e9ff4b 100644
--- a/DSP/Source/StatisticsFunctions/arm_min_f32.c
+++ b/DSP/Source/StatisticsFunctions/arm_min_f32.c
@@ -3,13 +3,13 @@
* Title: arm_min_f32.c
* Description: Minimum value of a floating-point vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -27,136 +27,233 @@
*/
#include "arm_math.h"
+#include <limits.h>
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @defgroup Min Minimum
- *
- * Computes the minimum value of an array of data.
- * The function returns both the minimum value and its position within the array.
- * There are separate functions for floating-point, Q31, Q15, and Q7 data types.
+ @defgroup Min Minimum
+
+ Computes the minimum value of an array of data.
+ The function returns both the minimum value and its position within the array.
+ There are separate functions for floating-point, Q31, Q15, and Q7 data types.
*/
/**
- * @addtogroup Min
- * @{
+ @addtogroup Min
+ @{
*/
-
/**
- * @brief Minimum value of a floating-point vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult minimum value returned here
- * @param[out] *pIndex index of minimum value returned here
- * @return none.
+ @brief Minimum value of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult minimum value returned here
+ @param[out] pIndex index of minimum value returned here
+ @return none
*/
-
+#if defined(ARM_MATH_NEON)
void arm_min_f32(
- float32_t * pSrc,
+ const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult,
uint32_t * pIndex)
{
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- float32_t minVal1, minVal2, out; /* Temporary variables to store the output value. */
+ float32_t maxVal1, maxVal2, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex, count; /* loop counter */
+ float32x4_t outV, srcV;
+ float32x2_t outV2;
+
+ uint32x4_t idxV;
+ uint32x4_t maxIdx={ULONG_MAX,ULONG_MAX,ULONG_MAX,ULONG_MAX};
+ uint32x4_t index={4,5,6,7};
+ uint32x4_t delta={4,4,4,4};
+ uint32x4_t countV={0,1,2,3};
+ uint32x2_t countV2;
+
/* Initialise the count value. */
count = 0U;
+
/* Initialise the index value to zero. */
outIndex = 0U;
+
+ /* Load first input value that act as reference value for comparison */
+ if (blockSize <= 3)
+ {
+ out = *pSrc++;
+
+ blkCnt = blockSize - 1;
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ maxVal1 = *pSrc++;
+
+ /* compare for the minimum value */
+ if (out > maxVal1)
+ {
+ /* Update the minimum value and its index */
+ out = maxVal1;
+ outIndex = blockSize - blkCnt;
+ }
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ outV = vld1q_f32(pSrc);
+ pSrc += 4;
+
+ /* Compute 4 outputs at a time */
+ blkCnt = (blockSize - 4 ) >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ srcV = vld1q_f32(pSrc);
+ pSrc += 4;
+
+ idxV = vcltq_f32(srcV, outV);
+ outV = vbslq_f32(idxV, srcV, outV );
+ countV = vbslq_u32(idxV, index,countV );
+
+ index = vaddq_u32(index,delta);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ outV2 = vpmin_f32(vget_low_f32(outV),vget_high_f32(outV));
+ outV2 = vpmin_f32(outV2,outV2);
+ out = outV2[0];
+
+ idxV = vceqq_f32(outV, vdupq_n_f32(out));
+ countV = vbslq_u32(idxV, countV,maxIdx);
+
+ countV2 = vpmin_u32(vget_low_u32(countV),vget_high_u32(countV));
+ countV2 = vpmin_u32(countV2,countV2);
+ outIndex = countV2[0];
+
+ /* If (blockSize - 4) is not a multiple of 4, process the remaining samples */
+ blkCnt = (blockSize - 4 ) % 4U;
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ maxVal1 = *pSrc++;
+
+ /* compare for the minimum value */
+ if (out > maxVal1)
+ {
+ /* Update the minimum value and its index */
+ out = maxVal1;
+ outIndex = blockSize - blkCnt ;
+ }
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+ /* Store the minimum value and its index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+}
+#else
+void arm_min_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult,
+ uint32_t * pIndex)
+{
+ float32_t minVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ uint32_t index; /* Running sample offset used to compute outIndex in the unrolled loop */
+#endif
+
+ /* Initialise index value to zero. */
+ outIndex = 0U;
+
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
- /* Loop unrolling */
+#if defined (ARM_MATH_LOOPUNROLL)
+ /* Initialise the running sample offset used to compute outIndex. */
+ index = 0U;
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
- /* Initialize minVal to the next consecutive values one by one */
- minVal1 = *pSrc++;
- minVal2 = *pSrc++;
+ /* Initialize minVal to next consecutive values one by one */
+ minVal = *pSrc++;
/* compare for the minimum value */
- if (out > minVal1)
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal1;
- outIndex = count + 1U;
+ /* Update the minimum value and its index */
+ out = minVal;
+ outIndex = index + 1U;
}
- /* compare for the minimum value */
- if (out > minVal2)
+ minVal = *pSrc++;
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal2;
- outIndex = count + 2U;
+ out = minVal;
+ outIndex = index + 2U;
}
- /* Initialize minVal to the next consecutive values one by one */
- minVal1 = *pSrc++;
- minVal2 = *pSrc++;
-
- /* compare for the minimum value */
- if (out > minVal1)
+ minVal = *pSrc++;
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal1;
- outIndex = count + 3U;
+ out = minVal;
+ outIndex = index + 3U;
}
- /* compare for the minimum value */
- if (out > minVal2)
+ minVal = *pSrc++;
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal2;
- outIndex = count + 4U;
+ out = minVal;
+ outIndex = index + 4U;
}
- count += 4U;
+ index += 4U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* if (blockSize - 1U) is not multiple of 4 */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
- /* Run the below code for Cortex-M0 */
-
- float32_t minVal1, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex; /* loop counter */
-
- /* Initialise the index value to zero. */
- outIndex = 0U;
- /* Load first input value that act as reference value for comparision */
- out = *pSrc++;
+ /* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize minVal to the next consecutive values one by one */
- minVal1 = *pSrc++;
+ minVal = *pSrc++;
/* compare for the minimum value */
- if (out > minVal1)
+ if (out > minVal)
{
/* Update the minimum value and it's index */
- out = minVal1;
+ out = minVal;
outIndex = blockSize - blkCnt;
}
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -164,7 +261,8 @@ void arm_min_f32(
*pResult = out;
*pIndex = outIndex;
}
+#endif /* #if defined(ARM_MATH_NEON) */
/**
- * @} end of Min group
+ @} end of Min group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_min_q15.c b/DSP/Source/StatisticsFunctions/arm_min_q15.c
index fdc32b7..9450383 100644
--- a/DSP/Source/StatisticsFunctions/arm_min_q15.c
+++ b/DSP/Source/StatisticsFunctions/arm_min_q15.c
@@ -3,13 +3,13 @@
* Title: arm_min_q15.c
* Description: Minimum value of a Q15 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,127 +29,113 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup Min
- * @{
+ @addtogroup Min
+ @{
*/
-
/**
- * @brief Minimum value of a Q15 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult minimum value returned here
- * @param[out] *pIndex index of minimum value returned here
- * @return none.
+ @brief Minimum value of a Q15 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult minimum value returned here
+ @param[out] pIndex index of minimum value returned here
+ @return none
*/
void arm_min_q15(
- q15_t * pSrc,
- uint32_t blockSize,
- q15_t * pResult,
- uint32_t * pIndex)
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult,
+ uint32_t * pIndex)
{
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ q15_t minVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
- q15_t minVal1, minVal2, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex, count; /* loop counter */
+#if defined (ARM_MATH_LOOPUNROLL)
+ uint32_t index; /* Running sample offset used to compute outIndex in the unrolled loop */
+#endif
- /* Initialise the count value. */
- count = 0U;
- /* Initialise the index value to zero. */
+ /* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
- /* Loop unrolling */
+#if defined (ARM_MATH_LOOPUNROLL)
+ /* Initialise the running sample offset used to compute outIndex. */
+ index = 0U;
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
- /* Initialize minVal to the next consecutive values one by one */
- minVal1 = *pSrc++;
- minVal2 = *pSrc++;
+ /* Initialize minVal to next consecutive values one by one */
+ minVal = *pSrc++;
/* compare for the minimum value */
- if (out > minVal1)
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal1;
- outIndex = count + 1U;
+ /* Update the minimum value and its index */
+ out = minVal;
+ outIndex = index + 1U;
}
- /* compare for the minimum value */
- if (out > minVal2)
+ minVal = *pSrc++;
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal2;
- outIndex = count + 2U;
+ out = minVal;
+ outIndex = index + 2U;
}
- /* Initialize minVal to the next consecutive values one by one */
- minVal1 = *pSrc++;
- minVal2 = *pSrc++;
-
- /* compare for the minimum value */
- if (out > minVal1)
+ minVal = *pSrc++;
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal1;
- outIndex = count + 3U;
+ out = minVal;
+ outIndex = index + 3U;
}
- /* compare for the minimum value */
- if (out > minVal2)
+ minVal = *pSrc++;
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal2;
- outIndex = count + 4U;
+ out = minVal;
+ outIndex = index + 4U;
}
- count += 4U;
+ index += 4U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* if (blockSize - 1U) is not multiple of 4 */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
- /* Run the below code for Cortex-M0 */
-
- q15_t minVal1, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex; /* loop counter */
-
- /* Initialise the index value to zero. */
- outIndex = 0U;
- /* Load first input value that act as reference value for comparision */
- out = *pSrc++;
+ /* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize minVal to the next consecutive values one by one */
- minVal1 = *pSrc++;
+ minVal = *pSrc++;
/* compare for the minimum value */
- if (out > minVal1)
+ if (out > minVal)
{
/* Update the minimum value and it's index */
- out = minVal1;
+ out = minVal;
outIndex = blockSize - blkCnt;
}
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -159,5 +145,5 @@ void arm_min_q15(
}
/**
- * @} end of Min group
+ @} end of Min group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_min_q31.c b/DSP/Source/StatisticsFunctions/arm_min_q31.c
index fc4c155..e25eb47 100644
--- a/DSP/Source/StatisticsFunctions/arm_min_q31.c
+++ b/DSP/Source/StatisticsFunctions/arm_min_q31.c
@@ -3,13 +3,13 @@
* Title: arm_min_q31.c
* Description: Minimum value of a Q31 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,127 +29,113 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup Min
- * @{
+ @addtogroup Min
+ @{
*/
-
/**
- * @brief Minimum value of a Q31 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult minimum value returned here
- * @param[out] *pIndex index of minimum value returned here
- * @return none.
+ @brief Minimum value of a Q31 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult minimum value returned here
+ @param[out] pIndex index of minimum value returned here
+ @return none
*/
void arm_min_q31(
- q31_t * pSrc,
- uint32_t blockSize,
- q31_t * pResult,
- uint32_t * pIndex)
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult,
+ uint32_t * pIndex)
{
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ q31_t minVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
- q31_t minVal1, minVal2, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex, count; /* loop counter */
+#if defined (ARM_MATH_LOOPUNROLL)
+ uint32_t index; /* Running sample offset used to compute outIndex in the unrolled loop */
+#endif
- /* Initialise the count value. */
- count = 0U;
- /* Initialise the index value to zero. */
+ /* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
- /* Loop unrolling */
+#if defined (ARM_MATH_LOOPUNROLL)
+ /* Initialise the running sample offset used to compute outIndex. */
+ index = 0U;
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
- /* Initialize minVal to the next consecutive values one by one */
- minVal1 = *pSrc++;
- minVal2 = *pSrc++;
+ /* Initialize minVal to next consecutive values one by one */
+ minVal = *pSrc++;
/* compare for the minimum value */
- if (out > minVal1)
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal1;
- outIndex = count + 1U;
+ /* Update the minimum value and its index */
+ out = minVal;
+ outIndex = index + 1U;
}
- /* compare for the minimum value */
- if (out > minVal2)
+ minVal = *pSrc++;
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal2;
- outIndex = count + 2U;
+ out = minVal;
+ outIndex = index + 2U;
}
- /* Initialize minVal to the next consecutive values one by one */
- minVal1 = *pSrc++;
- minVal2 = *pSrc++;
-
- /* compare for the minimum value */
- if (out > minVal1)
+ minVal = *pSrc++;
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal1;
- outIndex = count + 3U;
+ out = minVal;
+ outIndex = index + 3U;
}
- /* compare for the minimum value */
- if (out > minVal2)
+ minVal = *pSrc++;
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal2;
- outIndex = count + 4U;
+ out = minVal;
+ outIndex = index + 4U;
}
- count += 4U;
+ index += 4U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* if (blockSize - 1U) is not multiple of 4 */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
- /* Run the below code for Cortex-M0 */
-
- q31_t minVal1, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex; /* loop counter */
-
- /* Initialise the index value to zero. */
- outIndex = 0U;
- /* Load first input value that act as reference value for comparision */
- out = *pSrc++;
+ /* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize minVal to the next consecutive values one by one */
- minVal1 = *pSrc++;
+ minVal = *pSrc++;
/* compare for the minimum value */
- if (out > minVal1)
+ if (out > minVal)
{
/* Update the minimum value and it's index */
- out = minVal1;
+ out = minVal;
outIndex = blockSize - blkCnt;
}
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -159,5 +145,5 @@ void arm_min_q31(
}
/**
- * @} end of Min group
+ @} end of Min group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_min_q7.c b/DSP/Source/StatisticsFunctions/arm_min_q7.c
index 50362e6..2b171f0 100644
--- a/DSP/Source/StatisticsFunctions/arm_min_q7.c
+++ b/DSP/Source/StatisticsFunctions/arm_min_q7.c
@@ -3,13 +3,13 @@
* Title: arm_min_q7.c
* Description: Minimum value of a Q7 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,127 +29,113 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup Min
- * @{
+ @addtogroup Min
+ @{
*/
-
/**
- * @brief Minimum value of a Q7 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult minimum value returned here
- * @param[out] *pIndex index of minimum value returned here
- * @return none.
+ @brief Minimum value of a Q7 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult minimum value returned here
+ @param[out] pIndex index of minimum value returned here
+ @return none
*/
void arm_min_q7(
- q7_t * pSrc,
- uint32_t blockSize,
- q7_t * pResult,
- uint32_t * pIndex)
+ const q7_t * pSrc,
+ uint32_t blockSize,
+ q7_t * pResult,
+ uint32_t * pIndex)
{
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ q7_t minVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
- q7_t minVal1, minVal2, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex, count; /* loop counter */
+#if defined (ARM_MATH_LOOPUNROLL)
+ uint32_t index; /* index of minimum value */
+#endif
- /* Initialise the count value. */
- count = 0U;
- /* Initialise the index value to zero. */
+ /* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that acts as reference value for comparison */
out = *pSrc++;
- /* Loop unrolling */
+#if defined (ARM_MATH_LOOPUNROLL)
+ /* Initialise index of minimum value. */
+ index = 0U;
+
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
- /* Initialize minVal to the next consecutive values one by one */
- minVal1 = *pSrc++;
- minVal2 = *pSrc++;
+ /* Initialize minVal to next consecutive values one by one */
+ minVal = *pSrc++;
/* compare for the minimum value */
- if (out > minVal1)
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal1;
- outIndex = count + 1U;
+ /* Update the minimum value and its index */
+ out = minVal;
+ outIndex = index + 1U;
}
- /* compare for the minimum value */
- if (out > minVal2)
+ minVal = *pSrc++;
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal2;
- outIndex = count + 2U;
+ out = minVal;
+ outIndex = index + 2U;
}
- /* Initialize minVal to the next consecutive values one by one */
- minVal1 = *pSrc++;
- minVal2 = *pSrc++;
-
- /* compare for the minimum value */
- if (out > minVal1)
+ minVal = *pSrc++;
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal1;
- outIndex = count + 3U;
+ out = minVal;
+ outIndex = index + 3U;
}
- /* compare for the minimum value */
- if (out > minVal2)
+ minVal = *pSrc++;
+ if (out > minVal)
{
- /* Update the minimum value and its index */
- out = minVal2;
- outIndex = count + 4U;
+ out = minVal;
+ outIndex = index + 4U;
}
- count += 4U;
+ index += 4U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* if (blockSize - 1U) is not multiple of 4 */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
- /* Run the below code for Cortex-M0 */
-
- q7_t minVal1, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt, outIndex; /* loop counter */
-
- /* Initialise the index value to zero. */
- outIndex = 0U;
- /* Load first input value that act as reference value for comparision */
- out = *pSrc++;
+ /* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize minVal to the next consecutive values one by one */
- minVal1 = *pSrc++;
+ minVal = *pSrc++;
/* compare for the minimum value */
- if (out > minVal1)
+ if (out > minVal)
{
/* Update the minimum value and its index */
- out = minVal1;
+ out = minVal;
outIndex = blockSize - blkCnt;
}
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
@@ -159,5 +145,5 @@ void arm_min_q7(
}
/**
- * @} end of Min group
+ @} end of Min group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_power_f32.c b/DSP/Source/StatisticsFunctions/arm_power_f32.c
index 1426735..a4825a5 100644
--- a/DSP/Source/StatisticsFunctions/arm_power_f32.c
+++ b/DSP/Source/StatisticsFunctions/arm_power_f32.c
@@ -3,13 +3,13 @@
* Title: arm_power_f32.c
* Description: Sum of the squares of the elements of a floating-point vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,40 +29,37 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @defgroup power Power
- *
- * Calculates the sum of the squares of the elements in the input vector.
- * The underlying algorithm is used:
- *
- * <pre>
- * Result = pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + pSrc[2] * pSrc[2] + ... + pSrc[blockSize-1] * pSrc[blockSize-1];
- * </pre>
- *
- * There are separate functions for floating point, Q31, Q15, and Q7 data types.
+ @defgroup power Power
+
+ Calculates the sum of the squares of the elements in the input vector.
+ The underlying algorithm is:
+
+ <pre>
+ Result = pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + pSrc[2] * pSrc[2] + ... + pSrc[blockSize-1] * pSrc[blockSize-1];
+ </pre>
+
+ There are separate functions for floating point, Q31, Q15, and Q7 data types.
*/
/**
- * @addtogroup power
- * @{
+ @addtogroup power
+ @{
*/
-
/**
- * @brief Sum of the squares of the elements of a floating-point vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult sum of the squares value returned here
- * @return none.
- *
+ @brief Sum of the squares of the elements of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult sum of the squares value returned here
+ @return none
*/
-
-
+#if defined(ARM_MATH_NEON)
void arm_power_f32(
- float32_t * pSrc,
+ const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
@@ -70,60 +67,109 @@ void arm_power_f32(
float32_t in; /* Temporary variable to store input value */
uint32_t blkCnt; /* loop counter */
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ float32x4_t sumV = vdupq_n_f32(0.0f); /* Temporary result storage */
+ float32x2_t sumV2;
+ float32x4_t inV;
- /*loop Unrolling */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ /* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and then store the result in a temporary variable, sum. */
+ inV = vld1q_f32(pSrc);
+ sumV = vmlaq_f32(sumV, inV, inV);
+ pSrc += 4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
+ sum = sumV2[0] + sumV2[1];
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* compute power and then store the result in a temporary variable, sum. */
+ in = *pSrc++;
+ sum += in * in;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Store the result to the destination */
+ *pResult = sum;
+}
+#else
+void arm_power_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult)
+{
+ uint32_t blkCnt; /* Loop counter */
+ float32_t sum = 0.0f; /* Temporary result storage */
+ float32_t in; /* Temporary variable to store input value */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ /* Compute Power and store result in a temporary variable, sum. */
in = *pSrc++;
sum += in * in;
+
in = *pSrc++;
sum += in * in;
+
in = *pSrc++;
sum += in * in;
+
in = *pSrc++;
sum += in * in;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
-
#else
- /* Run the below code for Cortex-M0 */
- /* Loop over blockSize number of values */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
-
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
- /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
- /* compute power and then store the result in a temporary variable, sum. */
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ /* Compute Power and store result in a temporary variable, sum. */
in = *pSrc++;
sum += in * in;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* Store the result to the destination */
+ /* Store result to destination */
*pResult = sum;
}
+#endif /* #if defined(ARM_MATH_NEON) */
/**
- * @} end of power group
+ @} end of power group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_power_q15.c b/DSP/Source/StatisticsFunctions/arm_power_q15.c
index 6d95f4d..12f524d 100644
--- a/DSP/Source/StatisticsFunctions/arm_power_q15.c
+++ b/DSP/Source/StatisticsFunctions/arm_power_q15.c
@@ -3,13 +3,13 @@
* Title: arm_power_q15.c
* Description: Sum of the squares of the elements of a Q15 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,110 +29,104 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup power
- * @{
+ @addtogroup power
+ @{
*/
/**
- * @brief Sum of the squares of the elements of a Q15 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult sum of the squares value returned here
- * @return none.
- *
- * @details
- * <b>Scaling and Overflow Behavior:</b>
- *
- * \par
- * The function is implemented using a 64-bit internal accumulator.
- * The input is represented in 1.15 format.
- * Intermediate multiplication yields a 2.30 format, and this
- * result is added without saturation to a 64-bit accumulator in 34.30 format.
- * With 33 guard bits in the accumulator, there is no risk of overflow, and the
- * full precision of the intermediate multiplication is preserved.
- * Finally, the return result is in 34.30 format.
- *
+ @brief Sum of the squares of the elements of a Q15 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult sum of the squares value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ The input is represented in 1.15 format.
+ Intermediate multiplication yields a 2.30 format, and this
+ result is added without saturation to a 64-bit accumulator in 34.30 format.
+ With 33 guard bits in the accumulator, there is no risk of overflow, and the
+ full precision of the intermediate multiplication is preserved.
+ Finally, the return result is in 34.30 format.
*/
void arm_power_q15(
- q15_t * pSrc,
- uint32_t blockSize,
- q63_t * pResult)
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q63_t * pResult)
{
- q63_t sum = 0; /* Temporary result storage */
-
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ uint32_t blkCnt; /* Loop counter */
+ q63_t sum = 0; /* Temporary result storage */
+ q15_t in; /* Temporary variable to store input value */
- q31_t in32; /* Temporary variable to store input value */
- q15_t in16; /* Temporary variable to store input value */
- uint32_t blkCnt; /* loop counter */
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+ q31_t in32; /* Temporary variable to store packed input value */
+#endif
+#if defined (ARM_MATH_LOOPUNROLL)
- /* loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
- /* Compute Power and then store the result in a temporary variable, sum. */
- in32 = *__SIMD32(pSrc)++;
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ /* Compute Power and store result in a temporary variable, sum. */
+#if defined (ARM_MATH_DSP)
+ in32 = read_q15x2_ia ((q15_t **) &pSrc);
sum = __SMLALD(in32, in32, sum);
- in32 = *__SIMD32(pSrc)++;
+
+ in32 = read_q15x2_ia ((q15_t **) &pSrc);
sum = __SMLALD(in32, in32, sum);
+#else
+ in = *pSrc++;
+ sum += ((q31_t) in * in);
- /* Decrement the loop counter */
- blkCnt--;
- }
+ in = *pSrc++;
+ sum += ((q31_t) in * in);
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x4U;
+ in = *pSrc++;
+ sum += ((q31_t) in * in);
- while (blkCnt > 0U)
- {
- /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
- /* Compute Power and then store the result in a temporary variable, sum. */
- in16 = *pSrc++;
- sum = __SMLALD(in16, in16, sum);
+ in = *pSrc++;
+ sum += ((q31_t) in * in);
+#endif /* #if defined (ARM_MATH_DSP) */
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
-#else
- /* Run the below code for Cortex-M0 */
-
- q15_t in; /* Temporary variable to store input value */
- uint32_t blkCnt; /* loop counter */
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+#else
- /* Loop over blockSize number of values */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
- /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
- /* Compute Power and then store the result in a temporary variable, sum. */
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ /* Compute Power and store result in a temporary variable, sum. */
in = *pSrc++;
sum += ((q31_t) in * in);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
-
- /* Store the results in 34.30 format */
+ /* Store result in 34.30 format */
*pResult = sum;
}
/**
- * @} end of power group
+ @} end of power group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_power_q31.c b/DSP/Source/StatisticsFunctions/arm_power_q31.c
index 16be249..1e193b3 100644
--- a/DSP/Source/StatisticsFunctions/arm_power_q31.c
+++ b/DSP/Source/StatisticsFunctions/arm_power_q31.c
@@ -3,13 +3,13 @@
* Title: arm_power_q31.c
* Description: Sum of the squares of the elements of a Q31 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,58 +29,51 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup power
- * @{
+ @addtogroup power
+ @{
*/
/**
- * @brief Sum of the squares of the elements of a Q31 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult sum of the squares value returned here
- * @return none.
- *
- * @details
- * <b>Scaling and Overflow Behavior:</b>
- *
- * \par
- * The function is implemented using a 64-bit internal accumulator.
- * The input is represented in 1.31 format.
- * Intermediate multiplication yields a 2.62 format, and this
- * result is truncated to 2.48 format by discarding the lower 14 bits.
- * The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
- * With 15 guard bits in the accumulator, there is no risk of overflow, and the
- * full precision of the intermediate multiplication is preserved.
- * Finally, the return result is in 16.48 format.
- *
+ @brief Sum of the squares of the elements of a Q31 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult sum of the squares value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ The input is represented in 1.31 format.
+ Intermediate multiplication yields a 2.62 format, and this
+ result is truncated to 2.48 format by discarding the lower 14 bits.
+ The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
+ With 15 guard bits in the accumulator, there is no risk of overflow, but the
+ lower 14 bits of each intermediate multiplication are discarded.
+ Finally, the return result is in 16.48 format.
*/
void arm_power_q31(
- q31_t * pSrc,
- uint32_t blockSize,
- q63_t * pResult)
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q63_t * pResult)
{
- q63_t sum = 0; /* Temporary result storage */
- q31_t in;
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
+ q63_t sum = 0; /* Temporary result storage */
+ q31_t in; /* Temporary variable to store input value */
+#if defined (ARM_MATH_LOOPUNROLL)
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
- /* Compute Power then shift intermediate results by 14 bits to maintain 16.48 format and then store the result in a temporary variable sum, providing 15 guard bits. */
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ /* Compute Power then shift intermediate results by 14 bits to maintain 16.48 format and store result in a temporary variable sum, providing 15 guard bits. */
in = *pSrc++;
sum += ((q63_t) in * in) >> 14U;
@@ -93,37 +86,36 @@ void arm_power_q31(
in = *pSrc++;
sum += ((q63_t) in * in) >> 14U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
- /* Loop over blockSize number of values */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
- /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
- /* Compute Power and then store the result in a temporary variable, sum. */
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ /* Compute Power and store result in a temporary variable, sum. */
in = *pSrc++;
sum += ((q63_t) in * in) >> 14U;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* Store the results in 16.48 format */
+ /* Store results in 16.48 format */
*pResult = sum;
}
/**
- * @} end of power group
+ @} end of power group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_power_q7.c b/DSP/Source/StatisticsFunctions/arm_power_q7.c
index 24306cd..47405cd 100644
--- a/DSP/Source/StatisticsFunctions/arm_power_q7.c
+++ b/DSP/Source/StatisticsFunctions/arm_power_q7.c
@@ -3,13 +3,13 @@
* Title: arm_power_q7.c
* Description: Sum of the squares of the elements of a Q7 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,99 +29,108 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup power
- * @{
+ @addtogroup power
+ @{
*/
/**
- * @brief Sum of the squares of the elements of a Q7 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult sum of the squares value returned here
- * @return none.
- *
- * @details
- * <b>Scaling and Overflow Behavior:</b>
- *
- * \par
- * The function is implemented using a 32-bit internal accumulator.
- * The input is represented in 1.7 format.
- * Intermediate multiplication yields a 2.14 format, and this
- * result is added without saturation to an accumulator in 18.14 format.
- * With 17 guard bits in the accumulator, there is no risk of overflow, and the
- * full precision of the intermediate multiplication is preserved.
- * Finally, the return result is in 18.14 format.
- *
+ @brief Sum of the squares of the elements of a Q7 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult sum of the squares value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 32-bit internal accumulator.
+ The input is represented in 1.7 format.
+ Intermediate multiplication yields a 2.14 format, and this
+ result is added without saturation to an accumulator in 18.14 format.
+ With 17 guard bits in the accumulator, there is no risk of overflow, and the
+ full precision of the intermediate multiplication is preserved.
+ Finally, the return result is in 18.14 format.
*/
void arm_power_q7(
- q7_t * pSrc,
- uint32_t blockSize,
- q31_t * pResult)
+ const q7_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult)
{
- q31_t sum = 0; /* Temporary result storage */
- q7_t in; /* Temporary variable to store input */
- uint32_t blkCnt; /* loop counter */
+ uint32_t blkCnt; /* Loop counter */
+ q31_t sum = 0; /* Temporary result storage */
+ q7_t in; /* Temporary variable to store input value */
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+ q31_t in32; /* Temporary variable to store packed input value */
+ q31_t in1, in2; /* Temporary variables to store input value */
+#endif
- q31_t input1; /* Temporary variable to store packed input */
- q31_t in1, in2; /* Temporary variables to store input */
+#if defined (ARM_MATH_LOOPUNROLL)
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* Reading two inputs of pSrc vector and packing */
- input1 = *__SIMD32(pSrc)++;
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ /* Compute Power and store result in a temporary variable, sum. */
+#if defined (ARM_MATH_DSP)
+ in32 = read_q7x4_ia ((q7_t **) &pSrc);
- in1 = __SXTB16(__ROR(input1, 8));
- in2 = __SXTB16(input1);
+ in1 = __SXTB16(__ROR(in32, 8));
+ in2 = __SXTB16(in32);
- /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
/* calculate power and accumulate to accumulator */
sum = __SMLAD(in1, in1, sum);
sum = __SMLAD(in2, in2, sum);
+#else
+ in = *pSrc++;
+ sum += ((q15_t) in * in);
- /* Decrement the loop counter */
+ in = *pSrc++;
+ sum += ((q15_t) in * in);
+
+ in = *pSrc++;
+ sum += ((q15_t) in * in);
+
+ in = *pSrc++;
+ sum += ((q15_t) in * in);
+#endif /* #if defined (ARM_MATH_DSP) */
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
- /* Loop over blockSize number of values */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
- /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
- /* Compute Power and then store the result in a temporary variable, sum. */
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ /* Compute Power and store result in a temporary variable, sum. */
in = *pSrc++;
sum += ((q15_t) in * in);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* Store the result in 18.14 format */
+ /* Store result in 18.14 format */
*pResult = sum;
}
/**
- * @} end of power group
+ @} end of power group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_rms_f32.c b/DSP/Source/StatisticsFunctions/arm_rms_f32.c
index 8d1b708..4546510 100644
--- a/DSP/Source/StatisticsFunctions/arm_rms_f32.c
+++ b/DSP/Source/StatisticsFunctions/arm_rms_f32.c
@@ -1,15 +1,15 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_rms_f32.c
- * Description: Root mean square value of an array of F32 type
+ * Description: Root mean square value of the elements of a floating-point vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,99 +29,148 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @defgroup RMS Root mean square (RMS)
- *
- *
- * Calculates the Root Mean Sqaure of the elements in the input vector.
- * The underlying algorithm is used:
- *
- * <pre>
- * Result = sqrt(((pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + ... + pSrc[blockSize-1] * pSrc[blockSize-1]) / blockSize));
- * </pre>
- *
- * There are separate functions for floating point, Q31, and Q15 data types.
+ @defgroup RMS Root mean square (RMS)
+
+ Calculates the Root Mean Square of the elements in the input vector.
+ The underlying algorithm is:
+
+ <pre>
+ Result = sqrt(((pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + ... + pSrc[blockSize-1] * pSrc[blockSize-1]) / blockSize));
+ </pre>
+
+ There are separate functions for floating point, Q31, and Q15 data types.
*/
/**
- * @addtogroup RMS
- * @{
+ @addtogroup RMS
+ @{
*/
-
/**
- * @brief Root Mean Square of the elements of a floating-point vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult rms value returned here
- * @return none.
- *
+ @brief Root Mean Square of the elements of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult root mean square value returned here
+ @return none
*/
-
+#if defined(ARM_MATH_NEON)
void arm_rms_f32(
- float32_t * pSrc,
+ const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
- float32_t sum = 0.0f; /* Accumulator */
- float32_t in; /* Tempoprary variable to store input value */
+ float32_t sum = 0.0f; /* accumulator */
+ float32_t in; /* Temporary variable to store input value */
uint32_t blkCnt; /* loop counter */
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ float32x4_t sumV = vdupq_n_f32(0.0f); /* Temporary result storage */
+ float32x2_t sumV2;
+ float32x4_t inV;
- /* loop Unrolling */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ /* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
- /* Compute sum of the squares and then store the result in a temporary variable, sum */
+ /* Compute Power and then store the result in a temporary variable, sum. */
+ inV = vld1q_f32(pSrc);
+ sumV = vmlaq_f32(sumV, inV, inV);
+ pSrc += 4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
+ sum = sumV2[0] + sumV2[1];
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* compute power and then store the result in a temporary variable, sum. */
+ in = *pSrc++;
+ sum += in * in;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Compute Rms and store the result in the destination */
+ arm_sqrt_f32(sum / (float32_t) blockSize, pResult);
+}
+#else
+void arm_rms_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult)
+{
+ uint32_t blkCnt; /* Loop counter */
+ float32_t sum = 0.0f; /* Temporary result storage */
+ float32_t in; /* Temporary variable to store input value */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
in = *pSrc++;
+ /* Compute sum of squares and store result in a temporary variable, sum. */
sum += in * in;
+
in = *pSrc++;
sum += in * in;
+
in = *pSrc++;
sum += in * in;
+
in = *pSrc++;
sum += in * in;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
- /* Loop over blockSize number of values */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
- /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
- /* Compute sum of the squares and then store the results in a temporary variable, sum */
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
in = *pSrc++;
- sum += in * in;
+ /* Compute sum of squares and store result in a temporary variable. */
+ sum += ( in * in);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* Compute Rms and store the result in the destination */
+ /* Compute Rms and store result in destination */
arm_sqrt_f32(sum / (float32_t) blockSize, pResult);
}
+#endif /* #if defined(ARM_MATH_NEON) */
/**
- * @} end of RMS group
+ @} end of RMS group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_rms_q15.c b/DSP/Source/StatisticsFunctions/arm_rms_q15.c
index d0e61ca..9fcd964 100644
--- a/DSP/Source/StatisticsFunctions/arm_rms_q15.c
+++ b/DSP/Source/StatisticsFunctions/arm_rms_q15.c
@@ -3,13 +3,13 @@
* Title: arm_rms_q15.c
* Description: Root Mean Square of the elements of a Q15 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,111 +29,106 @@
#include "arm_math.h"
/**
- * @addtogroup RMS
- * @{
+ @ingroup groupStats
*/
/**
- * @brief Root Mean Square of the elements of a Q15 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult rms value returned here
- * @return none.
- *
- * @details
- * <b>Scaling and Overflow Behavior:</b>
- *
- * \par
- * The function is implemented using a 64-bit internal accumulator.
- * The input is represented in 1.15 format.
- * Intermediate multiplication yields a 2.30 format, and this
- * result is added without saturation to a 64-bit accumulator in 34.30 format.
- * With 33 guard bits in the accumulator, there is no risk of overflow, and the
- * full precision of the intermediate multiplication is preserved.
- * Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
- * 15 bits, and then saturated to yield a result in 1.15 format.
- *
+ @addtogroup RMS
+ @{
+ */
+
+/**
+ @brief Root Mean Square of the elements of a Q15 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult root mean square value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ The input is represented in 1.15 format.
+ Intermediate multiplication yields a 2.30 format, and this
+ result is added without saturation to a 64-bit accumulator in 34.30 format.
+ With 33 guard bits in the accumulator, there is no risk of overflow, and the
+ full precision of the intermediate multiplication is preserved.
+ Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
+ 15 bits, and then saturated to yield a result in 1.15 format.
*/
void arm_rms_q15(
- q15_t * pSrc,
- uint32_t blockSize,
- q15_t * pResult)
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult)
{
- q63_t sum = 0; /* accumulator */
+ uint32_t blkCnt; /* Loop counter */
+ q63_t sum = 0; /* Temporary result storage */
+ q15_t in; /* Temporary variable to store input value */
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+ q31_t in32; /* Temporary variable to store input value */
+#endif
- q31_t in; /* temporary variable to store the input value */
- q15_t in1; /* temporary variable to store the input value */
- uint32_t blkCnt; /* loop counter */
+#if defined (ARM_MATH_LOOPUNROLL)
- /* loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute sum of the squares and then store the results in a temporary variable, sum */
- in = *__SIMD32(pSrc)++;
- sum = __SMLALD(in, in, sum);
- in = *__SIMD32(pSrc)++;
- sum = __SMLALD(in, in, sum);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x4U;
+ /* Compute sum of squares and store result in a temporary variable. */
+#if defined (ARM_MATH_DSP)
+ in32 = read_q15x2_ia ((q15_t **) &pSrc);
+ sum = __SMLALD(in32, in32, sum);
- while (blkCnt > 0U)
- {
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute sum of the squares and then store the results in a temporary variable, sum */
- in1 = *pSrc++;
- sum = __SMLALD(in1, in1, sum);
+ in32 = read_q15x2_ia ((q15_t **) &pSrc);
+ sum = __SMLALD(in32, in32, sum);
+#else
+ in = *pSrc++;
+ sum += ((q31_t) in * in);
- /* Decrement the loop counter */
+ in = *pSrc++;
+ sum += ((q31_t) in * in);
+
+ in = *pSrc++;
+ sum += ((q31_t) in * in);
+
+ in = *pSrc++;
+ sum += ((q31_t) in * in);
+#endif /* #if defined (ARM_MATH_DSP) */
+
+ /* Decrement loop counter */
blkCnt--;
}
- /* Truncating and saturating the accumulator to 1.15 format */
- /* Store the result in the destination */
- arm_sqrt_q15(__SSAT((sum / (q63_t)blockSize) >> 15, 16), pResult);
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
- q15_t in; /* temporary variable to store the input value */
- uint32_t blkCnt; /* loop counter */
-
- /* Loop over blockSize number of values */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute sum of the squares and then store the results in a temporary variable, sum */
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
in = *pSrc++;
+ /* Compute sum of squares and store result in a temporary variable. */
sum += ((q31_t) in * in);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
/* Truncating and saturating the accumulator to 1.15 format */
- /* Store the result in the destination */
+ /* Store result in destination */
arm_sqrt_q15(__SSAT((sum / (q63_t)blockSize) >> 15, 16), pResult);
-
-#endif /* #if defined (ARM_MATH_DSP) */
-
}
/**
- * @} end of RMS group
+ @} end of RMS group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_rms_q31.c b/DSP/Source/StatisticsFunctions/arm_rms_q31.c
index cb3c58e..5a3e8f3 100644
--- a/DSP/Source/StatisticsFunctions/arm_rms_q31.c
+++ b/DSP/Source/StatisticsFunctions/arm_rms_q31.c
@@ -3,13 +3,13 @@
* Title: arm_rms_q31.c
* Description: Root Mean Square of the elements of a Q31 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,109 +29,96 @@
#include "arm_math.h"
/**
- * @addtogroup RMS
- * @{
+ @ingroup groupStats
*/
+/**
+ @addtogroup RMS
+ @{
+ */
/**
- * @brief Root Mean Square of the elements of a Q31 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult rms value returned here
- * @return none.
- *
- * @details
- * <b>Scaling and Overflow Behavior:</b>
- *
- *\par
- * The function is implemented using an internal 64-bit accumulator.
- * The input is represented in 1.31 format, and intermediate multiplication
- * yields a 2.62 format.
- * The accumulator maintains full precision of the intermediate multiplication results,
- * but provides only a single guard bit.
- * There is no saturation on intermediate additions.
- * If the accumulator overflows, it wraps around and distorts the result.
- * In order to avoid overflows completely, the input signal must be scaled down by
- * log2(blockSize) bits, as a total of blockSize additions are performed internally.
- * Finally, the 2.62 accumulator is right shifted by 31 bits to yield a 1.31 format value.
- *
+ @brief Root Mean Square of the elements of a Q31 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult root mean square value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The input is represented in 1.31 format, and intermediate multiplication
+ yields a 2.62 format.
+ The accumulator maintains full precision of the intermediate multiplication results,
+ but provides only a single guard bit.
+ There is no saturation on intermediate additions.
+ If the accumulator overflows, it wraps around and distorts the result.
+ In order to avoid overflows completely, the input signal must be scaled down by
+ log2(blockSize) bits, as a total of blockSize additions are performed internally.
+ Finally, the 2.62 accumulator is right shifted by 31 bits to yield a 1.31 format value.
*/
void arm_rms_q31(
- q31_t * pSrc,
- uint32_t blockSize,
- q31_t * pResult)
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult)
{
- q63_t sum = 0; /* accumulator */
- q31_t in; /* Temporary variable to store the input */
- uint32_t blkCnt; /* loop counter */
-
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ uint32_t blkCnt; /* Loop counter */
+ uint64_t sum = 0; /* Temporary result storage (can never be negative, so the type was changed from q63_t to uint64_t) */
+ q31_t in; /* Temporary variable to store input value */
- q31_t in1, in2, in3, in4; /* Temporary input variables */
+#if defined (ARM_MATH_LOOPUNROLL)
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 8 outputs at a time.
- ** a second loop below computes the remaining 1 to 7 samples. */
while (blkCnt > 0U)
{
- /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
- /* Compute sum of the squares and then store the result in a temporary variable, sum */
- /* read two samples from source buffer */
- in1 = pSrc[0];
- in2 = pSrc[1];
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
- /* calculate power and accumulate to accumulator */
- sum += (q63_t) in1 *in1;
- sum += (q63_t) in2 *in2;
-
- /* read two samples from source buffer */
- in3 = pSrc[2];
- in4 = pSrc[3];
+ in = *pSrc++;
+ /* Compute sum of squares and store result in a temporary variable, sum. */
+ sum += ((q63_t) in * in);
- /* calculate power and accumulate to accumulator */
- sum += (q63_t) in3 *in3;
- sum += (q63_t) in4 *in4;
+ in = *pSrc++;
+ sum += ((q63_t) in * in);
+ in = *pSrc++;
+ sum += ((q63_t) in * in);
- /* update source buffer to process next samples */
- pSrc += 4U;
+ in = *pSrc++;
+ sum += ((q63_t) in * in);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 8, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
- /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
- /* Compute sum of the squares and then store the results in a temporary variable, sum */
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
in = *pSrc++;
- sum += (q63_t) in *in;
+ /* Compute sum of squares and store result in a temporary variable. */
+ sum += ((q63_t) in * in);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
/* Convert data in 2.62 to 1.31 by 31 right shifts and saturate */
- /* Compute Rms and store the result in the destination vector */
+ /* Compute Rms and store result in destination vector */
arm_sqrt_q31(clip_q63_to_q31((sum / (q63_t) blockSize) >> 31), pResult);
}
/**
- * @} end of RMS group
+ @} end of RMS group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_std_f32.c b/DSP/Source/StatisticsFunctions/arm_std_f32.c
index 9750b88..e1e6577 100644
--- a/DSP/Source/StatisticsFunctions/arm_std_f32.c
+++ b/DSP/Source/StatisticsFunctions/arm_std_f32.c
@@ -3,13 +3,13 @@
* Title: arm_std_f32.c
* Description: Standard deviation of the elements of a floating-point vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,111 +29,131 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @defgroup STD Standard deviation
- *
- * Calculates the standard deviation of the elements in the input vector.
- * The underlying algorithm is used:
- *
- * <pre>
- * Result = sqrt((sumOfSquares - sum<sup>2</sup> / blockSize) / (blockSize - 1))
- *
- * where, sumOfSquares = pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + ... + pSrc[blockSize-1] * pSrc[blockSize-1]
- *
- * sum = pSrc[0] + pSrc[1] + pSrc[2] + ... + pSrc[blockSize-1]
- * </pre>
- *
- * There are separate functions for floating point, Q31, and Q15 data types.
+ @defgroup STD Standard deviation
+
+ Calculates the standard deviation of the elements in the input vector.
+ The following algorithm is used:
+
+ <pre>
+ Result = sqrt((sumOfSquares - sum<sup>2</sup> / blockSize) / (blockSize - 1))
+
+ sumOfSquares = pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + ... + pSrc[blockSize-1] * pSrc[blockSize-1]
+ sum = pSrc[0] + pSrc[1] + pSrc[2] + ... + pSrc[blockSize-1]
+ </pre>
+
+ There are separate functions for floating point, Q31, and Q15 data types.
*/
/**
- * @addtogroup STD
- * @{
+ @addtogroup STD
+ @{
*/
-
/**
- * @brief Standard deviation of the elements of a floating-point vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult standard deviation value returned here
- * @return none.
+ @brief Standard deviation of the elements of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult standard deviation value returned here
+ @return none
*/
-
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
void arm_std_f32(
- float32_t * pSrc,
- uint32_t blockSize,
- float32_t * pResult)
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult)
{
- float32_t sum = 0.0f; /* Temporary result storage */
- float32_t sumOfSquares = 0.0f; /* Sum of squares */
- float32_t in; /* input value */
- uint32_t blkCnt; /* loop counter */
-#if defined (ARM_MATH_DSP)
- float32_t meanOfSquares, mean, squareOfMean; /* Temporary variables */
+ float32_t var;
+ arm_var_f32(pSrc,blockSize,&var);
+ arm_sqrt_f32(var, pResult);
+}
#else
- float32_t squareOfSum; /* Square of Sum */
- float32_t var; /* Temporary varaince storage */
+void arm_std_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult)
+{
+ uint32_t blkCnt; /* Loop counter */
+ float32_t sum = 0.0f; /* Temporary result storage */
+ float32_t sumOfSquares = 0.0f; /* Sum of squares */
+ float32_t in; /* Temporary variable to store input value */
+
+#ifndef ARM_MATH_CM0_FAMILY
+ float32_t meanOfSquares, mean, squareOfMean; /* Temporary variables */
+#else
+ float32_t squareOfSum; /* Square of Sum */
+ float32_t var; /* Temporary variance storage */
#endif
- if (blockSize == 1U)
+ if (blockSize <= 1U)
{
*pResult = 0;
return;
}
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+#if defined (ARM_MATH_LOOPUNROLL)
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sum. */
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
+
in = *pSrc++;
- sum += in;
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
sumOfSquares += in * in;
- in = *pSrc++;
+ /* Compute sum and store result in a temporary variable, sum. */
sum += in;
- sumOfSquares += in * in;
+
in = *pSrc++;
- sum += in;
sumOfSquares += in * in;
+ sum += in;
+
in = *pSrc++;
+ sumOfSquares += in * in;
sum += in;
+
+ in = *pSrc++;
sumOfSquares += in * in;
+ sum += in;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
+ /* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sum. */
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
+
in = *pSrc++;
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
+ sumOfSquares += ( in * in);
+ /* Compute sum and store result in a temporary variable, sum. */
sum += in;
- sumOfSquares += in * in;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* Compute Mean of squares of the input samples
- * and then store the result in a temporary variable, meanOfSquares. */
+#ifndef ARM_MATH_CM0_FAMILY
+
+ /* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */
meanOfSquares = sumOfSquares / ((float32_t) blockSize - 1.0f);
/* Compute mean of all input values */
@@ -143,44 +163,26 @@ void arm_std_f32(
squareOfMean = (mean * mean) * (((float32_t) blockSize) /
((float32_t) blockSize - 1.0f));
- /* Compute standard deviation and then store the result to the destination */
+ /* Compute standard deviation and store result to destination */
arm_sqrt_f32((meanOfSquares - squareOfMean), pResult);
#else
/* Run the below code for Cortex-M0 */
- /* Loop over blockSize number of values */
- blkCnt = blockSize;
-
- while (blkCnt > 0U)
- {
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sumOfSquares. */
- in = *pSrc++;
- sumOfSquares += in * in;
-
- /* C = (A[0] + A[1] + ... + A[blockSize-1]) */
- /* Compute Sum of the input samples
- * and then store the result in a temporary variable, sum. */
- sum += in;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* Compute the square of sum */
+ /* Compute square of sum */
squareOfSum = ((sum * sum) / (float32_t) blockSize);
- /* Compute the variance */
+ /* Compute variance */
var = ((sumOfSquares - squareOfSum) / (float32_t) (blockSize - 1.0f));
- /* Compute standard deviation and then store the result to the destination */
+ /* Compute standard deviation and store result in destination */
arm_sqrt_f32(var, pResult);
-#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* #ifndef ARM_MATH_CM0_FAMILY */
+
}
+#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
/**
- * @} end of STD group
+ @} end of STD group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_std_q15.c b/DSP/Source/StatisticsFunctions/arm_std_q15.c
index 2f2f52e..8e5c042 100644
--- a/DSP/Source/StatisticsFunctions/arm_std_q15.c
+++ b/DSP/Source/StatisticsFunctions/arm_std_q15.c
@@ -3,13 +3,13 @@
* Title: arm_std_q15.c
* Description: Standard deviation of an array of Q15 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,146 +29,133 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup STD
- * @{
+ @addtogroup STD
+ @{
*/
/**
- * @brief Standard deviation of the elements of a Q15 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult standard deviation value returned here
- * @return none.
- * @details
- * <b>Scaling and Overflow Behavior:</b>
- *
- * \par
- * The function is implemented using a 64-bit internal accumulator.
- * The input is represented in 1.15 format.
- * Intermediate multiplication yields a 2.30 format, and this
- * result is added without saturation to a 64-bit accumulator in 34.30 format.
- * With 33 guard bits in the accumulator, there is no risk of overflow, and the
- * full precision of the intermediate multiplication is preserved.
- * Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
- * 15 bits, and then saturated to yield a result in 1.15 format.
+ @brief Standard deviation of the elements of a Q15 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult standard deviation value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ The input is represented in 1.15 format.
+ Intermediate multiplication yields a 2.30 format, and this
+ result is added without saturation to a 64-bit accumulator in 34.30 format.
+ With 33 guard bits in the accumulator, there is no risk of overflow, and the
+ full precision of the intermediate multiplication is preserved.
+ Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
+ 15 bits, and then saturated to yield a result in 1.15 format.
*/
void arm_std_q15(
- q15_t * pSrc,
- uint32_t blockSize,
- q15_t * pResult)
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult)
{
- q31_t sum = 0; /* Accumulator */
- q31_t meanOfSquares, squareOfMean; /* square of mean and mean of square */
- uint32_t blkCnt; /* loop counter */
- q63_t sumOfSquares = 0; /* Accumulator */
-#if defined (ARM_MATH_DSP)
- q31_t in; /* input value */
- q15_t in1; /* input value */
-#else
- q15_t in; /* input value */
+ uint32_t blkCnt; /* Loop counter */
+ q31_t sum = 0; /* Accumulator */
+ q31_t meanOfSquares, squareOfMean; /* Square of mean and mean of square */
+ q63_t sumOfSquares = 0; /* Sum of squares */
+ q15_t in; /* Temporary variable to store input value */
+
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+ q31_t in32; /* Temporary variable to store input value */
#endif
- if (blockSize == 1U)
+ if (blockSize <= 1U)
{
*pResult = 0;
return;
}
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+#if defined (ARM_MATH_LOOPUNROLL)
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sum. */
- in = *__SIMD32(pSrc)++;
- sum += ((in << 16U) >> 16U);
- sum += (in >> 16U);
- sumOfSquares = __SMLALD(in, in, sumOfSquares);
- in = *__SIMD32(pSrc)++;
- sum += ((in << 16U) >> 16U);
- sum += (in >> 16U);
- sumOfSquares = __SMLALD(in, in, sumOfSquares);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x4U;
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
+ /* Compute sum and store result in a temporary variable, sum. */
+#if defined (ARM_MATH_DSP)
+ in32 = read_q15x2_ia ((q15_t **) &pSrc);
+ sumOfSquares = __SMLALD(in32, in32, sumOfSquares);
+ sum += ((in32 << 16U) >> 16U);
+ sum += (in32 >> 16U);
+
+ in32 = read_q15x2_ia ((q15_t **) &pSrc);
+ sumOfSquares = __SMLALD(in32, in32, sumOfSquares);
+ sum += ((in32 << 16U) >> 16U);
+ sum += (in32 >> 16U);
+#else
+ in = *pSrc++;
+ sumOfSquares += (in * in);
+ sum += in;
- while (blkCnt > 0U)
- {
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sum. */
- in1 = *pSrc++;
- sumOfSquares = __SMLALD(in1, in1, sumOfSquares);
- sum += in1;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
+ in = *pSrc++;
+ sumOfSquares += (in * in);
+ sum += in;
- /* Compute Mean of squares of the input samples
- * and then store the result in a temporary variable, meanOfSquares. */
- meanOfSquares = (q31_t)(sumOfSquares / (q63_t)(blockSize - 1U));
+ in = *pSrc++;
+ sumOfSquares += (in * in);
+ sum += in;
- /* Compute square of mean */
- squareOfMean = (q31_t)((q63_t)sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
+ in = *pSrc++;
+ sumOfSquares += (in * in);
+ sum += in;
+#endif /* #if defined (ARM_MATH_DSP) */
- /* mean of the squares minus the square of the mean. */
- /* Compute standard deviation and store the result to the destination */
- arm_sqrt_q15(__SSAT((meanOfSquares - squareOfMean) >> 15U, 16U), pResult);
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
- /* Loop over blockSize number of values */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sumOfSquares. */
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
+
in = *pSrc++;
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
sumOfSquares += (in * in);
-
- /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
- /* Compute sum of all input values and then store the result in a temporary variable, sum. */
+ /* Compute sum and store result in a temporary variable, sum. */
sum += in;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* Compute Mean of squares of the input samples
- * and then store the result in a temporary variable, meanOfSquares. */
- meanOfSquares = (q31_t)(sumOfSquares / (q63_t)(blockSize - 1U));
+ /* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */
+ meanOfSquares = (q31_t) (sumOfSquares / (q63_t)(blockSize - 1U));
/* Compute square of mean */
- squareOfMean = (q31_t)((q63_t)sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
+ squareOfMean = (q31_t) ((q63_t) sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
- /* mean of the squares minus the square of the mean. */
- /* Compute standard deviation and store the result to the destination */
+ /* mean of squares minus the square of mean. */
+ /* Compute standard deviation and store result in destination */
arm_sqrt_q15(__SSAT((meanOfSquares - squareOfMean) >> 15U, 16U), pResult);
-
-#endif /* #if defined (ARM_MATH_DSP) */
}
/**
- * @} end of STD group
+ @} end of STD group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_std_q31.c b/DSP/Source/StatisticsFunctions/arm_std_q31.c
index f02cbdd..cfb6cb8 100644
--- a/DSP/Source/StatisticsFunctions/arm_std_q31.c
+++ b/DSP/Source/StatisticsFunctions/arm_std_q31.c
@@ -1,15 +1,15 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_std_q31.c
- * Description: Standard deviation of an array of Q31 type.
+ * Description: Standard deviation of the elements of a Q31 vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,141 +29,119 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup STD
- * @{
+ @addtogroup STD
+ @{
*/
/**
- * @brief Standard deviation of the elements of a Q31 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult standard deviation value returned here
- * @return none.
- * @details
- * <b>Scaling and Overflow Behavior:</b>
- *
- *\par
- * The function is implemented using an internal 64-bit accumulator.
- * The input is represented in 1.31 format, which is then downshifted by 8 bits
- * which yields 1.23, and intermediate multiplication yields a 2.46 format.
- * The accumulator maintains full precision of the intermediate multiplication results,
- * but provides only a 16 guard bits.
- * There is no saturation on intermediate additions.
- * If the accumulator overflows it wraps around and distorts the result.
- * In order to avoid overflows completely the input signal must be scaled down by
- * log2(blockSize)-8 bits, as a total of blockSize additions are performed internally.
- * After division, internal variables should be Q18.46
- * Finally, the 18.46 accumulator is right shifted by 15 bits to yield a 1.31 format value.
- *
+ @brief Standard deviation of the elements of a Q31 vector.
+ @param[in] pSrc points to the input vector.
+ @param[in] blockSize number of samples in input vector.
+ @param[out] pResult standard deviation value returned here.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The input is represented in 1.31 format, which is then downshifted by 8 bits
+ which yields 1.23, and intermediate multiplication yields a 2.46 format.
+ The accumulator maintains full precision of the intermediate multiplication results,
+ but provides only 16 guard bits.
+ There is no saturation on intermediate additions.
+ If the accumulator overflows it wraps around and distorts the result.
+ In order to avoid overflows completely the input signal must be scaled down by
+ log2(blockSize)-8 bits, as a total of blockSize additions are performed internally.
+ After division, internal variables should be Q18.46
+ Finally, the 18.46 accumulator is right shifted by 15 bits to yield a 1.31 format value.
*/
void arm_std_q31(
- q31_t * pSrc,
- uint32_t blockSize,
- q31_t * pResult)
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult)
{
- q63_t sum = 0; /* Accumulator */
- q63_t meanOfSquares, squareOfMean; /* square of mean and mean of square */
- q31_t in; /* input value */
- uint32_t blkCnt; /* loop counter */
- q63_t sumOfSquares = 0; /* Accumulator */
+ uint32_t blkCnt; /* Loop counter */
+ q63_t sum = 0; /* Accumulator */
+ q63_t meanOfSquares, squareOfMean; /* Square of mean and mean of square */
+ q63_t sumOfSquares = 0; /* Sum of squares */
+ q31_t in; /* Temporary variable to store input value */
- if (blockSize == 1U)
+ if (blockSize <= 1U)
{
*pResult = 0;
return;
}
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+#if defined (ARM_MATH_LOOPUNROLL)
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sum. */
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
+
in = *pSrc++ >> 8U;
- sum += in;
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
sumOfSquares += ((q63_t) (in) * (in));
- in = *pSrc++ >> 8U;
+ /* Compute sum and store result in a temporary variable, sum. */
sum += in;
- sumOfSquares += ((q63_t) (in) * (in));
+
in = *pSrc++ >> 8U;
- sum += in;
sumOfSquares += ((q63_t) (in) * (in));
- in = *pSrc++ >> 8U;
sum += in;
- sumOfSquares += ((q63_t) (in) * (in));
-
- /* Decrement the loop counter */
- blkCnt--;
- }
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x4U;
-
- while (blkCnt > 0U)
- {
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sum. */
in = *pSrc++ >> 8U;
+ sumOfSquares += ((q63_t) (in) * (in));
sum += in;
+
+ in = *pSrc++ >> 8U;
sumOfSquares += ((q63_t) (in) * (in));
+ sum += in;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* Compute Mean of squares of the input samples
- * and then store the result in a temporary variable, meanOfSquares. */
- meanOfSquares = sumOfSquares / (q63_t)(blockSize - 1U);
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
- /* Loop over blockSize number of values */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sumOfSquares. */
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
+
in = *pSrc++ >> 8U;
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
sumOfSquares += ((q63_t) (in) * (in));
-
- /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
- /* Compute sum of all input values and then store the result in a temporary variable, sum. */
+ /* Compute sum and store result in a temporary variable, sum. */
sum += in;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* Compute Mean of squares of the input samples
- * and then store the result in a temporary variable, meanOfSquares. */
- meanOfSquares = sumOfSquares / (q63_t)(blockSize - 1U);
-
-#endif /* #if defined (ARM_MATH_DSP) */
+ /* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */
+ meanOfSquares = (sumOfSquares / (q63_t)(blockSize - 1U));
/* Compute square of mean */
- squareOfMean = sum * sum / (q63_t)(blockSize * (blockSize - 1U));
+ squareOfMean = ( sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
- /* Compute standard deviation and then store the result to the destination */
+ /* Compute standard deviation and store result in destination */
arm_sqrt_q31((meanOfSquares - squareOfMean) >> 15U, pResult);
}
/**
- * @} end of STD group
+ @} end of STD group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_var_f32.c b/DSP/Source/StatisticsFunctions/arm_var_f32.c
index c0f731d..3c325b1 100644
--- a/DSP/Source/StatisticsFunctions/arm_var_f32.c
+++ b/DSP/Source/StatisticsFunctions/arm_var_f32.c
@@ -3,13 +3,13 @@
* Title: arm_var_f32.c
* Description: Variance of the elements of a floating-point vector
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,153 +29,206 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @defgroup variance Variance
- *
- * Calculates the variance of the elements in the input vector.
- * The underlying algorithm used is the direct method sometimes referred to as the two-pass method:
- *
- * <pre>
- * Result = sum(element - meanOfElements)^2) / numElement - 1
- *
- * where, meanOfElements = ( pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + ... + pSrc[blockSize-1] ) / blockSize
- *
- * </pre>
- *
- * There are separate functions for floating point, Q31, and Q15 data types.
+ @defgroup variance Variance
+
+ Calculates the variance of the elements in the input vector.
+ The underlying algorithm used is the direct method sometimes referred to as the two-pass method:
+
+ <pre>
+ Result = sum((element - meanOfElements)^2) / (numElement - 1)
+
+ meanOfElements = ( pSrc[0] + pSrc[1] + ... + pSrc[blockSize-1] ) / blockSize
+ </pre>
+
+ There are separate functions for floating point, Q31, and Q15 data types.
*/
/**
- * @addtogroup variance
- * @{
+ @addtogroup variance
+ @{
*/
-
/**
- * @brief Variance of the elements of a floating-point vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult variance value returned here
- * @return none.
+ @brief Variance of the elements of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult variance value returned here
+ @return none
*/
-
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
void arm_var_f32(
- float32_t * pSrc,
+ const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
- float32_t fMean, fValue;
- uint32_t blkCnt; /* loop counter */
- float32_t * pInput = pSrc;
- float32_t sum = 0.0f;
- float32_t fSum = 0.0f;
- #if defined(ARM_MATH_DSP)
- float32_t in1, in2, in3, in4;
- #endif
-
- if (blockSize <= 1U)
- {
- *pResult = 0;
- return;
- }
-
- #if defined(ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M7 */
-
- /*loop Unrolling */
- blkCnt = blockSize >> 2U;
-
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
- while (blkCnt > 0U)
- {
- /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
- in1 = *pInput++;
- in2 = *pInput++;
- in3 = *pInput++;
- in4 = *pInput++;
-
- sum += in1;
- sum += in2;
- sum += in3;
- sum += in4;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x4U;
-
- #else
- /* Run the below code for Cortex-M0 or Cortex-M3 */
-
- /* Loop over blockSize number of values */
- blkCnt = blockSize;
-
- #endif
-
- while (blkCnt > 0U)
- {
- /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
- sum += *pInput++;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
- fMean = sum / (float32_t) blockSize;
-
- pInput = pSrc;
-
- #if defined(ARM_MATH_DSP)
-
- /*loop Unrolling */
- blkCnt = blockSize >> 2U;
-
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
- while (blkCnt > 0U)
- {
- fValue = *pInput++ - fMean;
- fSum += fValue * fValue;
- fValue = *pInput++ - fMean;
- fSum += fValue * fValue;
- fValue = *pInput++ - fMean;
- fSum += fValue * fValue;
- fValue = *pInput++ - fMean;
- fSum += fValue * fValue;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- blkCnt = blockSize % 0x4U;
- #else
- /* Run the below code for Cortex-M0 or Cortex-M3 */
-
- /* Loop over blockSize number of values */
- blkCnt = blockSize;
- #endif
-
- while (blkCnt > 0U)
- {
- fValue = *pInput++ - fMean;
- fSum += fValue * fValue;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* Variance */
- *pResult = fSum / (float32_t)(blockSize - 1.0f);
+ float32_t mean;
+
+ float32_t sum = 0.0f; /* accumulator */
+ float32_t in; /* Temporary variable to store input value */
+ uint32_t blkCnt; /* loop counter */
+
+ float32x4_t sumV = vdupq_n_f32(0.0f); /* Temporary result storage */
+ float32x2_t sumV2;
+ float32x4_t inV;
+ float32x4_t avg;
+
+ arm_mean_f32(pSrc,blockSize,&mean);
+ avg = vdupq_n_f32(mean);
+
+ blkCnt = blockSize >> 2U;
+
+ /* Process 4 samples per iteration.
+ ** A second loop below handles the remaining 0 to 3 samples. */
+ while (blkCnt > 0U)
+ {
+ /* C = (A[0] - mean)^2 + (A[1] - mean)^2 + ... + (A[blockSize-1] - mean)^2 */
+ /* Subtract the mean from four samples and accumulate their squares in sumV. */
+ inV = vld1q_f32(pSrc);
+ inV = vsubq_f32(inV, avg);
+ sumV = vmlaq_f32(sumV, inV, inV);
+ pSrc += 4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
+ sum = sumV2[0] + sumV2[1];
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (A[0] - mean)^2 + (A[1] - mean)^2 + ... + (A[blockSize-1] - mean)^2 */
+ /* Subtract the mean from the sample and accumulate its square in sum. */
+ in = *pSrc++;
+ in = in - mean;
+ sum += in * in;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Variance */
+ *pResult = sum / (float32_t)(blockSize - 1.0f);
+
+}
+
+#else
+void arm_var_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult)
+{
+ uint32_t blkCnt; /* Loop counter */
+ float32_t sum = 0.0f; /* Temporary result storage */
+ float32_t fSum = 0.0f;
+ float32_t fMean, fValue;
+ const float32_t * pInput = pSrc;
+
+ if (blockSize <= 1U)
+ {
+ *pResult = 0;
+ return;
+ }
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+
+ sum += *pInput++;
+ sum += *pInput++;
+ sum += *pInput++;
+ sum += *pInput++;
+
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+
+ sum += *pInput++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
+ fMean = sum / (float32_t) blockSize;
+
+ pInput = pSrc;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ fValue = *pInput++ - fMean;
+ fSum += fValue * fValue;
+
+ fValue = *pInput++ - fMean;
+ fSum += fValue * fValue;
+
+ fValue = *pInput++ - fMean;
+ fSum += fValue * fValue;
+
+ fValue = *pInput++ - fMean;
+ fSum += fValue * fValue;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ fValue = *pInput++ - fMean;
+ fSum += fValue * fValue;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Variance */
+ *pResult = fSum / (float32_t)(blockSize - 1.0f);
}
+#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
/**
- * @} end of variance group
+ @} end of variance group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_var_q15.c b/DSP/Source/StatisticsFunctions/arm_var_q15.c
index 5ba61f7..259e76b 100644
--- a/DSP/Source/StatisticsFunctions/arm_var_q15.c
+++ b/DSP/Source/StatisticsFunctions/arm_var_q15.c
@@ -3,13 +3,13 @@
* Title: arm_var_q15.c
* Description: Variance of an array of Q15 type
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,144 +29,136 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup variance
- * @{
+ @addtogroup variance
+ @{
*/
/**
- * @brief Variance of the elements of a Q15 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult variance value returned here
- * @return none.
- * @details
- * <b>Scaling and Overflow Behavior:</b>
- *
- * \par
- * The function is implemented using a 64-bit internal accumulator.
- * The input is represented in 1.15 format.
- * Intermediate multiplication yields a 2.30 format, and this
- * result is added without saturation to a 64-bit accumulator in 34.30 format.
- * With 33 guard bits in the accumulator, there is no risk of overflow, and the
- * full precision of the intermediate multiplication is preserved.
- * Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
- * 15 bits, and then saturated to yield a result in 1.15 format.
+ @brief Variance of the elements of a Q15 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult variance value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ The input is represented in 1.15 format.
+ Intermediate multiplication yields a 2.30 format, and this
+ result is added without saturation to a 64-bit accumulator in 34.30 format.
+ With 33 guard bits in the accumulator, there is no risk of overflow, and the
+ full precision of the intermediate multiplication is preserved.
+ Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
+ 15 bits, and then saturated to yield a result in 1.15 format.
*/
void arm_var_q15(
- q15_t * pSrc,
- uint32_t blockSize,
- q15_t * pResult)
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult)
{
- q31_t sum = 0; /* Accumulator */
- q31_t meanOfSquares, squareOfMean; /* square of mean and mean of square */
- uint32_t blkCnt; /* loop counter */
- q63_t sumOfSquares = 0; /* Accumulator */
-#if defined (ARM_MATH_DSP)
- q31_t in; /* input value */
- q15_t in1; /* input value */
-#else
- q15_t in; /* input value */
+ uint32_t blkCnt; /* Loop counter */
+ q31_t sum = 0; /* Accumulator */
+ q31_t meanOfSquares, squareOfMean; /* Square of mean and mean of square */
+ q63_t sumOfSquares = 0; /* Sum of squares */
+ q15_t in; /* Temporary variable to store input value */
+
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+ q31_t in32; /* Temporary variable to store input value */
#endif
- if (blockSize == 1U)
+ if (blockSize <= 1U)
{
*pResult = 0;
return;
}
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+#if defined (ARM_MATH_LOOPUNROLL)
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sum. */
- in = *__SIMD32(pSrc)++;
- sum += ((in << 16U) >> 16U);
- sum += (in >> 16U);
- sumOfSquares = __SMLALD(in, in, sumOfSquares);
- in = *__SIMD32(pSrc)++;
- sum += ((in << 16U) >> 16U);
- sum += (in >> 16U);
- sumOfSquares = __SMLALD(in, in, sumOfSquares);
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
- /* Decrement the loop counter */
- blkCnt--;
- }
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
+ /* Compute sum and store result in a temporary variable, sum. */
+#if defined (ARM_MATH_DSP)
+ in32 = read_q15x2_ia ((q15_t **) &pSrc);
+ sumOfSquares = __SMLALD(in32, in32, sumOfSquares);
+ sum += ((in32 << 16U) >> 16U);
+ sum += (in32 >> 16U);
+
+ in32 = read_q15x2_ia ((q15_t **) &pSrc);
+ sumOfSquares = __SMLALD(in32, in32, sumOfSquares);
+ sum += ((in32 << 16U) >> 16U);
+ sum += (in32 >> 16U);
+#else
+ in = *pSrc++;
+ sumOfSquares += (in * in);
+ sum += in;
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x4U;
+ in = *pSrc++;
+ sumOfSquares += (in * in);
+ sum += in;
- while (blkCnt > 0U)
- {
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sum. */
- in1 = *pSrc++;
- sumOfSquares = __SMLALD(in1, in1, sumOfSquares);
- sum += in1;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
+ in = *pSrc++;
+ sumOfSquares += (in * in);
+ sum += in;
- /* Compute Mean of squares of the input samples
- * and then store the result in a temporary variable, meanOfSquares. */
- meanOfSquares = (q31_t)(sumOfSquares / (q63_t)(blockSize - 1U));
+ in = *pSrc++;
+ sumOfSquares += (in * in);
+ sum += in;
+#endif /* #if defined (ARM_MATH_DSP) */
- /* Compute square of mean */
- squareOfMean = (q31_t)((q63_t)sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
+ /* Decrement loop counter */
+ blkCnt--;
+ }
- /* mean of the squares minus the square of the mean. */
- *pResult = (meanOfSquares - squareOfMean) >> 15U;
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
- /* Loop over blockSize number of values */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sumOfSquares. */
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
+
in = *pSrc++;
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
+#if defined (ARM_MATH_DSP)
+ sumOfSquares = __SMLALD(in, in, sumOfSquares);
+#else
sumOfSquares += (in * in);
-
- /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
- /* Compute sum of all input values and then store the result in a temporary variable, sum. */
+#endif /* #if defined (ARM_MATH_DSP) */
+ /* Compute sum and store result in a temporary variable, sum. */
sum += in;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* Compute Mean of squares of the input samples
- * and then store the result in a temporary variable, meanOfSquares. */
- meanOfSquares = (q31_t)(sumOfSquares / (q63_t)(blockSize - 1U));
+ /* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */
+ meanOfSquares = (q31_t) (sumOfSquares / (q63_t)(blockSize - 1U));
/* Compute square of mean */
- squareOfMean = (q31_t)((q63_t)sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
-
- /* mean of the squares minus the square of the mean. */
- *pResult = (meanOfSquares - squareOfMean) >> 15;
+ squareOfMean = (q31_t) ((q63_t) sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
-#endif /* #if defined (ARM_MATH_DSP) */
+ /* mean of squares minus the square of mean. */
+ *pResult = (meanOfSquares - squareOfMean) >> 15U;
}
/**
- * @} end of variance group
+ @} end of variance group
*/
diff --git a/DSP/Source/StatisticsFunctions/arm_var_q31.c b/DSP/Source/StatisticsFunctions/arm_var_q31.c
index 526c6cd..558332f 100644
--- a/DSP/Source/StatisticsFunctions/arm_var_q31.c
+++ b/DSP/Source/StatisticsFunctions/arm_var_q31.c
@@ -3,13 +3,13 @@
* Title: arm_var_q31.c
* Description: Variance of an array of Q31 type
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,141 +29,119 @@
#include "arm_math.h"
/**
- * @ingroup groupStats
+ @ingroup groupStats
*/
/**
- * @addtogroup variance
- * @{
+ @addtogroup variance
+ @{
*/
/**
- * @brief Variance of the elements of a Q31 vector.
- * @param[in] *pSrc points to the input vector
- * @param[in] blockSize length of the input vector
- * @param[out] *pResult variance value returned here
- * @return none.
- * @details
- * <b>Scaling and Overflow Behavior:</b>
- *
- *\par
- * The function is implemented using an internal 64-bit accumulator.
- * The input is represented in 1.31 format, which is then downshifted by 8 bits
- * which yields 1.23, and intermediate multiplication yields a 2.46 format.
- * The accumulator maintains full precision of the intermediate multiplication results,
- * but provides only a 16 guard bits.
- * There is no saturation on intermediate additions.
- * If the accumulator overflows it wraps around and distorts the result.
- * In order to avoid overflows completely the input signal must be scaled down by
- * log2(blockSize)-8 bits, as a total of blockSize additions are performed internally.
- * After division, internal variables should be Q18.46
- * Finally, the 18.46 accumulator is right shifted by 15 bits to yield a 1.31 format value.
- *
+ @brief Variance of the elements of a Q31 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult variance value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The input is represented in 1.31 format, which is then downshifted by 8 bits
+ which yields 1.23, and intermediate multiplication yields a 2.46 format.
+ The accumulator maintains full precision of the intermediate multiplication results,
+ but provides only 16 guard bits.
+ There is no saturation on intermediate additions.
+ If the accumulator overflows it wraps around and distorts the result.
+ In order to avoid overflows completely the input signal must be scaled down by
+ log2(blockSize)-8 bits, as a total of blockSize additions are performed internally.
+ After division, internal variables should be Q18.46
+ Finally, the 18.46 accumulator is right shifted by 15 bits to yield a 1.31 format value.
*/
void arm_var_q31(
- q31_t * pSrc,
- uint32_t blockSize,
- q31_t * pResult)
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult)
{
- q63_t sum = 0; /* Accumulator */
- q63_t meanOfSquares, squareOfMean; /* square of mean and mean of square */
- q31_t in; /* input value */
- uint32_t blkCnt; /* loop counter */
- q63_t sumOfSquares = 0; /* Accumulator */
+ uint32_t blkCnt; /* Loop counter */
+ q63_t sum = 0; /* Temporary result storage */
+ q63_t meanOfSquares, squareOfMean; /* Square of mean and mean of square */
+ q63_t sumOfSquares = 0; /* Sum of squares */
+ q31_t in; /* Temporary variable to store input value */
- if (blockSize == 1U)
+ if (blockSize <= 1U)
{
*pResult = 0;
return;
}
-#if defined (ARM_MATH_DSP)
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+#if defined (ARM_MATH_LOOPUNROLL)
- /*loop Unrolling */
+ /* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sum. */
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
+
in = *pSrc++ >> 8U;
- sum += in;
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
sumOfSquares += ((q63_t) (in) * (in));
- in = *pSrc++ >> 8U;
+ /* Compute sum and store result in a temporary variable, sum. */
sum += in;
- sumOfSquares += ((q63_t) (in) * (in));
+
in = *pSrc++ >> 8U;
- sum += in;
sumOfSquares += ((q63_t) (in) * (in));
- in = *pSrc++ >> 8U;
sum += in;
- sumOfSquares += ((q63_t) (in) * (in));
-
- /* Decrement the loop counter */
- blkCnt--;
- }
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x4U;
-
- while (blkCnt > 0U)
- {
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sum. */
in = *pSrc++ >> 8U;
+ sumOfSquares += ((q63_t) (in) * (in));
sum += in;
+
+ in = *pSrc++ >> 8U;
sumOfSquares += ((q63_t) (in) * (in));
+ sum += in;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* Compute Mean of squares of the input samples
- * and then store the result in a temporary variable, meanOfSquares. */
- meanOfSquares = sumOfSquares / (q63_t)(blockSize - 1U);
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
#else
- /* Run the below code for Cortex-M0 */
- /* Loop over blockSize number of values */
+ /* Initialize blkCnt with number of samples */
blkCnt = blockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
- /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
- /* Compute Sum of squares of the input samples
- * and then store the result in a temporary variable, sumOfSquares. */
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
+
in = *pSrc++ >> 8U;
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
sumOfSquares += ((q63_t) (in) * (in));
-
- /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
- /* Compute sum of all input values and then store the result in a temporary variable, sum. */
+ /* Compute sum and store result in a temporary variable, sum. */
sum += in;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* Compute Mean of squares of the input samples
- * and then store the result in a temporary variable, meanOfSquares. */
- meanOfSquares = sumOfSquares / (q63_t)(blockSize - 1U);
-
-#endif /* #if defined (ARM_MATH_DSP) */
+ /* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */
+ meanOfSquares = (sumOfSquares / (q63_t)(blockSize - 1U));
/* Compute square of mean */
- squareOfMean = sum * sum / (q63_t)(blockSize * (blockSize - 1U));
+ squareOfMean = ( sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
- /* Compute standard deviation and then store the result to the destination */
+ /* Compute variance and store result in destination */
*pResult = (meanOfSquares - squareOfMean) >> 15U;
}
/**
- * @} end of variance group
+ @} end of variance group
*/