diff options
author | rihab kouki <rihab.kouki@st.com> | 2020-07-28 11:24:49 +0100 |
---|---|---|
committer | rihab kouki <rihab.kouki@st.com> | 2020-07-28 11:24:49 +0100 |
commit | 96d6da4e252b06dcfdc041e7df23e86161c33007 (patch) | |
tree | a262f59bb1db7ec7819acae435f5049cbe5e2354 /DSP/Source/StatisticsFunctions | |
parent | 9f95ff5b6ba01db09552b84a0ab79607060a2666 (diff) | |
download | st-cmsis-core-lowfat-master.tar.gz st-cmsis-core-lowfat-master.tar.bz2 st-cmsis-core-lowfat-master.zip |
Diffstat (limited to 'DSP/Source/StatisticsFunctions')
27 files changed, 1980 insertions, 1720 deletions
diff --git a/DSP/Source/StatisticsFunctions/CMakeLists.txt b/DSP/Source/StatisticsFunctions/CMakeLists.txt new file mode 100644 index 0000000..3f23355 --- /dev/null +++ b/DSP/Source/StatisticsFunctions/CMakeLists.txt @@ -0,0 +1,16 @@ +cmake_minimum_required (VERSION 3.6) + +project(CMSISDSPStatistics) + + +file(GLOB SRC "./*_*.c") + +add_library(CMSISDSPStatistics STATIC ${SRC}) + +configdsp(CMSISDSPStatistics ..) + +### Includes +target_include_directories(CMSISDSPStatistics PUBLIC "${DSP}/../../Include") + + + diff --git a/DSP/Source/StatisticsFunctions/StatisticsFunctions.c b/DSP/Source/StatisticsFunctions/StatisticsFunctions.c new file mode 100644 index 0000000..4f86aa4 --- /dev/null +++ b/DSP/Source/StatisticsFunctions/StatisticsFunctions.c @@ -0,0 +1,53 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: StatisticsFunctions.c + * Description: Combination of all statistics function source files. + * + * $Date: 18. March 2019 + * $Revision: V1.0.0 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_max_f32.c" +#include "arm_max_q15.c" +#include "arm_max_q31.c" +#include "arm_max_q7.c" +#include "arm_mean_f32.c" +#include "arm_mean_q15.c" +#include "arm_mean_q31.c" +#include "arm_mean_q7.c" +#include "arm_min_f32.c" +#include "arm_min_q15.c" +#include "arm_min_q31.c" +#include "arm_min_q7.c" +#include "arm_power_f32.c" +#include "arm_power_q15.c" +#include "arm_power_q31.c" +#include "arm_power_q7.c" +#include "arm_rms_f32.c" +#include "arm_rms_q15.c" +#include "arm_rms_q31.c" +#include "arm_std_f32.c" +#include "arm_std_q15.c" +#include "arm_std_q31.c" +#include "arm_var_f32.c" +#include "arm_var_q15.c" +#include "arm_var_q31.c" diff --git a/DSP/Source/StatisticsFunctions/arm_max_f32.c b/DSP/Source/StatisticsFunctions/arm_max_f32.c index a0a68ac..cd54e2a 100644 --- a/DSP/Source/StatisticsFunctions/arm_max_f32.c +++ b/DSP/Source/StatisticsFunctions/arm_max_f32.c @@ -3,13 +3,13 @@ * Title: arm_max_f32.c * Description: Maximum value of a floating-point vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -27,136 +27,237 @@ */ #include "arm_math.h" +#if defined(ARM_MATH_NEON) +#include <limits.h> +#endif /** - * @ingroup groupStats + @ingroup groupStats */ /** - * @defgroup Max Maximum - * - * Computes the maximum value of an array of data. - * The function returns both the maximum value and its position within the array. - * There are separate functions for floating-point, Q31, Q15, and Q7 data types. + @defgroup Max Maximum + + Computes the maximum value of an array of data. + The function returns both the maximum value and its position within the array. + There are separate functions for floating-point, Q31, Q15, and Q7 data types. */ /** - * @addtogroup Max - * @{ + @addtogroup Max + @{ */ - /** - * @brief Maximum value of a floating-point vector. - * @param[in] *pSrc points to the input vector - * @param[in] blockSize length of the input vector - * @param[out] *pResult maximum value returned here - * @param[out] *pIndex index of maximum value returned here - * @return none. + @brief Maximum value of a floating-point vector. + @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult maximum value returned here + @param[out] pIndex index of maximum value returned here + @return none */ - +#if defined(ARM_MATH_NEON) void arm_max_f32( - float32_t * pSrc, + const float32_t * pSrc, uint32_t blockSize, float32_t * pResult, uint32_t * pIndex) { -#if defined (ARM_MATH_DSP) - /* Run the below code for Cortex-M4 and Cortex-M3 */ - float32_t maxVal1, maxVal2, out; /* Temporary variables to store the output value. */ uint32_t blkCnt, outIndex, count; /* loop counter */ + float32x4_t outV, srcV; + float32x2_t outV2; + + uint32x4_t idxV; + uint32x4_t maxIdx={ULONG_MAX,ULONG_MAX,ULONG_MAX,ULONG_MAX}; + uint32x4_t index={4,5,6,7}; + uint32x4_t delta={4,4,4,4}; + uint32x4_t countV={0,1,2,3}; + uint32x2_t countV2; + /* Initialise the count value. */ count = 0U; + /* Initialise the index value to zero. */ outIndex = 0U; + + /* Load first input value that act as reference value for comparison */ + if (blockSize <= 3) + { + out = *pSrc++; + + blkCnt = blockSize - 1; + + while (blkCnt > 0U) + { + /* Initialize maxVal to the next consecutive values one by one */ + maxVal1 = *pSrc++; + + /* compare for the maximum value */ + if (out < maxVal1) + { + /* Update the maximum value and it's index */ + out = maxVal1; + outIndex = blockSize - blkCnt; + } + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + outV = vld1q_f32(pSrc); + pSrc += 4; + + /* Compute 4 outputs at a time */ + blkCnt = (blockSize - 4 ) >> 2U; + + while (blkCnt > 0U) + { + srcV = vld1q_f32(pSrc); + pSrc += 4; + + idxV = vcgtq_f32(srcV, outV); + outV = vbslq_f32(idxV, srcV, outV ); + countV = vbslq_u32(idxV, index,countV ); + + index = vaddq_u32(index,delta); + + /* Decrement the loop counter */ + blkCnt--; + } + + outV2 = vpmax_f32(vget_low_f32(outV),vget_high_f32(outV)); + outV2 = vpmax_f32(outV2,outV2); + out = outV2[0]; + + idxV = vceqq_f32(outV, vdupq_n_f32(out)); + countV = vbslq_u32(idxV, countV,maxIdx); + + countV2 = vpmin_u32(vget_low_u32(countV),vget_high_u32(countV)); + countV2 = vpmin_u32(countV2,countV2); + outIndex = countV2[0]; + + /* if (blockSize - 1U) is not multiple of 4 */ + blkCnt = (blockSize - 4 ) % 4U; + + while (blkCnt > 0U) + { + /* Initialize maxVal to the next consecutive values one by one */ + maxVal1 = *pSrc++; + + /* compare for the maximum value */ + if (out < maxVal1) + { + /* Update the maximum value and it's index */ + out = maxVal1; + outIndex = blockSize - blkCnt ; + } + + /* Decrement the loop counter */ + blkCnt--; + } + + + } + + /* Store the maximum value and it's index into destination pointers */ + *pResult = out; + *pIndex = outIndex; +} +#else +void arm_max_f32( + const float32_t * pSrc, + uint32_t blockSize, + float32_t * pResult, + uint32_t * pIndex) +{ + float32_t maxVal, out; /* Temporary variables to store the output value. */ + uint32_t blkCnt, outIndex; /* Loop counter */ + +#if defined (ARM_MATH_LOOPUNROLL) + uint32_t index; /* index of maximum value */ +#endif + + /* Initialise index value to zero. */ + outIndex = 0U; + /* Load first input value that act as reference value for comparision */ out = *pSrc++; - /* Loop unrolling */ +#if defined (ARM_MATH_LOOPUNROLL) + /* Initialise index of maximum value. */ + index = 0U; + + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = (blockSize - 1U) >> 2U; while (blkCnt > 0U) { - /* Initialize maxVal to the next consecutive values one by one */ - maxVal1 = *pSrc++; - maxVal2 = *pSrc++; + /* Initialize maxVal to next consecutive values one by one */ + maxVal = *pSrc++; /* compare for the maximum value */ - if (out < maxVal1) + if (out < maxVal) { - /* Update the maximum value and its index */ - out = maxVal1; - outIndex = count + 1U; + /* Update the maximum value and it's index */ + out = maxVal; + outIndex = index + 1U; } - /* compare for the maximum value */ - if (out < maxVal2) + maxVal = *pSrc++; + if (out < maxVal) { - /* Update the maximum value and its index */ - out = maxVal2; - outIndex = count + 2U; + out = maxVal; + outIndex = index + 2U; } - /* Initialize maxVal to the next consecutive values one by one */ - maxVal1 = *pSrc++; - maxVal2 = *pSrc++; - - /* compare for the maximum value */ - if (out < maxVal1) + maxVal = *pSrc++; + if (out < maxVal) { - /* Update the maximum value and its index */ - out = maxVal1; - outIndex = count + 3U; + out = maxVal; + outIndex = index + 3U; } - /* compare for the maximum value */ - if (out < maxVal2) + maxVal = *pSrc++; + if (out < maxVal) { - /* Update the maximum value and its index */ - out = maxVal2; - outIndex = count + 4U; + out = maxVal; + outIndex = index + 4U; } - count += 4U; + index += 4U; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* if (blockSize - 1U) is not multiple of 4 */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = (blockSize - 1U) % 4U; #else - /* Run the below code for Cortex-M0 */ - - float32_t maxVal1, out; /* Temporary variables to store the output value. */ - uint32_t blkCnt, outIndex; /* loop counter */ - - /* Initialise the index value to zero. */ - outIndex = 0U; - /* Load first input value that act as reference value for comparision */ - out = *pSrc++; + /* Initialize blkCnt with number of samples */ blkCnt = (blockSize - 1U); -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { /* Initialize maxVal to the next consecutive values one by one */ - maxVal1 = *pSrc++; + maxVal = *pSrc++; /* compare for the maximum value */ - if (out < maxVal1) + if (out < maxVal) { /* Update the maximum value and it's index */ - out = maxVal1; + out = maxVal; outIndex = blockSize - blkCnt; } - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } @@ -164,7 +265,7 @@ void arm_max_f32( *pResult = out; *pIndex = outIndex; } - +#endif /* #if defined(ARM_MATH_NEON) */ /** - * @} end of Max group + @} end of Max group */ diff --git a/DSP/Source/StatisticsFunctions/arm_max_q15.c b/DSP/Source/StatisticsFunctions/arm_max_q15.c index 67d5e34..329b0c8 100644 --- a/DSP/Source/StatisticsFunctions/arm_max_q15.c +++ b/DSP/Source/StatisticsFunctions/arm_max_q15.c @@ -3,13 +3,13 @@ * Title: arm_max_q15.c * Description: Maximum value of a Q15 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,126 +29,112 @@ #include "arm_math.h" /** - * @ingroup groupStats + @ingroup groupStats */ /** - * @addtogroup Max - * @{ + @addtogroup Max + @{ */ - /** - * @brief Maximum value of a Q15 vector. - * @param[in] *pSrc points to the input vector - * @param[in] blockSize length of the input vector - * @param[out] *pResult maximum value returned here - * @param[out] *pIndex index of maximum value returned here - * @return none. + @brief Maximum value of a Q15 vector. + @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult maximum value returned here + @param[out] pIndex index of maximum value returned here + @return none */ void arm_max_q15( - q15_t * pSrc, - uint32_t blockSize, - q15_t * pResult, - uint32_t * pIndex) + const q15_t * pSrc, + uint32_t blockSize, + q15_t * pResult, + uint32_t * pIndex) { -#if defined (ARM_MATH_DSP) - /* Run the below code for Cortex-M4 and Cortex-M3 */ + q15_t maxVal, out; /* Temporary variables to store the output value. */ + uint32_t blkCnt, outIndex; /* Loop counter */ - q15_t maxVal1, maxVal2, out; /* Temporary variables to store the output value. */ - uint32_t blkCnt, outIndex, count; /* loop counter */ +#if defined (ARM_MATH_LOOPUNROLL) + uint32_t index; /* index of maximum value */ +#endif - /* Initialise the count value. */ - count = 0U; - /* Initialise the index value to zero. */ + /* Initialise index value to zero. */ outIndex = 0U; /* Load first input value that act as reference value for comparision */ out = *pSrc++; - /* Loop unrolling */ +#if defined (ARM_MATH_LOOPUNROLL) + /* Initialise index of maximum value. */ + index = 0U; + + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = (blockSize - 1U) >> 2U; while (blkCnt > 0U) { - /* Initialize maxVal to the next consecutive values one by one */ - maxVal1 = *pSrc++; - maxVal2 = *pSrc++; + /* Initialize maxVal to next consecutive values one by one */ + maxVal = *pSrc++; /* compare for the maximum value */ - if (out < maxVal1) + if (out < maxVal) { - /* Update the maximum value and its index */ - out = maxVal1; - outIndex = count + 1U; + /* Update the maximum value and it's index */ + out = maxVal; + outIndex = index + 1U; } - /* compare for the maximum value */ - if (out < maxVal2) + maxVal = *pSrc++; + if (out < maxVal) { - /* Update the maximum value and its index */ - out = maxVal2; - outIndex = count + 2U; + out = maxVal; + outIndex = index + 2U; } - /* Initialize maxVal to the next consecutive values one by one */ - maxVal1 = *pSrc++; - maxVal2 = *pSrc++; - - /* compare for the maximum value */ - if (out < maxVal1) + maxVal = *pSrc++; + if (out < maxVal) { - /* Update the maximum value and its index */ - out = maxVal1; - outIndex = count + 3U; + out = maxVal; + outIndex = index + 3U; } - /* compare for the maximum value */ - if (out < maxVal2) + maxVal = *pSrc++; + if (out < maxVal) { - /* Update the maximum value and its index */ - out = maxVal2; - outIndex = count + 4U; + out = maxVal; + outIndex = index + 4U; } - count += 4U; + index += 4U; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* if (blockSize - 1U) is not multiple of 4 */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = (blockSize - 1U) % 4U; #else - /* Run the below code for Cortex-M0 */ - - q15_t maxVal1, out; /* Temporary variables to store the output value. */ - uint32_t blkCnt, outIndex; /* loop counter */ - - /* Initialise the index value to zero. */ - outIndex = 0U; - /* Load first input value that act as reference value for comparision */ - out = *pSrc++; + /* Initialize blkCnt with number of samples */ blkCnt = (blockSize - 1U); -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { /* Initialize maxVal to the next consecutive values one by one */ - maxVal1 = *pSrc++; + maxVal = *pSrc++; /* compare for the maximum value */ - if (out < maxVal1) + if (out < maxVal) { /* Update the maximum value and it's index */ - out = maxVal1; + out = maxVal; outIndex = blockSize - blkCnt; } - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } @@ -158,5 +144,5 @@ void arm_max_q15( } /** - * @} end of Max group + @} end of Max group */ diff --git a/DSP/Source/StatisticsFunctions/arm_max_q31.c b/DSP/Source/StatisticsFunctions/arm_max_q31.c index 5d34bbd..99de13e 100644 --- a/DSP/Source/StatisticsFunctions/arm_max_q31.c +++ b/DSP/Source/StatisticsFunctions/arm_max_q31.c @@ -3,13 +3,13 @@ * Title: arm_max_q31.c * Description: Maximum value of a Q31 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,126 +29,112 @@ #include "arm_math.h" /** - * @ingroup groupStats + @ingroup groupStats */ /** - * @addtogroup Max - * @{ + @addtogroup Max + @{ */ - /** - * @brief Maximum value of a Q31 vector. - * @param[in] *pSrc points to the input vector - * @param[in] blockSize length of the input vector - * @param[out] *pResult maximum value returned here - * @param[out] *pIndex index of maximum value returned here - * @return none. + @brief Maximum value of a Q31 vector. + @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult maximum value returned here + @param[out] pIndex index of maximum value returned here + @return none */ void arm_max_q31( - q31_t * pSrc, - uint32_t blockSize, - q31_t * pResult, - uint32_t * pIndex) + const q31_t * pSrc, + uint32_t blockSize, + q31_t * pResult, + uint32_t * pIndex) { -#if defined (ARM_MATH_DSP) - /* Run the below code for Cortex-M4 and Cortex-M3 */ + q31_t maxVal, out; /* Temporary variables to store the output value. */ + uint32_t blkCnt, outIndex; /* Loop counter */ - q31_t maxVal1, maxVal2, out; /* Temporary variables to store the output value. */ - uint32_t blkCnt, outIndex, count; /* loop counter */ +#if defined (ARM_MATH_LOOPUNROLL) + uint32_t index; /* index of maximum value */ +#endif - /* Initialise the count value. */ - count = 0U; - /* Initialise the index value to zero. */ + /* Initialise index value to zero. */ outIndex = 0U; /* Load first input value that act as reference value for comparision */ out = *pSrc++; - /* Loop unrolling */ +#if defined (ARM_MATH_LOOPUNROLL) + /* Initialise index of maximum value. */ + index = 0U; + + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = (blockSize - 1U) >> 2U; while (blkCnt > 0U) { - /* Initialize maxVal to the next consecutive values one by one */ - maxVal1 = *pSrc++; - maxVal2 = *pSrc++; + /* Initialize maxVal to next consecutive values one by one */ + maxVal = *pSrc++; /* compare for the maximum value */ - if (out < maxVal1) + if (out < maxVal) { - /* Update the maximum value and its index */ - out = maxVal1; - outIndex = count + 1U; + /* Update the maximum value and it's index */ + out = maxVal; + outIndex = index + 1U; } - /* compare for the maximum value */ - if (out < maxVal2) + maxVal = *pSrc++; + if (out < maxVal) { - /* Update the maximum value and its index */ - out = maxVal2; - outIndex = count + 2U; + out = maxVal; + outIndex = index + 2U; } - /* Initialize maxVal to the next consecutive values one by one */ - maxVal1 = *pSrc++; - maxVal2 = *pSrc++; - - /* compare for the maximum value */ - if (out < maxVal1) + maxVal = *pSrc++; + if (out < maxVal) { - /* Update the maximum value and its index */ - out = maxVal1; - outIndex = count + 3U; + out = maxVal; + outIndex = index + 3U; } - /* compare for the maximum value */ - if (out < maxVal2) + maxVal = *pSrc++; + if (out < maxVal) { - /* Update the maximum value and its index */ - out = maxVal2; - outIndex = count + 4U; + out = maxVal; + outIndex = index + 4U; } - count += 4U; + index += 4U; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* if (blockSize - 1U) is not multiple of 4 */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = (blockSize - 1U) % 4U; #else - /* Run the below code for Cortex-M0 */ - - q31_t maxVal1, out; /* Temporary variables to store the output value. */ - uint32_t blkCnt, outIndex; /* loop counter */ - - /* Initialise the index value to zero. */ - outIndex = 0U; - /* Load first input value that act as reference value for comparision */ - out = *pSrc++; + /* Initialize blkCnt with number of samples */ blkCnt = (blockSize - 1U); -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { /* Initialize maxVal to the next consecutive values one by one */ - maxVal1 = *pSrc++; + maxVal = *pSrc++; /* compare for the maximum value */ - if (out < maxVal1) + if (out < maxVal) { /* Update the maximum value and it's index */ - out = maxVal1; + out = maxVal; outIndex = blockSize - blkCnt; } - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } @@ -158,5 +144,5 @@ void arm_max_q31( } /** - * @} end of Max group + @} end of Max group */ diff --git a/DSP/Source/StatisticsFunctions/arm_max_q7.c b/DSP/Source/StatisticsFunctions/arm_max_q7.c index 72f6e5e..9c8b6d3 100644 --- a/DSP/Source/StatisticsFunctions/arm_max_q7.c +++ b/DSP/Source/StatisticsFunctions/arm_max_q7.c @@ -3,13 +3,13 @@ * Title: arm_max_q7.c * Description: Maximum value of a Q7 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,126 +29,112 @@ #include "arm_math.h" /** - * @ingroup groupStats + @ingroup groupStats */ /** - * @addtogroup Max - * @{ + @addtogroup Max + @{ */ - /** - * @brief Maximum value of a Q7 vector. - * @param[in] *pSrc points to the input vector - * @param[in] blockSize length of the input vector - * @param[out] *pResult maximum value returned here - * @param[out] *pIndex index of maximum value returned here - * @return none. + @brief Maximum value of a Q7 vector. + @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult maximum value returned here + @param[out] pIndex index of maximum value returned here + @return none */ void arm_max_q7( - q7_t * pSrc, - uint32_t blockSize, - q7_t * pResult, - uint32_t * pIndex) + const q7_t * pSrc, + uint32_t blockSize, + q7_t * pResult, + uint32_t * pIndex) { -#if defined (ARM_MATH_DSP) - /* Run the below code for Cortex-M4 and Cortex-M3 */ + q7_t maxVal, out; /* Temporary variables to store the output value. */ + uint32_t blkCnt, outIndex; /* Loop counter */ - q7_t maxVal1, maxVal2, out; /* Temporary variables to store the output value. */ - uint32_t blkCnt, outIndex, count; /* loop counter */ +#if defined (ARM_MATH_LOOPUNROLL) + uint32_t index; /* index of maximum value */ +#endif - /* Initialise the count value. */ - count = 0U; - /* Initialise the index value to zero. */ + /* Initialise index value to zero. */ outIndex = 0U; /* Load first input value that act as reference value for comparision */ out = *pSrc++; - /* Loop unrolling */ +#if defined (ARM_MATH_LOOPUNROLL) + /* Initialise index of maximum value. */ + index = 0U; + + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = (blockSize - 1U) >> 2U; while (blkCnt > 0U) { - /* Initialize maxVal to the next consecutive values one by one */ - maxVal1 = *pSrc++; - maxVal2 = *pSrc++; + /* Initialize maxVal to next consecutive values one by one */ + maxVal = *pSrc++; /* compare for the maximum value */ - if (out < maxVal1) + if (out < maxVal) { - /* Update the maximum value and its index */ - out = maxVal1; - outIndex = count + 1U; + /* Update the maximum value and it's index */ + out = maxVal; + outIndex = index + 1U; } - /* compare for the maximum value */ - if (out < maxVal2) + maxVal = *pSrc++; + if (out < maxVal) { - /* Update the maximum value and its index */ - out = maxVal2; - outIndex = count + 2U; + out = maxVal; + outIndex = index + 2U; } - /* Initialize maxVal to the next consecutive values one by one */ - maxVal1 = *pSrc++; - maxVal2 = *pSrc++; - - /* compare for the maximum value */ - if (out < maxVal1) + maxVal = *pSrc++; + if (out < maxVal) { - /* Update the maximum value and its index */ - out = maxVal1; - outIndex = count + 3U; + out = maxVal; + outIndex = index + 3U; } - /* compare for the maximum value */ - if (out < maxVal2) + maxVal = *pSrc++; + if (out < maxVal) { - /* Update the maximum value and its index */ - out = maxVal2; - outIndex = count + 4U; + out = maxVal; + outIndex = index + 4U; } - count += 4U; + index += 4U; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* if (blockSize - 1U) is not multiple of 4 */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = (blockSize - 1U) % 4U; #else - /* Run the below code for Cortex-M0 */ - - q7_t maxVal1, out; /* Temporary variables to store the output value. */ - uint32_t blkCnt, outIndex; /* loop counter */ - - /* Initialise the index value to zero. */ - outIndex = 0U; - /* Load first input value that act as reference value for comparision */ - out = *pSrc++; + /* Initialize blkCnt with number of samples */ blkCnt = (blockSize - 1U); -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { /* Initialize maxVal to the next consecutive values one by one */ - maxVal1 = *pSrc++; + maxVal = *pSrc++; /* compare for the maximum value */ - if (out < maxVal1) + if (out < maxVal) { /* Update the maximum value and it's index */ - out = maxVal1; + out = maxVal; outIndex = blockSize - blkCnt; } - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } @@ -158,5 +144,5 @@ void arm_max_q7( } /** - * @} end of Max group + @} end of Max group */ diff --git a/DSP/Source/StatisticsFunctions/arm_mean_f32.c b/DSP/Source/StatisticsFunctions/arm_mean_f32.c index 85a3b16..63d9652 100644 --- a/DSP/Source/StatisticsFunctions/arm_mean_f32.c +++ b/DSP/Source/StatisticsFunctions/arm_mean_f32.c @@ -3,13 +3,13 @@ * Title: arm_mean_f32.c * Description: Mean value of a floating-point vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,97 +29,138 @@ #include "arm_math.h" /** - * @ingroup groupStats + @ingroup groupStats */ /** - * @defgroup mean Mean - * - * Calculates the mean of the input vector. Mean is defined as the average of the elements in the vector. - * The underlying algorithm is used: - * - * <pre> - * Result = (pSrc[0] + pSrc[1] + pSrc[2] + ... + pSrc[blockSize-1]) / blockSize; - * </pre> - * - * There are separate functions for floating-point, Q31, Q15, and Q7 data types. + @defgroup mean Mean + + Calculates the mean of the input vector. Mean is defined as the average of the elements in the vector. + The underlying algorithm is used: + + <pre> + Result = (pSrc[0] + pSrc[1] + pSrc[2] + ... + pSrc[blockSize-1]) / blockSize; + </pre> + + There are separate functions for floating-point, Q31, Q15, and Q7 data types. */ /** - * @addtogroup mean - * @{ + @addtogroup mean + @{ */ - /** - * @brief Mean value of a floating-point vector. - * @param[in] *pSrc points to the input vector - * @param[in] blockSize length of the input vector - * @param[out] *pResult mean value returned here - * @return none. + @brief Mean value of a floating-point vector. + @param[in] pSrc points to the input vector. + @param[in] blockSize number of samples in input vector. + @param[out] pResult mean value returned here. + @return none */ - +#if defined(ARM_MATH_NEON_EXPERIMENTAL) void arm_mean_f32( - float32_t * pSrc, + const float32_t * pSrc, uint32_t blockSize, float32_t * pResult) { float32_t sum = 0.0f; /* Temporary result storage */ - uint32_t blkCnt; /* loop counter */ + float32x4_t sumV = vdupq_n_f32(0.0f); /* Temporary result storage */ + float32x2_t sumV2; -#if defined (ARM_MATH_DSP) - /* Run the below code for Cortex-M4 and Cortex-M3 */ + uint32_t blkCnt; /* Loop counter */ float32_t in1, in2, in3, in4; + float32x4_t inV; - /*loop Unrolling */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + /* Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ - in1 = *pSrc++; - in2 = *pSrc++; - in3 = *pSrc++; - in4 = *pSrc++; - - sum += in1; - sum += in2; - sum += in3; - sum += in4; - + inV = vld1q_f32(pSrc); + sumV = vaddq_f32(sumV, inV); + + pSrc += 4; /* Decrement the loop counter */ blkCnt--; } + sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV)); + sum = sumV2[0] + sumV2[1]; + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ + blkCnt = blockSize & 3; + + while (blkCnt > 0U) + { + /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ + sum += *pSrc++; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */ + /* Store the result to the destination */ + *pResult = sum / (float32_t) blockSize; +} +#else +void arm_mean_f32( + const float32_t * pSrc, + uint32_t blockSize, + float32_t * pResult) +{ + uint32_t blkCnt; /* Loop counter */ + float32_t sum = 0.0f; /* Temporary result storage */ + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ + sum += *pSrc++; + + sum += *pSrc++; + + sum += *pSrc++; + + sum += *pSrc++; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ sum += *pSrc++; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */ - /* Store the result to the destination */ - *pResult = sum / (float32_t) blockSize; + /* Store result to destination */ + *pResult = (sum / blockSize); } +#endif /* #if defined(ARM_MATH_NEON) */ /** - * @} end of mean group + @} end of mean group */ diff --git a/DSP/Source/StatisticsFunctions/arm_mean_q15.c b/DSP/Source/StatisticsFunctions/arm_mean_q15.c index 7bf55c2..463aa84 100644 --- a/DSP/Source/StatisticsFunctions/arm_mean_q15.c +++ b/DSP/Source/StatisticsFunctions/arm_mean_q15.c @@ -3,13 +3,13 @@ * Title: arm_mean_q15.c * Description: Mean value of a Q15 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,59 +29,55 @@ #include "arm_math.h" /** - * @ingroup groupStats + @ingroup groupStats */ /** - * @addtogroup mean - * @{ + @addtogroup mean + @{ */ - /** - * @brief Mean value of a Q15 vector. - * @param[in] *pSrc points to the input vector - * @param[in] blockSize length of the input vector - * @param[out] *pResult mean value returned here - * @return none. - * - * @details - * <b>Scaling and Overflow Behavior:</b> - * \par - * The function is implemented using a 32-bit internal accumulator. - * The input is represented in 1.15 format and is accumulated in a 32-bit - * accumulator in 17.15 format. - * There is no risk of internal overflow with this approach, and the - * full precision of intermediate result is preserved. - * Finally, the accumulator is saturated and truncated to yield a result of 1.15 format. - * + @brief Mean value of a Q15 vector. + @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult mean value returned here + @return none + + @par Scaling and Overflow Behavior + The function is implemented using a 32-bit internal accumulator. + The input is represented in 1.15 format and is accumulated in a 32-bit + accumulator in 17.15 format. + There is no risk of internal overflow with this approach, and the + full precision of intermediate result is preserved. + Finally, the accumulator is truncated to yield a result of 1.15 format. */ void arm_mean_q15( - q15_t * pSrc, - uint32_t blockSize, - q15_t * pResult) + const q15_t * pSrc, + uint32_t blockSize, + q15_t * pResult) { - q31_t sum = 0; /* Temporary result storage */ - uint32_t blkCnt; /* loop counter */ + uint32_t blkCnt; /* Loop counter */ + q31_t sum = 0; /* Temporary result storage */ -#if defined (ARM_MATH_DSP) - /* Run the below code for Cortex-M4 and Cortex-M3 */ +#if defined (ARM_MATH_LOOPUNROLL) + q31_t in; +#endif - q31_t in; +#if defined (ARM_MATH_LOOPUNROLL) - /*loop Unrolling */ + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ - in = *__SIMD32(pSrc)++; + in = read_q15x2_ia ((q15_t **) &pSrc); sum += ((in << 16U) >> 16U); sum += (in >> 16U); - in = *__SIMD32(pSrc)++; + + in = read_q15x2_ia ((q15_t **) &pSrc); sum += ((in << 16U) >> 16U); sum += (in >> 16U); @@ -89,32 +85,30 @@ void arm_mean_q15( blkCnt--; } - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ sum += *pSrc++; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */ - /* Store the result to the destination */ - *pResult = (q15_t) (sum / (q31_t)blockSize); + /* Store result to destination */ + *pResult = (q15_t) (sum / (int32_t) blockSize); } /** - * @} end of mean group + @} end of mean group */ diff --git a/DSP/Source/StatisticsFunctions/arm_mean_q31.c b/DSP/Source/StatisticsFunctions/arm_mean_q31.c index ea83ced..4b0ed6e 100644 --- a/DSP/Source/StatisticsFunctions/arm_mean_q31.c +++ b/DSP/Source/StatisticsFunctions/arm_mean_q31.c @@ -3,13 +3,13 @@ * Title: arm_mean_q31.c * Description: Mean value of a Q31 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,95 +29,82 @@ #include "arm_math.h" /** - * @ingroup groupStats + @ingroup groupStats */ /** - * @addtogroup mean - * @{ + @addtogroup mean + @{ */ - /** - * @brief Mean value of a Q31 vector. - * @param[in] *pSrc points to the input vector - * @param[in] blockSize length of the input vector - * @param[out] *pResult mean value returned here - * @return none. - * - * @details - * <b>Scaling and Overflow Behavior:</b> - *\par - * The function is implemented using a 64-bit internal accumulator. - * The input is represented in 1.31 format and is accumulated in a 64-bit - * accumulator in 33.31 format. - * There is no risk of internal overflow with this approach, and the - * full precision of intermediate result is preserved. - * Finally, the accumulator is truncated to yield a result of 1.31 format. - * + @brief Mean value of a Q31 vector. + @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult mean value returned here + @return none + + @par Scaling and Overflow Behavior + The function is implemented using a 64-bit internal accumulator. + The input is represented in 1.31 format and is accumulated in a 64-bit + accumulator in 33.31 format. + There is no risk of internal overflow with this approach, and the + full precision of intermediate result is preserved. + Finally, the accumulator is truncated to yield a result of 1.31 format. */ void arm_mean_q31( - q31_t * pSrc, - uint32_t blockSize, - q31_t * pResult) + const q31_t * pSrc, + uint32_t blockSize, + q31_t * pResult) { - q63_t sum = 0; /* Temporary result storage */ - uint32_t blkCnt; /* loop counter */ + uint32_t blkCnt; /* Loop counter */ + q63_t sum = 0; /* Temporary result storage */ -#if defined (ARM_MATH_DSP) - /* Run the below code for Cortex-M4 and Cortex-M3 */ +#if defined (ARM_MATH_LOOPUNROLL) - q31_t in1, in2, in3, in4; - - /*loop Unrolling */ + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ - in1 = *pSrc++; - in2 = *pSrc++; - in3 = *pSrc++; - in4 = *pSrc++; + sum += *pSrc++; - sum += in1; - sum += in2; - sum += in3; - sum += in4; + sum += *pSrc++; + + sum += *pSrc++; + + sum += *pSrc++; /* Decrement the loop counter */ blkCnt--; } - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ sum += *pSrc++; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */ - /* Store the result to the destination */ - *pResult = (q31_t) (sum / (int32_t) blockSize); + /* Store result to destination */ + *pResult = (q31_t) (sum / blockSize); } /** - * @} end of mean group + @} end of mean group */ diff --git a/DSP/Source/StatisticsFunctions/arm_mean_q7.c b/DSP/Source/StatisticsFunctions/arm_mean_q7.c index a7bdfb8..8f52211 100644 --- a/DSP/Source/StatisticsFunctions/arm_mean_q7.c +++ b/DSP/Source/StatisticsFunctions/arm_mean_q7.c @@ -3,13 +3,13 @@ * Title: arm_mean_q7.c * Description: Mean value of a Q7 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,57 +29,51 @@ #include "arm_math.h" /** - * @ingroup groupStats + @ingroup groupStats */ /** - * @addtogroup mean - * @{ + @addtogroup mean + @{ */ - /** - * @brief Mean value of a Q7 vector. - * @param[in] *pSrc points to the input vector - * @param[in] blockSize length of the input vector - * @param[out] *pResult mean value returned here - * @return none. - * - * @details - * <b>Scaling and Overflow Behavior:</b> - * \par - * The function is implemented using a 32-bit internal accumulator. - * The input is represented in 1.7 format and is accumulated in a 32-bit - * accumulator in 25.7 format. - * There is no risk of internal overflow with this approach, and the - * full precision of intermediate result is preserved. - * Finally, the accumulator is truncated to yield a result of 1.7 format. - * + @brief Mean value of a Q7 vector. + @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult mean value returned here + @return none + + @par Scaling and Overflow Behavior + The function is implemented using a 32-bit internal accumulator. + The input is represented in 1.7 format and is accumulated in a 32-bit + accumulator in 25.7 format. + There is no risk of internal overflow with this approach, and the + full precision of intermediate result is preserved. + Finally, the accumulator is truncated to yield a result of 1.7 format. */ void arm_mean_q7( - q7_t * pSrc, - uint32_t blockSize, - q7_t * pResult) + const q7_t * pSrc, + uint32_t blockSize, + q7_t * pResult) { - q31_t sum = 0; /* Temporary result storage */ - uint32_t blkCnt; /* loop counter */ + uint32_t blkCnt; /* Loop counter */ + q31_t sum = 0; /* Temporary result storage */ -#if defined (ARM_MATH_DSP) - /* Run the below code for Cortex-M4 and Cortex-M3 */ +#if defined (ARM_MATH_LOOPUNROLL) + q31_t in; +#endif - q31_t in; +#if defined (ARM_MATH_LOOPUNROLL) - /*loop Unrolling */ + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ - in = *__SIMD32(pSrc)++; - + in = read_q7x4_ia ((q7_t **) &pSrc); sum += ((in << 24U) >> 24U); sum += ((in << 16U) >> 24U); sum += ((in << 8U) >> 24U); @@ -89,32 +83,30 @@ void arm_mean_q7( blkCnt--; } - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ sum += *pSrc++; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */ - /* Store the result to the destination */ + /* Store result to destination */ *pResult = (q7_t) (sum / (int32_t) blockSize); } /** - * @} end of mean group + @} end of mean group */ diff --git a/DSP/Source/StatisticsFunctions/arm_min_f32.c b/DSP/Source/StatisticsFunctions/arm_min_f32.c index 858b0a2..6e9ff4b 100644 --- a/DSP/Source/StatisticsFunctions/arm_min_f32.c +++ b/DSP/Source/StatisticsFunctions/arm_min_f32.c @@ -3,13 +3,13 @@ * Title: arm_min_f32.c * Description: Minimum value of a floating-point vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -27,136 +27,233 @@ */ #include "arm_math.h" +#include <limits.h> /** - * @ingroup groupStats + @ingroup groupStats */ /** - * @defgroup Min Minimum - * - * Computes the minimum value of an array of data. - * The function returns both the minimum value and its position within the array. - * There are separate functions for floating-point, Q31, Q15, and Q7 data types. + @defgroup Min Minimum + + Computes the minimum value of an array of data. + The function returns both the minimum value and its position within the array. + There are separate functions for floating-point, Q31, Q15, and Q7 data types. */ /** - * @addtogroup Min - * @{ + @addtogroup Min + @{ */ - /** - * @brief Minimum value of a floating-point vector. - * @param[in] *pSrc points to the input vector - * @param[in] blockSize length of the input vector - * @param[out] *pResult minimum value returned here - * @param[out] *pIndex index of minimum value returned here - * @return none. + @brief Minimum value of a floating-point vector. + @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult minimum value returned here + @param[out] pIndex index of minimum value returned here + @return none */ - +#if defined(ARM_MATH_NEON) void arm_min_f32( - float32_t * pSrc, + const float32_t * pSrc, uint32_t blockSize, float32_t * pResult, uint32_t * pIndex) { -#if defined (ARM_MATH_DSP) - /* Run the below code for Cortex-M4 and Cortex-M3 */ - - float32_t minVal1, minVal2, out; /* Temporary variables to store the output value. */ + float32_t maxVal1, maxVal2, out; /* Temporary variables to store the output value. */ uint32_t blkCnt, outIndex, count; /* loop counter */ + float32x4_t outV, srcV; + float32x2_t outV2; + + uint32x4_t idxV; + uint32x4_t maxIdx={ULONG_MAX,ULONG_MAX,ULONG_MAX,ULONG_MAX}; + uint32x4_t index={4,5,6,7}; + uint32x4_t delta={4,4,4,4}; + uint32x4_t countV={0,1,2,3}; + uint32x2_t countV2; + /* Initialise the count value. */ count = 0U; + /* Initialise the index value to zero. */ outIndex = 0U; + + /* Load first input value that act as reference value for comparison */ + if (blockSize <= 3) + { + out = *pSrc++; + + blkCnt = blockSize - 1; + + while (blkCnt > 0U) + { + /* Initialize maxVal to the next consecutive values one by one */ + maxVal1 = *pSrc++; + + /* compare for the maximum value */ + if (out > maxVal1) + { + /* Update the maximum value and it's index */ + out = maxVal1; + outIndex = blockSize - blkCnt; + } + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + outV = vld1q_f32(pSrc); + pSrc += 4; + + /* Compute 4 outputs at a time */ + blkCnt = (blockSize - 4 ) >> 2U; + + while (blkCnt > 0U) + { + srcV = vld1q_f32(pSrc); + pSrc += 4; + + idxV = vcltq_f32(srcV, outV); + outV = vbslq_f32(idxV, srcV, outV ); + countV = vbslq_u32(idxV, index,countV ); + + index = vaddq_u32(index,delta); + + /* Decrement the loop counter */ + blkCnt--; + } + + outV2 = vpmin_f32(vget_low_f32(outV),vget_high_f32(outV)); + outV2 = vpmin_f32(outV2,outV2); + out = outV2[0]; + + idxV = vceqq_f32(outV, vdupq_n_f32(out)); + countV = vbslq_u32(idxV, countV,maxIdx); + + countV2 = vpmin_u32(vget_low_u32(countV),vget_high_u32(countV)); + countV2 = vpmin_u32(countV2,countV2); + outIndex = countV2[0]; + + /* if (blockSize - 1U) is not multiple of 4 */ + blkCnt = (blockSize - 4 ) % 4U; + + while (blkCnt > 0U) + { + /* Initialize maxVal to the next consecutive values one by one */ + maxVal1 = *pSrc++; + + /* compare for the maximum value */ + if (out > maxVal1) + { + /* Update the maximum value and it's index */ + out = maxVal1; + outIndex = blockSize - blkCnt ; + } + + /* Decrement the loop counter */ + blkCnt--; + } + } + + /* Store the maximum value and it's index into destination pointers */ + *pResult = out; + *pIndex = outIndex; +} +#else +void arm_min_f32( + const float32_t * pSrc, + uint32_t blockSize, + float32_t * pResult, + uint32_t * pIndex) +{ + float32_t minVal, out; /* Temporary variables to store the output value. */ + uint32_t blkCnt, outIndex; /* Loop counter */ + +#if defined (ARM_MATH_LOOPUNROLL) + uint32_t index; /* index of maximum value */ +#endif + + /* Initialise index value to zero. */ + outIndex = 0U; + /* Load first input value that act as reference value for comparision */ out = *pSrc++; - /* Loop unrolling */ +#if defined (ARM_MATH_LOOPUNROLL) + /* Initialise index of maximum value. */ + index = 0U; + + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = (blockSize - 1U) >> 2U; while (blkCnt > 0U) { - /* Initialize minVal to the next consecutive values one by one */ - minVal1 = *pSrc++; - minVal2 = *pSrc++; + /* Initialize minVal to next consecutive values one by one */ + minVal = *pSrc++; /* compare for the minimum value */ - if (out > minVal1) + if (out > minVal) { - /* Update the minimum value and its index */ - out = minVal1; - outIndex = count + 1U; + /* Update the minimum value and it's index */ + out = minVal; + outIndex = index + 1U; } - /* compare for the minimum value */ - if (out > minVal2) + minVal = *pSrc++; + if (out > minVal) { - /* Update the minimum value and its index */ - out = minVal2; - outIndex = count + 2U; + out = minVal; + outIndex = index + 2U; } - /* Initialize minVal to the next consecutive values one by one */ - minVal1 = *pSrc++; - minVal2 = *pSrc++; - - /* compare for the minimum value */ - if (out > minVal1) + minVal = *pSrc++; + if (out > minVal) { - /* Update the minimum value and its index */ - out = minVal1; - outIndex = count + 3U; + out = minVal; + outIndex = index + 3U; } - /* compare for the minimum value */ - if (out > minVal2) + minVal = *pSrc++; + if (out > minVal) { - /* Update the minimum value and its index */ - out = minVal2; - outIndex = count + 4U; + out = minVal; + outIndex = index + 4U; } - count += 4U; + index += 4U; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* if (blockSize - 1U) is not multiple of 4 */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = (blockSize - 1U) % 4U; #else - /* Run the below code for Cortex-M0 */ - - float32_t minVal1, out; /* Temporary variables to store the output value. */ - uint32_t blkCnt, outIndex; /* loop counter */ - - /* Initialise the index value to zero. */ - outIndex = 0U; - /* Load first input value that act as reference value for comparision */ - out = *pSrc++; + /* Initialize blkCnt with number of samples */ blkCnt = (blockSize - 1U); -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { /* Initialize minVal to the next consecutive values one by one */ - minVal1 = *pSrc++; + minVal = *pSrc++; /* compare for the minimum value */ - if (out > minVal1) + if (out > minVal) { /* Update the minimum value and it's index */ - out = minVal1; + out = minVal; outIndex = blockSize - blkCnt; } - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } @@ -164,7 +261,8 @@ void arm_min_f32( *pResult = out; *pIndex = outIndex; } +#endif /* #if defined(ARM_MATH_NEON) */ /** - * @} end of Min group + @} end of Min group */ diff --git a/DSP/Source/StatisticsFunctions/arm_min_q15.c b/DSP/Source/StatisticsFunctions/arm_min_q15.c index fdc32b7..9450383 100644 --- a/DSP/Source/StatisticsFunctions/arm_min_q15.c +++ b/DSP/Source/StatisticsFunctions/arm_min_q15.c @@ -3,13 +3,13 @@ * Title: arm_min_q15.c * Description: Minimum value of a Q15 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,127 +29,113 @@ #include "arm_math.h" /** - * @ingroup groupStats + @ingroup groupStats */ /** - * @addtogroup Min - * @{ + @addtogroup Min + @{ */ - /** - * @brief Minimum value of a Q15 vector. - * @param[in] *pSrc points to the input vector - * @param[in] blockSize length of the input vector - * @param[out] *pResult minimum value returned here - * @param[out] *pIndex index of minimum value returned here - * @return none. + @brief Minimum value of a Q15 vector. + @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult minimum value returned here + @param[out] pIndex index of minimum value returned here + @return none */ void arm_min_q15( - q15_t * pSrc, - uint32_t blockSize, - q15_t * pResult, - uint32_t * pIndex) + const q15_t * pSrc, + uint32_t blockSize, + q15_t * pResult, + uint32_t * pIndex) { -#if defined (ARM_MATH_DSP) - /* Run the below code for Cortex-M4 and Cortex-M3 */ + q15_t minVal, out; /* Temporary variables to store the output value. */ + uint32_t blkCnt, outIndex; /* Loop counter */ - q15_t minVal1, minVal2, out; /* Temporary variables to store the output value. */ - uint32_t blkCnt, outIndex, count; /* loop counter */ +#if defined (ARM_MATH_LOOPUNROLL) + uint32_t index; /* index of maximum value */ +#endif - /* Initialise the count value. */ - count = 0U; - /* Initialise the index value to zero. */ + /* Initialise index value to zero. */ outIndex = 0U; /* Load first input value that act as reference value for comparision */ out = *pSrc++; - /* Loop unrolling */ +#if defined (ARM_MATH_LOOPUNROLL) + /* Initialise index of maximum value. */ + index = 0U; + + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = (blockSize - 1U) >> 2U; while (blkCnt > 0U) { - /* Initialize minVal to the next consecutive values one by one */ - minVal1 = *pSrc++; - minVal2 = *pSrc++; + /* Initialize minVal to next consecutive values one by one */ + minVal = *pSrc++; /* compare for the minimum value */ - if (out > minVal1) + if (out > minVal) { - /* Update the minimum value and its index */ - out = minVal1; - outIndex = count + 1U; + /* Update the minimum value and it's index */ + out = minVal; + outIndex = index + 1U; } - /* compare for the minimum value */ - if (out > minVal2) + minVal = *pSrc++; + if (out > minVal) { - /* Update the minimum value and its index */ - out = minVal2; - outIndex = count + 2U; + out = minVal; + outIndex = index + 2U; } - /* Initialize minVal to the next consecutive values one by one */ - minVal1 = *pSrc++; - minVal2 = *pSrc++; - - /* compare for the minimum value */ - if (out > minVal1) + minVal = *pSrc++; + if (out > minVal) { - /* Update the minimum value and its index */ - out = minVal1; - outIndex = count + 3U; + out = minVal; + outIndex = index + 3U; } - /* compare for the minimum value */ - if (out > minVal2) + minVal = *pSrc++; + if (out > minVal) { - /* Update the minimum value and its index */ - out = minVal2; - outIndex = count + 4U; + out = minVal; + outIndex = index + 4U; } - count += 4U; + index += 4U; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* if (blockSize - 1U) is not multiple of 4 */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = (blockSize - 1U) % 4U; #else - /* Run the below code for Cortex-M0 */ - - q15_t minVal1, out; /* Temporary variables to store the output value. */ - uint32_t blkCnt, outIndex; /* loop counter */ - - /* Initialise the index value to zero. */ - outIndex = 0U; - /* Load first input value that act as reference value for comparision */ - out = *pSrc++; + /* Initialize blkCnt with number of samples */ blkCnt = (blockSize - 1U); -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { /* Initialize minVal to the next consecutive values one by one */ - minVal1 = *pSrc++; + minVal = *pSrc++; /* compare for the minimum value */ - if (out > minVal1) + if (out > minVal) { /* Update the minimum value and it's index */ - out = minVal1; + out = minVal; outIndex = blockSize - blkCnt; } - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } @@ -159,5 +145,5 @@ void arm_min_q15( } /** - * @} end of Min group + @} end of Min group */ diff --git a/DSP/Source/StatisticsFunctions/arm_min_q31.c b/DSP/Source/StatisticsFunctions/arm_min_q31.c index fc4c155..e25eb47 100644 --- a/DSP/Source/StatisticsFunctions/arm_min_q31.c +++ b/DSP/Source/StatisticsFunctions/arm_min_q31.c @@ -3,13 +3,13 @@ * Title: arm_min_q31.c * Description: Minimum value of a Q31 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,127 +29,113 @@ #include "arm_math.h" /** - * @ingroup groupStats + @ingroup groupStats */ /** - * @addtogroup Min - * @{ + @addtogroup Min + @{ */ - /** - * @brief Minimum value of a Q31 vector. - * @param[in] *pSrc points to the input vector - * @param[in] blockSize length of the input vector - * @param[out] *pResult minimum value returned here - * @param[out] *pIndex index of minimum value returned here - * @return none. + @brief Minimum value of a Q31 vector. + @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult minimum value returned here + @param[out] pIndex index of minimum value returned here + @return none */ void arm_min_q31( - q31_t * pSrc, - uint32_t blockSize, - q31_t * pResult, - uint32_t * pIndex) + const q31_t * pSrc, + uint32_t blockSize, + q31_t * pResult, + uint32_t * pIndex) { -#if defined (ARM_MATH_DSP) - /* Run the below code for Cortex-M4 and Cortex-M3 */ + q31_t minVal, out; /* Temporary variables to store the output value. */ + uint32_t blkCnt, outIndex; /* Loop counter */ - q31_t minVal1, minVal2, out; /* Temporary variables to store the output value. */ - uint32_t blkCnt, outIndex, count; /* loop counter */ +#if defined (ARM_MATH_LOOPUNROLL) + uint32_t index; /* index of maximum value */ +#endif - /* Initialise the count value. */ - count = 0U; - /* Initialise the index value to zero. */ + /* Initialise index value to zero. */ outIndex = 0U; /* Load first input value that act as reference value for comparision */ out = *pSrc++; - /* Loop unrolling */ +#if defined (ARM_MATH_LOOPUNROLL) + /* Initialise index of maximum value. */ + index = 0U; + + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = (blockSize - 1U) >> 2U; while (blkCnt > 0U) { - /* Initialize minVal to the next consecutive values one by one */ - minVal1 = *pSrc++; - minVal2 = *pSrc++; + /* Initialize minVal to next consecutive values one by one */ + minVal = *pSrc++; /* compare for the minimum value */ - if (out > minVal1) + if (out > minVal) { - /* Update the minimum value and its index */ - out = minVal1; - outIndex = count + 1U; + /* Update the minimum value and it's index */ + out = minVal; + outIndex = index + 1U; } - /* compare for the minimum value */ - if (out > minVal2) + minVal = *pSrc++; + if (out > minVal) { - /* Update the minimum value and its index */ - out = minVal2; - outIndex = count + 2U; + out = minVal; + outIndex = index + 2U; } - /* Initialize minVal to the next consecutive values one by one */ - minVal1 = *pSrc++; - minVal2 = *pSrc++; - - /* compare for the minimum value */ - if (out > minVal1) + minVal = *pSrc++; + if (out > minVal) { - /* Update the minimum value and its index */ - out = minVal1; - outIndex = count + 3U; + out = minVal; + outIndex = index + 3U; } - /* compare for the minimum value */ - if (out > minVal2) + minVal = *pSrc++; + if (out > minVal) { - /* Update the minimum value and its index */ - out = minVal2; - outIndex = count + 4U; + out = minVal; + outIndex = index + 4U; } - count += 4U; + index += 4U; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* if (blockSize - 1U) is not multiple of 4 */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = (blockSize - 1U) % 4U; #else - /* Run the below code for Cortex-M0 */ - - q31_t minVal1, out; /* Temporary variables to store the output value. */ - uint32_t blkCnt, outIndex; /* loop counter */ - - /* Initialise the index value to zero. */ - outIndex = 0U; - /* Load first input value that act as reference value for comparision */ - out = *pSrc++; + /* Initialize blkCnt with number of samples */ blkCnt = (blockSize - 1U); -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { /* Initialize minVal to the next consecutive values one by one */ - minVal1 = *pSrc++; + minVal = *pSrc++; /* compare for the minimum value */ - if (out > minVal1) + if (out > minVal) { /* Update the minimum value and it's index */ - out = minVal1; + out = minVal; outIndex = blockSize - blkCnt; } - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } @@ -159,5 +145,5 @@ void arm_min_q31( } /** - * @} end of Min group + @} end of Min group */ diff --git a/DSP/Source/StatisticsFunctions/arm_min_q7.c b/DSP/Source/StatisticsFunctions/arm_min_q7.c index 50362e6..2b171f0 100644 --- a/DSP/Source/StatisticsFunctions/arm_min_q7.c +++ b/DSP/Source/StatisticsFunctions/arm_min_q7.c @@ -3,13 +3,13 @@ * Title: arm_min_q7.c * Description: Minimum value of a Q7 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,127 +29,113 @@ #include "arm_math.h" /** - * @ingroup groupStats + @ingroup groupStats */ /** - * @addtogroup Min - * @{ + @addtogroup Min + @{ */ - /** - * @brief Minimum value of a Q7 vector. - * @param[in] *pSrc points to the input vector - * @param[in] blockSize length of the input vector - * @param[out] *pResult minimum value returned here - * @param[out] *pIndex index of minimum value returned here - * @return none. + @brief Minimum value of a Q7 vector. + @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult minimum value returned here + @param[out] pIndex index of minimum value returned here + @return none */ void arm_min_q7( - q7_t * pSrc, - uint32_t blockSize, - q7_t * pResult, - uint32_t * pIndex) + const q7_t * pSrc, + uint32_t blockSize, + q7_t * pResult, + uint32_t * pIndex) { -#if defined (ARM_MATH_DSP) - /* Run the below code for Cortex-M4 and Cortex-M3 */ + q7_t minVal, out; /* Temporary variables to store the output value. */ + uint32_t blkCnt, outIndex; /* Loop counter */ - q7_t minVal1, minVal2, out; /* Temporary variables to store the output value. */ - uint32_t blkCnt, outIndex, count; /* loop counter */ +#if defined (ARM_MATH_LOOPUNROLL) + uint32_t index; /* index of maximum value */ +#endif - /* Initialise the count value. */ - count = 0U; - /* Initialise the index value to zero. */ + /* Initialise index value to zero. */ outIndex = 0U; /* Load first input value that act as reference value for comparision */ out = *pSrc++; - /* Loop unrolling */ +#if defined (ARM_MATH_LOOPUNROLL) + /* Initialise index of maximum value. */ + index = 0U; + + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = (blockSize - 1U) >> 2U; while (blkCnt > 0U) { - /* Initialize minVal to the next consecutive values one by one */ - minVal1 = *pSrc++; - minVal2 = *pSrc++; + /* Initialize minVal to next consecutive values one by one */ + minVal = *pSrc++; /* compare for the minimum value */ - if (out > minVal1) + if (out > minVal) { - /* Update the minimum value and its index */ - out = minVal1; - outIndex = count + 1U; + /* Update the minimum value and it's index */ + out = minVal; + outIndex = index + 1U; } - /* compare for the minimum value */ - if (out > minVal2) + minVal = *pSrc++; + if (out > minVal) { - /* Update the minimum value and its index */ - out = minVal2; - outIndex = count + 2U; + out = minVal; + outIndex = index + 2U; } - /* Initialize minVal to the next consecutive values one by one */ - minVal1 = *pSrc++; - minVal2 = *pSrc++; - - /* compare for the minimum value */ - if (out > minVal1) + minVal = *pSrc++; + if (out > minVal) { - /* Update the minimum value and its index */ - out = minVal1; - outIndex = count + 3U; + out = minVal; + outIndex = index + 3U; } - /* compare for the minimum value */ - if (out > minVal2) + minVal = *pSrc++; + if (out > minVal) { - /* Update the minimum value and its index */ - out = minVal2; - outIndex = count + 4U; + out = minVal; + outIndex = index + 4U; } - count += 4U; + index += 4U; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* if (blockSize - 1U) is not multiple of 4 */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = (blockSize - 1U) % 4U; #else - /* Run the below code for Cortex-M0 */ - - q7_t minVal1, out; /* Temporary variables to store the output value. */ - uint32_t blkCnt, outIndex; /* loop counter */ - - /* Initialise the index value to zero. */ - outIndex = 0U; - /* Load first input value that act as reference value for comparision */ - out = *pSrc++; + /* Initialize blkCnt with number of samples */ blkCnt = (blockSize - 1U); -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { /* Initialize minVal to the next consecutive values one by one */ - minVal1 = *pSrc++; + minVal = *pSrc++; /* compare for the minimum value */ - if (out > minVal1) + if (out > minVal) { /* Update the minimum value and it's index */ - out = minVal1; + out = minVal; outIndex = blockSize - blkCnt; } - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } @@ -159,5 +145,5 @@ void arm_min_q7( } /** - * @} end of Min group + @} end of Min group */ diff --git a/DSP/Source/StatisticsFunctions/arm_power_f32.c b/DSP/Source/StatisticsFunctions/arm_power_f32.c index 1426735..a4825a5 100644 --- a/DSP/Source/StatisticsFunctions/arm_power_f32.c +++ b/DSP/Source/StatisticsFunctions/arm_power_f32.c @@ -3,13 +3,13 @@ * Title: arm_power_f32.c * Description: Sum of the squares of the elements of a floating-point vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,40 +29,37 @@ #include "arm_math.h" /** - * @ingroup groupStats + @ingroup groupStats */ /** - * @defgroup power Power - * - * Calculates the sum of the squares of the elements in the input vector. - * The underlying algorithm is used: - * - * <pre> - * Result = pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + pSrc[2] * pSrc[2] + ... + pSrc[blockSize-1] * pSrc[blockSize-1]; - * </pre> - * - * There are separate functions for floating point, Q31, Q15, and Q7 data types. + @defgroup power Power + + Calculates the sum of the squares of the elements in the input vector. + The underlying algorithm is used: + + <pre> + Result = pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + pSrc[2] * pSrc[2] + ... + pSrc[blockSize-1] * pSrc[blockSize-1]; + </pre> + + There are separate functions for floating point, Q31, Q15, and Q7 data types. */ /** - * @addtogroup power - * @{ + @addtogroup power + @{ */ - /** - * @brief Sum of the squares of the elements of a floating-point vector. - * @param[in] *pSrc points to the input vector - * @param[in] blockSize length of the input vector - * @param[out] *pResult sum of the squares value returned here - * @return none. - * + @brief Sum of the squares of the elements of a floating-point vector. + @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult sum of the squares value returned here + @return none */ - - +#if defined(ARM_MATH_NEON) void arm_power_f32( - float32_t * pSrc, + const float32_t * pSrc, uint32_t blockSize, float32_t * pResult) { @@ -70,60 +67,109 @@ void arm_power_f32( float32_t in; /* Temporary variable to store input value */ uint32_t blkCnt; /* loop counter */ -#if defined (ARM_MATH_DSP) - /* Run the below code for Cortex-M4 and Cortex-M3 */ + float32x4_t sumV = vdupq_n_f32(0.0f); /* Temporary result storage */ + float32x2_t sumV2; + float32x4_t inV; - /*loop Unrolling */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + /* Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */ /* Compute Power and then store the result in a temporary variable, sum. */ + inV = vld1q_f32(pSrc); + sumV = vmlaq_f32(sumV, inV, inV); + pSrc += 4; + + /* Decrement the loop counter */ + blkCnt--; + } + sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV)); + sum = sumV2[0] + sumV2[1]; + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x4U; + + while (blkCnt > 0U) + { + /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */ + /* compute power and then store the result in a temporary variable, sum. */ + in = *pSrc++; + sum += in * in; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Store the result to the destination */ + *pResult = sum; +} +#else +void arm_power_f32( + const float32_t * pSrc, + uint32_t blockSize, + float32_t * pResult) +{ + uint32_t blkCnt; /* Loop counter */ + float32_t sum = 0.0f; /* Temporary result storage */ + float32_t in; /* Temporary variable to store input value */ + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + + /* Compute Power and store result in a temporary variable, sum. */ in = *pSrc++; sum += in * in; + in = *pSrc++; sum += in * in; + in = *pSrc++; sum += in * in; + in = *pSrc++; sum += in * in; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; - #else - /* Run the below code for Cortex-M0 */ - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; -#endif /* #if defined (ARM_MATH_DSP) */ - +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { - /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */ - /* compute power and then store the result in a temporary variable, sum. */ + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + + /* Compute Power and store result in a temporary variable, sum. */ in = *pSrc++; sum += in * in; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* Store the result to the destination */ + /* Store result to destination */ *pResult = sum; } +#endif /* #if defined(ARM_MATH_NEON) */ /** - * @} end of power group + @} end of power group */ diff --git a/DSP/Source/StatisticsFunctions/arm_power_q15.c b/DSP/Source/StatisticsFunctions/arm_power_q15.c index 6d95f4d..12f524d 100644 --- a/DSP/Source/StatisticsFunctions/arm_power_q15.c +++ b/DSP/Source/StatisticsFunctions/arm_power_q15.c @@ -3,13 +3,13 @@ * Title: arm_power_q15.c * Description: Sum of the squares of the elements of a Q15 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,110 +29,104 @@ #include "arm_math.h" /** - * @ingroup groupStats + @ingroup groupStats */ /** - * @addtogroup power - * @{ + @addtogroup power + @{ */ /** - * @brief Sum of the squares of the elements of a Q15 vector. - * @param[in] *pSrc points to the input vector - * @param[in] blockSize length of the input vector - * @param[out] *pResult sum of the squares value returned here - * @return none. - * - * @details - * <b>Scaling and Overflow Behavior:</b> - * - * \par - * The function is implemented using a 64-bit internal accumulator. - * The input is represented in 1.15 format. - * Intermediate multiplication yields a 2.30 format, and this - * result is added without saturation to a 64-bit accumulator in 34.30 format. - * With 33 guard bits in the accumulator, there is no risk of overflow, and the - * full precision of the intermediate multiplication is preserved. - * Finally, the return result is in 34.30 format. - * + @brief Sum of the squares of the elements of a Q15 vector. + @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult sum of the squares value returned here + @return none + + @par Scaling and Overflow Behavior + The function is implemented using a 64-bit internal accumulator. + The input is represented in 1.15 format. + Intermediate multiplication yields a 2.30 format, and this + result is added without saturation to a 64-bit accumulator in 34.30 format. + With 33 guard bits in the accumulator, there is no risk of overflow, and the + full precision of the intermediate multiplication is preserved. + Finally, the return result is in 34.30 format. */ void arm_power_q15( - q15_t * pSrc, - uint32_t blockSize, - q63_t * pResult) + const q15_t * pSrc, + uint32_t blockSize, + q63_t * pResult) { - q63_t sum = 0; /* Temporary result storage */ - -#if defined (ARM_MATH_DSP) - /* Run the below code for Cortex-M4 and Cortex-M3 */ + uint32_t blkCnt; /* Loop counter */ + q63_t sum = 0; /* Temporary result storage */ + q15_t in; /* Temporary variable to store input value */ - q31_t in32; /* Temporary variable to store input value */ - q15_t in16; /* Temporary variable to store input value */ - uint32_t blkCnt; /* loop counter */ +#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP) + q31_t in32; /* Temporary variable to store packed input value */ +#endif +#if defined (ARM_MATH_LOOPUNROLL) - /* loop Unrolling */ + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { - /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */ - /* Compute Power and then store the result in a temporary variable, sum. */ - in32 = *__SIMD32(pSrc)++; + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + + /* Compute Power and store result in a temporary variable, sum. */ +#if defined (ARM_MATH_DSP) + in32 = read_q15x2_ia ((q15_t **) &pSrc); sum = __SMLALD(in32, in32, sum); - in32 = *__SIMD32(pSrc)++; + + in32 = read_q15x2_ia ((q15_t **) &pSrc); sum = __SMLALD(in32, in32, sum); +#else + in = *pSrc++; + sum += ((q31_t) in * in); - /* Decrement the loop counter */ - blkCnt--; - } + in = *pSrc++; + sum += ((q31_t) in * in); - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ - blkCnt = blockSize % 0x4U; + in = *pSrc++; + sum += ((q31_t) in * in); - while (blkCnt > 0U) - { - /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */ - /* Compute Power and then store the result in a temporary variable, sum. */ - in16 = *pSrc++; - sum = __SMLALD(in16, in16, sum); + in = *pSrc++; + sum += ((q31_t) in * in); +#endif /* #if defined (ARM_MATH_DSP) */ - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } -#else - /* Run the below code for Cortex-M0 */ - - q15_t in; /* Temporary variable to store input value */ - uint32_t blkCnt; /* loop counter */ + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; +#else - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + while (blkCnt > 0U) { - /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */ - /* Compute Power and then store the result in a temporary variable, sum. */ + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + + /* Compute Power and store result in a temporary variable, sum. */ in = *pSrc++; sum += ((q31_t) in * in); - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } -#endif /* #if defined (ARM_MATH_DSP) */ - - /* Store the results in 34.30 format */ + /* Store result in 34.30 format */ *pResult = sum; } /** - * @} end of power group + @} end of power group */ diff --git a/DSP/Source/StatisticsFunctions/arm_power_q31.c b/DSP/Source/StatisticsFunctions/arm_power_q31.c index 16be249..1e193b3 100644 --- a/DSP/Source/StatisticsFunctions/arm_power_q31.c +++ b/DSP/Source/StatisticsFunctions/arm_power_q31.c @@ -3,13 +3,13 @@ * Title: arm_power_q31.c * Description: Sum of the squares of the elements of a Q31 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,58 +29,51 @@ #include "arm_math.h" /** - * @ingroup groupStats + @ingroup groupStats */ /** - * @addtogroup power - * @{ + @addtogroup power + @{ */ /** - * @brief Sum of the squares of the elements of a Q31 vector. - * @param[in] *pSrc points to the input vector - * @param[in] blockSize length of the input vector - * @param[out] *pResult sum of the squares value returned here - * @return none. - * - * @details - * <b>Scaling and Overflow Behavior:</b> - * - * \par - * The function is implemented using a 64-bit internal accumulator. - * The input is represented in 1.31 format. - * Intermediate multiplication yields a 2.62 format, and this - * result is truncated to 2.48 format by discarding the lower 14 bits. - * The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format. - * With 15 guard bits in the accumulator, there is no risk of overflow, and the - * full precision of the intermediate multiplication is preserved. - * Finally, the return result is in 16.48 format. - * + @brief Sum of the squares of the elements of a Q31 vector. + @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult sum of the squares value returned here + @return none + + @par Scaling and Overflow Behavior + The function is implemented using a 64-bit internal accumulator. + The input is represented in 1.31 format. + Intermediate multiplication yields a 2.62 format, and this + result is truncated to 2.48 format by discarding the lower 14 bits. + The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format. + With 15 guard bits in the accumulator, there is no risk of overflow, and the + full precision of the intermediate multiplication is preserved. + Finally, the return result is in 16.48 format. */ void arm_power_q31( - q31_t * pSrc, - uint32_t blockSize, - q63_t * pResult) + const q31_t * pSrc, + uint32_t blockSize, + q63_t * pResult) { - q63_t sum = 0; /* Temporary result storage */ - q31_t in; - uint32_t blkCnt; /* loop counter */ + uint32_t blkCnt; /* Loop counter */ + q63_t sum = 0; /* Temporary result storage */ + q31_t in; /* Temporary variable to store input value */ +#if defined (ARM_MATH_LOOPUNROLL) -#if defined (ARM_MATH_DSP) - /* Run the below code for Cortex-M4 and Cortex-M3 */ - - /*loop Unrolling */ + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { - /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */ - /* Compute Power then shift intermediate results by 14 bits to maintain 16.48 format and then store the result in a temporary variable sum, providing 15 guard bits. */ + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + + /* Compute Power then shift intermediate results by 14 bits to maintain 16.48 format and store result in a temporary variable sum, providing 15 guard bits. */ in = *pSrc++; sum += ((q63_t) in * in) >> 14U; @@ -93,37 +86,36 @@ void arm_power_q31( in = *pSrc++; sum += ((q63_t) in * in) >> 14U; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { - /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */ - /* Compute Power and then store the result in a temporary variable, sum. */ + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + + /* Compute Power and store result in a temporary variable, sum. */ in = *pSrc++; sum += ((q63_t) in * in) >> 14U; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* Store the results in 16.48 format */ + /* Store results in 16.48 format */ *pResult = sum; } /** - * @} end of power group + @} end of power group */ diff --git a/DSP/Source/StatisticsFunctions/arm_power_q7.c b/DSP/Source/StatisticsFunctions/arm_power_q7.c index 24306cd..47405cd 100644 --- a/DSP/Source/StatisticsFunctions/arm_power_q7.c +++ b/DSP/Source/StatisticsFunctions/arm_power_q7.c @@ -3,13 +3,13 @@ * Title: arm_power_q7.c * Description: Sum of the squares of the elements of a Q7 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,99 +29,108 @@ #include "arm_math.h" /** - * @ingroup groupStats + @ingroup groupStats */ /** - * @addtogroup power - * @{ + @addtogroup power + @{ */ /** - * @brief Sum of the squares of the elements of a Q7 vector. - * @param[in] *pSrc points to the input vector - * @param[in] blockSize length of the input vector - * @param[out] *pResult sum of the squares value returned here - * @return none. - * - * @details - * <b>Scaling and Overflow Behavior:</b> - * - * \par - * The function is implemented using a 32-bit internal accumulator. - * The input is represented in 1.7 format. - * Intermediate multiplication yields a 2.14 format, and this - * result is added without saturation to an accumulator in 18.14 format. - * With 17 guard bits in the accumulator, there is no risk of overflow, and the - * full precision of the intermediate multiplication is preserved. - * Finally, the return result is in 18.14 format. - * + @brief Sum of the squares of the elements of a Q7 vector. + @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult sum of the squares value returned here + @return none + + @par Scaling and Overflow Behavior + The function is implemented using a 32-bit internal accumulator. + The input is represented in 1.7 format. + Intermediate multiplication yields a 2.14 format, and this + result is added without saturation to an accumulator in 18.14 format. + With 17 guard bits in the accumulator, there is no risk of overflow, and the + full precision of the intermediate multiplication is preserved. + Finally, the return result is in 18.14 format. */ void arm_power_q7( - q7_t * pSrc, - uint32_t blockSize, - q31_t * pResult) + const q7_t * pSrc, + uint32_t blockSize, + q31_t * pResult) { - q31_t sum = 0; /* Temporary result storage */ - q7_t in; /* Temporary variable to store input */ - uint32_t blkCnt; /* loop counter */ + uint32_t blkCnt; /* Loop counter */ + q31_t sum = 0; /* Temporary result storage */ + q7_t in; /* Temporary variable to store input value */ -#if defined (ARM_MATH_DSP) - /* Run the below code for Cortex-M4 and Cortex-M3 */ +#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP) + q31_t in32; /* Temporary variable to store packed input value */ + q31_t in1, in2; /* Temporary variables to store input value */ +#endif - q31_t input1; /* Temporary variable to store packed input */ - q31_t in1, in2; /* Temporary variables to store input */ +#if defined (ARM_MATH_LOOPUNROLL) - /*loop Unrolling */ + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { - /* Reading two inputs of pSrc vector and packing */ - input1 = *__SIMD32(pSrc)++; + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + + /* Compute Power and store result in a temporary variable, sum. */ +#if defined (ARM_MATH_DSP) + in32 = read_q7x4_ia ((q7_t **) &pSrc); - in1 = __SXTB16(__ROR(input1, 8)); - in2 = __SXTB16(input1); + in1 = __SXTB16(__ROR(in32, 8)); + in2 = __SXTB16(in32); - /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */ /* calculate power and accumulate to accumulator */ sum = __SMLAD(in1, in1, sum); sum = __SMLAD(in2, in2, sum); +#else + in = *pSrc++; + sum += ((q15_t) in * in); - /* Decrement the loop counter */ + in = *pSrc++; + sum += ((q15_t) in * in); + + in = *pSrc++; + sum += ((q15_t) in * in); + + in = *pSrc++; + sum += ((q15_t) in * in); +#endif /* #if defined (ARM_MATH_DSP) */ + + /* Decrement loop counter */ blkCnt--; } - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { - /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */ - /* Compute Power and then store the result in a temporary variable, sum. */ + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + + /* Compute Power and store result in a temporary variable, sum. */ in = *pSrc++; sum += ((q15_t) in * in); - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* Store the result in 18.14 format */ + /* Store result in 18.14 format */ *pResult = sum; } /** - * @} end of power group + @} end of power group */ diff --git a/DSP/Source/StatisticsFunctions/arm_rms_f32.c b/DSP/Source/StatisticsFunctions/arm_rms_f32.c index 8d1b708..4546510 100644 --- a/DSP/Source/StatisticsFunctions/arm_rms_f32.c +++ b/DSP/Source/StatisticsFunctions/arm_rms_f32.c @@ -1,15 +1,15 @@ /* ---------------------------------------------------------------------- * Project: CMSIS DSP Library * Title: arm_rms_f32.c - * Description: Root mean square value of an array of F32 type + * Description: Root mean square value of the elements of a floating-point vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,99 +29,148 @@ #include "arm_math.h" /** - * @ingroup groupStats + @ingroup groupStats */ /** - * @defgroup RMS Root mean square (RMS) - * - * - * Calculates the Root Mean Sqaure of the elements in the input vector. - * The underlying algorithm is used: - * - * <pre> - * Result = sqrt(((pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + ... + pSrc[blockSize-1] * pSrc[blockSize-1]) / blockSize)); - * </pre> - * - * There are separate functions for floating point, Q31, and Q15 data types. + @defgroup RMS Root mean square (RMS) + + Calculates the Root Mean Square of the elements in the input vector. + The underlying algorithm is used: + + <pre> + Result = sqrt(((pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + ... + pSrc[blockSize-1] * pSrc[blockSize-1]) / blockSize)); + </pre> + + There are separate functions for floating point, Q31, and Q15 data types. */ /** - * @addtogroup RMS - * @{ + @addtogroup RMS + @{ */ - /** - * @brief Root Mean Square of the elements of a floating-point vector. - * @param[in] *pSrc points to the input vector - * @param[in] blockSize length of the input vector - * @param[out] *pResult rms value returned here - * @return none. - * + @brief Root Mean Square of the elements of a floating-point vector. + @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult root mean square value returned here + @return none */ - +#if defined(ARM_MATH_NEON) void arm_rms_f32( - float32_t * pSrc, + const float32_t * pSrc, uint32_t blockSize, float32_t * pResult) { - float32_t sum = 0.0f; /* Accumulator */ - float32_t in; /* Tempoprary variable to store input value */ + float32_t sum = 0.0f; /* accumulator */ + float32_t in; /* Temporary variable to store input value */ uint32_t blkCnt; /* loop counter */ -#if defined (ARM_MATH_DSP) - /* Run the below code for Cortex-M4 and Cortex-M3 */ + float32x4_t sumV = vdupq_n_f32(0.0f); /* Temporary result storage */ + float32x2_t sumV2; + float32x4_t inV; - /* loop Unrolling */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + /* Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */ - /* Compute sum of the squares and then store the result in a temporary variable, sum */ + /* Compute Power and then store the result in a temporary variable, sum. */ + inV = vld1q_f32(pSrc); + sumV = vmlaq_f32(sumV, inV, inV); + pSrc += 4; + + /* Decrement the loop counter */ + blkCnt--; + } + + sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV)); + sum = sumV2[0] + sumV2[1]; + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x4U; + + while (blkCnt > 0U) + { + /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */ + /* compute power and then store the result in a temporary variable, sum. */ + in = *pSrc++; + sum += in * in; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Compute Rms and store the result in the destination */ + arm_sqrt_f32(sum / (float32_t) blockSize, pResult); +} +#else +void arm_rms_f32( + const float32_t * pSrc, + uint32_t blockSize, + float32_t * pResult) +{ + uint32_t blkCnt; /* Loop counter */ + float32_t sum = 0.0f; /* Temporary result storage */ + float32_t in; /* Temporary variable to store input value */ + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + in = *pSrc++; + /* Compute sum of squares and store result in a temporary variable, sum. */ sum += in * in; + in = *pSrc++; sum += in * in; + in = *pSrc++; sum += in * in; + in = *pSrc++; sum += in * in; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { - /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */ - /* Compute sum of the squares and then store the results in a temporary variable, sum */ + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + in = *pSrc++; - sum += in * in; + /* Compute sum of squares and store result in a temporary variable. */ + sum += ( in * in); - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* Compute Rms and store the result in the destination */ + /* Compute Rms and store result in destination */ arm_sqrt_f32(sum / (float32_t) blockSize, pResult); } +#endif /* #if defined(ARM_MATH_NEON) */ /** - * @} end of RMS group + @} end of RMS group */ diff --git a/DSP/Source/StatisticsFunctions/arm_rms_q15.c b/DSP/Source/StatisticsFunctions/arm_rms_q15.c index d0e61ca..9fcd964 100644 --- a/DSP/Source/StatisticsFunctions/arm_rms_q15.c +++ b/DSP/Source/StatisticsFunctions/arm_rms_q15.c @@ -3,13 +3,13 @@ * Title: arm_rms_q15.c * Description: Root Mean Square of the elements of a Q15 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,111 +29,106 @@ #include "arm_math.h" /** - * @addtogroup RMS - * @{ + @ingroup groupStats */ /** - * @brief Root Mean Square of the elements of a Q15 vector. - * @param[in] *pSrc points to the input vector - * @param[in] blockSize length of the input vector - * @param[out] *pResult rms value returned here - * @return none. - * - * @details - * <b>Scaling and Overflow Behavior:</b> - * - * \par - * The function is implemented using a 64-bit internal accumulator. - * The input is represented in 1.15 format. - * Intermediate multiplication yields a 2.30 format, and this - * result is added without saturation to a 64-bit accumulator in 34.30 format. - * With 33 guard bits in the accumulator, there is no risk of overflow, and the - * full precision of the intermediate multiplication is preserved. - * Finally, the 34.30 result is truncated to 34.15 format by discarding the lower - * 15 bits, and then saturated to yield a result in 1.15 format. - * + @addtogroup RMS + @{ + */ + +/** + @brief Root Mean Square of the elements of a Q15 vector. + @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult root mean square value returned here + @return none + + @par Scaling and Overflow Behavior + The function is implemented using a 64-bit internal accumulator. + The input is represented in 1.15 format. + Intermediate multiplication yields a 2.30 format, and this + result is added without saturation to a 64-bit accumulator in 34.30 format. + With 33 guard bits in the accumulator, there is no risk of overflow, and the + full precision of the intermediate multiplication is preserved. + Finally, the 34.30 result is truncated to 34.15 format by discarding the lower + 15 bits, and then saturated to yield a result in 1.15 format. */ void arm_rms_q15( - q15_t * pSrc, - uint32_t blockSize, - q15_t * pResult) + const q15_t * pSrc, + uint32_t blockSize, + q15_t * pResult) { - q63_t sum = 0; /* accumulator */ + uint32_t blkCnt; /* Loop counter */ + q63_t sum = 0; /* Temporary result storage */ + q15_t in; /* Temporary variable to store input value */ -#if defined (ARM_MATH_DSP) - /* Run the below code for Cortex-M4 and Cortex-M3 */ +#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP) + q31_t in32; /* Temporary variable to store input value */ +#endif - q31_t in; /* temporary variable to store the input value */ - q15_t in1; /* temporary variable to store the input value */ - uint32_t blkCnt; /* loop counter */ +#if defined (ARM_MATH_LOOPUNROLL) - /* loop Unrolling */ + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { - /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */ - /* Compute sum of the squares and then store the results in a temporary variable, sum */ - in = *__SIMD32(pSrc)++; - sum = __SMLALD(in, in, sum); - in = *__SIMD32(pSrc)++; - sum = __SMLALD(in, in, sum); - - /* Decrement the loop counter */ - blkCnt--; - } + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ - blkCnt = blockSize % 0x4U; + /* Compute sum of squares and store result in a temporary variable. */ +#if defined (ARM_MATH_DSP) + in32 = read_q15x2_ia ((q15_t **) &pSrc); + sum = __SMLALD(in32, in32, sum); - while (blkCnt > 0U) - { - /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */ - /* Compute sum of the squares and then store the results in a temporary variable, sum */ - in1 = *pSrc++; - sum = __SMLALD(in1, in1, sum); + in32 = read_q15x2_ia ((q15_t **) &pSrc); + sum = __SMLALD(in32, in32, sum); +#else + in = *pSrc++; + sum += ((q31_t) in * in); - /* Decrement the loop counter */ + in = *pSrc++; + sum += ((q31_t) in * in); + + in = *pSrc++; + sum += ((q31_t) in * in); + + in = *pSrc++; + sum += ((q31_t) in * in); +#endif /* #if defined (ARM_MATH_DSP) */ + + /* Decrement loop counter */ blkCnt--; } - /* Truncating and saturating the accumulator to 1.15 format */ - /* Store the result in the destination */ - arm_sqrt_q15(__SSAT((sum / (q63_t)blockSize) >> 15, 16), pResult); + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ - q15_t in; /* temporary variable to store the input value */ - uint32_t blkCnt; /* loop counter */ - - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + while (blkCnt > 0U) { - /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */ - /* Compute sum of the squares and then store the results in a temporary variable, sum */ + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + in = *pSrc++; + /* Compute sum of squares and store result in a temporary variable. */ sum += ((q31_t) in * in); - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } /* Truncating and saturating the accumulator to 1.15 format */ - /* Store the result in the destination */ + /* Store result in destination */ arm_sqrt_q15(__SSAT((sum / (q63_t)blockSize) >> 15, 16), pResult); - -#endif /* #if defined (ARM_MATH_DSP) */ - } /** - * @} end of RMS group + @} end of RMS group */ diff --git a/DSP/Source/StatisticsFunctions/arm_rms_q31.c b/DSP/Source/StatisticsFunctions/arm_rms_q31.c index cb3c58e..5a3e8f3 100644 --- a/DSP/Source/StatisticsFunctions/arm_rms_q31.c +++ b/DSP/Source/StatisticsFunctions/arm_rms_q31.c @@ -3,13 +3,13 @@ * Title: arm_rms_q31.c * Description: Root Mean Square of the elements of a Q31 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,109 +29,96 @@ #include "arm_math.h" /** - * @addtogroup RMS - * @{ + @ingroup groupStats */ +/** + @addtogroup RMS + @{ + */ /** - * @brief Root Mean Square of the elements of a Q31 vector. - * @param[in] *pSrc points to the input vector - * @param[in] blockSize length of the input vector - * @param[out] *pResult rms value returned here - * @return none. - * - * @details - * <b>Scaling and Overflow Behavior:</b> - * - *\par - * The function is implemented using an internal 64-bit accumulator. - * The input is represented in 1.31 format, and intermediate multiplication - * yields a 2.62 format. - * The accumulator maintains full precision of the intermediate multiplication results, - * but provides only a single guard bit. - * There is no saturation on intermediate additions. - * If the accumulator overflows, it wraps around and distorts the result. - * In order to avoid overflows completely, the input signal must be scaled down by - * log2(blockSize) bits, as a total of blockSize additions are performed internally. - * Finally, the 2.62 accumulator is right shifted by 31 bits to yield a 1.31 format value. - * + @brief Root Mean Square of the elements of a Q31 vector. + @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult root mean square value returned here + @return none + + @par Scaling and Overflow Behavior + The function is implemented using an internal 64-bit accumulator. + The input is represented in 1.31 format, and intermediate multiplication + yields a 2.62 format. + The accumulator maintains full precision of the intermediate multiplication results, + but provides only a single guard bit. + There is no saturation on intermediate additions. + If the accumulator overflows, it wraps around and distorts the result. + In order to avoid overflows completely, the input signal must be scaled down by + log2(blockSize) bits, as a total of blockSize additions are performed internally. + Finally, the 2.62 accumulator is right shifted by 31 bits to yield a 1.31 format value. */ void arm_rms_q31( - q31_t * pSrc, - uint32_t blockSize, - q31_t * pResult) + const q31_t * pSrc, + uint32_t blockSize, + q31_t * pResult) { - q63_t sum = 0; /* accumulator */ - q31_t in; /* Temporary variable to store the input */ - uint32_t blkCnt; /* loop counter */ - -#if defined (ARM_MATH_DSP) - /* Run the below code for Cortex-M4 and Cortex-M3 */ + uint32_t blkCnt; /* Loop counter */ + uint64_t sum = 0; /* Temporary result storage (can get never negative. changed type from q63 to uint64 */ + q31_t in; /* Temporary variable to store input value */ - q31_t in1, in2, in3, in4; /* Temporary input variables */ +#if defined (ARM_MATH_LOOPUNROLL) - /*loop Unrolling */ + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 8 outputs at a time. - ** a second loop below computes the remaining 1 to 7 samples. */ while (blkCnt > 0U) { - /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */ - /* Compute sum of the squares and then store the result in a temporary variable, sum */ - /* read two samples from source buffer */ - in1 = pSrc[0]; - in2 = pSrc[1]; + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ - /* calculate power and accumulate to accumulator */ - sum += (q63_t) in1 *in1; - sum += (q63_t) in2 *in2; - - /* read two samples from source buffer */ - in3 = pSrc[2]; - in4 = pSrc[3]; + in = *pSrc++; + /* Compute sum of squares and store result in a temporary variable, sum. */ + sum += ((q63_t) in * in); - /* calculate power and accumulate to accumulator */ - sum += (q63_t) in3 *in3; - sum += (q63_t) in4 *in4; + in = *pSrc++; + sum += ((q63_t) in * in); + in = *pSrc++; + sum += ((q63_t) in * in); - /* update source buffer to process next samples */ - pSrc += 4U; + in = *pSrc++; + sum += ((q63_t) in * in); - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* If the blockSize is not a multiple of 8, compute any remaining output samples here. - ** No loop unrolling is used. */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { - /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */ - /* Compute sum of the squares and then store the results in a temporary variable, sum */ + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + in = *pSrc++; - sum += (q63_t) in *in; + /* Compute sum of squares and store result in a temporary variable. */ + sum += ((q63_t) in * in); - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } /* Convert data in 2.62 to 1.31 by 31 right shifts and saturate */ - /* Compute Rms and store the result in the destination vector */ + /* Compute Rms and store result in destination vector */ arm_sqrt_q31(clip_q63_to_q31((sum / (q63_t) blockSize) >> 31), pResult); } /** - * @} end of RMS group + @} end of RMS group */ diff --git a/DSP/Source/StatisticsFunctions/arm_std_f32.c b/DSP/Source/StatisticsFunctions/arm_std_f32.c index 9750b88..e1e6577 100644 --- a/DSP/Source/StatisticsFunctions/arm_std_f32.c +++ b/DSP/Source/StatisticsFunctions/arm_std_f32.c @@ -3,13 +3,13 @@ * Title: arm_std_f32.c * Description: Standard deviation of the elements of a floating-point vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,111 +29,131 @@ #include "arm_math.h" /** - * @ingroup groupStats + @ingroup groupStats */ /** - * @defgroup STD Standard deviation - * - * Calculates the standard deviation of the elements in the input vector. - * The underlying algorithm is used: - * - * <pre> - * Result = sqrt((sumOfSquares - sum<sup>2</sup> / blockSize) / (blockSize - 1)) - * - * where, sumOfSquares = pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + ... + pSrc[blockSize-1] * pSrc[blockSize-1] - * - * sum = pSrc[0] + pSrc[1] + pSrc[2] + ... + pSrc[blockSize-1] - * </pre> - * - * There are separate functions for floating point, Q31, and Q15 data types. + @defgroup STD Standard deviation + + Calculates the standard deviation of the elements in the input vector. + The underlying algorithm is used: + + <pre> + Result = sqrt((sumOfSquares - sum<sup>2</sup> / blockSize) / (blockSize - 1)) + + sumOfSquares = pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + ... + pSrc[blockSize-1] * pSrc[blockSize-1] + sum = pSrc[0] + pSrc[1] + pSrc[2] + ... + pSrc[blockSize-1] + </pre> + + There are separate functions for floating point, Q31, and Q15 data types. */ /** - * @addtogroup STD - * @{ + @addtogroup STD + @{ */ - /** - * @brief Standard deviation of the elements of a floating-point vector. - * @param[in] *pSrc points to the input vector - * @param[in] blockSize length of the input vector - * @param[out] *pResult standard deviation value returned here - * @return none. + @brief Standard deviation of the elements of a floating-point vector. + @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult standard deviation value returned here + @return none */ - +#if defined(ARM_MATH_NEON_EXPERIMENTAL) void arm_std_f32( - float32_t * pSrc, - uint32_t blockSize, - float32_t * pResult) + const float32_t * pSrc, + uint32_t blockSize, + float32_t * pResult) { - float32_t sum = 0.0f; /* Temporary result storage */ - float32_t sumOfSquares = 0.0f; /* Sum of squares */ - float32_t in; /* input value */ - uint32_t blkCnt; /* loop counter */ -#if defined (ARM_MATH_DSP) - float32_t meanOfSquares, mean, squareOfMean; /* Temporary variables */ + float32_t var; + arm_var_f32(pSrc,blockSize,&var); + arm_sqrt_f32(var, pResult); +} #else - float32_t squareOfSum; /* Square of Sum */ - float32_t var; /* Temporary varaince storage */ +void arm_std_f32( + const float32_t * pSrc, + uint32_t blockSize, + float32_t * pResult) +{ + uint32_t blkCnt; /* Loop counter */ + float32_t sum = 0.0f; /* Temporary result storage */ + float32_t sumOfSquares = 0.0f; /* Sum of squares */ + float32_t in; /* Temporary variable to store input value */ + +#ifndef ARM_MATH_CM0_FAMILY + float32_t meanOfSquares, mean, squareOfMean; /* Temporary variables */ +#else + float32_t squareOfSum; /* Square of Sum */ + float32_t var; /* Temporary varaince storage */ #endif - if (blockSize == 1U) + if (blockSize <= 1U) { *pResult = 0; return; } -#if defined (ARM_MATH_DSP) - /* Run the below code for Cortex-M4 and Cortex-M3 */ +#if defined (ARM_MATH_LOOPUNROLL) - /*loop Unrolling */ + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { - /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */ - /* Compute Sum of squares of the input samples - * and then store the result in a temporary variable, sum. */ + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + /* C = A[0] + A[1] + ... + A[blockSize-1] */ + in = *pSrc++; - sum += in; + /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */ sumOfSquares += in * in; - in = *pSrc++; + /* Compute sum and store result in a temporary variable, sum. */ sum += in; - sumOfSquares += in * in; + in = *pSrc++; - sum += in; sumOfSquares += in * in; + sum += in; + in = *pSrc++; + sumOfSquares += in * in; sum += in; + + in = *pSrc++; sumOfSquares += in * in; + sum += in; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + while (blkCnt > 0U) { - /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */ - /* Compute Sum of squares of the input samples - * and then store the result in a temporary variable, sum. */ + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + /* C = A[0] + A[1] + ... + A[blockSize-1] */ + in = *pSrc++; + /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */ + sumOfSquares += ( in * in); + /* Compute sum and store result in a temporary variable, sum. */ sum += in; - sumOfSquares += in * in; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* Compute Mean of squares of the input samples - * and then store the result in a temporary variable, meanOfSquares. */ +#ifndef ARM_MATH_CM0_FAMILY + + /* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */ meanOfSquares = sumOfSquares / ((float32_t) blockSize - 1.0f); /* Compute mean of all input values */ @@ -143,44 +163,26 @@ void arm_std_f32( squareOfMean = (mean * mean) * (((float32_t) blockSize) / ((float32_t) blockSize - 1.0f)); - /* Compute standard deviation and then store the result to the destination */ + /* Compute standard deviation and store result to destination */ arm_sqrt_f32((meanOfSquares - squareOfMean), pResult); #else /* Run the below code for Cortex-M0 */ - /* Loop over blockSize number of values */ - blkCnt = blockSize; - - while (blkCnt > 0U) - { - /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */ - /* Compute Sum of squares of the input samples - * and then store the result in a temporary variable, sumOfSquares. */ - in = *pSrc++; - sumOfSquares += in * in; - - /* C = (A[0] + A[1] + ... + A[blockSize-1]) */ - /* Compute Sum of the input samples - * and then store the result in a temporary variable, sum. */ - sum += in; - - /* Decrement the loop counter */ - blkCnt--; - } - - /* Compute the square of sum */ + /* Compute square of sum */ squareOfSum = ((sum * sum) / (float32_t) blockSize); - /* Compute the variance */ + /* Compute variance */ var = ((sumOfSquares - squareOfSum) / (float32_t) (blockSize - 1.0f)); - /* Compute standard deviation and then store the result to the destination */ + /* Compute standard deviation and store result in destination */ arm_sqrt_f32(var, pResult); -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #ifndef ARM_MATH_CM0_FAMILY */ + } +#endif /* #if defined(ARM_MATH_NEON) */ /** - * @} end of STD group + @} end of STD group */ diff --git a/DSP/Source/StatisticsFunctions/arm_std_q15.c b/DSP/Source/StatisticsFunctions/arm_std_q15.c index 2f2f52e..8e5c042 100644 --- a/DSP/Source/StatisticsFunctions/arm_std_q15.c +++ b/DSP/Source/StatisticsFunctions/arm_std_q15.c @@ -3,13 +3,13 @@ * Title: arm_std_q15.c * Description: Standard deviation of an array of Q15 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,146 +29,133 @@ #include "arm_math.h" /** - * @ingroup groupStats + @ingroup groupStats */ /** - * @addtogroup STD - * @{ + @addtogroup STD + @{ */ /** - * @brief Standard deviation of the elements of a Q15 vector. - * @param[in] *pSrc points to the input vector - * @param[in] blockSize length of the input vector - * @param[out] *pResult standard deviation value returned here - * @return none. - * @details - * <b>Scaling and Overflow Behavior:</b> - * - * \par - * The function is implemented using a 64-bit internal accumulator. - * The input is represented in 1.15 format. - * Intermediate multiplication yields a 2.30 format, and this - * result is added without saturation to a 64-bit accumulator in 34.30 format. - * With 33 guard bits in the accumulator, there is no risk of overflow, and the - * full precision of the intermediate multiplication is preserved. - * Finally, the 34.30 result is truncated to 34.15 format by discarding the lower - * 15 bits, and then saturated to yield a result in 1.15 format. + @brief Standard deviation of the elements of a Q15 vector. + @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult standard deviation value returned here + @return none + + @par Scaling and Overflow Behavior + The function is implemented using a 64-bit internal accumulator. + The input is represented in 1.15 format. + Intermediate multiplication yields a 2.30 format, and this + result is added without saturation to a 64-bit accumulator in 34.30 format. + With 33 guard bits in the accumulator, there is no risk of overflow, and the + full precision of the intermediate multiplication is preserved. + Finally, the 34.30 result is truncated to 34.15 format by discarding the lower + 15 bits, and then saturated to yield a result in 1.15 format. */ void arm_std_q15( - q15_t * pSrc, - uint32_t blockSize, - q15_t * pResult) + const q15_t * pSrc, + uint32_t blockSize, + q15_t * pResult) { - q31_t sum = 0; /* Accumulator */ - q31_t meanOfSquares, squareOfMean; /* square of mean and mean of square */ - uint32_t blkCnt; /* loop counter */ - q63_t sumOfSquares = 0; /* Accumulator */ -#if defined (ARM_MATH_DSP) - q31_t in; /* input value */ - q15_t in1; /* input value */ -#else - q15_t in; /* input value */ + uint32_t blkCnt; /* Loop counter */ + q31_t sum = 0; /* Accumulator */ + q31_t meanOfSquares, squareOfMean; /* Square of mean and mean of square */ + q63_t sumOfSquares = 0; /* Sum of squares */ + q15_t in; /* Temporary variable to store input value */ + +#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP) + q31_t in32; /* Temporary variable to store input value */ #endif - if (blockSize == 1U) + if (blockSize <= 1U) { *pResult = 0; return; } -#if defined (ARM_MATH_DSP) - /* Run the below code for Cortex-M4 and Cortex-M3 */ +#if defined (ARM_MATH_LOOPUNROLL) - /*loop Unrolling */ + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { - /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */ - /* Compute Sum of squares of the input samples - * and then store the result in a temporary variable, sum. */ - in = *__SIMD32(pSrc)++; - sum += ((in << 16U) >> 16U); - sum += (in >> 16U); - sumOfSquares = __SMLALD(in, in, sumOfSquares); - in = *__SIMD32(pSrc)++; - sum += ((in << 16U) >> 16U); - sum += (in >> 16U); - sumOfSquares = __SMLALD(in, in, sumOfSquares); - - /* Decrement the loop counter */ - blkCnt--; - } + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + /* C = A[0] + A[1] + ... + A[blockSize-1] */ - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ - blkCnt = blockSize % 0x4U; + /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */ + /* Compute sum and store result in a temporary variable, sum. */ +#if defined (ARM_MATH_DSP) + in32 = read_q15x2_ia ((q15_t **) &pSrc); + sumOfSquares = __SMLALD(in32, in32, sumOfSquares); + sum += ((in32 << 16U) >> 16U); + sum += (in32 >> 16U); + + in32 = read_q15x2_ia ((q15_t **) &pSrc); + sumOfSquares = __SMLALD(in32, in32, sumOfSquares); + sum += ((in32 << 16U) >> 16U); + sum += (in32 >> 16U); +#else + in = *pSrc++; + sumOfSquares += (in * in); + sum += in; - while (blkCnt > 0U) - { - /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */ - /* Compute Sum of squares of the input samples - * and then store the result in a temporary variable, sum. */ - in1 = *pSrc++; - sumOfSquares = __SMLALD(in1, in1, sumOfSquares); - sum += in1; - - /* Decrement the loop counter */ - blkCnt--; - } + in = *pSrc++; + sumOfSquares += (in * in); + sum += in; - /* Compute Mean of squares of the input samples - * and then store the result in a temporary variable, meanOfSquares. */ - meanOfSquares = (q31_t)(sumOfSquares / (q63_t)(blockSize - 1U)); + in = *pSrc++; + sumOfSquares += (in * in); + sum += in; - /* Compute square of mean */ - squareOfMean = (q31_t)((q63_t)sum * sum / (q63_t)(blockSize * (blockSize - 1U))); + in = *pSrc++; + sumOfSquares += (in * in); + sum += in; +#endif /* #if defined (ARM_MATH_DSP) */ - /* mean of the squares minus the square of the mean. */ - /* Compute standard deviation and store the result to the destination */ - arm_sqrt_q15(__SSAT((meanOfSquares - squareOfMean) >> 15U, 16U), pResult); + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + while (blkCnt > 0U) { - /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */ - /* Compute Sum of squares of the input samples - * and then store the result in a temporary variable, sumOfSquares. */ + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + /* C = A[0] + A[1] + ... + A[blockSize-1] */ + in = *pSrc++; + /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */ sumOfSquares += (in * in); - - /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ - /* Compute sum of all input values and then store the result in a temporary variable, sum. */ + /* Compute sum and store result in a temporary variable, sum. */ sum += in; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* Compute Mean of squares of the input samples - * and then store the result in a temporary variable, meanOfSquares. */ - meanOfSquares = (q31_t)(sumOfSquares / (q63_t)(blockSize - 1U)); + /* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */ + meanOfSquares = (q31_t) (sumOfSquares / (q63_t)(blockSize - 1U)); /* Compute square of mean */ - squareOfMean = (q31_t)((q63_t)sum * sum / (q63_t)(blockSize * (blockSize - 1U))); + squareOfMean = (q31_t) ((q63_t) sum * sum / (q63_t)(blockSize * (blockSize - 1U))); - /* mean of the squares minus the square of the mean. */ - /* Compute standard deviation and store the result to the destination */ + /* mean of squares minus the square of mean. */ + /* Compute standard deviation and store result in destination */ arm_sqrt_q15(__SSAT((meanOfSquares - squareOfMean) >> 15U, 16U), pResult); - -#endif /* #if defined (ARM_MATH_DSP) */ } /** - * @} end of STD group + @} end of STD group */ diff --git a/DSP/Source/StatisticsFunctions/arm_std_q31.c b/DSP/Source/StatisticsFunctions/arm_std_q31.c index f02cbdd..cfb6cb8 100644 --- a/DSP/Source/StatisticsFunctions/arm_std_q31.c +++ b/DSP/Source/StatisticsFunctions/arm_std_q31.c @@ -1,15 +1,15 @@ /* ---------------------------------------------------------------------- * Project: CMSIS DSP Library * Title: arm_std_q31.c - * Description: Standard deviation of an array of Q31 type. + * Description: Standard deviation of the elements of a Q31 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,141 +29,119 @@ #include "arm_math.h" /** - * @ingroup groupStats + @ingroup groupStats */ /** - * @addtogroup STD - * @{ + @addtogroup STD + @{ */ /** - * @brief Standard deviation of the elements of a Q31 vector. - * @param[in] *pSrc points to the input vector - * @param[in] blockSize length of the input vector - * @param[out] *pResult standard deviation value returned here - * @return none. - * @details - * <b>Scaling and Overflow Behavior:</b> - * - *\par - * The function is implemented using an internal 64-bit accumulator. - * The input is represented in 1.31 format, which is then downshifted by 8 bits - * which yields 1.23, and intermediate multiplication yields a 2.46 format. - * The accumulator maintains full precision of the intermediate multiplication results, - * but provides only a 16 guard bits. - * There is no saturation on intermediate additions. - * If the accumulator overflows it wraps around and distorts the result. - * In order to avoid overflows completely the input signal must be scaled down by - * log2(blockSize)-8 bits, as a total of blockSize additions are performed internally. - * After division, internal variables should be Q18.46 - * Finally, the 18.46 accumulator is right shifted by 15 bits to yield a 1.31 format value. - * + @brief Standard deviation of the elements of a Q31 vector. + @param[in] pSrc points to the input vector. + @param[in] blockSize number of samples in input vector. + @param[out] pResult standard deviation value returned here. + @return none + + @par Scaling and Overflow Behavior + The function is implemented using an internal 64-bit accumulator. + The input is represented in 1.31 format, which is then downshifted by 8 bits + which yields 1.23, and intermediate multiplication yields a 2.46 format. + The accumulator maintains full precision of the intermediate multiplication results, + but provides only a 16 guard bits. + There is no saturation on intermediate additions. + If the accumulator overflows it wraps around and distorts the result. + In order to avoid overflows completely the input signal must be scaled down by + log2(blockSize)-8 bits, as a total of blockSize additions are performed internally. + After division, internal variables should be Q18.46 + Finally, the 18.46 accumulator is right shifted by 15 bits to yield a 1.31 format value. */ void arm_std_q31( - q31_t * pSrc, - uint32_t blockSize, - q31_t * pResult) + const q31_t * pSrc, + uint32_t blockSize, + q31_t * pResult) { - q63_t sum = 0; /* Accumulator */ - q63_t meanOfSquares, squareOfMean; /* square of mean and mean of square */ - q31_t in; /* input value */ - uint32_t blkCnt; /* loop counter */ - q63_t sumOfSquares = 0; /* Accumulator */ + uint32_t blkCnt; /* Loop counter */ + q63_t sum = 0; /* Accumulator */ + q63_t meanOfSquares, squareOfMean; /* Square of mean and mean of square */ + q63_t sumOfSquares = 0; /* Sum of squares */ + q31_t in; /* Temporary variable to store input value */ - if (blockSize == 1U) + if (blockSize <= 1U) { *pResult = 0; return; } -#if defined (ARM_MATH_DSP) - /* Run the below code for Cortex-M4 and Cortex-M3 */ +#if defined (ARM_MATH_LOOPUNROLL) - /*loop Unrolling */ + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { - /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */ - /* Compute Sum of squares of the input samples - * and then store the result in a temporary variable, sum. */ + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + /* C = A[0] + A[1] + ... + A[blockSize-1] */ + in = *pSrc++ >> 8U; - sum += in; + /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */ sumOfSquares += ((q63_t) (in) * (in)); - in = *pSrc++ >> 8U; + /* Compute sum and store result in a temporary variable, sum. */ sum += in; - sumOfSquares += ((q63_t) (in) * (in)); + in = *pSrc++ >> 8U; - sum += in; sumOfSquares += ((q63_t) (in) * (in)); - in = *pSrc++ >> 8U; sum += in; - sumOfSquares += ((q63_t) (in) * (in)); - - /* Decrement the loop counter */ - blkCnt--; - } - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ - blkCnt = blockSize % 0x4U; - - while (blkCnt > 0U) - { - /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */ - /* Compute Sum of squares of the input samples - * and then store the result in a temporary variable, sum. */ in = *pSrc++ >> 8U; + sumOfSquares += ((q63_t) (in) * (in)); sum += in; + + in = *pSrc++ >> 8U; sumOfSquares += ((q63_t) (in) * (in)); + sum += in; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* Compute Mean of squares of the input samples - * and then store the result in a temporary variable, meanOfSquares. */ - meanOfSquares = sumOfSquares / (q63_t)(blockSize - 1U); + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + while (blkCnt > 0U) { - /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */ - /* Compute Sum of squares of the input samples - * and then store the result in a temporary variable, sumOfSquares. */ + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + /* C = A[0] + A[1] + ... + A[blockSize-1] */ + in = *pSrc++ >> 8U; + /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */ sumOfSquares += ((q63_t) (in) * (in)); - - /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ - /* Compute sum of all input values and then store the result in a temporary variable, sum. */ + /* Compute sum and store result in a temporary variable, sum. */ sum += in; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* Compute Mean of squares of the input samples - * and then store the result in a temporary variable, meanOfSquares. */ - meanOfSquares = sumOfSquares / (q63_t)(blockSize - 1U); - -#endif /* #if defined (ARM_MATH_DSP) */ + /* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */ + meanOfSquares = (sumOfSquares / (q63_t)(blockSize - 1U)); /* Compute square of mean */ - squareOfMean = sum * sum / (q63_t)(blockSize * (blockSize - 1U)); + squareOfMean = ( sum * sum / (q63_t)(blockSize * (blockSize - 1U))); - /* Compute standard deviation and then store the result to the destination */ + /* Compute standard deviation and store result in destination */ arm_sqrt_q31((meanOfSquares - squareOfMean) >> 15U, pResult); } /** - * @} end of STD group + @} end of STD group */ diff --git a/DSP/Source/StatisticsFunctions/arm_var_f32.c b/DSP/Source/StatisticsFunctions/arm_var_f32.c index c0f731d..3c325b1 100644 --- a/DSP/Source/StatisticsFunctions/arm_var_f32.c +++ b/DSP/Source/StatisticsFunctions/arm_var_f32.c @@ -3,13 +3,13 @@ * Title: arm_var_f32.c * Description: Variance of the elements of a floating-point vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,153 +29,206 @@ #include "arm_math.h" /** - * @ingroup groupStats + @ingroup groupStats */ /** - * @defgroup variance Variance - * - * Calculates the variance of the elements in the input vector. - * The underlying algorithm used is the direct method sometimes referred to as the two-pass method: - * - * <pre> - * Result = sum(element - meanOfElements)^2) / numElement - 1 - * - * where, meanOfElements = ( pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + ... + pSrc[blockSize-1] ) / blockSize - * - * </pre> - * - * There are separate functions for floating point, Q31, and Q15 data types. + @defgroup variance Variance + + Calculates the variance of the elements in the input vector. + The underlying algorithm used is the direct method sometimes referred to as the two-pass method: + + <pre> + Result = sum(element - meanOfElements)^2) / numElement - 1 + + meanOfElements = ( pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + ... + pSrc[blockSize-1] ) / blockSize + </pre> + + There are separate functions for floating point, Q31, and Q15 data types. */ /** - * @addtogroup variance - * @{ + @addtogroup variance + @{ */ - /** - * @brief Variance of the elements of a floating-point vector. - * @param[in] *pSrc points to the input vector - * @param[in] blockSize length of the input vector - * @param[out] *pResult variance value returned here - * @return none. + @brief Variance of the elements of a floating-point vector. + @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult variance value returned here + @return none */ - +#if defined(ARM_MATH_NEON_EXPERIMENTAL) void arm_var_f32( - float32_t * pSrc, + const float32_t * pSrc, uint32_t blockSize, float32_t * pResult) { - float32_t fMean, fValue; - uint32_t blkCnt; /* loop counter */ - float32_t * pInput = pSrc; - float32_t sum = 0.0f; - float32_t fSum = 0.0f; - #if defined(ARM_MATH_DSP) - float32_t in1, in2, in3, in4; - #endif - - if (blockSize <= 1U) - { - *pResult = 0; - return; - } - - #if defined(ARM_MATH_DSP) - /* Run the below code for Cortex-M4 and Cortex-M7 */ - - /*loop Unrolling */ - blkCnt = blockSize >> 2U; - - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ - while (blkCnt > 0U) - { - /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ - in1 = *pInput++; - in2 = *pInput++; - in3 = *pInput++; - in4 = *pInput++; - - sum += in1; - sum += in2; - sum += in3; - sum += in4; - - /* Decrement the loop counter */ - blkCnt--; - } - - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ - blkCnt = blockSize % 0x4U; - - #else - /* Run the below code for Cortex-M0 or Cortex-M3 */ - - /* Loop over blockSize number of values */ - blkCnt = blockSize; - - #endif - - while (blkCnt > 0U) - { - /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ - sum += *pInput++; - - /* Decrement the loop counter */ - blkCnt--; - } - - /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */ - fMean = sum / (float32_t) blockSize; - - pInput = pSrc; - - #if defined(ARM_MATH_DSP) - - /*loop Unrolling */ - blkCnt = blockSize >> 2U; - - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ - while (blkCnt > 0U) - { - fValue = *pInput++ - fMean; - fSum += fValue * fValue; - fValue = *pInput++ - fMean; - fSum += fValue * fValue; - fValue = *pInput++ - fMean; - fSum += fValue * fValue; - fValue = *pInput++ - fMean; - fSum += fValue * fValue; - - /* Decrement the loop counter */ - blkCnt--; - } - - blkCnt = blockSize % 0x4U; - #else - /* Run the below code for Cortex-M0 or Cortex-M3 */ - - /* Loop over blockSize number of values */ - blkCnt = blockSize; - #endif - - while (blkCnt > 0U) - { - fValue = *pInput++ - fMean; - fSum += fValue * fValue; - - /* Decrement the loop counter */ - blkCnt--; - } - - /* Variance */ - *pResult = fSum / (float32_t)(blockSize - 1.0f); + float32_t mean; + + float32_t sum = 0.0f; /* accumulator */ + float32_t in; /* Temporary variable to store input value */ + uint32_t blkCnt; /* loop counter */ + + float32x4_t sumV = vdupq_n_f32(0.0f); /* Temporary result storage */ + float32x2_t sumV2; + float32x4_t inV; + float32x4_t avg; + + arm_mean_f32(pSrc,blockSize,&mean); + avg = vdupq_n_f32(mean); + + blkCnt = blockSize >> 2U; + + /* Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (blkCnt > 0U) + { + /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */ + /* Compute Power and then store the result in a temporary variable, sum. */ + inV = vld1q_f32(pSrc); + inV = vsubq_f32(inV, avg); + sumV = vmlaq_f32(sumV, inV, inV); + pSrc += 4; + + /* Decrement the loop counter */ + blkCnt--; + } + + sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV)); + sum = sumV2[0] + sumV2[1]; + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x4U; + + while (blkCnt > 0U) + { + /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */ + /* compute power and then store the result in a temporary variable, sum. */ + in = *pSrc++; + in = in - mean; + sum += in * in; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Variance */ + *pResult = sum / (float32_t)(blockSize - 1.0f); + +} + +#else +void arm_var_f32( + const float32_t * pSrc, + uint32_t blockSize, + float32_t * pResult) +{ + uint32_t blkCnt; /* Loop counter */ + float32_t sum = 0.0f; /* Temporary result storage */ + float32_t fSum = 0.0f; + float32_t fMean, fValue; + const float32_t * pInput = pSrc; + + if (blockSize <= 1U) + { + *pResult = 0; + return; + } + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ + + sum += *pInput++; + sum += *pInput++; + sum += *pInput++; + sum += *pInput++; + + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (blkCnt > 0U) + { + /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ + + sum += *pInput++; + + /* Decrement loop counter */ + blkCnt--; + } + + /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */ + fMean = sum / (float32_t) blockSize; + + pInput = pSrc; + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + fValue = *pInput++ - fMean; + fSum += fValue * fValue; + + fValue = *pInput++ - fMean; + fSum += fValue * fValue; + + fValue = *pInput++ - fMean; + fSum += fValue * fValue; + + fValue = *pInput++ - fMean; + fSum += fValue * fValue; + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (blkCnt > 0U) + { + fValue = *pInput++ - fMean; + fSum += fValue * fValue; + + /* Decrement loop counter */ + blkCnt--; + } + + /* Variance */ + *pResult = fSum / (float32_t)(blockSize - 1.0f); } +#endif /* #if defined(ARM_MATH_NEON) */ /** - * @} end of variance group + @} end of variance group */ diff --git a/DSP/Source/StatisticsFunctions/arm_var_q15.c b/DSP/Source/StatisticsFunctions/arm_var_q15.c index 5ba61f7..259e76b 100644 --- a/DSP/Source/StatisticsFunctions/arm_var_q15.c +++ b/DSP/Source/StatisticsFunctions/arm_var_q15.c @@ -3,13 +3,13 @@ * Title: arm_var_q15.c * Description: Variance of an array of Q15 type * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,144 +29,136 @@ #include "arm_math.h" /** - * @ingroup groupStats + @ingroup groupStats */ /** - * @addtogroup variance - * @{ + @addtogroup variance + @{ */ /** - * @brief Variance of the elements of a Q15 vector. - * @param[in] *pSrc points to the input vector - * @param[in] blockSize length of the input vector - * @param[out] *pResult variance value returned here - * @return none. - * @details - * <b>Scaling and Overflow Behavior:</b> - * - * \par - * The function is implemented using a 64-bit internal accumulator. - * The input is represented in 1.15 format. - * Intermediate multiplication yields a 2.30 format, and this - * result is added without saturation to a 64-bit accumulator in 34.30 format. - * With 33 guard bits in the accumulator, there is no risk of overflow, and the - * full precision of the intermediate multiplication is preserved. - * Finally, the 34.30 result is truncated to 34.15 format by discarding the lower - * 15 bits, and then saturated to yield a result in 1.15 format. + @brief Variance of the elements of a Q15 vector. + @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult variance value returned here + @return none + + @par Scaling and Overflow Behavior + The function is implemented using a 64-bit internal accumulator. + The input is represented in 1.15 format. + Intermediate multiplication yields a 2.30 format, and this + result is added without saturation to a 64-bit accumulator in 34.30 format. + With 33 guard bits in the accumulator, there is no risk of overflow, and the + full precision of the intermediate multiplication is preserved. + Finally, the 34.30 result is truncated to 34.15 format by discarding the lower + 15 bits, and then saturated to yield a result in 1.15 format. */ void arm_var_q15( - q15_t * pSrc, - uint32_t blockSize, - q15_t * pResult) + const q15_t * pSrc, + uint32_t blockSize, + q15_t * pResult) { - q31_t sum = 0; /* Accumulator */ - q31_t meanOfSquares, squareOfMean; /* square of mean and mean of square */ - uint32_t blkCnt; /* loop counter */ - q63_t sumOfSquares = 0; /* Accumulator */ -#if defined (ARM_MATH_DSP) - q31_t in; /* input value */ - q15_t in1; /* input value */ -#else - q15_t in; /* input value */ + uint32_t blkCnt; /* Loop counter */ + q31_t sum = 0; /* Accumulator */ + q31_t meanOfSquares, squareOfMean; /* Square of mean and mean of square */ + q63_t sumOfSquares = 0; /* Sum of squares */ + q15_t in; /* Temporary variable to store input value */ + +#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP) + q31_t in32; /* Temporary variable to store input value */ #endif - if (blockSize == 1U) + if (blockSize <= 1U) { *pResult = 0; return; } -#if defined (ARM_MATH_DSP) - /* Run the below code for Cortex-M4 and Cortex-M3 */ +#if defined (ARM_MATH_LOOPUNROLL) - /*loop Unrolling */ + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { - /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */ - /* Compute Sum of squares of the input samples - * and then store the result in a temporary variable, sum. */ - in = *__SIMD32(pSrc)++; - sum += ((in << 16U) >> 16U); - sum += (in >> 16U); - sumOfSquares = __SMLALD(in, in, sumOfSquares); - in = *__SIMD32(pSrc)++; - sum += ((in << 16U) >> 16U); - sum += (in >> 16U); - sumOfSquares = __SMLALD(in, in, sumOfSquares); + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + /* C = A[0] + A[1] + ... + A[blockSize-1] */ - /* Decrement the loop counter */ - blkCnt--; - } + /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */ + /* Compute sum and store result in a temporary variable, sum. */ +#if defined (ARM_MATH_DSP) + in32 = read_q15x2_ia ((q15_t **) &pSrc); + sumOfSquares = __SMLALD(in32, in32, sumOfSquares); + sum += ((in32 << 16U) >> 16U); + sum += (in32 >> 16U); + + in32 = read_q15x2_ia ((q15_t **) &pSrc); + sumOfSquares = __SMLALD(in32, in32, sumOfSquares); + sum += ((in32 << 16U) >> 16U); + sum += (in32 >> 16U); +#else + in = *pSrc++; + sumOfSquares += (in * in); + sum += in; - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ - blkCnt = blockSize % 0x4U; + in = *pSrc++; + sumOfSquares += (in * in); + sum += in; - while (blkCnt > 0U) - { - /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */ - /* Compute Sum of squares of the input samples - * and then store the result in a temporary variable, sum. */ - in1 = *pSrc++; - sumOfSquares = __SMLALD(in1, in1, sumOfSquares); - sum += in1; - - /* Decrement the loop counter */ - blkCnt--; - } + in = *pSrc++; + sumOfSquares += (in * in); + sum += in; - /* Compute Mean of squares of the input samples - * and then store the result in a temporary variable, meanOfSquares. */ - meanOfSquares = (q31_t)(sumOfSquares / (q63_t)(blockSize - 1U)); + in = *pSrc++; + sumOfSquares += (in * in); + sum += in; +#endif /* #if defined (ARM_MATH_DSP) */ - /* Compute square of mean */ - squareOfMean = (q31_t)((q63_t)sum * sum / (q63_t)(blockSize * (blockSize - 1U))); + /* Decrement loop counter */ + blkCnt--; + } - /* mean of the squares minus the square of the mean. */ - *pResult = (meanOfSquares - squareOfMean) >> 15U; + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + while (blkCnt > 0U) { - /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */ - /* Compute Sum of squares of the input samples - * and then store the result in a temporary variable, sumOfSquares. */ + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + /* C = A[0] + A[1] + ... + A[blockSize-1] */ + in = *pSrc++; + /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */ +#if defined (ARM_MATH_DSP) + sumOfSquares = __SMLALD(in, in, sumOfSquares); +#else sumOfSquares += (in * in); - - /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ - /* Compute sum of all input values and then store the result in a temporary variable, sum. */ +#endif /* #if defined (ARM_MATH_DSP) */ + /* Compute sum and store result in a temporary variable, sum. */ sum += in; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* Compute Mean of squares of the input samples - * and then store the result in a temporary variable, meanOfSquares. */ - meanOfSquares = (q31_t)(sumOfSquares / (q63_t)(blockSize - 1U)); + /* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */ + meanOfSquares = (q31_t) (sumOfSquares / (q63_t)(blockSize - 1U)); /* Compute square of mean */ - squareOfMean = (q31_t)((q63_t)sum * sum / (q63_t)(blockSize * (blockSize - 1U))); - - /* mean of the squares minus the square of the mean. */ - *pResult = (meanOfSquares - squareOfMean) >> 15; + squareOfMean = (q31_t) ((q63_t) sum * sum / (q63_t)(blockSize * (blockSize - 1U))); -#endif /* #if defined (ARM_MATH_DSP) */ + /* mean of squares minus the square of mean. */ + *pResult = (meanOfSquares - squareOfMean) >> 15U; } /** - * @} end of variance group + @} end of variance group */ diff --git a/DSP/Source/StatisticsFunctions/arm_var_q31.c b/DSP/Source/StatisticsFunctions/arm_var_q31.c index 526c6cd..558332f 100644 --- a/DSP/Source/StatisticsFunctions/arm_var_q31.c +++ b/DSP/Source/StatisticsFunctions/arm_var_q31.c @@ -3,13 +3,13 @@ * Title: arm_var_q31.c * Description: Variance of an array of Q31 type * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,141 +29,119 @@ #include "arm_math.h" /** - * @ingroup groupStats + @ingroup groupStats */ /** - * @addtogroup variance - * @{ + @addtogroup variance + @{ */ /** - * @brief Variance of the elements of a Q31 vector. - * @param[in] *pSrc points to the input vector - * @param[in] blockSize length of the input vector - * @param[out] *pResult variance value returned here - * @return none. - * @details - * <b>Scaling and Overflow Behavior:</b> - * - *\par - * The function is implemented using an internal 64-bit accumulator. - * The input is represented in 1.31 format, which is then downshifted by 8 bits - * which yields 1.23, and intermediate multiplication yields a 2.46 format. - * The accumulator maintains full precision of the intermediate multiplication results, - * but provides only a 16 guard bits. - * There is no saturation on intermediate additions. - * If the accumulator overflows it wraps around and distorts the result. - * In order to avoid overflows completely the input signal must be scaled down by - * log2(blockSize)-8 bits, as a total of blockSize additions are performed internally. - * After division, internal variables should be Q18.46 - * Finally, the 18.46 accumulator is right shifted by 15 bits to yield a 1.31 format value. - * + @brief Variance of the elements of a Q31 vector. + @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult variance value returned here + @return none + + @par Scaling and Overflow Behavior + The function is implemented using an internal 64-bit accumulator. + The input is represented in 1.31 format, which is then downshifted by 8 bits + which yields 1.23, and intermediate multiplication yields a 2.46 format. + The accumulator maintains full precision of the intermediate multiplication results, + but provides only a 16 guard bits. + There is no saturation on intermediate additions. + If the accumulator overflows it wraps around and distorts the result. + In order to avoid overflows completely the input signal must be scaled down by + log2(blockSize)-8 bits, as a total of blockSize additions are performed internally. + After division, internal variables should be Q18.46 + Finally, the 18.46 accumulator is right shifted by 15 bits to yield a 1.31 format value. */ void arm_var_q31( - q31_t * pSrc, - uint32_t blockSize, - q31_t * pResult) + const q31_t * pSrc, + uint32_t blockSize, + q31_t * pResult) { - q63_t sum = 0; /* Accumulator */ - q63_t meanOfSquares, squareOfMean; /* square of mean and mean of square */ - q31_t in; /* input value */ - uint32_t blkCnt; /* loop counter */ - q63_t sumOfSquares = 0; /* Accumulator */ + uint32_t blkCnt; /* Loop counter */ + q63_t sum = 0; /* Temporary result storage */ + q63_t meanOfSquares, squareOfMean; /* Square of mean and mean of square */ + q63_t sumOfSquares = 0; /* Sum of squares */ + q31_t in; /* Temporary variable to store input value */ - if (blockSize == 1U) + if (blockSize <= 1U) { *pResult = 0; return; } -#if defined (ARM_MATH_DSP) - /* Run the below code for Cortex-M4 and Cortex-M3 */ +#if defined (ARM_MATH_LOOPUNROLL) - /*loop Unrolling */ + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { - /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */ - /* Compute Sum of squares of the input samples - * and then store the result in a temporary variable, sum. */ + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + /* C = A[0] + A[1] + ... + A[blockSize-1] */ + in = *pSrc++ >> 8U; - sum += in; + /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */ sumOfSquares += ((q63_t) (in) * (in)); - in = *pSrc++ >> 8U; + /* Compute sum and store result in a temporary variable, sum. */ sum += in; - sumOfSquares += ((q63_t) (in) * (in)); + in = *pSrc++ >> 8U; - sum += in; sumOfSquares += ((q63_t) (in) * (in)); - in = *pSrc++ >> 8U; sum += in; - sumOfSquares += ((q63_t) (in) * (in)); - - /* Decrement the loop counter */ - blkCnt--; - } - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ - blkCnt = blockSize % 0x4U; - - while (blkCnt > 0U) - { - /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */ - /* Compute Sum of squares of the input samples - * and then store the result in a temporary variable, sum. */ in = *pSrc++ >> 8U; + sumOfSquares += ((q63_t) (in) * (in)); sum += in; + + in = *pSrc++ >> 8U; sumOfSquares += ((q63_t) (in) * (in)); + sum += in; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* Compute Mean of squares of the input samples - * and then store the result in a temporary variable, meanOfSquares. */ - meanOfSquares = sumOfSquares / (q63_t)(blockSize - 1U); + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + while (blkCnt > 0U) { - /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */ - /* Compute Sum of squares of the input samples - * and then store the result in a temporary variable, sumOfSquares. */ + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + /* C = A[0] + A[1] + ... + A[blockSize-1] */ + in = *pSrc++ >> 8U; + /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */ sumOfSquares += ((q63_t) (in) * (in)); - - /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ - /* Compute sum of all input values and then store the result in a temporary variable, sum. */ + /* Compute sum and store result in a temporary variable, sum. */ sum += in; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* Compute Mean of squares of the input samples - * and then store the result in a temporary variable, meanOfSquares. */ - meanOfSquares = sumOfSquares / (q63_t)(blockSize - 1U); - -#endif /* #if defined (ARM_MATH_DSP) */ + /* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */ + meanOfSquares = (sumOfSquares / (q63_t)(blockSize - 1U)); /* Compute square of mean */ - squareOfMean = sum * sum / (q63_t)(blockSize * (blockSize - 1U)); + squareOfMean = ( sum * sum / (q63_t)(blockSize * (blockSize - 1U))); - /* Compute standard deviation and then store the result to the destination */ + /* Compute variance and store result in destination */ *pResult = (meanOfSquares - squareOfMean) >> 15U; } /** - * @} end of variance group + @} end of variance group */ |