diff options
Diffstat (limited to 'DSP/Source/SupportFunctions')
22 files changed, 1319 insertions, 965 deletions
diff --git a/DSP/Source/SupportFunctions/CMakeLists.txt b/DSP/Source/SupportFunctions/CMakeLists.txt new file mode 100644 index 0000000..33c4f87 --- /dev/null +++ b/DSP/Source/SupportFunctions/CMakeLists.txt @@ -0,0 +1,16 @@ +cmake_minimum_required (VERSION 3.6) + +project(CMSISDSPSupport) + + +file(GLOB SRC "./*_*.c") + +add_library(CMSISDSPSupport STATIC ${SRC}) + +configdsp(CMSISDSPSupport ..) + +### Includes +target_include_directories(CMSISDSPSupport PUBLIC "${DSP}/../../Include") + + + diff --git a/DSP/Source/SupportFunctions/SupportFunctions.c b/DSP/Source/SupportFunctions/SupportFunctions.c new file mode 100644 index 0000000..4deb19b --- /dev/null +++ b/DSP/Source/SupportFunctions/SupportFunctions.c @@ -0,0 +1,48 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: SupportFunctions.c + * Description: Combination of all support function source files. + * + * $Date: 18. March 2019 + * $Revision: V1.0.0 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_copy_f32.c" +#include "arm_copy_q15.c" +#include "arm_copy_q31.c" +#include "arm_copy_q7.c" +#include "arm_fill_f32.c" +#include "arm_fill_q15.c" +#include "arm_fill_q31.c" +#include "arm_fill_q7.c" +#include "arm_float_to_q15.c" +#include "arm_float_to_q31.c" +#include "arm_float_to_q7.c" +#include "arm_q15_to_float.c" +#include "arm_q15_to_q31.c" +#include "arm_q15_to_q7.c" +#include "arm_q31_to_float.c" +#include "arm_q31_to_q15.c" +#include "arm_q31_to_q7.c" +#include "arm_q7_to_float.c" +#include "arm_q7_to_q15.c" +#include "arm_q7_to_q31.c" diff --git a/DSP/Source/SupportFunctions/arm_copy_f32.c b/DSP/Source/SupportFunctions/arm_copy_f32.c index 1e2b5cf..707adc4 100644 --- a/DSP/Source/SupportFunctions/arm_copy_f32.c +++ b/DSP/Source/SupportFunctions/arm_copy_f32.c @@ -3,13 +3,13 @@ * Title: arm_copy_f32.c * Description: Copies the elements of a floating-point vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,66 +29,56 @@ #include "arm_math.h" /** - * @ingroup groupSupport + @ingroup groupSupport */ /** - * @defgroup copy Vector Copy - * - * Copies sample by sample from source vector to destination vector. - * - * <pre> - * pDst[n] = pSrc[n]; 0 <= n < blockSize. - * </pre> - * - * There are separate functions for floating point, Q31, Q15, and Q7 data types. + @defgroup copy Vector Copy + + Copies sample by sample from source vector to destination vector. + + <pre> + pDst[n] = pSrc[n]; 0 <= n < blockSize. + </pre> + + There are separate functions for floating point, Q31, Q15, and Q7 data types. */ /** - * @addtogroup copy - * @{ + @addtogroup copy + @{ */ /** - * @brief Copies the elements of a floating-point vector. - * @param[in] *pSrc points to input vector - * @param[out] *pDst points to output vector - * @param[in] blockSize length of the input vector - * @return none. - * + @brief Copies the elements of a floating-point vector. + @param[in] pSrc points to input vector + @param[out] pDst points to output vector + @param[in] blockSize number of samples in each vector + @return none */ - +#if defined(ARM_MATH_NEON_EXPERIMENTAL) void arm_copy_f32( - float32_t * pSrc, + const float32_t * pSrc, float32_t * pDst, uint32_t blockSize) { uint32_t blkCnt; /* loop counter */ -#if defined (ARM_MATH_DSP) - - /* Run the below code for Cortex-M4 and Cortex-M3 */ - float32_t in1, in2, in3, in4; + float32x4_t inV; - /*loop Unrolling */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + /* Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { /* C = A */ /* Copy and then store the results in the destination buffer */ - in1 = *pSrc++; - in2 = *pSrc++; - in3 = *pSrc++; - in4 = *pSrc++; - - *pDst++ = in1; - *pDst++ = in2; - *pDst++ = in3; - *pDst++ = in4; + inV = vld1q_f32(pSrc); + vst1q_f32(pDst, inV); + pSrc += 4; + pDst += 4; /* Decrement the loop counter */ blkCnt--; @@ -96,28 +86,67 @@ void arm_copy_f32( /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ - blkCnt = blockSize % 0x4U; + blkCnt = blockSize & 3; + while (blkCnt > 0U) + { + /* C = A */ + /* Copy and then store the results in the destination buffer */ + *pDst++ = *pSrc++; + + /* Decrement the loop counter */ + blkCnt--; + } +} #else +void arm_copy_f32( + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ - /* Run the below code for Cortex-M0 */ +#if defined (ARM_MATH_LOOPUNROLL) - /* Loop over blockSize number of values */ + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = A */ + + /* Copy and store result in destination buffer */ + *pDst++ = *pSrc++; + *pDst++ = *pSrc++; + *pDst++ = *pSrc++; + *pDst++ = *pSrc++; + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { /* C = A */ - /* Copy and then store the results in the destination buffer */ + + /* Copy and store result in destination buffer */ *pDst++ = *pSrc++; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } } - +#endif /* #if defined(ARM_MATH_NEON) */ /** - * @} end of BasicCopy group + @} end of BasicCopy group */ diff --git a/DSP/Source/SupportFunctions/arm_copy_q15.c b/DSP/Source/SupportFunctions/arm_copy_q15.c index 0d2fffb..d8da113 100644 --- a/DSP/Source/SupportFunctions/arm_copy_q15.c +++ b/DSP/Source/SupportFunctions/arm_copy_q15.c @@ -3,13 +3,13 @@ * Title: arm_copy_q15.c * Description: Copies the elements of a Q15 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,74 +29,68 @@ #include "arm_math.h" /** - * @ingroup groupSupport + @ingroup groupSupport */ /** - * @addtogroup copy - * @{ + @addtogroup copy + @{ */ + /** - * @brief Copies the elements of a Q15 vector. - * @param[in] *pSrc points to input vector - * @param[out] *pDst points to output vector - * @param[in] blockSize length of the input vector - * @return none. - * + @brief Copies the elements of a Q15 vector. + @param[in] pSrc points to input vector + @param[out] pDst points to output vector + @param[in] blockSize number of samples in each vector + @return none */ void arm_copy_q15( - q15_t * pSrc, - q15_t * pDst, - uint32_t blockSize) + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) { - uint32_t blkCnt; /* loop counter */ + uint32_t blkCnt; /* Loop counter */ -#if defined (ARM_MATH_DSP) +#if defined (ARM_MATH_LOOPUNROLL) - /* Run the below code for Cortex-M4 and Cortex-M3 */ - - /*loop Unrolling */ + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { /* C = A */ - /* Read two inputs */ - *__SIMD32(pDst)++ = *__SIMD32(pSrc)++; - *__SIMD32(pDst)++ = *__SIMD32(pSrc)++; - /* Decrement the loop counter */ + /* read 2 times 2 samples at a time */ + write_q15x2_ia (&pDst, read_q15x2_ia ((q15_t **) &pSrc)); + write_q15x2_ia (&pDst, read_q15x2_ia ((q15_t **) &pSrc)); + + /* Decrement loop counter */ blkCnt--; } - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; - #else - /* Run the below code for Cortex-M0 */ - - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { /* C = A */ - /* Copy and then store the value in the destination buffer */ + + /* Copy and store result in destination buffer */ *pDst++ = *pSrc++; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } } /** - * @} end of BasicCopy group + @} end of BasicCopy group */ diff --git a/DSP/Source/SupportFunctions/arm_copy_q31.c b/DSP/Source/SupportFunctions/arm_copy_q31.c index 5bf8934..e342a32 100644 --- a/DSP/Source/SupportFunctions/arm_copy_q31.c +++ b/DSP/Source/SupportFunctions/arm_copy_q31.c @@ -3,13 +3,13 @@ * Title: arm_copy_q31.c * Description: Copies the elements of a Q31 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,83 +29,70 @@ #include "arm_math.h" /** - * @ingroup groupSupport + @ingroup groupSupport */ /** - * @addtogroup copy - * @{ + @addtogroup copy + @{ */ /** - * @brief Copies the elements of a Q31 vector. - * @param[in] *pSrc points to input vector - * @param[out] *pDst points to output vector - * @param[in] blockSize length of the input vector - * @return none. - * + @brief Copies the elements of a Q31 vector. + @param[in] pSrc points to input vector + @param[out] pDst points to output vector + @param[in] blockSize number of samples in each vector + @return none */ void arm_copy_q31( - q31_t * pSrc, - q31_t * pDst, - uint32_t blockSize) + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) { - uint32_t blkCnt; /* loop counter */ - - -#if defined (ARM_MATH_DSP) + uint32_t blkCnt; /* Loop counter */ - /* Run the below code for Cortex-M4 and Cortex-M3 */ - q31_t in1, in2, in3, in4; +#if defined (ARM_MATH_LOOPUNROLL) - /*loop Unrolling */ + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { /* C = A */ - /* Copy and then store the values in the destination buffer */ - in1 = *pSrc++; - in2 = *pSrc++; - in3 = *pSrc++; - in4 = *pSrc++; - - *pDst++ = in1; - *pDst++ = in2; - *pDst++ = in3; - *pDst++ = in4; - - /* Decrement the loop counter */ + + /* Copy and store result in destination buffer */ + *pDst++ = *pSrc++; + *pDst++ = *pSrc++; + *pDst++ = *pSrc++; + *pDst++ = *pSrc++; + + /* Decrement loop counter */ blkCnt--; } - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ - - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { /* C = A */ - /* Copy and then store the value in the destination buffer */ + + /* Copy and store result in destination buffer */ *pDst++ = *pSrc++; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } } /** - * @} end of BasicCopy group + @} end of BasicCopy group */ diff --git a/DSP/Source/SupportFunctions/arm_copy_q7.c b/DSP/Source/SupportFunctions/arm_copy_q7.c index 5c737cd..77da8ca 100644 --- a/DSP/Source/SupportFunctions/arm_copy_q7.c +++ b/DSP/Source/SupportFunctions/arm_copy_q7.c @@ -3,13 +3,13 @@ * Title: arm_copy_q7.c * Description: Copies the elements of a Q7 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,75 +29,67 @@ #include "arm_math.h" /** - * @ingroup groupSupport + @ingroup groupSupport */ /** - * @addtogroup copy - * @{ + @addtogroup copy + @{ */ /** - * @brief Copies the elements of a Q7 vector. - * @param[in] *pSrc points to input vector - * @param[out] *pDst points to output vector - * @param[in] blockSize length of the input vector - * @return none. - * + @brief Copies the elements of a Q7 vector. + @param[in] pSrc points to input vector + @param[out] pDst points to output vector + @param[in] blockSize number of samples in each vector + @return none */ void arm_copy_q7( - q7_t * pSrc, - q7_t * pDst, - uint32_t blockSize) + const q7_t * pSrc, + q7_t * pDst, + uint32_t blockSize) { - uint32_t blkCnt; /* loop counter */ - -#if defined (ARM_MATH_DSP) + uint32_t blkCnt; /* Loop counter */ - /* Run the below code for Cortex-M4 and Cortex-M3 */ +#if defined (ARM_MATH_LOOPUNROLL) - /*loop Unrolling */ + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { /* C = A */ - /* Copy and then store the results in the destination buffer */ - /* 4 samples are copied and stored at a time using SIMD */ - *__SIMD32(pDst)++ = *__SIMD32(pSrc)++; - /* Decrement the loop counter */ + /* read 4 samples at a time */ + write_q7x4_ia (&pDst, read_q7x4_ia ((q7_t **) &pSrc)); + + /* Decrement loop counter */ blkCnt--; } - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ - - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; -#endif /* #if defined (ARM_MATH_DSP) */ - +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { /* C = A */ - /* Copy and then store the results in the destination buffer */ + + /* Copy and store result in destination buffer */ *pDst++ = *pSrc++; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } } /** - * @} end of BasicCopy group + @} end of BasicCopy group */ diff --git a/DSP/Source/SupportFunctions/arm_fill_f32.c b/DSP/Source/SupportFunctions/arm_fill_f32.c index be749c8..29f6286 100644 --- a/DSP/Source/SupportFunctions/arm_fill_f32.c +++ b/DSP/Source/SupportFunctions/arm_fill_f32.c @@ -3,13 +3,13 @@ * Title: arm_fill_f32.c * Description: Fills a constant value into a floating-point vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,36 +29,35 @@ #include "arm_math.h" /** - * @ingroup groupSupport + @ingroup groupSupport */ /** - * @defgroup Fill Vector Fill - * - * Fills the destination vector with a constant value. - * - * <pre> - * pDst[n] = value; 0 <= n < blockSize. - * </pre> - * - * There are separate functions for floating point, Q31, Q15, and Q7 data types. + @defgroup Fill Vector Fill + + Fills the destination vector with a constant value. + + <pre> + pDst[n] = value; 0 <= n < blockSize. + </pre> + + There are separate functions for floating point, Q31, Q15, and Q7 data types. */ /** - * @addtogroup Fill - * @{ + @addtogroup Fill + @{ */ /** - * @brief Fills a constant value into a floating-point vector. - * @param[in] value input value to be filled - * @param[out] *pDst points to output vector - * @param[in] blockSize length of the output vector - * @return none. - * + @brief Fills a constant value into a floating-point vector. + @param[in] value input value to be filled + @param[out] pDst points to output vector + @param[in] blockSize number of samples in each vector + @return none */ - +#if defined(ARM_MATH_NEON_EXPERIMENTAL) void arm_fill_f32( float32_t value, float32_t * pDst, @@ -66,27 +65,19 @@ void arm_fill_f32( { uint32_t blkCnt; /* loop counter */ -#if defined (ARM_MATH_DSP) - /* Run the below code for Cortex-M4 and Cortex-M3 */ - float32_t in1 = value; - float32_t in2 = value; - float32_t in3 = value; - float32_t in4 = value; + float32x4_t inV = vdupq_n_f32(value); - /*loop Unrolling */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + /* Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { /* C = value */ /* Fill the value in the destination buffer */ - *pDst++ = in1; - *pDst++ = in2; - *pDst++ = in3; - *pDst++ = in4; + vst1q_f32(pDst, inV); + pDst += 4; /* Decrement the loop counter */ blkCnt--; @@ -94,29 +85,67 @@ void arm_fill_f32( /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ - blkCnt = blockSize % 0x4U; + blkCnt = blockSize & 3; + + while (blkCnt > 0U) + { + /* C = value */ + /* Fill the value in the destination buffer */ + *pDst++ = value; + /* Decrement the loop counter */ + blkCnt--; + } +} #else +void arm_fill_f32( + float32_t value, + float32_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ - /* Run the below code for Cortex-M0 */ +#if defined (ARM_MATH_LOOPUNROLL) - /* Loop over blockSize number of values */ - blkCnt = blockSize; + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = value */ -#endif /* #if defined (ARM_MATH_DSP) */ + /* Fill value in destination buffer */ + *pDst++ = value; + *pDst++ = value; + *pDst++ = value; + *pDst++ = value; + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { /* C = value */ - /* Fill the value in the destination buffer */ + + /* Fill value in destination buffer */ *pDst++ = value; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } } - +#endif /* #if defined(ARM_MATH_NEON) */ /** - * @} end of Fill group + @} end of Fill group */ diff --git a/DSP/Source/SupportFunctions/arm_fill_q15.c b/DSP/Source/SupportFunctions/arm_fill_q15.c index 27eb42c..d8c0f8d 100644 --- a/DSP/Source/SupportFunctions/arm_fill_q15.c +++ b/DSP/Source/SupportFunctions/arm_fill_q15.c @@ -3,13 +3,13 @@ * Title: arm_fill_q15.c * Description: Fills a constant value into a Q15 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,21 +29,20 @@ #include "arm_math.h" /** - * @ingroup groupSupport + @ingroup groupSupport */ /** - * @addtogroup Fill - * @{ + @addtogroup Fill + @{ */ /** - * @brief Fills a constant value into a Q15 vector. - * @param[in] value input value to be filled - * @param[out] *pDst points to output vector - * @param[in] blockSize length of the output vector - * @return none. - * + @brief Fills a constant value into a Q15 vector. + @param[in] value input value to be filled + @param[out] pDst points to output vector + @param[in] blockSize number of samples in each vector + @return none */ void arm_fill_q15( @@ -51,58 +50,51 @@ void arm_fill_q15( q15_t * pDst, uint32_t blockSize) { - uint32_t blkCnt; /* loop counter */ - -#if defined (ARM_MATH_DSP) - - /* Run the below code for Cortex-M4 and Cortex-M3 */ + uint32_t blkCnt; /* Loop counter */ +#if defined (ARM_MATH_LOOPUNROLL) q31_t packedValue; /* value packed to 32 bits */ - - /*loop Unrolling */ - blkCnt = blockSize >> 2U; - /* Packing two 16 bit values to 32 bit value in order to use SIMD */ packedValue = __PKHBT(value, value, 16U); - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + while (blkCnt > 0U) { /* C = value */ - /* Fill the value in the destination buffer */ - *__SIMD32(pDst)++ = packedValue; - *__SIMD32(pDst)++ = packedValue; - /* Decrement the loop counter */ + /* fill 2 times 2 samples at a time */ + write_q15x2_ia (&pDst, packedValue); + write_q15x2_ia (&pDst, packedValue); + + /* Decrement loop counter */ blkCnt--; } - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ - - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { /* C = value */ - /* Fill the value in the destination buffer */ + + /* Fill value in destination buffer */ *pDst++ = value; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } } /** - * @} end of Fill group + @} end of Fill group */ diff --git a/DSP/Source/SupportFunctions/arm_fill_q31.c b/DSP/Source/SupportFunctions/arm_fill_q31.c index 397a7b5..e174889 100644 --- a/DSP/Source/SupportFunctions/arm_fill_q31.c +++ b/DSP/Source/SupportFunctions/arm_fill_q31.c @@ -3,13 +3,13 @@ * Title: arm_fill_q31.c * Description: Fills a constant value into a Q31 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,21 +29,20 @@ #include "arm_math.h" /** - * @ingroup groupSupport + @ingroup groupSupport */ /** - * @addtogroup Fill - * @{ + @addtogroup Fill + @{ */ /** - * @brief Fills a constant value into a Q31 vector. - * @param[in] value input value to be filled - * @param[out] *pDst points to output vector - * @param[in] blockSize length of the output vector - * @return none. - * + @brief Fills a constant value into a Q31 vector. + @param[in] value input value to be filled + @param[out] pDst points to output vector + @param[in] blockSize number of samples in each vector + @return none */ void arm_fill_q31( @@ -51,59 +50,49 @@ void arm_fill_q31( q31_t * pDst, uint32_t blockSize) { - uint32_t blkCnt; /* loop counter */ - - -#if defined (ARM_MATH_DSP) + uint32_t blkCnt; /* Loop counter */ - /* Run the below code for Cortex-M4 and Cortex-M3 */ - q31_t in1 = value; - q31_t in2 = value; - q31_t in3 = value; - q31_t in4 = value; +#if defined (ARM_MATH_LOOPUNROLL) - /*loop Unrolling */ + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { /* C = value */ - /* Fill the value in the destination buffer */ - *pDst++ = in1; - *pDst++ = in2; - *pDst++ = in3; - *pDst++ = in4; - /* Decrement the loop counter */ + /* Fill value in destination buffer */ + *pDst++ = value; + *pDst++ = value; + *pDst++ = value; + *pDst++ = value; + + /* Decrement loop counter */ blkCnt--; } - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ - - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { /* C = value */ - /* Fill the value in the destination buffer */ + + /* Fill value in destination buffer */ *pDst++ = value; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } } /** - * @} end of Fill group + @} end of Fill group */ diff --git a/DSP/Source/SupportFunctions/arm_fill_q7.c b/DSP/Source/SupportFunctions/arm_fill_q7.c index dffdf97..bca3267 100644 --- a/DSP/Source/SupportFunctions/arm_fill_q7.c +++ b/DSP/Source/SupportFunctions/arm_fill_q7.c @@ -3,13 +3,13 @@ * Title: arm_fill_q7.c * Description: Fills a constant value into a Q7 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,21 +29,20 @@ #include "arm_math.h" /** - * @ingroup groupSupport + @ingroup groupSupport */ /** - * @addtogroup Fill - * @{ + @addtogroup Fill + @{ */ /** - * @brief Fills a constant value into a Q7 vector. - * @param[in] value input value to be filled - * @param[out] *pDst points to output vector - * @param[in] blockSize length of the output vector - * @return none. - * + @brief Fills a constant value into a Q7 vector. + @param[in] value input value to be filled + @param[out] pDst points to output vector + @param[in] blockSize number of samples in each vector + @return none */ void arm_fill_q7( @@ -51,56 +50,50 @@ void arm_fill_q7( q7_t * pDst, uint32_t blockSize) { - uint32_t blkCnt; /* loop counter */ - -#if defined (ARM_MATH_DSP) - - /* Run the below code for Cortex-M4 and Cortex-M3 */ + uint32_t blkCnt; /* Loop counter */ +#if defined (ARM_MATH_LOOPUNROLL) q31_t packedValue; /* value packed to 32 bits */ - /*loop Unrolling */ - blkCnt = blockSize >> 2U; - /* Packing four 8 bit values to 32 bit value in order to use SIMD */ packedValue = __PACKq7(value, value, value, value); - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + while (blkCnt > 0U) { /* C = value */ - /* Fill the value in the destination buffer */ - *__SIMD32(pDst)++ = packedValue; - /* Decrement the loop counter */ + /* fill 4 samples at a time */ + write_q7x4_ia (&pDst, packedValue); + + /* Decrement loop counter */ blkCnt--; } - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ - - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { /* C = value */ - /* Fill the value in the destination buffer */ + + /* Fill value in destination buffer */ *pDst++ = value; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } } /** - * @} end of Fill group + @} end of Fill group */ diff --git a/DSP/Source/SupportFunctions/arm_float_to_q15.c b/DSP/Source/SupportFunctions/arm_float_to_q15.c index 0aa20f1..68c1ad0 100644 --- a/DSP/Source/SupportFunctions/arm_float_to_q15.c +++ b/DSP/Source/SupportFunctions/arm_float_to_q15.c @@ -3,13 +3,13 @@ * Title: arm_float_to_q15.c * Description: Converts the elements of the floating-point vector to Q15 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,95 +29,92 @@ #include "arm_math.h" /** - * @ingroup groupSupport + @ingroup groupSupport */ /** - * @addtogroup float_to_x - * @{ + @addtogroup float_to_x + @{ */ /** - * @brief Converts the elements of the floating-point vector to Q15 vector. - * @param[in] *pSrc points to the floating-point input vector - * @param[out] *pDst points to the Q15 output vector - * @param[in] blockSize length of the input vector - * @return none. - * - * \par Description: - * \par - * The equation used for the conversion process is: - * <pre> - * pDst[n] = (q15_t)(pSrc[n] * 32768); 0 <= n < blockSize. - * </pre> - * \par Scaling and Overflow Behavior: - * \par - * The function uses saturating arithmetic. - * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated. - * \note - * In order to apply rounding, the library should be rebuilt with the ROUNDING macro - * defined in the preprocessor section of project options. - * + @brief Converts the elements of the floating-point vector to Q15 vector. + @param[in] pSrc points to the floating-point input vector + @param[out] pDst points to the Q15 output vector + @param[in] blockSize number of samples in each vector + @return none + + @par Details + The equation used for the conversion process is: + <pre> + pDst[n] = (q15_t)(pSrc[n] * 32768); 0 <= n < blockSize. + </pre> + + @par Scaling and Overflow Behavior + The function uses saturating arithmetic. + Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated. + + @note + In order to apply rounding, the library should be rebuilt with the ROUNDING macro + defined in the preprocessor section of project options. */ - - +#if defined(ARM_MATH_NEON_EXPERIMENTAL) void arm_float_to_q15( - float32_t * pSrc, + const float32_t * pSrc, q15_t * pDst, uint32_t blockSize) { - float32_t *pIn = pSrc; /* Src pointer */ + const float32_t *pIn = pSrc; /* Src pointer */ uint32_t blkCnt; /* loop counter */ -#ifdef ARM_MATH_ROUNDING - float32_t in; + float32x4_t inV; + #ifdef ARM_MATH_ROUNDING + float32x4_t zeroV = vdupq_n_f32(0.0f); + float32x4_t pHalf = vdupq_n_f32(0.5f / 32768.0f); + float32x4_t mHalf = vdupq_n_f32(-0.5f / 32768.0f); + float32x4_t r; + uint32x4_t cmp; + #endif + + int32x4_t cvt; + int16x4_t outV; -#endif /* #ifdef ARM_MATH_ROUNDING */ - -#if defined (ARM_MATH_DSP) - - /* Run the below code for Cortex-M4 and Cortex-M3 */ - - /*loop Unrolling */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + /* Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { #ifdef ARM_MATH_ROUNDING /* C = A * 32768 */ - /* convert from float to q15 and then store the results in the destination buffer */ - in = *pIn++; - in = (in * 32768.0f); - in += in > 0.0f ? 0.5f : -0.5f; - *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16)); + /* Convert from float to q15 and then store the results in the destination buffer */ + inV = vld1q_f32(pIn); + cmp = vcgtq_f32(inV,zeroV); + r = vbslq_f32(cmp,pHalf,mHalf); + inV = vaddq_f32(inV, r); - in = *pIn++; - in = (in * 32768.0f); - in += in > 0.0f ? 0.5f : -0.5f; - *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16)); + pIn += 4; - in = *pIn++; - in = (in * 32768.0f); - in += in > 0.0f ? 0.5f : -0.5f; - *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16)); + cvt = vcvtq_n_s32_f32(inV,15); + outV = vqmovn_s32(cvt); - in = *pIn++; - in = (in * 32768.0f); - in += in > 0.0f ? 0.5f : -0.5f; - *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16)); + vst1_s16(pDst, outV); + pDst += 4; #else /* C = A * 32768 */ - /* convert from float to q15 and then store the results in the destination buffer */ - *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16); - *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16); - *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16); - *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16); + /* Convert from float to q15 and then store the results in the destination buffer */ + inV = vld1q_f32(pIn); + + cvt = vcvtq_n_s32_f32(inV,15); + outV = vqmovn_s32(cvt); + + vst1_s16(pDst, outV); + pDst += 4; + pIn += 4; #endif /* #ifdef ARM_MATH_ROUNDING */ @@ -127,14 +124,14 @@ void arm_float_to_q15( /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ - blkCnt = blockSize % 0x4U; + blkCnt = blockSize & 3; while (blkCnt > 0U) { #ifdef ARM_MATH_ROUNDING /* C = A * 32768 */ - /* convert from float to q15 and then store the results in the destination buffer */ + /* Convert from float to q15 and then store the results in the destination buffer */ in = *pIn++; in = (in * 32768.0f); in += in > 0.0f ? 0.5f : -0.5f; @@ -143,7 +140,7 @@ void arm_float_to_q15( #else /* C = A * 32768 */ - /* convert from float to q15 and then store the results in the destination buffer */ + /* Convert from float to q15 and then store the results in the destination buffer */ *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16); #endif /* #ifdef ARM_MATH_ROUNDING */ @@ -151,42 +148,97 @@ void arm_float_to_q15( /* Decrement the loop counter */ blkCnt--; } +} +#else +void arm_float_to_q15( + const float32_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + const float32_t *pIn = pSrc; /* Source pointer */ + +#ifdef ARM_MATH_ROUNDING + float32_t in; +#endif /* #ifdef ARM_MATH_ROUNDING */ +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = A * 32768 */ + + /* convert from float to Q15 and store result in destination buffer */ +#ifdef ARM_MATH_ROUNDING + + in = (*pIn++ * 32768.0f); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16)); + + in = (*pIn++ * 32768.0f); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16)); + + in = (*pIn++ * 32768.0f); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16)); + + in = (*pIn++ * 32768.0f); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16)); #else - /* Run the below code for Cortex-M0 */ + *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16); + *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16); + *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16); + *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16); + +#endif /* #ifdef ARM_MATH_ROUNDING */ + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + while (blkCnt > 0U) { + /* C = A * 32768 */ + /* convert from float to Q15 and store result in destination buffer */ #ifdef ARM_MATH_ROUNDING - /* C = A * 32768 */ - /* convert from float to q15 and then store the results in the destination buffer */ - in = *pIn++; - in = (in * 32768.0f); - in += in > 0 ? 0.5f : -0.5f; + + in = (*pIn++ * 32768.0f); + in += in > 0.0f ? 0.5f : -0.5f; *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16)); #else /* C = A * 32768 */ - /* convert from float to q15 and then store the results in the destination buffer */ + /* Convert from float to q15 and then store the results in the destination buffer */ *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16); -#endif /* #ifdef ARM_MATH_ROUNDING */ +#endif /* #ifdef ARM_MATH_ROUNDING */ - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } -#endif /* #if defined (ARM_MATH_DSP) */ - } +#endif /* #if defined(ARM_MATH_NEON) */ /** - * @} end of float_to_x group + @} end of float_to_x group */ diff --git a/DSP/Source/SupportFunctions/arm_float_to_q31.c b/DSP/Source/SupportFunctions/arm_float_to_q31.c index d17cc3a..479f8c5 100644 --- a/DSP/Source/SupportFunctions/arm_float_to_q31.c +++ b/DSP/Source/SupportFunctions/arm_float_to_q31.c @@ -3,13 +3,13 @@ * Title: arm_float_to_q31.c * Description: Converts the elements of the floating-point vector to Q31 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,7 +29,7 @@ #include "arm_math.h" /** - * @ingroup groupSupport + @ingroup groupSupport */ /** @@ -37,56 +37,56 @@ */ /** - * @addtogroup float_to_x - * @{ + @addtogroup float_to_x + @{ */ /** - * @brief Converts the elements of the floating-point vector to Q31 vector. - * @param[in] *pSrc points to the floating-point input vector - * @param[out] *pDst points to the Q31 output vector - * @param[in] blockSize length of the input vector - * @return none. - * - *\par Description: - * \par - * The equation used for the conversion process is: - * - * <pre> - * pDst[n] = (q31_t)(pSrc[n] * 2147483648); 0 <= n < blockSize. - * </pre> - * <b>Scaling and Overflow Behavior:</b> - * \par - * The function uses saturating arithmetic. - * Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] will be saturated. - * - * \note In order to apply rounding, the library should be rebuilt with the ROUNDING macro - * defined in the preprocessor section of project options. + @brief Converts the elements of the floating-point vector to Q31 vector. + @param[in] pSrc points to the floating-point input vector + @param[out] pDst points to the Q31 output vector + @param[in] blockSize number of samples in each vector + @return none + + @par Details + The equation used for the conversion process is: + <pre> + pDst[n] = (q31_t)(pSrc[n] * 2147483648); 0 <= n < blockSize. + </pre> + + @par Scaling and Overflow Behavior + The function uses saturating arithmetic. + Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] are saturated. + + @note + In order to apply rounding, the library should be rebuilt with the ROUNDING macro + defined in the preprocessor section of project options. */ - +#if defined(ARM_MATH_NEON) void arm_float_to_q31( - float32_t * pSrc, + const float32_t * pSrc, q31_t * pDst, uint32_t blockSize) { - float32_t *pIn = pSrc; /* Src pointer */ + const float32_t *pIn = pSrc; /* Src pointer */ uint32_t blkCnt; /* loop counter */ -#ifdef ARM_MATH_ROUNDING - float32_t in; + float32x4_t inV; + #ifdef ARM_MATH_ROUNDING + float32x4_t zeroV = vdupq_n_f32(0.0f); + float32x4_t pHalf = vdupq_n_f32(0.5f / 2147483648.0f); + float32x4_t mHalf = vdupq_n_f32(-0.5f / 2147483648.0f); + float32x4_t r; + uint32x4_t cmp; + #endif -#endif /* #ifdef ARM_MATH_ROUNDING */ - -#if defined (ARM_MATH_DSP) - - /* Run the below code for Cortex-M4 and Cortex-M3 */ + int32x4_t outV; - /*loop Unrolling */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + /* Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { @@ -94,35 +94,30 @@ void arm_float_to_q31( #ifdef ARM_MATH_ROUNDING /* C = A * 32768 */ - /* convert from float to Q31 and then store the results in the destination buffer */ - in = *pIn++; - in = (in * 2147483648.0f); - in += in > 0.0f ? 0.5f : -0.5f; - *pDst++ = clip_q63_to_q31((q63_t) (in)); + /* Convert from float to Q31 and then store the results in the destination buffer */ + inV = vld1q_f32(pIn); + cmp = vcgtq_f32(inV,zeroV); + r = vbslq_f32(cmp,pHalf,mHalf); + inV = vaddq_f32(inV, r); - in = *pIn++; - in = (in * 2147483648.0f); - in += in > 0.0f ? 0.5f : -0.5f; - *pDst++ = clip_q63_to_q31((q63_t) (in)); + pIn += 4; - in = *pIn++; - in = (in * 2147483648.0f); - in += in > 0.0f ? 0.5f : -0.5f; - *pDst++ = clip_q63_to_q31((q63_t) (in)); + outV = vcvtq_n_s32_f32(inV,31); - in = *pIn++; - in = (in * 2147483648.0f); - in += in > 0.0f ? 0.5f : -0.5f; - *pDst++ = clip_q63_to_q31((q63_t) (in)); + vst1q_s32(pDst, outV); + pDst += 4; #else /* C = A * 2147483648 */ - /* convert from float to Q31 and then store the results in the destination buffer */ - *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f)); - *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f)); - *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f)); - *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f)); + /* Convert from float to Q31 and then store the results in the destination buffer */ + inV = vld1q_f32(pIn); + + outV = vcvtq_n_s32_f32(inV,31); + + vst1q_s32(pDst, outV); + pDst += 4; + pIn += 4; #endif /* #ifdef ARM_MATH_ROUNDING */ @@ -132,7 +127,7 @@ void arm_float_to_q31( /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ - blkCnt = blockSize % 0x4U; + blkCnt = blockSize & 3; while (blkCnt > 0U) { @@ -140,7 +135,7 @@ void arm_float_to_q31( #ifdef ARM_MATH_ROUNDING /* C = A * 2147483648 */ - /* convert from float to Q31 and then store the results in the destination buffer */ + /* Convert from float to Q31 and then store the results in the destination buffer */ in = *pIn++; in = (in * 2147483648.0f); in += in > 0.0f ? 0.5f : -0.5f; @@ -149,7 +144,7 @@ void arm_float_to_q31( #else /* C = A * 2147483648 */ - /* convert from float to Q31 and then store the results in the destination buffer */ + /* Convert from float to Q31 and then store the results in the destination buffer */ *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f)); #endif /* #ifdef ARM_MATH_ROUNDING */ @@ -159,41 +154,99 @@ void arm_float_to_q31( } +} #else +void arm_float_to_q31( + const float32_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + const float32_t *pIn = pSrc; /* Source pointer */ + +#ifdef ARM_MATH_ROUNDING + float32_t in; +#endif /* #ifdef ARM_MATH_ROUNDING */ - /* Run the below code for Cortex-M0 */ +#if defined (ARM_MATH_LOOPUNROLL) - /* Loop over blockSize number of values */ - blkCnt = blockSize; + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; while (blkCnt > 0U) { + /* C = A * 2147483648 */ + /* convert from float to Q31 and store result in destination buffer */ #ifdef ARM_MATH_ROUNDING - /* C = A * 2147483648 */ - /* convert from float to Q31 and then store the results in the destination buffer */ - in = *pIn++; - in = (in * 2147483648.0f); - in += in > 0 ? 0.5f : -0.5f; + in = (*pIn++ * 2147483648.0f); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = clip_q63_to_q31((q63_t) (in)); + + in = (*pIn++ * 2147483648.0f); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = clip_q63_to_q31((q63_t) (in)); + + in = (*pIn++ * 2147483648.0f); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = clip_q63_to_q31((q63_t) (in)); + + in = (*pIn++ * 2147483648.0f); + in += in > 0.0f ? 0.5f : -0.5f; *pDst++ = clip_q63_to_q31((q63_t) (in)); #else /* C = A * 2147483648 */ - /* convert from float to Q31 and then store the results in the destination buffer */ + /* Convert from float to Q31 and then store the results in the destination buffer */ + *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f)); + *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f)); + *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f)); *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f)); -#endif /* #ifdef ARM_MATH_ROUNDING */ +#endif /* #ifdef ARM_MATH_ROUNDING */ - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } -#endif /* #if defined (ARM_MATH_DSP) */ + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (blkCnt > 0U) + { + /* C = A * 2147483648 */ + + /* convert from float to Q31 and store result in destination buffer */ +#ifdef ARM_MATH_ROUNDING + + in = (*pIn++ * 2147483648.0f); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = clip_q63_to_q31((q63_t) (in)); + +#else + + /* C = A * 2147483648 */ + /* Convert from float to Q31 and then store the results in the destination buffer */ + *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f)); + +#endif /* #ifdef ARM_MATH_ROUNDING */ + + /* Decrement loop counter */ + blkCnt--; + } } +#endif /* #if defined(ARM_MATH_NEON) */ /** - * @} end of float_to_x group + @} end of float_to_x group */ diff --git a/DSP/Source/SupportFunctions/arm_float_to_q7.c b/DSP/Source/SupportFunctions/arm_float_to_q7.c index 6629a69..5f2a7eb 100644 --- a/DSP/Source/SupportFunctions/arm_float_to_q7.c +++ b/DSP/Source/SupportFunctions/arm_float_to_q7.c @@ -3,13 +3,13 @@ * Title: arm_float_to_q7.c * Description: Converts the elements of the floating-point vector to Q7 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,12 +29,12 @@ #include "arm_math.h" /** - * @ingroup groupSupport + @ingroup groupSupport */ /** - * @addtogroup float_to_x - * @{ + @addtogroup float_to_x + @{ */ /** @@ -59,51 +59,89 @@ * defined in the preprocessor section of project options. */ - +#if defined(ARM_MATH_NEON) void arm_float_to_q7( - float32_t * pSrc, + const float32_t * pSrc, q7_t * pDst, uint32_t blockSize) { - float32_t *pIn = pSrc; /* Src pointer */ + const float32_t *pIn = pSrc; /* Src pointer */ uint32_t blkCnt; /* loop counter */ + float32_t in; + float32x4_t inV; + #ifdef ARM_MATH_ROUNDING + float32x4_t zeroV = vdupq_n_f32(0.0f); + float32x4_t pHalf = vdupq_n_f32(0.5f / 128.0f); + float32x4_t mHalf = vdupq_n_f32(-0.5f / 128.0f); + float32x4_t r; + uint32x4_t cmp; + #endif + + int32x4_t cvt; + int16x4_t cvt1,cvt2; + int8x8_t outV; + + blkCnt = blockSize >> 3U; + + /* Compute 8 outputs at a time. + ** a second loop below computes the remaining 1 to 7 samples. */ + while (blkCnt > 0U) + { + #ifdef ARM_MATH_ROUNDING + /* C = A * 128 */ + /* Convert from float to q7 and then store the results in the destination buffer */ + inV = vld1q_f32(pIn); + cmp = vcgtq_f32(inV,zeroV); + r = vbslq_f32(cmp,pHalf,mHalf); + inV = vaddq_f32(inV, r); + cvt1 = vqmovn_s32(vcvtq_n_s32_f32(inV,7)); + pIn += 4; + + inV = vld1q_f32(pIn); + cmp = vcgtq_f32(inV,zeroV); + r = vbslq_f32(cmp,pHalf,mHalf); + inV = vaddq_f32(inV, r); + cvt2 = vqmovn_s32(vcvtq_n_s32_f32(inV,7)); + pIn += 4; + + outV = vqmovn_s16(vcombine_s16(cvt1,cvt2)); + vst1_s8(pDst, outV); + pDst += 8; - float32_t in; +#else -#endif /* #ifdef ARM_MATH_ROUNDING */ + /* C = A * 128 */ + /* Convert from float to q7 and then store the results in the destination buffer */ + inV = vld1q_f32(pIn); + cvt1 = vqmovn_s32(vcvtq_n_s32_f32(inV,7)); + pIn += 4; -#if defined (ARM_MATH_DSP) + inV = vld1q_f32(pIn); + cvt2 = vqmovn_s32(vcvtq_n_s32_f32(inV,7)); + pIn += 4; - /* Run the below code for Cortex-M4 and Cortex-M3 */ + outV = vqmovn_s16(vcombine_s16(cvt1,cvt2)); - /*loop Unrolling */ - blkCnt = blockSize >> 2U; + vst1_s8(pDst, outV); + pDst += 8; +#endif /* #ifdef ARM_MATH_ROUNDING */ + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize & 7; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { #ifdef ARM_MATH_ROUNDING /* C = A * 128 */ - /* convert from float to q7 and then store the results in the destination buffer */ - in = *pIn++; - in = (in * 128); - in += in > 0.0f ? 0.5f : -0.5f; - *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8)); - - in = *pIn++; - in = (in * 128); - in += in > 0.0f ? 0.5f : -0.5f; - *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8)); - - in = *pIn++; - in = (in * 128); - in += in > 0.0f ? 0.5f : -0.5f; - *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8)); - + /* Convert from float to q7 and then store the results in the destination buffer */ in = *pIn++; in = (in * 128); in += in > 0.0f ? 0.5f : -0.5f; @@ -112,10 +150,7 @@ void arm_float_to_q7( #else /* C = A * 128 */ - /* convert from float to q7 and then store the results in the destination buffer */ - *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8); - *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8); - *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8); + /* Convert from float to q7 and then store the results in the destination buffer */ *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8); #endif /* #ifdef ARM_MATH_ROUNDING */ @@ -124,68 +159,95 @@ void arm_float_to_q7( blkCnt--; } - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ - blkCnt = blockSize % 0x4U; +} +#else +void arm_float_to_q7( + const float32_t * pSrc, + q7_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + const float32_t *pIn = pSrc; /* Source pointer */ + +#ifdef ARM_MATH_ROUNDING + float32_t in; +#endif /* #ifdef ARM_MATH_ROUNDING */ + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; while (blkCnt > 0U) { + /* C = A * 128 */ + /* Convert from float to q7 and store result in destination buffer */ #ifdef ARM_MATH_ROUNDING - /* C = A * 128 */ - /* convert from float to q7 and then store the results in the destination buffer */ - in = *pIn++; - in = (in * 128); + + in = (*pIn++ * 128); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8)); + + in = (*pIn++ * 128); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8)); + + in = (*pIn++ * 128); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8)); + + in = (*pIn++ * 128); in += in > 0.0f ? 0.5f : -0.5f; *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8)); #else - /* C = A * 128 */ - /* convert from float to q7 and then store the results in the destination buffer */ + *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8); + *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8); + *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8); *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8); -#endif /* #ifdef ARM_MATH_ROUNDING */ +#endif /* #ifdef ARM_MATH_ROUNDING */ - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ - - - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + while (blkCnt > 0U) { -#ifdef ARM_MATH_ROUNDING /* C = A * 128 */ - /* convert from float to q7 and then store the results in the destination buffer */ - in = *pIn++; - in = (in * 128.0f); - in += in > 0 ? 0.5f : -0.5f; - *pDst++ = (q7_t) (__SSAT((q31_t) (in), 8)); + + /* Convert from float to q7 and store result in destination buffer */ +#ifdef ARM_MATH_ROUNDING + + in = (*pIn++ * 128); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8)); #else - /* C = A * 128 */ - /* convert from float to q7 and then store the results in the destination buffer */ *pDst++ = (q7_t) __SSAT((q31_t) (*pIn++ * 128.0f), 8); -#endif /* #ifdef ARM_MATH_ROUNDING */ +#endif /* #ifdef ARM_MATH_ROUNDING */ - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } -#endif /* #if defined (ARM_MATH_DSP) */ - } +#endif /* #if defined(ARM_MATH_NEON) */ /** - * @} end of float_to_x group + @} end of float_to_x group */ diff --git a/DSP/Source/SupportFunctions/arm_q15_to_float.c b/DSP/Source/SupportFunctions/arm_q15_to_float.c index 48ef947..f49d9b7 100644 --- a/DSP/Source/SupportFunctions/arm_q15_to_float.c +++ b/DSP/Source/SupportFunctions/arm_q15_to_float.c @@ -3,13 +3,13 @@ * Title: arm_q15_to_float.c * Description: Converts the elements of the Q15 vector to floating-point vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,7 +29,7 @@ #include "arm_math.h" /** - * @ingroup groupSupport + @ingroup groupSupport */ /** @@ -37,86 +37,130 @@ */ /** - * @addtogroup q15_to_x - * @{ + @addtogroup q15_to_x + @{ */ - - - /** - * @brief Converts the elements of the Q15 vector to floating-point vector. - * @param[in] *pSrc points to the Q15 input vector - * @param[out] *pDst points to the floating-point output vector - * @param[in] blockSize length of the input vector - * @return none. - * - * \par Description: - * - * The equation used for the conversion process is: - * - * <pre> - * pDst[n] = (float32_t) pSrc[n] / 32768; 0 <= n < blockSize. - * </pre> - * + @brief Converts the elements of the Q15 vector to floating-point vector. + @param[in] pSrc points to the Q15 input vector + @param[out] pDst points to the floating-point output vector + @param[in] blockSize number of samples in each vector + @return none + + @par Details + The equation used for the conversion process is: + <pre> + pDst[n] = (float32_t) pSrc[n] / 32768; 0 <= n < blockSize. + </pre> */ - +#if defined(ARM_MATH_NEON_EXPERIMENTAL) void arm_q15_to_float( - q15_t * pSrc, + const q15_t * pSrc, float32_t * pDst, uint32_t blockSize) { - q15_t *pIn = pSrc; /* Src pointer */ + const q15_t *pIn = pSrc; /* Src pointer */ uint32_t blkCnt; /* loop counter */ + int16x8_t inV; + int32x4_t inV0, inV1; + float32x4_t outV; -#if defined (ARM_MATH_DSP) + blkCnt = blockSize >> 3U; - /* Run the below code for Cortex-M4 and Cortex-M3 */ + /* Compute 8 outputs at a time. + ** a second loop below computes the remaining 1 to 7 samples. */ + while (blkCnt > 0U) + { + /* C = (float32_t) A / 32768 */ + /* convert from q15 to float and then store the results in the destination buffer */ + inV = vld1q_s16(pIn); + pIn += 8; + + inV0 = vmovl_s16(vget_low_s16(inV)); + inV1 = vmovl_s16(vget_high_s16(inV)); + + outV = vcvtq_n_f32_s32(inV0,15); + vst1q_f32(pDst, outV); + pDst += 4; + + outV = vcvtq_n_f32_s32(inV1,15); + vst1q_f32(pDst, outV); + pDst += 4; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 8, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize & 7; - /*loop Unrolling */ - blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { /* C = (float32_t) A / 32768 */ /* convert from q15 to float and then store the results in the destination buffer */ *pDst++ = ((float32_t) * pIn++ / 32768.0f); + + /* Decrement the loop counter */ + blkCnt--; + } +} +#else +void arm_q15_to_float( + const q15_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + const q15_t *pIn = pSrc; /* Source pointer */ + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = (float32_t) A / 32768 */ + + /* Convert from q15 to float and store result in destination buffer */ + *pDst++ = ((float32_t) * pIn++ / 32768.0f); *pDst++ = ((float32_t) * pIn++ / 32768.0f); *pDst++ = ((float32_t) * pIn++ / 32768.0f); *pDst++ = ((float32_t) * pIn++ / 32768.0f); - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ - - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { /* C = (float32_t) A / 32768 */ - /* convert from q15 to float and then store the results in the destination buffer */ - *pDst++ = ((float32_t) * pIn++ / 32768.0f); - /* Decrement the loop counter */ + /* Convert from q15 to float and store result in destination buffer */ + *pDst++ = ((float32_t) *pIn++ / 32768.0f); + + /* Decrement loop counter */ blkCnt--; } + } +#endif /* #if defined(ARM_MATH_NEON) */ /** - * @} end of q15_to_x group + @} end of q15_to_x group */ diff --git a/DSP/Source/SupportFunctions/arm_q15_to_q31.c b/DSP/Source/SupportFunctions/arm_q15_to_q31.c index bf139a8..1afd489 100644 --- a/DSP/Source/SupportFunctions/arm_q15_to_q31.c +++ b/DSP/Source/SupportFunctions/arm_q15_to_q31.c @@ -3,13 +3,13 @@ * Title: arm_q15_to_q31.c * Description: Converts the elements of the Q15 vector to Q31 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,57 +29,53 @@ #include "arm_math.h" /** - * @ingroup groupSupport + @ingroup groupSupport */ /** - * @addtogroup q15_to_x - * @{ + @addtogroup q15_to_x + @{ */ /** - * @brief Converts the elements of the Q15 vector to Q31 vector. - * @param[in] *pSrc points to the Q15 input vector - * @param[out] *pDst points to the Q31 output vector - * @param[in] blockSize length of the input vector - * @return none. - * - * \par Description: - * - * The equation used for the conversion process is: - * - * <pre> - * pDst[n] = (q31_t) pSrc[n] << 16; 0 <= n < blockSize. - * </pre> - * + @brief Converts the elements of the Q15 vector to Q31 vector. + @param[in] pSrc points to the Q15 input vector + @param[out] pDst points to the Q31 output vector + @param[in] blockSize number of samples in each vector + @return none + + @par Details + The equation used for the conversion process is: + <pre> + pDst[n] = (q31_t) pSrc[n] << 16; 0 <= n < blockSize. + </pre> */ - void arm_q15_to_q31( - q15_t * pSrc, - q31_t * pDst, - uint32_t blockSize) + const q15_t * pSrc, + q31_t * pDst, + uint32_t blockSize) { - q15_t *pIn = pSrc; /* Src pointer */ - uint32_t blkCnt; /* loop counter */ + uint32_t blkCnt; /* Loop counter */ + const q15_t *pIn = pSrc; /* Source pointer */ -#if defined (ARM_MATH_DSP) +#if defined (ARM_MATH_LOOPUNROLL) + q31_t in1, in2; + q31_t out1, out2, out3, out4; +#endif - /* Run the below code for Cortex-M4 and Cortex-M3 */ - q31_t in1, in2; - q31_t out1, out2, out3, out4; +#if defined (ARM_MATH_LOOPUNROLL) - /*loop Unrolling */ + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { /* C = (q31_t)A << 16 */ - /* convert from q15 to q31 and then store the results in the destination buffer */ - in1 = *__SIMD32(pIn)++; - in2 = *__SIMD32(pIn)++; + + /* Convert from q15 to q31 and store result in destination buffer */ + in1 = read_q15x2_ia ((q15_t **) &pIn); + in2 = read_q15x2_ia ((q15_t **) &pIn); #ifndef ARM_MATH_BIG_ENDIAN @@ -103,42 +99,40 @@ void arm_q15_to_q31( /* extract lower 16 bits to 32 bit result */ out4 = in2 << 16U; -#endif // #ifndef ARM_MATH_BIG_ENDIAN +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ *pDst++ = out1; *pDst++ = out2; *pDst++ = out3; *pDst++ = out4; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ - - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { - /* C = (q31_t)A << 16 */ - /* convert from q15 to q31 and then store the results in the destination buffer */ - *pDst++ = (q31_t) * pIn++ << 16; + /* C = (q31_t) A << 16 */ + + /* Convert from q15 to q31 and store result in destination buffer */ + *pDst++ = (q31_t) *pIn++ << 16; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } } /** - * @} end of q15_to_x group + @} end of q15_to_x group */ diff --git a/DSP/Source/SupportFunctions/arm_q15_to_q7.c b/DSP/Source/SupportFunctions/arm_q15_to_q7.c index 7a45e58..d118b76 100644 --- a/DSP/Source/SupportFunctions/arm_q15_to_q7.c +++ b/DSP/Source/SupportFunctions/arm_q15_to_q7.c @@ -3,13 +3,13 @@ * Title: arm_q15_to_q7.c * Description: Converts the elements of the Q15 vector to Q7 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,58 +29,55 @@ #include "arm_math.h" /** - * @ingroup groupSupport + @ingroup groupSupport */ /** - * @addtogroup q15_to_x - * @{ + @addtogroup q15_to_x + @{ */ - /** - * @brief Converts the elements of the Q15 vector to Q7 vector. - * @param[in] *pSrc points to the Q15 input vector - * @param[out] *pDst points to the Q7 output vector - * @param[in] blockSize length of the input vector - * @return none. - * - * \par Description: - * - * The equation used for the conversion process is: - * - * <pre> - * pDst[n] = (q7_t) pSrc[n] >> 8; 0 <= n < blockSize. - * </pre> - * + @brief Converts the elements of the Q15 vector to Q7 vector. + @param[in] pSrc points to the Q15 input vector + @param[out] pDst points to the Q7 output vector + @param[in] blockSize number of samples in each vector + @return none + + @par Details + The equation used for the conversion process is: + <pre> + pDst[n] = (q7_t) pSrc[n] >> 8; 0 <= n < blockSize. + </pre> */ - void arm_q15_to_q7( - q15_t * pSrc, - q7_t * pDst, - uint32_t blockSize) + const q15_t * pSrc, + q7_t * pDst, + uint32_t blockSize) { - q15_t *pIn = pSrc; /* Src pointer */ - uint32_t blkCnt; /* loop counter */ + uint32_t blkCnt; /* Loop counter */ + const q15_t *pIn = pSrc; /* Source pointer */ -#if defined (ARM_MATH_DSP) +#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP) + q31_t in1, in2; + q31_t out1, out2; +#endif - /* Run the below code for Cortex-M4 and Cortex-M3 */ - q31_t in1, in2; - q31_t out1, out2; +#if defined (ARM_MATH_LOOPUNROLL) - /*loop Unrolling */ + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { /* C = (q7_t) A >> 8 */ - /* convert from q15 to q7 and then store the results in the destination buffer */ - in1 = *__SIMD32(pIn)++; - in2 = *__SIMD32(pIn)++; + + /* Convert from q15 to q7 and store result in destination buffer */ +#if defined (ARM_MATH_DSP) + + in1 = read_q15x2_ia ((q15_t **) &pIn); + in2 = read_q15x2_ia ((q15_t **) &pIn); #ifndef ARM_MATH_BIG_ENDIAN @@ -92,7 +89,7 @@ void arm_q15_to_q7( out1 = __PKHTB(in1, in2, 16); out2 = __PKHBT(in1, in2, 16); -#endif // #ifndef ARM_MATH_BIG_ENDIAN +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ /* rotate packed value by 24 */ out2 = ((uint32_t) out2 << 8) | ((uint32_t) out2 >> 24); @@ -106,37 +103,44 @@ void arm_q15_to_q7( out1 = out1 | out2; /* store 4 samples at a time to destiantion buffer */ - *__SIMD32(pDst)++ = out1; + write_q7x4_ia (&pDst, out1); + +#else + + *pDst++ = (q7_t) (*pIn++ >> 8); + *pDst++ = (q7_t) (*pIn++ >> 8); + *pDst++ = (q7_t) (*pIn++ >> 8); + *pDst++ = (q7_t) (*pIn++ >> 8); - /* Decrement the loop counter */ +#endif /* #if defined (ARM_MATH_DSP) */ + + /* Decrement loop counter */ blkCnt--; } - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ - - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { /* C = (q7_t) A >> 8 */ - /* convert from q15 to q7 and then store the results in the destination buffer */ + + /* Convert from q15 to q7 and store result in destination buffer */ *pDst++ = (q7_t) (*pIn++ >> 8); - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } } /** - * @} end of q15_to_x group + @} end of q15_to_x group */ diff --git a/DSP/Source/SupportFunctions/arm_q31_to_float.c b/DSP/Source/SupportFunctions/arm_q31_to_float.c index d2d7505..03e7ec6 100644 --- a/DSP/Source/SupportFunctions/arm_q31_to_float.c +++ b/DSP/Source/SupportFunctions/arm_q31_to_float.c @@ -3,13 +3,13 @@ * Title: arm_q31_to_float.c * Description: Converts the elements of the Q31 vector to floating-point vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,7 +29,7 @@ #include "arm_math.h" /** - * @ingroup groupSupport + @ingroup groupSupport */ /** @@ -37,54 +37,51 @@ */ /** - * @addtogroup q31_to_x - * @{ + @addtogroup q31_to_x + @{ */ /** - * @brief Converts the elements of the Q31 vector to floating-point vector. - * @param[in] *pSrc points to the Q31 input vector - * @param[out] *pDst points to the floating-point output vector - * @param[in] blockSize length of the input vector - * @return none. - * - * \par Description: - * - * The equation used for the conversion process is: - * - * <pre> - * pDst[n] = (float32_t) pSrc[n] / 2147483648; 0 <= n < blockSize. - * </pre> - * + @brief Converts the elements of the Q31 vector to floating-point vector. + @param[in] pSrc points to the Q31 input vector + @param[out] pDst points to the floating-point output vector + @param[in] blockSize number of samples in each vector + @return none + + @par Details + The equation used for the conversion process is: + <pre> + pDst[n] = (float32_t) pSrc[n] / 2147483648; 0 <= n < blockSize. + </pre> */ - +#if defined(ARM_MATH_NEON_EXPERIMENTAL) void arm_q31_to_float( - q31_t * pSrc, - float32_t * pDst, - uint32_t blockSize) + const q31_t * pSrc, + float32_t * pDst, + uint32_t blockSize) { - q31_t *pIn = pSrc; /* Src pointer */ + const q31_t *pIn = pSrc; /* Src pointer */ uint32_t blkCnt; /* loop counter */ + int32x4_t inV; + float32x4_t outV; -#if defined (ARM_MATH_DSP) - - /* Run the below code for Cortex-M4 and Cortex-M3 */ - - /*loop Unrolling */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + /* Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { /* C = (float32_t) A / 2147483648 */ - /* convert from q31 to float and then store the results in the destination buffer */ - *pDst++ = ((float32_t) * pIn++ / 2147483648.0f); - *pDst++ = ((float32_t) * pIn++ / 2147483648.0f); - *pDst++ = ((float32_t) * pIn++ / 2147483648.0f); - *pDst++ = ((float32_t) * pIn++ / 2147483648.0f); + /* Convert from q31 to float and then store the results in the destination buffer */ + inV = vld1q_s32(pIn); + pIn += 4; + + outV = vcvtq_n_f32_s32(inV,31); + + vst1q_f32(pDst, outV); + pDst += 4; /* Decrement the loop counter */ blkCnt--; @@ -92,28 +89,71 @@ void arm_q31_to_float( /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ - blkCnt = blockSize % 0x4U; + blkCnt = blockSize & 3; + + + while (blkCnt > 0U) + { + /* C = (float32_t) A / 2147483648 */ + /* Convert from q31 to float and then store the results in the destination buffer */ + *pDst++ = ((float32_t) * pIn++ / 2147483648.0f); + /* Decrement the loop counter */ + blkCnt--; + } +} #else +void arm_q31_to_float( + const q31_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + const q31_t *pIn = pSrc; /* Src pointer */ + uint32_t blkCnt; /* loop counter */ + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = (float32_t) A / 2147483648 */ + + /* Convert from q31 to float and store result in destination buffer */ + *pDst++ = ((float32_t) *pIn++ / 2147483648.0f); + *pDst++ = ((float32_t) *pIn++ / 2147483648.0f); + *pDst++ = ((float32_t) *pIn++ / 2147483648.0f); + *pDst++ = ((float32_t) *pIn++ / 2147483648.0f); + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; - /* Run the below code for Cortex-M0 */ +#else - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { /* C = (float32_t) A / 2147483648 */ - /* convert from q31 to float and then store the results in the destination buffer */ - *pDst++ = ((float32_t) * pIn++ / 2147483648.0f); - /* Decrement the loop counter */ + /* Convert from q31 to float and store result in destination buffer */ + *pDst++ = ((float32_t) *pIn++ / 2147483648.0f); + + /* Decrement loop counter */ blkCnt--; } + } +#endif /* #if defined(ARM_MATH_NEON) */ /** - * @} end of q31_to_x group + @} end of q31_to_x group */ diff --git a/DSP/Source/SupportFunctions/arm_q31_to_q15.c b/DSP/Source/SupportFunctions/arm_q31_to_q15.c index c460fe7..8d82c28 100644 --- a/DSP/Source/SupportFunctions/arm_q31_to_q15.c +++ b/DSP/Source/SupportFunctions/arm_q31_to_q15.c @@ -3,13 +3,13 @@ * Title: arm_q31_to_q15.c * Description: Converts the elements of the Q31 vector to Q15 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,55 +29,53 @@ #include "arm_math.h" /** - * @ingroup groupSupport + @ingroup groupSupport */ /** - * @addtogroup q31_to_x - * @{ + @addtogroup q31_to_x + @{ */ /** - * @brief Converts the elements of the Q31 vector to Q15 vector. - * @param[in] *pSrc points to the Q31 input vector - * @param[out] *pDst points to the Q15 output vector - * @param[in] blockSize length of the input vector - * @return none. - * - * \par Description: - * - * The equation used for the conversion process is: - * - * <pre> - * pDst[n] = (q15_t) pSrc[n] >> 16; 0 <= n < blockSize. - * </pre> - * + @brief Converts the elements of the Q31 vector to Q15 vector. + @param[in] pSrc points to the Q31 input vector + @param[out] pDst points to the Q15 output vector + @param[in] blockSize number of samples in each vector + @return none + + @par Details + The equation used for the conversion process is: + <pre> + pDst[n] = (q15_t) pSrc[n] >> 16; 0 <= n < blockSize. + </pre> */ - void arm_q31_to_q15( - q31_t * pSrc, - q15_t * pDst, - uint32_t blockSize) + const q31_t * pSrc, + q15_t * pDst, + uint32_t blockSize) { - q31_t *pIn = pSrc; /* Src pointer */ - uint32_t blkCnt; /* loop counter */ + uint32_t blkCnt; /* Loop counter */ + const q31_t *pIn = pSrc; /* Source pointer */ -#if defined (ARM_MATH_DSP) +#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP) + q31_t in1, in2, in3, in4; + q31_t out1, out2; +#endif - /* Run the below code for Cortex-M4 and Cortex-M3 */ - q31_t in1, in2, in3, in4; - q31_t out1, out2; +#if defined (ARM_MATH_LOOPUNROLL) - /*loop Unrolling */ + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { - /* C = (q15_t) A >> 16 */ - /* convert from q31 to q15 and then store the results in the destination buffer */ + /* C = (q15_t) (A >> 16) */ + + /* Convert from q31 to q15 and store result in destination buffer */ +#if defined (ARM_MATH_DSP) + in1 = *pIn++; in2 = *pIn++; in3 = *pIn++; @@ -85,49 +83,52 @@ void arm_q31_to_q15( /* pack two higher 16-bit values from two 32-bit values */ #ifndef ARM_MATH_BIG_ENDIAN - out1 = __PKHTB(in2, in1, 16); out2 = __PKHTB(in4, in3, 16); - #else - out1 = __PKHTB(in1, in2, 16); out2 = __PKHTB(in3, in4, 16); +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ -#endif // #ifdef ARM_MATH_BIG_ENDIAN + write_q15x2_ia (&pDst, out1); + write_q15x2_ia (&pDst, out2); - *__SIMD32(pDst)++ = out1; - *__SIMD32(pDst)++ = out2; +#else + + *pDst++ = (q15_t) (*pIn++ >> 16); + *pDst++ = (q15_t) (*pIn++ >> 16); + *pDst++ = (q15_t) (*pIn++ >> 16); + *pDst++ = (q15_t) (*pIn++ >> 16); - /* Decrement the loop counter */ +#endif /* #if defined (ARM_MATH_DSP) */ + + /* Decrement loop counter */ blkCnt--; } - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ - - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { - /* C = (q15_t) A >> 16 */ - /* convert from q31 to q15 and then store the results in the destination buffer */ + /* C = (q15_t) (A >> 16) */ + + /* Convert from q31 to q15 and store result in destination buffer */ *pDst++ = (q15_t) (*pIn++ >> 16); - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } } /** - * @} end of q31_to_x group + @} end of q31_to_x group */ diff --git a/DSP/Source/SupportFunctions/arm_q31_to_q7.c b/DSP/Source/SupportFunctions/arm_q31_to_q7.c index f092bed..c7d1b4c 100644 --- a/DSP/Source/SupportFunctions/arm_q31_to_q7.c +++ b/DSP/Source/SupportFunctions/arm_q31_to_q7.c @@ -3,13 +3,13 @@ * Title: arm_q31_to_q7.c * Description: Converts the elements of the Q31 vector to Q7 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,96 +29,82 @@ #include "arm_math.h" /** - * @ingroup groupSupport + @ingroup groupSupport */ /** - * @addtogroup q31_to_x - * @{ + @addtogroup q31_to_x + @{ */ /** - * @brief Converts the elements of the Q31 vector to Q7 vector. - * @param[in] *pSrc points to the Q31 input vector - * @param[out] *pDst points to the Q7 output vector - * @param[in] blockSize length of the input vector - * @return none. - * - * \par Description: - * - * The equation used for the conversion process is: - * - * <pre> - * pDst[n] = (q7_t) pSrc[n] >> 24; 0 <= n < blockSize. - * </pre> - * + @brief Converts the elements of the Q31 vector to Q7 vector. + @param[in] pSrc points to the Q31 input vector + @param[out] pDst points to the Q7 output vector + @param[in] blockSize number of samples in each vector + @return none + + @par Details + The equation used for the conversion process is: + <pre> + pDst[n] = (q7_t) pSrc[n] >> 24; 0 <= n < blockSize. + </pre> */ - void arm_q31_to_q7( - q31_t * pSrc, - q7_t * pDst, - uint32_t blockSize) + const q31_t * pSrc, + q7_t * pDst, + uint32_t blockSize) { - q31_t *pIn = pSrc; /* Src pointer */ - uint32_t blkCnt; /* loop counter */ + uint32_t blkCnt; /* Loop counter */ + const q31_t *pIn = pSrc; /* Source pointer */ -#if defined (ARM_MATH_DSP) +#if defined (ARM_MATH_LOOPUNROLL) - /* Run the below code for Cortex-M4 and Cortex-M3 */ - q31_t in1, in2, in3, in4; q7_t out1, out2, out3, out4; - /*loop Unrolling */ + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { - /* C = (q7_t) A >> 24 */ - /* convert from q31 to q7 and then store the results in the destination buffer */ - in1 = *pIn++; - in2 = *pIn++; - in3 = *pIn++; - in4 = *pIn++; + /* C = (q7_t) (A >> 24) */ - out1 = (q7_t) (in1 >> 24); - out2 = (q7_t) (in2 >> 24); - out3 = (q7_t) (in3 >> 24); - out4 = (q7_t) (in4 >> 24); + /* Convert from q31 to q7 and store result in destination buffer */ - *__SIMD32(pDst)++ = __PACKq7(out1, out2, out3, out4); + out1 = (q7_t) (*pIn++ >> 24); + out2 = (q7_t) (*pIn++ >> 24); + out3 = (q7_t) (*pIn++ >> 24); + out4 = (q7_t) (*pIn++ >> 24); + write_q7x4_ia (&pDst, __PACKq7(out1, out2, out3, out4)); - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ - - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { - /* C = (q7_t) A >> 24 */ - /* convert from q31 to q7 and then store the results in the destination buffer */ + /* C = (q7_t) (A >> 24) */ + + /* Convert from q31 to q7 and store result in destination buffer */ *pDst++ = (q7_t) (*pIn++ >> 24); - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } } /** - * @} end of q31_to_x group + @} end of q31_to_x group */ diff --git a/DSP/Source/SupportFunctions/arm_q7_to_float.c b/DSP/Source/SupportFunctions/arm_q7_to_float.c index ace437f..6bd86bf 100644 --- a/DSP/Source/SupportFunctions/arm_q7_to_float.c +++ b/DSP/Source/SupportFunctions/arm_q7_to_float.c @@ -3,13 +3,13 @@ * Title: arm_q7_to_float.c * Description: Converts the elements of the Q7 vector to floating-point vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,7 +29,7 @@ #include "arm_math.h" /** - * @ingroup groupSupport + @ingroup groupSupport */ /** @@ -37,83 +37,143 @@ */ /** - * @addtogroup q7_to_x - * @{ + @addtogroup q7_to_x + @{ */ /** - * @brief Converts the elements of the Q7 vector to floating-point vector. - * @param[in] *pSrc points to the Q7 input vector - * @param[out] *pDst points to the floating-point output vector - * @param[in] blockSize length of the input vector - * @return none. - * - * \par Description: - * - * The equation used for the conversion process is: - * - * <pre> - * pDst[n] = (float32_t) pSrc[n] / 128; 0 <= n < blockSize. - * </pre> - * + @brief Converts the elements of the Q7 vector to floating-point vector. + @param[in] pSrc points to the Q7 input vector + @param[out] pDst points to the floating-point output vector + @param[in] blockSize number of samples in each vector + @return none + + @par Details + The equation used for the conversion process is: + <pre> + pDst[n] = (float32_t) pSrc[n] / 128; 0 <= n < blockSize. + </pre> */ - +#if defined(ARM_MATH_NEON) void arm_q7_to_float( - q7_t * pSrc, + const q7_t * pSrc, float32_t * pDst, uint32_t blockSize) { - q7_t *pIn = pSrc; /* Src pointer */ + const q7_t *pIn = pSrc; /* Src pointer */ uint32_t blkCnt; /* loop counter */ + int8x16_t inV; + int16x8_t inVLO, inVHI; + int32x4_t inVLL, inVLH, inVHL, inVHH; + float32x4_t outV; + + blkCnt = blockSize >> 4U; + + /* Compute 16 outputs at a time. + ** a second loop below computes the remaining 1 to 15 samples. */ + while (blkCnt > 0U) + { + /* C = (float32_t) A / 128 */ + /* Convert from q7 to float and then store the results in the destination buffer */ + inV = vld1q_s8(pIn); + pIn += 16; + + inVLO = vmovl_s8(vget_low_s8(inV)); + inVHI = vmovl_s8(vget_high_s8(inV)); -#if defined (ARM_MATH_DSP) + inVLL = vmovl_s16(vget_low_s16(inVLO)); + inVLH = vmovl_s16(vget_high_s16(inVLO)); + inVHL = vmovl_s16(vget_low_s16(inVHI)); + inVHH = vmovl_s16(vget_high_s16(inVHI)); - /* Run the below code for Cortex-M4 and Cortex-M3 */ + outV = vcvtq_n_f32_s32(inVLL,7); + vst1q_f32(pDst, outV); + pDst += 4; - /*loop Unrolling */ + outV = vcvtq_n_f32_s32(inVLH,7); + vst1q_f32(pDst, outV); + pDst += 4; + + outV = vcvtq_n_f32_s32(inVHL,7); + vst1q_f32(pDst, outV); + pDst += 4; + + outV = vcvtq_n_f32_s32(inVHH,7); + vst1q_f32(pDst, outV); + pDst += 4; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 16, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize & 0xF; + + while (blkCnt > 0U) + { + /* C = (float32_t) A / 128 */ + /* Convert from q7 to float and then store the results in the destination buffer */ + *pDst++ = ((float32_t) * pIn++ / 128.0f); + + /* Decrement the loop counter */ + blkCnt--; + } +} +#else +void arm_q7_to_float( + const q7_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + const q7_t *pIn = pSrc; /* Source pointer */ + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { /* C = (float32_t) A / 128 */ - /* convert from q7 to float and then store the results in the destination buffer */ + + /* Convert from q7 to float and store result in destination buffer */ *pDst++ = ((float32_t) * pIn++ / 128.0f); *pDst++ = ((float32_t) * pIn++ / 128.0f); *pDst++ = ((float32_t) * pIn++ / 128.0f); *pDst++ = ((float32_t) * pIn++ / 128.0f); - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ - - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { /* C = (float32_t) A / 128 */ - /* convert from q7 to float and then store the results in the destination buffer */ + + /* Convert from q7 to float and store result in destination buffer */ *pDst++ = ((float32_t) * pIn++ / 128.0f); - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } + } +#endif /* #if defined(ARM_MATH_NEON) */ /** - * @} end of q7_to_x group + @} end of q7_to_x group */ diff --git a/DSP/Source/SupportFunctions/arm_q7_to_q15.c b/DSP/Source/SupportFunctions/arm_q7_to_q15.c index 5348194..89afd10 100644 --- a/DSP/Source/SupportFunctions/arm_q7_to_q15.c +++ b/DSP/Source/SupportFunctions/arm_q7_to_q15.c @@ -3,13 +3,13 @@ * Title: arm_q7_to_q15.c * Description: Converts the elements of the Q7 vector to Q15 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,60 +29,55 @@ #include "arm_math.h" /** - * @ingroup groupSupport + @ingroup groupSupport */ /** - * @addtogroup q7_to_x - * @{ + @addtogroup q7_to_x + @{ */ - - - /** - * @brief Converts the elements of the Q7 vector to Q15 vector. - * @param[in] *pSrc points to the Q7 input vector - * @param[out] *pDst points to the Q15 output vector - * @param[in] blockSize length of the input vector - * @return none. - * - * \par Description: - * - * The equation used for the conversion process is: - * - * <pre> - * pDst[n] = (q15_t) pSrc[n] << 8; 0 <= n < blockSize. - * </pre> - * + @brief Converts the elements of the Q7 vector to Q15 vector. + @param[in] pSrc points to the Q7 input vector + @param[out] pDst points to the Q15 output vector + @param[in] blockSize number of samples in each vector + @return none + + @par Details + The equation used for the conversion process is: + <pre> + pDst[n] = (q15_t) pSrc[n] << 8; 0 <= n < blockSize. + </pre> */ - void arm_q7_to_q15( - q7_t * pSrc, - q15_t * pDst, - uint32_t blockSize) + const q7_t * pSrc, + q15_t * pDst, + uint32_t blockSize) { - q7_t *pIn = pSrc; /* Src pointer */ - uint32_t blkCnt; /* loop counter */ + uint32_t blkCnt; /* Loop counter */ + const q7_t *pIn = pSrc; /* Source pointer */ -#if defined (ARM_MATH_DSP) - q31_t in; - q31_t in1, in2; - q31_t out1, out2; +#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP) + q31_t in; + q31_t in1, in2; + q31_t out1, out2; +#endif - /* Run the below code for Cortex-M4 and Cortex-M3 */ +#if defined (ARM_MATH_LOOPUNROLL) - /*loop Unrolling */ + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { /* C = (q15_t) A << 8 */ - /* convert from q7 to q15 and then store the results in the destination buffer */ - in = *__SIMD32(pIn)++; + + /* Convert from q7 to q15 and store result in destination buffer */ +#if defined (ARM_MATH_DSP) + + in = read_q7x4_ia ((q7_t **) &pIn); /* rotatate in by 8 and extend two q7_t values to q15_t values */ in1 = __SXTB16(__ROR(in, 8)); @@ -97,49 +92,52 @@ void arm_q7_to_q15( in2 = in2 & 0xFF00FF00; #ifndef ARM_MATH_BIG_ENDIAN - out2 = __PKHTB(in1, in2, 16); out1 = __PKHBT(in2, in1, 16); - #else - out1 = __PKHTB(in1, in2, 16); out2 = __PKHBT(in2, in1, 16); - #endif - *__SIMD32(pDst)++ = out1; - *__SIMD32(pDst)++ = out2; + write_q15x2_ia (&pDst, out1); + write_q15x2_ia (&pDst, out2); + +#else + + *pDst++ = (q15_t) *pIn++ << 8; + *pDst++ = (q15_t) *pIn++ << 8; + *pDst++ = (q15_t) *pIn++ << 8; + *pDst++ = (q15_t) *pIn++ << 8; + +#endif /* #if defined (ARM_MATH_DSP) */ - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ - - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { /* C = (q15_t) A << 8 */ - /* convert from q7 to q15 and then store the results in the destination buffer */ + + /* Convert from q7 to q15 and store result in destination buffer */ *pDst++ = (q15_t) * pIn++ << 8; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } } /** - * @} end of q7_to_x group + @} end of q7_to_x group */ diff --git a/DSP/Source/SupportFunctions/arm_q7_to_q31.c b/DSP/Source/SupportFunctions/arm_q7_to_q31.c index 27d0952..641c02d 100644 --- a/DSP/Source/SupportFunctions/arm_q7_to_q31.c +++ b/DSP/Source/SupportFunctions/arm_q7_to_q31.c @@ -3,13 +3,13 @@ * Title: arm_q7_to_q31.c * Description: Converts the elements of the Q7 vector to Q31 vector * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,56 +29,49 @@ #include "arm_math.h" /** - * @ingroup groupSupport + @ingroup groupSupport */ /** - * @addtogroup q7_to_x - * @{ + @addtogroup q7_to_x + @{ */ /** - * @brief Converts the elements of the Q7 vector to Q31 vector. - * @param[in] *pSrc points to the Q7 input vector - * @param[out] *pDst points to the Q31 output vector - * @param[in] blockSize length of the input vector - * @return none. - * - * \par Description: - * - * The equation used for the conversion process is: - * - * <pre> - * pDst[n] = (q31_t) pSrc[n] << 24; 0 <= n < blockSize. - * </pre> - * + @brief Converts the elements of the Q7 vector to Q31 vector. + @param[in] pSrc points to the Q7 input vector + @param[out] pDst points to the Q31 output vector + @param[in] blockSize number of samples in each vector + @return none + + @par Details + The equation used for the conversion process is: + <pre> + pDst[n] = (q31_t) pSrc[n] << 24; 0 <= n < blockSize. + </pre> */ - void arm_q7_to_q31( - q7_t * pSrc, - q31_t * pDst, - uint32_t blockSize) + const q7_t * pSrc, + q31_t * pDst, + uint32_t blockSize) { - q7_t *pIn = pSrc; /* Src pointer */ - uint32_t blkCnt; /* loop counter */ + uint32_t blkCnt; /* Loop counter */ + const q7_t *pIn = pSrc; /* Source pointer */ -#if defined (ARM_MATH_DSP) +#if defined (ARM_MATH_LOOPUNROLL) - q31_t in; + q31_t in; - /* Run the below code for Cortex-M4 and Cortex-M3 */ - - /*loop Unrolling */ + /* Loop unrolling: Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { /* C = (q31_t) A << 24 */ - /* convert from q7 to q31 and then store the results in the destination buffer */ - in = *__SIMD32(pIn)++; + + /* Convert from q7 to q31 and store result in destination buffer */ + in = read_q7x4_ia ((q7_t **) &pIn); #ifndef ARM_MATH_BIG_ENDIAN @@ -94,37 +87,35 @@ void arm_q7_to_q31( *pDst++ = (__ROR(in, 16)) & 0xFF000000; *pDst++ = (__ROR(in, 8)) & 0xFF000000; -#endif // #ifndef ARM_MATH_BIG_ENDIAN +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; #else - /* Run the below code for Cortex-M0 */ - - /* Loop over blockSize number of values */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; -#endif /* #if defined (ARM_MATH_DSP) */ +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ while (blkCnt > 0U) { /* C = (q31_t) A << 24 */ - /* convert from q7 to q31 and then store the results in the destination buffer */ + + /* Convert from q7 to q31 and store result in destination buffer */ *pDst++ = (q31_t) * pIn++ << 24; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } } /** - * @} end of q7_to_x group + @} end of q7_to_x group */ |