From 96d6da4e252b06dcfdc041e7df23e86161c33007 Mon Sep 17 00:00:00 2001 From: rihab kouki Date: Tue, 28 Jul 2020 11:24:49 +0100 Subject: Official ARM version: v5.6.0 --- docs/DSP/html/group__Conv.html | 326 +++++++++++++++++++---------------------- 1 file changed, 149 insertions(+), 177 deletions(-) (limited to 'docs/DSP/html/group__Conv.html') diff --git a/docs/DSP/html/group__Conv.html b/docs/DSP/html/group__Conv.html index dab311e..0b6f7f1 100644 --- a/docs/DSP/html/group__Conv.html +++ b/docs/DSP/html/group__Conv.html @@ -32,7 +32,7 @@ Logo
CMSIS-DSP -  Version 1.5.2 +  Version 1.7.0
CMSIS DSP Software Library
@@ -116,64 +116,61 @@ $(document).ready(function(){initNavTree('group__Conv.html','');}); - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + +

Functions

void arm_conv_f32 (float32_t *pSrcA, uint32_t srcALen, float32_t *pSrcB, uint32_t srcBLen, float32_t *pDst)
 Convolution of floating-point sequences. More...
 
void arm_conv_fast_opt_q15 (q15_t *pSrcA, uint32_t srcALen, q15_t *pSrcB, uint32_t srcBLen, q15_t *pDst, q15_t *pScratch1, q15_t *pScratch2)
 Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4. More...
 
void arm_conv_fast_q15 (q15_t *pSrcA, uint32_t srcALen, q15_t *pSrcB, uint32_t srcBLen, q15_t *pDst)
 Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4. More...
 
void arm_conv_fast_q31 (q31_t *pSrcA, uint32_t srcALen, q31_t *pSrcB, uint32_t srcBLen, q31_t *pDst)
 Convolution of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4. More...
 
void arm_conv_opt_q15 (q15_t *pSrcA, uint32_t srcALen, q15_t *pSrcB, uint32_t srcBLen, q15_t *pDst, q15_t *pScratch1, q15_t *pScratch2)
 Convolution of Q15 sequences. More...
 
void arm_conv_opt_q7 (q7_t *pSrcA, uint32_t srcALen, q7_t *pSrcB, uint32_t srcBLen, q7_t *pDst, q15_t *pScratch1, q15_t *pScratch2)
 Convolution of Q7 sequences. More...
 
void arm_conv_q15 (q15_t *pSrcA, uint32_t srcALen, q15_t *pSrcB, uint32_t srcBLen, q15_t *pDst)
 Convolution of Q15 sequences. More...
 
void arm_conv_q31 (q31_t *pSrcA, uint32_t srcALen, q31_t *pSrcB, uint32_t srcBLen, q31_t *pDst)
 Convolution of Q31 sequences. More...
 
void arm_conv_q7 (q7_t *pSrcA, uint32_t srcALen, q7_t *pSrcB, uint32_t srcBLen, q7_t *pDst)
 Convolution of Q7 sequences. More...
 
void arm_conv_f32 (const float32_t *pSrcA, uint32_t srcALen, const float32_t *pSrcB, uint32_t srcBLen, float32_t *pDst)
 Convolution of floating-point sequences. More...
 
void arm_conv_fast_opt_q15 (const q15_t *pSrcA, uint32_t srcALen, const q15_t *pSrcB, uint32_t srcBLen, q15_t *pDst, q15_t *pScratch1, q15_t *pScratch2)
 Convolution of Q15 sequences (fast version). More...
 
void arm_conv_fast_q15 (const q15_t *pSrcA, uint32_t srcALen, const q15_t *pSrcB, uint32_t srcBLen, q15_t *pDst)
 Convolution of Q15 sequences (fast version). More...
 
void arm_conv_fast_q31 (const q31_t *pSrcA, uint32_t srcALen, const q31_t *pSrcB, uint32_t srcBLen, q31_t *pDst)
 Convolution of Q31 sequences (fast version). More...
 
void arm_conv_opt_q15 (const q15_t *pSrcA, uint32_t srcALen, const q15_t *pSrcB, uint32_t srcBLen, q15_t *pDst, q15_t *pScratch1, q15_t *pScratch2)
 Convolution of Q15 sequences. More...
 
void arm_conv_opt_q7 (const q7_t *pSrcA, uint32_t srcALen, const q7_t *pSrcB, uint32_t srcBLen, q7_t *pDst, q15_t *pScratch1, q15_t *pScratch2)
 Convolution of Q7 sequences. More...
 
void arm_conv_q15 (const q15_t *pSrcA, uint32_t srcALen, const q15_t *pSrcB, uint32_t srcBLen, q15_t *pDst)
 Convolution of Q15 sequences. More...
 
void arm_conv_q31 (const q31_t *pSrcA, uint32_t srcALen, const q31_t *pSrcB, uint32_t srcBLen, q31_t *pDst)
 Convolution of Q31 sequences. More...
 
void arm_conv_q7 (const q7_t *pSrcA, uint32_t srcALen, const q7_t *pSrcB, uint32_t srcBLen, q7_t *pDst)
 Convolution of Q7 sequences. More...
 

Description

-

Convolution is a mathematical operation that operates on two finite length vectors to generate a finite length output vector. Convolution is similar to correlation and is frequently used in filtering and data analysis. The CMSIS DSP library contains functions for convolving Q7, Q15, Q31, and floating-point data types. The library also provides fast versions of the Q15 and Q31 functions on Cortex-M4 and Cortex-M3.

-
Algorithm
Let a[n] and b[n] be sequences of length srcALen and srcBLen samples respectively. Then the convolution
-
-                  c[n] = a[n] * b[n]
-
is defined as
+

Convolution is a mathematical operation that operates on two finite length vectors to generate a finite length output vector. Convolution is similar to correlation and is frequently used in filtering and data analysis. The CMSIS DSP library contains functions for convolving Q7, Q15, Q31, and floating-point data types. The library also provides fast versions of the Q15 and Q31 functions.

+
Algorithm
Let a[n] and b[n] be sequences of length srcALen and srcBLen samples respectively. Then the convolution
+    c[n] = a[n] * b[n]
+ 
+
is defined as
ConvolutionEquation.gif
-
Note that c[n] is of length srcALen + srcBLen - 1 and is defined over the interval n=0, 1, 2, ..., srcALen + srcBLen - 2. pSrcA points to the first input vector of length srcALen and pSrcB points to the second input vector of length srcBLen. The output result is written to pDst and the calling function must allocate srcALen+srcBLen-1 words for the result.
-
Conceptually, when two signals a[n] and b[n] are convolved, the signal b[n] slides over a[n]. For each offset n, the overlapping portions of a[n] and b[n] are multiplied and summed together.
-
Note that convolution is a commutative operation:
-
-                  a[n] * b[n] = b[n] * a[n].
-
This means that switching the A and B arguments to the convolution functions has no effect.
-

Fixed-Point Behavior

-
Convolution requires summing up a large number of intermediate products. As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation. Refer to the function specific documentation below for further details of the particular algorithm used.
-

Fast Versions

-
Fast versions are supported for Q31 and Q15. Cycles for Fast versions are less compared to Q31 and Q15 of conv and the design requires the input signals should be scaled down to avoid intermediate overflows.
-

Opt Versions

-
Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation. These versions are optimised in cycles and consumes more memory(Scratch memory) compared to Q15 and Q7 versions
+
Note that c[n] is of length srcALen + srcBLen - 1 and is defined over the interval n=0, 1, 2, ..., srcALen + srcBLen - 2. pSrcA points to the first input vector of length srcALen and pSrcB points to the second input vector of length srcBLen. The output result is written to pDst and the calling function must allocate srcALen+srcBLen-1 words for the result.
+
Conceptually, when two signals a[n] and b[n] are convolved, the signal b[n] slides over a[n]. For each offset n, the overlapping portions of a[n] and b[n] are multiplied and summed together.
+
Note that convolution is a commutative operation:
+    a[n] * b[n] = b[n] * a[n].
+ 
+
This means that switching the A and B arguments to the convolution functions has no effect.
+
Fixed-Point Behavior
Convolution requires summing up a large number of intermediate products. As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation. Refer to the function specific documentation below for further details of the particular algorithm used.
+
Fast Versions
Fast versions are supported for Q31 and Q15. The fast versions take fewer cycles than the standard Q31 and Q15 convolution functions, but the design requires that the input signals be scaled down to avoid intermediate overflows.
+
Opt Versions
Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation. These versions are optimised in cycles and consume more memory (scratch memory) compared to Q15 and Q7 versions.

Function Documentation

- +
- + @@ -185,7 +182,7 @@ Functions - + @@ -209,28 +206,26 @@ Functions
Parameters
void arm_conv_f32 (float32_tconst float32_t pSrcA,
float32_tconst float32_t pSrcB,
- - - - - + + + + +
[in]*pSrcApoints to the first input sequence.
[in]srcALenlength of the first input sequence.
[in]*pSrcBpoints to the second input sequence.
[in]srcBLenlength of the second input sequence.
[out]*pDstpoints to the location where the output result is written. Length srcALen+srcBLen-1.
[in]pSrcApoints to the first input sequence
[in]srcALenlength of the first input sequence
[in]pSrcBpoints to the second input sequence
[in]srcBLenlength of the second input sequence
[out]pDstpoints to the location where the output result is written. Length srcALen+srcBLen-1.
-
Returns
none.
- -

References srcALen, and srcBLen.

+
Returns
none
- +
- + @@ -242,7 +237,7 @@ Functions - + @@ -276,36 +271,33 @@ Functions
void arm_conv_fast_opt_q15 (q15_tconst q15_t pSrcA,
q15_tconst q15_t pSrcB,
+

Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.

Parameters
- - - - - - - + + + + + + +
[in]*pSrcApoints to the first input sequence.
[in]srcALenlength of the first input sequence.
[in]*pSrcBpoints to the second input sequence.
[in]srcBLenlength of the second input sequence.
[out]*pDstpoints to the location where the output result is written. Length srcALen+srcBLen-1.
[in]*pScratch1points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
[in]*pScratch2points to scratch buffer of size min(srcALen, srcBLen).
[in]pSrcApoints to the first input sequence
[in]srcALenlength of the first input sequence
[in]pSrcBpoints to the second input sequence
[in]srcBLenlength of the second input sequence
[out]pDstpoints to the location where the output result is written. Length srcALen+srcBLen-1
[in]pScratch1points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2
[in]pScratch2points to scratch buffer of size min(srcALen, srcBLen)
-
Returns
none.
-
Restrictions
If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit
-

Scaling and Overflow Behavior:

-
This fast version uses a 32-bit accumulator with 2.30 format. The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit. There is no saturation on intermediate additions. Thus, if the accumulator overflows it wraps around and distorts the result. The input signals should be scaled down to avoid intermediate overflows. Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows, as maximum of min(srcALen, srcBLen) number of additions are carried internally. The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
-
See arm_conv_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
- -

References __PKHBT, __SIMD32, __SMLAD(), __SMLADX(), _SIMD32_OFFSET, arm_copy_q15(), arm_fill_q15(), srcALen, and srcBLen.

+
Returns
none
+
Scaling and Overflow Behavior
This fast version uses a 32-bit accumulator with 2.30 format. The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit. There is no saturation on intermediate additions. Thus, if the accumulator overflows it wraps around and distorts the result. The input signals should be scaled down to avoid intermediate overflows. Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows, as maximum of min(srcALen, srcBLen) number of additions are carried internally. The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
+
Remarks
Refer to arm_conv_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
- +
- + @@ -317,7 +309,7 @@ Functions - + @@ -339,33 +331,31 @@ Functions
void arm_conv_fast_q15 (q15_tconst q15_t pSrcA,
q15_tconst q15_t pSrcB,
+

Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.

Parameters
- - - - - + + + + +
[in]*pSrcApoints to the first input sequence.
[in]srcALenlength of the first input sequence.
[in]*pSrcBpoints to the second input sequence.
[in]srcBLenlength of the second input sequence.
[out]*pDstpoints to the location where the output result is written. Length srcALen+srcBLen-1.
[in]pSrcApoints to the first input sequence
[in]srcALenlength of the first input sequence
[in]pSrcBpoints to the second input sequence
[in]srcBLenlength of the second input sequence
[out]pDstpoints to the location where the output result is written. Length srcALen+srcBLen-1
-
Returns
none.
-

Scaling and Overflow Behavior:

-
This fast version uses a 32-bit accumulator with 2.30 format. The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit. There is no saturation on intermediate additions. Thus, if the accumulator overflows it wraps around and distorts the result. The input signals should be scaled down to avoid intermediate overflows. Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows, as maximum of min(srcALen, srcBLen) number of additions are carried internally. The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
-
See arm_conv_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
- -

References __PKHBT, __SIMD32, __SMLAD(), __SMLADX(), _SIMD32_OFFSET, srcALen, and srcBLen.

+
Returns
none
+
Scaling and Overflow Behavior
This fast version uses a 32-bit accumulator with 2.30 format. The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit. There is no saturation on intermediate additions. Thus, if the accumulator overflows it wraps around and distorts the result. The input signals should be scaled down to avoid intermediate overflows. Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows, as maximum of min(srcALen, srcBLen) number of additions are carried internally. The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
+
Remarks
Refer to arm_conv_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
- +
- + @@ -377,7 +367,7 @@ Functions - + @@ -399,34 +389,32 @@ Functions
void arm_conv_fast_q31 (q31_tconst q31_t pSrcA,
q31_tconst q31_t pSrcB,
+

Convolution of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4.

Parameters
- + - + - +
[in]*pSrcApoints to the first input sequence.
[in]pSrcApoints to the first input sequence.
[in]srcALenlength of the first input sequence.
[in]*pSrcBpoints to the second input sequence.
[in]pSrcBpoints to the second input sequence.
[in]srcBLenlength of the second input sequence.
[out]*pDstpoints to the location where the output result is written. Length srcALen+srcBLen-1.
[out]pDstpoints to the location where the output result is written. Length srcALen+srcBLen-1.
-
Returns
none.
-

Scaling and Overflow Behavior:

-
This function is optimized for speed at the expense of fixed-point precision and overflow protection. The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30 format. Finally, the accumulator is saturated and converted to a 1.31 result.
-
The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result. In order to avoid overflows completely the input signals must be scaled down. Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows, as maximum of min(srcALen, srcBLen) number of additions are carried internally.
-
See arm_conv_q31() for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.
- -

References srcALen, and srcBLen.

+
Returns
none
+
Scaling and Overflow Behavior
This function is optimized for speed at the expense of fixed-point precision and overflow protection. The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30 format. Finally, the accumulator is saturated and converted to a 1.31 result.
+
The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result. In order to avoid overflows completely the input signals must be scaled down. Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows, as maximum of min(srcALen, srcBLen) number of additions are carried internally.
+
Remarks
Refer to arm_conv_q31() for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.
- +
- + @@ -438,7 +426,7 @@ Functions - + @@ -474,34 +462,30 @@ Functions
Parameters
void arm_conv_opt_q15 (q15_tconst q15_t pSrcA,
q15_tconst q15_t pSrcB,
- - - - - - - + + + + + + +
[in]*pSrcApoints to the first input sequence.
[in]srcALenlength of the first input sequence.
[in]*pSrcBpoints to the second input sequence.
[in]srcBLenlength of the second input sequence.
[out]*pDstpoints to the location where the output result is written. Length srcALen+srcBLen-1.
[in]*pScratch1points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
[in]*pScratch2points to scratch buffer of size min(srcALen, srcBLen).
[in]pSrcApoints to the first input sequence
[in]srcALenlength of the first input sequence
[in]pSrcBpoints to the second input sequence
[in]srcBLenlength of the second input sequence
[out]pDstpoints to the location where the output result is written. Length srcALen+srcBLen-1.
[in]pScratch1points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
[in]pScratch2points to scratch buffer of size min(srcALen, srcBLen).
-
Returns
none.
-
Restrictions
If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit
-

Scaling and Overflow Behavior:

-
The function is implemented using a 64-bit internal accumulator. Both inputs are in 1.15 format and multiplications yield a 2.30 result. The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. This approach provides 33 guard bits and there is no risk of overflow. The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
-
Refer to arm_conv_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
- -

References __PKHBT, __SIMD32, __SMLALD(), __SMLALDX(), _SIMD32_OFFSET, arm_copy_q15(), arm_fill_q15(), srcALen, and srcBLen.

+
Returns
none
+
Scaling and Overflow Behavior
The function is implemented using a 64-bit internal accumulator. Both inputs are in 1.15 format and multiplications yield a 2.30 result. The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. This approach provides 33 guard bits and there is no risk of overflow. The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
+
Remarks
Refer to arm_conv_fast_q15() for a faster but less precise version of this function.
- +
- + @@ -513,7 +497,7 @@ Functions - + @@ -549,33 +533,29 @@ Functions
Parameters
void arm_conv_opt_q7 (q7_tconst q7_t pSrcA,
q7_tconst q7_t pSrcB,
- - - - - - - + + + + + + +
[in]*pSrcApoints to the first input sequence.
[in]srcALenlength of the first input sequence.
[in]*pSrcBpoints to the second input sequence.
[in]srcBLenlength of the second input sequence.
[out]*pDstpoints to the location where the output result is written. Length srcALen+srcBLen-1.
[in]*pScratch1points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
[in]*pScratch2points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
[in]pSrcApoints to the first input sequence
[in]srcALenlength of the first input sequence
[in]pSrcBpoints to the second input sequence
[in]srcBLenlength of the second input sequence
[out]pDstpoints to the location where the output result is written. Length srcALen+srcBLen-1.
[in]pScratch1points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
[in]pScratch2points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
-
Returns
none.
-
Restrictions
If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit
-

Scaling and Overflow Behavior:

-
The function is implemented using a 32-bit internal accumulator. Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result. The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072. The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
- -

References __PACKq7, __PKHBT, __SIMD32, __SMLAD(), __SMLADX(), _SIMD32_OFFSET, arm_fill_q15(), srcALen, and srcBLen.

+
Returns
none
+
Scaling and Overflow Behavior
The function is implemented using a 32-bit internal accumulator. Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result. The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072. The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
- +
- + @@ -587,7 +567,7 @@ Functions - + @@ -611,32 +591,30 @@ Functions
Parameters
void arm_conv_q15 (q15_tconst q15_t pSrcA,
q15_tconst q15_t pSrcB,
- - - - - + + + + +
[in]*pSrcApoints to the first input sequence.
[in]srcALenlength of the first input sequence.
[in]*pSrcBpoints to the second input sequence.
[in]srcBLenlength of the second input sequence.
[out]*pDstpoints to the location where the output result is written. Length srcALen+srcBLen-1.
[in]pSrcApoints to the first input sequence
[in]srcALenlength of the first input sequence
[in]pSrcBpoints to the second input sequence
[in]srcBLenlength of the second input sequence
[out]pDstpoints to the location where the output result is written. Length srcALen+srcBLen-1.
-
Returns
none.
-

Scaling and Overflow Behavior:

-
The function is implemented using a 64-bit internal accumulator. Both inputs are in 1.15 format and multiplications yield a 2.30 result. The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. This approach provides 33 guard bits and there is no risk of overflow. The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
-
Refer to arm_conv_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
-
Refer the function arm_conv_opt_q15() for a faster implementation of this function using scratch buffers.
- -

References __PKHBT, __SIMD32, __SMLALD(), __SMLALDX(), _SIMD32_OFFSET, srcALen, and srcBLen.

+
Returns
none
+
Scaling and Overflow Behavior
The function is implemented using a 64-bit internal accumulator. Both inputs are in 1.15 format and multiplications yield a 2.30 result. The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. This approach provides 33 guard bits and there is no risk of overflow. The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
+
Remarks
Refer to arm_conv_fast_q15() for a faster but less precise version of this function.
+
+Refer to arm_conv_opt_q15() for a faster implementation of this function using scratch buffers.
- +
- + @@ -648,7 +626,7 @@ Functions - + @@ -672,31 +650,28 @@ Functions
Parameters
void arm_conv_q31 (q31_tconst q31_t pSrcA,
q31_tconst q31_t pSrcB,
- - - - - + + + + +
[in]*pSrcApoints to the first input sequence.
[in]srcALenlength of the first input sequence.
[in]*pSrcBpoints to the second input sequence.
[in]srcBLenlength of the second input sequence.
[out]*pDstpoints to the location where the output result is written. Length srcALen+srcBLen-1.
[in]pSrcApoints to the first input sequence
[in]srcALenlength of the first input sequence
[in]pSrcBpoints to the second input sequence
[in]srcBLenlength of the second input sequence
[out]pDstpoints to the location where the output result is written. Length srcALen+srcBLen-1.
-
Returns
none.
-

Scaling and Overflow Behavior:

-
The function is implemented using an internal 64-bit accumulator. The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit. There is no saturation on intermediate additions. Thus, if the accumulator overflows it wraps around and distorts the result. The input signals should be scaled down to avoid intermediate overflows. Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows, as maximum of min(srcALen, srcBLen) number of additions are carried internally. The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
-
See arm_conv_fast_q31() for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.
- -

References srcALen, and srcBLen.

+
Returns
none
+
Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator. The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit. There is no saturation on intermediate additions. Thus, if the accumulator overflows it wraps around and distorts the result. The input signals should be scaled down to avoid intermediate overflows. Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows, as maximum of min(srcALen, srcBLen) number of additions are carried internally. The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
+
Remarks
Refer to arm_conv_fast_q31() for a faster but less precise implementation of this function.
- +
- + @@ -708,7 +683,7 @@ Functions - + @@ -732,20 +707,17 @@ Functions
Parameters
void arm_conv_q7 (q7_tconst q7_t pSrcA,
q7_tconst q7_t pSrcB,
- - - - - + + + + +
[in]*pSrcApoints to the first input sequence.
[in]srcALenlength of the first input sequence.
[in]*pSrcBpoints to the second input sequence.
[in]srcBLenlength of the second input sequence.
[out]*pDstpoints to the location where the output result is written. Length srcALen+srcBLen-1.
[in]pSrcApoints to the first input sequence
[in]srcALenlength of the first input sequence
[in]pSrcBpoints to the second input sequence
[in]srcBLenlength of the second input sequence
[out]pDstpoints to the location where the output result is written. Length srcALen+srcBLen-1.
-
Returns
none.
-

Scaling and Overflow Behavior:

-
The function is implemented using a 32-bit internal accumulator. Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result. The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072. The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
-
Refer the function arm_conv_opt_q7() for a faster implementation of this function.
- -

References __SMLAD(), srcALen, and srcBLen.

+
Returns
none
+
Scaling and Overflow Behavior
The function is implemented using a 32-bit internal accumulator. Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result. The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072. The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
+
Remarks
Refer to arm_conv_opt_q7() for a faster implementation of this function.
@@ -754,7 +726,7 @@ Functions