author     rihab kouki <rihab.kouki@st.com>    2020-07-28 11:24:49 +0100
committer  rihab kouki <rihab.kouki@st.com>    2020-07-28 11:24:49 +0100
commit     96d6da4e252b06dcfdc041e7df23e86161c33007 (patch)
tree       a262f59bb1db7ec7819acae435f5049cbe5e2354 /NN/Include
parent     9f95ff5b6ba01db09552b84a0ab79607060a2666 (diff)
Official ARM version: v5.6.0 (HEAD, master)
Diffstat (limited to 'NN/Include')
-rw-r--r--  NN/Include/arm_nn_tables.h           |   3
-rw-r--r--  NN/Include/arm_nnfunctions.h         | 205
-rw-r--r--  NN/Include/arm_nnsupportfunctions.h  |  95
3 files changed, 216 insertions, 87 deletions
diff --git a/NN/Include/arm_nn_tables.h b/NN/Include/arm_nn_tables.h
index d56d82c..36be5a8 100644
--- a/NN/Include/arm_nn_tables.h
+++ b/NN/Include/arm_nn_tables.h
@@ -53,7 +53,4 @@ extern const q15_t tanhTable_q15[256];
extern const q15_t sigmoidHTable_q15[192];
extern const q15_t sigmoidLTable_q15[128];
-extern const q15_t sigmoidLTable_q15[128];
-extern const q15_t sigmoidHTable_q15[192];
-
#endif /* ARM_NN_TABLES_H */
diff --git a/NN/Include/arm_nnfunctions.h b/NN/Include/arm_nnfunctions.h
index c6ec83a..331255b 100644
--- a/NN/Include/arm_nnfunctions.h
+++ b/NN/Include/arm_nnfunctions.h
@@ -34,7 +34,7 @@
* ------------
*
* This user manual describes the CMSIS NN software library,
- * a collection of efficient neural network kernels developed to maximize the
+ * a collection of efficient neural network kernels developed to maximize the
* performance and minimize the memory footprint of neural networks on Cortex-M processor cores.
*
* The library is divided into a number of functions each covering a specific category:
@@ -47,8 +47,8 @@
*
* The library has separate functions for operating on different weight and activation data
* types including 8-bit integers (q7_t) and 16-bit integers (q15_t). The description of the
- * kernels are included in the function description. The implementation details are also
- * described in this paper [1].
+ * kernels are included in the function description. The implementation details are also
+ * described in this paper [1].
*
* Block Diagram
* --------
@@ -86,7 +86,7 @@
/**
* @defgroup groupNN Neural Network Functions
- * These functions perform basic operations for neural network layers.
+ * These functions perform basic operations for neural network layers.
*/
#ifndef _ARM_NNFUNCTIONS_H
@@ -111,12 +111,12 @@ extern "C"
*
* The convolution is implemented in 2 steps: im2col and GEMM
*
- * im2col is a process of converting each patch of image data into
+ * im2col is a process of converting each patch of image data into
* a column. After im2col, the convolution is computed as matrix-matrix
* multiplication.
- *
+ *
* To reduce the memory footprint, the im2col is performed partially.
- * Each iteration, only a few columns (i.e., patches) are generated and
+ * Each iteration, only a few columns (i.e., patches) are generated and
* computed with GEMM kernels similar to CMSIS-DSP arm_mat_mult functions.
*
*/
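As a concrete illustration of the partial im2col scheme described above, the sketch below gathers the receptive field of a single output pixel into one column and reduces it with a dot product. It assumes HWC data layout, unit stride and no padding, and the helper names (im2col_patch, conv_output_pixel) are illustrative, not CMSIS-NN APIs.

/* Minimal im2col + dot-product sketch (HWC layout, stride 1, no padding).
 * The real kernels generate a few such columns at a time and feed them to
 * a GEMM routine instead of this scalar reduction. */
#include <stdint.h>

typedef int8_t  q7_t;   /* stands in for the CMSIS q7_t type  */
typedef int16_t q15_t;  /* stands in for the CMSIS q15_t type */

/* Copy the dim_kernel x dim_kernel x ch_im_in patch feeding output pixel
 * (out_x, out_y) into one im2col column. */
static void im2col_patch(const q7_t *im_in, uint16_t dim_im_in, uint16_t ch_im_in,
                         uint16_t dim_kernel, uint16_t out_x, uint16_t out_y,
                         q15_t *col)
{
    for (uint16_t ky = 0; ky < dim_kernel; ky++)
        for (uint16_t kx = 0; kx < dim_kernel; kx++)
            for (uint16_t c = 0; c < ch_im_in; c++)
                *col++ = im_in[((out_y + ky) * dim_im_in + (out_x + kx)) * ch_im_in + c];
}

/* One output value is the dot product of a weight row with that column. */
static int32_t conv_output_pixel(const q7_t *wt_row, const q15_t *col, uint32_t len)
{
    int32_t acc = 0;
    for (uint32_t i = 0; i < len; i++)
        acc += (int32_t)wt_row[i] * col[i];
    return acc;
}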
@@ -136,9 +136,9 @@ extern "C"
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
- * @param[in,out] bufferA pointer to buffer space for input
+ * @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
- * @return The function returns <code>ARM_MATH_SUCCESS</code>
+ * @return The function returns <code>ARM_MATH_SUCCESS</code>
*
*/
@@ -153,9 +153,9 @@ extern "C"
const q7_t * bias,
const uint16_t bias_shift,
const uint16_t out_shift,
- q7_t * Im_out,
- const uint16_t dim_im_out,
- q15_t * bufferA,
+ q7_t * Im_out,
+ const uint16_t dim_im_out,
+ q15_t * bufferA,
q7_t * bufferB);
/**
@@ -180,7 +180,7 @@ extern "C"
* @param[in] dim_im_out_y output tensor dimension y
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
- * @return The function returns <code>ARM_MATH_SUCCESS</code>
+ * @return The function returns <code>ARM_MATH_SUCCESS</code>
*/
arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t * Im_in,
@@ -219,9 +219,9 @@ extern "C"
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
- * @param[in,out] bufferA pointer to buffer space for input
+ * @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
- * @return The function returns <code>ARM_MATH_SUCCESS</code>
+ * @return The function returns <code>ARM_MATH_SUCCESS</code>
*
*/
@@ -236,9 +236,9 @@ extern "C"
const q15_t * bias,
const uint16_t bias_shift,
const uint16_t out_shift,
- q15_t * Im_out,
- const uint16_t dim_im_out,
- q15_t * bufferA,
+ q15_t * Im_out,
+ const uint16_t dim_im_out,
+ q15_t * bufferA,
q7_t * bufferB);
/**
@@ -256,7 +256,7 @@ extern "C"
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
- * @param[in,out] bufferA pointer to buffer space for input
+ * @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@@ -278,9 +278,9 @@ extern "C"
const q7_t * bias,
const uint16_t bias_shift,
const uint16_t out_shift,
- q7_t * Im_out,
- const uint16_t dim_im_out,
- q15_t * bufferA,
+ q7_t * Im_out,
+ const uint16_t dim_im_out,
+ q15_t * bufferA,
q7_t * bufferB);
/**
@@ -303,7 +303,7 @@ extern "C"
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
- * @param[in,out] bufferA pointer to buffer space for input
+ * @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@@ -355,7 +355,7 @@ extern "C"
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
- * @param[in,out] bufferA pointer to buffer space for input
+ * @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@@ -405,7 +405,7 @@ extern "C"
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
- * @param[in,out] bufferA pointer to buffer space for input
+ * @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@@ -426,9 +426,9 @@ extern "C"
const q7_t * bias,
const uint16_t bias_shift,
const uint16_t out_shift,
- q7_t * Im_out,
- const uint16_t dim_im_out,
- q15_t * bufferA,
+ q7_t * Im_out,
+ const uint16_t dim_im_out,
+ q15_t * bufferA,
q7_t * bufferB);
/**
@@ -446,7 +446,7 @@ extern "C"
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
- * @param[in,out] bufferA pointer to buffer space for input
+ * @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@@ -468,9 +468,9 @@ extern "C"
const q15_t * bias,
const uint16_t bias_shift,
const uint16_t out_shift,
- q15_t * Im_out,
- const uint16_t dim_im_out,
- q15_t * bufferA,
+ q15_t * Im_out,
+ const uint16_t dim_im_out,
+ q15_t * bufferA,
q7_t * bufferB);
/**
@@ -493,7 +493,7 @@ extern "C"
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
- * @param[in,out] bufferA pointer to buffer space for input
+ * @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@@ -508,7 +508,7 @@ extern "C"
*
* <b>Input dimension constraints:</b>
*
- * ch_im_in is multiple of 2
+ * ch_im_in is multiple of 2
*
* ch_im_out is multiple of 2
*
@@ -532,10 +532,10 @@ extern "C"
const uint16_t out_shift,
q15_t * Im_out,
const uint16_t dim_im_out_x,
- const uint16_t dim_im_out_y,
- q15_t * bufferA,
+ const uint16_t dim_im_out_y,
+ q15_t * bufferA,
q7_t * bufferB);
-
+
/**
* @brief Q7 depthwise separable convolution function
* @param[in] Im_in pointer to input tensor
@@ -551,7 +551,7 @@ extern "C"
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
- * @param[in,out] bufferA pointer to buffer space for input
+ * @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@@ -574,8 +574,8 @@ extern "C"
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t * Im_out,
- const uint16_t dim_im_out,
- q15_t * bufferA,
+ const uint16_t dim_im_out,
+ q15_t * bufferA,
q7_t * bufferB);
/**
@@ -598,7 +598,7 @@ extern "C"
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
- * @param[in,out] bufferA pointer to buffer space for input
+ * @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@@ -642,7 +642,7 @@ extern "C"
*
* Here we have two types of kernel functions. The basic function
* implements the function using regular GEMV approach. The opt functions
- * operates with weights in interleaved formats.
+ * operates with weights in interleaved formats.
*
*/
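To make the GEMV formulation of the basic kernels concrete, a scalar reference version under the bias_shift / out_shift / NN_ROUND convention of the declarations below might look like this sketch; fc_gemv_ref is an illustrative name, and the optimized variants additionally use interleaved weights and the vec_buffer scratch area.

/* Reference GEMV form of a q7 fully-connected layer; a sketch only, assuming
 * arm_nnsupportfunctions.h provides NN_ROUND and the CMSIS core provides __SSAT. */
#include "arm_nnsupportfunctions.h"

static void fc_gemv_ref(const q7_t *pV, const q7_t *pM, uint16_t dim_vec,
                        uint16_t num_of_rows, uint16_t bias_shift,
                        uint16_t out_shift, const q7_t *bias, q7_t *pOut)
{
    for (uint16_t r = 0; r < num_of_rows; r++)
    {
        /* Scale the bias up and add the rounding offset before the final shift. */
        int32_t acc = ((int32_t)bias[r] << bias_shift) + NN_ROUND(out_shift);
        for (uint16_t c = 0; c < dim_vec; c++)
        {
            acc += (int32_t)pM[r * dim_vec + c] * pV[c];  /* weight row x input vector */
        }
        pOut[r] = (q7_t)__SSAT(acc >> out_shift, 8);      /* shift and saturate to q7  */
    }
}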
@@ -666,9 +666,9 @@ extern "C"
const uint16_t dim_vec,
const uint16_t num_of_rows,
const uint16_t bias_shift,
- const uint16_t out_shift,
- const q7_t * bias,
- q7_t * pOut,
+ const uint16_t out_shift,
+ const q7_t * bias,
+ q7_t * pOut,
q15_t * vec_buffer);
/**
@@ -691,9 +691,9 @@ extern "C"
const uint16_t dim_vec,
const uint16_t num_of_rows,
const uint16_t bias_shift,
- const uint16_t out_shift,
- const q7_t * bias,
- q7_t * pOut,
+ const uint16_t out_shift,
+ const q7_t * bias,
+ q7_t * pOut,
q15_t * vec_buffer);
/**
@@ -716,9 +716,9 @@ extern "C"
const uint16_t dim_vec,
const uint16_t num_of_rows,
const uint16_t bias_shift,
- const uint16_t out_shift,
- const q15_t * bias,
- q15_t * pOut,
+ const uint16_t out_shift,
+ const q15_t * bias,
+ q15_t * pOut,
q15_t * vec_buffer);
/**
@@ -742,8 +742,8 @@ extern "C"
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift,
- const q15_t * bias,
- q15_t * pOut,
+ const q15_t * bias,
+ q15_t * pOut,
q15_t * vec_buffer);
/**
@@ -767,8 +767,8 @@ extern "C"
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift,
- const q7_t * bias,
- q15_t * pOut,
+ const q7_t * bias,
+ q15_t * pOut,
q15_t * vec_buffer);
/**
@@ -792,16 +792,16 @@ extern "C"
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift,
- const q7_t * bias,
- q15_t * pOut,
+ const q7_t * bias,
+ q15_t * pOut,
q15_t * vec_buffer);
/**
* @brief Matrix-Multiplication Kernels for Convolution
*
- * These functions are used within convolution layer functions for
+ * These functions are used within convolution layer functions for
* matrix multiplication.
- *
+ *
* The implementation is similar to CMSIS-DSP arm_mat_mult functions
* with one Q7 and one Q15 operand. The Q15 operand is the im2col
* output, which always has 2 columns.
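To make the 2-column layout concrete, a scalar reference of this step could look like the sketch below; mat_mult_two_cols_ref is an illustrative name, and bias handling, rounding and saturation (present in the kernels declared below) are omitted.

/* Scalar sketch of the 2-column matrix multiplication: pA holds ch_im_out rows
 * of numCol_A q7 weights, pInBuffer holds two im2col columns back to back. */
#include "arm_nnsupportfunctions.h"

static void mat_mult_two_cols_ref(const q7_t *pA, const q15_t *pInBuffer,
                                  uint16_t ch_im_out, uint16_t numCol_A,
                                  q31_t *out0, q31_t *out1)
{
    for (uint16_t row = 0; row < ch_im_out; row++)
    {
        const q7_t *pW = pA + (uint32_t)row * numCol_A;
        q31_t sum0 = 0, sum1 = 0;
        for (uint16_t col = 0; col < numCol_A; col++)
        {
            sum0 += (q31_t)pW[col] * pInBuffer[col];             /* first output pixel  */
            sum1 += (q31_t)pW[col] * pInBuffer[col + numCol_A];  /* second output pixel */
        }
        out0[row] = sum0;
        out1[row] = sum1;
    }
}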
@@ -826,8 +826,8 @@ extern "C"
const uint16_t ch_im_out,
const uint16_t numCol_A,
const uint16_t bias_shift,
- const uint16_t out_shift,
- const q7_t * bias,
+ const uint16_t out_shift,
+ const q7_t * bias,
q7_t * pOut);
/**
@@ -848,8 +848,8 @@ extern "C"
const uint16_t ch_im_out,
const uint16_t numCol_A,
const uint16_t bias_shift,
- const uint16_t out_shift,
- const q7_t * bias,
+ const uint16_t out_shift,
+ const q7_t * bias,
q7_t * pOut);
#ifdef __cplusplus
@@ -902,7 +902,7 @@ extern "C"
* @return none.
*/
- void arm_nn_activations_direct_q7(q7_t * data, uint16_t size, uint16_t int_width,
+ void arm_nn_activations_direct_q7(q7_t * data, uint16_t size, uint16_t int_width,
arm_nn_activation_type type);
/**
@@ -944,9 +944,9 @@ extern "C"
const uint16_t ch_im_in,
const uint16_t dim_kernel,
const uint16_t padding,
- const uint16_t stride,
- const uint16_t dim_im_out,
- q7_t * bufferA,
+ const uint16_t stride,
+ const uint16_t dim_im_out,
+ q7_t * bufferA,
q7_t * Im_out);
/**
@@ -969,9 +969,9 @@ extern "C"
const uint16_t ch_im_in,
const uint16_t dim_kernel,
const uint16_t padding,
- const uint16_t stride,
- const uint16_t dim_im_out,
- q7_t * bufferA,
+ const uint16_t stride,
+ const uint16_t dim_im_out,
+ q7_t * bufferA,
q7_t * Im_out);
/**
@@ -1003,6 +1003,71 @@ extern "C"
void arm_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out);
+ /**
+ * @brief uint8 depthwise convolution function with asymmetric quantization, for an even channel multiplier
+ * and an even number of input channels. Unless specified otherwise, arguments are mandatory.
+ *
+ * @param[in] input Pointer to input tensor
+ * @param[in] input_x Width of input tensor
+ * @param[in] input_y Height of input tensor
+ * @param[in] input_ch Channels in input tensor
+ * @param[in] kernel Pointer to kernel weights
+ * @param[in] kernel_x Width of kernel
+ * @param[in] kernel_y Height of kernel
+ * @param[in] ch_mult Number of channel multiplier
+ * @param[in] pad_x Padding sizes x
+ * @param[in] pad_y Padding sizes y
+ * @param[in] stride_x Convolution stride along the width
+ * @param[in] stride_y Convolution stride along the height
+ * @param[in] dilation_x Dilation along width. Not used and intended for future enhancement.
+ * @param[in] dilation_y Dilation along height. Not used and intended for future enhancement.
+ * @param[in] bias Pointer to optional bias values. If no bias is
+ *                                   available, NULL is expected
+ * @param[in] input_offset Input tensor zero offset
+ * @param[in] filter_offset Kernel tensor zero offset
+ * @param[in] output_offset Output tensor zero offset
+ * @param[in,out] output Pointer to output tensor
+ * @param[in] output_x Width of output tensor
+ * @param[in] output_y Height of output tensor
+ * @param[in] output_activation_min Minimum value to clamp the output to. Range : {0, 255}
+ * @param[in] output_activation_max Maximum value to clamp the output to. Range : {0, 255}
+ * @param[in] out_shift Amount of right-shift for output
+ * @param[in] out_mult Output multiplier for requantization
+ * @return The function returns one of the following
+ * <code>ARM_MATH_SIZE_MISMATCH</code> - Not supported dimension of tensors
+ * <code>ARM_MATH_SUCCESS</code> - Successful operation
+ * <code>ARM_MATH_ARGUMENT_ERROR</code> - Implementation not available
+ *
+ * <b> Input constraints</b>
+ * ch_mult is multiple of 2
+ * kernel_x is multiple of 2
+ *
+ */
+ arm_status arm_depthwise_conv_u8_basic_ver1(const uint8_t *input,
+ const uint16_t input_x,
+ const uint16_t input_y,
+ const uint16_t input_ch,
+ const uint8_t *kernel,
+ const uint16_t kernel_x,
+ const uint16_t kernel_y,
+ const int16_t ch_mult,
+ const int16_t pad_x,
+ const int16_t pad_y,
+ const int16_t stride_x,
+ const int16_t stride_y,
+ const int16_t dilation_x,
+ const int16_t dilation_y,
+ const int32_t *bias,
+ const int32_t input_offset,
+ const int32_t filter_offset,
+ const int32_t output_offset,
+ uint8_t *output,
+ const uint16_t output_x,
+ const uint16_t output_y,
+ const int32_t output_activation_min,
+ const int32_t output_activation_max,
+ const int32_t out_shift,
+ const int32_t out_mult);
#ifdef __cplusplus
}
#endif
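A hypothetical invocation of the new arm_depthwise_conv_u8_basic_ver1 kernel is sketched below for a 2x2 kernel and channel multiplier 2, which satisfies the even-size constraints documented above; all dimensions, offsets and requantization parameters are illustrative values, not recommendations.

/* Illustrative call only: buffer sizes and quantization parameters are made up. */
#include "arm_nnfunctions.h"

#define IN_X    16
#define IN_Y    16
#define IN_CH    8
#define CH_MULT  2
#define K_DIM    2
#define OUT_X   15   /* (IN_X - K_DIM) / stride + 1 with no padding */
#define OUT_Y   15

static uint8_t input[IN_X * IN_Y * IN_CH];
static uint8_t kernel[K_DIM * K_DIM * IN_CH * CH_MULT];
static int32_t bias[IN_CH * CH_MULT];
static uint8_t output[OUT_X * OUT_Y * IN_CH * CH_MULT];

arm_status run_dw_conv_example(void)
{
    return arm_depthwise_conv_u8_basic_ver1(input, IN_X, IN_Y, IN_CH,
                                            kernel, K_DIM, K_DIM, CH_MULT,
                                            0, 0,          /* pad_x, pad_y            */
                                            1, 1,          /* stride_x, stride_y      */
                                            1, 1,          /* dilation_x/y (unused)   */
                                            bias,
                                            -128,          /* input_offset (example)  */
                                            0,             /* filter_offset (example) */
                                            128,           /* output_offset (example) */
                                            output, OUT_X, OUT_Y,
                                            0, 255,        /* activation min / max    */
                                            10,            /* out_shift (example)     */
                                            1412510339);   /* out_mult (example)      */
}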
diff --git a/NN/Include/arm_nnsupportfunctions.h b/NN/Include/arm_nnsupportfunctions.h
index 8460190..af426e1 100644
--- a/NN/Include/arm_nnsupportfunctions.h
+++ b/NN/Include/arm_nnsupportfunctions.h
@@ -32,13 +32,17 @@
#include "arm_math.h"
#include "arm_common_tables.h"
-//#include <cstring>
#ifdef __cplusplus
extern "C"
{
#endif
+#define LEFT_SHIFT(_shift) (_shift > 0 ? _shift : 0)
+#define RIGHT_SHIFT(_shift) (_shift > 0 ? 0 : -_shift)
+#define Q31_MIN (0x80000000L)
+#define Q31_MAX (0x7FFFFFFFL)
+
/**
* @brief Union for SIMD access of Q31/Q15/Q7 types
*/
@@ -72,11 +76,11 @@ typedef enum
*/
/**
- * @brief Converts the elements of the Q7 vector to Q15 vector without left-shift
- * @param[in] *pSrc points to the Q7 input vector
- * @param[out] *pDst points to the Q15 output vector
- * @param[in] blockSize length of the input vector
- * @return none.
+ * @brief Converts the elements of the Q7 vector to Q15 vector without left-shift
+ * @param[in] *pSrc points to the Q7 input vector
+ * @param[out] *pDst points to the Q15 output vector
+ * @param[in] blockSize length of the input vector
+ * @return none.
*
*/
@@ -84,10 +88,10 @@ void arm_q7_to_q15_no_shift(const q7_t * pSrc, q15_t * pDst, uint32_t block
/**
* @brief Converts the elements of the Q7 vector to reordered Q15 vector without left-shift
- * @param[in] *pSrc points to the Q7 input vector
- * @param[out] *pDst points to the Q15 output vector
- * @param[in] blockSize length of the input vector
- * @return none.
+ * @param[in] *pSrc points to the Q7 input vector
+ * @param[out] *pDst points to the Q15 output vector
+ * @param[in] blockSize length of the input vector
+ * @return none.
*
*/
@@ -163,7 +167,7 @@ void arm_nn_mult_q15(
q15_t * pDst,
const uint16_t out_shift,
uint32_t blockSize);
-
+
/**
* @brief Q7 vector multiplication with variable output shifts
* @param[in] *pSrcA pointer to the first input vector
@@ -185,16 +189,79 @@ void arm_nn_mult_q7(
q7_t * pDst,
const uint16_t out_shift,
uint32_t blockSize);
-
+
/**
- * @brief defition to adding rouding offset
+ * @brief macro for adding rounding offset
*/
#ifndef ARM_NN_TRUNCATE
- #define NN_ROUND(out_shift) ( 0x1 << (out_shift - 1) )
+ #define NN_ROUND(out_shift) ( (0x1u << out_shift) >> 1 )
#else
#define NN_ROUND(out_shift) 0
#endif
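/* The revised NN_ROUND above yields the same 2^(out_shift - 1) rounding offset
 * as the old form for positive shifts, but avoids the negative shift (undefined
 * behaviour) the old expression produced at out_shift == 0. A quick sanity
 * check, as a sketch assuming ARM_NN_TRUNCATE is not defined: */
#include <assert.h>

static void nn_round_examples(void)
{
    assert(NN_ROUND(0) == 0u);   /* no offset when nothing is shifted out              */
    assert(NN_ROUND(1) == 1u);   /* half of the divisor 2^1                            */
    assert(NN_ROUND(7) == 64u);  /* matches the old 0x1 << (out_shift - 1) for shift>0 */
}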
+/**
+ * @brief Saturating doubling high multiply. Result matches
+ * NEON instruction VQRDMULH.
+ * @param[in] m1 Multiplicand
+ * @param[in] m2 Multiplier
+ * @return Result of multiplication.
+ *
+ */
+__STATIC_FORCEINLINE q31_t arm_nn_sat_doubling_high_mult(const q31_t m1, const q31_t m2)
+{
+ q31_t result = 0;
+ // Rounding offset to add for a right shift of 31
+ q63_t mult = 1 << 30;
+
+ if ((m1 < 0) ^ (m2 < 0))
+ {
+ mult = 1 - mult;
+ }
+ // Gets resolved as a SMLAL instruction
+ mult = mult + (q63_t)m1 * m2;
+
+ // Utilize all of the upper 32 bits. This is the doubling step
+ // as well.
+ result = mult / (1UL << 31);
+
+ if ((m1 == m2) && (m1 == Q31_MIN))
+ {
+ result = Q31_MAX;
+ }
+ return result;
+}
+
+/**
+ * @brief Rounding divide by power of two.
+ * @param[in] dividend - Dividend
+ * @param[in] exponent - Divisor = power(2, exponent)
+ * Range: [0, 31]
+ * @return Rounded result of division. Midpoint is rounded away from zero.
+ *
+ */
+__STATIC_FORCEINLINE q31_t arm_nn_divide_by_power_of_two(const q31_t dividend, const q31_t exponent)
+{
+ q31_t result = 0;
+ const q31_t remainder_mask = (1l << exponent) - 1;
+ int32_t remainder = remainder_mask & dividend;
+
+ // Basic division
+ result = dividend >> exponent;
+
+ // Adjust 'result' for rounding (mid point away from zero)
+ q31_t threshold = remainder_mask >> 1;
+ if (result < 0)
+ {
+ threshold++;
+ }
+ if (remainder > threshold)
+ {
+ result++;
+ }
+
+ return result;
+}
+
#ifdef __cplusplus
}
#endif
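The two helpers above are the building blocks of the usual gemmlowp-style fixed-point requantization step (scale by a Q31 multiplier, then shift). A sketch of how they combine with the LEFT_SHIFT / RIGHT_SHIFT macros defined at the top of this header is shown below; the wrapper name requantize_ref is illustrative, not a CMSIS-NN API.

/* Illustrative only: rescale a q31 accumulator by a Q31 fixed-point multiplier
 * and a signed shift (positive = left, negative = right). Assumes this sits in
 * a translation unit that includes arm_nnsupportfunctions.h. */
__STATIC_FORCEINLINE q31_t requantize_ref(const q31_t val, const q31_t multiplier, const q31_t shift)
{
    return arm_nn_divide_by_power_of_two(
        arm_nn_sat_doubling_high_mult(val * (1 << LEFT_SHIFT(shift)), multiplier),
        RIGHT_SHIFT(shift));
}

An accumulator rescaled this way would then typically be adjusted by the output zero offset and clamped to the activation range, as the parameters of arm_depthwise_conv_u8_basic_ver1 above suggest.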