From 96d6da4e252b06dcfdc041e7df23e86161c33007 Mon Sep 17 00:00:00 2001 From: rihab kouki Date: Tue, 28 Jul 2020 11:24:49 +0100 Subject: Official ARM version: v5.6.0 --- NN/Include/arm_nn_tables.h | 3 - NN/Include/arm_nnfunctions.h | 205 ++++++++++++++++++++++++------------ NN/Include/arm_nnsupportfunctions.h | 95 ++++++++++++++--- 3 files changed, 216 insertions(+), 87 deletions(-) (limited to 'NN/Include') diff --git a/NN/Include/arm_nn_tables.h b/NN/Include/arm_nn_tables.h index d56d82c..36be5a8 100644 --- a/NN/Include/arm_nn_tables.h +++ b/NN/Include/arm_nn_tables.h @@ -53,7 +53,4 @@ extern const q15_t tanhTable_q15[256]; extern const q15_t sigmoidHTable_q15[192]; extern const q15_t sigmoidLTable_q15[128]; -extern const q15_t sigmoidLTable_q15[128]; -extern const q15_t sigmoidHTable_q15[192]; - #endif /* ARM_NN_TABLES_H */ diff --git a/NN/Include/arm_nnfunctions.h b/NN/Include/arm_nnfunctions.h index c6ec83a..331255b 100644 --- a/NN/Include/arm_nnfunctions.h +++ b/NN/Include/arm_nnfunctions.h @@ -34,7 +34,7 @@ * ------------ * * This user manual describes the CMSIS NN software library, - * a collection of efficient neural network kernels developed to maximize the + * a collection of efficient neural network kernels developed to maximize the * performance and minimize the memory footprint of neural networks on Cortex-M processor cores. * * The library is divided into a number of functions each covering a specific category: @@ -47,8 +47,8 @@ * * The library has separate functions for operating on different weight and activation data * types including 8-bit integers (q7_t) and 16-bit integers (q15_t). The descrition of the - * kernels are included in the function description. The implementation details are also - * described in this paper [1]. + * kernels are included in the function description. The implementation details are also + * described in this paper [1]. 
* * Block Diagram * -------- @@ -86,7 +86,7 @@ /** * @defgroup groupNN Neural Network Functions - * These functions perform basic operations for neural network layers. + * These functions perform basic operations for neural network layers. */ #ifndef _ARM_NNFUNCTIONS_H @@ -111,12 +111,12 @@ extern "C" * * The convolution is implemented in 2 steps: im2col and GEMM * - * im2col is a process of converting each patch of image data into + * im2col is a process of converting each patch of image data into * a column. After im2col, the convolution is computed as matrix-matrix * multiplication. - * + * * To reduce the memory footprint, the im2col is performed partially. - * Each iteration, only a few column (i.e., patches) are generated and + * Each iteration, only a few column (i.e., patches) are generated and * computed with GEMM kernels similar to CMSIS-DSP arm_mat_mult functions. * */ @@ -136,9 +136,9 @@ extern "C" * @param[in] out_shift amount of right-shift for output * @param[in,out] Im_out pointer to output tensor * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferA pointer to buffer space for input * @param[in,out] bufferB pointer to buffer space for output - * @return The function returns ARM_MATH_SUCCESS + * @return The function returns ARM_MATH_SUCCESS * */ @@ -153,9 +153,9 @@ extern "C" const q7_t * bias, const uint16_t bias_shift, const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA, + q7_t * Im_out, + const uint16_t dim_im_out, + q15_t * bufferA, q7_t * bufferB); /** @@ -180,7 +180,7 @@ extern "C" * @param[in] dim_im_out_y output tensor dimension y * @param[in,out] bufferA pointer to buffer space for input * @param[in,out] bufferB pointer to buffer space for output - * @return The function returns ARM_MATH_SUCCESS + * @return The function returns ARM_MATH_SUCCESS */ arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t * Im_in, @@ -219,9 
+219,9 @@ extern "C" * @param[in] out_shift amount of right-shift for output * @param[in,out] Im_out pointer to output tensor * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferA pointer to buffer space for input * @param[in,out] bufferB pointer to buffer space for output - * @return The function returns ARM_MATH_SUCCESS + * @return The function returns ARM_MATH_SUCCESS * */ @@ -236,9 +236,9 @@ extern "C" const q15_t * bias, const uint16_t bias_shift, const uint16_t out_shift, - q15_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA, + q15_t * Im_out, + const uint16_t dim_im_out, + q15_t * bufferA, q7_t * bufferB); /** @@ -256,7 +256,7 @@ extern "C" * @param[in] out_shift amount of right-shift for output * @param[in,out] Im_out pointer to output tensor * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferA pointer to buffer space for input * @param[in,out] bufferB pointer to buffer space for output * @return The function returns either * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. @@ -278,9 +278,9 @@ extern "C" const q7_t * bias, const uint16_t bias_shift, const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA, + q7_t * Im_out, + const uint16_t dim_im_out, + q15_t * bufferA, q7_t * bufferB); /** @@ -303,7 +303,7 @@ extern "C" * @param[in,out] Im_out pointer to output tensor * @param[in] dim_im_out_x output tensor dimension x * @param[in] dim_im_out_y output tensor dimension y - * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferA pointer to buffer space for input * @param[in,out] bufferB pointer to buffer space for output * @return The function returns either * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. 
@@ -355,7 +355,7 @@ extern "C" * @param[in,out] Im_out pointer to output tensor * @param[in] dim_im_out_x output tensor dimension x * @param[in] dim_im_out_y output tensor dimension y - * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferA pointer to buffer space for input * @param[in,out] bufferB pointer to buffer space for output * @return The function returns either * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. @@ -405,7 +405,7 @@ extern "C" * @param[in] out_shift amount of right-shift for output * @param[in,out] Im_out pointer to output tensor * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferA pointer to buffer space for input * @param[in,out] bufferB pointer to buffer space for output * @return The function returns either * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. @@ -426,9 +426,9 @@ extern "C" const q7_t * bias, const uint16_t bias_shift, const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA, + q7_t * Im_out, + const uint16_t dim_im_out, + q15_t * bufferA, q7_t * bufferB); /** @@ -446,7 +446,7 @@ extern "C" * @param[in] out_shift amount of right-shift for output * @param[in,out] Im_out pointer to output tensor * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferA pointer to buffer space for input * @param[in,out] bufferB pointer to buffer space for output * @return The function returns either * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. 
@@ -468,9 +468,9 @@ extern "C" const q15_t * bias, const uint16_t bias_shift, const uint16_t out_shift, - q15_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA, + q15_t * Im_out, + const uint16_t dim_im_out, + q15_t * bufferA, q7_t * bufferB); /** @@ -493,7 +493,7 @@ extern "C" * @param[in,out] Im_out pointer to output tensor * @param[in] dim_im_out_x output tensor dimension x * @param[in] dim_im_out_y output tensor dimension y - * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferA pointer to buffer space for input * @param[in,out] bufferB pointer to buffer space for output * @return The function returns either * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. @@ -508,7 +508,7 @@ extern "C" * * Input dimension constraints: * - * ch_im_in is multiple of 2 + * ch_im_in is multiple of 2 * * ch_im_out is multipe of 2 * @@ -532,10 +532,10 @@ extern "C" const uint16_t out_shift, q15_t * Im_out, const uint16_t dim_im_out_x, - const uint16_t dim_im_out_y, - q15_t * bufferA, + const uint16_t dim_im_out_y, + q15_t * bufferA, q7_t * bufferB); - + /** * @brief Q7 depthwise separable convolution function * @param[in] Im_in pointer to input tensor @@ -551,7 +551,7 @@ extern "C" * @param[in] out_shift amount of right-shift for output * @param[in,out] Im_out pointer to output tensor * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferA pointer to buffer space for input * @param[in,out] bufferB pointer to buffer space for output * @return The function returns either * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. 
@@ -574,8 +574,8 @@ extern "C" const uint16_t bias_shift, const uint16_t out_shift, q7_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA, + const uint16_t dim_im_out, + q15_t * bufferA, q7_t * bufferB); /** @@ -598,7 +598,7 @@ extern "C" * @param[in,out] Im_out pointer to output tensor * @param[in] dim_im_out_x output tensor dimension x * @param[in] dim_im_out_y output tensor dimension y - * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferA pointer to buffer space for input * @param[in,out] bufferB pointer to buffer space for output * @return The function returns either * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. @@ -642,7 +642,7 @@ extern "C" * * Here we have two types of kernel functions. The basic function * implements the function using regular GEMV approach. The opt functions - * operates with weights in interleaved formats. + * operates with weights in interleaved formats. * */ @@ -666,9 +666,9 @@ extern "C" const uint16_t dim_vec, const uint16_t num_of_rows, const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, - q7_t * pOut, + const uint16_t out_shift, + const q7_t * bias, + q7_t * pOut, q15_t * vec_buffer); /** @@ -691,9 +691,9 @@ extern "C" const uint16_t dim_vec, const uint16_t num_of_rows, const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, - q7_t * pOut, + const uint16_t out_shift, + const q7_t * bias, + q7_t * pOut, q15_t * vec_buffer); /** @@ -716,9 +716,9 @@ extern "C" const uint16_t dim_vec, const uint16_t num_of_rows, const uint16_t bias_shift, - const uint16_t out_shift, - const q15_t * bias, - q15_t * pOut, + const uint16_t out_shift, + const q15_t * bias, + q15_t * pOut, q15_t * vec_buffer); /** @@ -742,8 +742,8 @@ extern "C" const uint16_t num_of_rows, const uint16_t bias_shift, const uint16_t out_shift, - const q15_t * bias, - q15_t * pOut, + const q15_t * bias, + q15_t * pOut, q15_t * vec_buffer); /** @@ -767,8 
+767,8 @@ extern "C" const uint16_t num_of_rows, const uint16_t bias_shift, const uint16_t out_shift, - const q7_t * bias, - q15_t * pOut, + const q7_t * bias, + q15_t * pOut, q15_t * vec_buffer); /** @@ -792,16 +792,16 @@ extern "C" const uint16_t num_of_rows, const uint16_t bias_shift, const uint16_t out_shift, - const q7_t * bias, - q15_t * pOut, + const q7_t * bias, + q15_t * pOut, q15_t * vec_buffer); /** * @brief Matrix-Multiplication Kernels for Convolution * - * These functions are used within convolution layer functions for + * These functions are used within convolution layer functions for * matrix multiplication. - * + * * The implementation is similar to CMSIS-DSP arm_mat_mult functions * with one Q7 and one Q15 operands. The Q15 operand is the im2col * output which is always with 2 columns. @@ -826,8 +826,8 @@ extern "C" const uint16_t ch_im_out, const uint16_t numCol_A, const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, + const uint16_t out_shift, + const q7_t * bias, q7_t * pOut); /** @@ -848,8 +848,8 @@ extern "C" const uint16_t ch_im_out, const uint16_t numCol_A, const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, + const uint16_t out_shift, + const q7_t * bias, q7_t * pOut); #ifdef __cplusplus @@ -902,7 +902,7 @@ extern "C" * @return none. 
*/ - void arm_nn_activations_direct_q7(q7_t * data, uint16_t size, uint16_t int_width, + void arm_nn_activations_direct_q7(q7_t * data, uint16_t size, uint16_t int_width, arm_nn_activation_type type); /** @@ -944,9 +944,9 @@ extern "C" const uint16_t ch_im_in, const uint16_t dim_kernel, const uint16_t padding, - const uint16_t stride, - const uint16_t dim_im_out, - q7_t * bufferA, + const uint16_t stride, + const uint16_t dim_im_out, + q7_t * bufferA, q7_t * Im_out); /** @@ -969,9 +969,9 @@ extern "C" const uint16_t ch_im_in, const uint16_t dim_kernel, const uint16_t padding, - const uint16_t stride, - const uint16_t dim_im_out, - q7_t * bufferA, + const uint16_t stride, + const uint16_t dim_im_out, + q7_t * bufferA, q7_t * Im_out); /** @@ -1003,6 +1003,71 @@ extern "C" void arm_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out); + /** + * @brief uint8 depthwise convolution function with asymmetric quantization for even number of channel multiplier + * and input channels. Unless specified otherwise, arguments are mandatory. + * + * @param[in] input Pointer to input tensor + * @param[in] input_x Width of input tensor + * @param[in] input_y Height of input tensor + * @param[in] input_ch Channels in input tensor + * @param[in] kernel Pointer to kernel weights + * @param[in] kernel_x Width of kernel + * @param[in] kernel_y Height of kernel + * @param[in] ch_mult Number of channel multiplier + * @param[in] pad_x Padding sizes x + * @param[in] pad_y Padding sizes y + * @param[in] stride_x Convolution stride along the width + * @param[in] stride_y Convolution stride along the height + * @param[in] dilation_x Dilation along width. Not used and intended for future enhancement. + * @param[in] dilation_y Dilation along height. Not used and intended for future enhancement. + * @param[in] bias Pointer to optional bias values. 
If no bias is + * available, NULL is expected + * @param[in] input_offset Input tensor zero offset + * @param[in] filter_offset Kernel tensor zero offset + * @param[in] output_offset Output tensor zero offset + * @param[in,out] output Pointer to output tensor + * @param[in] output_x Width of output tensor + * @param[in] output_y Height of output tensor + * @param[in] output_activation_min Minimum value to clamp the output to. Range : {0, 255} + * @param[in] output_activation_max Maximum value to clamp the output to. Range : {0, 255} + * @param[in] out_shift Amount of right-shift for output + * @param[in] out_mult Output multiplier for requantization + * @return The function returns one of the following + * ARM_MATH_SIZE_MISMATCH - Not supported dimension of tensors + * ARM_MATH_SUCCESS - Successful operation + * ARM_MATH_ARGUMENT_ERROR - Implementation not available + * + * Input constraints + * ch_mult is multiple of 2 + * kernel_x is multiple of 2 + * + */ + arm_status arm_depthwise_conv_u8_basic_ver1(const uint8_t *input, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_ch, + const uint8_t *kernel, + const uint16_t kernel_x, + const uint16_t kernel_y, + const int16_t ch_mult, + const int16_t pad_x, + const int16_t pad_y, + const int16_t stride_x, + const int16_t stride_y, + const int16_t dilation_x, + const int16_t dilation_y, + const int32_t *bias, + const int32_t input_offset, + const int32_t filter_offset, + const int32_t output_offset, + uint8_t *output, + const uint16_t output_x, + const uint16_t output_y, + const int32_t output_activation_min, + const int32_t output_activation_max, + const int32_t out_shift, + const int32_t out_mult); #ifdef __cplusplus } #endif diff --git a/NN/Include/arm_nnsupportfunctions.h b/NN/Include/arm_nnsupportfunctions.h index 8460190..af426e1 100644 --- a/NN/Include/arm_nnsupportfunctions.h +++ b/NN/Include/arm_nnsupportfunctions.h @@ -32,13 +32,17 @@ #include "arm_math.h" #include "arm_common_tables.h" 
-//#include #ifdef __cplusplus extern "C" { #endif +#define LEFT_SHIFT(_shift) (_shift > 0 ? _shift : 0) +#define RIGHT_SHIFT(_shift) (_shift > 0 ? 0 : -_shift) +#define Q31_MIN (0x80000000L) +#define Q31_MAX (0x7FFFFFFFL) + /** * @brief Union for SIMD access of Q31/Q15/Q7 types */ @@ -72,11 +76,11 @@ typedef enum */ /** - * @brief Converts the elements of the Q7 vector to Q15 vector without left-shift - * @param[in] *pSrc points to the Q7 input vector - * @param[out] *pDst points to the Q15 output vector - * @param[in] blockSize length of the input vector - * @return none. + * @brief Converts the elements of the Q7 vector to Q15 vector without left-shift + * @param[in] *pSrc points to the Q7 input vector + * @param[out] *pDst points to the Q15 output vector + * @param[in] blockSize length of the input vector + * @return none. * */ @@ -84,10 +88,10 @@ void arm_q7_to_q15_no_shift(const q7_t * pSrc, q15_t * pDst, uint32_t block /** * @brief Converts the elements of the Q7 vector to reordered Q15 vector without left-shift - * @param[in] *pSrc points to the Q7 input vector - * @param[out] *pDst points to the Q15 output vector - * @param[in] blockSize length of the input vector - * @return none. + * @param[in] *pSrc points to the Q7 input vector + * @param[out] *pDst points to the Q15 output vector + * @param[in] blockSize length of the input vector + * @return none. 
* */ @@ -163,7 +167,7 @@ void arm_nn_mult_q15( q15_t * pDst, const uint16_t out_shift, uint32_t blockSize); - + /** * @brief Q7 vector multiplication with variable output shifts * @param[in] *pSrcA pointer to the first input vector @@ -185,16 +189,79 @@ void arm_nn_mult_q7( q7_t * pDst, const uint16_t out_shift, uint32_t blockSize); - + /** - * @brief defition to adding rouding offset + * @brief macro for adding rounding offset */ #ifndef ARM_NN_TRUNCATE - #define NN_ROUND(out_shift) ( 0x1 << (out_shift - 1) ) + #define NN_ROUND(out_shift) ( (0x1u << out_shift) >> 1 ) #else #define NN_ROUND(out_shift) 0 #endif +/** + * @brief Saturating doubling high multiply. Result matches + * NEON instruction VQRDMULH. + * @param[in] m1 Multiplicand + * @param[in] m2 Multiplier + * @return Result of multiplication. + * + */ +__STATIC_FORCEINLINE q31_t arm_nn_sat_doubling_high_mult(const q31_t m1, const q31_t m2) +{ + q31_t result = 0; + // Rounding offset to add for a right shift of 31 + q63_t mult = 1 << 30; + + if ((m1 < 0) ^ (m2 < 0)) + { + mult = 1 - mult; + } + // Gets resolved as a SMLAL instruction + mult = mult + (q63_t)m1 * m2; + + // Utilize all of the upper 32 bits. This is the doubling step + // as well. + result = mult / (1UL << 31); + + if ((m1 == m2) && (m1 == Q31_MIN)) + { + result = Q31_MAX; + } + return result; +} + +/** + * @brief Rounding divide by power of two. + * @param[in] dividend - Dividend + * @param[in] exponent - Divisor = power(2, exponent) + * Range: [0, 31] + * @return Rounded result of division. Midpoint is rounded away from zero. 
+ * + */ +__STATIC_FORCEINLINE q31_t arm_nn_divide_by_power_of_two(const q31_t dividend, const q31_t exponent) +{ + q31_t result = 0; + const q31_t remainder_mask = (1l << exponent) - 1; + int32_t remainder = remainder_mask & dividend; + + // Basic division + result = dividend >> exponent; + + // Adjust 'result' for rounding (mid point away from zero) + q31_t threshold = remainder_mask >> 1; + if (result < 0) + { + threshold++; + } + if (remainder > threshold) + { + result++; + } + + return result; +} + #ifdef __cplusplus } #endif -- cgit