Functions
arm_status	arm_convolve_1x1_HWC_q7_fast_nonsquare (const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB)
	Fast Q7 version of 1x1 convolution (non-square shape) More...

arm_status	arm_convolve_HWC_q15_basic (const q15_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q15_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q15_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB)
	Basic Q15 convolution function. More...

arm_status	arm_convolve_HWC_q15_fast (const q15_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q15_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q15_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB)
	Fast Q15 convolution function. More...

arm_status	arm_convolve_HWC_q15_fast_nonsquare (const q15_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q15_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q15_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB)
	Fast Q15 convolution function (non-square shape) More...

arm_status	arm_convolve_HWC_q7_basic (const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB)
	Basic Q7 convolution function. More...

arm_status	arm_convolve_HWC_q7_basic_nonsquare (const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB)
	Basic Q7 convolution function (non-square shape) More...

arm_status	arm_convolve_HWC_q7_fast (const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB)
	Fast Q7 convolution function. More...

arm_status	arm_convolve_HWC_q7_fast_nonsquare (const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB)
	Fast Q7 convolution function (non-square shape) More...

arm_status	arm_convolve_HWC_q7_RGB (const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB)
	Q7 convolution function for RGB image. More...

arm_status	arm_depthwise_conv_u8_basic_ver1 (const uint8_t *input, const uint16_t input_x, const uint16_t input_y, const uint16_t input_ch, const uint8_t *kernel, const uint16_t kernel_x, const uint16_t kernel_y, const int16_t ch_mult, const int16_t pad_x, const int16_t pad_y, const int16_t stride_x, const int16_t stride_y, const int16_t dilation_x, const int16_t dilation_y, const int32_t *bias, const int32_t input_offset, const int32_t filter_offset, const int32_t output_offset, uint8_t *output, const uint16_t output_x, const uint16_t output_y, const int32_t output_activation_min, const int32_t output_activation_max, const int32_t out_shift, const int32_t out_mult)
	uint8 depthwise convolution function with asymmetric quantization for even number of channel multiplier and input channels. Unless specified otherwise, arguments are mandatory. Both square and non-square inputs are accepted. More...

arm_status	arm_depthwise_separable_conv_HWC_q7 (const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB)
	Q7 depthwise separable convolution function. More...

arm_status	arm_depthwise_separable_conv_HWC_q7_nonsquare (const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB)
	Q7 depthwise separable convolution function (non-square shape) More...

Description

Perform convolution layer

The convolution is implemented in 2 steps: im2col and GEMM

im2col is a process of converting each patch of image data into a column. After im2col, the convolution is computed as matrix-matrix multiplication.

To reduce the memory footprint, the im2col is performed partially. In each iteration, only a few columns (i.e., patches) are generated and computed with GEMM kernels similar to the CMSIS-DSP arm_mat_mult functions.

Function Documentation

arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare	(	const q7_t *	Im_in,
		const uint16_t	dim_im_in_x,
		const uint16_t	dim_im_in_y,
		const uint16_t	ch_im_in,
		const q7_t *	wt,
		const uint16_t	ch_im_out,
		const uint16_t	dim_kernel_x,
		const uint16_t	dim_kernel_y,
		const uint16_t	padding_x,
		const uint16_t	padding_y,
		const uint16_t	stride_x,
		const uint16_t	stride_y,
		const q7_t *	bias,
		const uint16_t	bias_shift,
		const uint16_t	out_shift,
		q7_t *	Im_out,
		const uint16_t	dim_im_out_x,
		const uint16_t	dim_im_out_y,
		q15_t *	bufferA,
		q7_t *	bufferB
	)

Parameters

[in]	Im_in	pointer to input tensor
[in]	dim_im_in_x	input tensor dimension x
[in]	dim_im_in_y	input tensor dimension y
[in]	ch_im_in	number of input tensor channels
[in]	wt	pointer to kernel weights
[in]	ch_im_out	number of filters, i.e., output tensor channels
[in]	dim_kernel_x	filter kernel size x
[in]	dim_kernel_y	filter kernel size y
[in]	padding_x	padding size x
[in]	padding_y	padding size y
[in]	stride_x	convolution stride x
[in]	stride_y	convolution stride y
[in]	bias	pointer to bias
[in]	bias_shift	amount of left-shift for bias
[in]	out_shift	amount of right-shift for output
[in,out]	Im_out	pointer to output tensor
[in]	dim_im_out_x	output tensor dimension x
[in]	dim_im_out_y	output tensor dimension y
[in,out]	bufferA	pointer to buffer space for input
[in,out]	bufferB	pointer to buffer space for output

Returns: The function returns either ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.

This function is optimized for convolution with 1x1 kernel size (i.e., dim_kernel_x=1 and dim_kernel_y=1). It can be used for the second half of MobileNets [1] after depthwise separable convolution.

This function is the version with full list of optimization tricks, but with some constraints: ch_im_in is multiple of 4; ch_im_out is multiple of 2.

[1] MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications https://arxiv.org/abs/1704.04861

References arm_nn_mat_mult_kernel_q7_q15_reordered(), arm_q7_to_q15_reordered_no_shift(), and NN_ROUND.

arm_status arm_convolve_HWC_q15_basic	(	const q15_t *	Im_in,
		const uint16_t	dim_im_in,
		const uint16_t	ch_im_in,
		const q15_t *	wt,
		const uint16_t	ch_im_out,
		const uint16_t	dim_kernel,
		const uint16_t	padding,
		const uint16_t	stride,
		const q15_t *	bias,
		const uint16_t	bias_shift,
		const uint16_t	out_shift,
		q15_t *	Im_out,
		const uint16_t	dim_im_out,
		q15_t *	bufferA,
		q7_t *	bufferB
	)

Parameters

[in]	Im_in	pointer to input tensor
[in]	dim_im_in	input tensor dimension
[in]	ch_im_in	number of input tensor channels
[in]	wt	pointer to kernel weights
[in]	ch_im_out	number of filters, i.e., output tensor channels
[in]	dim_kernel	filter kernel size
[in]	padding	padding sizes
[in]	stride	convolution stride
[in]	bias	pointer to bias
[in]	bias_shift	amount of left-shift for bias
[in]	out_shift	amount of right-shift for output
[in,out]	Im_out	pointer to output tensor
[in]	dim_im_out	output tensor dimension
[in,out]	bufferA	pointer to buffer space for input
[in,out]	bufferB	pointer to buffer space for output

Returns: The function returns ARM_MATH_SUCCESS

Buffer size:

bufferA size: ch_im_in*dim_kernel*dim_kernel

bufferB size: 0

This basic version is designed to work for any input tensor and weight dimension.

References NN_ROUND.

arm_status arm_convolve_HWC_q15_fast	(	const q15_t *	Im_in,
		const uint16_t	dim_im_in,
		const uint16_t	ch_im_in,
		const q15_t *	wt,
		const uint16_t	ch_im_out,
		const uint16_t	dim_kernel,
		const uint16_t	padding,
		const uint16_t	stride,
		const q15_t *	bias,
		const uint16_t	bias_shift,
		const uint16_t	out_shift,
		q15_t *	Im_out,
		const uint16_t	dim_im_out,
		q15_t *	bufferA,
		q7_t *	bufferB
	)

Parameters

[in]	Im_in	pointer to input tensor
[in]	dim_im_in	input tensor dimension
[in]	ch_im_in	number of input tensor channels
[in]	wt	pointer to kernel weights
[in]	ch_im_out	number of filters, i.e., output tensor channels
[in]	dim_kernel	filter kernel size
[in]	padding	padding sizes
[in]	stride	convolution stride
[in]	bias	pointer to bias
[in]	bias_shift	amount of left-shift for bias
[in]	out_shift	amount of right-shift for output
[in,out]	Im_out	pointer to output tensor
[in]	dim_im_out	output tensor dimension
[in,out]	bufferA	pointer to buffer space for input
[in,out]	bufferB	pointer to buffer space for output

Returns: The function returns either ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.

Buffer size:

bufferA size: 2*ch_im_in*dim_kernel*dim_kernel

bufferB size: 0

Input dimension constraints:

ch_im_in is multiple of 2

ch_im_out is multiple of 2

References NN_ROUND.

arm_status arm_convolve_HWC_q15_fast_nonsquare	(	const q15_t *	Im_in,
		const uint16_t	dim_im_in_x,
		const uint16_t	dim_im_in_y,
		const uint16_t	ch_im_in,
		const q15_t *	wt,
		const uint16_t	ch_im_out,
		const uint16_t	dim_kernel_x,
		const uint16_t	dim_kernel_y,
		const uint16_t	padding_x,
		const uint16_t	padding_y,
		const uint16_t	stride_x,
		const uint16_t	stride_y,
		const q15_t *	bias,
		const uint16_t	bias_shift,
		const uint16_t	out_shift,
		q15_t *	Im_out,
		const uint16_t	dim_im_out_x,
		const uint16_t	dim_im_out_y,
		q15_t *	bufferA,
		q7_t *	bufferB
	)

Parameters

[in]	Im_in	pointer to input tensor
[in]	dim_im_in_x	input tensor dimension x
[in]	dim_im_in_y	input tensor dimension y
[in]	ch_im_in	number of input tensor channels
[in]	wt	pointer to kernel weights
[in]	ch_im_out	number of filters, i.e., output tensor channels
[in]	dim_kernel_x	filter kernel size x
[in]	dim_kernel_y	filter kernel size y
[in]	padding_x	padding size x
[in]	padding_y	padding size y
[in]	stride_x	convolution stride x
[in]	stride_y	convolution stride y
[in]	bias	pointer to bias
[in]	bias_shift	amount of left-shift for bias
[in]	out_shift	amount of right-shift for output
[in,out]	Im_out	pointer to output tensor
[in]	dim_im_out_x	output tensor dimension x
[in]	dim_im_out_y	output tensor dimension y
[in,out]	bufferA	pointer to buffer space for input
[in,out]	bufferB	pointer to buffer space for output

Returns: The function returns either ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.

Buffer size:

bufferA size: 2*ch_im_in*dim_kernel_x*dim_kernel_y

bufferB size: 0

Input dimension constraints:

ch_im_in is multiple of 2

ch_im_out is multiple of 2

References NN_ROUND.

arm_status arm_convolve_HWC_q7_basic	(	const q7_t *	Im_in,
		const uint16_t	dim_im_in,
		const uint16_t	ch_im_in,
		const q7_t *	wt,
		const uint16_t	ch_im_out,
		const uint16_t	dim_kernel,
		const uint16_t	padding,
		const uint16_t	stride,
		const q7_t *	bias,
		const uint16_t	bias_shift,
		const uint16_t	out_shift,
		q7_t *	Im_out,
		const uint16_t	dim_im_out,
		q15_t *	bufferA,
		q7_t *	bufferB
	)

Parameters

[in]	Im_in	pointer to input tensor
[in]	dim_im_in	input tensor dimension
[in]	ch_im_in	number of input tensor channels
[in]	wt	pointer to kernel weights
[in]	ch_im_out	number of filters, i.e., output tensor channels
[in]	dim_kernel	filter kernel size
[in]	padding	padding sizes
[in]	stride	convolution stride
[in]	bias	pointer to bias
[in]	bias_shift	amount of left-shift for bias
[in]	out_shift	amount of right-shift for output
[in,out]	Im_out	pointer to output tensor
[in]	dim_im_out	output tensor dimension
[in,out]	bufferA	pointer to buffer space for input
[in,out]	bufferB	pointer to buffer space for output

Returns: The function returns ARM_MATH_SUCCESS

Buffer size:

bufferA size: 2*ch_im_in*dim_kernel*dim_kernel

bufferB size: 0

This basic version is designed to work for any input tensor and weight dimension.

References arm_nn_mat_mult_kernel_q7_q15(), arm_q7_to_q15_no_shift(), and NN_ROUND.

arm_status arm_convolve_HWC_q7_basic_nonsquare	(	const q7_t *	Im_in,
		const uint16_t	dim_im_in_x,
		const uint16_t	dim_im_in_y,
		const uint16_t	ch_im_in,
		const q7_t *	wt,
		const uint16_t	ch_im_out,
		const uint16_t	dim_kernel_x,
		const uint16_t	dim_kernel_y,
		const uint16_t	padding_x,
		const uint16_t	padding_y,
		const uint16_t	stride_x,
		const uint16_t	stride_y,
		const q7_t *	bias,
		const uint16_t	bias_shift,
		const uint16_t	out_shift,
		q7_t *	Im_out,
		const uint16_t	dim_im_out_x,
		const uint16_t	dim_im_out_y,
		q15_t *	bufferA,
		q7_t *	bufferB
	)

Parameters

[in]	Im_in	pointer to input tensor
[in]	dim_im_in_x	input tensor dimension x
[in]	dim_im_in_y	input tensor dimension y
[in]	ch_im_in	number of input tensor channels
[in]	wt	pointer to kernel weights
[in]	ch_im_out	number of filters, i.e., output tensor channels
[in]	dim_kernel_x	filter kernel size x
[in]	dim_kernel_y	filter kernel size y
[in]	padding_x	padding size x
[in]	padding_y	padding size y
[in]	stride_x	convolution stride x
[in]	stride_y	convolution stride y
[in]	bias	pointer to bias
[in]	bias_shift	amount of left-shift for bias
[in]	out_shift	amount of right-shift for output
[in,out]	Im_out	pointer to output tensor
[in]	dim_im_out_x	output tensor dimension x
[in]	dim_im_out_y	output tensor dimension y
[in,out]	bufferA	pointer to buffer space for input
[in,out]	bufferB	pointer to buffer space for output

Returns: The function returns ARM_MATH_SUCCESS

References arm_nn_mat_mult_kernel_q7_q15(), arm_q7_to_q15_no_shift(), and NN_ROUND.

arm_status arm_convolve_HWC_q7_fast	(	const q7_t *	Im_in,
		const uint16_t	dim_im_in,
		const uint16_t	ch_im_in,
		const q7_t *	wt,
		const uint16_t	ch_im_out,
		const uint16_t	dim_kernel,
		const uint16_t	padding,
		const uint16_t	stride,
		const q7_t *	bias,
		const uint16_t	bias_shift,
		const uint16_t	out_shift,
		q7_t *	Im_out,
		const uint16_t	dim_im_out,
		q15_t *	bufferA,
		q7_t *	bufferB
	)

Parameters

[in]	Im_in	pointer to input tensor
[in]	dim_im_in	input tensor dimension
[in]	ch_im_in	number of input tensor channels
[in]	wt	pointer to kernel weights
[in]	ch_im_out	number of filters, i.e., output tensor channels
[in]	dim_kernel	filter kernel size
[in]	padding	padding sizes
[in]	stride	convolution stride
[in]	bias	pointer to bias
[in]	bias_shift	amount of left-shift for bias
[in]	out_shift	amount of right-shift for output
[in,out]	Im_out	pointer to output tensor
[in]	dim_im_out	output tensor dimension
[in,out]	bufferA	pointer to buffer space for input
[in,out]	bufferB	pointer to buffer space for output

Returns: The function returns either ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.

Buffer size:

bufferA size: 2*ch_im_in*dim_kernel*dim_kernel

bufferB size: 0

Input dimension constraints:

ch_im_in is multiple of 4 ( because of the SIMD32 read and swap )

ch_im_out is multiple of 2 ( because of the 2x2 mat_mult kernel )

The im2col converts the Q7 tensor input into Q15 columns, which are stored in bufferA. Reordering happens during this im2col process with arm_q7_to_q15_reordered_no_shift: for every four elements, the second and third elements are swapped.

The computation kernel arm_nn_mat_mult_kernel_q7_q15_reordered does the GEMM computation with the reordered columns.

To speed-up the determination of the padding condition, we split the computation into 3x3 parts, i.e., {top, mid, bottom} X {left, mid, right}. This reduces the total number of boundary condition checks and improves the data copying performance.

References arm_nn_mat_mult_kernel_q7_q15_reordered(), arm_q7_to_q15_reordered_no_shift(), and NN_ROUND.

Referenced by main().

arm_status arm_convolve_HWC_q7_fast_nonsquare	(	const q7_t *	Im_in,
		const uint16_t	dim_im_in_x,
		const uint16_t	dim_im_in_y,
		const uint16_t	ch_im_in,
		const q7_t *	wt,
		const uint16_t	ch_im_out,
		const uint16_t	dim_kernel_x,
		const uint16_t	dim_kernel_y,
		const uint16_t	padding_x,
		const uint16_t	padding_y,
		const uint16_t	stride_x,
		const uint16_t	stride_y,
		const q7_t *	bias,
		const uint16_t	bias_shift,
		const uint16_t	out_shift,
		q7_t *	Im_out,
		const uint16_t	dim_im_out_x,
		const uint16_t	dim_im_out_y,
		q15_t *	bufferA,
		q7_t *	bufferB
	)

Parameters

[in]	Im_in	pointer to input tensor
[in]	dim_im_in_x	input tensor dimension x
[in]	dim_im_in_y	input tensor dimension y
[in]	ch_im_in	number of input tensor channels
[in]	wt	pointer to kernel weights
[in]	ch_im_out	number of filters, i.e., output tensor channels
[in]	dim_kernel_x	filter kernel size x
[in]	dim_kernel_y	filter kernel size y
[in]	padding_x	padding size x
[in]	padding_y	padding size y
[in]	stride_x	convolution stride x
[in]	stride_y	convolution stride y
[in]	bias	pointer to bias
[in]	bias_shift	amount of left-shift for bias
[in]	out_shift	amount of right-shift for output
[in,out]	Im_out	pointer to output tensor
[in]	dim_im_out_x	output tensor dimension x
[in]	dim_im_out_y	output tensor dimension y
[in,out]	bufferA	pointer to buffer space for input
[in,out]	bufferB	pointer to buffer space for output

Returns: The function returns either ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.

This function is the version with full list of optimization tricks, but with some constraints: ch_im_in is multiple of 4; ch_im_out is multiple of 2.

References arm_nn_mat_mult_kernel_q7_q15_reordered(), arm_q7_to_q15_reordered_no_shift(), and NN_ROUND.

arm_status arm_convolve_HWC_q7_RGB	(	const q7_t *	Im_in,
		const uint16_t	dim_im_in,
		const uint16_t	ch_im_in,
		const q7_t *	wt,
		const uint16_t	ch_im_out,
		const uint16_t	dim_kernel,
		const uint16_t	padding,
		const uint16_t	stride,
		const q7_t *	bias,
		const uint16_t	bias_shift,
		const uint16_t	out_shift,
		q7_t *	Im_out,
		const uint16_t	dim_im_out,
		q15_t *	bufferA,
		q7_t *	bufferB
	)

Q7 version of convolution for RGB image.

Parameters

[in]	Im_in	pointer to input tensor
[in]	dim_im_in	input tensor dimension
[in]	ch_im_in	number of input tensor channels
[in]	wt	pointer to kernel weights
[in]	ch_im_out	number of filters, i.e., output tensor channels
[in]	dim_kernel	filter kernel size
[in]	padding	padding sizes
[in]	stride	convolution stride
[in]	bias	pointer to bias
[in]	bias_shift	amount of left-shift for bias
[in]	out_shift	amount of right-shift for output
[in,out]	Im_out	pointer to output tensor
[in]	dim_im_out	output tensor dimension
[in,out]	bufferA	pointer to buffer space for input
[in,out]	bufferB	pointer to buffer space for output

Returns: The function returns either ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.

Buffer size:

bufferA size: 2*ch_im_in*dim_kernel*dim_kernel

bufferB size: 0

Input dimension constraints:

ch_im_in equals 3

This kernel is written exclusively for convolution with ch_im_in equals 3. This applies on the first layer of CNNs which has input image with RGB format.

References arm_nn_mat_mult_kernel_q7_q15(), arm_nnword::half_words, NN_ROUND, and arm_nnword::word.

Referenced by main().

arm_status arm_depthwise_conv_u8_basic_ver1	(	const uint8_t *	input,
		const uint16_t	input_x,
		const uint16_t	input_y,
		const uint16_t	input_ch,
		const uint8_t *	kernel,
		const uint16_t	kernel_x,
		const uint16_t	kernel_y,
		const int16_t	ch_mult,
		const int16_t	pad_x,
		const int16_t	pad_y,
		const int16_t	stride_x,
		const int16_t	stride_y,
		const int16_t	dilation_x,
		const int16_t	dilation_y,
		const int32_t *	bias,
		const int32_t	input_offset,
		const int32_t	filter_offset,
		const int32_t	output_offset,
		uint8_t *	output,
		const uint16_t	output_x,
		const uint16_t	output_y,
		const int32_t	output_activation_min,
		const int32_t	output_activation_max,
		const int32_t	out_shift,
		const int32_t	out_mult
	)

uint8 depthwise convolution function with asymmetric quantization for even number of channel multiplier and input channels. Unless specified otherwise, arguments are mandatory.

Parameters

[in]	input	Pointer to input tensor
[in]	input_x	Width of input tensor
[in]	input_y	Height of input tensor
[in]	input_ch	Channels in input tensor
[in]	kernel	Pointer to kernel weights
[in]	kernel_x	Width of kernel
[in]	kernel_y	Height of kernel
[in]	ch_mult	Number of channel multiplier
[in]	pad_x	Padding sizes x
[in]	pad_y	Padding sizes y
[in]	stride_x	Convolution stride along the width
[in]	stride_y	Convolution stride along the height
[in]	dilation_x	Dilation along width. Not used and intended for future enhancement.
[in]	dilation_y	Dilation along height. Not used and intended for future enhancement.
[in]	bias	Pointer to optional bias values. If no bias is available, NULL is expected
[in]	input_offset	Input tensor zero offset
[in]	filter_offset	Kernel tensor zero offset
[in]	output_offset	Output tensor zero offset
[in,out]	output	Pointer to output tensor
[in]	output_x	Width of output tensor
[in]	output_y	Height of output tensor
[in]	output_activation_min	Minimum value to clamp the output to. Range : {0, 255}
[in]	output_activation_max	Maximum value to clamp the output to. Range : {0, 255}
[in]	out_shift	Amount of right-shift for output
[in]	out_mult	Output multiplier for requantization

Returns: The function returns one of the following: ARM_MATH_SIZE_MISMATCH - Not supported dimension of tensors; ARM_MATH_SUCCESS - Successful operation; ARM_MATH_ARGUMENT_ERROR - Implementation not available.

Input constraints: ch_mult is multiple of 2; kernel_x is multiple of 2.

References arm_nn_divide_by_power_of_two(), arm_nn_sat_doubling_high_mult(), DILATION_X, DILATION_Y, LEFT_SHIFT, and RIGHT_SHIFT.

arm_status arm_depthwise_separable_conv_HWC_q7	(	const q7_t *	Im_in,
		const uint16_t	dim_im_in,
		const uint16_t	ch_im_in,
		const q7_t *	wt,
		const uint16_t	ch_im_out,
		const uint16_t	dim_kernel,
		const uint16_t	padding,
		const uint16_t	stride,
		const q7_t *	bias,
		const uint16_t	bias_shift,
		const uint16_t	out_shift,
		q7_t *	Im_out,
		const uint16_t	dim_im_out,
		q15_t *	bufferA,
		q7_t *	bufferB
	)

Parameters

[in]	Im_in	pointer to input tensor
[in]	dim_im_in	input tensor dimension
[in]	ch_im_in	number of input tensor channels
[in]	wt	pointer to kernel weights
[in]	ch_im_out	number of filters, i.e., output tensor channels
[in]	dim_kernel	filter kernel size
[in]	padding	padding sizes
[in]	stride	convolution stride
[in]	bias	pointer to bias
[in]	bias_shift	amount of left-shift for bias
[in]	out_shift	amount of right-shift for output
[in,out]	Im_out	pointer to output tensor
[in]	dim_im_out	output tensor dimension
[in,out]	bufferA	pointer to buffer space for input
[in,out]	bufferB	pointer to buffer space for output

Returns: The function returns either ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.

Buffer size:

bufferA size: 2*ch_im_in*dim_kernel*dim_kernel

bufferB size: 0

Input dimension constraints:

ch_im_in equals ch_im_out

Implementation: There are 3 nested loops here. Inner loop: calculate each output value with MAC instruction over an accumulator. Mid loop: loop over different output channels. Outer loop: loop over different output (x, y).

References arm_nnword::bytes, NN_ROUND, and arm_nnword::word.

arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare	(	const q7_t *	Im_in,
		const uint16_t	dim_im_in_x,
		const uint16_t	dim_im_in_y,
		const uint16_t	ch_im_in,
		const q7_t *	wt,
		const uint16_t	ch_im_out,
		const uint16_t	dim_kernel_x,
		const uint16_t	dim_kernel_y,
		const uint16_t	padding_x,
		const uint16_t	padding_y,
		const uint16_t	stride_x,
		const uint16_t	stride_y,
		const q7_t *	bias,
		const uint16_t	bias_shift,
		const uint16_t	out_shift,
		q7_t *	Im_out,
		const uint16_t	dim_im_out_x,
		const uint16_t	dim_im_out_y,
		q15_t *	bufferA,
		q7_t *	bufferB
	)

Parameters

[in]	Im_in	pointer to input tensor
[in]	dim_im_in_x	input tensor dimension x
[in]	dim_im_in_y	input tensor dimension y
[in]	ch_im_in	number of input tensor channels
[in]	wt	pointer to kernel weights
[in]	ch_im_out	number of filters, i.e., output tensor channels
[in]	dim_kernel_x	filter kernel size x
[in]	dim_kernel_y	filter kernel size y
[in]	padding_x	padding sizes x
[in]	padding_y	padding sizes y
[in]	stride_x	convolution stride x
[in]	stride_y	convolution stride y
[in]	bias	pointer to bias
[in]	bias_shift	amount of left-shift for bias
[in]	out_shift	amount of right-shift for output
[in,out]	Im_out	pointer to output tensor
[in]	dim_im_out_x	output tensor dimension x
[in]	dim_im_out_y	output tensor dimension y
[in,out]	bufferA	pointer to buffer space for input
[in,out]	bufferB	pointer to buffer space for output

Returns: The function returns either ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.

This function is the version with full list of optimization tricks, but with some constraints: ch_im_in is multiple of 2; ch_im_out is multiple of 2.

References arm_nnword::bytes, NN_ROUND, and arm_nnword::word.

Functions

Description

Function Documentation