4-bit support for convolution (#85)
- Adds arm_convolve_wrapper function for 4-bit weights
- Adds arm_convolve_1x1 normal and fast variants for 4-bit weights
- Adds arm_nn_mat_mult_kernel_s4_s16 for multiplying 4-bit weights with 16-bit input
- Adds mat_mult_nt_t_s4 function for DSP and scalar
- Adds scalar and DSP implementations of arm_convolve with 4-bit weights
- Adds unit tests for 4-bit weight convolutions

Change-Id: Idea55432fdab2db05a033889d7c39dd0ea69f8ad

Signed-off-by: Ryan O'Shea <[email protected]>
ArmRyan authored Nov 14, 2023
1 parent edececa commit bfc54ed
Showing 231 changed files with 10,857 additions and 54 deletions.
7 changes: 7 additions & 0 deletions ARM.CMSIS-NN.pdsc
@@ -35,9 +35,12 @@
<file category="header" name="Include/arm_nn_math_types.h"/>

<file category="source" name="Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s4_s16.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_1x1_s4_fast.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_1x1_s4.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_1x1_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_s16.c"/>
@@ -50,14 +53,17 @@
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_s4_opt.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_s4.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_fast_s16.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_s4.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_s16.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_3x3_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_wrapper_s4.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_wrapper_s16.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_get_buffer_sizes_s16.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_get_buffer_sizes_s4.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_get_buffer_sizes_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_transpose_conv_s8.c"/>
@@ -90,6 +96,7 @@
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_svdf_s8.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_s8_to_s16_unordered_with_offset.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s4.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8_s32.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s16.c"/>
231 changes: 229 additions & 2 deletions Include/arm_nnfunctions.h

Large diffs are not rendered by default.

102 changes: 100 additions & 2 deletions Include/arm_nnsupportfunctions.h
@@ -21,8 +21,8 @@
* Title: arm_nnsupportfunctions.h
* Description: Public header file of support functions for CMSIS NN Library
*
* $Date: 7 November 2023
* $Revision: V.17.5.0
* $Date: 13 November 2023
* $Revision: V.17.6.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
@@ -340,6 +340,53 @@ int8_t *arm_nn_mat_mul_core_4x_s8(const int32_t row_elements,
const int32_t *bias,
int8_t *output);

/**
* @brief General Matrix-multiplication function with per-channel requantization.
* This function assumes:
* - LHS input matrix NOT transposed (nt)
* - RHS input matrix transposed (t)
* - RHS is int8 packed with 2x int4
* - LHS is int8
*
* @note This operation also performs the broadcast bias addition before the requantization
*
* @param[in] lhs Pointer to the LHS input matrix
* @param[in] rhs Pointer to the RHS input matrix
* @param[in] bias Pointer to the bias vector. The length of this vector is equal to the number of
* output columns (or RHS input rows)
* @param[out] dst Pointer to the output matrix with "m" rows and "n" columns
* @param[in] dst_multipliers Pointer to the multipliers vector needed for the per-channel requantization.
* The length of this vector is equal to the number of output columns (or RHS input
* rows)
* @param[in] dst_shifts Pointer to the shifts vector needed for the per-channel requantization. The length
* of this vector is equal to the number of output columns (or RHS input rows)
* @param[in] lhs_rows Number of LHS input rows
* @param[in] rhs_rows Number of RHS input rows
* @param[in] rhs_cols Number of LHS/RHS input columns
* @param[in] lhs_offset Offset to be applied to the LHS input value
 * @param[in]  dst_offset       Offset to be applied to the output result
* @param[in] activation_min Minimum value to clamp down the output. Range : int8
* @param[in] activation_max Maximum value to clamp up the output. Range : int8
* @param[in] lhs_cols_offset Column offset between subsequent lhs_rows
*
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
*
*/
arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s4(const int8_t *lhs,
const int8_t *rhs,
const int32_t *bias,
int8_t *dst,
const int32_t *dst_multipliers,
const int32_t *dst_shifts,
const int32_t lhs_rows,
const int32_t rhs_rows,
const int32_t rhs_cols,
const int32_t lhs_offset,
const int32_t dst_offset,
const int32_t activation_min,
const int32_t activation_max,
const int32_t lhs_cols_offset);
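
/*
 * Illustrative reference, not part of this patch: a minimal scalar sketch of
 * what arm_nn_mat_mult_nt_t_s4() computes. Assumptions made only for the
 * sketch: the RHS packs two int4 values per byte, low nibble first, row-major,
 * with rhs_cols even so rows start on byte boundaries; lhs_cols_offset is the
 * element stride between consecutive LHS rows (per the parameter list above);
 * and requant_ref() stands in for the library's rounding-doubling
 * requantization (arm_nn_requantize), whose exact rounding differs.
 */
static int32_t requant_ref(int64_t acc, int32_t mult, int32_t shift)
{
    /* Simplified: acc * mult * 2^(shift - 31), rounded to nearest. */
    const int32_t total_shift = 31 - shift;
    const int64_t v = acc * (int64_t)mult + ((int64_t)1 << (total_shift - 1));
    return (int32_t)(v >> total_shift);
}

static void mat_mult_nt_t_s4_ref(const int8_t *lhs, const int8_t *rhs, const int32_t *bias,
                                 int8_t *dst, const int32_t *dst_multipliers, const int32_t *dst_shifts,
                                 int32_t lhs_rows, int32_t rhs_rows, int32_t rhs_cols,
                                 int32_t lhs_offset, int32_t dst_offset,
                                 int32_t activation_min, int32_t activation_max,
                                 int32_t lhs_cols_offset)
{
    for (int32_t m = 0; m < lhs_rows; m++)
    {
        for (int32_t n = 0; n < rhs_rows; n++)
        {
            int64_t acc = bias ? bias[n] : 0;
            for (int32_t k = 0; k < rhs_cols; k++)
            {
                const int32_t idx = n * rhs_cols + k;
                const int8_t packed = rhs[idx >> 1];
                /* Unpack one s4 weight: low nibble first, sign-extended. */
                const int32_t w = (idx & 1) ? (packed >> 4) : ((int8_t)(packed << 4) >> 4);
                acc += (int64_t)(lhs[m * lhs_cols_offset + k] + lhs_offset) * w;
            }
            int32_t out = requant_ref(acc, dst_multipliers[n], dst_shifts[n]) + dst_offset;
            out = out < activation_min ? activation_min : out;
            out = out > activation_max ? activation_max : out;
            dst[m * rhs_rows + n] = (int8_t)out;
        }
    }
}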

/**
* @brief General Matrix-multiplication function with per-channel requantization.
* This function assumes:
@@ -822,6 +869,24 @@ __STATIC_FORCEINLINE void read_and_pad_s4_uneven(const int8_t *source, int32_t *
*out2 = SXTB16_RORn(__sxtb16(inA1), 4);
}

/**
 * @brief Read and expand one word of four packed s4 values into two words of four s16 values, preserving
 *        element order (low nibble of each source byte first).
*/
__STATIC_FORCEINLINE void read_and_pad_s4_ordered(const int8_t *source, int32_t *out1, int32_t *out2)
{
int16_t in = arm_nn_read_s8x2(source);
int32_t inA = (in & 0x00FF) | ((in & 0xFF00) << 8);
int32_t inAbuf1 = SXTB16_RORn(__sxtb16(inA), 4);
int32_t inAbuf2 = SXTB16_RORn(__sxtb16(inA << 4), 4);
#ifndef ARM_MATH_BIG_ENDIAN
*out2 = (int32_t)(PKHTB(inAbuf1, inAbuf2, 16));
*out1 = (int32_t)(PKHBT(inAbuf2, inAbuf1, 16));
#else
*out1 = (int32_t)(PKHTB(inAbuf1, inAbuf2, 16));
*out2 = (int32_t)(PKHBT(inAbuf2, inAbuf1, 16));
#endif
}
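
/*
 * Plain-C equivalent of read_and_pad_s4_ordered() for the little-endian path,
 * added here only as an illustration (not part of this patch): the four s4
 * values come out in memory order, low nibble of each source byte first, each
 * output word packing two sign-extended s16 elements.
 */
static void read_and_pad_s4_ordered_ref(const int8_t *source, int32_t *out1, int32_t *out2)
{
    int16_t v[4];
    for (int i = 0; i < 2; i++)
    {
        v[2 * i] = (int16_t)((int8_t)(source[i] << 4) >> 4); /* low nibble, sign-extended */
        v[2 * i + 1] = (int16_t)(source[i] >> 4);            /* high nibble, sign-extended */
    }
    *out1 = (int32_t)(((uint32_t)(uint16_t)v[1] << 16) | (uint16_t)v[0]); /* elements 0 and 1 */
    *out2 = (int32_t)(((uint32_t)(uint16_t)v[3] << 16) | (uint16_t)v[2]); /* elements 2 and 3 */
}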

/**
* @brief read and expand one s8 word into two s16 words with ordering.
*/
@@ -861,6 +926,39 @@ __STATIC_FORCEINLINE const int8_t *read_and_pad_reordered(const int8_t *source,

#endif

/**
* @brief Matrix-multiplication function for convolution with per-channel requantization and 4 bit weights.
* @param[in] input_a pointer to operand A, int8 packed with 2x int4.
* @param[in] input_b pointer to operand B, always consists of 2 vectors.
* @param[in] output_ch number of rows of A
* @param[in] out_shift pointer to per output channel requantization shift parameter.
* @param[in] out_mult pointer to per output channel requantization multiplier parameter.
* @param[in] out_offset output tensor offset.
* @param[in] activation_min minimum value to clamp the output to. Range : int8
* @param[in] activation_max maximum value to clamp the output to. Range : int8
* @param[in] num_col_a number of columns of A
* @param[in] output_bias per output channel bias. Range : int32
* @param[in,out] out_0 pointer to output
* @return The function returns one of the two
* 1. The incremented output pointer for a successful operation or
* 2. NULL if implementation is not available.
*
 * @details This function does the matrix multiplication of the weight matrix for all output channels
 *          with two columns from the im2col buffer and produces two elements per output channel. The
 *          outputs are clamped to the range provided by activation_min and activation_max.
 *          Supported framework: TensorFlow Lite micro.
*/
int8_t *arm_nn_mat_mult_kernel_s4_s16(const int8_t *input_a,
const int16_t *input_b,
const uint16_t output_ch,
const int32_t *out_shift,
const int32_t *out_mult,
const int32_t out_offset,
const int32_t activation_min,
const int32_t activation_max,
const int32_t num_col_a,
const int32_t *const output_bias,
int8_t *out_0);
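
/*
 * Illustrative reference, not part of this patch: what the kernel above is
 * expected to compute, assuming it mirrors the layout of
 * arm_nn_mat_mult_kernel_s8_s16 (two output pixels, channel-interleaved NHWC)
 * and the low-nibble-first s4 packing with num_col_a even. input_b holds the
 * offset-adjusted int16 activations from im2col, so no input offset appears
 * here; requant_ref() is the simplified helper sketched after
 * arm_nn_mat_mult_nt_t_s4() above. The real kernel instead returns NULL when
 * no optimized implementation is available.
 */
static int8_t *mat_mult_kernel_s4_s16_ref(const int8_t *input_a, const int16_t *input_b,
                                          uint16_t output_ch, const int32_t *out_shift,
                                          const int32_t *out_mult, int32_t out_offset,
                                          int32_t activation_min, int32_t activation_max,
                                          int32_t num_col_a, const int32_t *output_bias,
                                          int8_t *out_0)
{
    int8_t *out_1 = out_0 + output_ch; /* second output pixel */
    for (int32_t ch = 0; ch < output_ch; ch++)
    {
        int64_t acc0 = output_bias ? output_bias[ch] : 0;
        int64_t acc1 = acc0;
        for (int32_t k = 0; k < num_col_a; k++)
        {
            const int32_t idx = ch * num_col_a + k;
            const int8_t packed = input_a[idx >> 1];
            const int32_t w = (idx & 1) ? (packed >> 4) : ((int8_t)(packed << 4) >> 4);
            acc0 += (int64_t)w * input_b[k];             /* first im2col column */
            acc1 += (int64_t)w * input_b[k + num_col_a]; /* second im2col column */
        }
        int32_t o0 = requant_ref(acc0, out_mult[ch], out_shift[ch]) + out_offset;
        int32_t o1 = requant_ref(acc1, out_mult[ch], out_shift[ch]) + out_offset;
        o0 = o0 < activation_min ? activation_min : (o0 > activation_max ? activation_max : o0);
        o1 = o1 < activation_min ? activation_min : (o1 > activation_max ? activation_max : o1);
        *out_0++ = (int8_t)o0;
        *out_1++ = (int8_t)o1;
    }
    return out_0 + output_ch; /* advanced past both written pixels */
}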
/**
* @brief Matrix-multiplication function for convolution with per-channel requantization.
* @param[in] input_a pointer to operand A
26 changes: 13 additions & 13 deletions README.md
@@ -23,19 +23,19 @@ processors here are Cortex-M4 or a Cortex-M33 configured with optional DSP extension
Processors with Arm Helium Technology use the Arm M-profile Vector Extension(MVE) instructions for optimization.
Examples are Cortex-M55 or Cortex-M85 configured with MVE.

| Operator | C <br> int8 | C<br>int16 | C<br>int4* | DSP<br>int8 | DSP<br>int16 | DSP<br>int4* | MVE<br>int8 | MVE<br>int16 |
| --------------- | ----------- | ---------- | ----------- | ------------| -------------| -------------| ------------| -------------|
| Conv2D | Yes | Yes | No | Yes | Yes | No | Yes | Yes |
| DepthwiseConv2D | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
| TransposeConv2D | Yes | No | No | No | No | No | No | No |
| Fully Connected | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
| Add | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes |
| Mul | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes |
| MaxPooling | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes |
| AvgPooling | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes |
| Softmax | Yes | Yes | N/A | Yes | Yes | N/A | Yes | No |
| LSTM | Yes | NA | No | Yes | NA | No | Yes | NA |
| SVDF | Yes | No | No | Yes | No | No | Yes | No |
| Operator | C <br> int8 | C<br>int16 | C<br>int4* | DSP<br>int8 | DSP<br>int16 | DSP<br>int4* | MVE<br>int8 | MVE<br>int16 |
| --------------- | ----------- | ---------- |------------| ------------| -------------|--------------| ------------| -------------|
| Conv2D | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
| DepthwiseConv2D | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
| TransposeConv2D | Yes | No | No | No | No | No | No | No |
| Fully Connected | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
| Add | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes |
| Mul | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes |
| MaxPooling | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes |
| AvgPooling | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes |
| Softmax | Yes | Yes | N/A | Yes | Yes | N/A | Yes | No |
| LSTM | Yes | NA | No | Yes | NA | No | Yes | NA |
| SVDF | Yes | No | No | Yes | No | No | Yes | No |

* int4 weights + int8 activations

4 changes: 2 additions & 2 deletions Source/ConvolutionFunctions/CMakeLists.txt
@@ -17,6 +17,6 @@
#

file(GLOB SRC_S4 "./*_s4*.c")
file(GLOB SRC "./*_s8*.c")
file(GLOB SRC_S8 "./*_s8*.c")
file(GLOB SRC_S16 "./*_s16*.c")
target_sources(cmsis-nn PRIVATE ${SRC} ${SRC_S16} ${SRC_S4})
target_sources(cmsis-nn PRIVATE ${SRC_S4} ${SRC_S8} ${SRC_S16})
115 changes: 115 additions & 0 deletions Source/ConvolutionFunctions/arm_convolve_1x1_s4.c
@@ -0,0 +1,115 @@
/*
* SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates <[email protected]>
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_1x1_s4.c
* Description: Generic s4 version of 1x1 convolution
*
* $Date: 01 November 2023
* $Revision: V.1.0.0
*
* Target : Arm(R) M-Profile Architecture
*
* -------------------------------------------------------------------- */

#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"

/**
* @ingroup Public
*/

/**
* @addtogroup NNConv
* @{
*/

/*
* A more generic version of s4 1x1 convolution intended for non-unity strides. This is slower
* than the _fast() version if used for unity stride values.
*
 * Refer to the header file for details.
*
*/
arm_cmsis_nn_status arm_convolve_1x1_s4(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const int8_t *input_data,
const cmsis_nn_dims *filter_dims,
const int8_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
int8_t *output_data)
{
(void)ctx;
(void)filter_dims;
(void)bias_dims;
if (conv_params->padding.w != 0 || conv_params->padding.h != 0)
{
return ARM_CMSIS_NN_ARG_ERROR;
}

const int32_t lhs_rows = output_dims->w;
const int32_t rhs_rows = output_dims->c;
const int32_t rhs_cols = input_dims->c;
const int32_t stride_w = conv_params->stride.w;
const int32_t input_inc = input_dims->w * conv_params->stride.h * rhs_cols;
const int32_t output_inc = output_dims->w * rhs_rows;
const int32_t output_h = output_dims->h;
const int32_t batch = input_dims->n;
const int8_t *input_data_ref = input_data;

for (int i_batch = 0; i_batch < batch; i_batch++)
{
input_data = input_data_ref + (i_batch * rhs_cols * input_dims->w * input_dims->h);
for (int i_output_h = 0; i_output_h < output_h; i_output_h++)
{
// Process one input row
arm_cmsis_nn_status result = arm_nn_mat_mult_nt_t_s4(input_data,
filter_data,
bias_data,
output_data,
quant_params->multiplier,
quant_params->shift,
lhs_rows,
rhs_rows,
rhs_cols,
conv_params->input_offset,
conv_params->output_offset,
conv_params->activation.min,
conv_params->activation.max,
rhs_cols * stride_w);
if (result != ARM_CMSIS_NN_SUCCESS)
{
return result;
}
input_data += input_inc;
output_data += output_inc;
}
}

/* Return to application */
return ARM_CMSIS_NN_SUCCESS;
}

/**
* @} end of NNConv group
*/
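
For orientation, a hedged usage sketch of the new function (not part of the commit): all dimensions, offsets, and the helper name below are hypothetical, and the NULL scratch buffer relies on the generic 1x1 path ignoring its context argument, as the code above shows. A real application would size the context via arm_convolve_get_buffer_sizes_s4().

#include "arm_nnfunctions.h"

arm_cmsis_nn_status run_pointwise_s4_example(const int8_t *input,          /* 1x8x8x16, NHWC */
                                             const int8_t *packed_weights, /* 24*16/2 bytes, 2x int4 per byte */
                                             const int32_t *bias,          /* 24 entries */
                                             int32_t *per_ch_mult,         /* 24 entries */
                                             int32_t *per_ch_shift,        /* 24 entries */
                                             int8_t *output)               /* 1x4x4x24 */
{
    const cmsis_nn_dims input_dims = {.n = 1, .h = 8, .w = 8, .c = 16};
    const cmsis_nn_dims filter_dims = {.n = 24, .h = 1, .w = 1, .c = 16};
    const cmsis_nn_dims bias_dims = {.n = 1, .h = 1, .w = 1, .c = 24};
    const cmsis_nn_dims output_dims = {.n = 1, .h = 4, .w = 4, .c = 24};

    const cmsis_nn_conv_params conv_params = {
        .input_offset = 0,
        .output_offset = 0,
        .stride = {.w = 2, .h = 2},  /* non-unity stride: the generic variant applies */
        .padding = {.w = 0, .h = 0}, /* must be zero or ARM_CMSIS_NN_ARG_ERROR is returned */
        .dilation = {.w = 1, .h = 1},
        .activation = {.min = -128, .max = 127},
    };
    const cmsis_nn_per_channel_quant_params quant_params = {.multiplier = per_ch_mult,
                                                            .shift = per_ch_shift};
    cmsis_nn_context ctx = {.buf = NULL, .size = 0}; /* ctx is unused by this variant */

    return arm_convolve_1x1_s4(&ctx, &conv_params, &quant_params, &input_dims, input,
                               &filter_dims, packed_weights, &bias_dims, bias,
                               &output_dims, output);
}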