4-bit support for convolution (#85)
- Adds arm_convolve_wrapper function for 4-bit weights
- Adds arm_convolve_1x1 normal and fast variants for 4-bit weights
- Adds arm_nn_mat_mult_kernel_s4_s16 for multiplying 4-bit weights with 16-bit input
- Adds mat_mult_nt_t_s4 function for DSP and scalar
- Adds scalar and DSP implementations of arm_convolve with 4-bit weights
- Adds unit tests for 4-bit weight convolutions

Change-Id: Idea55432fdab2db05a033889d7c39dd0ea69f8ad

Signed-off-by: Ryan O'Shea <[email protected]>
ArmRyan authored Nov 14, 2023
1 parent edececa commit bfc54ed
Showing 231 changed files with 10,857 additions and 54 deletions.
7 changes: 7 additions & 0 deletions ARM.CMSIS-NN.pdsc
@@ -35,9 +35,12 @@
<file category="header" name="Include/arm_nn_math_types.h"/>

<file category="source" name="Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s4_s16.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_1x1_s4_fast.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_1x1_s4.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_1x1_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_s16.c"/>
@@ -50,14 +53,17 @@
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_s4_opt.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_s4.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_fast_s16.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_s4.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_s16.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_3x3_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_wrapper_s4.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_wrapper_s16.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_get_buffer_sizes_s16.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_get_buffer_sizes_s4.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_get_buffer_sizes_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_transpose_conv_s8.c"/>
@@ -90,6 +96,7 @@
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_svdf_s8.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_s8_to_s16_unordered_with_offset.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s4.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8_s32.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s16.c"/>
231 changes: 229 additions & 2 deletions Include/arm_nnfunctions.h

Large diffs are not rendered by default.

102 changes: 100 additions & 2 deletions Include/arm_nnsupportfunctions.h
@@ -21,8 +21,8 @@
* Title: arm_nnsupportfunctions.h
* Description: Public header file of support functions for CMSIS NN Library
*
* $Date: 7 November 2023
* $Revision: V.17.5.0
* $Date: 13 November 2023
* $Revision: V.17.6.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
@@ -340,6 +340,53 @@ int8_t *arm_nn_mat_mul_core_4x_s8(const int32_t row_elements,
const int32_t *bias,
int8_t *output);

/**
* @brief General Matrix-multiplication function with per-channel requantization.
* This function assumes:
* - LHS input matrix NOT transposed (nt)
* - RHS input matrix transposed (t)
* - RHS is int8 packed with 2x int4
* - LHS is int8
*
* @note This operation also performs the broadcast bias addition before the requantization
*
* @param[in] lhs Pointer to the LHS input matrix
* @param[in] rhs Pointer to the RHS input matrix
* @param[in] bias Pointer to the bias vector. The length of this vector is equal to the number of
* output columns (or RHS input rows)
* @param[out] dst Pointer to the output matrix with "m" rows and "n" columns
* @param[in] dst_multipliers Pointer to the multipliers vector needed for the per-channel requantization.
* The length of this vector is equal to the number of output columns (or RHS input
* rows)
* @param[in] dst_shifts Pointer to the shifts vector needed for the per-channel requantization. The length
* of this vector is equal to the number of output columns (or RHS input rows)
* @param[in] lhs_rows Number of LHS input rows
* @param[in] rhs_rows Number of RHS input rows
* @param[in] rhs_cols Number of LHS/RHS input columns
* @param[in] lhs_offset Offset to be applied to the LHS input value
 * @param[in]  dst_offset       Offset to be applied to the output result
* @param[in] activation_min Minimum value to clamp down the output. Range : int8
* @param[in] activation_max Maximum value to clamp up the output. Range : int8
* @param[in] lhs_cols_offset Column offset between subsequent lhs_rows
*
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
*
*/
arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s4(const int8_t *lhs,
const int8_t *rhs,
const int32_t *bias,
int8_t *dst,
const int32_t *dst_multipliers,
const int32_t *dst_shifts,
const int32_t lhs_rows,
const int32_t rhs_rows,
const int32_t rhs_cols,
const int32_t lhs_offset,
const int32_t dst_offset,
const int32_t activation_min,
const int32_t activation_max,
const int32_t lhs_cols_offset);
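
/*
 * Illustrative reference, not part of this patch: a minimal scalar sketch of
 * what arm_nn_mat_mult_nt_t_s4() computes. Assumptions made only for the
 * sketch: the RHS packs two int4 values per byte, low nibble first, row-major,
 * with rhs_cols even so rows start on byte boundaries; lhs_cols_offset is the
 * element stride between consecutive LHS rows (per the parameter list above);
 * and requant_ref() stands in for the library's rounding-doubling
 * requantization (arm_nn_requantize), whose exact rounding differs.
 */
static int32_t requant_ref(int64_t acc, int32_t mult, int32_t shift)
{
    /* Simplified: acc * mult * 2^(shift - 31), rounded to nearest. */
    const int32_t total_shift = 31 - shift;
    const int64_t v = acc * (int64_t)mult + ((int64_t)1 << (total_shift - 1));
    return (int32_t)(v >> total_shift);
}

static void mat_mult_nt_t_s4_ref(const int8_t *lhs, const int8_t *rhs, const int32_t *bias,
                                 int8_t *dst, const int32_t *dst_multipliers, const int32_t *dst_shifts,
                                 int32_t lhs_rows, int32_t rhs_rows, int32_t rhs_cols,
                                 int32_t lhs_offset, int32_t dst_offset,
                                 int32_t activation_min, int32_t activation_max,
                                 int32_t lhs_cols_offset)
{
    for (int32_t m = 0; m < lhs_rows; m++)
    {
        for (int32_t n = 0; n < rhs_rows; n++)
        {
            int64_t acc = bias ? bias[n] : 0;
            for (int32_t k = 0; k < rhs_cols; k++)
            {
                const int32_t idx = n * rhs_cols + k;
                const int8_t packed = rhs[idx >> 1];
                /* Unpack one s4 weight: low nibble first, sign-extended. */
                const int32_t w = (idx & 1) ? (packed >> 4) : ((int8_t)(packed << 4) >> 4);
                acc += (int64_t)(lhs[m * lhs_cols_offset + k] + lhs_offset) * w;
            }
            int32_t out = requant_ref(acc, dst_multipliers[n], dst_shifts[n]) + dst_offset;
            out = out < activation_min ? activation_min : out;
            out = out > activation_max ? activation_max : out;
            dst[m * rhs_rows + n] = (int8_t)out;
        }
    }
}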

/**
* @brief General Matrix-multiplication function with per-channel requantization.
* This function assumes:
@@ -822,6 +869,24 @@ __STATIC_FORCEINLINE void read_and_pad_s4_uneven(const int8_t *source, int32_t *
*out2 = SXTB16_RORn(__sxtb16(inA1), 4);
}

/**
 * @brief Read and expand one word of four packed s4 values into two words of four s16 values, preserving
 *        element order (low nibble of each source byte first).
*/
__STATIC_FORCEINLINE void read_and_pad_s4_ordered(const int8_t *source, int32_t *out1, int32_t *out2)
{
int16_t in = arm_nn_read_s8x2(source);
int32_t inA = (in & 0x00FF) | ((in & 0xFF00) << 8);
int32_t inAbuf1 = SXTB16_RORn(__sxtb16(inA), 4);
int32_t inAbuf2 = SXTB16_RORn(__sxtb16(inA << 4), 4);
#ifndef ARM_MATH_BIG_ENDIAN
*out2 = (int32_t)(PKHTB(inAbuf1, inAbuf2, 16));
*out1 = (int32_t)(PKHBT(inAbuf2, inAbuf1, 16));
#else
*out1 = (int32_t)(PKHTB(inAbuf1, inAbuf2, 16));
*out2 = (int32_t)(PKHBT(inAbuf2, inAbuf1, 16));
#endif
}
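
/*
 * Plain-C equivalent of read_and_pad_s4_ordered() for the little-endian path,
 * added here only as an illustration (not part of this patch): the four s4
 * values come out in memory order, low nibble of each source byte first, each
 * output word packing two sign-extended s16 elements.
 */
static void read_and_pad_s4_ordered_ref(const int8_t *source, int32_t *out1, int32_t *out2)
{
    int16_t v[4];
    for (int i = 0; i < 2; i++)
    {
        v[2 * i] = (int16_t)((int8_t)(source[i] << 4) >> 4); /* low nibble, sign-extended */
        v[2 * i + 1] = (int16_t)(source[i] >> 4);            /* high nibble, sign-extended */
    }
    *out1 = (int32_t)(((uint32_t)(uint16_t)v[1] << 16) | (uint16_t)v[0]); /* elements 0 and 1 */
    *out2 = (int32_t)(((uint32_t)(uint16_t)v[3] << 16) | (uint16_t)v[2]); /* elements 2 and 3 */
}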

/**
* @brief read and expand one s8 word into two s16 words with ordering.
*/
@@ -861,6 +926,39 @@ __STATIC_FORCEINLINE const int8_t *read_and_pad_reordered(const int8_t *source,

#endif

/**
* @brief Matrix-multiplication function for convolution with per-channel requantization and 4 bit weights.
* @param[in] input_a pointer to operand A, int8 packed with 2x int4.
* @param[in] input_b pointer to operand B, always consists of 2 vectors.
* @param[in] output_ch number of rows of A
* @param[in] out_shift pointer to per output channel requantization shift parameter.
* @param[in] out_mult pointer to per output channel requantization multiplier parameter.
* @param[in] out_offset output tensor offset.
* @param[in] activation_min minimum value to clamp the output to. Range : int8
* @param[in] activation_max maximum value to clamp the output to. Range : int8
* @param[in] num_col_a number of columns of A
* @param[in] output_bias per output channel bias. Range : int32
* @param[in,out] out_0 pointer to output
* @return The function returns one of the two
* 1. The incremented output pointer for a successful operation or
* 2. NULL if implementation is not available.
*
 * @details This function does the matrix multiplication of the weight matrix for all output channels
 *          with two columns from the im2col buffer and produces two elements per output channel. The
 *          outputs are clamped to the range provided by activation_min and activation_max.
 *          Supported framework: TensorFlow Lite micro.
*/
int8_t *arm_nn_mat_mult_kernel_s4_s16(const int8_t *input_a,
const int16_t *input_b,
const uint16_t output_ch,
const int32_t *out_shift,
const int32_t *out_mult,
const int32_t out_offset,
const int32_t activation_min,
const int32_t activation_max,
const int32_t num_col_a,
const int32_t *const output_bias,
int8_t *out_0);
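
/*
 * Illustrative reference, not part of this patch: what the kernel above is
 * expected to compute, assuming it mirrors the layout of
 * arm_nn_mat_mult_kernel_s8_s16 (two output pixels, channel-interleaved NHWC)
 * and the low-nibble-first s4 packing with num_col_a even. input_b holds the
 * offset-adjusted int16 activations from im2col, so no input offset appears
 * here; requant_ref() is the simplified helper sketched after
 * arm_nn_mat_mult_nt_t_s4() above. The real kernel instead returns NULL when
 * no optimized implementation is available.
 */
static int8_t *mat_mult_kernel_s4_s16_ref(const int8_t *input_a, const int16_t *input_b,
                                          uint16_t output_ch, const int32_t *out_shift,
                                          const int32_t *out_mult, int32_t out_offset,
                                          int32_t activation_min, int32_t activation_max,
                                          int32_t num_col_a, const int32_t *output_bias,
                                          int8_t *out_0)
{
    int8_t *out_1 = out_0 + output_ch; /* second output pixel */
    for (int32_t ch = 0; ch < output_ch; ch++)
    {
        int64_t acc0 = output_bias ? output_bias[ch] : 0;
        int64_t acc1 = acc0;
        for (int32_t k = 0; k < num_col_a; k++)
        {
            const int32_t idx = ch * num_col_a + k;
            const int8_t packed = input_a[idx >> 1];
            const int32_t w = (idx & 1) ? (packed >> 4) : ((int8_t)(packed << 4) >> 4);
            acc0 += (int64_t)w * input_b[k];             /* first im2col column */
            acc1 += (int64_t)w * input_b[k + num_col_a]; /* second im2col column */
        }
        int32_t o0 = requant_ref(acc0, out_mult[ch], out_shift[ch]) + out_offset;
        int32_t o1 = requant_ref(acc1, out_mult[ch], out_shift[ch]) + out_offset;
        o0 = o0 < activation_min ? activation_min : (o0 > activation_max ? activation_max : o0);
        o1 = o1 < activation_min ? activation_min : (o1 > activation_max ? activation_max : o1);
        *out_0++ = (int8_t)o0;
        *out_1++ = (int8_t)o1;
    }
    return out_0 + output_ch; /* advanced past both written pixels */
}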
/**
* @brief Matrix-multiplication function for convolution with per-channel requantization.
* @param[in] input_a pointer to operand A
26 changes: 13 additions & 13 deletions README.md
@@ -23,19 +23,19 @@ processors here are Cortex-M4 or a Cortex-M33 configured with optional DSP extension
Processors with Arm Helium Technology use the Arm M-profile Vector Extension(MVE) instructions for optimization.
Examples are Cortex-M55 or Cortex-M85 configured with MVE.

| Operator | C <br> int8 | C<br>int16 | C<br>int4* | DSP<br>int8 | DSP<br>int16 | DSP<br>int4* | MVE<br>int8 | MVE<br>int16 |
| --------------- | ----------- | ---------- | ----------- | ------------| -------------| -------------| ------------| -------------|
| Conv2D | Yes | Yes | No | Yes | Yes | No | Yes | Yes |
| DepthwiseConv2D | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
| TransposeConv2D | Yes | No | No | No | No | No | No | No |
| Fully Connected | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
| Add | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes |
| Mul | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes |
| MaxPooling | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes |
| AvgPooling | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes |
| Softmax | Yes | Yes | N/A | Yes | Yes | N/A | Yes | No |
| LSTM | Yes | NA | No | Yes | NA | No | Yes | NA |
| SVDF | Yes | No | No | Yes | No | No | Yes | No |
| Operator | C <br> int8 | C<br>int16 | C<br>int4* | DSP<br>int8 | DSP<br>int16 | DSP<br>int4* | MVE<br>int8 | MVE<br>int16 |
| --------------- | ----------- | ---------- |------------| ------------| -------------|--------------| ------------| -------------|
| Conv2D | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
| DepthwiseConv2D | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
| TransposeConv2D | Yes | No | No | No | No | No | No | No |
| Fully Connected | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
| Add | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes |
| Mul | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes |
| MaxPooling | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes |
| AvgPooling | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes |
| Softmax | Yes | Yes | N/A | Yes | Yes | N/A | Yes | No |
| LSTM | Yes | NA | No | Yes | NA | No | Yes | NA |
| SVDF | Yes | No | No | Yes | No | No | Yes | No |

* int4 weights + int8 activations

4 changes: 2 additions & 2 deletions Source/ConvolutionFunctions/CMakeLists.txt
@@ -17,6 +17,6 @@
#

file(GLOB SRC_S4 "./*_s4*.c")
file(GLOB SRC "./*_s8*.c")
file(GLOB SRC_S8 "./*_s8*.c")
file(GLOB SRC_S16 "./*_s16*.c")
target_sources(cmsis-nn PRIVATE ${SRC} ${SRC_S16} ${SRC_S4})
target_sources(cmsis-nn PRIVATE ${SRC_S4} ${SRC_S8} ${SRC_S16})
115 changes: 115 additions & 0 deletions Source/ConvolutionFunctions/arm_convolve_1x1_s4.c
@@ -0,0 +1,115 @@
/*
* SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates <[email protected]>
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_1x1_s4.c
* Description: Generic s4 version of 1x1 convolution
*
* $Date: 01 November 2023
* $Revision: V.1.0.0
*
* Target : Arm(R) M-Profile Architecture
*
* -------------------------------------------------------------------- */

#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"

/**
* @ingroup Public
*/

/**
* @addtogroup NNConv
* @{
*/

/*
* A more generic version of s4 1x1 convolution intended for non-unity strides. This is slower
* than the _fast() version if used for unity stride values.
*
 * Refer to the header file for details.
*
*/
arm_cmsis_nn_status arm_convolve_1x1_s4(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const int8_t *input_data,
const cmsis_nn_dims *filter_dims,
const int8_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
int8_t *output_data)
{
(void)ctx;
(void)filter_dims;
(void)bias_dims;
if (conv_params->padding.w != 0 || conv_params->padding.h != 0)
{
return ARM_CMSIS_NN_ARG_ERROR;
}

const int32_t lhs_rows = output_dims->w;
const int32_t rhs_rows = output_dims->c;
const int32_t rhs_cols = input_dims->c;
const int32_t stride_w = conv_params->stride.w;
const int32_t input_inc = input_dims->w * conv_params->stride.h * rhs_cols;
const int32_t output_inc = output_dims->w * rhs_rows;
const int32_t output_h = output_dims->h;
const int32_t batch = input_dims->n;
const int8_t *input_data_ref = input_data;

for (int i_batch = 0; i_batch < batch; i_batch++)
{
input_data = input_data_ref + (i_batch * rhs_cols * input_dims->w * input_dims->h);
for (int i_output_h = 0; i_output_h < output_h; i_output_h++)
{
// Process one input row
arm_cmsis_nn_status result = arm_nn_mat_mult_nt_t_s4(input_data,
filter_data,
bias_data,
output_data,
quant_params->multiplier,
quant_params->shift,
lhs_rows,
rhs_rows,
rhs_cols,
conv_params->input_offset,
conv_params->output_offset,
conv_params->activation.min,
conv_params->activation.max,
rhs_cols * stride_w);
if (result != ARM_CMSIS_NN_SUCCESS)
{
return result;
}
input_data += input_inc;
output_data += output_inc;
}
}

/* Return to application */
return ARM_CMSIS_NN_SUCCESS;
}

/**
* @} end of NNConv group
*/
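
For orientation, a hedged usage sketch of the new function (not part of the commit): all dimensions, offsets, and the helper name below are hypothetical, and the NULL scratch buffer relies on the generic 1x1 path ignoring its context argument, as the code above shows. A real application would size the context via arm_convolve_get_buffer_sizes_s4().

#include "arm_nnfunctions.h"

arm_cmsis_nn_status run_pointwise_s4_example(const int8_t *input,          /* 1x8x8x16, NHWC */
                                             const int8_t *packed_weights, /* 24*16/2 bytes, 2x int4 per byte */
                                             const int32_t *bias,          /* 24 entries */
                                             int32_t *per_ch_mult,         /* 24 entries */
                                             int32_t *per_ch_shift,        /* 24 entries */
                                             int8_t *output)               /* 1x4x4x24 */
{
    const cmsis_nn_dims input_dims = {.n = 1, .h = 8, .w = 8, .c = 16};
    const cmsis_nn_dims filter_dims = {.n = 24, .h = 1, .w = 1, .c = 16};
    const cmsis_nn_dims bias_dims = {.n = 1, .h = 1, .w = 1, .c = 24};
    const cmsis_nn_dims output_dims = {.n = 1, .h = 4, .w = 4, .c = 24};

    const cmsis_nn_conv_params conv_params = {
        .input_offset = 0,
        .output_offset = 0,
        .stride = {.w = 2, .h = 2},  /* non-unity stride: the generic variant applies */
        .padding = {.w = 0, .h = 0}, /* must be zero or ARM_CMSIS_NN_ARG_ERROR is returned */
        .dilation = {.w = 1, .h = 1},
        .activation = {.min = -128, .max = 127},
    };
    const cmsis_nn_per_channel_quant_params quant_params = {.multiplier = per_ch_mult,
                                                            .shift = per_ch_shift};
    cmsis_nn_context ctx = {.buf = NULL, .size = 0}; /* ctx is unused by this variant */

    return arm_convolve_1x1_s4(&ctx, &conv_params, &quant_params, &input_dims, input,
                               &filter_dims, packed_weights, &bias_dims, bias,
                               &output_dims, output);
}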