From 47939cb3bbbde373b5fa185e31a6c4902726e038 Mon Sep 17 00:00:00 2001
From: laurilaatu <laurilaatu@users.noreply.github.com>
Date: Wed, 18 Dec 2024 16:07:19 +0000
Subject: [PATCH] Depthwise convolution for oneAPI (#1131)

* snapshot adding oneapi

* fix reduce constexpr

* further updates

* update the bridge and testbench

* fix issues discovered when compiling

* update bridge writing files

* build library (but not tested)

* fix a bug in testbench

* snapshot after some debugging

* remove forgotten debug printing

* add build

* pre-commit fixes

* fix more pre-commit

* fix more pre-commit errors

* snapshot of work before reworking types

* Use using to decide array type, some preliminary updates

* snapshot unifying types

* fix the testbench and bridge

* snapshot updating nnet_utils (not finished)

* define array in nnet_types for oneAPI

* fix parallel conv2d

* add back the streaming versions of algs, most unconverted

* tentatively complete streaming for dense but not functional

* first version that compiles streaming

* change how the pipe value type is extracted

* fix pre-commit error

* always treat elu as ELU class

* fix batchnorm

* snapshot towards fixing conv

* snapshot fixing test for streaming

* fix conv1d

* fix conv2d

* fix reshape and flatten for oneAPI

* initial oneAPI tests

* remove nnet_dense_compressed from oneAPI

* add merge functionality (untested)

* fix merge for oneAPI

* fix merge for oneAPI (missing commit)

* add zeropadding

* standardize parallelization spelling

* fix pointwise for oneAPI

* remove references to quartus

* more replace quartus with oneapi

* snapshot on the way towards implementing pooling

* fix io_stream pooling for oneAPI

* add fix for Conv2DBatchnorm

* accidentally committed CMakeLists.txt in my debug setup

* reshaping, not fully tested

* fix cloning of streams

* fix pytest library loading

* remove unused template

* fix some activation bugs

* fix the overwriting of directories in the pytest

* update version of test repository

* try to fix docker issue

* bump hls4ml-testing tag to 0.5.2

* try not restricting tensorflow-model-optimization

* Update to 0.5.3 for testing

* bump to docker image 0.5.4, suggested by Ben

* fix pre-commit warning

* dial down N_TESTS_PER_YAML to 4

* revert tensorflow-model-optimization change

* fix issue of saving in "obsolete" h5 format

* fix embedding for oneAPI

* First attempt at adding RNNs to oneAPI

* fix bug in array size

* fix order of indices

* make queues static in bridge

* fix logic error in repack stream

* changing the style, but functionally identical

* update pointwise optimizer for oneAPI

* add oneAPI to test_multi_dense.py

* fix updating weight types

* initial changes of templates, for testing

* fix weight naming, product selection

* make im2col the default; fix winograd size

* fix up streaming dense and convolution

* fix prelu, some batchnorm

* fix weight array of exponential types

* move ACExponentialPrecisionDefinition to oneapi_types

* attempt to fix batchnorm and recurrent

* fixed BatchNormalizationQuantizedTanhConfigTemplate template selection

* fix embedding_stream

* fix lstm and simple rnn

* fix GRU

* fix winograd, and also disable it by default

* fix threshold name

* split bn_quant to be backend-specific

* add type inference to oneAPI

* add oneAPI to pytorch tests

* fix pooling with padding for oneAPI and Quartus

* Compilation for larger models enabled by increasing -fconstexpr-steps

* add oneapi clone tests; remove redundant multi_clone test

* remove some attributes to avoid overwrite warnings

* make extra handling for oneAPI like others (as in PR #1067)

* remove warnings for extra optimizers that are not scheduled on purpose

* update parametrized activations

* initial depthconv2d implementation

* initial depthconv2d implementation

* Rename to depthconv, add strides and add tests

* Remove class for DepthwiseConv2D

* Remove Separable convolution template

* Remove layer optimizer for sepconv

* Loop unroll

* Pre-commit format

* Fix spelling

* depthconv1d, channel order in loop, product

* Gather result to accum

---------

Co-authored-by: Jovan Mitrevski <jmitrevs@fnal.gov>
Co-authored-by: Jovan Mitrevski <j.p.mitrevski@gmail.com>
---
 hls4ml/backends/fpga/fpga_backend.py          |  2 +-
 .../oneapi/passes/convolution_templates.py    | 40 +++++++++-
 hls4ml/model/optimizer/__init__.py            |  2 +-
 .../optimizer/passes/seperable_to_dw_conv.py  | 10 +--
 .../firmware/nnet_utils/nnet_depthconv1d.h    | 19 +++++
 .../nnet_utils/nnet_depthconv1d_resource.h    | 60 +++++++++++++++
 .../firmware/nnet_utils/nnet_depthconv2d.h    | 19 +++++
 .../nnet_utils/nnet_depthconv2d_resource.h    | 76 +++++++++++++++++++
 test/pytest/test_depthconv1d.py               |  1 +
 test/pytest/test_depthconv2d.py               |  1 +
 test/pytest/test_sepconv1d.py                 |  1 +
 test/pytest/test_sepconv2d.py                 |  1 +
 12 files changed, 222 insertions(+), 10 deletions(-)
 create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d.h
 create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d_resource.h
 create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d.h
 create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d_resource.h

diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py
index fbfed71c5b..b20fdf1228 100644
--- a/hls4ml/backends/fpga/fpga_backend.py
+++ b/hls4ml/backends/fpga/fpga_backend.py
@@ -94,7 +94,7 @@ def __init__(self, name):
             attrs.append(ConfigurableAttribute('reuse_factor', default=1, description=descriptions.reuse_factor))
             self.attribute_map[layer] = attrs
 
-        # seperable is kind of special because it is effectively two layers that will be split
+        # separable is kind of special because it is effectively two layers that will be split
         for layer in (SeparableConv1D, SeparableConv2D):
             attrs = self.attribute_map.get(layer, [])
             attrs.append(TypeAttribute('depthwise_accum'))
diff --git a/hls4ml/backends/oneapi/passes/convolution_templates.py b/hls4ml/backends/oneapi/passes/convolution_templates.py
index 17154559d8..64d9e42228 100644
--- a/hls4ml/backends/oneapi/passes/convolution_templates.py
+++ b/hls4ml/backends/oneapi/passes/convolution_templates.py
@@ -1,7 +1,7 @@
 from hls4ml.backends.backend import get_backend
 from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate
 from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate
-from hls4ml.model.layers import Conv1D, Conv2D, Conv2DBatchnorm
+from hls4ml.model.layers import Conv1D, Conv2D, Conv2DBatchnorm, DepthwiseConv1D, DepthwiseConv2D
 
 # TODO - Dilation rate ?
 
@@ -70,9 +70,20 @@
 conv1d_include_list = ['nnet_utils/nnet_conv1d.h', 'nnet_utils/nnet_conv1d_stream.h']
 
 
+depthconv1d_function_template = (  # format string for the generated nnet::depthwise_conv_1d_* call
+    'nnet::depthwise_conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
+)
+depthconv1d_include_list = [  # headers handed to the function template as include_header
+    'nnet_utils/nnet_conv1d.h',
+    'nnet_utils/nnet_conv1d_resource.h',
+    'nnet_utils/nnet_depthconv1d.h',
+    'nnet_utils/nnet_depthconv1d_resource.h',
+]
+
+
 class Conv1DConfigTemplate(LayerConfigTemplate):
     def __init__(self):
-        super().__init__(Conv1D)
+        super().__init__((Conv1D, DepthwiseConv1D))
         self.template = conv1d_config_template
         self.mult_template = conv_mult_config_template
 
@@ -137,6 +148,12 @@ def format(self, node):
         return self.template.format(**params)
 
 
+class DepthwiseConv1DFunctionTemplate(Conv1DFunctionTemplate):  # function-call template for DepthwiseConv1D layers
+    def __init__(self):  # two-argument super() below resumes the MRO lookup after Conv1DFunctionTemplate
+        super(Conv1DFunctionTemplate, self).__init__(DepthwiseConv1D, include_header=depthconv1d_include_list)
+        self.template = depthconv1d_function_template  # swap in the depthwise call; no format() override here
+
+
 ''' 2D Conv '''
 conv2d_config_template = """struct config{index} : nnet::conv2d_config {{
     static const unsigned in_height = {in_height};
@@ -183,7 +200,7 @@ def format(self, node):
 
 class Conv2DConfigTemplate(LayerConfigTemplate):
     def __init__(self):
-        super().__init__((Conv2D, Conv2DBatchnorm))
+        super().__init__((Conv2D, Conv2DBatchnorm, DepthwiseConv2D))
         self.template = conv2d_config_template
         self.mult_template = conv_mult_config_template
 
@@ -233,3 +250,20 @@ def format(self, node):
             raise RuntimeError('channels_first not supported on oneAPI')
         params['data_format'] = 'cl'
         return self.template.format(**params)
+
+
+depthconv2d_function_template = (  # format string for the generated nnet::depthwise_conv_2d_* call
+    'nnet::depthwise_conv_2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
+)
+depthconv2d_include_list = [  # headers handed to the function template as include_header
+    'nnet_utils/nnet_conv2d.h',
+    'nnet_utils/nnet_conv2d_resource.h',
+    'nnet_utils/nnet_depthconv2d.h',
+    'nnet_utils/nnet_depthconv2d_resource.h',
+]
+
+
+class DepthwiseConv2DFunctionTemplate(Conv2DFunctionTemplate):  # function-call template for DepthwiseConv2D layers
+    def __init__(self):  # two-argument super() below resumes the MRO lookup after Conv2DFunctionTemplate
+        super(Conv2DFunctionTemplate, self).__init__(DepthwiseConv2D, include_header=depthconv2d_include_list)
+        self.template = depthconv2d_function_template  # swap in the depthwise call; no format() override here
diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py
index a745eceba1..7e9325ccd0 100644
--- a/hls4ml/model/optimizer/__init__.py
+++ b/hls4ml/model/optimizer/__init__.py
@@ -59,7 +59,7 @@
     'convert',
     [
         'channels_last_converter',
-        'seperable_to_depthwise_and_conv',
+        'separable_to_depthwise_and_conv',
         'remove_transpose_before_flatten',
         'remove_nop_transpose',
         'remove_single_channel_transpose',
diff --git a/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py b/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py
index 38eef1e7d0..10840ec410 100644
--- a/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py
+++ b/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py
@@ -1,5 +1,5 @@
 """
-This optimizer converts a seperable convolution to a depthwise followed by a regular convolution.
+This optimizer converts a separable convolution to a depthwise followed by a regular convolution.
 For backends with a custom pointwise implementations the regular convolution will subsequently
 be converted to a pointwise convolution by a different optimizer.
 """
@@ -10,8 +10,8 @@
 from hls4ml.model.optimizer import OptimizerPass
 
 
-class SeperableToDepthwiseAndConv(OptimizerPass):
-    """Convert Seperable to DepthwiseConv + Conv (potentially later Pointwise)"""
+class SeparableToDepthwiseAndConv(OptimizerPass):
+    """Convert Separable to DepthwiseConv + Conv (potentially later Pointwise)"""
 
     _dw_attributes = (
         'in_width',
@@ -70,7 +70,7 @@ def transform(self, model, node):
             model.config.parse_name_config(dw_name, dw_layer_config)
 
         # creating the attributes
-        dw_attributes = {k: node.attributes[k] for k in SeperableToDepthwiseAndConv._dw_attributes if k in node.attributes}
+        dw_attributes = {k: node.attributes[k] for k in SeparableToDepthwiseAndConv._dw_attributes if k in node.attributes}
         dw_attributes['n_filt'] = dw_attributes['n_chan'] * dw_attributes['depth_multiplier']
         dw_attributes['use_bias'] = False
 
@@ -100,7 +100,7 @@ def transform(self, model, node):
             model.config.parse_name_config(pw_name, pw_layer_config)
 
         # creating the attributes
-        pw_attributes = {k: node.attributes[k] for k in SeperableToDepthwiseAndConv._pw_attributes if k in node.attributes}
+        pw_attributes = {k: node.attributes[k] for k in SeparableToDepthwiseAndConv._pw_attributes if k in node.attributes}
         pw_attributes['filt_width'] = 1
         pw_attributes['filt_height'] = 1
         pw_attributes['stride_width'] = 1
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d.h
new file mode 100644
index 0000000000..d2c774fcf8
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d.h
@@ -0,0 +1,19 @@
+#ifndef NNET_DEPTH_CONV1D_H_
+#define NNET_DEPTH_CONV1D_H_
+
+#include "nnet_common.h"
+#include "nnet_conv1d.h"
+#include "nnet_depthconv1d_resource.h"
+
+namespace nnet {
+
+template <class data_T, class res_T, typename CONFIG_T>
+void depthwise_conv_1d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+                          const typename CONFIG_T::bias_t &biases) {
+    // Entry point for the 1D depthwise convolution: forwards every argument to the resource implementation.
+    depthwise_conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d_resource.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d_resource.h
new file mode 100644
index 0000000000..c06b6b14e7
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d_resource.h
@@ -0,0 +1,60 @@
+#ifndef NNET_DEPTH_CONV1D_LATENCY_H_
+#define NNET_DEPTH_CONV1D_LATENCY_H_
+
+#include "nnet_common.h"
+#include "nnet_conv1d_resource.h"
+#include "nnet_mult.h"
+
+namespace nnet {
+
+template <class data_T, class res_T, typename CONFIG_T>
+void depthwise_conv_1d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+                                   const typename CONFIG_T::bias_t &biases) {
+    // 1D depthwise convolution: output filter (c * depth_multiplier + dm) reads input channel c only.
+    int depth_multiplier = CONFIG_T::n_filt / CONFIG_T::n_chan;
+    [[intel::fpga_register]] int res_idx = 0;
+    // One accumulator of type CONFIG_T::accum_t per output element; cast to the result type at the end.
+    [[intel::fpga_register]] typename CONFIG_T::accum_t acc[CONFIG_T::out_width * CONFIG_T::n_filt];
+
+DM_LOOP:
+    #pragma unroll
+    for (int dm = 0; dm < depth_multiplier; dm++) {
+
+    WIDTH_LOOP:
+        #pragma unroll
+        for (int w = 0; w < CONFIG_T::out_width; w++) {
+
+        CHAN_LOOP:
+            #pragma unroll
+            for (int c = 0; c < CONFIG_T::n_chan; c++) {
+                // Flat output index: position w, filter c * depth_multiplier + dm.
+                res_idx = (w * CONFIG_T::n_filt) + (c * depth_multiplier) + dm;
+                // Seed the accumulator with the bias of this output filter.
+                acc[res_idx] = biases[c * depth_multiplier + dm];
+
+            KERNEL_W_LOOP:
+                #pragma unroll
+                for (int kw = 0; kw < CONFIG_T::filt_width; kw++) {
+                    // Input position read by kernel tap kw, shifted back by the left padding.
+                    int w_in = w * CONFIG_T::stride_width + kw - CONFIG_T::pad_left;
+                    // Taps that land outside [0, in_width) are skipped.
+                    if ((w_in >= 0) && (w_in < CONFIG_T::in_width)) {
+                        // NOTE(review): weights are indexed as [dm][kw][c]; confirm layout for depth_multiplier > 1.
+                        acc[res_idx] += CONFIG_T::mult_config::
+                            template product<typename data_T::value_type, typename CONFIG_T::weight_t::value_type>::product(
+                                data[(w_in)*CONFIG_T::n_chan + c],
+                                weights[(dm * CONFIG_T::filt_width * CONFIG_T::n_chan) + (kw * CONFIG_T::n_chan) + c]);
+                    }
+                }
+            }
+        }
+    }
+    // Cast each accumulator to the result element type and store it.
+RESULT:
+    #pragma unroll
+    for (int ires = 0; ires < CONFIG_T::out_width * CONFIG_T::n_filt; ires++) {
+        res[ires] = cast<typename CONFIG_T::accum_t, typename res_T::value_type, CONFIG_T>(acc[ires]);
+    }
+}
+} // namespace nnet
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d.h
new file mode 100644
index 0000000000..87dc1805d9
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d.h
@@ -0,0 +1,19 @@
+#ifndef NNET_DEPTH_CONV2D_H_
+#define NNET_DEPTH_CONV2D_H_
+
+#include "nnet_common.h"
+#include "nnet_conv2d.h"
+#include "nnet_depthconv2d_resource.h"
+
+namespace nnet {
+
+template <class data_T, class res_T, typename CONFIG_T>
+void depthwise_conv_2d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+                          const typename CONFIG_T::bias_t &biases) {
+    // Entry point for the 2D depthwise convolution: forwards every argument to the resource implementation.
+    depthwise_conv_2d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d_resource.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d_resource.h
new file mode 100644
index 0000000000..91ddc28f65
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d_resource.h
@@ -0,0 +1,76 @@
+#ifndef NNET_SEPARABLE_CONV2D_LATENCY_H_
+#define NNET_SEPARABLE_CONV2D_LATENCY_H_
+
+#include "nnet_common.h"
+#include "nnet_conv2d_resource.h"
+#include "nnet_mult.h"
+
+namespace nnet {
+
+template <class data_T, class res_T, typename CONFIG_T>
+void depthwise_conv_2d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+                                   const typename CONFIG_T::bias_t &biases) {
+    // 2D depthwise convolution: output filter (c * depth_multiplier + dm) reads input channel c only.
+    int depth_multiplier = CONFIG_T::n_filt / CONFIG_T::n_chan;
+    [[intel::fpga_register]] int res_idx = 0;
+    // One accumulator of type CONFIG_T::accum_t per output element; cast to the result type at the end.
+    [[intel::fpga_register]] typename CONFIG_T::accum_t acc[CONFIG_T::out_width * CONFIG_T::out_height * CONFIG_T::n_filt];
+
+DM_LOOP:
+    #pragma unroll
+    for (int dm = 0; dm < depth_multiplier; dm++) {
+
+    HEIGHT_LOOP:
+        #pragma unroll
+        for (int h = 0; h < CONFIG_T::out_height; h++) {
+        WIDTH_LOOP:
+            #pragma unroll
+            for (int w = 0; w < CONFIG_T::out_width; w++) {
+
+            CHAN_LOOP:
+                #pragma unroll
+                for (int c = 0; c < CONFIG_T::n_chan; c++) {
+                    // Flat output index: row h, column w, filter c * depth_multiplier + dm.
+                    res_idx =
+                        (h * CONFIG_T::out_width * CONFIG_T::n_filt) + (w * CONFIG_T::n_filt) + (c * depth_multiplier) + dm;
+                    // Seed the accumulator with the bias of this output filter.
+                    acc[res_idx] = biases[c * depth_multiplier + dm];
+
+                KERNEL_H_LOOP:
+                    #pragma unroll
+                    for (int kh = 0; kh < CONFIG_T::filt_height; kh++) {
+                    KERNEL_W_LOOP:
+                        #pragma unroll
+                        for (int kw = 0; kw < CONFIG_T::filt_width; kw++) {
+                            // Input coordinates read by kernel tap (kh, kw), shifted back by the top/left padding.
+                            int h_in = h * CONFIG_T::stride_height + kh - CONFIG_T::pad_top;
+                            int w_in = w * CONFIG_T::stride_width + kw - CONFIG_T::pad_left;
+                            // Taps that land outside the input height/width bounds are skipped.
+                            if ((h_in >= 0) && (h_in < CONFIG_T::in_height) && (w_in >= 0) && (w_in < CONFIG_T::in_width)) {
+                                // NOTE(review): weights are indexed as [dm][kh][kw][c]; confirm for depth_multiplier > 1.
+                                acc[res_idx] +=
+                                    CONFIG_T::mult_config::template product<typename data_T::value_type,
+                                                                            typename CONFIG_T::weight_t::value_type>::
+                                        product(
+                                            data[(h_in)*CONFIG_T::in_width * CONFIG_T::n_chan + (w_in)*CONFIG_T::n_chan + c],
+                                            weights[(dm * CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan) +
+                                                    (kh * CONFIG_T::filt_width * CONFIG_T::n_chan) +
+                                                    (kw * CONFIG_T::n_chan) + c]);
+                                // NOTE(review): the lone ';' on the next line is an empty statement.
+                                ;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    // Cast each accumulator to the result element type and store it.
+RESULT:
+    #pragma unroll
+    for (int ires = 0; ires < CONFIG_T::out_width * CONFIG_T::out_height * CONFIG_T::n_filt; ires++) {
+        res[ires] = cast<typename CONFIG_T::accum_t, typename res_T::value_type, CONFIG_T>(acc[ires]);
+    }
+}
+} // namespace nnet
+#endif
diff --git a/test/pytest/test_depthconv1d.py b/test/pytest/test_depthconv1d.py
index 3734815af0..85c8e2ac4f 100644
--- a/test/pytest/test_depthconv1d.py
+++ b/test/pytest/test_depthconv1d.py
@@ -23,6 +23,7 @@
 @pytest.mark.parametrize(
     'backend, io_type',
     [
+        ('oneAPI', 'io_parallel'),
         ('Vivado', 'io_parallel'),
         ('Vitis', 'io_parallel'),
         ('Vivado', 'io_stream'),
diff --git a/test/pytest/test_depthconv2d.py b/test/pytest/test_depthconv2d.py
index 9178edf368..4832cb1ae9 100644
--- a/test/pytest/test_depthconv2d.py
+++ b/test/pytest/test_depthconv2d.py
@@ -24,6 +24,7 @@
 @pytest.mark.parametrize(
     'backend, io_type',
     [
+        ('oneAPI', 'io_parallel'),
         ('Vivado', 'io_parallel'),
         ('Vitis', 'io_parallel'),
         ('Vivado', 'io_stream'),
diff --git a/test/pytest/test_sepconv1d.py b/test/pytest/test_sepconv1d.py
index 64312e9932..aef24db040 100644
--- a/test/pytest/test_sepconv1d.py
+++ b/test/pytest/test_sepconv1d.py
@@ -23,6 +23,7 @@
 @pytest.mark.parametrize(
     'backend, io_type',
     [
+        ('oneAPI', 'io_parallel'),
         ('Vivado', 'io_parallel'),
         ('Vitis', 'io_parallel'),
         ('Vivado', 'io_stream'),
diff --git a/test/pytest/test_sepconv2d.py b/test/pytest/test_sepconv2d.py
index 4732c7c7f1..1d056f15c9 100644
--- a/test/pytest/test_sepconv2d.py
+++ b/test/pytest/test_sepconv2d.py
@@ -23,6 +23,7 @@
 @pytest.mark.parametrize(
     'backend, io_type',
     [
+        ('oneAPI', 'io_parallel'),
         ('Vivado', 'io_parallel'),
         ('Vitis', 'io_parallel'),
         ('Vivado', 'io_stream'),