From 47939cb3bbbde373b5fa185e31a6c4902726e038 Mon Sep 17 00:00:00 2001 From: laurilaatu Date: Wed, 18 Dec 2024 16:07:19 +0000 Subject: [PATCH] Depthwise convolution for oneAPI (#1131) * snapshot adding oneapi * fix reduce constexpr * further updates * update the bridge and testbench * fix issues discovered when compiling * update bridge writing files * build library (but not tested) * fix a bug in testbench * snapshot after some debugging * remove forgotten debug printing * add build * pre-commit fixes * fix more pre-commit * fix more pre-commit errors * snapshot of work before reworking types * Use using to decide array type, some preliminary updates * snapshot unifying types * fix the testbench and bridge * snapshot updating nnet_utils (not finished) * define array in nnet_types for oneAPI * fix parallel conv2d * add back the streaming versions of algs, most unconverted * tentatively complete streaming for dense but not functional * first version that compiles streaming * change how the pipe value type is extracted * fix pre-commit error * always treat elu as ELU class * fix batchnorm * snapshot towards fixing conv * snapshot fixing test for streaming * fix conv1d * fix conv2d * fix reshape and flatten for oneAPI * initial oneAPI tests * remove nnet_dense_compressed from oneAPI * add merge functionality (untested) * fix merge for oneAPI * fix merge for oneAPI (missing commit) * add zeropadding * standardize parallelization spelling * fix pointwise for oneAPI * remove references to quartus * more replace quartus with oneapi * snapshot on the way towards implementing pooling * fix io_stream pooling for oneAPI * add fix for Conv2DBatchnorm * accidentally committed CMakeLists.txt in my debug setup * reshaping, not fully tested * fix cloning of streams * fix pytest library loading * remove unused template * fix some activation bugs * fix the overwriting of directories in the pytest * update version of test repository * try to fix docker issue * bump hls4ml-testing tag
to 0.5.2 * try not restricting tensorflow-model-optimization * Update to 0.5.3 for testing * bump to docker image 0.5.4, suggested by Ben * fix pre-commit warning * dial down N_TESTS_PER_YAML to 4 * revert tensorflow-model-optimization change * fix issue of saving in "obsolete" h5 format * fix embedding for oneAPI * First attempt at adding RNNs to oneAPI * fix bug in array size * fix order or indices * make queues static in bridge * fix logic error in repack stream * changing the style, but functionally identical * update pointwise optimizer for oneAPI * add oneAPI to test_multi_dense.py * fix updating weight types * initial changes of templates, for testing * fix weight naming, product selection * make im2col the default; fix winograd size * fix up streaming dense and convolution * fix prelu, some batchnorm * fix weight array of exponential types * move ACExponentialPrecisionDefinition to oneapi_types * attempt to fix batchnorm and recurrent * fixed BatchNormalizationQuantizedTanhConfigTemplate template selection * fix embedding_stream * fix lstm and simple rnn * fix GRU * fix winograd, and also disable it by default * fix threshold name * split bn_quant to be backend-specific * add type inference to oneAPI * add oneAPI to pytorch tests * fix pooling with padding for oneAPI and Quartus * Compilation for larger models enabled by increasing -fconstexpr-steps * add oneapi clone tests; remove redundant multi_clone test * remove some attributes to avoid overwrite warnings * make extra handling for oneAPI like others (as in PR #1067) * remove warnings for extra optimizers that are not scheduled on purpose * update parametrized activations * initial depthconv2d implementation * initial depthconv2d implementation * Rename to depthconv, add strides and add tests * Remove class for DepthwiseConv2D * Remove Separable convolution template * Remove layer optimizer for sepconv * Loop unroll * Pre-commit format * Fix spelling * depthconv1d, channel order in loop, product * Gather
result to accum --------- Co-authored-by: Jovan Mitrevski Co-authored-by: Jovan Mitrevski --- hls4ml/backends/fpga/fpga_backend.py | 2 +- .../oneapi/passes/convolution_templates.py | 40 +++++++++- hls4ml/model/optimizer/__init__.py | 2 +- .../optimizer/passes/seperable_to_dw_conv.py | 10 +-- .../firmware/nnet_utils/nnet_depthconv1d.h | 19 +++++ .../nnet_utils/nnet_depthconv1d_resource.h | 60 +++++++++++++++ .../firmware/nnet_utils/nnet_depthconv2d.h | 19 +++++ .../nnet_utils/nnet_depthconv2d_resource.h | 76 +++++++++++++++++++ test/pytest/test_depthconv1d.py | 1 + test/pytest/test_depthconv2d.py | 1 + test/pytest/test_sepconv1d.py | 1 + test/pytest/test_sepconv2d.py | 1 + 12 files changed, 222 insertions(+), 10 deletions(-) create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d_resource.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d_resource.h diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py index fbfed71c5b..b20fdf1228 100644 --- a/hls4ml/backends/fpga/fpga_backend.py +++ b/hls4ml/backends/fpga/fpga_backend.py @@ -94,7 +94,7 @@ def __init__(self, name): attrs.append(ConfigurableAttribute('reuse_factor', default=1, description=descriptions.reuse_factor)) self.attribute_map[layer] = attrs - # seperable is kind of special because it is effectively two layers that will be split + # separable is kind of special because it is effectively two layers that will be split for layer in (SeparableConv1D, SeparableConv2D): attrs = self.attribute_map.get(layer, []) attrs.append(TypeAttribute('depthwise_accum')) diff --git a/hls4ml/backends/oneapi/passes/convolution_templates.py b/hls4ml/backends/oneapi/passes/convolution_templates.py index 17154559d8..64d9e42228 100644 --- 
a/hls4ml/backends/oneapi/passes/convolution_templates.py +++ b/hls4ml/backends/oneapi/passes/convolution_templates.py @@ -1,7 +1,7 @@ from hls4ml.backends.backend import get_backend from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate -from hls4ml.model.layers import Conv1D, Conv2D, Conv2DBatchnorm +from hls4ml.model.layers import Conv1D, Conv2D, Conv2DBatchnorm, DepthwiseConv1D, DepthwiseConv2D # TODO - Dilation rate ? @@ -70,9 +70,20 @@ conv1d_include_list = ['nnet_utils/nnet_conv1d.h', 'nnet_utils/nnet_conv1d_stream.h'] +depthconv1d_function_template = ( + 'nnet::depthwise_conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +) +depthconv1d_include_list = [ + 'nnet_utils/nnet_conv1d.h', + 'nnet_utils/nnet_conv1d_resource.h', + 'nnet_utils/nnet_depthconv1d.h', + 'nnet_utils/nnet_depthconv1d_resource.h', +] + + class Conv1DConfigTemplate(LayerConfigTemplate): def __init__(self): - super().__init__(Conv1D) + super().__init__((Conv1D, DepthwiseConv1D)) self.template = conv1d_config_template self.mult_template = conv_mult_config_template @@ -137,6 +148,12 @@ def format(self, node): return self.template.format(**params) +class DepthwiseConv1DFunctionTemplate(Conv1DFunctionTemplate): + def __init__(self): + super(Conv1DFunctionTemplate, self).__init__(DepthwiseConv1D, include_header=depthconv1d_include_list) + self.template = depthconv1d_function_template + + ''' 2D Conv ''' conv2d_config_template = """struct config{index} : nnet::conv2d_config {{ static const unsigned in_height = {in_height}; @@ -183,7 +200,7 @@ def format(self, node): class Conv2DConfigTemplate(LayerConfigTemplate): def __init__(self): - super().__init__((Conv2D, Conv2DBatchnorm)) + super().__init__((Conv2D, Conv2DBatchnorm, DepthwiseConv2D)) self.template = conv2d_config_template self.mult_template = conv_mult_config_template @@ -233,3 +250,20 
@@ def format(self, node): raise RuntimeError('channels_first not supported on oneAPI') params['data_format'] = 'cl' return self.template.format(**params) + + +depthconv2d_function_template = ( + 'nnet::depthwise_conv_2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +) +depthconv2d_include_list = [ + 'nnet_utils/nnet_conv2d.h', + 'nnet_utils/nnet_conv2d_resource.h', + 'nnet_utils/nnet_depthconv2d.h', + 'nnet_utils/nnet_depthconv2d_resource.h', +] + + +class DepthwiseConv2DFunctionTemplate(Conv2DFunctionTemplate): + def __init__(self): + super(Conv2DFunctionTemplate, self).__init__(DepthwiseConv2D, include_header=depthconv2d_include_list) + self.template = depthconv2d_function_template diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index a745eceba1..7e9325ccd0 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -59,7 +59,7 @@ 'convert', [ 'channels_last_converter', - 'seperable_to_depthwise_and_conv', + 'separable_to_depthwise_and_conv', 'remove_transpose_before_flatten', 'remove_nop_transpose', 'remove_single_channel_transpose', diff --git a/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py b/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py index 38eef1e7d0..10840ec410 100644 --- a/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py +++ b/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py @@ -1,5 +1,5 @@ """ -This optimizer converts a seperable convolution to a depthwise followed by a regular convolution. +This optimizer converts a separable convolution to a depthwise followed by a regular convolution. For backends with a custom pointwise implementations the regular convolution will subsequently be converted to a pointwise convolution by a different optimizer. 
""" @@ -10,8 +10,8 @@ from hls4ml.model.optimizer import OptimizerPass -class SeperableToDepthwiseAndConv(OptimizerPass): - """Convert Seperable to DepthwiseConv + Conv (potentially later Pointwise)""" +class SeparableToDepthwiseAndConv(OptimizerPass): + """Convert Separable to DepthwiseConv + Conv (potentially later Pointwise)""" _dw_attributes = ( 'in_width', @@ -70,7 +70,7 @@ def transform(self, model, node): model.config.parse_name_config(dw_name, dw_layer_config) # creating the attributes - dw_attributes = {k: node.attributes[k] for k in SeperableToDepthwiseAndConv._dw_attributes if k in node.attributes} + dw_attributes = {k: node.attributes[k] for k in SeparableToDepthwiseAndConv._dw_attributes if k in node.attributes} dw_attributes['n_filt'] = dw_attributes['n_chan'] * dw_attributes['depth_multiplier'] dw_attributes['use_bias'] = False @@ -100,7 +100,7 @@ def transform(self, model, node): model.config.parse_name_config(pw_name, pw_layer_config) # creating the attributes - pw_attributes = {k: node.attributes[k] for k in SeperableToDepthwiseAndConv._pw_attributes if k in node.attributes} + pw_attributes = {k: node.attributes[k] for k in SeparableToDepthwiseAndConv._pw_attributes if k in node.attributes} pw_attributes['filt_width'] = 1 pw_attributes['filt_height'] = 1 pw_attributes['stride_width'] = 1 diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d.h new file mode 100644 index 0000000000..d2c774fcf8 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d.h @@ -0,0 +1,19 @@ +#ifndef NNET_DEPTH_CONV1D_H_ +#define NNET_DEPTH_CONV1D_H_ + +#include "nnet_common.h" +#include "nnet_conv1d.h" +#include "nnet_depthconv1d_resource.h" + +namespace nnet { + +template +void depthwise_conv_1d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + + depthwise_conv_1d_resource_cl(data, res, 
weights, biases); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d_resource.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d_resource.h new file mode 100644 index 0000000000..c06b6b14e7 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d_resource.h @@ -0,0 +1,60 @@ +#ifndef NNET_DEPTH_CONV1D_LATENCY_H_ +#define NNET_DEPTH_CONV1D_LATENCY_H_ + +#include "nnet_common.h" +#include "nnet_conv1d_resource.h" +#include "nnet_mult.h" + +namespace nnet { + +template +void depthwise_conv_1d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + + int depth_multiplier = CONFIG_T::n_filt / CONFIG_T::n_chan; + [[intel::fpga_register]] int res_idx = 0; + + [[intel::fpga_register]] typename CONFIG_T::accum_t acc[CONFIG_T::out_width * CONFIG_T::n_filt]; + +DM_LOOP: + #pragma unroll + for (int dm = 0; dm < depth_multiplier; dm++) { + + WIDTH_LOOP: + #pragma unroll + for (int w = 0; w < CONFIG_T::out_width; w++) { + + CHAN_LOOP: + #pragma unroll + for (int c = 0; c < CONFIG_T::n_chan; c++) { + + res_idx = (w * CONFIG_T::n_filt) + (c * depth_multiplier) + dm; + + acc[res_idx] = biases[c * depth_multiplier + dm]; + + KERNEL_W_LOOP: + #pragma unroll + for (int kw = 0; kw < CONFIG_T::filt_width; kw++) { + + int w_in = w * CONFIG_T::stride_width + kw - CONFIG_T::pad_left; + + if ((w_in >= 0) && (w_in < CONFIG_T::in_width)) { + + acc[res_idx] += CONFIG_T::mult_config:: + template product::product( + data[(w_in)*CONFIG_T::n_chan + c], + weights[(dm * CONFIG_T::filt_width * CONFIG_T::n_chan) + (kw * CONFIG_T::n_chan) + c]); + } + } + } + } + } + +RESULT: + #pragma unroll + for (int ires = 0; ires < CONFIG_T::out_width * CONFIG_T::n_filt; ires++) { + res[ires] = cast(acc[ires]); + } +} +} // namespace nnet +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d.h 
b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d.h new file mode 100644 index 0000000000..87dc1805d9 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d.h @@ -0,0 +1,19 @@ +#ifndef NNET_DEPTH_CONV2D_H_ +#define NNET_DEPTH_CONV2D_H_ + +#include "nnet_common.h" +#include "nnet_conv2d.h" +#include "nnet_depthconv2d_resource.h" + +namespace nnet { + +template +void depthwise_conv_2d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + + depthwise_conv_2d_resource_cl(data, res, weights, biases); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d_resource.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d_resource.h new file mode 100644 index 0000000000..91ddc28f65 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d_resource.h @@ -0,0 +1,76 @@ +#ifndef NNET_SEPARABLE_CONV2D_LATENCY_H_ +#define NNET_SEPARABLE_CONV2D_LATENCY_H_ + +#include "nnet_common.h" +#include "nnet_conv2d_resource.h" +#include "nnet_mult.h" + +namespace nnet { + +template +void depthwise_conv_2d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + + int depth_multiplier = CONFIG_T::n_filt / CONFIG_T::n_chan; + [[intel::fpga_register]] int res_idx = 0; + + [[intel::fpga_register]] typename CONFIG_T::accum_t acc[CONFIG_T::out_width * CONFIG_T::out_height * CONFIG_T::n_filt]; + +DM_LOOP: + #pragma unroll + for (int dm = 0; dm < depth_multiplier; dm++) { + + HEIGHT_LOOP: + #pragma unroll + for (int h = 0; h < CONFIG_T::out_height; h++) { + WIDTH_LOOP: + #pragma unroll + for (int w = 0; w < CONFIG_T::out_width; w++) { + + CHAN_LOOP: + #pragma unroll + for (int c = 0; c < CONFIG_T::n_chan; c++) { + + res_idx = + (h * CONFIG_T::out_width * CONFIG_T::n_filt) + (w * CONFIG_T::n_filt) + (c * depth_multiplier) + dm; 
+ + acc[res_idx] = biases[c * depth_multiplier + dm]; + + KERNEL_H_LOOP: + #pragma unroll + for (int kh = 0; kh < CONFIG_T::filt_height; kh++) { + KERNEL_W_LOOP: + #pragma unroll + for (int kw = 0; kw < CONFIG_T::filt_width; kw++) { + + int h_in = h * CONFIG_T::stride_height + kh - CONFIG_T::pad_top; + int w_in = w * CONFIG_T::stride_width + kw - CONFIG_T::pad_left; + + if ((h_in >= 0) && (h_in < CONFIG_T::in_height) && (w_in >= 0) && (w_in < CONFIG_T::in_width)) { + + acc[res_idx] += + CONFIG_T::mult_config::template product:: + product( + data[(h_in)*CONFIG_T::in_width * CONFIG_T::n_chan + (w_in)*CONFIG_T::n_chan + c], + weights[(dm * CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan) + + (kh * CONFIG_T::filt_width * CONFIG_T::n_chan) + + (kw * CONFIG_T::n_chan) + c]); + + ; + } + } + } + } + } + } + } + +RESULT: + #pragma unroll + for (int ires = 0; ires < CONFIG_T::out_width * CONFIG_T::out_height * CONFIG_T::n_filt; ires++) { + res[ires] = cast(acc[ires]); + } +} +} // namespace nnet +#endif diff --git a/test/pytest/test_depthconv1d.py b/test/pytest/test_depthconv1d.py index 3734815af0..85c8e2ac4f 100644 --- a/test/pytest/test_depthconv1d.py +++ b/test/pytest/test_depthconv1d.py @@ -23,6 +23,7 @@ @pytest.mark.parametrize( 'backend, io_type', [ + ('oneAPI', 'io_parallel'), ('Vivado', 'io_parallel'), ('Vitis', 'io_parallel'), ('Vivado', 'io_stream'), diff --git a/test/pytest/test_depthconv2d.py b/test/pytest/test_depthconv2d.py index 9178edf368..4832cb1ae9 100644 --- a/test/pytest/test_depthconv2d.py +++ b/test/pytest/test_depthconv2d.py @@ -24,6 +24,7 @@ @pytest.mark.parametrize( 'backend, io_type', [ + ('oneAPI', 'io_parallel'), ('Vivado', 'io_parallel'), ('Vitis', 'io_parallel'), ('Vivado', 'io_stream'), diff --git a/test/pytest/test_sepconv1d.py b/test/pytest/test_sepconv1d.py index 64312e9932..aef24db040 100644 --- a/test/pytest/test_sepconv1d.py +++ b/test/pytest/test_sepconv1d.py @@ -23,6 +23,7 @@ @pytest.mark.parametrize( 'backend, 
io_type', [ + ('oneAPI', 'io_parallel'), ('Vivado', 'io_parallel'), ('Vitis', 'io_parallel'), ('Vivado', 'io_stream'), diff --git a/test/pytest/test_sepconv2d.py b/test/pytest/test_sepconv2d.py index 4732c7c7f1..1d056f15c9 100644 --- a/test/pytest/test_sepconv2d.py +++ b/test/pytest/test_sepconv2d.py @@ -23,6 +23,7 @@ @pytest.mark.parametrize( 'backend, io_type', [ + ('oneAPI', 'io_parallel'), ('Vivado', 'io_parallel'), ('Vitis', 'io_parallel'), ('Vivado', 'io_stream'),