From ea5c5a86fb2d353a90eb1300824b6529888b26d8 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Wed, 14 Jun 2023 08:47:30 -0700 Subject: [PATCH 01/10] merge --- hls4ml/templates/vivado/build_prj.tcl | 2 +- .../templates/vivado/nnet_utils/nnet_common.h | 1 + .../templates/vivado/nnet_utils/nnet_conv1d.h | 16 +- .../vivado/nnet_utils/nnet_conv1d_latency.h | 221 ++++++++++++++++++ .../vivado/nnet_utils/nnet_conv_stream.h | 2 - 5 files changed, 237 insertions(+), 5 deletions(-) diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl index d34337c573..6383b910ca 100644 --- a/hls4ml/templates/vivado/build_prj.tcl +++ b/hls4ml/templates/vivado/build_prj.tcl @@ -161,7 +161,7 @@ if {$opt(reset)} { } else { open_solution "solution1" } -catch {config_array_partition -maximum_size 4096} +catch {config_array_partition -maximum_size 8192} config_compile -name_max_length 80 set_part $part config_schedule -enable_dsp_full_reg=false diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h b/hls4ml/templates/vivado/nnet_utils/nnet_common.h index fed0395a1a..b6582e1406 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_common.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_common.h @@ -24,6 +24,7 @@ namespace nnet { // Common type definitions enum io_type { io_parallel = 0, io_stream }; enum strategy { latency, resource }; +enum class conv_implementation { linebuffer=0, encoded=1, pointwise=2}; /* --- * Balanced tree reduce implementation. diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h index e2e0211b49..c2990ea97a 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h @@ -53,9 +53,21 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], #pragma HLS INLINE region - // Nothing special to be done for io_parallel implementation if (CONFIG_T::strategy == nnet::latency) { - conv_1d_latency_cl(data, res, weights, biases); + if (CONFIG_T::implementation == conv_implementation::pointwise){ + // Use pointwise unrolled implementation + if (CONFIG_T::reuse_factor > 1 && CONFIG_T::reuse_factor <= 120) { + pointwise_conv_1d_latency_cl_split_by_rf(data, res, weights, biases); + } + else { + assert(CONFIG_T::reuse_factor == 1); + pointwise_conv_1d_latency_cl(data, res, weights, biases); + } + } + else { + // Use standard unrolled implementation + conv_1d_resource_cl(data, res, weights, biases); + } } else { conv_1d_resource_cl(data, res, weights, biases); } diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 0d9afb10cb..8549ae9add 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -84,5 +84,226 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } } +template +void pointwise_conv_1d_latency_cl( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan/CONFIG_T::reuse_factor], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::filt_width == 1); + + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan/CONFIG_T::reuse_factor]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_width/CONFIG_T::reuse_factor][CONFIG_T::n_filt]; + + #pragma HLS 
ARRAY_PARTITION variable=mult complete dim=0 + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + #pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + //const int multiplier_limit = compute_multiplier_limit(weights); + //#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + + // Convolve, saving all multiplication results to accumulate later + ConvOut: for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { + ConvFilt: for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii*CONFIG_T::n_filt*CONFIG_T::n_chan + ff*CONFIG_T::n_chan + cc; + int index_weight = cc*CONFIG_T::n_filt + ff; + int index_data = (ii*CONFIG_T::stride_width-CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if((ii*CONFIG_T::stride_width) < CONFIG_T::pad_left || (ii*CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)){ + mult[index_mult] = 0; + } + else { + mult[index_mult] = data[index_data] * weights[index_weight]; + } + }//end channel loop + }//end filter loop + }//end output loop + + + // Initialize accumulator with input biases + for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { + for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + acc[ii][ff]=biases[ff]; + } + } + + + // Accumulate multiplication result + AccumOut: for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { + AccumFilt: for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + //Do "dot product" sum within filter and sum over channels + AccumChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii*CONFIG_T::n_filt*CONFIG_T::n_chan + ff*CONFIG_T::n_chan + cc; + acc[ii][ff] += mult[index_mult]; + }//end channel loop + }//end filter loop + }//end output loop + + + // Cast to "res_t" type + for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { + for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); + } + } +} + +template void pointwise_conv_1d_latency_cl_split_by_rf( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + + data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor]; + #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0 + res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width*CONFIG_T::n_filt/CONFIG_T::reuse_factor]; + #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0 + + for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + for(int ii = 0; ii < CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor; ii++) { + #pragma HLS UNROLL + data_tmp[jj][ii] = data[jj*CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor+ii]; + } + } + + pointwise_conv_1d_latency_cl(data_tmp[0], res_tmp[0], weights, biases); + pointwise_conv_1d_latency_cl(data_tmp[1], res_tmp[1], weights, biases); + if (CONFIG_T::reuse_factor > 2) pointwise_conv_1d_latency_cl(data_tmp[2], res_tmp[2], weights, biases); + if (CONFIG_T::reuse_factor > 3) pointwise_conv_1d_latency_cl(data_tmp[3], res_tmp[3], 
weights, biases); + if (CONFIG_T::reuse_factor > 4) pointwise_conv_1d_latency_cl(data_tmp[4], res_tmp[4], weights, biases); + if (CONFIG_T::reuse_factor > 5) pointwise_conv_1d_latency_cl(data_tmp[5], res_tmp[5], weights, biases); + if (CONFIG_T::reuse_factor > 6) pointwise_conv_1d_latency_cl(data_tmp[6], res_tmp[6], weights, biases); + if (CONFIG_T::reuse_factor > 7) pointwise_conv_1d_latency_cl(data_tmp[7], res_tmp[7], weights, biases); + if (CONFIG_T::reuse_factor > 8) pointwise_conv_1d_latency_cl(data_tmp[8], res_tmp[8], weights, biases); + if (CONFIG_T::reuse_factor > 9) pointwise_conv_1d_latency_cl(data_tmp[9], res_tmp[9], weights, biases); + if (CONFIG_T::reuse_factor > 10) pointwise_conv_1d_latency_cl(data_tmp[10], res_tmp[10], weights, biases); + if (CONFIG_T::reuse_factor > 11) pointwise_conv_1d_latency_cl(data_tmp[11], res_tmp[11], weights, biases); + if (CONFIG_T::reuse_factor > 12) pointwise_conv_1d_latency_cl(data_tmp[12], res_tmp[12], weights, biases); + if (CONFIG_T::reuse_factor > 13) pointwise_conv_1d_latency_cl(data_tmp[13], res_tmp[13], weights, biases); + if (CONFIG_T::reuse_factor > 14) pointwise_conv_1d_latency_cl(data_tmp[14], res_tmp[14], weights, biases); + if (CONFIG_T::reuse_factor > 15) pointwise_conv_1d_latency_cl(data_tmp[15], res_tmp[15], weights, biases); + if (CONFIG_T::reuse_factor > 16) pointwise_conv_1d_latency_cl(data_tmp[16], res_tmp[16], weights, biases); + if (CONFIG_T::reuse_factor > 17) pointwise_conv_1d_latency_cl(data_tmp[17], res_tmp[17], weights, biases); + if (CONFIG_T::reuse_factor > 18) pointwise_conv_1d_latency_cl(data_tmp[18], res_tmp[18], weights, biases); + if (CONFIG_T::reuse_factor > 19) pointwise_conv_1d_latency_cl(data_tmp[19], res_tmp[19], weights, biases); + if (CONFIG_T::reuse_factor > 20) pointwise_conv_1d_latency_cl(data_tmp[20], res_tmp[20], weights, biases); + if (CONFIG_T::reuse_factor > 21) pointwise_conv_1d_latency_cl(data_tmp[21], res_tmp[21], weights, biases); + if (CONFIG_T::reuse_factor > 22) pointwise_conv_1d_latency_cl(data_tmp[22], res_tmp[22], weights, biases); + if (CONFIG_T::reuse_factor > 23) pointwise_conv_1d_latency_cl(data_tmp[23], res_tmp[23], weights, biases); + if (CONFIG_T::reuse_factor > 24) pointwise_conv_1d_latency_cl(data_tmp[24], res_tmp[24], weights, biases); + if (CONFIG_T::reuse_factor > 25) pointwise_conv_1d_latency_cl(data_tmp[25], res_tmp[25], weights, biases); + if (CONFIG_T::reuse_factor > 26) pointwise_conv_1d_latency_cl(data_tmp[26], res_tmp[26], weights, biases); + if (CONFIG_T::reuse_factor > 27) pointwise_conv_1d_latency_cl(data_tmp[27], res_tmp[27], weights, biases); + if (CONFIG_T::reuse_factor > 28) pointwise_conv_1d_latency_cl(data_tmp[28], res_tmp[28], weights, biases); + if (CONFIG_T::reuse_factor > 29) pointwise_conv_1d_latency_cl(data_tmp[29], res_tmp[29], weights, biases); + if (CONFIG_T::reuse_factor > 30) pointwise_conv_1d_latency_cl(data_tmp[30], res_tmp[30], weights, biases); + if (CONFIG_T::reuse_factor > 31) pointwise_conv_1d_latency_cl(data_tmp[31], res_tmp[31], weights, biases); + if (CONFIG_T::reuse_factor > 32) pointwise_conv_1d_latency_cl(data_tmp[32], res_tmp[32], weights, biases); + if (CONFIG_T::reuse_factor > 33) pointwise_conv_1d_latency_cl(data_tmp[33], res_tmp[33], weights, biases); + if (CONFIG_T::reuse_factor > 34) pointwise_conv_1d_latency_cl(data_tmp[34], res_tmp[34], weights, biases); + if (CONFIG_T::reuse_factor > 35) pointwise_conv_1d_latency_cl(data_tmp[35], res_tmp[35], weights, biases); + if (CONFIG_T::reuse_factor > 36) 
pointwise_conv_1d_latency_cl(data_tmp[36], res_tmp[36], weights, biases);
    if (CONFIG_T::reuse_factor > 37) pointwise_conv_1d_latency_cl(data_tmp[37], res_tmp[37], weights, biases);
    if (CONFIG_T::reuse_factor > 38) pointwise_conv_1d_latency_cl(data_tmp[38], res_tmp[38], weights, biases);
    if (CONFIG_T::reuse_factor > 39) pointwise_conv_1d_latency_cl(data_tmp[39], res_tmp[39], weights, biases);
    if (CONFIG_T::reuse_factor > 40) pointwise_conv_1d_latency_cl(data_tmp[40], res_tmp[40], weights, biases);
    if (CONFIG_T::reuse_factor > 41) pointwise_conv_1d_latency_cl(data_tmp[41], res_tmp[41], weights, biases);
    if (CONFIG_T::reuse_factor > 42) pointwise_conv_1d_latency_cl(data_tmp[42], res_tmp[42], weights, biases);
    if (CONFIG_T::reuse_factor > 43) pointwise_conv_1d_latency_cl(data_tmp[43], res_tmp[43], weights, biases);
    if (CONFIG_T::reuse_factor > 44) pointwise_conv_1d_latency_cl(data_tmp[44], res_tmp[44], weights, biases);
    if (CONFIG_T::reuse_factor > 45) pointwise_conv_1d_latency_cl(data_tmp[45], res_tmp[45], weights, biases);
    if (CONFIG_T::reuse_factor > 46) pointwise_conv_1d_latency_cl(data_tmp[46], res_tmp[46], weights, biases);
    if (CONFIG_T::reuse_factor > 47) pointwise_conv_1d_latency_cl(data_tmp[47], res_tmp[47], weights, biases);
    if (CONFIG_T::reuse_factor > 48) pointwise_conv_1d_latency_cl(data_tmp[48], res_tmp[48], weights, biases);
    if (CONFIG_T::reuse_factor > 49) pointwise_conv_1d_latency_cl(data_tmp[49], res_tmp[49], weights, biases);
    if (CONFIG_T::reuse_factor > 50) pointwise_conv_1d_latency_cl(data_tmp[50], res_tmp[50], weights, biases);
    if (CONFIG_T::reuse_factor > 51) pointwise_conv_1d_latency_cl(data_tmp[51], res_tmp[51], weights, biases);
    if (CONFIG_T::reuse_factor > 52) pointwise_conv_1d_latency_cl(data_tmp[52], res_tmp[52], weights, biases);
    if (CONFIG_T::reuse_factor > 53) pointwise_conv_1d_latency_cl(data_tmp[53], res_tmp[53], weights, biases);
    if (CONFIG_T::reuse_factor > 54) pointwise_conv_1d_latency_cl(data_tmp[54], res_tmp[54], weights, biases);
    if (CONFIG_T::reuse_factor > 55) pointwise_conv_1d_latency_cl(data_tmp[55], res_tmp[55], weights, biases);
    if (CONFIG_T::reuse_factor > 56) pointwise_conv_1d_latency_cl(data_tmp[56], res_tmp[56], weights, biases);
    if (CONFIG_T::reuse_factor > 57) pointwise_conv_1d_latency_cl(data_tmp[57], res_tmp[57], weights, biases);
    if (CONFIG_T::reuse_factor > 58) pointwise_conv_1d_latency_cl(data_tmp[58], res_tmp[58], weights, biases);
    if (CONFIG_T::reuse_factor > 59) pointwise_conv_1d_latency_cl(data_tmp[59], res_tmp[59], weights, biases);
    if (CONFIG_T::reuse_factor > 60) pointwise_conv_1d_latency_cl(data_tmp[60], res_tmp[60], weights, biases);
    if (CONFIG_T::reuse_factor > 61) pointwise_conv_1d_latency_cl(data_tmp[61], res_tmp[61], weights, biases);
    if (CONFIG_T::reuse_factor > 62) pointwise_conv_1d_latency_cl(data_tmp[62], res_tmp[62], weights, biases);
    if (CONFIG_T::reuse_factor > 63) pointwise_conv_1d_latency_cl(data_tmp[63], res_tmp[63], weights, biases);
    if (CONFIG_T::reuse_factor > 64) pointwise_conv_1d_latency_cl(data_tmp[64], res_tmp[64], weights, biases);
    if (CONFIG_T::reuse_factor > 65) pointwise_conv_1d_latency_cl(data_tmp[65], res_tmp[65], weights, biases);
    if (CONFIG_T::reuse_factor > 66) pointwise_conv_1d_latency_cl(data_tmp[66], res_tmp[66], weights, biases);
    if (CONFIG_T::reuse_factor > 67) pointwise_conv_1d_latency_cl(data_tmp[67], res_tmp[67], weights, biases);
    if (CONFIG_T::reuse_factor > 68) pointwise_conv_1d_latency_cl(data_tmp[68], res_tmp[68], weights,
biases); + if (CONFIG_T::reuse_factor > 69) pointwise_conv_1d_latency_cl(data_tmp[69], res_tmp[69], weights, biases); + if (CONFIG_T::reuse_factor > 70) pointwise_conv_1d_latency_cl(data_tmp[70], res_tmp[70], weights, biases); + if (CONFIG_T::reuse_factor > 71) pointwise_conv_1d_latency_cl(data_tmp[71], res_tmp[71], weights, biases); + if (CONFIG_T::reuse_factor > 72) pointwise_conv_1d_latency_cl(data_tmp[72], res_tmp[72], weights, biases); + if (CONFIG_T::reuse_factor > 73) pointwise_conv_1d_latency_cl(data_tmp[73], res_tmp[73], weights, biases); + if (CONFIG_T::reuse_factor > 74) pointwise_conv_1d_latency_cl(data_tmp[74], res_tmp[74], weights, biases); + if (CONFIG_T::reuse_factor > 75) pointwise_conv_1d_latency_cl(data_tmp[75], res_tmp[75], weights, biases); + if (CONFIG_T::reuse_factor > 76) pointwise_conv_1d_latency_cl(data_tmp[76], res_tmp[76], weights, biases); + if (CONFIG_T::reuse_factor > 77) pointwise_conv_1d_latency_cl(data_tmp[77], res_tmp[77], weights, biases); + if (CONFIG_T::reuse_factor > 78) pointwise_conv_1d_latency_cl(data_tmp[78], res_tmp[78], weights, biases); + if (CONFIG_T::reuse_factor > 79) pointwise_conv_1d_latency_cl(data_tmp[79], res_tmp[79], weights, biases); + if (CONFIG_T::reuse_factor > 80) pointwise_conv_1d_latency_cl(data_tmp[80], res_tmp[80], weights, biases); + if (CONFIG_T::reuse_factor > 81) pointwise_conv_1d_latency_cl(data_tmp[81], res_tmp[81], weights, biases); + if (CONFIG_T::reuse_factor > 82) pointwise_conv_1d_latency_cl(data_tmp[82], res_tmp[82], weights, biases); + if (CONFIG_T::reuse_factor > 83) pointwise_conv_1d_latency_cl(data_tmp[83], res_tmp[83], weights, biases); + if (CONFIG_T::reuse_factor > 84) pointwise_conv_1d_latency_cl(data_tmp[84], res_tmp[84], weights, biases); + if (CONFIG_T::reuse_factor > 85) pointwise_conv_1d_latency_cl(data_tmp[85], res_tmp[85], weights, biases); + if (CONFIG_T::reuse_factor > 86) pointwise_conv_1d_latency_cl(data_tmp[86], res_tmp[86], weights, biases); + if (CONFIG_T::reuse_factor > 87) pointwise_conv_1d_latency_cl(data_tmp[87], res_tmp[87], weights, biases); + if (CONFIG_T::reuse_factor > 88) pointwise_conv_1d_latency_cl(data_tmp[88], res_tmp[88], weights, biases); + if (CONFIG_T::reuse_factor > 89) pointwise_conv_1d_latency_cl(data_tmp[89], res_tmp[89], weights, biases); + if (CONFIG_T::reuse_factor > 90) pointwise_conv_1d_latency_cl(data_tmp[90], res_tmp[90], weights, biases); + if (CONFIG_T::reuse_factor > 91) pointwise_conv_1d_latency_cl(data_tmp[91], res_tmp[91], weights, biases); + if (CONFIG_T::reuse_factor > 92) pointwise_conv_1d_latency_cl(data_tmp[92], res_tmp[92], weights, biases); + if (CONFIG_T::reuse_factor > 93) pointwise_conv_1d_latency_cl(data_tmp[93], res_tmp[93], weights, biases); + if (CONFIG_T::reuse_factor > 94) pointwise_conv_1d_latency_cl(data_tmp[94], res_tmp[94], weights, biases); + if (CONFIG_T::reuse_factor > 95) pointwise_conv_1d_latency_cl(data_tmp[95], res_tmp[95], weights, biases); + if (CONFIG_T::reuse_factor > 96) pointwise_conv_1d_latency_cl(data_tmp[96], res_tmp[96], weights, biases); + if (CONFIG_T::reuse_factor > 97) pointwise_conv_1d_latency_cl(data_tmp[97], res_tmp[97], weights, biases); + if (CONFIG_T::reuse_factor > 98) pointwise_conv_1d_latency_cl(data_tmp[98], res_tmp[98], weights, biases); + if (CONFIG_T::reuse_factor > 99) pointwise_conv_1d_latency_cl(data_tmp[99], res_tmp[99], weights, biases); + if (CONFIG_T::reuse_factor > 100) pointwise_conv_1d_latency_cl(data_tmp[100], res_tmp[100], weights, biases); + if (CONFIG_T::reuse_factor > 101) 
pointwise_conv_1d_latency_cl(data_tmp[101], res_tmp[101], weights, biases); + if (CONFIG_T::reuse_factor > 102) pointwise_conv_1d_latency_cl(data_tmp[102], res_tmp[102], weights, biases); + if (CONFIG_T::reuse_factor > 103) pointwise_conv_1d_latency_cl(data_tmp[103], res_tmp[103], weights, biases); + if (CONFIG_T::reuse_factor > 104) pointwise_conv_1d_latency_cl(data_tmp[104], res_tmp[104], weights, biases); + if (CONFIG_T::reuse_factor > 105) pointwise_conv_1d_latency_cl(data_tmp[105], res_tmp[105], weights, biases); + if (CONFIG_T::reuse_factor > 106) pointwise_conv_1d_latency_cl(data_tmp[106], res_tmp[106], weights, biases); + if (CONFIG_T::reuse_factor > 107) pointwise_conv_1d_latency_cl(data_tmp[107], res_tmp[107], weights, biases); + if (CONFIG_T::reuse_factor > 108) pointwise_conv_1d_latency_cl(data_tmp[108], res_tmp[108], weights, biases); + if (CONFIG_T::reuse_factor > 109) pointwise_conv_1d_latency_cl(data_tmp[109], res_tmp[109], weights, biases); + if (CONFIG_T::reuse_factor > 110) pointwise_conv_1d_latency_cl(data_tmp[110], res_tmp[110], weights, biases); + if (CONFIG_T::reuse_factor > 111) pointwise_conv_1d_latency_cl(data_tmp[111], res_tmp[111], weights, biases); + if (CONFIG_T::reuse_factor > 112) pointwise_conv_1d_latency_cl(data_tmp[112], res_tmp[112], weights, biases); + if (CONFIG_T::reuse_factor > 113) pointwise_conv_1d_latency_cl(data_tmp[113], res_tmp[113], weights, biases); + if (CONFIG_T::reuse_factor > 114) pointwise_conv_1d_latency_cl(data_tmp[114], res_tmp[114], weights, biases); + if (CONFIG_T::reuse_factor > 115) pointwise_conv_1d_latency_cl(data_tmp[115], res_tmp[115], weights, biases); + if (CONFIG_T::reuse_factor > 116) pointwise_conv_1d_latency_cl(data_tmp[116], res_tmp[116], weights, biases); + if (CONFIG_T::reuse_factor > 117) pointwise_conv_1d_latency_cl(data_tmp[117], res_tmp[117], weights, biases); + if (CONFIG_T::reuse_factor > 118) pointwise_conv_1d_latency_cl(data_tmp[118], res_tmp[118], weights, biases); + if (CONFIG_T::reuse_factor > 119) pointwise_conv_1d_latency_cl(data_tmp[119], res_tmp[119], weights, biases); + + for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + for(int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor; ii++) { + #pragma HLS UNROLL + res[jj*CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii]; + } + } +} + } // namespace nnet #endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h index 7bd47442f6..b763938cb3 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h @@ -8,8 +8,6 @@ namespace nnet { -enum class conv_implementation { linebuffer = 0, encoded = 1 }; - // ************************************************* // Encoded Implementation (Vlad's) // ************************************************* From 6849e0b4d0a1b352cac1d61870273882dc112705 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Thu, 22 Dec 2022 16:21:25 -0600 Subject: [PATCH 02/10] add pointwise --- hls4ml/backends/vivado/vivado_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 1d4c96d982..4dab5f5c18 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -72,7 +72,7 @@ def _register_layer_attributes(self): for layer in cnn_layers: attrs = self.attribute_map.get(layer, []) # 
attrs.append(ConfigurableAttribute('conv_implementation', value_type=str, default='LineBuffer')) - attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded'], default='LineBuffer')) + attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded', 'Pointwise'], default='LineBuffer')) self.attribute_map[layer] = attrs def _register_flows(self): From 0244b666652e2667c8df72c134f9abd94c731685 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sat, 25 Mar 2023 18:29:44 -0700 Subject: [PATCH 03/10] latency --- hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h index c2990ea97a..e2dee3485a 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h @@ -66,7 +66,7 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } else { // Use standard unrolled implementation - conv_1d_resource_cl(data, res, weights, biases); + conv_1d_latency_cl(data, res, weights, biases); } } else { conv_1d_resource_cl(data, res, weights, biases); From 3ae7752e70dc43d0687b39a90d7c4d0fd6f9b797 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sat, 25 Mar 2023 18:56:58 -0700 Subject: [PATCH 04/10] unroll --- hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 8549ae9add..4179c1dde8 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -104,6 +104,7 @@ void pointwise_conv_1d_latency_cl( // Parallel mode #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + #pragma HLS ARRAY_PARTITION variable=weights complete dim=0 #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization @@ -114,6 +115,7 @@ void pointwise_conv_1d_latency_cl( ConvOut: for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { ConvFilt: for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { ConvChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++) { + #pragma HLS UNROLL int index_mult = ii*CONFIG_T::n_filt*CONFIG_T::n_chan + ff*CONFIG_T::n_chan + cc; int index_weight = cc*CONFIG_T::n_filt + ff; int index_data = (ii*CONFIG_T::stride_width-CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; @@ -132,6 +134,7 @@ void pointwise_conv_1d_latency_cl( // Initialize accumulator with input biases for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL acc[ii][ff]=biases[ff]; } } @@ -152,6 +155,7 @@ void pointwise_conv_1d_latency_cl( // Cast to "res_t" type for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); } } @@ -169,7 +173,9 @@ template void pointwise_conv_1d_la res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width*CONFIG_T::n_filt/CONFIG_T::reuse_factor]; #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0 + RFInputLoop: for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + InnerInputLoop: for(int ii = 0; ii < CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor; ii++) { #pragma HLS UNROLL data_tmp[jj][ii] = 
data[jj*CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor+ii]; @@ -297,7 +303,9 @@ template void pointwise_conv_1d_la if (CONFIG_T::reuse_factor > 118) pointwise_conv_1d_latency_cl(data_tmp[118], res_tmp[118], weights, biases); if (CONFIG_T::reuse_factor > 119) pointwise_conv_1d_latency_cl(data_tmp[119], res_tmp[119], weights, biases); + RFOutputLoop: for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + InnerOutputLoop: for(int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor; ii++) { #pragma HLS UNROLL res[jj*CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii]; From 23126b70ca5496bcc7da993d95a8d939920bd8bc Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 26 Mar 2023 17:19:08 -0700 Subject: [PATCH 05/10] add hls unroll --- hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 4179c1dde8..c5b520c703 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -175,6 +175,7 @@ template void pointwise_conv_1d_la RFInputLoop: for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + #pragma HLS UNROLL InnerInputLoop: for(int ii = 0; ii < CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor; ii++) { #pragma HLS UNROLL @@ -305,6 +306,7 @@ template void pointwise_conv_1d_la RFOutputLoop: for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + #pragma HLS UNROLL InnerOutputLoop: for(int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor; ii++) { #pragma HLS UNROLL From 6aff9e996df95955d010013c2163a723ab8a8170 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Thu, 8 Jun 2023 08:15:11 -0700 Subject: [PATCH 06/10] fix pragma from walkie --- hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index c5b520c703..c423c7a228 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -108,8 +108,8 @@ void pointwise_conv_1d_latency_cl( #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization - //const int multiplier_limit = compute_multiplier_limit(weights); - //#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + int multiplier_limit = ceil ( (float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor )* CONFIG_T::n_filt * CONFIG_T::n_chan ) / float(CONFIG_T::reuse_factor) ); + #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit // Convolve, saving all multiplication results to accumulate later ConvOut: for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { From 7f1c318dea6767d5b0e4996786c356d48bfa4560 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 14 Jun 2023 18:46:37 +0000 Subject: [PATCH 07/10] [pre-commit.ci] auto fixes from pre-commit hooks --- hls4ml/backends/vivado/vivado_backend.py | 4 +- .../templates/vivado/nnet_utils/nnet_common.h | 2 +- .../templates/vivado/nnet_utils/nnet_conv1d.h | 8 +- .../vivado/nnet_utils/nnet_conv1d_latency.h | 488 +++++++++++------- 4 files changed, 311 insertions(+), 191 deletions(-) 
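Note on the multiplier budget introduced in PATCH 06: each of the reuse_factor copies of pointwise_conv_1d_latency_cl processes out_width / reuse_factor output positions, and the n_filt * n_chan multiplications per position are scheduled across an initiation interval of reuse_factor cycles, so the ALLOCATION pragma caps the number of parallel 'mul' instances at that per-cycle count. A standalone sketch of the arithmetic (out_width = 28 and n_chan = 3 match the test added in PATCH 08 below; n_filt = 8 and reuse_factor = 4 are assumed purely for illustration):

    import math

    out_width, n_filt, n_chan, reuse_factor = 28, 8, 3, 4
    # Mirrors the C++ expression from PATCH 06:
    # ceil((float(out_width) / float(reuse_factor) * n_filt * n_chan) / float(reuse_factor))
    multiplier_limit = math.ceil((out_width / reuse_factor * n_filt * n_chan) / reuse_factor)
    print(multiplier_limit)  # 42 -> HLS may instantiate at most 42 multipliers here

With reuse_factor = 1 the limit reduces to out_width * n_filt * n_chan, i.e. a fully parallel layer.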
diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 4dab5f5c18..1eb58f0952 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -72,7 +72,9 @@ def _register_layer_attributes(self): for layer in cnn_layers: attrs = self.attribute_map.get(layer, []) # attrs.append(ConfigurableAttribute('conv_implementation', value_type=str, default='LineBuffer')) - attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded', 'Pointwise'], default='LineBuffer')) + attrs.append( + ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded', 'Pointwise'], default='LineBuffer') + ) self.attribute_map[layer] = attrs def _register_flows(self): diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h b/hls4ml/templates/vivado/nnet_utils/nnet_common.h index b6582e1406..e942a1dc89 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_common.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_common.h @@ -24,7 +24,7 @@ namespace nnet { // Common type definitions enum io_type { io_parallel = 0, io_stream }; enum strategy { latency, resource }; -enum class conv_implementation { linebuffer=0, encoded=1, pointwise=2}; +enum class conv_implementation { linebuffer = 0, encoded = 1, pointwise = 2 }; /* --- * Balanced tree reduce implementation. diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h index e2dee3485a..0f2e89ac8f 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h @@ -54,17 +54,15 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], #pragma HLS INLINE region if (CONFIG_T::strategy == nnet::latency) { - if (CONFIG_T::implementation == conv_implementation::pointwise){ + if (CONFIG_T::implementation == conv_implementation::pointwise) { // Use pointwise unrolled implementation if (CONFIG_T::reuse_factor > 1 && CONFIG_T::reuse_factor <= 120) { pointwise_conv_1d_latency_cl_split_by_rf(data, res, weights, biases); - } - else { + } else { assert(CONFIG_T::reuse_factor == 1); pointwise_conv_1d_latency_cl(data, res, weights, biases); } - } - else { + } else { // Use standard unrolled implementation conv_1d_latency_cl(data, res, weights, biases); } diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index c423c7a228..aabc869823 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -84,17 +84,15 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } } -template -void pointwise_conv_1d_latency_cl( - data_T data[CONFIG_T::in_width * CONFIG_T::n_chan/CONFIG_T::reuse_factor], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) -{ +template +void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { assert(CONFIG_T::filt_width == 1); - typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan/CONFIG_T::reuse_factor]; - 
typename CONFIG_T::accum_t acc[CONFIG_T::out_width/CONFIG_T::reuse_factor][CONFIG_T::n_filt]; + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt]; #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 @@ -108,209 +106,331 @@ void pointwise_conv_1d_latency_cl( #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization - int multiplier_limit = ceil ( (float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor )* CONFIG_T::n_filt * CONFIG_T::n_chan ) / float(CONFIG_T::reuse_factor) ); - #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit - - // Convolve, saving all multiplication results to accumulate later - ConvOut: for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { - ConvFilt: for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { - ConvChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++) { - #pragma HLS UNROLL - int index_mult = ii*CONFIG_T::n_filt*CONFIG_T::n_chan + ff*CONFIG_T::n_chan + cc; - int index_weight = cc*CONFIG_T::n_filt + ff; - int index_data = (ii*CONFIG_T::stride_width-CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; - - if((ii*CONFIG_T::stride_width) < CONFIG_T::pad_left || (ii*CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)){ + int multiplier_limit = + ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) / + float(CONFIG_T::reuse_factor)); +#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit + +// Convolve, saving all multiplication results to accumulate later +ConvOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + #pragma HLS UNROLL + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + int index_weight = cc * CONFIG_T::n_filt + ff; + int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || + (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { mult[index_mult] = 0; - } - else { + } else { mult[index_mult] = data[index_data] * weights[index_weight]; } - }//end channel loop - }//end filter loop - }//end output loop - + } // end channel loop + } // end filter loop + } // end output loop // Initialize accumulator with input biases - for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { - for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { #pragma HLS UNROLL - acc[ii][ff]=biases[ff]; + acc[ii][ff] = biases[ff]; } } - - // Accumulate multiplication result - AccumOut: for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { - AccumFilt: for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { - //Do "dot product" sum within filter and sum over channels - AccumChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++) { - int index_mult = ii*CONFIG_T::n_filt*CONFIG_T::n_chan + ff*CONFIG_T::n_chan + cc; +// Accumulate multiplication result +AccumOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + AccumFilt: + for (int 
ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; acc[ii][ff] += mult[index_mult]; - }//end channel loop - }//end filter loop - }//end output loop - + } // end channel loop + } // end filter loop + } // end output loop // Cast to "res_t" type - for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { - for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { #pragma HLS UNROLL res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); } } } -template void pointwise_conv_1d_latency_cl_split_by_rf( - data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) -{ +template +void pointwise_conv_1d_latency_cl_split_by_rf(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor]; + data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0 - res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width*CONFIG_T::n_filt/CONFIG_T::reuse_factor]; + res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor]; #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0 - - RFInputLoop: - for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { - #pragma HLS UNROLL - InnerInputLoop: - for(int ii = 0; ii < CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor; ii++) { + +RFInputLoop: + for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + #pragma HLS UNROLL + InnerInputLoop: + for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) { #pragma HLS UNROLL - data_tmp[jj][ii] = data[jj*CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor+ii]; + data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii]; } } pointwise_conv_1d_latency_cl(data_tmp[0], res_tmp[0], weights, biases); pointwise_conv_1d_latency_cl(data_tmp[1], res_tmp[1], weights, biases); - if (CONFIG_T::reuse_factor > 2) pointwise_conv_1d_latency_cl(data_tmp[2], res_tmp[2], weights, biases); - if (CONFIG_T::reuse_factor > 3) pointwise_conv_1d_latency_cl(data_tmp[3], res_tmp[3], weights, biases); - if (CONFIG_T::reuse_factor > 4) pointwise_conv_1d_latency_cl(data_tmp[4], res_tmp[4], weights, biases); - if (CONFIG_T::reuse_factor > 5) pointwise_conv_1d_latency_cl(data_tmp[5], res_tmp[5], weights, biases); - if (CONFIG_T::reuse_factor > 6) pointwise_conv_1d_latency_cl(data_tmp[6], res_tmp[6], weights, biases); - if (CONFIG_T::reuse_factor > 7) pointwise_conv_1d_latency_cl(data_tmp[7], res_tmp[7], weights, biases); - if (CONFIG_T::reuse_factor > 8) pointwise_conv_1d_latency_cl(data_tmp[8], res_tmp[8], weights, biases); - if (CONFIG_T::reuse_factor > 9) pointwise_conv_1d_latency_cl(data_tmp[9], res_tmp[9], weights, biases); - if 
(CONFIG_T::reuse_factor > 10) pointwise_conv_1d_latency_cl(data_tmp[10], res_tmp[10], weights, biases); - if (CONFIG_T::reuse_factor > 11) pointwise_conv_1d_latency_cl(data_tmp[11], res_tmp[11], weights, biases); - if (CONFIG_T::reuse_factor > 12) pointwise_conv_1d_latency_cl(data_tmp[12], res_tmp[12], weights, biases); - if (CONFIG_T::reuse_factor > 13) pointwise_conv_1d_latency_cl(data_tmp[13], res_tmp[13], weights, biases); - if (CONFIG_T::reuse_factor > 14) pointwise_conv_1d_latency_cl(data_tmp[14], res_tmp[14], weights, biases); - if (CONFIG_T::reuse_factor > 15) pointwise_conv_1d_latency_cl(data_tmp[15], res_tmp[15], weights, biases); - if (CONFIG_T::reuse_factor > 16) pointwise_conv_1d_latency_cl(data_tmp[16], res_tmp[16], weights, biases); - if (CONFIG_T::reuse_factor > 17) pointwise_conv_1d_latency_cl(data_tmp[17], res_tmp[17], weights, biases); - if (CONFIG_T::reuse_factor > 18) pointwise_conv_1d_latency_cl(data_tmp[18], res_tmp[18], weights, biases); - if (CONFIG_T::reuse_factor > 19) pointwise_conv_1d_latency_cl(data_tmp[19], res_tmp[19], weights, biases); - if (CONFIG_T::reuse_factor > 20) pointwise_conv_1d_latency_cl(data_tmp[20], res_tmp[20], weights, biases); - if (CONFIG_T::reuse_factor > 21) pointwise_conv_1d_latency_cl(data_tmp[21], res_tmp[21], weights, biases); - if (CONFIG_T::reuse_factor > 22) pointwise_conv_1d_latency_cl(data_tmp[22], res_tmp[22], weights, biases); - if (CONFIG_T::reuse_factor > 23) pointwise_conv_1d_latency_cl(data_tmp[23], res_tmp[23], weights, biases); - if (CONFIG_T::reuse_factor > 24) pointwise_conv_1d_latency_cl(data_tmp[24], res_tmp[24], weights, biases); - if (CONFIG_T::reuse_factor > 25) pointwise_conv_1d_latency_cl(data_tmp[25], res_tmp[25], weights, biases); - if (CONFIG_T::reuse_factor > 26) pointwise_conv_1d_latency_cl(data_tmp[26], res_tmp[26], weights, biases); - if (CONFIG_T::reuse_factor > 27) pointwise_conv_1d_latency_cl(data_tmp[27], res_tmp[27], weights, biases); - if (CONFIG_T::reuse_factor > 28) pointwise_conv_1d_latency_cl(data_tmp[28], res_tmp[28], weights, biases); - if (CONFIG_T::reuse_factor > 29) pointwise_conv_1d_latency_cl(data_tmp[29], res_tmp[29], weights, biases); - if (CONFIG_T::reuse_factor > 30) pointwise_conv_1d_latency_cl(data_tmp[30], res_tmp[30], weights, biases); - if (CONFIG_T::reuse_factor > 31) pointwise_conv_1d_latency_cl(data_tmp[31], res_tmp[31], weights, biases); - if (CONFIG_T::reuse_factor > 32) pointwise_conv_1d_latency_cl(data_tmp[32], res_tmp[32], weights, biases); - if (CONFIG_T::reuse_factor > 33) pointwise_conv_1d_latency_cl(data_tmp[33], res_tmp[33], weights, biases); - if (CONFIG_T::reuse_factor > 34) pointwise_conv_1d_latency_cl(data_tmp[34], res_tmp[34], weights, biases); - if (CONFIG_T::reuse_factor > 35) pointwise_conv_1d_latency_cl(data_tmp[35], res_tmp[35], weights, biases); - if (CONFIG_T::reuse_factor > 36) pointwise_conv_1d_latency_cl(data_tmp[36], res_tmp[36], weights, biases); - if (CONFIG_T::reuse_factor > 37) pointwise_conv_1d_latency_cl(data_tmp[37], res_tmp[37], weights, biases); - if (CONFIG_T::reuse_factor > 38) pointwise_conv_1d_latency_cl(data_tmp[38], res_tmp[38], weights, biases); - if (CONFIG_T::reuse_factor > 39) pointwise_conv_1d_latency_cl(data_tmp[39], res_tmp[39], weights, biases); - if (CONFIG_T::reuse_factor > 40) pointwise_conv_1d_latency_cl(data_tmp[40], res_tmp[40], weights, biases); - if (CONFIG_T::reuse_factor > 41) pointwise_conv_1d_latency_cl(data_tmp[41], res_tmp[41], weights, biases); - if (CONFIG_T::reuse_factor > 42) 
pointwise_conv_1d_latency_cl(data_tmp[42], res_tmp[42], weights, biases);
-    if (CONFIG_T::reuse_factor > 43) pointwise_conv_1d_latency_cl(data_tmp[43], res_tmp[43], weights, biases);
-    if (CONFIG_T::reuse_factor > 44) pointwise_conv_1d_latency_cl(data_tmp[44], res_tmp[44], weights, biases);
-    if (CONFIG_T::reuse_factor > 45) pointwise_conv_1d_latency_cl(data_tmp[45], res_tmp[45], weights, biases);
-    if (CONFIG_T::reuse_factor > 46) pointwise_conv_1d_latency_cl(data_tmp[46], res_tmp[46], weights, biases);
-    if (CONFIG_T::reuse_factor > 47) pointwise_conv_1d_latency_cl(data_tmp[47], res_tmp[47], weights, biases);
-    if (CONFIG_T::reuse_factor > 48) pointwise_conv_1d_latency_cl(data_tmp[48], res_tmp[48], weights, biases);
-    if (CONFIG_T::reuse_factor > 49) pointwise_conv_1d_latency_cl(data_tmp[49], res_tmp[49], weights, biases);
-    if (CONFIG_T::reuse_factor > 50) pointwise_conv_1d_latency_cl(data_tmp[50], res_tmp[50], weights, biases);
-    if (CONFIG_T::reuse_factor > 51) pointwise_conv_1d_latency_cl(data_tmp[51], res_tmp[51], weights, biases);
-    if (CONFIG_T::reuse_factor > 52) pointwise_conv_1d_latency_cl(data_tmp[52], res_tmp[52], weights, biases);
-    if (CONFIG_T::reuse_factor > 53) pointwise_conv_1d_latency_cl(data_tmp[53], res_tmp[53], weights, biases);
-    if (CONFIG_T::reuse_factor > 54) pointwise_conv_1d_latency_cl(data_tmp[54], res_tmp[54], weights, biases);
-    if (CONFIG_T::reuse_factor > 55) pointwise_conv_1d_latency_cl(data_tmp[55], res_tmp[55], weights, biases);
-    if (CONFIG_T::reuse_factor > 56) pointwise_conv_1d_latency_cl(data_tmp[56], res_tmp[56], weights, biases);
-    if (CONFIG_T::reuse_factor > 57) pointwise_conv_1d_latency_cl(data_tmp[57], res_tmp[57], weights, biases);
-    if (CONFIG_T::reuse_factor > 58) pointwise_conv_1d_latency_cl(data_tmp[58], res_tmp[58], weights, biases);
-    if (CONFIG_T::reuse_factor > 59) pointwise_conv_1d_latency_cl(data_tmp[59], res_tmp[59], weights, biases);
-    if (CONFIG_T::reuse_factor > 60) pointwise_conv_1d_latency_cl(data_tmp[60], res_tmp[60], weights, biases);
-    if (CONFIG_T::reuse_factor > 61) pointwise_conv_1d_latency_cl(data_tmp[61], res_tmp[61], weights, biases);
-    if (CONFIG_T::reuse_factor > 62) pointwise_conv_1d_latency_cl(data_tmp[62], res_tmp[62], weights, biases);
-    if (CONFIG_T::reuse_factor > 63) pointwise_conv_1d_latency_cl(data_tmp[63], res_tmp[63], weights, biases);
-    if (CONFIG_T::reuse_factor > 64) pointwise_conv_1d_latency_cl(data_tmp[64], res_tmp[64], weights, biases);
-    if (CONFIG_T::reuse_factor > 65) pointwise_conv_1d_latency_cl(data_tmp[65], res_tmp[65], weights, biases);
-    if (CONFIG_T::reuse_factor > 66) pointwise_conv_1d_latency_cl(data_tmp[66], res_tmp[66], weights, biases);
-    if (CONFIG_T::reuse_factor > 67) pointwise_conv_1d_latency_cl(data_tmp[67], res_tmp[67], weights, biases);
-    if (CONFIG_T::reuse_factor > 68) pointwise_conv_1d_latency_cl(data_tmp[68], res_tmp[68], weights, biases);
-    if (CONFIG_T::reuse_factor > 69) pointwise_conv_1d_latency_cl(data_tmp[69], res_tmp[69], weights, biases);
-    if (CONFIG_T::reuse_factor > 70) pointwise_conv_1d_latency_cl(data_tmp[70], res_tmp[70], weights, biases);
-    if (CONFIG_T::reuse_factor > 71) pointwise_conv_1d_latency_cl(data_tmp[71], res_tmp[71], weights, biases);
-    if (CONFIG_T::reuse_factor > 72) pointwise_conv_1d_latency_cl(data_tmp[72], res_tmp[72], weights, biases);
-    if (CONFIG_T::reuse_factor > 73) pointwise_conv_1d_latency_cl(data_tmp[73], res_tmp[73], weights, biases);
-    if (CONFIG_T::reuse_factor > 74) pointwise_conv_1d_latency_cl(data_tmp[74], res_tmp[74], weights,
biases); - if (CONFIG_T::reuse_factor > 75) pointwise_conv_1d_latency_cl(data_tmp[75], res_tmp[75], weights, biases); - if (CONFIG_T::reuse_factor > 76) pointwise_conv_1d_latency_cl(data_tmp[76], res_tmp[76], weights, biases); - if (CONFIG_T::reuse_factor > 77) pointwise_conv_1d_latency_cl(data_tmp[77], res_tmp[77], weights, biases); - if (CONFIG_T::reuse_factor > 78) pointwise_conv_1d_latency_cl(data_tmp[78], res_tmp[78], weights, biases); - if (CONFIG_T::reuse_factor > 79) pointwise_conv_1d_latency_cl(data_tmp[79], res_tmp[79], weights, biases); - if (CONFIG_T::reuse_factor > 80) pointwise_conv_1d_latency_cl(data_tmp[80], res_tmp[80], weights, biases); - if (CONFIG_T::reuse_factor > 81) pointwise_conv_1d_latency_cl(data_tmp[81], res_tmp[81], weights, biases); - if (CONFIG_T::reuse_factor > 82) pointwise_conv_1d_latency_cl(data_tmp[82], res_tmp[82], weights, biases); - if (CONFIG_T::reuse_factor > 83) pointwise_conv_1d_latency_cl(data_tmp[83], res_tmp[83], weights, biases); - if (CONFIG_T::reuse_factor > 84) pointwise_conv_1d_latency_cl(data_tmp[84], res_tmp[84], weights, biases); - if (CONFIG_T::reuse_factor > 85) pointwise_conv_1d_latency_cl(data_tmp[85], res_tmp[85], weights, biases); - if (CONFIG_T::reuse_factor > 86) pointwise_conv_1d_latency_cl(data_tmp[86], res_tmp[86], weights, biases); - if (CONFIG_T::reuse_factor > 87) pointwise_conv_1d_latency_cl(data_tmp[87], res_tmp[87], weights, biases); - if (CONFIG_T::reuse_factor > 88) pointwise_conv_1d_latency_cl(data_tmp[88], res_tmp[88], weights, biases); - if (CONFIG_T::reuse_factor > 89) pointwise_conv_1d_latency_cl(data_tmp[89], res_tmp[89], weights, biases); - if (CONFIG_T::reuse_factor > 90) pointwise_conv_1d_latency_cl(data_tmp[90], res_tmp[90], weights, biases); - if (CONFIG_T::reuse_factor > 91) pointwise_conv_1d_latency_cl(data_tmp[91], res_tmp[91], weights, biases); - if (CONFIG_T::reuse_factor > 92) pointwise_conv_1d_latency_cl(data_tmp[92], res_tmp[92], weights, biases); - if (CONFIG_T::reuse_factor > 93) pointwise_conv_1d_latency_cl(data_tmp[93], res_tmp[93], weights, biases); - if (CONFIG_T::reuse_factor > 94) pointwise_conv_1d_latency_cl(data_tmp[94], res_tmp[94], weights, biases); - if (CONFIG_T::reuse_factor > 95) pointwise_conv_1d_latency_cl(data_tmp[95], res_tmp[95], weights, biases); - if (CONFIG_T::reuse_factor > 96) pointwise_conv_1d_latency_cl(data_tmp[96], res_tmp[96], weights, biases); - if (CONFIG_T::reuse_factor > 97) pointwise_conv_1d_latency_cl(data_tmp[97], res_tmp[97], weights, biases); - if (CONFIG_T::reuse_factor > 98) pointwise_conv_1d_latency_cl(data_tmp[98], res_tmp[98], weights, biases); - if (CONFIG_T::reuse_factor > 99) pointwise_conv_1d_latency_cl(data_tmp[99], res_tmp[99], weights, biases); - if (CONFIG_T::reuse_factor > 100) pointwise_conv_1d_latency_cl(data_tmp[100], res_tmp[100], weights, biases); - if (CONFIG_T::reuse_factor > 101) pointwise_conv_1d_latency_cl(data_tmp[101], res_tmp[101], weights, biases); - if (CONFIG_T::reuse_factor > 102) pointwise_conv_1d_latency_cl(data_tmp[102], res_tmp[102], weights, biases); - if (CONFIG_T::reuse_factor > 103) pointwise_conv_1d_latency_cl(data_tmp[103], res_tmp[103], weights, biases); - if (CONFIG_T::reuse_factor > 104) pointwise_conv_1d_latency_cl(data_tmp[104], res_tmp[104], weights, biases); - if (CONFIG_T::reuse_factor > 105) pointwise_conv_1d_latency_cl(data_tmp[105], res_tmp[105], weights, biases); - if (CONFIG_T::reuse_factor > 106) pointwise_conv_1d_latency_cl(data_tmp[106], res_tmp[106], weights, biases); - if (CONFIG_T::reuse_factor > 107) 
pointwise_conv_1d_latency_cl(data_tmp[107], res_tmp[107], weights, biases); - if (CONFIG_T::reuse_factor > 108) pointwise_conv_1d_latency_cl(data_tmp[108], res_tmp[108], weights, biases); - if (CONFIG_T::reuse_factor > 109) pointwise_conv_1d_latency_cl(data_tmp[109], res_tmp[109], weights, biases); - if (CONFIG_T::reuse_factor > 110) pointwise_conv_1d_latency_cl(data_tmp[110], res_tmp[110], weights, biases); - if (CONFIG_T::reuse_factor > 111) pointwise_conv_1d_latency_cl(data_tmp[111], res_tmp[111], weights, biases); - if (CONFIG_T::reuse_factor > 112) pointwise_conv_1d_latency_cl(data_tmp[112], res_tmp[112], weights, biases); - if (CONFIG_T::reuse_factor > 113) pointwise_conv_1d_latency_cl(data_tmp[113], res_tmp[113], weights, biases); - if (CONFIG_T::reuse_factor > 114) pointwise_conv_1d_latency_cl(data_tmp[114], res_tmp[114], weights, biases); - if (CONFIG_T::reuse_factor > 115) pointwise_conv_1d_latency_cl(data_tmp[115], res_tmp[115], weights, biases); - if (CONFIG_T::reuse_factor > 116) pointwise_conv_1d_latency_cl(data_tmp[116], res_tmp[116], weights, biases); - if (CONFIG_T::reuse_factor > 117) pointwise_conv_1d_latency_cl(data_tmp[117], res_tmp[117], weights, biases); - if (CONFIG_T::reuse_factor > 118) pointwise_conv_1d_latency_cl(data_tmp[118], res_tmp[118], weights, biases); - if (CONFIG_T::reuse_factor > 119) pointwise_conv_1d_latency_cl(data_tmp[119], res_tmp[119], weights, biases); - - RFOutputLoop: - for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { - #pragma HLS UNROLL - InnerOutputLoop: - for(int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor; ii++) { + if (CONFIG_T::reuse_factor > 2) + pointwise_conv_1d_latency_cl(data_tmp[2], res_tmp[2], weights, biases); + if (CONFIG_T::reuse_factor > 3) + pointwise_conv_1d_latency_cl(data_tmp[3], res_tmp[3], weights, biases); + if (CONFIG_T::reuse_factor > 4) + pointwise_conv_1d_latency_cl(data_tmp[4], res_tmp[4], weights, biases); + if (CONFIG_T::reuse_factor > 5) + pointwise_conv_1d_latency_cl(data_tmp[5], res_tmp[5], weights, biases); + if (CONFIG_T::reuse_factor > 6) + pointwise_conv_1d_latency_cl(data_tmp[6], res_tmp[6], weights, biases); + if (CONFIG_T::reuse_factor > 7) + pointwise_conv_1d_latency_cl(data_tmp[7], res_tmp[7], weights, biases); + if (CONFIG_T::reuse_factor > 8) + pointwise_conv_1d_latency_cl(data_tmp[8], res_tmp[8], weights, biases); + if (CONFIG_T::reuse_factor > 9) + pointwise_conv_1d_latency_cl(data_tmp[9], res_tmp[9], weights, biases); + if (CONFIG_T::reuse_factor > 10) + pointwise_conv_1d_latency_cl(data_tmp[10], res_tmp[10], weights, biases); + if (CONFIG_T::reuse_factor > 11) + pointwise_conv_1d_latency_cl(data_tmp[11], res_tmp[11], weights, biases); + if (CONFIG_T::reuse_factor > 12) + pointwise_conv_1d_latency_cl(data_tmp[12], res_tmp[12], weights, biases); + if (CONFIG_T::reuse_factor > 13) + pointwise_conv_1d_latency_cl(data_tmp[13], res_tmp[13], weights, biases); + if (CONFIG_T::reuse_factor > 14) + pointwise_conv_1d_latency_cl(data_tmp[14], res_tmp[14], weights, biases); + if (CONFIG_T::reuse_factor > 15) + pointwise_conv_1d_latency_cl(data_tmp[15], res_tmp[15], weights, biases); + if (CONFIG_T::reuse_factor > 16) + pointwise_conv_1d_latency_cl(data_tmp[16], res_tmp[16], weights, biases); + if (CONFIG_T::reuse_factor > 17) + pointwise_conv_1d_latency_cl(data_tmp[17], res_tmp[17], weights, biases); + if (CONFIG_T::reuse_factor > 18) + pointwise_conv_1d_latency_cl(data_tmp[18], res_tmp[18], weights, biases); + if (CONFIG_T::reuse_factor > 19) + 
pointwise_conv_1d_latency_cl(data_tmp[19], res_tmp[19], weights, biases);
+    if (CONFIG_T::reuse_factor > 20)
+        pointwise_conv_1d_latency_cl(data_tmp[20], res_tmp[20], weights, biases);
+    if (CONFIG_T::reuse_factor > 21)
+        pointwise_conv_1d_latency_cl(data_tmp[21], res_tmp[21], weights, biases);
+    if (CONFIG_T::reuse_factor > 22)
+        pointwise_conv_1d_latency_cl(data_tmp[22], res_tmp[22], weights, biases);
+    if (CONFIG_T::reuse_factor > 23)
+        pointwise_conv_1d_latency_cl(data_tmp[23], res_tmp[23], weights, biases);
+    if (CONFIG_T::reuse_factor > 24)
+        pointwise_conv_1d_latency_cl(data_tmp[24], res_tmp[24], weights, biases);
+    if (CONFIG_T::reuse_factor > 25)
+        pointwise_conv_1d_latency_cl(data_tmp[25], res_tmp[25], weights, biases);
+    if (CONFIG_T::reuse_factor > 26)
+        pointwise_conv_1d_latency_cl(data_tmp[26], res_tmp[26], weights, biases);
+    if (CONFIG_T::reuse_factor > 27)
+        pointwise_conv_1d_latency_cl(data_tmp[27], res_tmp[27], weights, biases);
+    if (CONFIG_T::reuse_factor > 28)
+        pointwise_conv_1d_latency_cl(data_tmp[28], res_tmp[28], weights, biases);
+    if (CONFIG_T::reuse_factor > 29)
+        pointwise_conv_1d_latency_cl(data_tmp[29], res_tmp[29], weights, biases);
+    if (CONFIG_T::reuse_factor > 30)
+        pointwise_conv_1d_latency_cl(data_tmp[30], res_tmp[30], weights, biases);
+    if (CONFIG_T::reuse_factor > 31)
+        pointwise_conv_1d_latency_cl(data_tmp[31], res_tmp[31], weights, biases);
+    if (CONFIG_T::reuse_factor > 32)
+        pointwise_conv_1d_latency_cl(data_tmp[32], res_tmp[32], weights, biases);
+    if (CONFIG_T::reuse_factor > 33)
+        pointwise_conv_1d_latency_cl(data_tmp[33], res_tmp[33], weights, biases);
+    if (CONFIG_T::reuse_factor > 34)
+        pointwise_conv_1d_latency_cl(data_tmp[34], res_tmp[34], weights, biases);
+    if (CONFIG_T::reuse_factor > 35)
+        pointwise_conv_1d_latency_cl(data_tmp[35], res_tmp[35], weights, biases);
+    if (CONFIG_T::reuse_factor > 36)
+        pointwise_conv_1d_latency_cl(data_tmp[36], res_tmp[36], weights, biases);
+    if (CONFIG_T::reuse_factor > 37)
+        pointwise_conv_1d_latency_cl(data_tmp[37], res_tmp[37], weights, biases);
+    if (CONFIG_T::reuse_factor > 38)
+        pointwise_conv_1d_latency_cl(data_tmp[38], res_tmp[38], weights, biases);
+    if (CONFIG_T::reuse_factor > 39)
+        pointwise_conv_1d_latency_cl(data_tmp[39], res_tmp[39], weights, biases);
+    if (CONFIG_T::reuse_factor > 40)
+        pointwise_conv_1d_latency_cl(data_tmp[40], res_tmp[40], weights, biases);
+    if (CONFIG_T::reuse_factor > 41)
+        pointwise_conv_1d_latency_cl(data_tmp[41], res_tmp[41], weights, biases);
+    if (CONFIG_T::reuse_factor > 42)
+        pointwise_conv_1d_latency_cl(data_tmp[42], res_tmp[42], weights, biases);
+    if (CONFIG_T::reuse_factor > 43)
+        pointwise_conv_1d_latency_cl(data_tmp[43], res_tmp[43], weights, biases);
+    if (CONFIG_T::reuse_factor > 44)
+        pointwise_conv_1d_latency_cl(data_tmp[44], res_tmp[44], weights, biases);
+    if (CONFIG_T::reuse_factor > 45)
+        pointwise_conv_1d_latency_cl(data_tmp[45], res_tmp[45], weights, biases);
+    if (CONFIG_T::reuse_factor > 46)
+        pointwise_conv_1d_latency_cl(data_tmp[46], res_tmp[46], weights, biases);
+    if (CONFIG_T::reuse_factor > 47)
+        pointwise_conv_1d_latency_cl(data_tmp[47], res_tmp[47], weights, biases);
+    if (CONFIG_T::reuse_factor > 48)
+        pointwise_conv_1d_latency_cl(data_tmp[48], res_tmp[48], weights, biases);
+    if (CONFIG_T::reuse_factor > 49)
+        pointwise_conv_1d_latency_cl(data_tmp[49], res_tmp[49], weights, biases);
+    if (CONFIG_T::reuse_factor > 50)
+        pointwise_conv_1d_latency_cl(data_tmp[50], res_tmp[50], weights, biases);
+    if (CONFIG_T::reuse_factor > 51)
+        pointwise_conv_1d_latency_cl(data_tmp[51], res_tmp[51], weights, biases);
+    if (CONFIG_T::reuse_factor > 52)
+        pointwise_conv_1d_latency_cl(data_tmp[52], res_tmp[52], weights, biases);
+    if (CONFIG_T::reuse_factor > 53)
+        pointwise_conv_1d_latency_cl(data_tmp[53], res_tmp[53], weights, biases);
+    if (CONFIG_T::reuse_factor > 54)
+        pointwise_conv_1d_latency_cl(data_tmp[54], res_tmp[54], weights, biases);
+    if (CONFIG_T::reuse_factor > 55)
+        pointwise_conv_1d_latency_cl(data_tmp[55], res_tmp[55], weights, biases);
+    if (CONFIG_T::reuse_factor > 56)
+        pointwise_conv_1d_latency_cl(data_tmp[56], res_tmp[56], weights, biases);
+    if (CONFIG_T::reuse_factor > 57)
+        pointwise_conv_1d_latency_cl(data_tmp[57], res_tmp[57], weights, biases);
+    if (CONFIG_T::reuse_factor > 58)
+        pointwise_conv_1d_latency_cl(data_tmp[58], res_tmp[58], weights, biases);
+    if (CONFIG_T::reuse_factor > 59)
+        pointwise_conv_1d_latency_cl(data_tmp[59], res_tmp[59], weights, biases);
+    if (CONFIG_T::reuse_factor > 60)
+        pointwise_conv_1d_latency_cl(data_tmp[60], res_tmp[60], weights, biases);
+    if (CONFIG_T::reuse_factor > 61)
+        pointwise_conv_1d_latency_cl(data_tmp[61], res_tmp[61], weights, biases);
+    if (CONFIG_T::reuse_factor > 62)
+        pointwise_conv_1d_latency_cl(data_tmp[62], res_tmp[62], weights, biases);
+    if (CONFIG_T::reuse_factor > 63)
+        pointwise_conv_1d_latency_cl(data_tmp[63], res_tmp[63], weights, biases);
+    if (CONFIG_T::reuse_factor > 64)
+        pointwise_conv_1d_latency_cl(data_tmp[64], res_tmp[64], weights, biases);
+    if (CONFIG_T::reuse_factor > 65)
+        pointwise_conv_1d_latency_cl(data_tmp[65], res_tmp[65], weights, biases);
+    if (CONFIG_T::reuse_factor > 66)
+        pointwise_conv_1d_latency_cl(data_tmp[66], res_tmp[66], weights, biases);
+    if (CONFIG_T::reuse_factor > 67)
+        pointwise_conv_1d_latency_cl(data_tmp[67], res_tmp[67], weights, biases);
+    if (CONFIG_T::reuse_factor > 68)
+        pointwise_conv_1d_latency_cl(data_tmp[68], res_tmp[68], weights, biases);
+    if (CONFIG_T::reuse_factor > 69)
+        pointwise_conv_1d_latency_cl(data_tmp[69], res_tmp[69], weights, biases);
+    if (CONFIG_T::reuse_factor > 70)
+        pointwise_conv_1d_latency_cl(data_tmp[70], res_tmp[70], weights, biases);
+    if (CONFIG_T::reuse_factor > 71)
+        pointwise_conv_1d_latency_cl(data_tmp[71], res_tmp[71], weights, biases);
+    if (CONFIG_T::reuse_factor > 72)
+        pointwise_conv_1d_latency_cl(data_tmp[72], res_tmp[72], weights, biases);
+    if (CONFIG_T::reuse_factor > 73)
+        pointwise_conv_1d_latency_cl(data_tmp[73], res_tmp[73], weights, biases);
+    if (CONFIG_T::reuse_factor > 74)
+        pointwise_conv_1d_latency_cl(data_tmp[74], res_tmp[74], weights, biases);
+    if (CONFIG_T::reuse_factor > 75)
+        pointwise_conv_1d_latency_cl(data_tmp[75], res_tmp[75], weights, biases);
+    if (CONFIG_T::reuse_factor > 76)
+        pointwise_conv_1d_latency_cl(data_tmp[76], res_tmp[76], weights, biases);
+    if (CONFIG_T::reuse_factor > 77)
+        pointwise_conv_1d_latency_cl(data_tmp[77], res_tmp[77], weights, biases);
+    if (CONFIG_T::reuse_factor > 78)
+        pointwise_conv_1d_latency_cl(data_tmp[78], res_tmp[78], weights, biases);
+    if (CONFIG_T::reuse_factor > 79)
+        pointwise_conv_1d_latency_cl(data_tmp[79], res_tmp[79], weights, biases);
+    if (CONFIG_T::reuse_factor > 80)
+        pointwise_conv_1d_latency_cl(data_tmp[80], res_tmp[80], weights, biases);
+    if (CONFIG_T::reuse_factor > 81)
+        pointwise_conv_1d_latency_cl(data_tmp[81], res_tmp[81], weights, biases);
+    if (CONFIG_T::reuse_factor > 82)
+        pointwise_conv_1d_latency_cl(data_tmp[82], res_tmp[82], weights, biases);
+    if (CONFIG_T::reuse_factor > 83)
+
pointwise_conv_1d_latency_cl(data_tmp[83], res_tmp[83], weights, biases); + if (CONFIG_T::reuse_factor > 84) + pointwise_conv_1d_latency_cl(data_tmp[84], res_tmp[84], weights, biases); + if (CONFIG_T::reuse_factor > 85) + pointwise_conv_1d_latency_cl(data_tmp[85], res_tmp[85], weights, biases); + if (CONFIG_T::reuse_factor > 86) + pointwise_conv_1d_latency_cl(data_tmp[86], res_tmp[86], weights, biases); + if (CONFIG_T::reuse_factor > 87) + pointwise_conv_1d_latency_cl(data_tmp[87], res_tmp[87], weights, biases); + if (CONFIG_T::reuse_factor > 88) + pointwise_conv_1d_latency_cl(data_tmp[88], res_tmp[88], weights, biases); + if (CONFIG_T::reuse_factor > 89) + pointwise_conv_1d_latency_cl(data_tmp[89], res_tmp[89], weights, biases); + if (CONFIG_T::reuse_factor > 90) + pointwise_conv_1d_latency_cl(data_tmp[90], res_tmp[90], weights, biases); + if (CONFIG_T::reuse_factor > 91) + pointwise_conv_1d_latency_cl(data_tmp[91], res_tmp[91], weights, biases); + if (CONFIG_T::reuse_factor > 92) + pointwise_conv_1d_latency_cl(data_tmp[92], res_tmp[92], weights, biases); + if (CONFIG_T::reuse_factor > 93) + pointwise_conv_1d_latency_cl(data_tmp[93], res_tmp[93], weights, biases); + if (CONFIG_T::reuse_factor > 94) + pointwise_conv_1d_latency_cl(data_tmp[94], res_tmp[94], weights, biases); + if (CONFIG_T::reuse_factor > 95) + pointwise_conv_1d_latency_cl(data_tmp[95], res_tmp[95], weights, biases); + if (CONFIG_T::reuse_factor > 96) + pointwise_conv_1d_latency_cl(data_tmp[96], res_tmp[96], weights, biases); + if (CONFIG_T::reuse_factor > 97) + pointwise_conv_1d_latency_cl(data_tmp[97], res_tmp[97], weights, biases); + if (CONFIG_T::reuse_factor > 98) + pointwise_conv_1d_latency_cl(data_tmp[98], res_tmp[98], weights, biases); + if (CONFIG_T::reuse_factor > 99) + pointwise_conv_1d_latency_cl(data_tmp[99], res_tmp[99], weights, biases); + if (CONFIG_T::reuse_factor > 100) + pointwise_conv_1d_latency_cl(data_tmp[100], res_tmp[100], weights, biases); + if (CONFIG_T::reuse_factor > 101) + pointwise_conv_1d_latency_cl(data_tmp[101], res_tmp[101], weights, biases); + if (CONFIG_T::reuse_factor > 102) + pointwise_conv_1d_latency_cl(data_tmp[102], res_tmp[102], weights, biases); + if (CONFIG_T::reuse_factor > 103) + pointwise_conv_1d_latency_cl(data_tmp[103], res_tmp[103], weights, biases); + if (CONFIG_T::reuse_factor > 104) + pointwise_conv_1d_latency_cl(data_tmp[104], res_tmp[104], weights, biases); + if (CONFIG_T::reuse_factor > 105) + pointwise_conv_1d_latency_cl(data_tmp[105], res_tmp[105], weights, biases); + if (CONFIG_T::reuse_factor > 106) + pointwise_conv_1d_latency_cl(data_tmp[106], res_tmp[106], weights, biases); + if (CONFIG_T::reuse_factor > 107) + pointwise_conv_1d_latency_cl(data_tmp[107], res_tmp[107], weights, biases); + if (CONFIG_T::reuse_factor > 108) + pointwise_conv_1d_latency_cl(data_tmp[108], res_tmp[108], weights, biases); + if (CONFIG_T::reuse_factor > 109) + pointwise_conv_1d_latency_cl(data_tmp[109], res_tmp[109], weights, biases); + if (CONFIG_T::reuse_factor > 110) + pointwise_conv_1d_latency_cl(data_tmp[110], res_tmp[110], weights, biases); + if (CONFIG_T::reuse_factor > 111) + pointwise_conv_1d_latency_cl(data_tmp[111], res_tmp[111], weights, biases); + if (CONFIG_T::reuse_factor > 112) + pointwise_conv_1d_latency_cl(data_tmp[112], res_tmp[112], weights, biases); + if (CONFIG_T::reuse_factor > 113) + pointwise_conv_1d_latency_cl(data_tmp[113], res_tmp[113], weights, biases); + if (CONFIG_T::reuse_factor > 114) + pointwise_conv_1d_latency_cl(data_tmp[114], res_tmp[114], weights, 
biases); + if (CONFIG_T::reuse_factor > 115) + pointwise_conv_1d_latency_cl(data_tmp[115], res_tmp[115], weights, biases); + if (CONFIG_T::reuse_factor > 116) + pointwise_conv_1d_latency_cl(data_tmp[116], res_tmp[116], weights, biases); + if (CONFIG_T::reuse_factor > 117) + pointwise_conv_1d_latency_cl(data_tmp[117], res_tmp[117], weights, biases); + if (CONFIG_T::reuse_factor > 118) + pointwise_conv_1d_latency_cl(data_tmp[118], res_tmp[118], weights, biases); + if (CONFIG_T::reuse_factor > 119) + pointwise_conv_1d_latency_cl(data_tmp[119], res_tmp[119], weights, biases); + +RFOutputLoop: + for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + #pragma HLS UNROLL + InnerOutputLoop: + for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) { #pragma HLS UNROLL - res[jj*CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii]; + res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii]; } } } From 69aecc6dc187a6e9a1ecdd2e7449629f1a88e87b Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Wed, 14 Jun 2023 17:27:20 -0700 Subject: [PATCH 08/10] add test --- hls4ml/backends/vivado/vivado_backend.py | 1 - test/pytest/test_pointwiseconv.py | 37 ++++++++++++------------ 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 1eb58f0952..1a99d90a8e 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -71,7 +71,6 @@ def _register_layer_attributes(self): for layer in cnn_layers: attrs = self.attribute_map.get(layer, []) - # attrs.append(ConfigurableAttribute('conv_implementation', value_type=str, default='LineBuffer')) attrs.append( ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded', 'Pointwise'], default='LineBuffer') ) diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index 28314fe130..080106955e 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -21,20 +21,22 @@ @pytest.mark.parametrize('padds', padds_options) @pytest.mark.parametrize('strides', strides1d_options) @pytest.mark.parametrize( - 'backend, io_type, strategy', + 'backend, io_type, strategy, conv_implementation', [ - ('Quartus', 'io_parallel', 'resource'), - ('Vivado', 'io_parallel', 'resource'), - ('Vitis', 'io_parallel', 'resource'), - ('Vivado', 'io_parallel', 'latency'), - ('Vitis', 'io_parallel', 'latency'), - ('Vivado', 'io_stream', 'latency'), - ('Vivado', 'io_stream', 'resource'), - ('Vitis', 'io_stream', 'latency'), - ('Vitis', 'io_stream', 'resource'), + ('Quartus', 'io_parallel', 'resource', 'LineBuffer'), + ('Vivado', 'io_parallel', 'resource', 'LineBuffer'), + ('Vitis', 'io_parallel', 'resource', 'LineBuffer'), + ('Vivado', 'io_parallel', 'latency', 'LineBuffer'), + ('Vitis', 'io_parallel', 'latency', 'LineBuffer'), + ('Vivado', 'io_parallel', 'latency', 'Pointwise'), + ('Vitis', 'io_parallel', 'latency', 'Pointwise'), + ('Vivado', 'io_stream', 'latency', 'LineBuffer'), + ('Vivado', 'io_stream', 'resource', 'LineBuffer'), + ('Vitis', 'io_stream', 'latency', 'LineBuffer'), + ('Vitis', 'io_stream', 'resource', 'LineBuffer'), ], ) -def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy): +def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv_implementation): model = tf.keras.models.Sequential() input_shape = (28, 3) model.add( @@ -47,6 +49,7 
@@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy): kernel_initializer='normal', use_bias=False, data_format=chans, + name='pointwise1d' ) ) model.compile(optimizer='adam', loss='mse') @@ -55,14 +58,13 @@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy): keras_prediction = model.predict(X_input) default_precision = 'ac_fixed<32,16,true>' if backend == 'Quartus' else 'ap_fixed<32,16>' - config = hls4ml.utils.config_from_keras_model(model, default_precision=default_precision) + config = hls4ml.utils.config_from_keras_model(model, default_precision=default_precision, granularity='name') config['Model']['Strategy'] = strategy + config['LayerName']['pointwise1d']['ConvImplementation'] = conv_implementation output_dir = str( test_root_path - / 'hls4mlprj_pointwise1d_{}_strides_{}_{}_padding_{}_{}_{}'.format( - chans, strides[0], padds, backend, io_type, strategy - ) + / f'hls4mlprj_pointwise1d_{chans}_strides_{strides[0]}_{padds}_padding_{backend}_{io_type}_{strategy}_{conv_implementation}' ) hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend @@ -100,6 +102,7 @@ def test_pointwiseconv2d(chans, padds, strides, backend, io_type, strategy): kernel_initializer='normal', use_bias=False, data_format=chans, + name='pointwise2d' ) ) @@ -114,9 +117,7 @@ def test_pointwiseconv2d(chans, padds, strides, backend, io_type, strategy): stride_cfg = str(strides).replace(', ', '_').replace('(', '').replace(')', '') output_dir = str( test_root_path - / 'hls4mlprj_pointwise2d_{}_strides_{}_{}_padding_{}_{}_{}'.format( - chans, stride_cfg, padds, backend, io_type, strategy - ) + / f'hls4mlprj_pointwise2d_{chans}_strides_{stride_cfg}_{padds}_padding_{backend}_{io_type}_{strategy}' ) hls_model = hls4ml.converters.convert_from_keras_model( From 4febceded10000b3b1b6b4254c9b9c230a9f475c Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Wed, 14 Jun 2023 17:48:44 -0700 Subject: [PATCH 09/10] pre-commit --- test/pytest/test_pointwiseconv.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index 080106955e..0cb75b7a87 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -49,7 +49,7 @@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv kernel_initializer='normal', use_bias=False, data_format=chans, - name='pointwise1d' + name='pointwise1d', ) ) model.compile(optimizer='adam', loss='mse') @@ -102,7 +102,7 @@ def test_pointwiseconv2d(chans, padds, strides, backend, io_type, strategy): kernel_initializer='normal', use_bias=False, data_format=chans, - name='pointwise2d' + name='pointwise2d', ) ) @@ -116,8 +116,7 @@ def test_pointwiseconv2d(chans, padds, strides, backend, io_type, strategy): config['Model']['Strategy'] = strategy stride_cfg = str(strides).replace(', ', '_').replace('(', '').replace(')', '') output_dir = str( - test_root_path - / f'hls4mlprj_pointwise2d_{chans}_strides_{stride_cfg}_{padds}_padding_{backend}_{io_type}_{strategy}' + test_root_path / f'hls4mlprj_pointwise2d_{chans}_strides_{stride_cfg}_{padds}_padding_{backend}_{io_type}_{strategy}' ) hls_model = hls4ml.converters.convert_from_keras_model( From 56797e73ecb1a830c28128387536308fd3f50beb Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Wed, 14 Jun 2023 17:53:37 -0700 Subject: [PATCH 10/10] pre-commit --- test/pytest/test_pointwiseconv.py | 8 ++++---- 1 
file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index 0cb75b7a87..cbe2036712 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -21,7 +21,7 @@ @pytest.mark.parametrize('padds', padds_options) @pytest.mark.parametrize('strides', strides1d_options) @pytest.mark.parametrize( - 'backend, io_type, strategy, conv_implementation', + 'backend, io_type, strategy, conv_impl', [ ('Quartus', 'io_parallel', 'resource', 'LineBuffer'), ('Vivado', 'io_parallel', 'resource', 'LineBuffer'), @@ -36,7 +36,7 @@ ('Vitis', 'io_stream', 'resource', 'LineBuffer'), ], ) -def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv_implementation): +def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv_impl): model = tf.keras.models.Sequential() input_shape = (28, 3) model.add( @@ -60,11 +60,11 @@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv default_precision = 'ac_fixed<32,16,true>' if backend == 'Quartus' else 'ap_fixed<32,16>' config = hls4ml.utils.config_from_keras_model(model, default_precision=default_precision, granularity='name') config['Model']['Strategy'] = strategy - config['LayerName']['pointwise1d']['ConvImplementation'] = conv_implementation + config['LayerName']['pointwise1d']['ConvImplementation'] = conv_impl output_dir = str( test_root_path - / f'hls4mlprj_pointwise1d_{chans}_strides_{strides[0]}_{padds}_padding_{backend}_{io_type}_{strategy}_{conv_implementation}' + / f'hls4mlprj_pointwise1d_{chans}_strides_{strides[0]}_{padds}_padding_{backend}_{io_type}_{strategy}_{conv_impl}' ) hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend
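
Note on the dispatch chain added in nnet_conv1d_latency.h above: the long run of "if (CONFIG_T::reuse_factor > N)" guards is a hand-unrolled dispatch. The input is scattered into reuse_factor contiguous slices (data_tmp), one pointwise_conv_1d_latency_cl instance processes each slice, and RFOutputLoop gathers the per-slice results back into res. The following loop-form sketch is illustrative only, not the patch's code: the RFInputLoop scatter is an assumption that mirrors the RFOutputLoop gather shown in the patch, and the real implementation presumably writes the calls out explicitly so the HLS compiler sees a fixed set of call sites for every supported reuse factor up to 120.

// Illustrative sketch of what the explicit chain computes (assumed, not verbatim).
template <class data_T, class res_T, typename CONFIG_T>
void pointwise_conv_1d_latency_cl_split_by_rf_sketch(
    data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
    res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
    typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
    typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {

    // Per-slice buffers: slice jj holds the jj-th contiguous block of the I/O.
    data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];
    res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];
    #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0
    #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0

RFInputLoop: // assumed scatter, mirroring the RFOutputLoop gather in the patch
    for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {
        #pragma HLS UNROLL
        for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {
            #pragma HLS UNROLL
            data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];
        }
    }

    // Each "if (CONFIG_T::reuse_factor > N)" guard in the chain is true exactly
    // for N = 0 .. reuse_factor-1, so the chain reduces to one call per slice.
DispatchLoop:
    for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {
        #pragma HLS UNROLL
        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[jj], res_tmp[jj], weights, biases);
    }

RFOutputLoop: // gather, as in the patch
    for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {
        #pragma HLS UNROLL
        for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) {
            #pragma HLS UNROLL
            res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];
        }
    }
}

On the user side, the new path is selected per layer exactly as the updated pytest does: build the config with granularity='name', then set config['LayerName']['pointwise1d']['ConvImplementation'] = 'Pointwise' (the latency-strategy guard in nnet_conv1d.h accepts ReuseFactor values from 1 through 120).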