diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/lstm.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/lstm.hpp deleted file mode 100644 index 76f59e3448e694..00000000000000 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/lstm.hpp +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once -#include "primitive.hpp" -#include "activation.hpp" -#include -#include -#include "intel_gpu/graph/serialization/activation_serializer.hpp" - -namespace cldnn { - -/// @brief Weights orders -/// @details Specifies the order in which the weights are concatenated. -/// e.g. [i, o, f, z] : [input, output, forget, block] -/// ONNX order: iofz -/// Caffe order: ifoz -/// pyTorch order: izof -/// OV order: fizo -enum class lstm_weights_order { - iofz, - ifoz, - izof, - fizo -}; - -struct lstm_elt : public primitive_base { - CLDNN_DECLARE_PRIMITIVE(lstm_elt) - - lstm_elt() : primitive_base("", {}), clip(0), input_forget(0), offset_order(lstm_weights_order::iofz), direction(0) {} - - using vec_activation = std::vector; - using vec_activation_param = std::vector; - - /// @brief Constructs lstm layer. - /// @param id This primitive id. - /// @param input input primitive id. - /// @param input cell Primitive id containing cell data. Provide empty string if using lstm without cell values. - /// @param clip Clip threshold. Provide 0 if using lstm without activations clip threshold. - /// @param input_forget Provide 0 if using lstm without coupled input-forget gates. - /// @param offset_order. Order of the concatenated weights, recurrent, and bias. ONNX default is iofz [input, output, forget, block]. - /// @param direction default = 0, bidirectional = 1. - lstm_elt(const primitive_id& id, - const input_info& input, - const primitive_id& cell = "", - const float clip = 0, - const bool input_forget = 0, - const std::vector activations = {activation_func::logistic, - activation_func::hyperbolic_tan, - activation_func::hyperbolic_tan}, - const std::vector activation_params = {}, - const lstm_weights_order offset_order = lstm_weights_order::iofz, - const uint32_t direction = 0) - : primitive_base(id, {input}), - cell(cell), - clip(clip), - input_forget(input_forget), - activations(activations), - activation_params(activation_params), - offset_order(offset_order), - direction(direction) {} - - /// @brief Primitive id containing the initial value of the cell state data. - primitive_id cell; - /// @brief Cell clip threshold T. It is applied to the input of activations [-T, T]. No clip is applied if it is not specified. - float clip; - /// @brief Couple the input and forget gates if input_forget is 1. Default is 0. - bool input_forget; - /// @brief A list of 3 activation functions for the input, output, forget, cell, and hidden. - std::vector activations; - /// @brief Optional scaling values used by some activation functions. The values are consumed in the order of activation functions. - std::vector activation_params; - /// @brief Weights, recurrent weights, and biases order. [iofz] : ONNX, [ifoz] : Caffe - lstm_weights_order offset_order; - /// @brief direction default = 0, bidirectional = 1. 
-    uint32_t direction;
-
-    size_t hash() const override {
-        size_t seed = primitive::hash();
-        seed = hash_combine(seed, clip);
-        seed = hash_combine(seed, input_forget);
-        seed = hash_range(seed, activations.begin(), activations.end());
-        for (auto& act_param : activation_params) {
-            seed = hash_combine(seed, act_param.a);
-            seed = hash_combine(seed, act_param.b);
-        }
-        seed = hash_combine(seed, offset_order);
-        seed = hash_combine(seed, direction);
-        seed = hash_combine(seed, cell.empty());
-        return seed;
-    }
-
-    bool operator==(const primitive& rhs) const override {
-        if (!compare_common_params(rhs))
-            return false;
-
-        auto rhs_casted = downcast<const lstm_elt&>(rhs);
-
-        bool act_params_eq = activation_params.size() == rhs_casted.activation_params.size();
-        for (size_t i = 0; i < activation_params.size(); ++i) {
-            act_params_eq &= activation_params[i].a == rhs_casted.activation_params[i].a &&
-                             activation_params[i].b == rhs_casted.activation_params[i].b;
-        }
-
-        #define cmp_fields(name) name == rhs_casted.name
-        return act_params_eq &&
-               cmp_fields(clip) &&
-               cmp_fields(input_forget) &&
-               cmp_fields(activations) &&
-               cmp_fields(offset_order) &&
-               cmp_fields(direction) &&
-               cmp_fields(cell.empty());
-        #undef cmp_fields
-    }
-
-    void save(BinaryOutputBuffer& ob) const override {
-        primitive_base::save(ob);
-        ob << cell;
-        ob << clip;
-        ob << input_forget;
-        ob << activations;
-        ob << activation_params;
-        ob << make_data(&offset_order, sizeof(lstm_weights_order));
-        ob << direction;
-    }
-
-    void load(BinaryInputBuffer& ib) override {
-        primitive_base::load(ib);
-        ib >> cell;
-        ib >> clip;
-        ib >> input_forget;
-        ib >> activations;
-        ib >> activation_params;
-        ib >> make_data(&offset_order, sizeof(lstm_weights_order));
-        ib >> direction;
-    }
-
-protected:
-    std::vector<input_info> get_dependencies() const override {
-        std::vector<input_info> ret;
-        if (!cell.empty())
-            ret.push_back(cell);
-        return ret;
-    }
-};
-
-} // namespace cldnn
diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/lstm_cell.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/lstm_cell.hpp
new file mode 100644
index 00000000000000..c53840d4bfd0c4
--- /dev/null
+++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/lstm_cell.hpp
@@ -0,0 +1,24 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include "primitive.hpp"
+#include "activation.hpp"
+#include <vector>
+#include <algorithm>
+#include "intel_gpu/graph/serialization/activation_serializer.hpp"
+#include "rnn.hpp"
+
+
+namespace cldnn {
+
+struct lstm_cell : public RNNParams<lstm_cell> {
+    CLDNN_DECLARE_PRIMITIVE(lstm_cell)
+    using vec_activation = std::vector<activation_func>;
+    using vec_activation_param = std::vector<activation_additional_params>;
+    using RNNParams::RNNParams;
+    lstm_cell(const lstm_cell&) = default;
+    lstm_cell() : RNNParams() {}
+};
+} // namespace cldnn
diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/rnn.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/rnn.hpp
new file mode 100644
index 00000000000000..ff167267c1aa24
--- /dev/null
+++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/rnn.hpp
@@ -0,0 +1,189 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include "primitive.hpp"
+#include "activation.hpp"
+#include <vector>
+#include <algorithm>
+#include <functional>
+#include "intel_gpu/graph/serialization/activation_serializer.hpp"
+
+namespace cldnn {
+
+/// @brief Weights orders
+/// @details Specifies the order in which the weights are concatenated.
+/// e.g. [i, o, f, z] : [input, output, forget, block]
+/// ONNX order: iofz
+/// Caffe order: ifoz
+/// pyTorch order: izof
+/// OV order: fizo
+enum class lstm_weights_order {
+    iofz,
+    ifoz,
+    izof,
+    fizo
+};
+
+template <typename PType>
+struct RNNParams : public primitive_base<PType> {
+    RNNParams() : primitive_base<PType>("", {}) {}
+    RNNParams(const RNNParams&) = default;
+    RNNParams(const primitive_id& id,
+              const input_info& x,
+              const input_info& initial_hidden_state,
+              const input_info& initial_cell_state,
+              const input_info& W,
+              const input_info& R,
+              const input_info& B,
+              const input_info& seq_lenghts,
+              const float clip = 0,
+              bool input_forget = false,
+              const std::vector<activation_func>& activations = {activation_func::logistic,
+                                                                 activation_func::hyperbolic_tan,
+                                                                 activation_func::hyperbolic_tan},
+              const std::vector<activation_additional_params>& activation_params = {},
+              const lstm_weights_order& offset_order = lstm_weights_order::iofz,
+              const ov::op::RecurrentSequenceDirection direction = ov::op::RecurrentSequenceDirection::FORWARD,
+              const padding& output_padding = padding(),
+              const int num_outputs = 1)
+        : primitive_base<PType>(id, {x}, num_outputs, {optional_data_type()}, {output_padding}),
+          x(x),
+          initial_hidden_state(initial_hidden_state),
+          initial_cell_state(initial_cell_state),
+          W(W),
+          R(R),
+          B(B),
+          seq_lenghts(seq_lenghts),
+          clip(clip),
+          input_forget(input_forget),
+          activations(activations),
+          activation_params(activation_params),
+          offset_order(offset_order),
+          direction(direction) {
+        std::vector<primitive_id> pids{initial_hidden_state.pid, initial_cell_state.pid, W.pid, R.pid, B.pid, seq_lenghts.pid};
+        for (auto pid : pids) {
+            if (!pid.empty()) {
+                primitive_base<PType>::input.push_back(pid);
+            }
+        }
+    }
+
+    input_info x;
+    input_info initial_hidden_state;
+    input_info initial_cell_state;
+    input_info W;
+    input_info R;
+    input_info B;
+    input_info seq_lenghts;
+    /// @brief Cell clip threshold T. It is applied to the input of activations [-T, T]. No clip is applied if it is not specified.
+    float clip;
+    bool input_forget;
+    /// @brief A list of 3 activation functions for the input, output, forget, cell, and hidden.
+    std::vector<activation_func> activations;
+    /// @brief Optional scaling values used by some activation functions. The values are consumed in the order of activation functions.
+    std::vector<activation_additional_params> activation_params;
+    /// @brief Weights, recurrent weights, and biases order. [iofz] : ONNX, [ifoz] : Caffe
+    lstm_weights_order offset_order;
+    /// @brief direction of LSTMSequence - only FORWARD or REVERSE, currently BIDIRECTIONAL not supported
+    ov::op::RecurrentSequenceDirection direction;
+
+    int num_directions() const {
+        return direction == ov::op::RecurrentSequenceDirection::BIDIRECTIONAL ? 2 : 1;
+    }
+
+    size_t hash() const override {
+        size_t seed = primitive::hash();
+        seed = hash_combine(seed, x.pid);
+        seed = hash_combine(seed, initial_hidden_state.pid);
+        seed = hash_combine(seed, initial_cell_state.pid);
+        seed = hash_combine(seed, seq_lenghts.pid);
+        seed = hash_combine(seed, W.pid);
+        seed = hash_combine(seed, R.pid);
+        seed = hash_combine(seed, B.pid);
+        seed = hash_combine(seed, clip);
+        seed = hash_range(seed, activations.begin(), activations.end());
+        for (auto& act_param : activation_params) {
+            seed = hash_combine(seed, act_param.a);
+            seed = hash_combine(seed, act_param.b);
+        }
+        seed = hash_combine(seed, offset_order);
+        seed = hash_combine(seed, direction);
+        return seed;
+    }
+
+    bool operator==(const primitive& rhs) const override {
+        if (!primitive::compare_common_params(rhs))
+            return false;
+
+        auto rhs_casted = downcast<const PType&>(rhs);
+        bool act_params_eq = activation_params.size() == rhs_casted.activation_params.size();
+        for (size_t i = 0; i < activation_params.size(); ++i) {
+            act_params_eq &= activation_params[i].a == rhs_casted.activation_params[i].a &&
+                             activation_params[i].b == rhs_casted.activation_params[i].b;
+        }
+
+        #define cmp_fields(name) name == rhs_casted.name
+        return act_params_eq &&
+               cmp_fields(x) &&
+               cmp_fields(initial_hidden_state) &&
+               cmp_fields(initial_cell_state) &&
+               cmp_fields(seq_lenghts) &&
+               cmp_fields(W) &&
+               cmp_fields(R) &&
+               cmp_fields(B) &&
+               cmp_fields(clip) &&
+               cmp_fields(activations) &&
+               cmp_fields(offset_order) &&
+               cmp_fields(direction);
+        #undef cmp_fields
+    }
+
+    void save(BinaryOutputBuffer& ob) const override {
+        primitive_base<PType>::save(ob);
+        ob << x;
+        ob << initial_hidden_state;
+        ob << initial_cell_state;
+        ob << W;
+        ob << R;
+        ob << B;
+        ob << seq_lenghts;
+        ob << clip;
+        ob << activations;
+        ob << activation_params;
+        ob << make_data(&offset_order, sizeof(lstm_weights_order));
+        ob << make_data(&direction, sizeof(ov::op::RecurrentSequenceDirection));
+    }
+
+    void load(BinaryInputBuffer& ib) override{
+        primitive_base<PType>::load(ib);
+        ib >> x;
+        ib >> initial_hidden_state;
+        ib >> initial_cell_state;
+        ib >> W;
+        ib >> R;
+        ib >> B;
+        ib >> seq_lenghts;
+        ib >> clip;
+        ib >> activations;
+        ib >> activation_params;
+        ib >> make_data(&offset_order, sizeof(lstm_weights_order));
+        ib >> make_data(&direction, sizeof(ov::op::RecurrentSequenceDirection));
+    }
+};
+
+struct lstm_seq : public RNNParams<lstm_seq> {
+    CLDNN_DECLARE_PRIMITIVE(lstm_seq)
+    using vec_activation = std::vector<activation_func>;
+    using vec_activation_param = std::vector<activation_additional_params>;
+    using RNNParams::RNNParams;
+    lstm_seq() : RNNParams() {
+        weights = W.pid;
+        input = x.pid;
+    }
+    lstm_seq(const lstm_seq&) = default;
+    primitive_id input;
+    primitive_id weights;
+};
+} //namespace cldnn
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp
index febcabd57efba0..199261772dcf2e 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp
@@ -56,6 +56,7 @@ static constexpr Property max_dynamic_batch{"DYN
 static constexpr Property<bool, PropertyMutability::RW> nv12_two_inputs{"GPU_NV12_TWO_INPUTS"};
 static constexpr Property<float, PropertyMutability::RW> buffers_preallocation_ratio{"GPU_BUFFERS_PREALLOCATION_RATIO"};
 static constexpr Property<size_t, PropertyMutability::RW> max_kernels_per_batch{"GPU_MAX_KERNELS_PER_BATCH"};
+static constexpr Property<bool, PropertyMutability::RW> use_onednn{"USE_ONEDNN"};
 
 } // namespace intel_gpu
 } // namespace ov
diff --git
a/src/plugins/intel_gpu/src/graph/concatenation.cpp b/src/plugins/intel_gpu/src/graph/concatenation.cpp index b493bb217b1c32..87dad139c10404 100644 --- a/src/plugins/intel_gpu/src/graph/concatenation.cpp +++ b/src/plugins/intel_gpu/src/graph/concatenation.cpp @@ -120,6 +120,9 @@ concatenation_inst::typed_primitive_inst(network& network, concatenation_node co if (dim == node.get_primitive()->axis) { concat_count += input_mem_size[dim]; } else { + if (i.first->get_outputs_count() > 1 && i.first->get_user_index(node) > 0) { + continue; + } CLDNN_ERROR_NOT_EQUAL(node.id(), "Input size dim: " + std::to_string(dim), input_size[dim], diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp index 9805b45ad005ed..407ed3b87fd4e8 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp @@ -9,7 +9,10 @@ #include "convolution_inst.h" #include "deconvolution_inst.h" #include "fully_connected_inst.h" +#include "lstm_seq_inst.h" #include "intel_gpu/runtime/format.hpp" +#include "permute_inst.h" +#include "crop_inst.h" #ifdef ENABLE_ONEDNN_FOR_GPU #include "graph/impls/onednn/utils.hpp" #endif // ENABLE_ONEDNN_FOR_GPU @@ -18,10 +21,17 @@ namespace cldnn { post_optimize_weights::post_optimize_weights(reorder_factory& rf_ref) : base_pass("post_optimize_weights"), _rf(rf_ref) {} -template post_optimize_weights::weights_bias_offset post_optimize_weights::get_weights_bias_offset(const T& node) { +template +post_optimize_weights::weights_bias_offset post_optimize_weights::get_weights_bias_offset(const T& node) { return weights_bias_offset(node.get_primitive()->input.size(), program_helpers::wrap_if_single(node.get_primitive()->weights).size()); } +template <> +post_optimize_weights::weights_bias_offset post_optimize_weights::get_weights_bias_offset(const lstm_seq_node& node) { + const int W_idx = 3; + return weights_bias_offset(W_idx, 3); +} + // function which prepares given primitive for weights optimization template void post_optimize_weights::optimize_weights(T& node, program& p) { @@ -109,15 +119,26 @@ void post_optimize_weights::optimize_weights(T& node, program& p) { set_implementation(weights_reorder_node); } } else { - auto weights_reorder = _rf.get_weights_reorder(prev_node.id(), weights_reorder_params); - // insert new weights reorder node to topology - p.add_intermediate(weights_reorder.first, node, i, !weights_reorder.second); - // set weights reorder's node output layout and implementation - auto& weights_reorder_node = node.get_dependency(i); - weights_reorder_node.get_output_layout(false); + if (node.type() == lstm_seq::type_id()) { + program_node& prev_node = node.get_dependency(i); + if (i == 5) { + add_lstm_bias_reorder(prev_node.id(), weights_reorder_params, p, prev_node, node); + } else { + add_lstm_weights_reorder(prev_node.id(), weights_reorder_params, p, prev_node, node, i); + } + auto& weights_reorder_node = node.get_dependency(i); + weights_reorder_node.get_output_layout(false); + } else { + auto weights_reorder = _rf.get_weights_reorder(prev_node.id(), weights_reorder_params); + // insert new weights reorder node to topology + p.add_intermediate(weights_reorder.first, node, i, !weights_reorder.second); + // set weights reorder's node output layout and implementation + auto& weights_reorder_node = node.get_dependency(i); + weights_reorder_node.get_output_layout(false); - if 
(!weights_reorder.second) { - set_implementation(weights_reorder_node); + if (!weights_reorder.second) { + set_implementation(weights_reorder_node); + } } } } @@ -126,6 +147,110 @@ void post_optimize_weights::optimize_weights(T& node, program& p) { node.set_output_layout(output_layout, false); } +void post_optimize_weights::select_implementation(program& p, program_node& node) { + node.set_selected_impl(node.type()->create_impl(node)); + if (auto impl = node.get_selected_impl()) { + auto params = node.get_kernel_impl_params(); + p.get_kernels_cache().add_kernels_source(*params, impl->get_kernels_source()); + } +} + +void post_optimize_weights::add_lstm_weights_reorder(primitive_id input_id, std::shared_ptr reorder_params, program& p, \ + cldnn::program_node& prev, cldnn::program_node& node, size_t i) { + OPENVINO_ASSERT(reorder_params != nullptr, "[GPU] WeightsReorderParams is not initialized."); + std::string reorder_id = input_id + "_reo_" + std::to_string(i); + const auto dir_num = static_cast(reorder_params->get_input_layout().get_shape()[0]); + auto hiddenSize = reorder_params->get_input_layout().get_shape()[1] / 4; + auto inputSize = static_cast(reorder_params->get_input_layout().get_shape()[2]); + int size_third; + const int W_idx = 3; + if (i == W_idx) { + size_third = inputSize; + } else { + size_third = static_cast(hiddenSize); + } + auto cropSizeR = cldnn::tensor{dir_num, static_cast(hiddenSize), 1, size_third}; + std::string crop_id_b = input_id + "_c"; + auto get_crop_node = [&](int cropNum) -> cldnn::program_node& { + auto crop_id = primitive_id(crop_id_b + std::to_string(cropNum)); + auto crop_prim = std::make_shared(crop_id, reorder_id, cropSizeR, cldnn::tensor{0, static_cast(cropNum*hiddenSize), 0, 0}); + return p.get_or_create(crop_prim); + }; + + auto& crop0_node = get_crop_node(0); + auto& crop1_node = get_crop_node(1); + auto crop2_id = primitive_id(crop_id_b + std::to_string(2)); + auto crop2_prim = std::make_shared(crop2_id, reorder_id, cldnn::tensor{dir_num, static_cast(2*hiddenSize), 1, size_third}, + cldnn::tensor{0, static_cast(2*hiddenSize), 0, 0}); + auto& crop2_node = p.get_or_create(crop2_prim); + std::vector con_input{input_info(crop_id_b + "1"), input_info(crop_id_b + "0"), input_info(crop_id_b + "2")}; + cldnn::primitive_id concat_id{input_id + "cont"}; + auto con = std::make_shared(concat_id, con_input, 1); + auto& con_node = p.get_or_create(con); + p.add_intermediate(con_node, node, prev, true); + p.add_intermediate(crop1_node, con_node, prev, true); + p.add_connection(prev, crop0_node, 0); + p.add_connection(prev, crop2_node, 0); + p.add_connection(crop0_node, con_node, 0); + p.add_connection(crop2_node, con_node, 0); + std::string permute_id = input_id + "_perx"; + std::vector ord{0, 2, 1}; + auto permute = std::make_shared(permute_id, input_info{concat_id}, ord); + auto& permute_node = p.get_or_create(permute); + p.add_intermediate(permute_node, node, con_node, true); + auto set_implementation_and_output = [this, &p](program_node& node) { + node.get_output_layout(false); + select_implementation(p, node); + p.mark_if_constant(node); + node.recalc_output_layout(false); + }; + set_implementation_and_output(crop1_node); + set_implementation_and_output(crop0_node); + set_implementation_and_output(crop2_node); + set_implementation_and_output(con_node); + set_implementation_and_output(permute_node); +} + +void post_optimize_weights::add_lstm_bias_reorder(primitive_id input_id, std::shared_ptr reorder_params, program& p, \ + cldnn::program_node& prev, 
cldnn::program_node& node) { + OPENVINO_ASSERT(reorder_params != nullptr, "[GPU] WeightsReorderParams is not initialized."); + const auto dir_num = static_cast(reorder_params->get_input_layout().get_shape()[0]); + auto hiddenSize = reorder_params->get_output_layout().get_shape()[1] / 4; + auto cropSize = cldnn::tensor{dir_num, static_cast(hiddenSize), 1, 1}; + std::string crop_id_b = input_id + "_c"; + auto get_crop_node = [&](int cropNum) -> cldnn::program_node& { + auto crop_id = primitive_id(crop_id_b + std::to_string(cropNum)); + auto crop_prim = std::make_shared(crop_id, input_id, cropSize, cldnn::tensor{0, static_cast(cropNum*hiddenSize), 0, 0}); + return p.get_or_create(crop_prim); + }; + auto& crop0_node = get_crop_node(0); + auto& crop1_node = get_crop_node(1); + auto crop2_id = primitive_id(crop_id_b + std::to_string(2)); + auto crop2_prim = std::make_shared(crop2_id, input_id, cldnn::tensor{dir_num, static_cast(2*hiddenSize), 1, 1}, + cldnn::tensor{0, static_cast(2*hiddenSize), 0, 0}); + auto& crop2_node = p.get_or_create(crop2_prim); + std::vector con_input{input_info(crop1_node.id()), input_info(crop0_node.id()), input_info(crop2_node.id())}; + cldnn::primitive_id concat_id{input_id + "concat"}; + auto con = std::make_shared(concat_id, con_input, 1); + auto& con_node = p.get_or_create(con); + p.add_intermediate(con_node, node, prev, true); + p.add_intermediate(crop1_node, con_node, prev, true); + p.add_connection(prev, crop0_node, 0); + p.add_connection(prev, crop2_node, 0); + p.add_connection(crop0_node, con_node, 0); + p.add_connection(crop2_node, con_node, 0); + auto set_implementation_and_output = [this, &p](program_node& node) { + node.get_output_layout(false); + select_implementation(p, node); + p.mark_if_constant(node); + node.recalc_output_layout(false); + }; + set_implementation_and_output(crop0_node); + set_implementation_and_output(crop1_node); + set_implementation_and_output(crop2_node); + set_implementation_and_output(con_node); +} + void post_optimize_weights::run(program& p) { for (auto& node : p.get_processing_order()) { if (node->is_type()) { @@ -134,8 +259,11 @@ void post_optimize_weights::run(program& p) { optimize_weights(node->as(), p); } else if (node->is_type()) { optimize_weights(node->as(), p); + } else if (node->is_type()) { + optimize_weights(node->as(), p); } } + p.get_processing_order().calc_processing_order(p); } } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index e94714c84fdebf..de7f51b071ae53 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -17,11 +17,12 @@ #include "depth_to_space_inst.h" #include "resample_inst.h" #include "loop_inst.h" -#include "lstm_elt_inst.h" +#include "lstm_cell_inst.h" #include "strided_slice_inst.h" #include "shape_of_inst.h" #include "non_max_suppression_inst.h" #include "experimental_detectron_roi_feature_extractor_inst.hpp" +#include "lstm_seq_inst.h" #include "border_inst.h" #include "pass_manager.h" @@ -504,6 +505,8 @@ bool crop_in_place_optimization::match(const program_node& node, } if (user->is_type() && user->get_dependency_index(node) == 0) return false; + if (user->is_type() || user->is_type()) + return false; } // do not optimize crop, that must be calculated in propagate_constants @@ -519,10 +522,6 @@ bool crop_in_place_optimization::match(const 
program_node& node, return false; if (node.get_users().size() > 0) { - if (node.get_program().is_body_program() && node.get_dependency(0).is_type()) { - return false; - } - GPU_DEBUG_GET_INSTANCE(debug_config); GPU_DEBUG_IF(debug_config->disable_runtime_buffer_fusing && node.is_dynamic()) { return false; diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index c323109850c489..60d1e8aa7e10b7 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -439,7 +439,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { }; auto conv_supports_fusings = [&](convolution_node& node) -> bool { - if (lo.get_optimization_attributes().use_onednn_impls == 1 && + if (lo.has_all_enabled_onednn_impls_optimization_attribute() && lo.get_preferred_impl_type(node, format::byxf /*dummy value to disable format checking*/) == impl_types::onednn) { return true; } @@ -491,7 +491,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { }; auto fc_supports_fusings = [&](fully_connected_node& node) -> bool { - if (lo.get_optimization_attributes().use_onednn_impls && + if (lo.has_all_enabled_onednn_impls_optimization_attribute() && lo.get_preferred_impl_type(node, format::any /*dummy*/) == impl_types::onednn) { return true; } else { @@ -589,7 +589,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { // Do not fuse if the estimated format is fs_b_yx_fsv32 because the optimized kernel does not support fusion if (out_layout.data_type == data_types::f16 && out_layout.is_static() && out_layout.batch() > 1 && ((lo.get_optimization_attributes().fs_b_yx_fsv32_network && - !lo.get_optimization_attributes().use_onednn_impls && !has_reorder_behind_mvn()) || + !lo.has_all_enabled_onednn_impls_optimization_attribute() && !has_reorder_behind_mvn()) || out_layout.format == format::fs_b_yx_fsv32)) { return false; } @@ -665,7 +665,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { if (input.in_shape_of_subgraph || node->in_shape_of_subgraph) return; - if (lo.get_optimization_attributes().use_onednn_impls) { + if (lo.has_all_enabled_onednn_impls_optimization_attribute()) { if (input.is_type() || input.is_type()) return; auto additional_params_input = activation_node.get_primitive()->additional_params_input; @@ -768,7 +768,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { return; // Onednn reorder does not support eltwise nor binary post operation - if (lo.get_optimization_attributes().use_onednn_impls && input.is_type()) { + if (lo.has_all_enabled_onednn_impls_optimization_attribute() && input.is_type()) { return; } @@ -809,7 +809,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { (lo.should_select_b_fs_yx_fsv16_layout(input_data.as(), input_data.get_input_layout(1)) && !is_grouped_conv(input_data.as())) || // Avoid fusing to b_fs_yx_fsv16 (and similar) kernels - lo.get_optimization_attributes().use_onednn_impls || + (lo.has_all_enabled_onednn_impls_optimization_attribute()) || (in_dt_is_i8_u8 && out_dt_is_i8_u8)); should_fuse |= input_data.is_type() && quantize_node.get_scale_shift_opt(); @@ -1067,7 +1067,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { } } - if (lo.get_optimization_attributes().use_onednn_impls && 
lo.is_primitive_implemented_for_onednn(*fused_node)) { + if (lo.has_all_enabled_onednn_impls_optimization_attribute() && lo.is_primitive_implemented_for_onednn(*fused_node)) { auto eltw_in_size = peer_node->get_output_layout(); if (eltw_in_size.is_dynamic() // this whitelist condition is temporarily and to be relaxed soon. diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp index 28ee84c4a4ec02..1e5f943600fc05 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp @@ -489,7 +489,7 @@ void remove_redundant_reorders::run(program& p) { (dep.get_output_layout().format == format::b_fs_yx_fsv16 || dep.get_output_layout().format == format::bfyx || (dep.get_output_layout().format == format::fs_b_yx_fsv32 && - !lo.get_optimization_attributes().use_onednn_impls)); + !lo.has_all_enabled_onednn_impls_optimization_attribute())); auto convert_color_opt = usr->is_type() && prim_desc->has_surface_input(); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp index 213da8cb0ab606..218b6268f9d340 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp @@ -56,9 +56,9 @@ std::map get_preferred_formats(program& p, layout_o onednn_impls_counter++; } - if (lo.get_optimization_attributes().use_onednn_impls && onednn_impls_counter < 1) { + if (!lo.is_empty_onednn_impls_optimization_attribute() && onednn_impls_counter < 1) { should_update_fmt_map = true; - lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::use_onednn_impls, 0); + lo.clear_onednn_impls_optimization_attribute(); GPU_DEBUG_LOG << "Disable oneDNN implementations globally" << std::endl; } diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/select_preferred_formats.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/select_preferred_formats.cpp index 8a1197dfb843a6..fcd6dab33754fd 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/select_preferred_formats.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/select_preferred_formats.cpp @@ -69,7 +69,7 @@ void select_preferred_formats::run(program& p) { #ifdef ENABLE_ONEDNN_FOR_GPU auto& engine = p.get_engine(); - if (p.get_layout_optimizer().get_optimization_attributes().use_onednn_impls) { + if (!p.get_layout_optimizer().is_empty_onednn_impls_optimization_attribute()) { engine.create_onednn_engine(p.get_config()); } #endif // ENABLE_ONEDNN_FOR_GPU diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp index 409fd824063da6..0a999a5a124d3b 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp @@ -309,6 +309,8 @@ kernel_selector::data_layout to_data_layout(format f) { return kernel_selector::data_layout::bfzyx; case format::bzyxf: return kernel_selector::data_layout::bzyxf; + case format::ybfx: + return kernel_selector::data_layout::ybfx; case format::fs_b_yx_fsv32: return kernel_selector::data_layout::fs_b_yx_fsv32; case format::bfwzyx: diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_cell.cpp 
b/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_cell.cpp new file mode 100644 index 00000000000000..a41cd1065122de --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_cell.cpp @@ -0,0 +1,93 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "primitive_base.hpp" + +#include "lstm_cell_inst.h" +#include "lstm/lstm_cell_and_seq_kernel_selector.h" +#include "lstm/lstm_kernel_base.h" +#include "openvino/op/lstm_cell.hpp" +#include "lstm_cell.hpp" + +namespace cldnn { +namespace ocl { + +struct lstm_cell_impl : typed_primitive_impl_ocl { + using parent = typed_primitive_impl_ocl; + using parent::parent; + using kernel_selector_t = kernel_selector::lstm_cell_and_seq_kernel_selector; + using kernel_params_t = kernel_selector::lstm_params; + + DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::lstm_cell_impl) + + std::unique_ptr clone() const override { + return make_unique(*this); + } + +protected: + kernel_arguments_data get_arguments(const typed_primitive_inst& instance) const override { + kernel_arguments_data args; + for (size_t i = 0; i < instance.inputs_memory_count(); i++) { + args.inputs.push_back(instance.input_memory_ptr(i)); + } + + for (size_t i = 0; i < instance.outputs_memory_count(); i++) { + args.outputs.push_back(instance.output_memory_ptr(i)); + } + return args; + } + +public: + static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) { + const auto& primitive = impl_param.typed_desc(); + auto params = get_default_params(impl_param); + for (size_t i = 1; i < 6; ++i) { + params.inputs.push_back(convert_data_tensor(impl_param.get_input_layout(i))); + } + + if (!primitive->activations.empty()) { + auto a_sz = primitive->activations.size(); + auto param_sz = primitive->activation_params.size(); + OPENVINO_ASSERT(param_sz == 0 || a_sz == param_sz, "[GPU] Unexpected activation params count in lstm_cell impl: ", param_sz); + for (size_t i = 0; i < a_sz; i++) { + params.activations.emplace_back(get_kernel_selector_activation_param(primitive->activations[i]), + param_sz ? primitive->activation_params[i].a : 0.0f, + param_sz ? 
primitive->activation_params[i].b : 0.0f); + } + } + + if (primitive->clip > 0.0f) { + params.activations.emplace_back(get_kernel_selector_activation_param(activation_func::clamp), -primitive->clip, primitive->clip); + } + + params.SetOffsetOrder(static_cast(primitive->offset_order)); + params.clip = primitive->clip; + params.direction = primitive->direction; + + return params; + } + + static kernel_impl_params static_canonicalize_shapes(const kernel_impl_params& impl_params) { + if (impl_params.get_input_layout().get_partial_shape().size() != 3) { + return primitive_impl::static_canonicalize_shapes(impl_params); + } + auto updated_impl_params = canonicalize_fused_shapes(impl_params); + return updated_impl_params; + } + + kernel_impl_params canonicalize_shapes(const kernel_impl_params& impl_params) const override { + return static_canonicalize_shapes(impl_params); + } +}; + +std::unique_ptr LSTMCellImplementationManager::create_impl(const program_node& node, const kernel_impl_params& params) const { + OPENVINO_ASSERT(node.is_type()); + return typed_primitive_impl_ocl::create(static_cast(node), params); +} + +} // namespace ocl +} // namespace cldnn + +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::ocl::lstm_cell_impl) +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::lstm_cell) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_cell.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_cell.hpp new file mode 100644 index 00000000000000..731bacf2e17e4f --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_cell.hpp @@ -0,0 +1,47 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "lstm_cell_inst.h" +#include "impls/registry/implementation_manager.hpp" +#include "intel_gpu/runtime/layout.hpp" + +#include +namespace cldnn { +namespace ocl { + +struct LSTMCellImplementationManager: public ImplementationManager { + OV_GPU_PRIMITIVE_IMPL("ocl::lstm_cell") + LSTMCellImplementationManager(shape_types shape_type, ValidateFunc vf = nullptr) : ImplementationManager(impl_types::ocl, shape_type, vf) {} + + std::unique_ptr create_impl(const program_node& node, const kernel_impl_params& params) const override; + + bool validate_impl(const program_node& node) const override { + assert(node.is_type()); + + const auto& input_layout = node.get_input_layout(0); + const auto& output_layout = node.get_output_layout(0); + + auto input_fmt = input_layout.format; + auto output_fmt = output_layout.format; + auto in_dt = input_layout.data_type; + auto out_dt = output_layout.data_type; + static const std::vector supported_formats = { + format::bfyx, + format::fyxb, + }; + static const std::vector supported_data_types = { + data_types::f32, + data_types::f16, + }; + + if (!one_of(in_dt, supported_data_types) || !one_of(out_dt, supported_data_types)) { + return false; + } + + return one_of(input_fmt.value, supported_formats) && one_of(output_fmt.value, supported_formats); + } +}; + +} // namespace ocl +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_elt.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_elt.cpp deleted file mode 100644 index 5de12d83fdbab3..00000000000000 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_elt.cpp +++ /dev/null @@ -1,137 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "primitive_base.hpp" - -#include "lstm_elt_inst.h" -#include "lstm/lstm_elt_kernel_selector.h" -#include "lstm/lstm_elt_kernel_base.h" - -namespace cldnn { -namespace ocl { - 
-struct lstm_elt_impl : typed_primitive_impl_ocl { - using parent = typed_primitive_impl_ocl; - using parent::parent; - using kernel_selector_t = kernel_selector::lstm_elt_kernel_selector; - using kernel_params_t = kernel_selector::lstm_elt_params; - - DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::lstm_elt_impl) - - std::unique_ptr clone() const override { - return make_deep_copy(*this); - } - -protected: - kernel_arguments_data get_arguments(const typed_primitive_inst& instance) const override { - kernel_arguments_data args = parent::get_arguments(instance); - - args.cell = instance.cell_term() ? instance.cell_memory() : nullptr; - args.outputs = { instance.output_memory_ptr() }; - - return args; - } - -public: - static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) { - const auto& primitive = impl_param.typed_desc(); - auto params = get_default_params(impl_param); - - if (!primitive->cell.empty()) { - const auto& cell_idx = 1; - const auto& cell_layout = impl_param.input_layouts[cell_idx]; - params.SetCell(convert_data_tensor(cell_layout)); - // TODO: make a generic function to get the direction - if (cell_layout.spatial(1) > 1) { - params.cell_direction = primitive->direction; - } - } - - if (!primitive->activations.empty()) { - auto a_sz = primitive->activations.size(); - auto param_sz = primitive->activation_params.size(); - OPENVINO_ASSERT(param_sz == 0|| a_sz == param_sz, "[GPU] Unexpected activation params count in lstm_elt impl: ", param_sz); - for (size_t i = 0; i < a_sz; i++) { - params.activations.emplace_back(get_kernel_selector_activation_param(primitive->activations[i]), - param_sz ? primitive->activation_params[i].a : 0.0f, - param_sz ? primitive->activation_params[i].b : 0.0f); - } - } - - if (primitive->clip > 0.0f) { - params.activations.emplace_back(get_kernel_selector_activation_param(activation_func::clamp), -primitive->clip, primitive->clip); - } - - params.SetOffsetOrder(static_cast(primitive->offset_order)); - params.clip = primitive->clip; - params.input_forget = primitive->input_forget; - params.direction = primitive->direction; - - return params; - } - - static kernel_impl_params static_canonicalize_shapes(const kernel_impl_params& impl_params) { - if (impl_params.get_input_layout().get_partial_shape().size() != 2) { - return primitive_impl::static_canonicalize_shapes(impl_params); - } - auto updated_impl_params = canonicalize_fused_shapes(impl_params); - - auto& input_layout = updated_impl_params.input_layouts[0]; - auto& weights_layout = updated_impl_params.input_layouts[1]; - auto& output_layout = updated_impl_params.output_layouts[0]; - - auto input_pshape = input_layout.get_partial_shape(); - auto weights_pshape = weights_layout.get_partial_shape(); - auto output_pshape = output_layout.get_partial_shape(); - - auto lstm_input_size = static_cast(input_pshape[1].get_length()); - auto lstm_batch_size = static_cast(input_pshape[0].get_length()); - auto lstm_hidden_size = static_cast(lstm_input_size / 4); - - GPU_DEBUG_LOG << "lstm_input_size : " << lstm_input_size << std::endl; - GPU_DEBUG_LOG << "lstm_batch_size : " << lstm_batch_size << std::endl; - GPU_DEBUG_LOG << "lstm_hidden_size : " << lstm_hidden_size << std::endl; - - GPU_DEBUG_LOG << "origin input_pshape : " << input_layout.to_short_string() << std::endl; - GPU_DEBUG_LOG << "origin weights_layout : " << weights_layout.to_short_string() << std::endl; - - input_pshape = {lstm_batch_size, 1, 1, lstm_input_size}; - input_layout.set_partial_shape(input_pshape); - - weights_pshape = 
{lstm_batch_size, 1, 1, lstm_hidden_size}; // {batch, direction, 1, hidden_size} - weights_layout.format = format::adjust_to_rank(weights_layout.format, weights_pshape.size()); - weights_layout.set_partial_shape(weights_pshape); - - updated_impl_params.weights_layout = weights_layout; - - GPU_DEBUG_LOG << "input_layout : " << input_layout.to_short_string() << std::endl; - GPU_DEBUG_LOG << "weights_layout : " << weights_layout.to_short_string() << std::endl; - GPU_DEBUG_LOG << "output_layout : " << output_layout.to_short_string() << std::endl; - - OPENVINO_ASSERT(input_pshape.size() == 4 && weights_pshape.size() == 4, "input and weights shape should be rank 4"); - return updated_impl_params; - } - - kernel_impl_params canonicalize_shapes(const kernel_impl_params& impl_params) const override { - return static_canonicalize_shapes(impl_params); - } -}; - -namespace detail { - -attach_lstm_elt_impl::attach_lstm_elt_impl() { - implementation_map::add(impl_types::ocl, typed_primitive_impl_ocl::create, { - std::make_tuple(data_types::f32, format::bfyx), - std::make_tuple(data_types::f16, format::bfyx), - std::make_tuple(data_types::f32, format::fyxb), - std::make_tuple(data_types::f16, format::fyxb), - }); -} - -} // namespace detail -} // namespace ocl -} // namespace cldnn - -BIND_BINARY_BUFFER_WITH_TYPE(cldnn::ocl::lstm_elt_impl) -BIND_BINARY_BUFFER_WITH_TYPE(cldnn::lstm_elt) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp index 2597e419e66a41..9b31e70d4ab69d 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp @@ -39,7 +39,6 @@ void register_implementations() { REGISTER_OCL(kv_cache); REGISTER_OCL(paged_attention); REGISTER_OCL(lrn); - REGISTER_OCL(lstm_elt); REGISTER_OCL(multiclass_nms); REGISTER_OCL(multinomial); REGISTER_OCL(mutable_data); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp index d4b08b5154ef4b..906210f08252a4 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp @@ -113,7 +113,6 @@ REGISTER_OCL(group_normalization); REGISTER_OCL(kv_cache); REGISTER_OCL(paged_attention); REGISTER_OCL(lrn); -REGISTER_OCL(lstm_elt); REGISTER_OCL(multiclass_nms); REGISTER_OCL(multinomial); REGISTER_OCL(mutable_data); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/rnn_seq.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/rnn_seq.cpp new file mode 100644 index 00000000000000..3fb8ae13d3baa4 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/rnn_seq.cpp @@ -0,0 +1,94 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "primitive_base.hpp" + +#include "lstm_seq_inst.h" +#include "rnn_seq.hpp" +#include "lstm/lstm_cell_and_seq_kernel_selector.h" +#include "lstm/lstm_kernel_base.h" +#include "openvino/op/lstm_sequence.hpp" +#include "impls/registry/implementation_manager.hpp" + +namespace cldnn { +namespace ocl { + +struct rnn_seq_impl : typed_primitive_impl_ocl { + using parent = typed_primitive_impl_ocl; + using parent::parent; + using kernel_selector_t = kernel_selector::lstm_cell_and_seq_kernel_selector; + using kernel_params_t = kernel_selector::lstm_params; + + DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::rnn_seq_impl) + + std::unique_ptr clone() const override { + return make_unique(*this); + } + +protected: + 
kernel_arguments_data get_arguments(const typed_primitive_inst& instance) const override { + kernel_arguments_data args; + for (size_t i = 0; i < instance.inputs_memory_count(); i++) { + args.inputs.push_back(instance.input_memory_ptr(i)); + } + + for (size_t i = 0; i < instance.outputs_memory_count(); i++) { + args.outputs.push_back(instance.output_memory_ptr(i)); + } + return args; + } + +public: + static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) { + const auto& primitive = impl_param.typed_desc(); + auto params = get_default_params(impl_param); + params.sequential = true; + for (size_t i = 1; i < impl_param.input_layouts.size(); ++i) { + params.inputs.push_back(convert_data_tensor(impl_param.get_input_layout(i))); + } + + if (!primitive->activations.empty()) { + auto a_sz = primitive->activations.size(); + auto param_sz = primitive->activation_params.size(); + OPENVINO_ASSERT(param_sz == 0|| a_sz == param_sz, "[GPU] Unexpected activation params count in lstm_seq impl: ", param_sz); + for (size_t i = 0; i < a_sz; i++) { + params.activations.emplace_back(get_kernel_selector_activation_param(primitive->activations[i]), + param_sz ? primitive->activation_params[i].a : 0.0f, + param_sz ? primitive->activation_params[i].b : 0.0f); + } + } + + if (primitive->clip > 0.0f) { + params.activations.emplace_back(get_kernel_selector_activation_param(activation_func::clamp), -primitive->clip, primitive->clip); + } + + params.SetOffsetOrder(static_cast(primitive->offset_order)); + params.clip = primitive->clip; + params.direction = primitive->direction; + return params; + } + + static kernel_impl_params static_canonicalize_shapes(const kernel_impl_params& impl_params) { + if (impl_params.get_input_layout().get_partial_shape().size() != 3) { + return primitive_impl::static_canonicalize_shapes(impl_params); + } + auto updated_impl_params = canonicalize_fused_shapes(impl_params); + return updated_impl_params; + } + + kernel_impl_params canonicalize_shapes(const kernel_impl_params& impl_params) const override { + return static_canonicalize_shapes(impl_params); + } +}; + +std::unique_ptr RNNSeqImplementationManager::create_impl(const program_node& node, const kernel_impl_params& params) const { + OPENVINO_ASSERT(node.is_type()); + return typed_primitive_impl_ocl::create(static_cast(node), params); +} + +} // namespace ocl +} // namespace cldnn + +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::ocl::rnn_seq_impl) +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::lstm_seq) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/rnn_seq.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/rnn_seq.hpp new file mode 100644 index 00000000000000..3e71ad2be51192 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/rnn_seq.hpp @@ -0,0 +1,46 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "lstm_seq_inst.h" +#include "impls/registry/implementation_manager.hpp" +#include "intel_gpu/runtime/layout.hpp" + +#include +namespace cldnn { +namespace ocl { + +struct RNNSeqImplementationManager: public ImplementationManager { + OV_GPU_PRIMITIVE_IMPL("ocl::lstm_seq") + RNNSeqImplementationManager(shape_types shape_type, ValidateFunc vf = nullptr) : ImplementationManager(impl_types::ocl, shape_type, vf) {} + + std::unique_ptr create_impl(const program_node& node, const kernel_impl_params& params) const override; + + bool validate_impl(const program_node& node) const override { + assert(node.is_type()); + + const auto& input_layout = node.get_input_layout(0); + const 
auto& output_layout = node.get_output_layout(0); + + auto input_fmt = input_layout.format; + auto output_fmt = output_layout.format; + auto in_dt = input_layout.data_type; + auto out_dt = output_layout.data_type; + static const std::vector supported_formats = { + format::bfyx + }; + static const std::vector supported_data_types = { + data_types::f32, + data_types::f16, + }; + + if (!one_of(in_dt, supported_data_types) || !one_of(out_dt, supported_data_types)) { + return false; + } + + return one_of(input_fmt.value, supported_formats) && one_of(output_fmt.value, supported_formats); + } +}; + +} // namespace ocl +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/lstm_seq_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/lstm_seq_onednn.cpp new file mode 100644 index 00000000000000..167be7be6e7481 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/lstm_seq_onednn.cpp @@ -0,0 +1,218 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "impls/onednn/utils.hpp" +#include "lstm_seq_inst.h" +#include "primitive_onednn_base.h" +#include "lstm_seq_onednn.hpp" +#include "impls/registry/implementation_map.hpp" + +#include "kernel_selector_common.h" + +#include + +#include +#include +namespace cldnn { +namespace onednn { + +struct lstm_seq_onednn : typed_primitive_onednn_impl { + using parent = typed_primitive_onednn_impl; + using parent::parent; + + DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::onednn::lstm_seq_onednn) + +protected: + std::unique_ptr clone() const override { + return make_unique(*this); + } + + std::unordered_map get_arguments(lstm_seq_inst& instance) const override { + std::unordered_map args; + std::vector> dnnl_arg{{DNNL_ARG_SRC_LAYER, DNNL_ARG_SRC_ITER, DNNL_ARG_SRC_ITER_C}, {DNNL_ARG_WEIGHTS_LAYER, + DNNL_ARG_WEIGHTS_ITER, DNNL_ARG_BIAS}, {DNNL_ARG_DST_LAYER, DNNL_ARG_DST_ITER, DNNL_ARG_DST_ITER_C}}; + + for (int i = 0; i < 3; i++) { + for (int j = 0 ; j < 3; j++) { + dnnl::memory mem; + switch (i) { + case 0: + { + auto& input = instance.input_memory(j); + auto offset = onednn::get_offset(instance.get_input_layout(j), _pd.dnnl::primitive_desc_base::src_desc(j)); + mem = input.get_onednn_memory(_pd.dnnl::primitive_desc_base::src_desc(j), offset); + break; + } + case 1: + { + auto& input = instance.input_memory(3+j); + auto offset = onednn::get_offset(instance.get_input_layout(3+j), _pd.dnnl::primitive_desc_base::weights_desc(j)); + mem = input.get_onednn_memory(_pd.dnnl::primitive_desc_base::weights_desc(j), offset); + break; + } + case 2: + { + auto& output = instance.output_memory(j); + auto offset = onednn::get_offset(instance.get_output_layout(j), _pd.dnnl::primitive_desc_base::dst_desc(j)); + mem = output.get_onednn_memory(_pd.dnnl::primitive_desc_base::dst_desc(j), offset); + break; + } + default: + break; + } + args.insert({dnnl_arg[i][j], mem}); + } + } + return args; + } + + static cldnn::layout get_reorder_layout(const kernel_impl_params& impl_params, size_t layout_nr) { + auto weights_shape = impl_params.get_input_layout(layout_nr).get_shape(); + auto target_weights_layout = impl_params.get_input_layout(layout_nr); + target_weights_layout.format = cldnn::format::bfzyx; + auto layout = target_weights_layout.clone_with_other_shape(ov::Shape{weights_shape[0], weights_shape[1], weights_shape[2], 1, 1}); + return layout; + } + + static std::shared_ptr get_weights_reorder(const kernel_impl_params& impl_params, const dnnl::primitive_desc& pd) { + const auto weights_layout_idx = 3; + 
auto source_weights_layout = impl_params.get_input_layout(weights_layout_idx); + auto target_weights_layout = get_reorder_layout(impl_params, weights_layout_idx); + auto W_desc = onednn::layout_to_memory_desc(source_weights_layout); + auto grouped_weights = format::is_grouped(source_weights_layout.format); + + return std::make_shared(source_weights_layout, + target_weights_layout, + W_desc, + W_desc, + false, + grouped_weights); + } + static std::shared_ptr get_lstm_primitive_descriptor(const kernel_impl_params& impl_params, cldnn::engine& engine, + const dnnl::primitive_attr& attr, + ov::op::RecurrentSequenceDirection direction) { + auto prim = impl_params.typed_desc(); + auto num_dir = static_cast(prim->num_directions()); + const auto& src_shape = impl_params.get_input_layout(0).get_shape(); + auto mod_src_shape = src_shape; + std::swap(mod_src_shape[0], mod_src_shape[1]); + auto input_md = onednn::layout_to_memory_desc(impl_params.get_input_layout(0).clone_with_other_shape(mod_src_shape), dnnl::memory::format_tag::abc); + auto initial_hidden_shape_mod = impl_params.get_input_layout(1).get_shape(); + initial_hidden_shape_mod = { 1, num_dir, initial_hidden_shape_mod[0], initial_hidden_shape_mod[2] }; + auto initial_hidden = onednn::layout_to_memory_desc(impl_params.get_input_layout(1).clone_with_other_shape(initial_hidden_shape_mod)); + auto initial_cell = onednn::layout_to_memory_desc(impl_params.get_input_layout(2).clone_with_other_shape(initial_hidden_shape_mod)); + auto W_shape_mod = impl_params.get_input_layout(3).get_shape(); + W_shape_mod = {1, num_dir, W_shape_mod[2], 4, W_shape_mod[1]/4}; + auto w_layout = impl_params.get_input_layout(3).clone_with_other_shape(W_shape_mod); + w_layout.format = cldnn::format::bfzyx; + auto W_md = onednn::layout_to_memory_desc(w_layout); + auto R_shape_mod = impl_params.get_input_layout(4).get_shape(); + R_shape_mod = {1, num_dir, R_shape_mod[2], 4, R_shape_mod[1]/4}; + auto r_layout = impl_params.get_input_layout(4).clone_with_other_shape(R_shape_mod); + r_layout.format = cldnn::format::bfzyx; + auto R_md = onednn::layout_to_memory_desc(r_layout); + auto B_shape_mod = impl_params.get_input_layout(5).get_shape(); + B_shape_mod = {1, num_dir, 4, B_shape_mod[1]/4}; + auto b_layout = impl_params.get_input_layout(5).clone_with_other_shape(B_shape_mod); + b_layout.format = cldnn::format::bfyx; + auto B_md = onednn::layout_to_memory_desc(b_layout); + auto out_shape = impl_params.get_output_layout().get_shape(); + out_shape = {out_shape[2], out_shape[0], out_shape[3]*num_dir}; + auto output_md = onednn::layout_to_memory_desc(impl_params.get_output_layout().clone_with_other_shape(out_shape), dnnl::memory::format_tag::abc); + auto output1_md = onednn::layout_to_memory_desc(impl_params.get_output_layout(1).clone_with_other_shape(initial_hidden_shape_mod)); + auto output2_md = onednn::layout_to_memory_desc(impl_params.get_output_layout(2).clone_with_other_shape(initial_hidden_shape_mod)); + OPENVINO_ASSERT(input_md.get_format_kind() != dnnl::memory::format_kind::any, + "[GPU] The format kind of the input memory descriptor of onednn lstm_seq cannot be 'any'."); + OPENVINO_ASSERT(output_md.get_format_kind() != dnnl::memory::format_kind::any, + "[GPU] The format kind of the output memory descriptor of onednn lstm_seq cannot be 'any'."); + + auto eng = engine.get_onednn_engine(); + dnnl::rnn_direction lstm_desc_dir; + if (direction == ov::op::RecurrentSequenceDirection::FORWARD) { + lstm_desc_dir = dnnl::rnn_direction::unidirectional_left2right; + } else if 
(direction == ov::op::RecurrentSequenceDirection::REVERSE) { + lstm_desc_dir = dnnl::rnn_direction::unidirectional_right2left; + } else { + lstm_desc_dir = dnnl::rnn_direction::bidirectional_concat; + } + return std::make_shared( + eng, + dnnl::prop_kind::forward_inference, + lstm_desc_dir, + input_md, + initial_hidden, + initial_cell, + W_md, + R_md, + B_md, + output_md, + output1_md, + output2_md); + } + +public: + void save(BinaryOutputBuffer& ob) const override { +#ifdef ONEDNN_PRIMITIVE_SERIALIZATION + parent::save(ob); + + std::vector prim_cache; + prim_cache = _prim.get_cache_blob(); + ob << prim_cache; +#endif + } + + void load(BinaryInputBuffer& ib) override { +#ifdef ONEDNN_PRIMITIVE_SERIALIZATION + parent::load(ib); + + const kernel_impl_params* impl_params = reinterpret_cast(ib.getKernelImplParams()); + + auto input_md = onednn::layout_to_memory_desc(impl_params->get_input_layout(0)); + auto initial_hidden_md = onednn::layout_to_memory_desc(impl_params->get_input_layout(1)); + auto initial_cell_md = onednn::layout_to_memory_desc(impl_params->get_input_layout(2)); + auto W_md = onednn::layout_to_memory_desc(impl_params->get_input_layout(3)); + auto R_md = onednn::layout_to_memory_desc(impl_params->get_input_layout(4)); + auto B_md = onednn::layout_to_memory_desc(impl_params->get_input_layout(5)); + auto output_md = onednn::layout_to_memory_desc(impl_params->get_output_layout()); + auto output2_md = onednn::layout_to_memory_desc(impl_params->get_output_layout()); + auto prim_desc = std::make_shared( + ib.get_engine().get_onednn_engine(), + dnnl::prop_kind::forward_inference, + dnnl::rnn_direction::undef, + input_md, + initial_hidden_md, + initial_cell_md, + W_md, + R_md, + B_md, + output_md, + output_md, + output2_md); + _pd = *prim_desc; + + std::vector prim_cache; + ib >> prim_cache; + _prim = dnnl::primitive(_pd, prim_cache); +#endif + } + + static std::unique_ptr create(const lstm_seq_node& arg, const kernel_impl_params& impl_params) { + auto& engine = impl_params.prog->get_engine(); + auto& config = impl_params.prog->get_config(); + auto attr = impl_params.attrs_onednn; + auto direction = arg.direction(); + auto prim_desc = get_lstm_primitive_descriptor(impl_params, engine, *attr, direction); + return cldnn::make_unique(engine, config, attr, *prim_desc, get_weights_reorder(impl_params, *prim_desc)); + } +}; + +std::unique_ptr LSTMSeqImplementationManager::create_impl(const program_node& node, const kernel_impl_params& params) const { + assert(node.is_type()); + return onednn::lstm_seq_onednn::create(static_cast(node), params); +} + +} // namespace onednn +} // namespace cldnn + +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::onednn::lstm_seq_onednn) diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/lstm_seq_onednn.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/lstm_seq_onednn.hpp new file mode 100644 index 00000000000000..545ae780a7548b --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/lstm_seq_onednn.hpp @@ -0,0 +1,82 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "lstm_seq_inst.h" +#include "reshape_inst.h" +#include "intel_gpu/runtime/utils.hpp" +#include "impls/registry/implementation_manager.hpp" +#include "transformations/utils/utils.hpp" + +#include + + +namespace cldnn { +namespace onednn { + +struct LSTMSeqImplementationManager : public ImplementationManager { + OV_GPU_PRIMITIVE_IMPL("onednn::lstm_seq") + LSTMSeqImplementationManager(shape_types shape_type) : 
ImplementationManager(impl_types::onednn, shape_type) {} + std::unique_ptr create_impl(const program_node& node, const kernel_impl_params& params) const override; + + bool validate_impl(const program_node& node) const override { + assert(node.is_type()); + const auto& info = node.get_program().get_engine().get_device_info(); + if (info.arch == gpu_arch::unknown) + return false; + + auto in0_dt = node.get_input_layout(0).data_type; + auto in1_dt = node.get_input_layout(1).data_type; + auto in2_dt = node.get_input_layout(2).data_type; + auto in3_dt = node.get_input_layout(3).data_type; + auto in4_dt = node.get_input_layout(4).data_type; + auto in5_dt = node.get_input_layout(5).data_type; + auto out0_dt = node.get_output_layout(0).data_type; + auto out1_dt = node.get_output_layout(1).data_type; + auto out2_dt = node.get_output_layout(2).data_type; + bool cell_state_check = one_of(in2_dt, {data_types::f16, data_types::bf16, data_types::f32}) && + one_of(out2_dt, {data_types::f16, data_types::bf16, data_types::f32}); + bool f16_case = everyone_is(data_types::f16, in0_dt, in1_dt, in3_dt, in4_dt, out0_dt, out1_dt); + bool bf16_case = everyone_is(data_types::bf16, in0_dt, in1_dt, in3_dt, in4_dt, out0_dt, out1_dt); + bool f32_case = everyone_is(data_types::f32, in0_dt, in1_dt, in3_dt, in4_dt, in5_dt, out0_dt, out1_dt); + bool u8u8u8_case = one_of(out0_dt, {data_types::u8, data_types::f32}) && everyone_is(data_types::i8, in3_dt, in4_dt) && + everyone_is(data_types::u8, in0_dt, in1_dt, out1_dt) && everyone_is(data_types::f32, in2_dt, in5_dt, out2_dt); + bool f32u8f32_case = everyone_is(data_types::u8, in0_dt) && everyone_is(data_types::i8, in3_dt, in4_dt) && + one_of(out0_dt, {data_types::u8, data_types::f32}) && everyone_is(data_types::f32, in1_dt, in5_dt, out1_dt); + bool s8s8s8_case = everyone_is(data_types::i8, in0_dt, in1_dt, out0_dt, out1_dt) && one_of(out0_dt, {data_types::i8, data_types::f32}) && + everyone_is(data_types::f32, in2_dt, in5_dt, out2_dt); + bool f32s8f32_case = everyone_is(data_types::i8, in0_dt, in3_dt, in4_dt) && one_of(out0_dt, {data_types::i8, data_types::f32}) && + everyone_is(data_types::f32, in1_dt, in5_dt, out1_dt); + + if (!cell_state_check) + return false; + if (!f16_case && !f32_case && !bf16_case && !u8u8u8_case && !f32u8f32_case && !s8s8s8_case && !f32s8f32_case) + return false; + + return node.get_input_layout(0).format == cldnn::format::bfyx || node.get_input_layout(0).format == cldnn::format::fbyx + || node.get_input_layout(0).format == cldnn::format::ybfx; + } + + in_out_fmts_t query_formats(const program_node& node) const override { + assert(node.is_type()); + std::vector in_fmts(node.get_dependencies().size(), format::any); + std::vector out_fmts(node.get_outputs_count(), format::any); + + size_t out_rank = node.get_output_layout().get_rank(); + for (size_t idx = 0; idx < node.get_dependencies().size(); idx++) { + if (node.get_dependency(idx).is_constant()) + continue; + + auto target_format = format::get_default_format(out_rank); + if (idx == 0) + in_fmts[idx] = format::fbyx; + in_fmts[idx] = target_format; + } + out_fmts[0] = format::ybfx; + + return {in_fmts, out_fmts}; + } +}; + +} // namespace onednn +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp index a8aa43671ed048..75e087a25fb48f 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp @@ -270,6 +270,10 @@ dnnl::memory::desc 
layout_to_memory_desc(cldnn::layout l, dnnl::memory::format_t } else if (target_fmt == dnnl::memory::format_tag::ab) { dims.push_back(l.batch()); dims.push_back(l.get_tensor().count() / l.batch()); + } else if (target_fmt == dnnl::memory::format_tag::abc) { + dims.push_back(l.batch()); + dims.push_back(l.feature()); + dims.push_back(l.spatial(1)); } else if (target_fmt == dnnl::memory::format_tag::ba) { dims.push_back(l.feature()); dims.push_back(l.get_tensor().count() / l.feature()); diff --git a/src/plugins/intel_gpu/src/graph/impls/registry/lstm_cell_impls.cpp b/src/plugins/intel_gpu/src/graph/impls/registry/lstm_cell_impls.cpp new file mode 100644 index 00000000000000..09ba1f670b29d3 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/registry/lstm_cell_impls.cpp @@ -0,0 +1,27 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "primitive_inst.h" +#include "registry.hpp" +#include "intel_gpu/primitives/rnn.hpp" + +#if OV_GPU_WITH_OCL + #include "impls/ocl/lstm_cell.hpp" +#endif + +namespace ov { +namespace intel_gpu { + +using namespace cldnn; + +const std::vector>& Registry::get_implementations() { + static const std::vector> impls = { + OV_GPU_CREATE_INSTANCE_OCL(ocl::LSTMCellImplementationManager, shape_types::static_shape) + }; + + return impls; +} + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/graph/impls/registry/lstm_seq_impls.cpp b/src/plugins/intel_gpu/src/graph/impls/registry/lstm_seq_impls.cpp new file mode 100644 index 00000000000000..4b718bd1c74c72 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/registry/lstm_seq_impls.cpp @@ -0,0 +1,32 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "primitive_inst.h" +#include "registry.hpp" +#include "intel_gpu/primitives/rnn.hpp" + +#if OV_GPU_WITH_OCL + #include "impls/ocl/rnn_seq.hpp" +#endif + +#if OV_GPU_WITH_ONEDNN + #include "impls/onednn/lstm_seq_onednn.hpp" +#endif + +namespace ov { +namespace intel_gpu { + +using namespace cldnn; + +const std::vector>& Registry::get_implementations() { + static const std::vector> impls = { + OV_GPU_CREATE_INSTANCE_ONEDNN(onednn::LSTMSeqImplementationManager, shape_types::static_shape) + OV_GPU_CREATE_INSTANCE_OCL(ocl::RNNSeqImplementationManager, shape_types::static_shape) + }; + + return impls; +} + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/graph/impls/registry/registry.hpp b/src/plugins/intel_gpu/src/graph/impls/registry/registry.hpp index a6bb8ad6eebcc2..a837d614d0fb1d 100644 --- a/src/plugins/intel_gpu/src/graph/impls/registry/registry.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/registry/registry.hpp @@ -130,6 +130,8 @@ REGISTER_IMPLS(fully_connected); REGISTER_IMPLS(gather); REGISTER_IMPLS(gather_nd); REGISTER_IMPLS(gemm); +REGISTER_IMPLS(lstm_cell); +REGISTER_IMPLS(lstm_seq); REGISTER_IMPLS(pooling); REGISTER_IMPLS(reduce); REGISTER_IMPLS(reorder); @@ -171,7 +173,6 @@ REGISTER_DEFAULT_IMPLS(grid_sample, OCL_S); REGISTER_DEFAULT_IMPLS(group_normalization, OCL_S, OCL_D); REGISTER_DEFAULT_IMPLS(kv_cache, OCL_S, OCL_D); REGISTER_DEFAULT_IMPLS(lrn, OCL_S); -REGISTER_DEFAULT_IMPLS(lstm_elt, OCL_S); REGISTER_DEFAULT_IMPLS(multiclass_nms, OCL_S); REGISTER_DEFAULT_IMPLS(multinomial, OCL_S); REGISTER_DEFAULT_IMPLS(mutable_data, OCL_S); diff --git a/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h b/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h index 
52abc5f0cf8cb4..e7d5bdc8bdabdf 100644 --- a/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h +++ b/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h @@ -95,8 +95,7 @@ class layout_optimizer { b_fs_zyx_fsv32_network, b_fs_yx_fsv16_network, b_fs_zyx_fsv16_network, - bs_fs_yx_bsv16_fsv16_network, - use_onednn_impls + bs_fs_yx_bsv16_fsv16_network }; struct optimization_attributes { @@ -107,7 +106,7 @@ class layout_optimizer { int32_t b_fs_yx_fsv16_network = 0; int32_t b_fs_zyx_fsv16_network = 0; int32_t bs_fs_yx_bsv16_fsv16_network = 0; - int32_t use_onednn_impls = 0; + std::map onednn_impls = {}; }; private: @@ -190,6 +189,33 @@ class layout_optimizer { void set_optimization_attribute(optimization_attributes_type attribute, int32_t val); optimization_attributes get_optimization_attributes() { return _optimization_attributes; } + template + void enable_onednn_for() { + _optimization_attributes.onednn_impls[PT::type_id()] = true; + } + + template + void disable_onednn_for() { + _optimization_attributes.onednn_impls[PT::type_id()] = false; + } + void add_all_onednn_impls_optimization_attribute(); + bool has_all_enabled_onednn_impls_optimization_attribute(); + template + bool is_enabled_onednn_for() { + auto type_id = PT::type_id(); + auto it = _optimization_attributes.onednn_impls.find(type_id); + if (it == _optimization_attributes.onednn_impls.end()) { + return false; + } + + return it->second; + } + void set_value_onednn(primitive_type_id p_type, bool val); + bool contains_onednn_impls_optimization_attribute(const program_node*); + bool is_empty_onednn_impls_optimization_attribute(); + void clear_onednn_impls_optimization_attribute(); + std::map get_all_onednn_impls_optimization_attribute(); + void set_implementation_forcing(const ov::intel_gpu::ImplForcingMap& map); const std::map>& get_implementation_forcing() const; diff --git a/src/plugins/intel_gpu/src/graph/include/lstm_cell_inst.h b/src/plugins/intel_gpu/src/graph/include/lstm_cell_inst.h new file mode 100644 index 00000000000000..38c4232a500eb9 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/include/lstm_cell_inst.h @@ -0,0 +1,38 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include "intel_gpu/primitives/lstm_cell.hpp" +#include "primitive_inst.h" + +#include + +namespace cldnn { +template <> +struct typed_program_node : public typed_program_node_base { + using parent = typed_program_node_base; + +public: + using parent::parent; +}; + +using lstm_cell_node = typed_program_node; + +template <> +class typed_primitive_inst : public typed_primitive_inst_base { + using parent = typed_primitive_inst_base; + using parent::parent; + +public: + template + static std::vector calc_output_layouts(lstm_cell_node const& node, kernel_impl_params const& impl_param); + static layout calc_output_layout(lstm_cell_node const& node, kernel_impl_params const& impl_param); + static std::string to_string(lstm_cell_node const& node); + +public: + typed_primitive_inst(network& network, lstm_cell_node const& node); +}; + +using lstm_cell_inst = typed_primitive_inst; +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/include/lstm_elt_inst.h b/src/plugins/intel_gpu/src/graph/include/lstm_elt_inst.h deleted file mode 100644 index 1524598c6f3987..00000000000000 --- a/src/plugins/intel_gpu/src/graph/include/lstm_elt_inst.h +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once -#include 
"intel_gpu/primitives/lstm.hpp" -#include "primitive_inst.h" - -#include - -namespace cldnn { -template <> -struct typed_program_node : public typed_program_node_base { - using parent = typed_program_node_base; - -public: - using parent::parent; - - program_node& input() const { return get_dependency(0); } - program_node& cell() const { return get_dependency(1); } - bool cell_term() const { return !get_primitive()->cell.empty(); } - lstm_weights_order offset_order() const { return get_primitive()->offset_order; } - float clip() const { - float clip_val = get_primitive()->clip; - if (clip_val < 0) - throw std::range_error("Clip value < 0"); - return clip_val; - } - bool input_forget() const { return get_primitive()->input_forget; } - int32_t direction() const { return get_primitive()->direction; } -}; - -using lstm_elt_node = typed_program_node; - -template <> -class typed_primitive_inst : public typed_primitive_inst_base { - using parent = typed_primitive_inst_base; - using parent::parent; - -public: - template - static std::vector calc_output_layouts(lstm_elt_node const& node, kernel_impl_params const& impl_param); - static layout calc_output_layout(lstm_elt_node const& node, kernel_impl_params const& impl_param); - static std::string to_string(lstm_elt_node const& node); - -public: - typed_primitive_inst(network& network, lstm_elt_node const& node); - - memory::ptr cell_memory() const { return dep_memory_ptr(1); } - bool cell_term() const { return !get_typed_desc()->cell.empty(); } - lstm_weights_order offset_order() const { return get_typed_desc()->offset_order; } - float clip() const { - float clip_val = get_typed_desc()->clip; - if (clip_val < 0) - throw std::range_error("Clip value < 0"); - return clip_val; - } - bool input_forget() const { return get_typed_desc()->input_forget; } - uint32_t direction() const { return get_typed_desc()->direction; } -}; - -using lstm_elt_inst = typed_primitive_inst; - -} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/include/lstm_seq_inst.h b/src/plugins/intel_gpu/src/graph/include/lstm_seq_inst.h new file mode 100644 index 00000000000000..33ad7bebac2fbc --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/include/lstm_seq_inst.h @@ -0,0 +1,39 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include "intel_gpu/primitives/rnn.hpp" +#include "primitive_inst.h" + +#include + +namespace cldnn { +template <> +struct typed_program_node : public typed_program_node_base { + using parent = typed_program_node_base; + +public: + using parent::parent; + ov::op::RecurrentSequenceDirection direction() const { return get_primitive()->direction; } +}; + +using lstm_seq_node = typed_program_node; + +template <> +class typed_primitive_inst : public typed_primitive_inst_base { + using parent = typed_primitive_inst_base; + using parent::parent; + +public: + template + static std::vector calc_output_layouts(lstm_seq_node const& node, kernel_impl_params const& impl_param); + static layout calc_output_layout(lstm_seq_node const& node, kernel_impl_params const& impl_param); + static std::string to_string(lstm_seq_node const& node); + +public: + typed_primitive_inst(network& network, lstm_seq_node const& node); +}; + +using lstm_seq_inst = typed_primitive_inst; +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/include/pass_manager.h b/src/plugins/intel_gpu/src/graph/include/pass_manager.h index 281f95a892662c..9850c25a64ec5d 100644 --- 
a/src/plugins/intel_gpu/src/graph/include/pass_manager.h +++ b/src/plugins/intel_gpu/src/graph/include/pass_manager.h @@ -196,6 +196,11 @@ class post_optimize_weights : public base_pass { weights_bias_offset get_weights_bias_offset(const T& node); template void optimize_weights(T& node, program& p); + void select_implementation(program& p, program_node& node); + void add_lstm_weights_reorder(primitive_id input_id, std::shared_ptr reorder_params, program& p, cldnn::program_node&, \ + cldnn::program_node&, size_t); + void add_lstm_bias_reorder(primitive_id input_id, std::shared_ptr reorder_params, program& p, cldnn::program_node&, \ + cldnn::program_node&); reorder_factory& _rf; }; diff --git a/src/plugins/intel_gpu/src/graph/include/primitive_type_base.h b/src/plugins/intel_gpu/src/graph/include/primitive_type_base.h index eb3f313af00de6..81f614ba9fd43a 100644 --- a/src/plugins/intel_gpu/src/graph/include/primitive_type_base.h +++ b/src/plugins/intel_gpu/src/graph/include/primitive_type_base.h @@ -51,7 +51,7 @@ struct primitive_type_base : primitive_type { if ((node.get_forced_impl_type() & impl_type) != impl_type) continue; - if (impl_type == impl_types::onednn && !node.get_program().get_layout_optimizer().get_optimization_attributes().use_onednn_impls) + if (impl_type == impl_types::onednn && !node.get_program().get_layout_optimizer().contains_onednn_impls_optimization_attribute(&node)) continue; shape_types supported_shape_type = impl->get_shape_type(); @@ -168,7 +168,7 @@ struct primitive_type_base : primitive_type { return true; continue; } else { - if (impl_type == impl_types::onednn && !node.get_program().get_layout_optimizer().get_optimization_attributes().use_onednn_impls) + if (impl_type == impl_types::onednn && !node.get_program().get_layout_optimizer().contains_onednn_impls_optimization_attribute(&node)) continue; if (!impl->validate(node)) diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp index bb4d739b3a07c1..5262e8c4621e72 100644 --- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp +++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp @@ -27,7 +27,6 @@ #include "pooling_inst.h" #include "reduce_inst.h" #include "one_hot_inst.h" -#include "permute_inst.h" #include "quantize_inst.h" #include "mvn_inst.h" #include "depth_to_space_inst.h" @@ -37,7 +36,10 @@ #include "gather_inst.h" #include "broadcast_inst.h" #include "loop_inst.h" +#include "concatenation_inst.h" +#include "permute_inst.h" #include "dft_inst.h" +#include "lstm_seq_inst.h" #include "to_string_utils.h" #include #include @@ -114,7 +116,6 @@ bool layout_optimizer::is_format_supported(program_node& node, format::type fmt) node.get_input_layout(0).data_type != data_types::i8 && node.get_input_layout(0).data_type != data_types::u8) return false; - if (node.is_type()) return node.get_output_layout().format == fmt; @@ -132,7 +133,7 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next, auto next_output_layout = next.get_output_layout(); auto prev_dt = prev.get_output_layout().data_type; auto next_dt = next.get_output_layout().data_type; - auto use_onednn_impls = _optimization_attributes.use_onednn_impls; + auto use_onednn_impls = has_all_enabled_onednn_impls_optimization_attribute(); if (prev.is_dynamic() || next.is_dynamic()) return false; @@ -365,7 +366,7 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, reorder_node auto next = node.get_users().front(); auto dt_prev = 
prev.get_output_layout().data_type; auto dt_next = next->get_output_layout().data_type; - auto use_onednn_impls = _optimization_attributes.use_onednn_impls; + auto use_onednn_impls = contains_onednn_impls_optimization_attribute(&node) && contains_onednn_impls_optimization_attribute(&prev); if (prev.is_type()) return true; @@ -927,7 +928,7 @@ format layout_optimizer::get_expected_format(convolution_node const& node) { } bool onednn_valid_post_ops = get_post_ops_count(node) <= 32; - bool use_onednn_impls = _optimization_attributes.use_onednn_impls && input_layout.data_type != data_types::f32; + bool use_onednn_impls = contains_onednn_impls_optimization_attribute(&node) && input_layout.data_type != data_types::f32; // Use planar bfyx format for dynamic convolutions with explicit padding in clDNN if (node.is_dynamic() && output_layout.get_partial_shape().size() == 4 && node.use_explicit_padding() && !i8_u8_input && @@ -1038,7 +1039,7 @@ format layout_optimizer::get_expected_format(deconvolution_node const& node) { } auto expected_shape = output_layout.get_shape(); - bool use_onednn_impls = _optimization_attributes.use_onednn_impls; + bool use_onednn_impls = contains_onednn_impls_optimization_attribute(&node); auto available = node.get_primitive()->type->get_available_impl_types(node); @@ -1086,7 +1087,7 @@ format layout_optimizer::get_expected_format(quantize_node const& node) { return all_users_gemm; }; - auto use_onednn_impls = _optimization_attributes.use_onednn_impls; + auto use_onednn_impls = has_all_enabled_onednn_impls_optimization_attribute(); if (use_onednn_impls) { expected = format::any; @@ -1126,7 +1127,7 @@ format layout_optimizer::get_expected_format(quantize_node const& node) { bool layout_optimizer::is_primitive_implemented_for_onednn(program_node& node) { if (node.is_type() || node.is_type() || node.is_type() || node.is_type() || node.is_type() || - node.is_type() || node.is_type() || node.is_type()) { + node.is_type() || node.is_type() || node.is_type() || node.is_type()) { return true; } @@ -1219,7 +1220,7 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format format layout_optimizer::get_preferred_format(program_node& node) { format expected = format::any; auto output_layout = node.get_output_layout(); - bool use_onednn_impls = _optimization_attributes.use_onednn_impls; + bool use_onednn_impls = contains_onednn_impls_optimization_attribute(&node); bool allow_new_shape_infer = node.get_program().is_new_shape_infer(); @@ -1417,14 +1418,55 @@ void layout_optimizer::set_optimization_attribute(optimization_attributes_type a case optimization_attributes_type::bs_fs_yx_bsv16_fsv16_network: _optimization_attributes.bs_fs_yx_bsv16_fsv16_network = val; break; - case optimization_attributes_type::use_onednn_impls: - _optimization_attributes.use_onednn_impls = val; - break; default: throw std::out_of_range("unsupported layout optimization attribute"); } } +void layout_optimizer::add_all_onednn_impls_optimization_attribute() { + enable_onednn_for(); + enable_onednn_for(); + enable_onednn_for(); + enable_onednn_for(); + enable_onednn_for(); + enable_onednn_for(); + enable_onednn_for(); + enable_onednn_for(); + enable_onednn_for(); +} + +bool layout_optimizer::has_all_enabled_onednn_impls_optimization_attribute() { + return is_enabled_onednn_for() && is_enabled_onednn_for() && is_enabled_onednn_for() && + is_enabled_onednn_for() && is_enabled_onednn_for() && is_enabled_onednn_for() && + is_enabled_onednn_for() && is_enabled_onednn_for() && 
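+    // True only when every oneDNN-capable primitive type tracked in the attribute map is enabled;
+    // this replaces the former single use_onednn_impls flag with a per-primitive-type decision.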
is_enabled_onednn_for(); +} + +void layout_optimizer::set_value_onednn(primitive_type_id p_type, bool val) { + _optimization_attributes.onednn_impls[p_type] = val; +} + +bool layout_optimizer::contains_onednn_impls_optimization_attribute(const program_node* node) { + auto type_id = node->type(); + auto it = _optimization_attributes.onednn_impls.find(type_id); + if (it == _optimization_attributes.onednn_impls.end()) { + return false; + } + + return it->second; +} + +bool layout_optimizer::is_empty_onednn_impls_optimization_attribute() { + return _optimization_attributes.onednn_impls.empty(); +} + +void layout_optimizer::clear_onednn_impls_optimization_attribute() { + _optimization_attributes.onednn_impls.clear(); +} + +std::map layout_optimizer::get_all_onednn_impls_optimization_attribute() { + return _optimization_attributes.onednn_impls; +} + bool layout_optimizer::is_format_optimized(const convolution_node& node, const format& format, bool use_weak_restrictions) { auto input_layout = node.get_input_layout(); auto weights_layout = node.weights().get_output_layout(); diff --git a/src/plugins/intel_gpu/src/graph/lstm_cell.cpp b/src/plugins/intel_gpu/src/graph/lstm_cell.cpp new file mode 100644 index 00000000000000..0b300199fb05a3 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/lstm_cell.cpp @@ -0,0 +1,51 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "lstm_cell_inst.h" +#include "primitive_type_base.h" +#include "json_object.h" +#include + +namespace cldnn { +GPU_DEFINE_PRIMITIVE_TYPE_ID(lstm_cell) + +layout lstm_cell_inst::calc_output_layout(lstm_cell_node const& node, kernel_impl_params const& impl_param) { + const auto& input_layout = impl_param.get_input_layout(0); + const auto& input_pshape = input_layout.get_partial_shape(); + const auto& input_layout_hidden = impl_param.get_input_layout(1); + const auto& input_pshape_hidden = input_layout_hidden.get_partial_shape(); + const auto& lstm_batch_size = input_pshape[0]; + const auto& lstm_hidden_size = input_pshape_hidden[1]; + + return cldnn::layout{ov::PartialShape{lstm_batch_size, lstm_hidden_size}, input_layout.data_type, input_layout.format}; +} + +template +std::vector lstm_cell_inst::calc_output_layouts(lstm_cell_node const& node, kernel_impl_params const& impl_param) { + const auto& input_layout = impl_param.get_input_layout(0); + const auto& input_pshape = input_layout.get_partial_shape(); + const auto& input_layout_hidden = impl_param.get_input_layout(1); + const auto& input_pshape_hidden = input_layout_hidden.get_partial_shape(); + const auto& lstm_batch_size = input_pshape[0]; + const auto& lstm_hidden_size = input_pshape_hidden[1]; + + auto out_layout = cldnn::layout{ShapeType{lstm_batch_size, lstm_hidden_size}, input_layout.data_type, input_layout.format}; + return {out_layout, out_layout}; +} + +template std::vector lstm_cell_inst::calc_output_layouts(lstm_cell_node const& node, const kernel_impl_params& impl_param); + +std::string lstm_cell_inst::to_string(lstm_cell_node const& node) { + auto node_info = node.desc_to_json(); + + std::stringstream primitive_description; + + json_composite lstm_cell_info; + node_info->add("lstm cell info", lstm_cell_info); + node_info->dump(primitive_description); + + return primitive_description.str(); +} + +lstm_cell_inst::typed_primitive_inst(network& network, lstm_cell_node const& node) : parent(network, node) {} +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/lstm_elt.cpp 
b/src/plugins/intel_gpu/src/graph/lstm_elt.cpp deleted file mode 100644 index 098e89aa45003e..00000000000000 --- a/src/plugins/intel_gpu/src/graph/lstm_elt.cpp +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// -#include "lstm_elt_inst.h" -#include "primitive_type_base.h" -#include "intel_gpu/runtime/error_handler.hpp" -#include "json_object.h" -#include - -namespace cldnn { -GPU_DEFINE_PRIMITIVE_TYPE_ID(lstm_elt) - -layout lstm_elt_inst::calc_output_layout(lstm_elt_node const& node, kernel_impl_params const& impl_param) { - assert(static_cast(impl_param.desc->output_data_types[0]) == false && - "Output data type forcing is not supported for lstm_elt_node!"); - auto input_layout = impl_param.get_input_layout(); - - // tempGEMM{bfyx} = [b: batch, f: direction, x: 1, y: 4 * hidden_size ] input - // cell{bfyx} = [b: batch, f: direction, x: 1, y: hidden_size ] optional - // output{bfyx} = [b: batch, f: 2, x: direction, y: hidden_size ] output - // The output of the lstm_elt node is the concatenation of the intermediate [hidden, cell] tensors. - // A crop/split node is needed to extract each individual tensors - auto result = - layout(input_layout.data_type, - input_layout.format, - tensor(input_layout.batch(), 2, input_layout.spatial(0) / 4, input_layout.feature())); - return result; -} - -template -std::vector lstm_elt_inst::calc_output_layouts(lstm_elt_node const& node, kernel_impl_params const& impl_param) { - std::vector output_layouts; - - // input partial shape [batch, input_size (= hidden_size * 4)] - auto input_layout = impl_param.get_input_layout(); - auto input_pshape = input_layout.get_partial_shape(); - OPENVINO_ASSERT(static_cast(impl_param.desc->output_data_types[0]) == false, "Output data type forcing is not supported for lstm_elt_node!"); - OPENVINO_ASSERT(input_pshape.rank().get_length() == 2, "input_layout rank should be 2 on dynamic shape."); - - int lstm_input_size, lstm_batch_size, lstm_hidden_size; - if (input_pshape[input_pshape.size() - 1].is_static()) { - lstm_input_size = input_pshape[input_pshape.size() - 1].get_length(); - lstm_hidden_size = lstm_input_size / 4; - } else { - lstm_input_size = -1; - lstm_hidden_size = -1; - } - - if (input_pshape[input_pshape.size() - 2].is_static()) { - lstm_batch_size = input_pshape[input_pshape.size() - 2].get_length(); - } else { - lstm_batch_size = -1; - } - - return {cldnn::layout{ov::PartialShape{lstm_batch_size, 2, 1, lstm_hidden_size}, input_layout.data_type, input_layout.format}}; -} - -template std::vector lstm_elt_inst::calc_output_layouts(lstm_elt_node const& node, const kernel_impl_params& impl_param); - -std::string lstm_elt_inst::to_string(lstm_elt_node const& node) { - auto desc = node.get_primitive(); - auto node_info = node.desc_to_json(); - auto cell_id = desc->cell; - - std::stringstream primitive_description; - - json_composite lstm_elt_info; - lstm_elt_info.add("cell id", cell_id); - node_info->add("lstm elt info", lstm_elt_info); - node_info->dump(primitive_description); - - return primitive_description.str(); -} - -lstm_elt_inst::typed_primitive_inst(network& network, lstm_elt_node const& node) : parent(network, node) { - auto input_size = node.get_input_layout(); - CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), - "input format", - input_size.format.value, - "expected format", - format::bfyx, - format::fyxb); -} -} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/lstm_seq.cpp b/src/plugins/intel_gpu/src/graph/lstm_seq.cpp new file 
mode 100644 index 00000000000000..f06f7a644ad12a --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/lstm_seq.cpp @@ -0,0 +1,69 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "lstm_seq_inst.h" +#include "primitive_type_base.h" +#include "json_object.h" +#include + +namespace cldnn { +GPU_DEFINE_PRIMITIVE_TYPE_ID(lstm_seq) + +layout lstm_seq_inst::calc_output_layout(lstm_seq_node const& node, kernel_impl_params const& impl_param) { + const auto& desc = impl_param.typed_desc(); + const auto& input_layout = impl_param.get_input_layout(0); + const auto& input_pshape = input_layout.get_partial_shape(); + const auto& input_layout_hidden = impl_param.get_input_layout(1); + const auto& input_pshape_hidden = input_layout_hidden.get_partial_shape(); + const auto& lstm_batch_size = input_pshape[0]; + const auto& lstm_seq_length = input_pshape[1]; + const auto& lstm_hidden_size = input_pshape_hidden[2]; + + auto first_out_fmt = cldnn::format::bfyx; + if (node.get_preferred_impl_type() == impl_types::onednn && node.get_preferred_output_fmt() != format::any) { + first_out_fmt = node.get_preferred_output_fmt(); + } + + return cldnn::layout{ov::PartialShape{lstm_batch_size, desc->num_directions(), lstm_seq_length, lstm_hidden_size}, input_layout.data_type, first_out_fmt}; +} + +template +std::vector lstm_seq_inst::calc_output_layouts(lstm_seq_node const& node, kernel_impl_params const& impl_param) { + const auto& desc = impl_param.typed_desc(); + const auto& input_layout = impl_param.get_input_layout(0); + const auto& input_pshape = input_layout.get_partial_shape(); + const auto& input_layout_hidden = impl_param.get_input_layout(1); + const auto& input_pshape_hidden = input_layout_hidden.get_partial_shape(); + const auto& lstm_batch_size = input_pshape[0]; + const auto& lstm_seq_length = input_pshape[1]; + const auto& lstm_hidden_size = input_pshape_hidden[2]; + + auto first_out_fmt = cldnn::format::bfyx; + auto second_out_fmt = input_layout.format; + auto third_out_fmt = input_layout.format; + if (node.get_preferred_impl_type() == impl_types::onednn && node.get_preferred_output_fmt() != format::any) { + first_out_fmt = node.get_preferred_output_fmt(); + } + auto num_directions = desc->num_directions(); + + return {cldnn::layout{ShapeType{lstm_batch_size, num_directions, lstm_seq_length, lstm_hidden_size}, input_layout.data_type, first_out_fmt}, \ + cldnn::layout{ShapeType{lstm_batch_size, num_directions, lstm_hidden_size}, input_layout.data_type, second_out_fmt}, \ + cldnn::layout{ShapeType{lstm_batch_size, num_directions, lstm_hidden_size}, input_layout.data_type, third_out_fmt}}; +} + +template std::vector lstm_seq_inst::calc_output_layouts(lstm_seq_node const& node, const kernel_impl_params& impl_param); + +std::string lstm_seq_inst::to_string(lstm_seq_node const& node) { + auto node_info = node.desc_to_json(); + + std::stringstream primitive_description; + + json_composite lstm_seq_info; + node_info->add("lstm seq info", lstm_seq_info); + node_info->dump(primitive_description); + + return primitive_description.str(); +} + +lstm_seq_inst::typed_primitive_inst(network& network, lstm_seq_node const& node) : parent(network, node) {} +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index 2bfaac84134387..bdffb9c4980722 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -70,6 +70,9 @@ #include "unique_inst.hpp" #include 
"condition_inst.h" #include "to_string_utils.h" +#include "intel_gpu/graph/serialization/map_serializer.hpp" + +#include "intel_gpu/primitives/rnn.hpp" // TODO: Remove once we have interface for kernels cache #include "impls/ocl/kernels_cache.hpp" @@ -151,15 +154,14 @@ program::program(engine& engine_ref, is_internal(is_internal), _is_body_program(is_body_program), _compilation_context(compilation_context) { - _config.apply_user_properties(_engine.get_device_info()); init_primitives(); GPU_DEBUG_INFO << "Program config\n" << _config.to_string(); init_program(); prepare_nodes(topology); program_node::reset_unique_id(); - if (no_optimizations) { init_graph(); + _config.apply_user_properties(_engine.get_device_info()); } else { build_program(is_internal); if (_is_body_program) { @@ -494,6 +496,7 @@ void program::set_options() { void program::build_program(bool is_internal) { init_graph(); + _config.apply_user_properties(_engine.get_device_info()); { pre_optimize_graph(is_internal); } run_graph_compilation(); { post_optimize_graph(is_internal); } @@ -523,6 +526,9 @@ void program::init_graph() { for (auto& node : processing_order) { if (!node->is_type()) node->get_output_layouts(); + if (node->is_type()) { + _config.set_property(ov::intel_gpu::use_onednn(true)); + } } // Perform initial shape_of subgraphs markup apply_opt_pass(); @@ -1631,11 +1637,17 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) { #ifdef ENABLE_ONEDNN_FOR_GPU bool enable_onednn_for_tests = get_config().get_property(ov::intel_gpu::optimize_data) || is_internal_program(); auto& engine = get_engine(); - if (engine.get_device_info().supports_immad && - engine.get_device_info().vendor_id == INTEL_VENDOR_ID && + if (engine.get_device_info().vendor_id == INTEL_VENDOR_ID && get_config().get_property(ov::intel_gpu::queue_type) == QueueTypes::in_order && - enable_onednn_for_tests) - lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::use_onednn_impls, 1); + enable_onednn_for_tests) { + if (engine.get_device_info().supports_immad) { + lo.add_all_onednn_impls_optimization_attribute(); + } else { + if (get_config().get_property(ov::intel_gpu::use_onednn)) { + lo.enable_onednn_for(); + } + } + } #endif } @@ -1779,7 +1791,13 @@ void program::save(cldnn::BinaryOutputBuffer& ob) const { ob << _is_body_program; ob << _can_be_optimized; - ob << get_layout_optimizer().get_optimization_attributes().use_onednn_impls; + auto onednn_impls_size = get_layout_optimizer().get_all_onednn_impls_optimization_attribute().size(); + ob << onednn_impls_size; + for (const auto& onednn_impl : get_layout_optimizer().get_all_onednn_impls_optimization_attribute()) { + ob << prim_map_storage::instance().get_type_string(onednn_impl.first); + ob << onednn_impl.second; + } + processing_order.save(ob); { @@ -1903,9 +1921,18 @@ void program::load(cldnn::BinaryInputBuffer& ib) { ib >> _is_body_program; ib >> _can_be_optimized; - int32_t use_onednn_attr = 0; - ib >> use_onednn_attr; - get_layout_optimizer().set_optimization_attribute(layout_optimizer::optimization_attributes_type::use_onednn_impls, use_onednn_attr); + + size_t num_of_onednn_impls; + ib >> num_of_onednn_impls; + for (size_t num = 0; num < num_of_onednn_impls; num++) { + primitive_id p_id{}; + bool enabled; + ib >> p_id; + ib >> enabled; + auto ptype_id = prim_map_storage::instance().get_type_id(p_id); + get_layout_optimizer().set_value_onednn(ptype_id, enabled); + } + _loaded_from_cache = true; processing_order.load(ib, *this); diff --git 
a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/lstm_cell_and_seq_bfyx.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/lstm_cell_and_seq_bfyx.cl new file mode 100644 index 00000000000000..f2cf2ca985e855 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/lstm_cell_and_seq_bfyx.cl @@ -0,0 +1,215 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/batch_headers/fetch_data.cl" +#include "include/batch_headers/common.cl" + +#define INPUT0_TYPE_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, VEC_SIZE) +#define INPUT1_TYPE_VEC MAKE_VECTOR_TYPE(INPUT1_TYPE, VEC_SIZE) +#define INPUT3_TYPE_VEC MAKE_VECTOR_TYPE(INPUT3_TYPE, VEC_SIZE) +#define INPUT4_TYPE_VEC MAKE_VECTOR_TYPE(INPUT4_TYPE, VEC_SIZE) +#define OUTPUT_TYPE_VEC MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE) +#define READ_VEC(offset, ptr) CAT(vload, VEC_SIZE)(offset, ptr) + +#ifdef SEQUENCE +#define GET_IN0_IDX(b, f, y) INPUT0_GET_INDEX(b, f, y, 0) + #if DIRECTION == 2 + #define GET_IN1_IDX(b, f, y) INPUT1_GET_INDEX(b, f, y, 0) + #define GET_IN2_IDX(b, f, y) INPUT2_GET_INDEX(b, f, y, 0) + #define GET_IN3_IDX(b, f, y) INPUT3_GET_INDEX(b, f, y, 0) + #define GET_IN4_IDX(b, f, y) INPUT4_GET_INDEX(b, f, y, 0) + #define GET_IN5_IDX(b, f) INPUT5_GET_INDEX(b, f, 0, 0) + #else + #define GET_IN1_IDX(b, f, y) INPUT1_GET_INDEX(b, 0, y, 0) + #define GET_IN2_IDX(b, f, y) INPUT2_GET_INDEX(b, 0, y, 0) + #define GET_IN3_IDX(b, f, y) INPUT3_GET_INDEX(0, f, y, 0) + #define GET_IN4_IDX(b, f, y) INPUT4_GET_INDEX(0, f, y, 0) + #define GET_IN5_IDX(b, f) INPUT5_GET_INDEX(0, f, 0, 0) + #endif +#else +#define GET_IN0_IDX(b, f, y) INPUT0_GET_INDEX(b, y, 0, 0) +#define GET_IN1_IDX(b, f, y) INPUT1_GET_INDEX(b, y, 0, 0) +#define GET_IN2_IDX(b, f, y) INPUT2_GET_INDEX(b, y, 0, 0) +#define GET_IN3_IDX(b, f, y) INPUT3_GET_INDEX(f, y, 0, 0) +#define GET_IN4_IDX(b, f, y) INPUT4_GET_INDEX(f, y, 0, 0) +#define GET_IN5_IDX(b, f) INPUT5_GET_INDEX(f, 0, 0, 0) +#endif + +KERNEL(lstm_cell_and_seq_bfyx)( + const __global INPUT0_TYPE* x, + const __global INPUT1_TYPE* initial_hidden_state, + const __global INPUT2_TYPE* initial_cell_state, + const __global INPUT3_TYPE* W, + const __global INPUT4_TYPE* R, + const __global INPUT5_TYPE* B, +#ifdef SEQUENCE + const __global INPUT6_TYPE* sequence_lengths, + __global OUTPUT_TYPE* hidden_history, + __global OUTPUT1_TYPE* hidden_state, + __global OUTPUT2_TYPE* cell_state +#else + __global OUTPUT_TYPE* hidden_state, + __global OUTPUT1_TYPE* cell_state +#endif +) +{ + const uint b = get_global_id(1); + const uint local_idx = get_local_id(0); + const uint weight_offsets[4] = {GEMM_OFFSET_F, GEMM_OFFSET_I, GEMM_OFFSET_Z, GEMM_OFFSET_O}; + #ifdef SEQUENCE + const uint real_seq_length = sequence_lengths[INPUT6_GET_INDEX(b, 0, 0, 0)]; + #else + const uint real_seq_length = 1; + #endif + #if DIRECTION == 2 + unroll_for(uint dir=0;dir0){ + barrier(CLK_LOCAL_MEM_FENCE); + } + #endif + unroll_for(uint l=0;l= HIDDEN_SIZE) { + continue; + } + ACCUMULATOR_TYPE gate_output[GATE_NUM]; + unroll_for(uint k=0;k0){ + barrier(CLK_LOCAL_MEM_FENCE); + } + #endif + unroll_for(uint l=0;l= HIDDEN_SIZE) { + continue; + } + ACCUMULATOR_TYPE gate_output[GATE_NUM]; + unroll_for(uint k=0;k