diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/lstm.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/lstm.hpp deleted file mode 100644 index 76f59e3448e694..00000000000000 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/lstm.hpp +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once -#include "primitive.hpp" -#include "activation.hpp" -#include -#include -#include "intel_gpu/graph/serialization/activation_serializer.hpp" - -namespace cldnn { - -/// @brief Weights orders -/// @details Specifies the order in which the weights are concatenated. -/// e.g. [i, o, f, z] : [input, output, forget, block] -/// ONNX order: iofz -/// Caffe order: ifoz -/// pyTorch order: izof -/// OV order: fizo -enum class lstm_weights_order { - iofz, - ifoz, - izof, - fizo -}; - -struct lstm_elt : public primitive_base { - CLDNN_DECLARE_PRIMITIVE(lstm_elt) - - lstm_elt() : primitive_base("", {}), clip(0), input_forget(0), offset_order(lstm_weights_order::iofz), direction(0) {} - - using vec_activation = std::vector; - using vec_activation_param = std::vector; - - /// @brief Constructs lstm layer. - /// @param id This primitive id. - /// @param input input primitive id. - /// @param input cell Primitive id containing cell data. Provide empty string if using lstm without cell values. - /// @param clip Clip threshold. Provide 0 if using lstm without activations clip threshold. - /// @param input_forget Provide 0 if using lstm without coupled input-forget gates. - /// @param offset_order. Order of the concatenated weights, recurrent, and bias. ONNX default is iofz [input, output, forget, block]. - /// @param direction default = 0, bidirectional = 1. - lstm_elt(const primitive_id& id, - const input_info& input, - const primitive_id& cell = "", - const float clip = 0, - const bool input_forget = 0, - const std::vector activations = {activation_func::logistic, - activation_func::hyperbolic_tan, - activation_func::hyperbolic_tan}, - const std::vector activation_params = {}, - const lstm_weights_order offset_order = lstm_weights_order::iofz, - const uint32_t direction = 0) - : primitive_base(id, {input}), - cell(cell), - clip(clip), - input_forget(input_forget), - activations(activations), - activation_params(activation_params), - offset_order(offset_order), - direction(direction) {} - - /// @brief Primitive id containing the initial value of the cell state data. - primitive_id cell; - /// @brief Cell clip threshold T. It is applied to the input of activations [-T, T]. No clip is applied if it is not specified. - float clip; - /// @brief Couple the input and forget gates if input_forget is 1. Default is 0. - bool input_forget; - /// @brief A list of 3 activation functions for the input, output, forget, cell, and hidden. - std::vector activations; - /// @brief Optional scaling values used by some activation functions. The values are consumed in the order of activation functions. - std::vector activation_params; - /// @brief Weights, recurrent weights, and biases order. [iofz] : ONNX, [ifoz] : Caffe - lstm_weights_order offset_order; - /// @brief direction default = 0, bidirectional = 1. 
-    uint32_t direction;
-
-    size_t hash() const override {
-        size_t seed = primitive::hash();
-        seed = hash_combine(seed, clip);
-        seed = hash_combine(seed, input_forget);
-        seed = hash_range(seed, activations.begin(), activations.end());
-        for (auto& act_param : activation_params) {
-            seed = hash_combine(seed, act_param.a);
-            seed = hash_combine(seed, act_param.b);
-        }
-        seed = hash_combine(seed, offset_order);
-        seed = hash_combine(seed, direction);
-        seed = hash_combine(seed, cell.empty());
-        return seed;
-    }
-
-    bool operator==(const primitive& rhs) const override {
-        if (!compare_common_params(rhs))
-            return false;
-
-        auto rhs_casted = downcast<const lstm_elt&>(rhs);
-
-        bool act_params_eq = activation_params.size() == rhs_casted.activation_params.size();
-        for (size_t i = 0; i < activation_params.size(); ++i) {
-            act_params_eq &= activation_params[i].a == rhs_casted.activation_params[i].a &&
-                             activation_params[i].b == rhs_casted.activation_params[i].b;
-        }
-
-        #define cmp_fields(name) name == rhs_casted.name
-        return act_params_eq &&
-               cmp_fields(clip) &&
-               cmp_fields(input_forget) &&
-               cmp_fields(activations) &&
-               cmp_fields(offset_order) &&
-               cmp_fields(direction) &&
-               cmp_fields(cell.empty());
-        #undef cmp_fields
-    }
-
-    void save(BinaryOutputBuffer& ob) const override {
-        primitive_base::save(ob);
-        ob << cell;
-        ob << clip;
-        ob << input_forget;
-        ob << activations;
-        ob << activation_params;
-        ob << make_data(&offset_order, sizeof(lstm_weights_order));
-        ob << direction;
-    }
-
-    void load(BinaryInputBuffer& ib) override {
-        primitive_base::load(ib);
-        ib >> cell;
-        ib >> clip;
-        ib >> input_forget;
-        ib >> activations;
-        ib >> activation_params;
-        ib >> make_data(&offset_order, sizeof(lstm_weights_order));
-        ib >> direction;
-    }
-
-protected:
-    std::vector<input_info> get_dependencies() const override {
-        std::vector<input_info> ret;
-        if (!cell.empty())
-            ret.push_back(cell);
-        return ret;
-    }
-};
-
-} // namespace cldnn
diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/lstm_cell.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/lstm_cell.hpp
new file mode 100644
index 00000000000000..c53840d4bfd0c4
--- /dev/null
+++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/lstm_cell.hpp
@@ -0,0 +1,24 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include "primitive.hpp"
+#include "activation.hpp"
+#include <vector>
+#include <algorithm>
+#include "intel_gpu/graph/serialization/activation_serializer.hpp"
+#include "rnn.hpp"
+
+
+namespace cldnn {
+
+struct lstm_cell : public RNNParams<lstm_cell> {
+    CLDNN_DECLARE_PRIMITIVE(lstm_cell)
+    using vec_activation = std::vector<activation_func>;
+    using vec_activation_param = std::vector<activation_additional_params>;
+    using RNNParams::RNNParams;
+    lstm_cell(const lstm_cell&) = default;
+    lstm_cell() : RNNParams() {}
+};
+} // namespace cldnn
diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/rnn.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/rnn.hpp
new file mode 100644
index 00000000000000..ff167267c1aa24
--- /dev/null
+++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/rnn.hpp
@@ -0,0 +1,189 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include "primitive.hpp"
+#include "activation.hpp"
+#include <vector>
+#include <algorithm>
+#include <functional>
+#include "intel_gpu/graph/serialization/activation_serializer.hpp"
+
+namespace cldnn {
+
+/// @brief Weights orders
+/// @details Specifies the order in which the weights are concatenated.
+/// e.g. [i, o, f, z] : [input, output, forget, block]
+/// ONNX order: iofz
+/// Caffe order: ifoz
+/// pyTorch order: izof
+/// OV order: fizo
+enum class lstm_weights_order {
+    iofz,
+    ifoz,
+    izof,
+    fizo
+};
+
+template <typename PType>
+struct RNNParams : public primitive_base<PType> {
+    RNNParams() : primitive_base<PType>("", {}) {}
+    RNNParams(const RNNParams&) = default;
+    RNNParams(const primitive_id& id,
+              const input_info& x,
+              const input_info& initial_hidden_state,
+              const input_info& initial_cell_state,
+              const input_info& W,
+              const input_info& R,
+              const input_info& B,
+              const input_info& seq_lenghts,
+              const float clip = 0,
+              bool input_forget = false,
+              const std::vector<activation_func>& activations = {activation_func::logistic,
+                                                                 activation_func::hyperbolic_tan,
+                                                                 activation_func::hyperbolic_tan},
+              const std::vector<activation_additional_params>& activation_params = {},
+              const lstm_weights_order& offset_order = lstm_weights_order::iofz,
+              const ov::op::RecurrentSequenceDirection direction = ov::op::RecurrentSequenceDirection::FORWARD,
+              const padding& output_padding = padding(),
+              const int num_outputs = 1)
+        : primitive_base<PType>(id, {x}, num_outputs, {optional_data_type()}, {output_padding}),
+          x(x),
+          initial_hidden_state(initial_hidden_state),
+          initial_cell_state(initial_cell_state),
+          W(W),
+          R(R),
+          B(B),
+          seq_lenghts(seq_lenghts),
+          clip(clip),
+          input_forget(input_forget),
+          activations(activations),
+          activation_params(activation_params),
+          offset_order(offset_order),
+          direction(direction) {
+        std::vector<primitive_id> pids{initial_hidden_state.pid, initial_cell_state.pid, W.pid, R.pid, B.pid, seq_lenghts.pid};
+        for (auto pid : pids) {
+            if (!pid.empty()) {
+                primitive_base<PType>::input.push_back(pid);
+            }
+        }
+    }
+
+    input_info x;
+    input_info initial_hidden_state;
+    input_info initial_cell_state;
+    input_info W;
+    input_info R;
+    input_info B;
+    input_info seq_lenghts;
+    /// @brief Cell clip threshold T. It is applied to the input of activations [-T, T]. No clip is applied if it is not specified.
+    float clip;
+    bool input_forget;
+    /// @brief A list of 3 activation functions for the input, output, forget, cell, and hidden.
+    std::vector<activation_func> activations;
+    /// @brief Optional scaling values used by some activation functions. The values are consumed in the order of activation functions.
+    std::vector<activation_additional_params> activation_params;
+    /// @brief Weights, recurrent weights, and biases order. [iofz] : ONNX, [ifoz] : Caffe
+    lstm_weights_order offset_order;
+    /// @brief direction of LSTMSequence - only FORWARD or REVERSE, currently BIDIRECTIONAL not supported
+    ov::op::RecurrentSequenceDirection direction;
+
+    int num_directions() const {
+        return direction == ov::op::RecurrentSequenceDirection::BIDIRECTIONAL ? 2 : 1;
+    }
+
+    size_t hash() const override {
+        size_t seed = primitive::hash();
+        seed = hash_combine(seed, x.pid);
+        seed = hash_combine(seed, initial_hidden_state.pid);
+        seed = hash_combine(seed, initial_cell_state.pid);
+        seed = hash_combine(seed, seq_lenghts.pid);
+        seed = hash_combine(seed, W.pid);
+        seed = hash_combine(seed, R.pid);
+        seed = hash_combine(seed, B.pid);
+        seed = hash_combine(seed, clip);
+        seed = hash_range(seed, activations.begin(), activations.end());
+        for (auto& act_param : activation_params) {
+            seed = hash_combine(seed, act_param.a);
+            seed = hash_combine(seed, act_param.b);
+        }
+        seed = hash_combine(seed, offset_order);
+        seed = hash_combine(seed, direction);
+        return seed;
+    }
+
+    bool operator==(const primitive& rhs) const override {
+        if (!primitive::compare_common_params(rhs))
+            return false;
+
+        auto rhs_casted = downcast<const PType&>(rhs);
+        bool act_params_eq = activation_params.size() == rhs_casted.activation_params.size();
+        for (size_t i = 0; i < activation_params.size(); ++i) {
+            act_params_eq &= activation_params[i].a == rhs_casted.activation_params[i].a &&
+                             activation_params[i].b == rhs_casted.activation_params[i].b;
+        }
+
+        #define cmp_fields(name) name == rhs_casted.name
+        return act_params_eq &&
+               cmp_fields(x) &&
+               cmp_fields(initial_hidden_state) &&
+               cmp_fields(initial_cell_state) &&
+               cmp_fields(seq_lenghts) &&
+               cmp_fields(W) &&
+               cmp_fields(R) &&
+               cmp_fields(B) &&
+               cmp_fields(clip) &&
+               cmp_fields(activations) &&
+               cmp_fields(offset_order) &&
+               cmp_fields(direction);
+        #undef cmp_fields
+    }
+
+    void save(BinaryOutputBuffer& ob) const override {
+        primitive_base<PType>::save(ob);
+        ob << x;
+        ob << initial_hidden_state;
+        ob << initial_cell_state;
+        ob << W;
+        ob << R;
+        ob << B;
+        ob << seq_lenghts;
+        ob << clip;
+        ob << activations;
+        ob << activation_params;
+        ob << make_data(&offset_order, sizeof(lstm_weights_order));
+        ob << make_data(&direction, sizeof(ov::op::RecurrentSequenceDirection));
+    }
+
+    void load(BinaryInputBuffer& ib) override{
+        primitive_base<PType>::load(ib);
+        ib >> x;
+        ib >> initial_hidden_state;
+        ib >> initial_cell_state;
+        ib >> W;
+        ib >> R;
+        ib >> B;
+        ib >> seq_lenghts;
+        ib >> clip;
+        ib >> activations;
+        ib >> activation_params;
+        ib >> make_data(&offset_order, sizeof(lstm_weights_order));
+        ib >> make_data(&direction, sizeof(ov::op::RecurrentSequenceDirection));
+    }
+};
+
+struct lstm_seq : public RNNParams<lstm_seq> {
+    CLDNN_DECLARE_PRIMITIVE(lstm_seq)
+    using vec_activation = std::vector<activation_func>;
+    using vec_activation_param = std::vector<activation_additional_params>;
+    using RNNParams::RNNParams;
+    lstm_seq() : RNNParams() {
+        weights = W.pid;
+        input = x.pid;
+    }
+    lstm_seq(const lstm_seq&) = default;
+    primitive_id input;
+    primitive_id weights;
+};
+} //namespace cldnn
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp
index febcabd57efba0..199261772dcf2e 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp
@@ -56,6 +56,7 @@ static constexpr Property max_dynamic_batch{"DYN
 static constexpr Property<bool, PropertyMutability::RW> nv12_two_inputs{"GPU_NV12_TWO_INPUTS"};
 static constexpr Property<float, PropertyMutability::RW> buffers_preallocation_ratio{"GPU_BUFFERS_PREALLOCATION_RATIO"};
 static constexpr Property<size_t, PropertyMutability::RW> max_kernels_per_batch{"GPU_MAX_KERNELS_PER_BATCH"};
+static constexpr Property<bool, PropertyMutability::RW> use_onednn{"USE_ONEDNN"};
 
 } // namespace intel_gpu
 } // namespace ov
diff --git
a/src/plugins/intel_gpu/src/graph/concatenation.cpp b/src/plugins/intel_gpu/src/graph/concatenation.cpp index b493bb217b1c32..87dad139c10404 100644 --- a/src/plugins/intel_gpu/src/graph/concatenation.cpp +++ b/src/plugins/intel_gpu/src/graph/concatenation.cpp @@ -120,6 +120,9 @@ concatenation_inst::typed_primitive_inst(network& network, concatenation_node co if (dim == node.get_primitive()->axis) { concat_count += input_mem_size[dim]; } else { + if (i.first->get_outputs_count() > 1 && i.first->get_user_index(node) > 0) { + continue; + } CLDNN_ERROR_NOT_EQUAL(node.id(), "Input size dim: " + std::to_string(dim), input_size[dim], diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp index 9805b45ad005ed..407ed3b87fd4e8 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp @@ -9,7 +9,10 @@ #include "convolution_inst.h" #include "deconvolution_inst.h" #include "fully_connected_inst.h" +#include "lstm_seq_inst.h" #include "intel_gpu/runtime/format.hpp" +#include "permute_inst.h" +#include "crop_inst.h" #ifdef ENABLE_ONEDNN_FOR_GPU #include "graph/impls/onednn/utils.hpp" #endif // ENABLE_ONEDNN_FOR_GPU @@ -18,10 +21,17 @@ namespace cldnn { post_optimize_weights::post_optimize_weights(reorder_factory& rf_ref) : base_pass("post_optimize_weights"), _rf(rf_ref) {} -template post_optimize_weights::weights_bias_offset post_optimize_weights::get_weights_bias_offset(const T& node) { +template +post_optimize_weights::weights_bias_offset post_optimize_weights::get_weights_bias_offset(const T& node) { return weights_bias_offset(node.get_primitive()->input.size(), program_helpers::wrap_if_single(node.get_primitive()->weights).size()); } +template <> +post_optimize_weights::weights_bias_offset post_optimize_weights::get_weights_bias_offset(const lstm_seq_node& node) { + const int W_idx = 3; + return weights_bias_offset(W_idx, 3); +} + // function which prepares given primitive for weights optimization template void post_optimize_weights::optimize_weights(T& node, program& p) { @@ -109,15 +119,26 @@ void post_optimize_weights::optimize_weights(T& node, program& p) { set_implementation(weights_reorder_node); } } else { - auto weights_reorder = _rf.get_weights_reorder(prev_node.id(), weights_reorder_params); - // insert new weights reorder node to topology - p.add_intermediate(weights_reorder.first, node, i, !weights_reorder.second); - // set weights reorder's node output layout and implementation - auto& weights_reorder_node = node.get_dependency(i); - weights_reorder_node.get_output_layout(false); + if (node.type() == lstm_seq::type_id()) { + program_node& prev_node = node.get_dependency(i); + if (i == 5) { + add_lstm_bias_reorder(prev_node.id(), weights_reorder_params, p, prev_node, node); + } else { + add_lstm_weights_reorder(prev_node.id(), weights_reorder_params, p, prev_node, node, i); + } + auto& weights_reorder_node = node.get_dependency(i); + weights_reorder_node.get_output_layout(false); + } else { + auto weights_reorder = _rf.get_weights_reorder(prev_node.id(), weights_reorder_params); + // insert new weights reorder node to topology + p.add_intermediate(weights_reorder.first, node, i, !weights_reorder.second); + // set weights reorder's node output layout and implementation + auto& weights_reorder_node = node.get_dependency(i); + weights_reorder_node.get_output_layout(false); - if 
(!weights_reorder.second) { - set_implementation(weights_reorder_node); + if (!weights_reorder.second) { + set_implementation(weights_reorder_node); + } } } } @@ -126,6 +147,110 @@ void post_optimize_weights::optimize_weights(T& node, program& p) { node.set_output_layout(output_layout, false); } +void post_optimize_weights::select_implementation(program& p, program_node& node) { + node.set_selected_impl(node.type()->create_impl(node)); + if (auto impl = node.get_selected_impl()) { + auto params = node.get_kernel_impl_params(); + p.get_kernels_cache().add_kernels_source(*params, impl->get_kernels_source()); + } +} + +void post_optimize_weights::add_lstm_weights_reorder(primitive_id input_id, std::shared_ptr reorder_params, program& p, \ + cldnn::program_node& prev, cldnn::program_node& node, size_t i) { + OPENVINO_ASSERT(reorder_params != nullptr, "[GPU] WeightsReorderParams is not initialized."); + std::string reorder_id = input_id + "_reo_" + std::to_string(i); + const auto dir_num = static_cast(reorder_params->get_input_layout().get_shape()[0]); + auto hiddenSize = reorder_params->get_input_layout().get_shape()[1] / 4; + auto inputSize = static_cast(reorder_params->get_input_layout().get_shape()[2]); + int size_third; + const int W_idx = 3; + if (i == W_idx) { + size_third = inputSize; + } else { + size_third = static_cast(hiddenSize); + } + auto cropSizeR = cldnn::tensor{dir_num, static_cast(hiddenSize), 1, size_third}; + std::string crop_id_b = input_id + "_c"; + auto get_crop_node = [&](int cropNum) -> cldnn::program_node& { + auto crop_id = primitive_id(crop_id_b + std::to_string(cropNum)); + auto crop_prim = std::make_shared(crop_id, reorder_id, cropSizeR, cldnn::tensor{0, static_cast(cropNum*hiddenSize), 0, 0}); + return p.get_or_create(crop_prim); + }; + + auto& crop0_node = get_crop_node(0); + auto& crop1_node = get_crop_node(1); + auto crop2_id = primitive_id(crop_id_b + std::to_string(2)); + auto crop2_prim = std::make_shared(crop2_id, reorder_id, cldnn::tensor{dir_num, static_cast(2*hiddenSize), 1, size_third}, + cldnn::tensor{0, static_cast(2*hiddenSize), 0, 0}); + auto& crop2_node = p.get_or_create(crop2_prim); + std::vector con_input{input_info(crop_id_b + "1"), input_info(crop_id_b + "0"), input_info(crop_id_b + "2")}; + cldnn::primitive_id concat_id{input_id + "cont"}; + auto con = std::make_shared(concat_id, con_input, 1); + auto& con_node = p.get_or_create(con); + p.add_intermediate(con_node, node, prev, true); + p.add_intermediate(crop1_node, con_node, prev, true); + p.add_connection(prev, crop0_node, 0); + p.add_connection(prev, crop2_node, 0); + p.add_connection(crop0_node, con_node, 0); + p.add_connection(crop2_node, con_node, 0); + std::string permute_id = input_id + "_perx"; + std::vector ord{0, 2, 1}; + auto permute = std::make_shared(permute_id, input_info{concat_id}, ord); + auto& permute_node = p.get_or_create(permute); + p.add_intermediate(permute_node, node, con_node, true); + auto set_implementation_and_output = [this, &p](program_node& node) { + node.get_output_layout(false); + select_implementation(p, node); + p.mark_if_constant(node); + node.recalc_output_layout(false); + }; + set_implementation_and_output(crop1_node); + set_implementation_and_output(crop0_node); + set_implementation_and_output(crop2_node); + set_implementation_and_output(con_node); + set_implementation_and_output(permute_node); +} + +void post_optimize_weights::add_lstm_bias_reorder(primitive_id input_id, std::shared_ptr reorder_params, program& p, \ + cldnn::program_node& prev, 
cldnn::program_node& node) { + OPENVINO_ASSERT(reorder_params != nullptr, "[GPU] WeightsReorderParams is not initialized."); + const auto dir_num = static_cast(reorder_params->get_input_layout().get_shape()[0]); + auto hiddenSize = reorder_params->get_output_layout().get_shape()[1] / 4; + auto cropSize = cldnn::tensor{dir_num, static_cast(hiddenSize), 1, 1}; + std::string crop_id_b = input_id + "_c"; + auto get_crop_node = [&](int cropNum) -> cldnn::program_node& { + auto crop_id = primitive_id(crop_id_b + std::to_string(cropNum)); + auto crop_prim = std::make_shared(crop_id, input_id, cropSize, cldnn::tensor{0, static_cast(cropNum*hiddenSize), 0, 0}); + return p.get_or_create(crop_prim); + }; + auto& crop0_node = get_crop_node(0); + auto& crop1_node = get_crop_node(1); + auto crop2_id = primitive_id(crop_id_b + std::to_string(2)); + auto crop2_prim = std::make_shared(crop2_id, input_id, cldnn::tensor{dir_num, static_cast(2*hiddenSize), 1, 1}, + cldnn::tensor{0, static_cast(2*hiddenSize), 0, 0}); + auto& crop2_node = p.get_or_create(crop2_prim); + std::vector con_input{input_info(crop1_node.id()), input_info(crop0_node.id()), input_info(crop2_node.id())}; + cldnn::primitive_id concat_id{input_id + "concat"}; + auto con = std::make_shared(concat_id, con_input, 1); + auto& con_node = p.get_or_create(con); + p.add_intermediate(con_node, node, prev, true); + p.add_intermediate(crop1_node, con_node, prev, true); + p.add_connection(prev, crop0_node, 0); + p.add_connection(prev, crop2_node, 0); + p.add_connection(crop0_node, con_node, 0); + p.add_connection(crop2_node, con_node, 0); + auto set_implementation_and_output = [this, &p](program_node& node) { + node.get_output_layout(false); + select_implementation(p, node); + p.mark_if_constant(node); + node.recalc_output_layout(false); + }; + set_implementation_and_output(crop0_node); + set_implementation_and_output(crop1_node); + set_implementation_and_output(crop2_node); + set_implementation_and_output(con_node); +} + void post_optimize_weights::run(program& p) { for (auto& node : p.get_processing_order()) { if (node->is_type()) { @@ -134,8 +259,11 @@ void post_optimize_weights::run(program& p) { optimize_weights(node->as(), p); } else if (node->is_type()) { optimize_weights(node->as(), p); + } else if (node->is_type()) { + optimize_weights(node->as(), p); } } + p.get_processing_order().calc_processing_order(p); } } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index e94714c84fdebf..de7f51b071ae53 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -17,11 +17,12 @@ #include "depth_to_space_inst.h" #include "resample_inst.h" #include "loop_inst.h" -#include "lstm_elt_inst.h" +#include "lstm_cell_inst.h" #include "strided_slice_inst.h" #include "shape_of_inst.h" #include "non_max_suppression_inst.h" #include "experimental_detectron_roi_feature_extractor_inst.hpp" +#include "lstm_seq_inst.h" #include "border_inst.h" #include "pass_manager.h" @@ -504,6 +505,8 @@ bool crop_in_place_optimization::match(const program_node& node, } if (user->is_type() && user->get_dependency_index(node) == 0) return false; + if (user->is_type() || user->is_type()) + return false; } // do not optimize crop, that must be calculated in propagate_constants @@ -519,10 +522,6 @@ bool crop_in_place_optimization::match(const 
program_node& node, return false; if (node.get_users().size() > 0) { - if (node.get_program().is_body_program() && node.get_dependency(0).is_type()) { - return false; - } - GPU_DEBUG_GET_INSTANCE(debug_config); GPU_DEBUG_IF(debug_config->disable_runtime_buffer_fusing && node.is_dynamic()) { return false; diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index c323109850c489..60d1e8aa7e10b7 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -439,7 +439,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { }; auto conv_supports_fusings = [&](convolution_node& node) -> bool { - if (lo.get_optimization_attributes().use_onednn_impls == 1 && + if (lo.has_all_enabled_onednn_impls_optimization_attribute() && lo.get_preferred_impl_type(node, format::byxf /*dummy value to disable format checking*/) == impl_types::onednn) { return true; } @@ -491,7 +491,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { }; auto fc_supports_fusings = [&](fully_connected_node& node) -> bool { - if (lo.get_optimization_attributes().use_onednn_impls && + if (lo.has_all_enabled_onednn_impls_optimization_attribute() && lo.get_preferred_impl_type(node, format::any /*dummy*/) == impl_types::onednn) { return true; } else { @@ -589,7 +589,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { // Do not fuse if the estimated format is fs_b_yx_fsv32 because the optimized kernel does not support fusion if (out_layout.data_type == data_types::f16 && out_layout.is_static() && out_layout.batch() > 1 && ((lo.get_optimization_attributes().fs_b_yx_fsv32_network && - !lo.get_optimization_attributes().use_onednn_impls && !has_reorder_behind_mvn()) || + !lo.has_all_enabled_onednn_impls_optimization_attribute() && !has_reorder_behind_mvn()) || out_layout.format == format::fs_b_yx_fsv32)) { return false; } @@ -665,7 +665,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { if (input.in_shape_of_subgraph || node->in_shape_of_subgraph) return; - if (lo.get_optimization_attributes().use_onednn_impls) { + if (lo.has_all_enabled_onednn_impls_optimization_attribute()) { if (input.is_type() || input.is_type()) return; auto additional_params_input = activation_node.get_primitive()->additional_params_input; @@ -768,7 +768,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { return; // Onednn reorder does not support eltwise nor binary post operation - if (lo.get_optimization_attributes().use_onednn_impls && input.is_type()) { + if (lo.has_all_enabled_onednn_impls_optimization_attribute() && input.is_type()) { return; } @@ -809,7 +809,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { (lo.should_select_b_fs_yx_fsv16_layout(input_data.as(), input_data.get_input_layout(1)) && !is_grouped_conv(input_data.as())) || // Avoid fusing to b_fs_yx_fsv16 (and similar) kernels - lo.get_optimization_attributes().use_onednn_impls || + (lo.has_all_enabled_onednn_impls_optimization_attribute()) || (in_dt_is_i8_u8 && out_dt_is_i8_u8)); should_fuse |= input_data.is_type() && quantize_node.get_scale_shift_opt(); @@ -1067,7 +1067,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { } } - if (lo.get_optimization_attributes().use_onednn_impls && 
lo.is_primitive_implemented_for_onednn(*fused_node)) { + if (lo.has_all_enabled_onednn_impls_optimization_attribute() && lo.is_primitive_implemented_for_onednn(*fused_node)) { auto eltw_in_size = peer_node->get_output_layout(); if (eltw_in_size.is_dynamic() // this whitelist condition is temporarily and to be relaxed soon. diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp index 28ee84c4a4ec02..1e5f943600fc05 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp @@ -489,7 +489,7 @@ void remove_redundant_reorders::run(program& p) { (dep.get_output_layout().format == format::b_fs_yx_fsv16 || dep.get_output_layout().format == format::bfyx || (dep.get_output_layout().format == format::fs_b_yx_fsv32 && - !lo.get_optimization_attributes().use_onednn_impls)); + !lo.has_all_enabled_onednn_impls_optimization_attribute())); auto convert_color_opt = usr->is_type() && prim_desc->has_surface_input(); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp index 213da8cb0ab606..218b6268f9d340 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp @@ -56,9 +56,9 @@ std::map get_preferred_formats(program& p, layout_o onednn_impls_counter++; } - if (lo.get_optimization_attributes().use_onednn_impls && onednn_impls_counter < 1) { + if (!lo.is_empty_onednn_impls_optimization_attribute() && onednn_impls_counter < 1) { should_update_fmt_map = true; - lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::use_onednn_impls, 0); + lo.clear_onednn_impls_optimization_attribute(); GPU_DEBUG_LOG << "Disable oneDNN implementations globally" << std::endl; } diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/select_preferred_formats.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/select_preferred_formats.cpp index 8a1197dfb843a6..fcd6dab33754fd 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/select_preferred_formats.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/select_preferred_formats.cpp @@ -69,7 +69,7 @@ void select_preferred_formats::run(program& p) { #ifdef ENABLE_ONEDNN_FOR_GPU auto& engine = p.get_engine(); - if (p.get_layout_optimizer().get_optimization_attributes().use_onednn_impls) { + if (!p.get_layout_optimizer().is_empty_onednn_impls_optimization_attribute()) { engine.create_onednn_engine(p.get_config()); } #endif // ENABLE_ONEDNN_FOR_GPU diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp index 409fd824063da6..0a999a5a124d3b 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp @@ -309,6 +309,8 @@ kernel_selector::data_layout to_data_layout(format f) { return kernel_selector::data_layout::bfzyx; case format::bzyxf: return kernel_selector::data_layout::bzyxf; + case format::ybfx: + return kernel_selector::data_layout::ybfx; case format::fs_b_yx_fsv32: return kernel_selector::data_layout::fs_b_yx_fsv32; case format::bfwzyx: diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_cell.cpp 
b/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_cell.cpp new file mode 100644 index 00000000000000..a41cd1065122de --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_cell.cpp @@ -0,0 +1,93 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "primitive_base.hpp" + +#include "lstm_cell_inst.h" +#include "lstm/lstm_cell_and_seq_kernel_selector.h" +#include "lstm/lstm_kernel_base.h" +#include "openvino/op/lstm_cell.hpp" +#include "lstm_cell.hpp" + +namespace cldnn { +namespace ocl { + +struct lstm_cell_impl : typed_primitive_impl_ocl { + using parent = typed_primitive_impl_ocl; + using parent::parent; + using kernel_selector_t = kernel_selector::lstm_cell_and_seq_kernel_selector; + using kernel_params_t = kernel_selector::lstm_params; + + DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::lstm_cell_impl) + + std::unique_ptr clone() const override { + return make_unique(*this); + } + +protected: + kernel_arguments_data get_arguments(const typed_primitive_inst& instance) const override { + kernel_arguments_data args; + for (size_t i = 0; i < instance.inputs_memory_count(); i++) { + args.inputs.push_back(instance.input_memory_ptr(i)); + } + + for (size_t i = 0; i < instance.outputs_memory_count(); i++) { + args.outputs.push_back(instance.output_memory_ptr(i)); + } + return args; + } + +public: + static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) { + const auto& primitive = impl_param.typed_desc(); + auto params = get_default_params(impl_param); + for (size_t i = 1; i < 6; ++i) { + params.inputs.push_back(convert_data_tensor(impl_param.get_input_layout(i))); + } + + if (!primitive->activations.empty()) { + auto a_sz = primitive->activations.size(); + auto param_sz = primitive->activation_params.size(); + OPENVINO_ASSERT(param_sz == 0 || a_sz == param_sz, "[GPU] Unexpected activation params count in lstm_cell impl: ", param_sz); + for (size_t i = 0; i < a_sz; i++) { + params.activations.emplace_back(get_kernel_selector_activation_param(primitive->activations[i]), + param_sz ? primitive->activation_params[i].a : 0.0f, + param_sz ? 
primitive->activation_params[i].b : 0.0f); + } + } + + if (primitive->clip > 0.0f) { + params.activations.emplace_back(get_kernel_selector_activation_param(activation_func::clamp), -primitive->clip, primitive->clip); + } + + params.SetOffsetOrder(static_cast(primitive->offset_order)); + params.clip = primitive->clip; + params.direction = primitive->direction; + + return params; + } + + static kernel_impl_params static_canonicalize_shapes(const kernel_impl_params& impl_params) { + if (impl_params.get_input_layout().get_partial_shape().size() != 3) { + return primitive_impl::static_canonicalize_shapes(impl_params); + } + auto updated_impl_params = canonicalize_fused_shapes(impl_params); + return updated_impl_params; + } + + kernel_impl_params canonicalize_shapes(const kernel_impl_params& impl_params) const override { + return static_canonicalize_shapes(impl_params); + } +}; + +std::unique_ptr LSTMCellImplementationManager::create_impl(const program_node& node, const kernel_impl_params& params) const { + OPENVINO_ASSERT(node.is_type()); + return typed_primitive_impl_ocl::create(static_cast(node), params); +} + +} // namespace ocl +} // namespace cldnn + +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::ocl::lstm_cell_impl) +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::lstm_cell) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_cell.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_cell.hpp new file mode 100644 index 00000000000000..731bacf2e17e4f --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_cell.hpp @@ -0,0 +1,47 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "lstm_cell_inst.h" +#include "impls/registry/implementation_manager.hpp" +#include "intel_gpu/runtime/layout.hpp" + +#include +namespace cldnn { +namespace ocl { + +struct LSTMCellImplementationManager: public ImplementationManager { + OV_GPU_PRIMITIVE_IMPL("ocl::lstm_cell") + LSTMCellImplementationManager(shape_types shape_type, ValidateFunc vf = nullptr) : ImplementationManager(impl_types::ocl, shape_type, vf) {} + + std::unique_ptr create_impl(const program_node& node, const kernel_impl_params& params) const override; + + bool validate_impl(const program_node& node) const override { + assert(node.is_type()); + + const auto& input_layout = node.get_input_layout(0); + const auto& output_layout = node.get_output_layout(0); + + auto input_fmt = input_layout.format; + auto output_fmt = output_layout.format; + auto in_dt = input_layout.data_type; + auto out_dt = output_layout.data_type; + static const std::vector supported_formats = { + format::bfyx, + format::fyxb, + }; + static const std::vector supported_data_types = { + data_types::f32, + data_types::f16, + }; + + if (!one_of(in_dt, supported_data_types) || !one_of(out_dt, supported_data_types)) { + return false; + } + + return one_of(input_fmt.value, supported_formats) && one_of(output_fmt.value, supported_formats); + } +}; + +} // namespace ocl +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_elt.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_elt.cpp deleted file mode 100644 index 5de12d83fdbab3..00000000000000 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_elt.cpp +++ /dev/null @@ -1,137 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "primitive_base.hpp" - -#include "lstm_elt_inst.h" -#include "lstm/lstm_elt_kernel_selector.h" -#include "lstm/lstm_elt_kernel_base.h" - -namespace cldnn { -namespace ocl { - 
-struct lstm_elt_impl : typed_primitive_impl_ocl { - using parent = typed_primitive_impl_ocl; - using parent::parent; - using kernel_selector_t = kernel_selector::lstm_elt_kernel_selector; - using kernel_params_t = kernel_selector::lstm_elt_params; - - DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::lstm_elt_impl) - - std::unique_ptr clone() const override { - return make_deep_copy(*this); - } - -protected: - kernel_arguments_data get_arguments(const typed_primitive_inst& instance) const override { - kernel_arguments_data args = parent::get_arguments(instance); - - args.cell = instance.cell_term() ? instance.cell_memory() : nullptr; - args.outputs = { instance.output_memory_ptr() }; - - return args; - } - -public: - static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) { - const auto& primitive = impl_param.typed_desc(); - auto params = get_default_params(impl_param); - - if (!primitive->cell.empty()) { - const auto& cell_idx = 1; - const auto& cell_layout = impl_param.input_layouts[cell_idx]; - params.SetCell(convert_data_tensor(cell_layout)); - // TODO: make a generic function to get the direction - if (cell_layout.spatial(1) > 1) { - params.cell_direction = primitive->direction; - } - } - - if (!primitive->activations.empty()) { - auto a_sz = primitive->activations.size(); - auto param_sz = primitive->activation_params.size(); - OPENVINO_ASSERT(param_sz == 0|| a_sz == param_sz, "[GPU] Unexpected activation params count in lstm_elt impl: ", param_sz); - for (size_t i = 0; i < a_sz; i++) { - params.activations.emplace_back(get_kernel_selector_activation_param(primitive->activations[i]), - param_sz ? primitive->activation_params[i].a : 0.0f, - param_sz ? primitive->activation_params[i].b : 0.0f); - } - } - - if (primitive->clip > 0.0f) { - params.activations.emplace_back(get_kernel_selector_activation_param(activation_func::clamp), -primitive->clip, primitive->clip); - } - - params.SetOffsetOrder(static_cast(primitive->offset_order)); - params.clip = primitive->clip; - params.input_forget = primitive->input_forget; - params.direction = primitive->direction; - - return params; - } - - static kernel_impl_params static_canonicalize_shapes(const kernel_impl_params& impl_params) { - if (impl_params.get_input_layout().get_partial_shape().size() != 2) { - return primitive_impl::static_canonicalize_shapes(impl_params); - } - auto updated_impl_params = canonicalize_fused_shapes(impl_params); - - auto& input_layout = updated_impl_params.input_layouts[0]; - auto& weights_layout = updated_impl_params.input_layouts[1]; - auto& output_layout = updated_impl_params.output_layouts[0]; - - auto input_pshape = input_layout.get_partial_shape(); - auto weights_pshape = weights_layout.get_partial_shape(); - auto output_pshape = output_layout.get_partial_shape(); - - auto lstm_input_size = static_cast(input_pshape[1].get_length()); - auto lstm_batch_size = static_cast(input_pshape[0].get_length()); - auto lstm_hidden_size = static_cast(lstm_input_size / 4); - - GPU_DEBUG_LOG << "lstm_input_size : " << lstm_input_size << std::endl; - GPU_DEBUG_LOG << "lstm_batch_size : " << lstm_batch_size << std::endl; - GPU_DEBUG_LOG << "lstm_hidden_size : " << lstm_hidden_size << std::endl; - - GPU_DEBUG_LOG << "origin input_pshape : " << input_layout.to_short_string() << std::endl; - GPU_DEBUG_LOG << "origin weights_layout : " << weights_layout.to_short_string() << std::endl; - - input_pshape = {lstm_batch_size, 1, 1, lstm_input_size}; - input_layout.set_partial_shape(input_pshape); - - weights_pshape = 
{lstm_batch_size, 1, 1, lstm_hidden_size}; // {batch, direction, 1, hidden_size} - weights_layout.format = format::adjust_to_rank(weights_layout.format, weights_pshape.size()); - weights_layout.set_partial_shape(weights_pshape); - - updated_impl_params.weights_layout = weights_layout; - - GPU_DEBUG_LOG << "input_layout : " << input_layout.to_short_string() << std::endl; - GPU_DEBUG_LOG << "weights_layout : " << weights_layout.to_short_string() << std::endl; - GPU_DEBUG_LOG << "output_layout : " << output_layout.to_short_string() << std::endl; - - OPENVINO_ASSERT(input_pshape.size() == 4 && weights_pshape.size() == 4, "input and weights shape should be rank 4"); - return updated_impl_params; - } - - kernel_impl_params canonicalize_shapes(const kernel_impl_params& impl_params) const override { - return static_canonicalize_shapes(impl_params); - } -}; - -namespace detail { - -attach_lstm_elt_impl::attach_lstm_elt_impl() { - implementation_map::add(impl_types::ocl, typed_primitive_impl_ocl::create, { - std::make_tuple(data_types::f32, format::bfyx), - std::make_tuple(data_types::f16, format::bfyx), - std::make_tuple(data_types::f32, format::fyxb), - std::make_tuple(data_types::f16, format::fyxb), - }); -} - -} // namespace detail -} // namespace ocl -} // namespace cldnn - -BIND_BINARY_BUFFER_WITH_TYPE(cldnn::ocl::lstm_elt_impl) -BIND_BINARY_BUFFER_WITH_TYPE(cldnn::lstm_elt) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp index 2597e419e66a41..9b31e70d4ab69d 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp @@ -39,7 +39,6 @@ void register_implementations() { REGISTER_OCL(kv_cache); REGISTER_OCL(paged_attention); REGISTER_OCL(lrn); - REGISTER_OCL(lstm_elt); REGISTER_OCL(multiclass_nms); REGISTER_OCL(multinomial); REGISTER_OCL(mutable_data); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp index d4b08b5154ef4b..906210f08252a4 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp @@ -113,7 +113,6 @@ REGISTER_OCL(group_normalization); REGISTER_OCL(kv_cache); REGISTER_OCL(paged_attention); REGISTER_OCL(lrn); -REGISTER_OCL(lstm_elt); REGISTER_OCL(multiclass_nms); REGISTER_OCL(multinomial); REGISTER_OCL(mutable_data); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/rnn_seq.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/rnn_seq.cpp new file mode 100644 index 00000000000000..3fb8ae13d3baa4 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/rnn_seq.cpp @@ -0,0 +1,94 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "primitive_base.hpp" + +#include "lstm_seq_inst.h" +#include "rnn_seq.hpp" +#include "lstm/lstm_cell_and_seq_kernel_selector.h" +#include "lstm/lstm_kernel_base.h" +#include "openvino/op/lstm_sequence.hpp" +#include "impls/registry/implementation_manager.hpp" + +namespace cldnn { +namespace ocl { + +struct rnn_seq_impl : typed_primitive_impl_ocl { + using parent = typed_primitive_impl_ocl; + using parent::parent; + using kernel_selector_t = kernel_selector::lstm_cell_and_seq_kernel_selector; + using kernel_params_t = kernel_selector::lstm_params; + + DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::rnn_seq_impl) + + std::unique_ptr clone() const override { + return make_unique(*this); + } + +protected: + 
kernel_arguments_data get_arguments(const typed_primitive_inst& instance) const override { + kernel_arguments_data args; + for (size_t i = 0; i < instance.inputs_memory_count(); i++) { + args.inputs.push_back(instance.input_memory_ptr(i)); + } + + for (size_t i = 0; i < instance.outputs_memory_count(); i++) { + args.outputs.push_back(instance.output_memory_ptr(i)); + } + return args; + } + +public: + static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) { + const auto& primitive = impl_param.typed_desc(); + auto params = get_default_params(impl_param); + params.sequential = true; + for (size_t i = 1; i < impl_param.input_layouts.size(); ++i) { + params.inputs.push_back(convert_data_tensor(impl_param.get_input_layout(i))); + } + + if (!primitive->activations.empty()) { + auto a_sz = primitive->activations.size(); + auto param_sz = primitive->activation_params.size(); + OPENVINO_ASSERT(param_sz == 0|| a_sz == param_sz, "[GPU] Unexpected activation params count in lstm_seq impl: ", param_sz); + for (size_t i = 0; i < a_sz; i++) { + params.activations.emplace_back(get_kernel_selector_activation_param(primitive->activations[i]), + param_sz ? primitive->activation_params[i].a : 0.0f, + param_sz ? primitive->activation_params[i].b : 0.0f); + } + } + + if (primitive->clip > 0.0f) { + params.activations.emplace_back(get_kernel_selector_activation_param(activation_func::clamp), -primitive->clip, primitive->clip); + } + + params.SetOffsetOrder(static_cast(primitive->offset_order)); + params.clip = primitive->clip; + params.direction = primitive->direction; + return params; + } + + static kernel_impl_params static_canonicalize_shapes(const kernel_impl_params& impl_params) { + if (impl_params.get_input_layout().get_partial_shape().size() != 3) { + return primitive_impl::static_canonicalize_shapes(impl_params); + } + auto updated_impl_params = canonicalize_fused_shapes(impl_params); + return updated_impl_params; + } + + kernel_impl_params canonicalize_shapes(const kernel_impl_params& impl_params) const override { + return static_canonicalize_shapes(impl_params); + } +}; + +std::unique_ptr RNNSeqImplementationManager::create_impl(const program_node& node, const kernel_impl_params& params) const { + OPENVINO_ASSERT(node.is_type()); + return typed_primitive_impl_ocl::create(static_cast(node), params); +} + +} // namespace ocl +} // namespace cldnn + +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::ocl::rnn_seq_impl) +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::lstm_seq) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/rnn_seq.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/rnn_seq.hpp new file mode 100644 index 00000000000000..3e71ad2be51192 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/rnn_seq.hpp @@ -0,0 +1,46 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "lstm_seq_inst.h" +#include "impls/registry/implementation_manager.hpp" +#include "intel_gpu/runtime/layout.hpp" + +#include +namespace cldnn { +namespace ocl { + +struct RNNSeqImplementationManager: public ImplementationManager { + OV_GPU_PRIMITIVE_IMPL("ocl::lstm_seq") + RNNSeqImplementationManager(shape_types shape_type, ValidateFunc vf = nullptr) : ImplementationManager(impl_types::ocl, shape_type, vf) {} + + std::unique_ptr create_impl(const program_node& node, const kernel_impl_params& params) const override; + + bool validate_impl(const program_node& node) const override { + assert(node.is_type()); + + const auto& input_layout = node.get_input_layout(0); + const 
auto& output_layout = node.get_output_layout(0); + + auto input_fmt = input_layout.format; + auto output_fmt = output_layout.format; + auto in_dt = input_layout.data_type; + auto out_dt = output_layout.data_type; + static const std::vector supported_formats = { + format::bfyx + }; + static const std::vector supported_data_types = { + data_types::f32, + data_types::f16, + }; + + if (!one_of(in_dt, supported_data_types) || !one_of(out_dt, supported_data_types)) { + return false; + } + + return one_of(input_fmt.value, supported_formats) && one_of(output_fmt.value, supported_formats); + } +}; + +} // namespace ocl +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/lstm_seq_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/lstm_seq_onednn.cpp new file mode 100644 index 00000000000000..167be7be6e7481 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/lstm_seq_onednn.cpp @@ -0,0 +1,218 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "impls/onednn/utils.hpp" +#include "lstm_seq_inst.h" +#include "primitive_onednn_base.h" +#include "lstm_seq_onednn.hpp" +#include "impls/registry/implementation_map.hpp" + +#include "kernel_selector_common.h" + +#include + +#include +#include +namespace cldnn { +namespace onednn { + +struct lstm_seq_onednn : typed_primitive_onednn_impl { + using parent = typed_primitive_onednn_impl; + using parent::parent; + + DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::onednn::lstm_seq_onednn) + +protected: + std::unique_ptr clone() const override { + return make_unique(*this); + } + + std::unordered_map get_arguments(lstm_seq_inst& instance) const override { + std::unordered_map args; + std::vector> dnnl_arg{{DNNL_ARG_SRC_LAYER, DNNL_ARG_SRC_ITER, DNNL_ARG_SRC_ITER_C}, {DNNL_ARG_WEIGHTS_LAYER, + DNNL_ARG_WEIGHTS_ITER, DNNL_ARG_BIAS}, {DNNL_ARG_DST_LAYER, DNNL_ARG_DST_ITER, DNNL_ARG_DST_ITER_C}}; + + for (int i = 0; i < 3; i++) { + for (int j = 0 ; j < 3; j++) { + dnnl::memory mem; + switch (i) { + case 0: + { + auto& input = instance.input_memory(j); + auto offset = onednn::get_offset(instance.get_input_layout(j), _pd.dnnl::primitive_desc_base::src_desc(j)); + mem = input.get_onednn_memory(_pd.dnnl::primitive_desc_base::src_desc(j), offset); + break; + } + case 1: + { + auto& input = instance.input_memory(3+j); + auto offset = onednn::get_offset(instance.get_input_layout(3+j), _pd.dnnl::primitive_desc_base::weights_desc(j)); + mem = input.get_onednn_memory(_pd.dnnl::primitive_desc_base::weights_desc(j), offset); + break; + } + case 2: + { + auto& output = instance.output_memory(j); + auto offset = onednn::get_offset(instance.get_output_layout(j), _pd.dnnl::primitive_desc_base::dst_desc(j)); + mem = output.get_onednn_memory(_pd.dnnl::primitive_desc_base::dst_desc(j), offset); + break; + } + default: + break; + } + args.insert({dnnl_arg[i][j], mem}); + } + } + return args; + } + + static cldnn::layout get_reorder_layout(const kernel_impl_params& impl_params, size_t layout_nr) { + auto weights_shape = impl_params.get_input_layout(layout_nr).get_shape(); + auto target_weights_layout = impl_params.get_input_layout(layout_nr); + target_weights_layout.format = cldnn::format::bfzyx; + auto layout = target_weights_layout.clone_with_other_shape(ov::Shape{weights_shape[0], weights_shape[1], weights_shape[2], 1, 1}); + return layout; + } + + static std::shared_ptr get_weights_reorder(const kernel_impl_params& impl_params, const dnnl::primitive_desc& pd) { + const auto weights_layout_idx = 3; + 
auto source_weights_layout = impl_params.get_input_layout(weights_layout_idx); + auto target_weights_layout = get_reorder_layout(impl_params, weights_layout_idx); + auto W_desc = onednn::layout_to_memory_desc(source_weights_layout); + auto grouped_weights = format::is_grouped(source_weights_layout.format); + + return std::make_shared(source_weights_layout, + target_weights_layout, + W_desc, + W_desc, + false, + grouped_weights); + } + static std::shared_ptr get_lstm_primitive_descriptor(const kernel_impl_params& impl_params, cldnn::engine& engine, + const dnnl::primitive_attr& attr, + ov::op::RecurrentSequenceDirection direction) { + auto prim = impl_params.typed_desc(); + auto num_dir = static_cast(prim->num_directions()); + const auto& src_shape = impl_params.get_input_layout(0).get_shape(); + auto mod_src_shape = src_shape; + std::swap(mod_src_shape[0], mod_src_shape[1]); + auto input_md = onednn::layout_to_memory_desc(impl_params.get_input_layout(0).clone_with_other_shape(mod_src_shape), dnnl::memory::format_tag::abc); + auto initial_hidden_shape_mod = impl_params.get_input_layout(1).get_shape(); + initial_hidden_shape_mod = { 1, num_dir, initial_hidden_shape_mod[0], initial_hidden_shape_mod[2] }; + auto initial_hidden = onednn::layout_to_memory_desc(impl_params.get_input_layout(1).clone_with_other_shape(initial_hidden_shape_mod)); + auto initial_cell = onednn::layout_to_memory_desc(impl_params.get_input_layout(2).clone_with_other_shape(initial_hidden_shape_mod)); + auto W_shape_mod = impl_params.get_input_layout(3).get_shape(); + W_shape_mod = {1, num_dir, W_shape_mod[2], 4, W_shape_mod[1]/4}; + auto w_layout = impl_params.get_input_layout(3).clone_with_other_shape(W_shape_mod); + w_layout.format = cldnn::format::bfzyx; + auto W_md = onednn::layout_to_memory_desc(w_layout); + auto R_shape_mod = impl_params.get_input_layout(4).get_shape(); + R_shape_mod = {1, num_dir, R_shape_mod[2], 4, R_shape_mod[1]/4}; + auto r_layout = impl_params.get_input_layout(4).clone_with_other_shape(R_shape_mod); + r_layout.format = cldnn::format::bfzyx; + auto R_md = onednn::layout_to_memory_desc(r_layout); + auto B_shape_mod = impl_params.get_input_layout(5).get_shape(); + B_shape_mod = {1, num_dir, 4, B_shape_mod[1]/4}; + auto b_layout = impl_params.get_input_layout(5).clone_with_other_shape(B_shape_mod); + b_layout.format = cldnn::format::bfyx; + auto B_md = onednn::layout_to_memory_desc(b_layout); + auto out_shape = impl_params.get_output_layout().get_shape(); + out_shape = {out_shape[2], out_shape[0], out_shape[3]*num_dir}; + auto output_md = onednn::layout_to_memory_desc(impl_params.get_output_layout().clone_with_other_shape(out_shape), dnnl::memory::format_tag::abc); + auto output1_md = onednn::layout_to_memory_desc(impl_params.get_output_layout(1).clone_with_other_shape(initial_hidden_shape_mod)); + auto output2_md = onednn::layout_to_memory_desc(impl_params.get_output_layout(2).clone_with_other_shape(initial_hidden_shape_mod)); + OPENVINO_ASSERT(input_md.get_format_kind() != dnnl::memory::format_kind::any, + "[GPU] The format kind of the input memory descriptor of onednn lstm_seq cannot be 'any'."); + OPENVINO_ASSERT(output_md.get_format_kind() != dnnl::memory::format_kind::any, + "[GPU] The format kind of the output memory descriptor of onednn lstm_seq cannot be 'any'."); + + auto eng = engine.get_onednn_engine(); + dnnl::rnn_direction lstm_desc_dir; + if (direction == ov::op::RecurrentSequenceDirection::FORWARD) { + lstm_desc_dir = dnnl::rnn_direction::unidirectional_left2right; + } else if 
(direction == ov::op::RecurrentSequenceDirection::REVERSE) { + lstm_desc_dir = dnnl::rnn_direction::unidirectional_right2left; + } else { + lstm_desc_dir = dnnl::rnn_direction::bidirectional_concat; + } + return std::make_shared( + eng, + dnnl::prop_kind::forward_inference, + lstm_desc_dir, + input_md, + initial_hidden, + initial_cell, + W_md, + R_md, + B_md, + output_md, + output1_md, + output2_md); + } + +public: + void save(BinaryOutputBuffer& ob) const override { +#ifdef ONEDNN_PRIMITIVE_SERIALIZATION + parent::save(ob); + + std::vector prim_cache; + prim_cache = _prim.get_cache_blob(); + ob << prim_cache; +#endif + } + + void load(BinaryInputBuffer& ib) override { +#ifdef ONEDNN_PRIMITIVE_SERIALIZATION + parent::load(ib); + + const kernel_impl_params* impl_params = reinterpret_cast(ib.getKernelImplParams()); + + auto input_md = onednn::layout_to_memory_desc(impl_params->get_input_layout(0)); + auto initial_hidden_md = onednn::layout_to_memory_desc(impl_params->get_input_layout(1)); + auto initial_cell_md = onednn::layout_to_memory_desc(impl_params->get_input_layout(2)); + auto W_md = onednn::layout_to_memory_desc(impl_params->get_input_layout(3)); + auto R_md = onednn::layout_to_memory_desc(impl_params->get_input_layout(4)); + auto B_md = onednn::layout_to_memory_desc(impl_params->get_input_layout(5)); + auto output_md = onednn::layout_to_memory_desc(impl_params->get_output_layout()); + auto output2_md = onednn::layout_to_memory_desc(impl_params->get_output_layout()); + auto prim_desc = std::make_shared( + ib.get_engine().get_onednn_engine(), + dnnl::prop_kind::forward_inference, + dnnl::rnn_direction::undef, + input_md, + initial_hidden_md, + initial_cell_md, + W_md, + R_md, + B_md, + output_md, + output_md, + output2_md); + _pd = *prim_desc; + + std::vector prim_cache; + ib >> prim_cache; + _prim = dnnl::primitive(_pd, prim_cache); +#endif + } + + static std::unique_ptr create(const lstm_seq_node& arg, const kernel_impl_params& impl_params) { + auto& engine = impl_params.prog->get_engine(); + auto& config = impl_params.prog->get_config(); + auto attr = impl_params.attrs_onednn; + auto direction = arg.direction(); + auto prim_desc = get_lstm_primitive_descriptor(impl_params, engine, *attr, direction); + return cldnn::make_unique(engine, config, attr, *prim_desc, get_weights_reorder(impl_params, *prim_desc)); + } +}; + +std::unique_ptr LSTMSeqImplementationManager::create_impl(const program_node& node, const kernel_impl_params& params) const { + assert(node.is_type()); + return onednn::lstm_seq_onednn::create(static_cast(node), params); +} + +} // namespace onednn +} // namespace cldnn + +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::onednn::lstm_seq_onednn) diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/lstm_seq_onednn.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/lstm_seq_onednn.hpp new file mode 100644 index 00000000000000..545ae780a7548b --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/lstm_seq_onednn.hpp @@ -0,0 +1,82 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "lstm_seq_inst.h" +#include "reshape_inst.h" +#include "intel_gpu/runtime/utils.hpp" +#include "impls/registry/implementation_manager.hpp" +#include "transformations/utils/utils.hpp" + +#include + + +namespace cldnn { +namespace onednn { + +struct LSTMSeqImplementationManager : public ImplementationManager { + OV_GPU_PRIMITIVE_IMPL("onednn::lstm_seq") + LSTMSeqImplementationManager(shape_types shape_type) : 
ImplementationManager(impl_types::onednn, shape_type) {} + std::unique_ptr create_impl(const program_node& node, const kernel_impl_params& params) const override; + + bool validate_impl(const program_node& node) const override { + assert(node.is_type()); + const auto& info = node.get_program().get_engine().get_device_info(); + if (info.arch == gpu_arch::unknown) + return false; + + auto in0_dt = node.get_input_layout(0).data_type; + auto in1_dt = node.get_input_layout(1).data_type; + auto in2_dt = node.get_input_layout(2).data_type; + auto in3_dt = node.get_input_layout(3).data_type; + auto in4_dt = node.get_input_layout(4).data_type; + auto in5_dt = node.get_input_layout(5).data_type; + auto out0_dt = node.get_output_layout(0).data_type; + auto out1_dt = node.get_output_layout(1).data_type; + auto out2_dt = node.get_output_layout(2).data_type; + bool cell_state_check = one_of(in2_dt, {data_types::f16, data_types::bf16, data_types::f32}) && + one_of(out2_dt, {data_types::f16, data_types::bf16, data_types::f32}); + bool f16_case = everyone_is(data_types::f16, in0_dt, in1_dt, in3_dt, in4_dt, out0_dt, out1_dt); + bool bf16_case = everyone_is(data_types::bf16, in0_dt, in1_dt, in3_dt, in4_dt, out0_dt, out1_dt); + bool f32_case = everyone_is(data_types::f32, in0_dt, in1_dt, in3_dt, in4_dt, in5_dt, out0_dt, out1_dt); + bool u8u8u8_case = one_of(out0_dt, {data_types::u8, data_types::f32}) && everyone_is(data_types::i8, in3_dt, in4_dt) && + everyone_is(data_types::u8, in0_dt, in1_dt, out1_dt) && everyone_is(data_types::f32, in2_dt, in5_dt, out2_dt); + bool f32u8f32_case = everyone_is(data_types::u8, in0_dt) && everyone_is(data_types::i8, in3_dt, in4_dt) && + one_of(out0_dt, {data_types::u8, data_types::f32}) && everyone_is(data_types::f32, in1_dt, in5_dt, out1_dt); + bool s8s8s8_case = everyone_is(data_types::i8, in0_dt, in1_dt, out0_dt, out1_dt) && one_of(out0_dt, {data_types::i8, data_types::f32}) && + everyone_is(data_types::f32, in2_dt, in5_dt, out2_dt); + bool f32s8f32_case = everyone_is(data_types::i8, in0_dt, in3_dt, in4_dt) && one_of(out0_dt, {data_types::i8, data_types::f32}) && + everyone_is(data_types::f32, in1_dt, in5_dt, out1_dt); + + if (!cell_state_check) + return false; + if (!f16_case && !f32_case && !bf16_case && !u8u8u8_case && !f32u8f32_case && !s8s8s8_case && !f32s8f32_case) + return false; + + return node.get_input_layout(0).format == cldnn::format::bfyx || node.get_input_layout(0).format == cldnn::format::fbyx + || node.get_input_layout(0).format == cldnn::format::ybfx; + } + + in_out_fmts_t query_formats(const program_node& node) const override { + assert(node.is_type()); + std::vector in_fmts(node.get_dependencies().size(), format::any); + std::vector out_fmts(node.get_outputs_count(), format::any); + + size_t out_rank = node.get_output_layout().get_rank(); + for (size_t idx = 0; idx < node.get_dependencies().size(); idx++) { + if (node.get_dependency(idx).is_constant()) + continue; + + auto target_format = format::get_default_format(out_rank); + if (idx == 0) + in_fmts[idx] = format::fbyx; + in_fmts[idx] = target_format; + } + out_fmts[0] = format::ybfx; + + return {in_fmts, out_fmts}; + } +}; + +} // namespace onednn +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp index a8aa43671ed048..75e087a25fb48f 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp @@ -270,6 +270,10 @@ dnnl::memory::desc 
layout_to_memory_desc(cldnn::layout l, dnnl::memory::format_t } else if (target_fmt == dnnl::memory::format_tag::ab) { dims.push_back(l.batch()); dims.push_back(l.get_tensor().count() / l.batch()); + } else if (target_fmt == dnnl::memory::format_tag::abc) { + dims.push_back(l.batch()); + dims.push_back(l.feature()); + dims.push_back(l.spatial(1)); } else if (target_fmt == dnnl::memory::format_tag::ba) { dims.push_back(l.feature()); dims.push_back(l.get_tensor().count() / l.feature()); diff --git a/src/plugins/intel_gpu/src/graph/impls/registry/lstm_cell_impls.cpp b/src/plugins/intel_gpu/src/graph/impls/registry/lstm_cell_impls.cpp new file mode 100644 index 00000000000000..09ba1f670b29d3 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/registry/lstm_cell_impls.cpp @@ -0,0 +1,27 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "primitive_inst.h" +#include "registry.hpp" +#include "intel_gpu/primitives/rnn.hpp" + +#if OV_GPU_WITH_OCL + #include "impls/ocl/lstm_cell.hpp" +#endif + +namespace ov { +namespace intel_gpu { + +using namespace cldnn; + +const std::vector>& Registry::get_implementations() { + static const std::vector> impls = { + OV_GPU_CREATE_INSTANCE_OCL(ocl::LSTMCellImplementationManager, shape_types::static_shape) + }; + + return impls; +} + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/graph/impls/registry/lstm_seq_impls.cpp b/src/plugins/intel_gpu/src/graph/impls/registry/lstm_seq_impls.cpp new file mode 100644 index 00000000000000..4b718bd1c74c72 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/registry/lstm_seq_impls.cpp @@ -0,0 +1,32 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "primitive_inst.h" +#include "registry.hpp" +#include "intel_gpu/primitives/rnn.hpp" + +#if OV_GPU_WITH_OCL + #include "impls/ocl/rnn_seq.hpp" +#endif + +#if OV_GPU_WITH_ONEDNN + #include "impls/onednn/lstm_seq_onednn.hpp" +#endif + +namespace ov { +namespace intel_gpu { + +using namespace cldnn; + +const std::vector>& Registry::get_implementations() { + static const std::vector> impls = { + OV_GPU_CREATE_INSTANCE_ONEDNN(onednn::LSTMSeqImplementationManager, shape_types::static_shape) + OV_GPU_CREATE_INSTANCE_OCL(ocl::RNNSeqImplementationManager, shape_types::static_shape) + }; + + return impls; +} + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/graph/impls/registry/registry.hpp b/src/plugins/intel_gpu/src/graph/impls/registry/registry.hpp index a6bb8ad6eebcc2..a837d614d0fb1d 100644 --- a/src/plugins/intel_gpu/src/graph/impls/registry/registry.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/registry/registry.hpp @@ -130,6 +130,8 @@ REGISTER_IMPLS(fully_connected); REGISTER_IMPLS(gather); REGISTER_IMPLS(gather_nd); REGISTER_IMPLS(gemm); +REGISTER_IMPLS(lstm_cell); +REGISTER_IMPLS(lstm_seq); REGISTER_IMPLS(pooling); REGISTER_IMPLS(reduce); REGISTER_IMPLS(reorder); @@ -171,7 +173,6 @@ REGISTER_DEFAULT_IMPLS(grid_sample, OCL_S); REGISTER_DEFAULT_IMPLS(group_normalization, OCL_S, OCL_D); REGISTER_DEFAULT_IMPLS(kv_cache, OCL_S, OCL_D); REGISTER_DEFAULT_IMPLS(lrn, OCL_S); -REGISTER_DEFAULT_IMPLS(lstm_elt, OCL_S); REGISTER_DEFAULT_IMPLS(multiclass_nms, OCL_S); REGISTER_DEFAULT_IMPLS(multinomial, OCL_S); REGISTER_DEFAULT_IMPLS(mutable_data, OCL_S); diff --git a/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h b/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h index 
52abc5f0cf8cb4..e7d5bdc8bdabdf 100644 --- a/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h +++ b/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h @@ -95,8 +95,7 @@ class layout_optimizer { b_fs_zyx_fsv32_network, b_fs_yx_fsv16_network, b_fs_zyx_fsv16_network, - bs_fs_yx_bsv16_fsv16_network, - use_onednn_impls + bs_fs_yx_bsv16_fsv16_network }; struct optimization_attributes { @@ -107,7 +106,7 @@ class layout_optimizer { int32_t b_fs_yx_fsv16_network = 0; int32_t b_fs_zyx_fsv16_network = 0; int32_t bs_fs_yx_bsv16_fsv16_network = 0; - int32_t use_onednn_impls = 0; + std::map onednn_impls = {}; }; private: @@ -190,6 +189,33 @@ class layout_optimizer { void set_optimization_attribute(optimization_attributes_type attribute, int32_t val); optimization_attributes get_optimization_attributes() { return _optimization_attributes; } + template + void enable_onednn_for() { + _optimization_attributes.onednn_impls[PT::type_id()] = true; + } + + template + void disable_onednn_for() { + _optimization_attributes.onednn_impls[PT::type_id()] = false; + } + void add_all_onednn_impls_optimization_attribute(); + bool has_all_enabled_onednn_impls_optimization_attribute(); + template + bool is_enabled_onednn_for() { + auto type_id = PT::type_id(); + auto it = _optimization_attributes.onednn_impls.find(type_id); + if (it == _optimization_attributes.onednn_impls.end()) { + return false; + } + + return it->second; + } + void set_value_onednn(primitive_type_id p_type, bool val); + bool contains_onednn_impls_optimization_attribute(const program_node*); + bool is_empty_onednn_impls_optimization_attribute(); + void clear_onednn_impls_optimization_attribute(); + std::map get_all_onednn_impls_optimization_attribute(); + void set_implementation_forcing(const ov::intel_gpu::ImplForcingMap& map); const std::map>& get_implementation_forcing() const; diff --git a/src/plugins/intel_gpu/src/graph/include/lstm_cell_inst.h b/src/plugins/intel_gpu/src/graph/include/lstm_cell_inst.h new file mode 100644 index 00000000000000..38c4232a500eb9 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/include/lstm_cell_inst.h @@ -0,0 +1,38 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include "intel_gpu/primitives/lstm_cell.hpp" +#include "primitive_inst.h" + +#include + +namespace cldnn { +template <> +struct typed_program_node : public typed_program_node_base { + using parent = typed_program_node_base; + +public: + using parent::parent; +}; + +using lstm_cell_node = typed_program_node; + +template <> +class typed_primitive_inst : public typed_primitive_inst_base { + using parent = typed_primitive_inst_base; + using parent::parent; + +public: + template + static std::vector calc_output_layouts(lstm_cell_node const& node, kernel_impl_params const& impl_param); + static layout calc_output_layout(lstm_cell_node const& node, kernel_impl_params const& impl_param); + static std::string to_string(lstm_cell_node const& node); + +public: + typed_primitive_inst(network& network, lstm_cell_node const& node); +}; + +using lstm_cell_inst = typed_primitive_inst; +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/include/lstm_elt_inst.h b/src/plugins/intel_gpu/src/graph/include/lstm_elt_inst.h deleted file mode 100644 index 1524598c6f3987..00000000000000 --- a/src/plugins/intel_gpu/src/graph/include/lstm_elt_inst.h +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once -#include 
"intel_gpu/primitives/lstm.hpp" -#include "primitive_inst.h" - -#include - -namespace cldnn { -template <> -struct typed_program_node : public typed_program_node_base { - using parent = typed_program_node_base; - -public: - using parent::parent; - - program_node& input() const { return get_dependency(0); } - program_node& cell() const { return get_dependency(1); } - bool cell_term() const { return !get_primitive()->cell.empty(); } - lstm_weights_order offset_order() const { return get_primitive()->offset_order; } - float clip() const { - float clip_val = get_primitive()->clip; - if (clip_val < 0) - throw std::range_error("Clip value < 0"); - return clip_val; - } - bool input_forget() const { return get_primitive()->input_forget; } - int32_t direction() const { return get_primitive()->direction; } -}; - -using lstm_elt_node = typed_program_node; - -template <> -class typed_primitive_inst : public typed_primitive_inst_base { - using parent = typed_primitive_inst_base; - using parent::parent; - -public: - template - static std::vector calc_output_layouts(lstm_elt_node const& node, kernel_impl_params const& impl_param); - static layout calc_output_layout(lstm_elt_node const& node, kernel_impl_params const& impl_param); - static std::string to_string(lstm_elt_node const& node); - -public: - typed_primitive_inst(network& network, lstm_elt_node const& node); - - memory::ptr cell_memory() const { return dep_memory_ptr(1); } - bool cell_term() const { return !get_typed_desc()->cell.empty(); } - lstm_weights_order offset_order() const { return get_typed_desc()->offset_order; } - float clip() const { - float clip_val = get_typed_desc()->clip; - if (clip_val < 0) - throw std::range_error("Clip value < 0"); - return clip_val; - } - bool input_forget() const { return get_typed_desc()->input_forget; } - uint32_t direction() const { return get_typed_desc()->direction; } -}; - -using lstm_elt_inst = typed_primitive_inst; - -} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/include/lstm_seq_inst.h b/src/plugins/intel_gpu/src/graph/include/lstm_seq_inst.h new file mode 100644 index 00000000000000..33ad7bebac2fbc --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/include/lstm_seq_inst.h @@ -0,0 +1,39 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include "intel_gpu/primitives/rnn.hpp" +#include "primitive_inst.h" + +#include + +namespace cldnn { +template <> +struct typed_program_node : public typed_program_node_base { + using parent = typed_program_node_base; + +public: + using parent::parent; + ov::op::RecurrentSequenceDirection direction() const { return get_primitive()->direction; } +}; + +using lstm_seq_node = typed_program_node; + +template <> +class typed_primitive_inst : public typed_primitive_inst_base { + using parent = typed_primitive_inst_base; + using parent::parent; + +public: + template + static std::vector calc_output_layouts(lstm_seq_node const& node, kernel_impl_params const& impl_param); + static layout calc_output_layout(lstm_seq_node const& node, kernel_impl_params const& impl_param); + static std::string to_string(lstm_seq_node const& node); + +public: + typed_primitive_inst(network& network, lstm_seq_node const& node); +}; + +using lstm_seq_inst = typed_primitive_inst; +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/include/pass_manager.h b/src/plugins/intel_gpu/src/graph/include/pass_manager.h index 281f95a892662c..9850c25a64ec5d 100644 --- 
a/src/plugins/intel_gpu/src/graph/include/pass_manager.h +++ b/src/plugins/intel_gpu/src/graph/include/pass_manager.h @@ -196,6 +196,11 @@ class post_optimize_weights : public base_pass { weights_bias_offset get_weights_bias_offset(const T& node); template void optimize_weights(T& node, program& p); + void select_implementation(program& p, program_node& node); + void add_lstm_weights_reorder(primitive_id input_id, std::shared_ptr reorder_params, program& p, cldnn::program_node&, \ + cldnn::program_node&, size_t); + void add_lstm_bias_reorder(primitive_id input_id, std::shared_ptr reorder_params, program& p, cldnn::program_node&, \ + cldnn::program_node&); reorder_factory& _rf; }; diff --git a/src/plugins/intel_gpu/src/graph/include/primitive_type_base.h b/src/plugins/intel_gpu/src/graph/include/primitive_type_base.h index eb3f313af00de6..81f614ba9fd43a 100644 --- a/src/plugins/intel_gpu/src/graph/include/primitive_type_base.h +++ b/src/plugins/intel_gpu/src/graph/include/primitive_type_base.h @@ -51,7 +51,7 @@ struct primitive_type_base : primitive_type { if ((node.get_forced_impl_type() & impl_type) != impl_type) continue; - if (impl_type == impl_types::onednn && !node.get_program().get_layout_optimizer().get_optimization_attributes().use_onednn_impls) + if (impl_type == impl_types::onednn && !node.get_program().get_layout_optimizer().contains_onednn_impls_optimization_attribute(&node)) continue; shape_types supported_shape_type = impl->get_shape_type(); @@ -168,7 +168,7 @@ struct primitive_type_base : primitive_type { return true; continue; } else { - if (impl_type == impl_types::onednn && !node.get_program().get_layout_optimizer().get_optimization_attributes().use_onednn_impls) + if (impl_type == impl_types::onednn && !node.get_program().get_layout_optimizer().contains_onednn_impls_optimization_attribute(&node)) continue; if (!impl->validate(node)) diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp index bb4d739b3a07c1..5262e8c4621e72 100644 --- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp +++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp @@ -27,7 +27,6 @@ #include "pooling_inst.h" #include "reduce_inst.h" #include "one_hot_inst.h" -#include "permute_inst.h" #include "quantize_inst.h" #include "mvn_inst.h" #include "depth_to_space_inst.h" @@ -37,7 +36,10 @@ #include "gather_inst.h" #include "broadcast_inst.h" #include "loop_inst.h" +#include "concatenation_inst.h" +#include "permute_inst.h" #include "dft_inst.h" +#include "lstm_seq_inst.h" #include "to_string_utils.h" #include #include @@ -114,7 +116,6 @@ bool layout_optimizer::is_format_supported(program_node& node, format::type fmt) node.get_input_layout(0).data_type != data_types::i8 && node.get_input_layout(0).data_type != data_types::u8) return false; - if (node.is_type()) return node.get_output_layout().format == fmt; @@ -132,7 +133,7 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next, auto next_output_layout = next.get_output_layout(); auto prev_dt = prev.get_output_layout().data_type; auto next_dt = next.get_output_layout().data_type; - auto use_onednn_impls = _optimization_attributes.use_onednn_impls; + auto use_onednn_impls = has_all_enabled_onednn_impls_optimization_attribute(); if (prev.is_dynamic() || next.is_dynamic()) return false; @@ -365,7 +366,7 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, reorder_node auto next = node.get_users().front(); auto dt_prev = 
prev.get_output_layout().data_type; auto dt_next = next->get_output_layout().data_type; - auto use_onednn_impls = _optimization_attributes.use_onednn_impls; + auto use_onednn_impls = contains_onednn_impls_optimization_attribute(&node) && contains_onednn_impls_optimization_attribute(&prev); if (prev.is_type()) return true; @@ -927,7 +928,7 @@ format layout_optimizer::get_expected_format(convolution_node const& node) { } bool onednn_valid_post_ops = get_post_ops_count(node) <= 32; - bool use_onednn_impls = _optimization_attributes.use_onednn_impls && input_layout.data_type != data_types::f32; + bool use_onednn_impls = contains_onednn_impls_optimization_attribute(&node) && input_layout.data_type != data_types::f32; // Use planar bfyx format for dynamic convolutions with explicit padding in clDNN if (node.is_dynamic() && output_layout.get_partial_shape().size() == 4 && node.use_explicit_padding() && !i8_u8_input && @@ -1038,7 +1039,7 @@ format layout_optimizer::get_expected_format(deconvolution_node const& node) { } auto expected_shape = output_layout.get_shape(); - bool use_onednn_impls = _optimization_attributes.use_onednn_impls; + bool use_onednn_impls = contains_onednn_impls_optimization_attribute(&node); auto available = node.get_primitive()->type->get_available_impl_types(node); @@ -1086,7 +1087,7 @@ format layout_optimizer::get_expected_format(quantize_node const& node) { return all_users_gemm; }; - auto use_onednn_impls = _optimization_attributes.use_onednn_impls; + auto use_onednn_impls = has_all_enabled_onednn_impls_optimization_attribute(); if (use_onednn_impls) { expected = format::any; @@ -1126,7 +1127,7 @@ format layout_optimizer::get_expected_format(quantize_node const& node) { bool layout_optimizer::is_primitive_implemented_for_onednn(program_node& node) { if (node.is_type() || node.is_type() || node.is_type() || node.is_type() || node.is_type() || - node.is_type() || node.is_type() || node.is_type()) { + node.is_type() || node.is_type() || node.is_type() || node.is_type()) { return true; } @@ -1219,7 +1220,7 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format format layout_optimizer::get_preferred_format(program_node& node) { format expected = format::any; auto output_layout = node.get_output_layout(); - bool use_onednn_impls = _optimization_attributes.use_onednn_impls; + bool use_onednn_impls = contains_onednn_impls_optimization_attribute(&node); bool allow_new_shape_infer = node.get_program().is_new_shape_infer(); @@ -1417,14 +1418,55 @@ void layout_optimizer::set_optimization_attribute(optimization_attributes_type a case optimization_attributes_type::bs_fs_yx_bsv16_fsv16_network: _optimization_attributes.bs_fs_yx_bsv16_fsv16_network = val; break; - case optimization_attributes_type::use_onednn_impls: - _optimization_attributes.use_onednn_impls = val; - break; default: throw std::out_of_range("unsupported layout optimization attribute"); } } +void layout_optimizer::add_all_onednn_impls_optimization_attribute() { + enable_onednn_for(); + enable_onednn_for(); + enable_onednn_for(); + enable_onednn_for(); + enable_onednn_for(); + enable_onednn_for(); + enable_onednn_for(); + enable_onednn_for(); + enable_onednn_for(); +} + +bool layout_optimizer::has_all_enabled_onednn_impls_optimization_attribute() { + return is_enabled_onednn_for() && is_enabled_onednn_for() && is_enabled_onednn_for() && + is_enabled_onednn_for() && is_enabled_onednn_for() && is_enabled_onednn_for() && + is_enabled_onednn_for() && is_enabled_onednn_for() && 
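+    // True only when every oneDNN-capable primitive type tracked in the attribute map is enabled;
+    // this replaces the former single use_onednn_impls flag with a per-primitive-type decision.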
is_enabled_onednn_for(); +} + +void layout_optimizer::set_value_onednn(primitive_type_id p_type, bool val) { + _optimization_attributes.onednn_impls[p_type] = val; +} + +bool layout_optimizer::contains_onednn_impls_optimization_attribute(const program_node* node) { + auto type_id = node->type(); + auto it = _optimization_attributes.onednn_impls.find(type_id); + if (it == _optimization_attributes.onednn_impls.end()) { + return false; + } + + return it->second; +} + +bool layout_optimizer::is_empty_onednn_impls_optimization_attribute() { + return _optimization_attributes.onednn_impls.empty(); +} + +void layout_optimizer::clear_onednn_impls_optimization_attribute() { + _optimization_attributes.onednn_impls.clear(); +} + +std::map layout_optimizer::get_all_onednn_impls_optimization_attribute() { + return _optimization_attributes.onednn_impls; +} + bool layout_optimizer::is_format_optimized(const convolution_node& node, const format& format, bool use_weak_restrictions) { auto input_layout = node.get_input_layout(); auto weights_layout = node.weights().get_output_layout(); diff --git a/src/plugins/intel_gpu/src/graph/lstm_cell.cpp b/src/plugins/intel_gpu/src/graph/lstm_cell.cpp new file mode 100644 index 00000000000000..0b300199fb05a3 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/lstm_cell.cpp @@ -0,0 +1,51 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "lstm_cell_inst.h" +#include "primitive_type_base.h" +#include "json_object.h" +#include + +namespace cldnn { +GPU_DEFINE_PRIMITIVE_TYPE_ID(lstm_cell) + +layout lstm_cell_inst::calc_output_layout(lstm_cell_node const& node, kernel_impl_params const& impl_param) { + const auto& input_layout = impl_param.get_input_layout(0); + const auto& input_pshape = input_layout.get_partial_shape(); + const auto& input_layout_hidden = impl_param.get_input_layout(1); + const auto& input_pshape_hidden = input_layout_hidden.get_partial_shape(); + const auto& lstm_batch_size = input_pshape[0]; + const auto& lstm_hidden_size = input_pshape_hidden[1]; + + return cldnn::layout{ov::PartialShape{lstm_batch_size, lstm_hidden_size}, input_layout.data_type, input_layout.format}; +} + +template +std::vector lstm_cell_inst::calc_output_layouts(lstm_cell_node const& node, kernel_impl_params const& impl_param) { + const auto& input_layout = impl_param.get_input_layout(0); + const auto& input_pshape = input_layout.get_partial_shape(); + const auto& input_layout_hidden = impl_param.get_input_layout(1); + const auto& input_pshape_hidden = input_layout_hidden.get_partial_shape(); + const auto& lstm_batch_size = input_pshape[0]; + const auto& lstm_hidden_size = input_pshape_hidden[1]; + + auto out_layout = cldnn::layout{ShapeType{lstm_batch_size, lstm_hidden_size}, input_layout.data_type, input_layout.format}; + return {out_layout, out_layout}; +} + +template std::vector lstm_cell_inst::calc_output_layouts(lstm_cell_node const& node, const kernel_impl_params& impl_param); + +std::string lstm_cell_inst::to_string(lstm_cell_node const& node) { + auto node_info = node.desc_to_json(); + + std::stringstream primitive_description; + + json_composite lstm_cell_info; + node_info->add("lstm cell info", lstm_cell_info); + node_info->dump(primitive_description); + + return primitive_description.str(); +} + +lstm_cell_inst::typed_primitive_inst(network& network, lstm_cell_node const& node) : parent(network, node) {} +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/lstm_elt.cpp 
b/src/plugins/intel_gpu/src/graph/lstm_elt.cpp deleted file mode 100644 index 098e89aa45003e..00000000000000 --- a/src/plugins/intel_gpu/src/graph/lstm_elt.cpp +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// -#include "lstm_elt_inst.h" -#include "primitive_type_base.h" -#include "intel_gpu/runtime/error_handler.hpp" -#include "json_object.h" -#include - -namespace cldnn { -GPU_DEFINE_PRIMITIVE_TYPE_ID(lstm_elt) - -layout lstm_elt_inst::calc_output_layout(lstm_elt_node const& node, kernel_impl_params const& impl_param) { - assert(static_cast(impl_param.desc->output_data_types[0]) == false && - "Output data type forcing is not supported for lstm_elt_node!"); - auto input_layout = impl_param.get_input_layout(); - - // tempGEMM{bfyx} = [b: batch, f: direction, x: 1, y: 4 * hidden_size ] input - // cell{bfyx} = [b: batch, f: direction, x: 1, y: hidden_size ] optional - // output{bfyx} = [b: batch, f: 2, x: direction, y: hidden_size ] output - // The output of the lstm_elt node is the concatenation of the intermediate [hidden, cell] tensors. - // A crop/split node is needed to extract each individual tensors - auto result = - layout(input_layout.data_type, - input_layout.format, - tensor(input_layout.batch(), 2, input_layout.spatial(0) / 4, input_layout.feature())); - return result; -} - -template -std::vector lstm_elt_inst::calc_output_layouts(lstm_elt_node const& node, kernel_impl_params const& impl_param) { - std::vector output_layouts; - - // input partial shape [batch, input_size (= hidden_size * 4)] - auto input_layout = impl_param.get_input_layout(); - auto input_pshape = input_layout.get_partial_shape(); - OPENVINO_ASSERT(static_cast(impl_param.desc->output_data_types[0]) == false, "Output data type forcing is not supported for lstm_elt_node!"); - OPENVINO_ASSERT(input_pshape.rank().get_length() == 2, "input_layout rank should be 2 on dynamic shape."); - - int lstm_input_size, lstm_batch_size, lstm_hidden_size; - if (input_pshape[input_pshape.size() - 1].is_static()) { - lstm_input_size = input_pshape[input_pshape.size() - 1].get_length(); - lstm_hidden_size = lstm_input_size / 4; - } else { - lstm_input_size = -1; - lstm_hidden_size = -1; - } - - if (input_pshape[input_pshape.size() - 2].is_static()) { - lstm_batch_size = input_pshape[input_pshape.size() - 2].get_length(); - } else { - lstm_batch_size = -1; - } - - return {cldnn::layout{ov::PartialShape{lstm_batch_size, 2, 1, lstm_hidden_size}, input_layout.data_type, input_layout.format}}; -} - -template std::vector lstm_elt_inst::calc_output_layouts(lstm_elt_node const& node, const kernel_impl_params& impl_param); - -std::string lstm_elt_inst::to_string(lstm_elt_node const& node) { - auto desc = node.get_primitive(); - auto node_info = node.desc_to_json(); - auto cell_id = desc->cell; - - std::stringstream primitive_description; - - json_composite lstm_elt_info; - lstm_elt_info.add("cell id", cell_id); - node_info->add("lstm elt info", lstm_elt_info); - node_info->dump(primitive_description); - - return primitive_description.str(); -} - -lstm_elt_inst::typed_primitive_inst(network& network, lstm_elt_node const& node) : parent(network, node) { - auto input_size = node.get_input_layout(); - CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), - "input format", - input_size.format.value, - "expected format", - format::bfyx, - format::fyxb); -} -} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/lstm_seq.cpp b/src/plugins/intel_gpu/src/graph/lstm_seq.cpp new file 
mode 100644 index 00000000000000..f06f7a644ad12a --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/lstm_seq.cpp @@ -0,0 +1,69 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "lstm_seq_inst.h" +#include "primitive_type_base.h" +#include "json_object.h" +#include + +namespace cldnn { +GPU_DEFINE_PRIMITIVE_TYPE_ID(lstm_seq) + +layout lstm_seq_inst::calc_output_layout(lstm_seq_node const& node, kernel_impl_params const& impl_param) { + const auto& desc = impl_param.typed_desc(); + const auto& input_layout = impl_param.get_input_layout(0); + const auto& input_pshape = input_layout.get_partial_shape(); + const auto& input_layout_hidden = impl_param.get_input_layout(1); + const auto& input_pshape_hidden = input_layout_hidden.get_partial_shape(); + const auto& lstm_batch_size = input_pshape[0]; + const auto& lstm_seq_length = input_pshape[1]; + const auto& lstm_hidden_size = input_pshape_hidden[2]; + + auto first_out_fmt = cldnn::format::bfyx; + if (node.get_preferred_impl_type() == impl_types::onednn && node.get_preferred_output_fmt() != format::any) { + first_out_fmt = node.get_preferred_output_fmt(); + } + + return cldnn::layout{ov::PartialShape{lstm_batch_size, desc->num_directions(), lstm_seq_length, lstm_hidden_size}, input_layout.data_type, first_out_fmt}; +} + +template +std::vector lstm_seq_inst::calc_output_layouts(lstm_seq_node const& node, kernel_impl_params const& impl_param) { + const auto& desc = impl_param.typed_desc(); + const auto& input_layout = impl_param.get_input_layout(0); + const auto& input_pshape = input_layout.get_partial_shape(); + const auto& input_layout_hidden = impl_param.get_input_layout(1); + const auto& input_pshape_hidden = input_layout_hidden.get_partial_shape(); + const auto& lstm_batch_size = input_pshape[0]; + const auto& lstm_seq_length = input_pshape[1]; + const auto& lstm_hidden_size = input_pshape_hidden[2]; + + auto first_out_fmt = cldnn::format::bfyx; + auto second_out_fmt = input_layout.format; + auto third_out_fmt = input_layout.format; + if (node.get_preferred_impl_type() == impl_types::onednn && node.get_preferred_output_fmt() != format::any) { + first_out_fmt = node.get_preferred_output_fmt(); + } + auto num_directions = desc->num_directions(); + + return {cldnn::layout{ShapeType{lstm_batch_size, num_directions, lstm_seq_length, lstm_hidden_size}, input_layout.data_type, first_out_fmt}, \ + cldnn::layout{ShapeType{lstm_batch_size, num_directions, lstm_hidden_size}, input_layout.data_type, second_out_fmt}, \ + cldnn::layout{ShapeType{lstm_batch_size, num_directions, lstm_hidden_size}, input_layout.data_type, third_out_fmt}}; +} + +template std::vector lstm_seq_inst::calc_output_layouts(lstm_seq_node const& node, const kernel_impl_params& impl_param); + +std::string lstm_seq_inst::to_string(lstm_seq_node const& node) { + auto node_info = node.desc_to_json(); + + std::stringstream primitive_description; + + json_composite lstm_seq_info; + node_info->add("lstm seq info", lstm_seq_info); + node_info->dump(primitive_description); + + return primitive_description.str(); +} + +lstm_seq_inst::typed_primitive_inst(network& network, lstm_seq_node const& node) : parent(network, node) {} +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index 2bfaac84134387..bdffb9c4980722 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -70,6 +70,9 @@ #include "unique_inst.hpp" #include 
"condition_inst.h" #include "to_string_utils.h" +#include "intel_gpu/graph/serialization/map_serializer.hpp" + +#include "intel_gpu/primitives/rnn.hpp" // TODO: Remove once we have interface for kernels cache #include "impls/ocl/kernels_cache.hpp" @@ -151,15 +154,14 @@ program::program(engine& engine_ref, is_internal(is_internal), _is_body_program(is_body_program), _compilation_context(compilation_context) { - _config.apply_user_properties(_engine.get_device_info()); init_primitives(); GPU_DEBUG_INFO << "Program config\n" << _config.to_string(); init_program(); prepare_nodes(topology); program_node::reset_unique_id(); - if (no_optimizations) { init_graph(); + _config.apply_user_properties(_engine.get_device_info()); } else { build_program(is_internal); if (_is_body_program) { @@ -494,6 +496,7 @@ void program::set_options() { void program::build_program(bool is_internal) { init_graph(); + _config.apply_user_properties(_engine.get_device_info()); { pre_optimize_graph(is_internal); } run_graph_compilation(); { post_optimize_graph(is_internal); } @@ -523,6 +526,9 @@ void program::init_graph() { for (auto& node : processing_order) { if (!node->is_type()) node->get_output_layouts(); + if (node->is_type()) { + _config.set_property(ov::intel_gpu::use_onednn(true)); + } } // Perform initial shape_of subgraphs markup apply_opt_pass(); @@ -1631,11 +1637,17 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) { #ifdef ENABLE_ONEDNN_FOR_GPU bool enable_onednn_for_tests = get_config().get_property(ov::intel_gpu::optimize_data) || is_internal_program(); auto& engine = get_engine(); - if (engine.get_device_info().supports_immad && - engine.get_device_info().vendor_id == INTEL_VENDOR_ID && + if (engine.get_device_info().vendor_id == INTEL_VENDOR_ID && get_config().get_property(ov::intel_gpu::queue_type) == QueueTypes::in_order && - enable_onednn_for_tests) - lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::use_onednn_impls, 1); + enable_onednn_for_tests) { + if (engine.get_device_info().supports_immad) { + lo.add_all_onednn_impls_optimization_attribute(); + } else { + if (get_config().get_property(ov::intel_gpu::use_onednn)) { + lo.enable_onednn_for(); + } + } + } #endif } @@ -1779,7 +1791,13 @@ void program::save(cldnn::BinaryOutputBuffer& ob) const { ob << _is_body_program; ob << _can_be_optimized; - ob << get_layout_optimizer().get_optimization_attributes().use_onednn_impls; + auto onednn_impls_size = get_layout_optimizer().get_all_onednn_impls_optimization_attribute().size(); + ob << onednn_impls_size; + for (const auto& onednn_impl : get_layout_optimizer().get_all_onednn_impls_optimization_attribute()) { + ob << prim_map_storage::instance().get_type_string(onednn_impl.first); + ob << onednn_impl.second; + } + processing_order.save(ob); { @@ -1903,9 +1921,18 @@ void program::load(cldnn::BinaryInputBuffer& ib) { ib >> _is_body_program; ib >> _can_be_optimized; - int32_t use_onednn_attr = 0; - ib >> use_onednn_attr; - get_layout_optimizer().set_optimization_attribute(layout_optimizer::optimization_attributes_type::use_onednn_impls, use_onednn_attr); + + size_t num_of_onednn_impls; + ib >> num_of_onednn_impls; + for (size_t num = 0; num < num_of_onednn_impls; num++) { + primitive_id p_id{}; + bool enabled; + ib >> p_id; + ib >> enabled; + auto ptype_id = prim_map_storage::instance().get_type_id(p_id); + get_layout_optimizer().set_value_onednn(ptype_id, enabled); + } + _loaded_from_cache = true; processing_order.load(ib, *this); diff --git 
a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/lstm_cell_and_seq_bfyx.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/lstm_cell_and_seq_bfyx.cl new file mode 100644 index 00000000000000..f2cf2ca985e855 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/lstm_cell_and_seq_bfyx.cl @@ -0,0 +1,215 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/batch_headers/fetch_data.cl" +#include "include/batch_headers/common.cl" + +#define INPUT0_TYPE_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, VEC_SIZE) +#define INPUT1_TYPE_VEC MAKE_VECTOR_TYPE(INPUT1_TYPE, VEC_SIZE) +#define INPUT3_TYPE_VEC MAKE_VECTOR_TYPE(INPUT3_TYPE, VEC_SIZE) +#define INPUT4_TYPE_VEC MAKE_VECTOR_TYPE(INPUT4_TYPE, VEC_SIZE) +#define OUTPUT_TYPE_VEC MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE) +#define READ_VEC(offset, ptr) CAT(vload, VEC_SIZE)(offset, ptr) + +#ifdef SEQUENCE +#define GET_IN0_IDX(b, f, y) INPUT0_GET_INDEX(b, f, y, 0) + #if DIRECTION == 2 + #define GET_IN1_IDX(b, f, y) INPUT1_GET_INDEX(b, f, y, 0) + #define GET_IN2_IDX(b, f, y) INPUT2_GET_INDEX(b, f, y, 0) + #define GET_IN3_IDX(b, f, y) INPUT3_GET_INDEX(b, f, y, 0) + #define GET_IN4_IDX(b, f, y) INPUT4_GET_INDEX(b, f, y, 0) + #define GET_IN5_IDX(b, f) INPUT5_GET_INDEX(b, f, 0, 0) + #else + #define GET_IN1_IDX(b, f, y) INPUT1_GET_INDEX(b, 0, y, 0) + #define GET_IN2_IDX(b, f, y) INPUT2_GET_INDEX(b, 0, y, 0) + #define GET_IN3_IDX(b, f, y) INPUT3_GET_INDEX(0, f, y, 0) + #define GET_IN4_IDX(b, f, y) INPUT4_GET_INDEX(0, f, y, 0) + #define GET_IN5_IDX(b, f) INPUT5_GET_INDEX(0, f, 0, 0) + #endif +#else +#define GET_IN0_IDX(b, f, y) INPUT0_GET_INDEX(b, y, 0, 0) +#define GET_IN1_IDX(b, f, y) INPUT1_GET_INDEX(b, y, 0, 0) +#define GET_IN2_IDX(b, f, y) INPUT2_GET_INDEX(b, y, 0, 0) +#define GET_IN3_IDX(b, f, y) INPUT3_GET_INDEX(f, y, 0, 0) +#define GET_IN4_IDX(b, f, y) INPUT4_GET_INDEX(f, y, 0, 0) +#define GET_IN5_IDX(b, f) INPUT5_GET_INDEX(f, 0, 0, 0) +#endif + +KERNEL(lstm_cell_and_seq_bfyx)( + const __global INPUT0_TYPE* x, + const __global INPUT1_TYPE* initial_hidden_state, + const __global INPUT2_TYPE* initial_cell_state, + const __global INPUT3_TYPE* W, + const __global INPUT4_TYPE* R, + const __global INPUT5_TYPE* B, +#ifdef SEQUENCE + const __global INPUT6_TYPE* sequence_lengths, + __global OUTPUT_TYPE* hidden_history, + __global OUTPUT1_TYPE* hidden_state, + __global OUTPUT2_TYPE* cell_state +#else + __global OUTPUT_TYPE* hidden_state, + __global OUTPUT1_TYPE* cell_state +#endif +) +{ + const uint b = get_global_id(1); + const uint local_idx = get_local_id(0); + const uint weight_offsets[4] = {GEMM_OFFSET_F, GEMM_OFFSET_I, GEMM_OFFSET_Z, GEMM_OFFSET_O}; + #ifdef SEQUENCE + const uint real_seq_length = sequence_lengths[INPUT6_GET_INDEX(b, 0, 0, 0)]; + #else + const uint real_seq_length = 1; + #endif + #if DIRECTION == 2 + unroll_for(uint dir=0;dir0){ + barrier(CLK_LOCAL_MEM_FENCE); + } + #endif + unroll_for(uint l=0;l= HIDDEN_SIZE) { + continue; + } + ACCUMULATOR_TYPE gate_output[GATE_NUM]; + unroll_for(uint k=0;k0){ + barrier(CLK_LOCAL_MEM_FENCE); + } + #endif + unroll_for(uint l=0;l= HIDDEN_SIZE) { + continue; + } + ACCUMULATOR_TYPE gate_output[GATE_NUM]; + unroll_for(uint k=0;k