diff --git a/src/inference/dev_api/openvino/runtime/memory_solver.hpp b/src/inference/dev_api/openvino/runtime/memory_solver.hpp index b2e11d203ef50f..5a93004c5e20f2 100644 --- a/src/inference/dev_api/openvino/runtime/memory_solver.hpp +++ b/src/inference/dev_api/openvino/runtime/memory_solver.hpp @@ -52,6 +52,7 @@ class MemorySolver { struct Box { /** Execution order index of first use. The data will be produced here. */ int start; + // intel_cpu::GlobalExecutionIndex start; /** * The execution order index of last use. After that data will be released. @@ -59,6 +60,7 @@ class MemorySolver { * end of execution. */ int finish; + // intel_cpu::GlobalExecutionIndex finish; /** Size of data. In abstract unit of measure (byte, simd, cache line, ...) */ int64_t size; diff --git a/src/plugins/intel_cpu/src/allocation_context.hpp b/src/plugins/intel_cpu/src/allocation_context.hpp new file mode 100644 index 00000000000000..8affe814807004 --- /dev/null +++ b/src/plugins/intel_cpu/src/allocation_context.hpp @@ -0,0 +1,26 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +namespace ov { +namespace intel_cpu { + +class Node; +class Edge; + +using GlobalExecutionIndex = std::unordered_map, std::pair>; + +struct AllocationContext { + std::vector> edges; + GlobalExecutionIndex execIndex; + std::vector syncPoints; +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/compiled_model.cpp b/src/plugins/intel_cpu/src/compiled_model.cpp index bbee5d937be5d5..0f9adeeb490a1b 100644 --- a/src/plugins/intel_cpu/src/compiled_model.cpp +++ b/src/plugins/intel_cpu/src/compiled_model.cpp @@ -4,6 +4,7 @@ #include "compiled_model.h" #include "async_infer_request.h" +#include "graph.h" #include "infer_request.h" #include "itt.h" #include "low_precision/low_precision.hpp" @@ -19,6 +20,7 @@ #include "openvino/runtime/threading/cpu_streams_info.hpp" #include 
"openvino/runtime/threading/cpu_message.hpp" #include "utils/serialize.hpp" +#include "memory_control.hpp" #include "cpu/x64/cpu_isa_traits.hpp" #include @@ -52,7 +54,8 @@ CompiledModel::CompiledModel(const std::shared_ptr& model, m_cfg{cfg}, m_name{model->get_name()}, m_loaded_from_cache(loaded_from_cache), - m_sub_memory_manager(sub_memory_manager) { + m_sub_memory_manager(sub_memory_manager), + m_networkMemoryControl(std::make_shared()) { m_mutex = std::make_shared(); const auto& core = m_plugin->get_core(); if (!core) @@ -155,17 +158,24 @@ CompiledModel::GraphGuard::Lock CompiledModel::get_graph() const { GraphContext::Ptr ctx; { std::lock_guard lock{*m_mutex.get()}; + MemoryControl* memoryControl = m_networkMemoryControl->createMemoryControlUnit(); auto isQuantizedFlag = (m_cfg.lpTransformsMode == Config::On) && ov::pass::low_precision::LowPrecision::isFunctionQuantized(m_model); - ctx = std::make_shared(m_cfg, m_socketWeights[socketId], isQuantizedFlag, + memoryControl, + m_networkMemoryControl, streamsExecutor, - m_sub_memory_manager); + m_sub_memory_manager, + true); } + const std::shared_ptr model = m_model; - graphLock._graph.CreateGraph(model, ctx); + // @todo propagate input / output memory descriptors + graphLock._graph.Init(model, ctx); + // @todo pass input / output memory + graphLock._graph.Activate(); } catch (...) 
{ exception = std::current_exception(); } @@ -346,7 +356,7 @@ void CompiledModel::release_memory() { for (auto&& graph : m_graphs) { GraphGuard::Lock graph_lock{graph}; auto ctx = graph_lock._graph.getGraphContext(); - ctx->getNetworkMemoryControl()->releaseMemory(); + m_networkMemoryControl->releaseMemory(); } } diff --git a/src/plugins/intel_cpu/src/compiled_model.h b/src/plugins/intel_cpu/src/compiled_model.h index faedf1ae5a744c..cab50971f31a78 100644 --- a/src/plugins/intel_cpu/src/compiled_model.h +++ b/src/plugins/intel_cpu/src/compiled_model.h @@ -4,6 +4,7 @@ #pragma once +#include #include #include @@ -19,6 +20,8 @@ namespace ov { namespace intel_cpu { +class NetworkMemoryControl; + class CompiledModel : public ov::ICompiledModel { public: typedef std::shared_ptr Ptr; @@ -51,6 +54,10 @@ class CompiledModel : public ov::ICompiledModel { void release_memory() override; + std::shared_ptr get_network_memory_control() const { + return m_networkMemoryControl; + } + private: std::shared_ptr create_sync_infer_request() const override; friend class SyncInferRequest; @@ -91,6 +98,7 @@ class CompiledModel : public ov::ICompiledModel { std::vector> m_sub_compiled_models; std::shared_ptr m_sub_memory_manager = nullptr; + std::shared_ptr m_networkMemoryControl = nullptr; bool m_has_sub_compiled_models = false; }; diff --git a/src/plugins/intel_cpu/src/edge.cpp b/src/plugins/intel_cpu/src/edge.cpp index c314718bb82416..acdae424ca5271 100644 --- a/src/plugins/intel_cpu/src/edge.cpp +++ b/src/plugins/intel_cpu/src/edge.cpp @@ -235,7 +235,7 @@ Edge::ReorderStatus Edge::needReorder() { } void Edge::reuse(MemoryPtr ptr) { - OPENVINO_ASSERT(ptr != nullptr, "Attempt to reuse initialized memory in " + name()); + OPENVINO_ASSERT(ptr != nullptr, "Attempt to reuse uninitialized memory in " + name()); memoryPtr = ptr; changeStatus(Status::Allocated); @@ -293,6 +293,11 @@ std::string Edge::name() const { std::stringstream result; result << parentPtr->getName() << " port " << 
parent_port << " <-> " << childPtr->getName() << " port " << child_port; + // result << parentPtr->getName()<< " port " << parent_port + // << " <-> " + // << childPtr->getName() << " port " << child_port + // << " status: " + // << static_cast(getStatus()); return result.str(); } @@ -411,6 +416,9 @@ const MemoryDesc& Edge::getOutputDesc() const { } const MemoryDesc& Edge::getDesc() const { + OPENVINO_ASSERT(!one_of(status, Status::Validated, Status::Allocated), + "Desc of an Allocated edge ", name(), " must be accessed through the memory object"); + if (!getInputDesc().isCompatible(getOutputDesc())) OPENVINO_THROW("Cannot get descriptor for edge: ", getParent()->getName(), "->", getChild()->getName()); @@ -441,7 +449,7 @@ void Edge::validate() { getChild(); if (status != Status::Allocated || !memoryPtr) { - OPENVINO_THROW("Error memory is not allocated!"); + OPENVINO_THROW("Error memory is not allocated for edge: ", name()); } status = Status::Validated; } diff --git a/src/plugins/intel_cpu/src/edge.h b/src/plugins/intel_cpu/src/edge.h index 29cb8113943cd3..b2890f3f477de1 100644 --- a/src/plugins/intel_cpu/src/edge.h +++ b/src/plugins/intel_cpu/src/edge.h @@ -28,11 +28,11 @@ class Edge { int pr_port = 0, int ch_port = 0); enum class Status { - Uninitialized, - NeedAllocation, - NotAllocated, - Allocated, - Validated + Uninitialized, // base edge is unknown yet + NeedAllocation, // edge is the base edge + NotAllocated, // edge is a referencing edge + Allocated, // edge memory is allocated + Validated // edge is validated }; enum class ReorderStatus { @@ -82,6 +82,7 @@ class Edge { } std::string name() const; + const MemoryDesc& getDesc() const; private: std::weak_ptr parent; @@ -99,7 +100,6 @@ class Edge { PortDescBaseCPtr getInputPortDesc() const; PortDescBaseCPtr getOutputPortDesc() const; - const MemoryDesc& getDesc() const; bool enforceReorder(); void collectConsumers(std::vector>& result) const; diff --git a/src/plugins/intel_cpu/src/graph.cpp 
b/src/plugins/intel_cpu/src/graph.cpp index f3f3a379fc2af7..7b519bd75d3fcf 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -6,6 +6,9 @@ #include #include +#include +#include +#include #include #include #include @@ -16,11 +19,14 @@ #include #include +#include "allocation_context.hpp" #include "edge.h" +#include "graph_context.h" #include "graph_dumper.h" #include "graph_optimizer.h" #include "infer_request.h" #include "itt.h" +#include "memory_control.hpp" #include "memory_desc/cpu_memory_desc_utils.h" #include "memory_desc/dnnl_blocked_memory_desc.h" #include "node.h" @@ -64,7 +70,7 @@ template void Graph::CreateGraph(NET &model, const GraphContext::CPtr context) { OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "CreateGraph"); - Init(model, context); + Init(model, std::make_shared(context->disableMemoryReuse())); Activate(); } @@ -76,7 +82,7 @@ void Graph::CreateGraph(const std::vector& graphNodes, if (IsReady()) ForgetGraphData(); - m_context = context; + m_context = std::make_shared(context->disableMemoryReuse()); m_stream = dnnl::stream(getEngine()); this->_name = std::move(name); @@ -277,7 +283,9 @@ static std::tuple, std::vector> ExtractExecutableNo std::vector executableGraphNodes; for (size_t i = 0; i < graphNodes.size(); i++) { const auto& graphNode = graphNodes[i]; - if ((!graphNode->isConstant() && CPU_DEBUG_CAPS_ALWAYS_TRUE(graphNode->isExecutable())) || // non-constant executable or + // if ((!graphNode->isConstant() && CPU_DEBUG_CAPS_ALWAYS_TRUE(!graphNode->canBeSkipped())) || // non-constant executable or + // if ((!graphNode->isConstant()) || // non-constant executable or + if ((!graphNode->isConstant() && !graphNode->canBeSkipped()) || // non-constant executable or (graphNode->isDynamicNode() && !one_of(graphNode->getType(), Type::Input, Type::Output))) { // dynamic, except inputs / outputs /* @todo * Revise implementation. 
@@ -350,16 +358,22 @@ static void UseExternalOutputMemory(const std::map& output } void Graph::Activate(const std::vector& externalInputMemory, - const std::vector& externalOutputMemory) { - OPENVINO_ASSERT(status == Status::Initialized, "Invalid graph status"); + const std::vector& externalOutputMemory) { + // OPENVINO_ASSERT(status == Status::Initialized, "Invalid graph status"); - const bool hasDynNodes = ProcessDynNodes(); - const auto syncNodesInds = hasDynNodes ? IdentifySyncPoints(graphNodes) : std::vector{}; + // const bool hasDynNodes = ProcessDynNodes(); + // const auto syncNodesInds = hasDynNodes ? IdentifySyncPoints(graphNodes) : std::vector{}; UseExternalInputMemory(inputNodesMap, externalInputMemory); UseExternalOutputMemory(outputNodesMap, externalOutputMemory); - Allocate(syncNodesInds); + // std::tie(m_executableGraphNodes, m_executableSyncNodesInds) = ExtractExecutableNodesAndSyncPoints(syncNodesInds, graphNodes); + + // status = hasDynNodes ? (parallel_get_max_threads() > 1 ? Status::ReadyDynamic : Status::ReadyDynamicSeq) + // : Status::ReadyStatic; + + // CPU_DEBUG_CAP_ENABLE(serialize(*this)); + Allocate(); CreatePrimitivesAndExecConstants(); @@ -369,22 +383,6 @@ void Graph::Activate(const std::vector& externalInputMemory, } #endif - std::tie(m_executableGraphNodes, m_executableSyncNodesInds) = ExtractExecutableNodesAndSyncPoints(syncNodesInds, graphNodes); - - if (hasDynNodes) { - status = Status::ReadyDynamic; - // Here we use the following heuristic: if the number of sync nodes is less than 10 times of the number of exec - // nodes, it does make sense to use Sequential dynamic shapes processing due to the high overheads on context - // switching when the dynamic shapes are being processed in parallel and there are a lot of sync points. Also - // this rule works for short graphs (usually subgraphs) when the amount of nodes is to low to process them in - // parallel. 
- const auto exec2sync = m_executableGraphNodes.size() / m_executableSyncNodesInds.size(); - if (exec2sync < 10 || parallel_get_max_threads() < 2) { - status = Status::ReadyDynamicSeq; - } - } else { - status = Status::ReadyStatic; - } CPU_DEBUG_CAP_ENABLE(serialize(*this)); } @@ -713,88 +711,344 @@ void Graph::ResolveComplexInplaceConflicts() { } } -static inline bool isConstOutput(EdgePtr edge) { - return edge->getParent()->isConstant() && !edge->getChild()->isConstant(); +/** + * Partition the \clusters of Edges, by moving and allocating at the same time + * the clusters which cannot be handled as part of generic memory solver algorithm. + * Such clusters meet one of the following criteria: + * - base edge of a cluster is already Allocated + * - base edge of a cluster is a "ov::element::string" type of edge + * - base edge of a cluster is a Constant edge + * + * @return a remaining number of clusters to process (left partition) + */ +static size_t AllocateStringsAndConstants(EdgeClusters& clusters, + const GraphContext::CPtr context) { + auto allocateStringMemory = [context](const EdgePtr& edge) { + if (edge->getParent()->isConstant()) { + if (edge->getParent()->getType() == Type::Input) { + auto constNode = static_cast(edge->getParent().get()); + edge->reuse(std::const_pointer_cast(constNode->getMemoryPtr())); + } else { + edge->externalAllocate(context->getWeightsCache()); + } + auto stringMemory = dynamic_cast(edge->getMemoryPtr().get()); + OPENVINO_ASSERT(stringMemory, "[CPU] Edge between nodes '", + edge->getParent()->getName(), "' and '", edge->getChild()->getName(), "' must have StringMemory."); + return stringMemory->getStringMemoryBlockPtr(); + } + + auto memory = std::make_shared(context->getEngine(), edge->getDesc()); + edge->reuse(memory); + return memory->getStringMemoryBlockPtr(); + }; + + auto allocateConstantEdge = [context](const EdgePtr& edge) { + // std::cout << "Allocating constant edge: " << edge->name() << " wc: " << 
context->getWeightsCache() << "\n"; + if (edge->getParent()->getType() == Type::Input) { + auto constNode = std::static_pointer_cast(edge->getParent()); + edge->reuse(std::const_pointer_cast(constNode->getMemoryPtr())); + } else { + edge->externalAllocate(context->getWeightsCache()); + } + }; + + auto endOfNotAllocatedPartition = + std::partition(clusters.begin(), clusters.end(), + [&allocateStringMemory, &allocateConstantEdge, &context](const EdgeCluster& cluster) { + if (cluster.empty()) return false; + + auto baseEdgeIt = std::find_if(cluster.begin(), cluster.end(), [](const EdgePtr& edge) { + return one_of(edge->getStatus(), Edge::Status::Allocated, Edge::Status::NeedAllocation); + }); + + OPENVINO_ASSERT(baseEdgeIt != cluster.end(), "Unexpected cluster state"); + + // const auto& baseEdge = cluster.front(); + const auto& baseEdge = *baseEdgeIt; + // Skip already allocated cluster + if (baseEdge->getStatus() == Edge::Status::Allocated) { + return false; + } + + // std::cout << "Processing string/const for base edge: " << baseEdge->name() << "\n"; + + // Skip if the baseEdge does not require allocation + if (baseEdge->getStatus() != Edge::Status::NeedAllocation) { + return true; + } + + // Allocate a string cluster + if (baseEdge->getDesc().getPrecision() == element::string) { + OPENVINO_ASSERT(std::all_of(cluster.begin(), cluster.end(), + [](const EdgePtr& edge) { + return edge->getDesc().getPrecision() == element::string; + }), "All edges in the cluster must be string."); + auto memBlock = allocateStringMemory(baseEdge); + for (auto &edge : cluster) { + if (edge->getStatus() == Edge::Status::NotAllocated) { + edge->reuse(std::make_shared(context->getEngine(), edge->getDesc(), memBlock)); + } + } + return false; + } + + // Allocate a constant cluster + if (baseEdge->getParent()->isConstant()) { + // @todo can we add some meaningful assert here? 
+ for (auto &edge : cluster) { + if (edge->getParent()->isConstant() && edge->getStatus() == Edge::Status::NeedAllocation) { + allocateConstantEdge(edge); + } + } + return false; + } + + return true; + }); + + return std::distance(clusters.begin(), endOfNotAllocatedPartition); } -void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { - edgeClusters edge_clusters = MemoryControl::findEdgeClusters(graphEdges); +static void AllocateBaseEdges(const EdgeClusters& edgeClusters, + const MemoryControl::MemorySolution& memorySolution) { + // attach all the not yet allocated edges to the memory control + for (auto&& item : memorySolution) { + int count = 0; + // std::cout << "Processing cluster: " << item.first << "\n"; + for (auto&& edge : edgeClusters[item.first]) { + // std::cout << "Processing edge: " << edge->name() << "\n"; + if (edge->getStatus() == Edge::Status::NeedAllocation) { + // std::cout << "Allocating edge: " << edge->name() << "\n"; - size_t remaining_edge_clusters_count = edge_clusters.size(); + edge->allocate(item.second); - // Resolve special cases: - for (size_t i = 0; i < remaining_edge_clusters_count;) { - auto &cluster = edge_clusters[i]; - bool erase = false; - for (auto &edge : cluster) { - // Remove already allocated edges from the mem reuse algo - if (edge->getStatus() == Edge::Status::Allocated) { - erase = true; - break; + // TODO: WA for some test (like strided_slice_test) which use tensors with + // shapes {0}. And it is implicitly converted into {1} tensor. + // Zeroing of input data allow pass tests. + if (edge->getParent()->getType() == Type::Input && edge->getMemory().getDesc().hasDefinedMaxSize()) + edge->getMemoryPtr()->nullify(); + + count++; } + } + OPENVINO_ASSERT(count == 1, "Expected exactly one allocation. 
Actual number of allocations: ", count); + } +} - // Special allocation for string tensors - if (edge->getDesc().getPrecision() == element::string && edge->getStatus() == Edge::Status::NeedAllocation) { - StringMemory::StringMemoryBlockPtr memBlcok; - if (edge->getParent()->isConstant()) { - if (edge->getParent()->getType() == Type::Input) { - auto constNode = static_cast(edge->getParent().get()); - edge->reuse(std::const_pointer_cast(constNode->getMemoryPtr())); - } else { - edge->externalAllocate(m_context->getWeightsCache()); - } - auto stringMemory = dynamic_cast(edge->getMemoryPtr().get()); - OPENVINO_ASSERT(stringMemory, "[CPU] Edge between nodes '", - edge->getParent()->getName(), "' and '", edge->getChild()->getName(), "' must have StringMemory."); - memBlcok = stringMemory->getStringMemoryBlockPtr(); - } else { - auto memory = std::make_shared(getEngine(), edge->getDesc()); - edge->reuse(memory); - memBlcok = memory->getStringMemoryBlockPtr(); - } - for (auto& edge_c : cluster) { - if (edge_c == edge) { - continue; - } - OPENVINO_ASSERT(edge_c->getDesc().getPrecision() == element::string, "All edges in the cluster must be string."); - if (edge_c->getStatus() == Edge::Status::NotAllocated) { - auto memory = std::make_shared(getEngine(), edge_c->getDesc(), memBlcok); - edge_c->reuse(memory); +static void AllocatedReferencingEdges(const EdgeClusters& clusters) { + for (auto& cluster : clusters) { + for (auto& edge : cluster) { + if (edge->getStatus() != Edge::Status::NotAllocated) { + continue; + } + + std::vector edges_to_process; + edges_to_process.push_back(edge); + for (auto next_edge = edge->getSharedEdge(std::nothrow); + next_edge; + next_edge = next_edge->getSharedEdge(std::nothrow)) { + edges_to_process.push_back(next_edge); + } + + std::for_each(edges_to_process.rbegin(), edges_to_process.rend(), [](const EdgePtr& edge) { + // std::cout << "Processing edge: " << edge->name() << "\n"; + if (edge->getStatus() == Edge::Status::NotAllocated) { + if 
(edge->inPlace(Edge::LOOK_DOWN)) { + edge->getChild()->resolveInPlaceEdges(Edge::LOOK_DOWN); + } else if (edge->inPlace(Edge::LOOK_UP)) { + edge->getParent()->resolveInPlaceEdges(Edge::LOOK_UP); } else { - OPENVINO_THROW("[CPU] String tensors allocation in the cluster. Edge between nodes '", edge_c->getParent()->getName(), "' and '", - edge_c->getChild()->getName(), "' has an unexpected status: ", static_cast(edge_c->getStatus())); + auto sharedEdge = edge->getSharedEdge(); + auto sharedEdgeParent = sharedEdge->getParent(); + // std::cout << "Allocating edge: " << edge->name() << " Using shared edge: " << sharedEdge->name() << "\n"; + edge->allocate(sharedEdge->getMemoryPtr()->getMemoryBlock()); + DEBUG_LOG(*edge, " sharedEdge with ", *sharedEdge); } } - erase = true; - continue; - } + }); + } + } +} - // Special allocation for constants - if (edge->getStatus() != Edge::Status::NeedAllocation || !edge->getParent()->isConstant()) { - continue; +std::vector Graph::CreateExecutionGraph() { + const bool hasDynNodes = ProcessDynNodes(); + auto syncNodesInds = hasDynNodes ? IdentifySyncPoints(graphNodes) : std::vector{}; + + std::tie(m_executableGraphNodes, m_executableSyncNodesInds) = + ExtractExecutableNodesAndSyncPoints(syncNodesInds, graphNodes); + + status = hasDynNodes ? (parallel_get_max_threads() > 1 ? Status::ReadyDynamic : Status::ReadyDynamicSeq) + : Status::ReadyStatic; + + if (hasDynNodes) { + status = Status::ReadyDynamic; + // Here we use the following heuristic: if the number of sync nodes is less than 10 times of the number of exec + // nodes, it does make sense to use Sequential dynamic shapes processing due to the high overheads on context + // switching when the dynamic shapes are being processed in parallel and there are a lot of sync points. Also + // this rule works for short graphs (usually subgraphs) when the amount of nodes is to low to process them in + // parallel. 
+ const auto exec2sync = m_executableGraphNodes.size() / m_executableSyncNodesInds.size(); + if (exec2sync < 10 || parallel_get_max_threads() < 2) { + status = Status::ReadyDynamicSeq; + } + } else { + status = Status::ReadyStatic; + } + + return syncNodesInds; +} + +static void ResolveInOutInPlaceEdgesLegacy(const std::vector& edges) { + for (const auto& edge : edges) { + // std::cout << edge->name() << "\n"; + if (edge->getStatus() == Edge::Status::Uninitialized) { + if (edge->getParent()->getParentEdges().empty() && + one_of(edge->getParent()->getType(), Type::Input, Type::MemoryInput) && + edge->inPlace(Edge::LOOK_UP)) { + edge->getParent()->resolveInPlaceEdges(Edge::LOOK_UP); + } else if (edge->getChild()->getChildEdges().empty() && + one_of(edge->getChild()->getType(), Type::Output, Type::MemoryOutput) && + edge->inPlace(Edge::LOOK_DOWN)) { + edge->getChild()->resolveInPlaceEdges(Edge::LOOK_DOWN); } - if (edge->getParent()->getType() == Type::Input) { - auto constNode = std::static_pointer_cast(edge->getParent()); - edge->reuse(std::const_pointer_cast(constNode->getMemoryPtr())); - } else { - edge->externalAllocate(m_context->getWeightsCache()); + } + } +} + +static void ResolveInOutInPlaceEdges(const std::vector& edges) { + for (const auto& edge : edges) { + if (edge->getStatus() == Edge::Status::Uninitialized) { + if (edge->getParent()->getParentEdges().empty() && + one_of(edge->getParent()->getType(), Type::MemoryInput) && + edge->inPlace(Edge::LOOK_UP)) { + edge->getParent()->resolveInPlaceEdges(Edge::LOOK_UP); + } else if (edge->getChild()->getChildEdges().empty() && + one_of(edge->getChild()->getType(), Type::MemoryOutput) && + edge->inPlace(Edge::LOOK_DOWN)) { + edge->getChild()->resolveInPlaceEdges(Edge::LOOK_DOWN); } - erase = true; } + } +} - if (erase) { - std::swap(edge_clusters[i], edge_clusters[remaining_edge_clusters_count - 1]); - --remaining_edge_clusters_count; - } else { - ++i; +int Graph::RegisterToAllocationContext(int offset, 
AllocationContext& context) { + auto syncNodesInds = CreateExecutionGraph(); + + ResolveInOutInPlaceEdges(graphEdges); + + // nodes are expected to be topologically sorted + for (size_t execIndex = 0, j = 0; execIndex < graphNodes.size(); execIndex++) { + const auto& node = graphNodes[execIndex]; + const auto inputExecIndex = execIndex + offset; + // an offset is the number of nodes in the internal graph minus the current node (-1) + offset = node->registerToAllocationContext(inputExecIndex, context) - 1; + const auto outputExecIndex = execIndex + offset; + context.execIndex[node] = {inputExecIndex, outputExecIndex}; + + if (j < syncNodesInds.size() && syncNodesInds[j] == execIndex) { + context.syncPoints.push_back(inputExecIndex); + j++; + } + } + + context.edges.insert(context.edges.end(), graphEdges.begin(), graphEdges.end()); + + return offset; +} + +AllocationContext Graph::CreateAllocationContext(bool global) { + AllocationContext allocationContext; + + if (global) { + RegisterToAllocationContext(0, allocationContext); + } else { // local allocation context. Used for the nodes with inner graph which are not updated yet + ResolveInOutInPlaceEdgesLegacy(graphEdges); + + auto syncNodesInds = CreateExecutionGraph(); + + for (size_t i = 0; i < graphNodes.size(); i++) { + const auto& node = graphNodes[i]; + allocationContext.execIndex[node] = {i, i}; + } + + allocationContext.edges = graphEdges; + allocationContext.syncPoints = syncNodesInds; + } + + return allocationContext; +} + +static void InitEdgeStatus(const std::vector& edges) { + for (auto& edge : edges) edge->init(); +} + +static void ValidateEdgeStatus(const std::vector& edges) { + for (auto& edge : edges) edge->validate(); +} + +/** + * Forms clusters of edges. 
+ * An edge cluster is a collection of edges, so: + * - base edge is an edge with a Memory which other edges point to by means of inplace logic + * - first edge of a cluster is a base edge with a status either NeedAllocation or Allocated + * - rest of the edges in a cluster are NotAllocated ones, since they point to their base edge + */ +static EdgeClusters FormEdgeClusters(const std::vector& graphEdges) { + typedef std::unordered_map EdgeClusterIdxMap; + EdgeClusters edgeClusters; + EdgeClusterIdxMap edgeClusterIndices; + + for (auto& edge : graphEdges) { + if (edgeClusterIndices.count(edge)) + continue; // edge is visited + + size_t clusterIdx = edgeClusters.size(); + EdgePtr lastSharedEdge = nullptr; + + // find cluster index + for (auto shared_edge = edge->getSharedEdge(std::nothrow); shared_edge; + shared_edge = shared_edge->getSharedEdge(std::nothrow)) { + auto shared_edge_it = edgeClusterIndices.find(shared_edge); + if (shared_edge_it != edgeClusterIndices.end()) { + clusterIdx = shared_edge_it->second; + lastSharedEdge = shared_edge; + break; + } } + + if (clusterIdx == edgeClusters.size()) + edgeClusters.emplace_back(EdgeCluster{edge}); + + // use recursive approach to ensure that the base edge is placed as a first entry of a cluster + std::function addToCluster; + addToCluster = [&addToCluster, &edgeClusterIndices, &clusterIdx, &edgeClusters, &lastSharedEdge](EdgePtr edge) { + if (edge == lastSharedEdge) + return; + + addToCluster(edge->getSharedEdge(std::nothrow)); + + edgeClusterIndices.emplace(edge, clusterIdx); + edgeClusters[clusterIdx].push_back(edge); + }; + + addToCluster(edge); } + return edgeClusters; +} + +static MemoryRegions FormMemoryRegions(const EdgeClusters& clusters, + size_t remaining, + const GlobalExecutionIndex& globalExecIndex) { + auto isConstOutput = [](EdgePtr edge) { + return edge->getParent()->isConstant() && !edge->getChild()->isConstant(); + }; + // Markup the memory regions - std::vector memoryRegions; - 
memoryRegions.reserve(remaining_edge_clusters_count); + MemoryRegions memoryRegions; + memoryRegions.reserve(remaining); - for (size_t i = 0; i < remaining_edge_clusters_count; ++i) { + for (size_t i = 0; i < remaining; ++i) { MemoryRegion reg = {std::numeric_limits::max(), 0, 0, @@ -804,9 +1058,18 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { int64_t boxSize = 0; bool isConst = false, isOutput = false, isInput = false; - for (auto &edge : edge_clusters[i]) { - int e_start = edge->getParent()->getExecIndex(); - int e_finish = edge->getChild()->getExecIndex(); + // std::cout << "Form memory region for cluster: " << i << "\n"; + for (auto &edge : clusters[i]) { + const auto& parent = edge->getParent(); + const auto& child = edge->getChild(); + + // std::cout << "[" << globalExecIndex.at(parent).second << " - " << globalExecIndex.at(child).first << "]" + // << edge->name() + // << "\n"; + + int e_start = globalExecIndex.at(parent).second; + int e_finish = globalExecIndex.at(child).first; + // int e_finish = edge->getChild()->getExecIndex(); auto&& desc = edge->getDesc(); @@ -829,8 +1092,8 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { reg.alloc_type = allocType; isConst |= isConstOutput(edge); - isOutput |= edge->getChild()->getType() == Type::Output; - isInput |= edge->getParent()->getType() == Type::Input; + isOutput |= child->getType() == Type::Output; + isInput |= parent->getType() == Type::Input; } reg.size = boxSize; @@ -850,25 +1113,33 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { memoryRegions.push_back(reg); } - // special processing of the dynamic output edges - auto it = std::remove_if(memoryRegions.begin(), memoryRegions.end(), [&](const MemoryRegion& region) { + return memoryRegions; +} + +static OutputMemoryBlocks FilterOutDynamicOutputEdges(MemoryRegions& memoryRegions, + const EdgeClusters& clusters, + const std::map& outputNodes) { + OutputMemoryBlocks outputMemBlocks; + 
memoryRegions.erase(std::remove_if(memoryRegions.begin(), memoryRegions.end(), [&](const MemoryRegion& region) { if (region.size >= 0 || !one_of(region.type, MemoryRegion::RegionType::OUTPUT, MemoryRegion::RegionType::IO)) { return false; } bool result = false; - for (auto& edge : edge_clusters[region.id]) { + for (auto& edge : clusters[region.id]) { auto child = edge->getChild(); if (child->getType() == Type::Output && edge->getStatus() == Edge::Status::NeedAllocation) { auto proxyMemBlock = std::make_shared(); - DEBUG_LOG("ProxyMemoryBlock ", proxyMemBlock, " ", this); + DEBUG_LOG("ProxyMemoryBlock ", proxyMemBlock); + // std::cout << "Allocating output edge: " << edge->name() << "\n"; edge->allocate(proxyMemBlock); // Store the output memory blocks. // So that, the infer requests can be able to access them. + // @todo Can we just get them from outputNodesMap instead? int count = 0; - for (auto& output : outputNodesMap) { + for (auto& output : outputNodes) { if (output.second == child) { - outputNodesMemBlocksMap[output.first] = proxyMemBlock; + outputMemBlocks[output.first] = proxyMemBlock; count++; } } @@ -878,97 +1149,85 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { } } return result; - }); + }), memoryRegions.end()); - memoryRegions.erase(it, memoryRegions.end()); + return outputMemBlocks; +} - //Set up the memory control subsystem. 
- this->m_pMemoryControl = &(getGraphContext()->getNetworkMemoryControl()->createMemoryControlUnit(syncNodesInds)); - auto memoryBlocks = m_pMemoryControl->insert(memoryRegions); +/** + * Solve memory reuse + * Ideally only MemorySolution should be returned + * For now we have to additionally return: + * 1) EdgeClusters - to propagate the solution through the graph + * 2) OutputMemoryBlocks - to allow memory sharing between graph and infer request + */ +static std::tuple +SolveMemoryReuse(MemoryControl* memoryControl, + const AllocationContext& allocationContext, + const GraphContext::CPtr graphContext, + const std::map& outputNodesMap) { + const auto& edges = allocationContext.edges; - // attach all the not yet allocated edges to the memory contol - for (auto&& item : memoryBlocks) { - int count = 0; - for (auto&& edge : edge_clusters[item.first]) { - if (edge->getStatus() == Edge::Status::NeedAllocation) { - edge->allocate(item.second); + auto edgeClusters = FormEdgeClusters(edges); - // TODO: WA for some test (like strided_slice_test) which use tensors with - // shapes {0}. And it is implicitly converted into {1} tensor. - // Zeroing of input data allow pass tests. 
- if (edge->getParent()->type == Type::Input && edge->hasDefinedMaxSize()) - edge->getMemoryPtr()->nullify(); + const size_t remainingEdgeClustersCount = AllocateStringsAndConstants(edgeClusters, graphContext); - count++; - } - } - OPENVINO_ASSERT(count == 1); - } + auto memoryRegions = FormMemoryRegions(edgeClusters, + remainingEdgeClustersCount, + allocationContext.execIndex); - m_pMemoryControl->allocateMemory(); + auto outputNodesMemBlocks = FilterOutDynamicOutputEdges(memoryRegions, + edgeClusters, + outputNodesMap); - // Resolve all other edges with status NotAllocated and in-place - for (auto& cluster : edge_clusters) { - for (auto& edge : cluster) { - if (edge->getStatus() != Edge::Status::NotAllocated) { - continue; - } - std::vector edges_to_process; - edges_to_process.push_back(edge); - for (auto next_edge = edge->getSharedEdge(std::nothrow); - next_edge; - next_edge = next_edge->getSharedEdge(std::nothrow)) { - edges_to_process.push_back(next_edge); - } - std::for_each(edges_to_process.rbegin(), edges_to_process.rend(), [](const EdgePtr& edge) { - if (edge->getStatus() == Edge::Status::NotAllocated) { - if (edge->inPlace(Edge::LOOK_DOWN)) { - edge->getChild()->resolveInPlaceEdges(Edge::LOOK_DOWN); - } else if (edge->inPlace(Edge::LOOK_UP)) { - edge->getParent()->resolveInPlaceEdges(Edge::LOOK_UP); - } else { - auto sharedEdge = edge->getSharedEdge(); - auto sharedEdgeParent = sharedEdge->getParent(); - edge->allocate(sharedEdge->getMemoryPtr()->getMemoryBlock()); - DEBUG_LOG(*edge, " sharedEdge with ", *sharedEdge); - } - } - }); - } - } + memoryControl->insert(memoryRegions, allocationContext.syncPoints); + auto memoryBlocks = memoryControl->solve(); + + return std::make_tuple(memoryBlocks, edgeClusters, outputNodesMemBlocks); } -void Graph::Allocate(const std::vector& syncNodesInds) { - OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::Allocate"); +void Graph::Allocate() { + const auto globalAllocation = m_context->memoryReuseGlobal(); 
+ // Set up the memory control subsystem. + auto memoryControl = globalAllocation ? m_context->getMemoryControl() : m_context->getNetworkMemoryControl()->createMemoryControlUnit(); - //resolve inplace dead end nodes - for (const auto& edge : graphEdges) { - if (edge->getStatus() == Edge::Status::Uninitialized) { - if (edge->getParent()->getParentEdges().empty() && - one_of(edge->getParent()->getType(), Type::Input, Type::MemoryInput) && - edge->inPlace(Edge::LOOK_UP)) { - edge->getParent()->resolveInPlaceEdges(Edge::LOOK_UP); - } else if (edge->getChild()->getChildEdges().empty() && - one_of(edge->getChild()->getType(), Type::Output, Type::MemoryOutput) && - edge->inPlace(Edge::LOOK_DOWN)) { - edge->getChild()->resolveInPlaceEdges(Edge::LOOK_DOWN); - } - } + // memory is already allocated globally + if (memoryControl->allocated()) { + return; + } + + auto allocationContext = CreateAllocationContext(globalAllocation); + + for (const auto& entry : allocationContext.execIndex) { + OPENVINO_ASSERT(entry.second.first >= 0); + OPENVINO_ASSERT(entry.second.second >= 0); } - // resolve edges. Define which will be a view on others - // NeedAllocation - real blob - // NotAllocated - view on other blob, peer or in-place - for (auto& edge : graphEdges) edge->init(); + const auto& edges = allocationContext.edges; + InitEdgeStatus(edges); - // Allocate memory space for all edges marked with NeedAllocation - AllocateWithReuse(syncNodesInds); + MemoryControl::MemorySolution solution; + EdgeClusters edgeClusters; + std::tie(solution, edgeClusters, m_outputNodesMemBlocks) = SolveMemoryReuse(memoryControl, allocationContext, m_context, outputNodesMap); - // Check all getters. Should work. 
- for (auto& edge : graphEdges) edge->validate(); + // std::cout << "### Global edges:" << "\n"; + // for (const auto& edge : edges) { + // const auto& parent = edge->getParent(); + // const auto& child = edge->getChild(); + // std::cout << "[" << allocationContext.execIndex[parent].second << " - " << allocationContext.execIndex[child].first << "]" + // << edge->name() + // << "\n"; + // } + + AllocateBaseEdges(edgeClusters, solution); + + memoryControl->allocateMemory(); + + AllocatedReferencingEdges(edgeClusters); + ValidateEdgeStatus(edges); } -bool Graph::ProcessDynNodes() { +bool Graph::ProcessDynNodes() const { OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::ProcessDynNodes"); const bool containsDynamicNodes = std::any_of(graphNodes.begin(), graphNodes.end(), [](const NodePtr& node) { @@ -1395,14 +1654,6 @@ void Graph::Infer(SyncInferRequest* request) { DEBUG_LOG("Infer graph: ", GetName(), ". Status: ", static_cast(status)); const int numaId = GetNumaNodeId(m_context); - if (!m_pMemoryControl) { - OPENVINO_THROW("Memory control unit is not initilized in graph: ", GetName()); - } - - if (!m_pMemoryControl->allocated()) { - m_pMemoryControl->allocateMemory(); - } - switch (status) { case Status::ReadyDynamic: InferDynamic(request, numaId, UpdateNodes(m_executableGraphNodes)); diff --git a/src/plugins/intel_cpu/src/graph.h b/src/plugins/intel_cpu/src/graph.h index d50ccc152c9186..cb4fa8d1ab86fd 100644 --- a/src/plugins/intel_cpu/src/graph.h +++ b/src/plugins/intel_cpu/src/graph.h @@ -4,17 +4,17 @@ #pragma once +#include "allocation_context.hpp" #include "config.h" #include "cpu_memory.h" #include "nodes/input.h" -#include "openvino/core/node_vector.hpp" #include "openvino/runtime/profiling_info.hpp" #include "node.h" #include "edge.h" #include "graph_context.h" -#include "memory_control.hpp" #include "openvino/runtime/profiling_info.hpp" +#include #include #include #include @@ -31,6 +31,8 @@ namespace node { class MemoryStateNode; } // 
namespace node +using OutputMemoryBlocks = std::unordered_map; + class Graph { public: typedef std::shared_ptr Ptr; @@ -65,9 +67,23 @@ class Graph { return m_context->getConfig(); } + /** + * Obsolete way of creating graph + * To enable layout propagation and global memory reuse + * two stage creation should be used instead: + * - Init() + * - Allocate() + */ template void CreateGraph(NET &model, const GraphContext::CPtr context); + /** + * Obsolete way of creating graph + * To enable layout propagation and global memory reuse + * two stage creation should be used instead: + * - Init() + * - Allocate() + */ void CreateGraph(const std::vector &graphNodes, const std::vector &graphEdges, const GraphContext::CPtr context, @@ -75,6 +91,9 @@ class Graph { void PushInputData(const std::size_t& index, const ov::SoPtr& input); void PullOutputData(std::unordered_map>& output); + // @todo pass as part of one of the graph configuration stages + // void SetGlobalExecutionIndex() { + // } // Returns Output nodes memory descriptors VecMemoryDescs getOutputMemoryDescriptors() const; @@ -213,12 +232,39 @@ class Graph { /** * Activate execution graph using \p externalInputMemory and \p externalOutputMemory + * 'globalAllocation' is a temporary flag indicating that the current graph is participating in + * global memory reuse (together with all inner / outer graphs).
+ * The flag should be dropped after all the nodes with inner graphs participate in + * global memory reuse by default */ void Activate(const std::vector& externalInputMemory = {}, - const std::vector& externalOutputMemory = {}); + const std::vector& externalOutputMemory = {}); + + void Allocate(); + + AllocationContext CreateAllocationContext(bool global); + + /** + * Register the graph in the global allocation context by transforming + * local execution data into the global one: + * 1) Local execution indices are transformed into global ones, represented by input and output execution index + * where output execution index is an index of the last node of the inner graph + * 2) Local sync node indices are transformed into global ones using global input execution index + * 3) Local edges are added to the global list of edges + * + * Example graph with subgraphs: + * 0 -> 1 -> 2 -> 3 [0 -> 1 -> 2] -> 4 [0 -> 1] -> 5 + * + * Virtually flatten: + * 0(0) -> 1(1) -> 2(2) -> 3(5) [3 -> 4 -> 5] -> 6(7) [6 -> 7] -> 8 + * + * This is basically an equivalent to the actually flatten graph: + * 0 -> 1 -> 2 -> [3 -> 4 -> 5] -> [6 -> 7] -> 8 + */ + int RegisterToAllocationContext(int offset, AllocationContext& context); const std::unordered_map& getOutputNodesMemBlocksMap() const { - return outputNodesMemBlocksMap; + return m_outputNodesMemBlocks; } protected: @@ -256,10 +302,10 @@ class Graph { void InitOptimalPrimitiveDescriptors(); void ResolveEdgeConflicts(); void ResolveComplexInplaceConflicts(); - bool ProcessDynNodes(); - void Allocate(const std::vector& syncNodesInds); - void AllocateWithReuse(const std::vector& syncNodesInds); + bool ProcessDynNodes() const; + void AllocateWithReuse(const std::vector& syncNodesInds, GlobalExecutionIndex globalExecIndex); void CreatePrimitivesAndExecConstants() const; + std::vector CreateExecutionGraph(); /** * Execute a given \p node within \p request using \p numaId @@ -300,7 +346,7 @@ class Graph { std::map inputNodesMap; std::map 
outputNodesMap; - std::unordered_map outputNodesMemBlocksMap; + OutputMemoryBlocks m_outputNodesMemBlocks; // these node pointers (from graphNodes) are to avoid regular checking for // constantness of nodes in Infer methods and calls of @@ -310,8 +356,6 @@ class Graph { GraphContext::CPtr m_context; dnnl::stream m_stream; - - MemoryControl* m_pMemoryControl = nullptr; }; using GraphPtr = std::shared_ptr; diff --git a/src/plugins/intel_cpu/src/graph_context.cpp b/src/plugins/intel_cpu/src/graph_context.cpp index e200766fa4791c..28b8ab6bf12ca3 100644 --- a/src/plugins/intel_cpu/src/graph_context.cpp +++ b/src/plugins/intel_cpu/src/graph_context.cpp @@ -1,7 +1,6 @@ // Copyright (C) 2018-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // -#include "dnnl_types.h" #include "graph_context.h" #include "nodes/memory.hpp" #include "memory_control.hpp" @@ -12,15 +11,20 @@ namespace intel_cpu { GraphContext::GraphContext(const Config& config, WeightsSharing::Ptr w_cache, bool isGraphQuantized, + MemoryControl* memoryControl, + std::shared_ptr networkMemoryControl, ov::threading::IStreamsExecutor::Ptr streamExecutor, - std::shared_ptr sub_memory_manager) + std::shared_ptr sub_memory_manager, + bool globalMemoryReuse) : config(config), weightsCache(std::move(w_cache)), isGraphQuantizedFlag(isGraphQuantized), streamExecutor(streamExecutor), subMemoryManager(sub_memory_manager), memoryStatesRegister(std::make_shared()), - networkMemoryControl(std::make_shared()) { + memoryControl(memoryControl), + networkMemoryControl(networkMemoryControl), + m_globalMemoryReuse(globalMemoryReuse) { rtParamsCache = std::make_shared(config.rtCacheCapacity); // primitive/executors can be shared across sub-stream // but scratch pad cannot be shared. 
diff --git a/src/plugins/intel_cpu/src/graph_context.h b/src/plugins/intel_cpu/src/graph_context.h index db2b126213978c..02553410a70da4 100644 --- a/src/plugins/intel_cpu/src/graph_context.h +++ b/src/plugins/intel_cpu/src/graph_context.h @@ -18,6 +18,7 @@ namespace node { class MemoryStatesRegister; } // namespace node +class MemoryControl; class NetworkMemoryControl; class GraphContext { @@ -28,8 +29,11 @@ class GraphContext { GraphContext(const Config& config, WeightsSharing::Ptr w_cache, bool isGraphQuantized, + MemoryControl* memoryControl, + std::shared_ptr networkMemoryControl, // obsolete in favor of local memoryControl ov::threading::IStreamsExecutor::Ptr streamExecutor = nullptr, - std::shared_ptr sub_memory_manager = nullptr); + std::shared_ptr sub_memory_manager = nullptr, + bool globalMemoryReuse = true); const Config& getConfig() const { return config; @@ -78,10 +82,24 @@ class GraphContext { return memoryStatesRegister; } + MemoryControl* getMemoryControl() const { + return memoryControl; + } + const std::shared_ptr& getNetworkMemoryControl() const { return networkMemoryControl; } + bool memoryReuseGlobal() const { + return m_globalMemoryReuse; + } + + GraphContext disableMemoryReuse() const { + GraphContext copy = *this; + copy.m_globalMemoryReuse = false; + return copy; + } + private: Config config; // network-level config @@ -103,7 +121,13 @@ class GraphContext { int numNumaNodes = 1; std::shared_ptr memoryStatesRegister; + MemoryControl* memoryControl; + // to be removed in favor of local memoryControl + // currently required for the nodes with inner graphs which + // do not participate in global memory reuse std::shared_ptr networkMemoryControl; + + bool m_globalMemoryReuse = false; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/graph_dumper.cpp b/src/plugins/intel_cpu/src/graph_dumper.cpp index fab6e99dcf2550..6bcc46153cbf79 100644 --- a/src/plugins/intel_cpu/src/graph_dumper.cpp +++
b/src/plugins/intel_cpu/src/graph_dumper.cpp @@ -243,7 +243,6 @@ void serializeToXML(const Graph &graph, const std::string& path) { void serializeToCout(const Graph &graph) { for (const auto& node : graph.GetNodes()) { - std::cout << "name: " << node->getName() << " [ "; auto nodeDesc = node->getSelectedPrimitiveDescriptor(); if (nodeDesc) { auto& inConfs = nodeDesc->getConfig().inConfs; diff --git a/src/plugins/intel_cpu/src/infer_request.cpp b/src/plugins/intel_cpu/src/infer_request.cpp index f0b817dcda859c..2110d837ab7bc9 100644 --- a/src/plugins/intel_cpu/src/infer_request.cpp +++ b/src/plugins/intel_cpu/src/infer_request.cpp @@ -19,6 +19,7 @@ #include "utils/general_utils.h" #include "utils/ngraph_utils.hpp" #include "openvino/runtime/threading/cpu_message.hpp" +#include "memory_control.hpp" using OvString = ov::element_type_traits::value_type; @@ -135,6 +136,15 @@ void SyncInferRequest::infer() { push_input_data(); + MemoryControl* network_memory_control = m_graph->getGraphContext()->getMemoryControl(); + if (!network_memory_control) { + OPENVINO_THROW("Memory control unit is not initialized for graph: ", m_graph->GetName()); + } + + if (!network_memory_control->allocated()) { + network_memory_control->allocateMemory(); + } + m_graph->Infer(this); throw_if_canceled(); diff --git a/src/plugins/intel_cpu/src/memory_control.cpp b/src/plugins/intel_cpu/src/memory_control.cpp index 0f202c296891c1..8f1885daeae8fc 100644 --- a/src/plugins/intel_cpu/src/memory_control.cpp +++ b/src/plugins/intel_cpu/src/memory_control.cpp @@ -4,10 +4,15 @@ #include "memory_control.hpp" +#include +#include #include +#include "edge.h" #include "node.h" #include "openvino/runtime/memory_solver.hpp" +#include "proxy_mem_blk.h" +#include "utils/general_utils.h" namespace ov { namespace intel_cpu { @@ -84,8 +89,8 @@ class MemoryBlockWithRelease : public IMemoryBlockObserver { class IMemoryManager { public: virtual ~IMemoryManager() = default; - virtual void insert(const MemoryRegion& reg) =
0; - virtual const MemoryControl::MemoryBlockMap& lastSolution() = 0; + virtual void insert(const MemoryRegion& reg, const std::vector& syncInds) = 0; + virtual const MemoryControl::MemorySolution& lastSolution() = 0; virtual void allocate() = 0; virtual void release() = 0; }; @@ -99,11 +104,12 @@ std::shared_ptr makeDnnlMemoryBlock(Args&&... args) { class MemoryManagerIO : public IMemoryManager { public: - void insert(const MemoryRegion& reg) override { + void insert(const MemoryRegion& reg, const std::vector& syncInds) override { + (void) syncInds; m_blocks.insert({reg.id, makeDnnlMemoryBlock()}); } - const MemoryControl::MemoryBlockMap& lastSolution() override { + const MemoryControl::MemorySolution& lastSolution() override { return m_blocks; } @@ -115,16 +121,17 @@ class MemoryManagerIO : public IMemoryManager { } private: - MemoryControl::MemoryBlockMap m_blocks; + MemoryControl::MemorySolution m_blocks; }; class MemoryManagerStatic : public IMemoryManager { public: - void insert(const MemoryRegion& reg) override { + void insert(const MemoryRegion& reg, const std::vector& syncInds) override { + (void) syncInds; m_boxes.emplace_back(MemorySolver::Box{reg.start, reg.finish, reg.size, reg.id}); } - const MemoryControl::MemoryBlockMap& lastSolution() override { + const MemoryControl::MemorySolution& lastSolution() override { if (!m_boxes.empty() && m_blocks.empty()) { solve(); } @@ -159,7 +166,7 @@ class MemoryManagerStatic : public IMemoryManager { } private: - MemoryControl::MemoryBlockMap m_blocks; + MemoryControl::MemorySolution m_blocks; std::vector m_boxes; std::shared_ptr m_workspace; size_t m_totalSize = 0; @@ -167,19 +174,18 @@ class MemoryManagerStatic : public IMemoryManager { class MemoryManageNonOverlapingSets : public IMemoryManager { public: - MemoryManageNonOverlapingSets(std::vector syncInds) : m_syncInds(std::move(syncInds)) {} - void insert(const MemoryRegion& reg) override { + void insert(const MemoryRegion& reg, const std::vector& syncInds) 
override { MemorySolver::Box box = {reg.start, reg.finish, reg.size, reg.id}; if (-1 != reg.finish) { //We have to extend the lifespan of tensors that are crossing a sync point border in order to save //the intermediate computation results from possible loss due to the tensor resize auto itr_upper = - std::upper_bound(m_syncInds.begin(), m_syncInds.end(), box.finish, [](int y, int x) { + std::upper_bound(syncInds.begin(), syncInds.end(), box.finish, [](int y, int x) { return y <= x; }); - auto itr_lower = std::lower_bound(m_syncInds.begin(), m_syncInds.end(), box.start); + auto itr_lower = std::lower_bound(syncInds.begin(), syncInds.end(), box.start); if (itr_lower != itr_upper) { // across sections - if (itr_upper == m_syncInds.end()) { + if (itr_upper == syncInds.end()) { box.finish = -1; } else { box.finish = *itr_upper; @@ -189,10 +195,10 @@ class MemoryManageNonOverlapingSets : public IMemoryManager { m_boxes.emplace_back(std::move(box)); } - const MemoryControl::MemoryBlockMap& lastSolution() override { + const MemoryControl::MemorySolution& lastSolution() override { if (!m_boxes.empty() && m_blocks.empty()) { solve(); - m_blocks = MemoryControl::MemoryBlockMap{m_internalBlocks.begin(), m_internalBlocks.end()}; + m_blocks = MemoryControl::MemorySolution{m_internalBlocks.begin(), m_internalBlocks.end()}; } return m_blocks; } @@ -238,11 +244,10 @@ class MemoryManageNonOverlapingSets : public IMemoryManager { } private: - MemoryControl::MemoryBlockMap m_blocks; - std::unordered_map> + MemoryControl::MemorySolution m_blocks; + std::unordered_map> m_internalBlocks; std::vector m_boxes; - std::vector m_syncInds; }; } // namespace @@ -256,16 +261,16 @@ class MemoryControl::RegionHandler { : m_cond(std::move(cond)), m_memManager(std::move(memManager)) {} - bool insert(const MemoryRegion& reg) { + bool insert(const MemoryRegion& reg, const std::vector& syncInds) { if (!m_cond(reg)) { return false; } - m_memManager->insert(reg); + m_memManager->insert(reg, syncInds); 
return true; } - const MemoryControl::MemoryBlockMap& lastSolution() const { + const MemoryControl::MemorySolution& lastSolution() const { return m_memManager->lastSolution(); } @@ -292,9 +297,8 @@ MemoryControl::RegionHandlerPtr buildHandler(F&& f, Args&&... args) { } // namespace -MemoryControl::MemoryControl(std::vector syncInds) { +MemoryControl::MemoryControl() { // init handlers - // handler for dynamic tensors m_handlers.emplace_back(buildHandler([](const MemoryRegion& reg) { if (reg.size < 0 || MemoryRegion::RegionType::VARIABLE != reg.type || @@ -311,7 +315,7 @@ MemoryControl::MemoryControl(std::vector syncInds) { return false; } return true; - }, std::move(syncInds))); + })); //handler for I/O tensors, so far simply individual blocks m_handlers.emplace_back(buildHandler([](const MemoryRegion& reg) { @@ -322,22 +326,24 @@ MemoryControl::MemoryControl(std::vector syncInds) { })); } -void MemoryControl::insert(const MemoryRegion& region) { +void MemoryControl::insert(const MemoryRegion& region, const std::vector& syncInds) { for (auto&& handler : m_handlers) { - if (handler->insert(region)) { + if (handler->insert(region, syncInds)) { return; } } OPENVINO_THROW("No suitable hanlder was found for the given memory region"); } -MemoryControl::MemoryBlockMap MemoryControl::insert(const std::vector& regions) { +void MemoryControl::insert(const std::vector& regions, + const std::vector& syncInds) { for (auto&& region : regions) { - insert(region); + insert(region, syncInds); } +} - MemoryControl::MemoryBlockMap blocksMap; - blocksMap.reserve(regions.size()); +MemoryControl::MemorySolution MemoryControl::solve() { + MemoryControl::MemorySolution blocksMap; for (auto&& handler : m_handlers) { auto&& solution = handler->lastSolution(); @@ -364,52 +370,9 @@ void MemoryControl::releaseMemory() { m_allocated = false; } -edgeClusters MemoryControl::findEdgeClusters(const std::vector& graphEdges) { - typedef std::unordered_map edge_cluster_idx_map_t; - - edgeClusters 
edge_clusters; - edge_cluster_idx_map_t edge_cluster_indices; - - for (auto& edge : graphEdges) { - auto edge_it = edge_cluster_indices.find(edge); - if (edge_it != edge_cluster_indices.end()) - continue; // edge is visited - - size_t cluster_idx = edge_clusters.size(); - EdgePtr last_shared_edge = nullptr; - - // find cluster index - for (auto shared_edge = edge->getSharedEdge(std::nothrow); shared_edge; - shared_edge = shared_edge->getSharedEdge(std::nothrow)) { - auto shared_edge_it = edge_cluster_indices.find(shared_edge); - if (shared_edge_it != edge_cluster_indices.end()) { - cluster_idx = shared_edge_it->second; - last_shared_edge = shared_edge; - break; - } - } - - // add shared edges to cluster - edge_cluster_indices.emplace(edge, cluster_idx); - - if (cluster_idx == edge_clusters.size()) - edge_clusters.emplace_back(edgeCluster{edge}); - else - edge_clusters[cluster_idx].emplace(edge); - - for (auto shared_edge = edge->getSharedEdge(std::nothrow); shared_edge != last_shared_edge; - shared_edge = shared_edge->getSharedEdge(std::nothrow)) { - edge_cluster_indices.emplace(shared_edge, cluster_idx); - edge_clusters[cluster_idx].emplace(shared_edge); - } - } - - return edge_clusters; -} - -MemoryControl& NetworkMemoryControl::createMemoryControlUnit(std::vector syncInds) { - m_controlUnits.emplace_back(std::unique_ptr(new MemoryControl(syncInds))); - return *(m_controlUnits.back()); +MemoryControl* NetworkMemoryControl::createMemoryControlUnit() { + m_controlUnits.emplace_back(std::unique_ptr(new MemoryControl())); + return m_controlUnits.back().get(); } void NetworkMemoryControl::allocateMemory() { @@ -425,4 +388,4 @@ void NetworkMemoryControl::releaseMemory() { } } // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_control.hpp b/src/plugins/intel_cpu/src/memory_control.hpp index ce4dc90890f3fa..175f8b51cb4258 100644 --- a/src/plugins/intel_cpu/src/memory_control.hpp +++ 
b/src/plugins/intel_cpu/src/memory_control.hpp @@ -5,12 +5,15 @@ #pragma once #include "edge.h" +#include "graph.h" +#include "node.h" +#include "proxy_mem_blk.h" namespace ov { namespace intel_cpu { -using edgeCluster = std::unordered_set; -using edgeClusters = std::vector; +using EdgeCluster = std::vector; +using EdgeClusters = std::vector; struct MemoryRegion { int start; // Execution order index of first use. @@ -22,17 +25,20 @@ struct MemoryRegion { enum class AllocType : uint8_t { POD, STRING, UNKNOWN } alloc_type; }; +using MemoryRegions = std::vector; + class MemoryControl { public: class RegionHandler; using RegionHandlerPtr = std::shared_ptr; - using MemoryBlockMap = std::unordered_map; + using MemorySolution = std::unordered_map; public: - static edgeClusters findEdgeClusters(const std::vector& graphEdges); + void insert(const MemoryRegions& regions, + const std::vector& syncInds); - MemoryBlockMap insert(const std::vector& regions); + MemorySolution solve(); bool allocated() const { return m_allocated; @@ -42,13 +48,12 @@ class MemoryControl { void releaseMemory(); private: - explicit MemoryControl(std::vector syncInds); - void insert(const MemoryRegion& region); + explicit MemoryControl(); + void insert(const MemoryRegion& region, const std::vector& syncInds); friend class NetworkMemoryControl; private: - std::vector m_syncInds; std::vector m_handlers; bool m_allocated = false; }; @@ -56,7 +61,8 @@ class MemoryControl { class NetworkMemoryControl { public: NetworkMemoryControl() = default; - MemoryControl& createMemoryControlUnit(std::vector syncInds); + // @todo return std::reference_wrapper instead? 
+ MemoryControl* createMemoryControlUnit(); void allocateMemory(); void releaseMemory(); @@ -69,4 +75,4 @@ class NetworkMemoryControl { }; } // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp index 7c23d55fc4147a..2c592a26bac7c9 100644 --- a/src/plugins/intel_cpu/src/node.cpp +++ b/src/plugins/intel_cpu/src/node.cpp @@ -1152,6 +1152,10 @@ bool Node::isConstant() { return getConstantType() == ConstantType::Const; } +bool Node::isConstantInput() { + return isConstant() && getType() == Type::Input; +} + void Node::updateConstantType() { if (constant == ConstantType::StrictNoConst) return; diff --git a/src/plugins/intel_cpu/src/node.h b/src/plugins/intel_cpu/src/node.h index 948bd6999ce27a..c9d85f2123d6b7 100644 --- a/src/plugins/intel_cpu/src/node.h +++ b/src/plugins/intel_cpu/src/node.h @@ -6,6 +6,7 @@ #include #include +#include "allocation_context.hpp" #include "cpu_memory.h" #include "cpu_shape.h" #include "cpu_types.h" @@ -43,6 +44,7 @@ using NodePtr = std::shared_ptr; using NodeConstPtr = std::shared_ptr; using NodeWeakPtr = std::weak_ptr; + class PortConfigurator { public: PortConfigurator(ov::intel_cpu::LayoutType blockedDescType, ov::element::Type prc, const Shape& shape, @@ -111,6 +113,34 @@ class NodeDesc { executorFactory = factory; } + bool hasZeroInputDims() const { + const auto& inputConfigs = getConfig().inConfs; + + return std::any_of(inputConfigs.begin(), inputConfigs.end(), [](const PortConfig& portConfig) { + return portConfig.hasZeroDims(); + }); + } + + bool hasZeroInputDimsAtPort(size_t portIdx) const { + const auto& inputConfigs = getConfig().inConfs;
+ OPENVINO_ASSERT(portIdx < inputConfigs.size(), "Attempt to get NodeDesc input configuration for port ", portIdx, ". Number of inputs is ", inputConfigs.size()); + return inputConfigs[portIdx].hasZeroDims(); + } + + bool hasZeroOutputDims() const { + const auto& outputConfigs = getConfig().outConfs; + + return std::any_of(outputConfigs.begin(), outputConfigs.end(), [](const PortConfig& portConfig) { + return portConfig.hasZeroDims(); + }); + } + + bool hasZeroOutputDimsAtPort(size_t portIdx) const { + const auto& outputConfigs = getConfig().outConfs; + OPENVINO_ASSERT(portIdx < outputConfigs.size(), "Attempt to get NodeDesc output configuration for port ", portIdx, ". Number of outputs is ", outputConfigs.size()); + return outputConfigs[portIdx].hasZeroDims(); + } + private: NodeConfig config; impl_desc_type implementationType; @@ -265,6 +295,9 @@ class Node { bool isInPlace() const; + virtual bool canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDims(); + } // must be called only after Graph::ResolveEdgeConflicts() virtual bool isExecutable() const { return !hasEmptyInputTensors(); @@ -278,6 +311,7 @@ class Node { ConstantType getConstantType() const; void updateConstantType(); bool isConstant(); + bool isConstantInput(); // return type int supports return -1 in overloading when channel axis doesn't exist virtual int getFusingAxis() const { @@ -483,6 +517,11 @@ class Node { return execIndex; } + virtual int registerToAllocationContext(int offset, AllocationContext& context) { + (void) context; + return offset + 1; + } + const std::string & getTypeStr() const { return typeStr; } diff --git a/src/plugins/intel_cpu/src/nodes/batch_to_space.h b/src/plugins/intel_cpu/src/nodes/batch_to_space.h index 1b583f74bd7905..db4d7985b6e322 100644 --- a/src/plugins/intel_cpu/src/nodes/batch_to_space.h +++ b/src/plugins/intel_cpu/src/nodes/batch_to_space.h @@ -17,6 +17,11 @@ class BatchToSpace : public Node { void getSupportedDescriptors() override {}; void initSupportedPrimitiveDescriptors() override; + bool canBeSkipped() const override { + const auto& spd = getSelectedPrimitiveDescriptor(); +
return spd->hasZeroInputDims() || spd->hasZeroOutputDims(); + } + // output shape can potentially be empty bool isExecutable() const override { return !hasEmptyInputTensors() && !hasEmptyOutputTensors(); diff --git a/src/plugins/intel_cpu/src/nodes/broadcast.cpp b/src/plugins/intel_cpu/src/nodes/broadcast.cpp index ac8dd814ae9961..6d2045e473f9a9 100644 --- a/src/plugins/intel_cpu/src/nodes/broadcast.cpp +++ b/src/plugins/intel_cpu/src/nodes/broadcast.cpp @@ -180,6 +180,10 @@ bool Broadcast::needShapeInfer() const { return false; } +bool Broadcast::canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0); +} + bool Broadcast::isExecutable() const { return !isInputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/broadcast.h b/src/plugins/intel_cpu/src/nodes/broadcast.h index 1435314ee08776..15dd9c70297b6b 100644 --- a/src/plugins/intel_cpu/src/nodes/broadcast.h +++ b/src/plugins/intel_cpu/src/nodes/broadcast.h @@ -24,6 +24,7 @@ class Broadcast : public Node, public TileBroadcastCommon { void executeDynamicImpl(dnnl::stream strm) override; bool created() const override; + bool canBeSkipped() const override; bool isExecutable() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; diff --git a/src/plugins/intel_cpu/src/nodes/bucketize.cpp b/src/plugins/intel_cpu/src/nodes/bucketize.cpp index a71255c0d531e4..205d3afc8491af 100644 --- a/src/plugins/intel_cpu/src/nodes/bucketize.cpp +++ b/src/plugins/intel_cpu/src/nodes/bucketize.cpp @@ -216,6 +216,10 @@ void Bucketize::prepareParams() { std::accumulate(input_tensor_dims.begin(), input_tensor_dims.end(), size_t(1), std::multiplies()); } +bool Bucketize::canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0); +} + bool Bucketize::isExecutable() const { return !isInputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/bucketize.h 
b/src/plugins/intel_cpu/src/nodes/bucketize.h index c834921a38ce54..cd62a43a78eeed 100644 --- a/src/plugins/intel_cpu/src/nodes/bucketize.h +++ b/src/plugins/intel_cpu/src/nodes/bucketize.h @@ -24,6 +24,7 @@ class Bucketize : public Node { void prepareParams() override; + bool canBeSkipped() const override; bool isExecutable() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; diff --git a/src/plugins/intel_cpu/src/nodes/composite.cpp b/src/plugins/intel_cpu/src/nodes/composite.cpp index a1ceabd6942db1..44b79d8e6982ab 100644 --- a/src/plugins/intel_cpu/src/nodes/composite.cpp +++ b/src/plugins/intel_cpu/src/nodes/composite.cpp @@ -4,6 +4,7 @@ #include "composite.h" +#include "compiled_model.h" #include "nodes/input.h" #include "cpu_memory.h" #include "transformations/cpu_opset/common/op/submodel.hpp" @@ -43,16 +44,16 @@ void Composite::selectOptimalPrimitiveDescriptor() { std::vector inConfs; std::vector graphInputConfig; + // @todo should be always inplace after global memory reuse is fully supported by all the nodes + bool isInPlace = context->memoryReuseGlobal(); + for (size_t i = 0; i < getParentEdges().size(); i++) { auto desc = getParentOutputMemDesc(getParentEdgeAt(i)); inConfs.emplace_back(desc); - graphInputConfig.emplace_back(node::Input::InputConfig{desc, true}); + graphInputConfig.emplace_back(node::Input::InputConfig{desc, isInPlace}); } - std::vector graphOutputConfig; - for (size_t i = 0; i < outputShapes.size(); i++) { - graphOutputConfig.emplace_back(node::Input::OutputConfig{true, true}); - } + std::vector graphOutputConfig(outputShapes.size(), node::Input::OutputConfig{true, isInPlace}); // configure the inner graph to get the information about output memory descriptors m_graph.Init(m_body, context, graphInputConfig, graphOutputConfig); @@ -75,23 +76,29 @@ void Composite::selectOptimalPrimitiveDescriptor() { // @todo add ascii diagramm for memory mapping / reuse void 
Composite::createPrimitive() { - OPENVINO_ASSERT(getOriginalInputsNumber() == m_graph.GetInputNodesMap().size(), - "Number of node inputs must be equal the number of inner graph's inputs"); + m_graph.Activate(); +} - std::vector inputMemory; - for (size_t i = 0; i < getOriginalInputsNumber(); i++) { - inputMemory.emplace_back(getSrcMemoryAtPort(i)); +int Composite::registerToAllocationContext(int offset, AllocationContext& context) { + for (size_t i = 0; i < getParentEdges().size(); i++) { + auto parentEdge = getParentEdgeAt(i); + auto inputEdges = m_graph.GetInputNodesMap().at(i)->getChildEdgesAtPort(0); + for (const auto& inputEdge : inputEdges) { + OPENVINO_ASSERT(inputEdge->getStatus() == Edge::Status::Uninitialized, + "Expected Uninitialized state for edge: ", inputEdge->name()); + inputEdge->sharedMemFrom(parentEdge); + } } - OPENVINO_ASSERT(getOriginalOutputsNumber() == m_graph.GetOutputNodesMap().size(), - "Number of node outputs must be equal the number of inner graph's outputs"); - - std::vector outputMemory; - for (size_t i = 0; i < getOriginalOutputsNumber(); i++) { - outputMemory.emplace_back(getDstMemoryAtPort(i)); + for (size_t i = 0; i < getChildEdges().size(); i++) { + auto childEdge = getChildEdgeAt(i); + auto outputEdge = m_graph.GetOutputNodesMap().at(i)->getParentEdgeAt(0); + OPENVINO_ASSERT(outputEdge->getStatus() == Edge::Status::Uninitialized, + "Expected Uninitialized state for edge: ", outputEdge->name()); + outputEdge->sharedMemFrom(childEdge); } - m_graph.Activate(inputMemory, outputMemory); + return m_graph.RegisterToAllocationContext(offset, context); } void Composite::execute(dnnl::stream) { diff --git a/src/plugins/intel_cpu/src/nodes/composite.h b/src/plugins/intel_cpu/src/nodes/composite.h index 9f18a2ba68b769..816aa97b2aa5cc 100644 --- a/src/plugins/intel_cpu/src/nodes/composite.h +++ b/src/plugins/intel_cpu/src/nodes/composite.h @@ -4,7 +4,9 @@ #pragma once +#include #include +#include #include "graph.h" #include "node.h" @@ 
-31,6 +33,10 @@ class Composite : public Node { return false; } + bool canBeSkipped() const override { + return false; + } + bool isExecutable() const override { return true; } @@ -41,6 +47,8 @@ class Composite : public Node { void execute(dnnl::stream) override; void executeDynamicImpl(dnnl::stream strm) override; + int registerToAllocationContext(int offset, AllocationContext& context) override; + const Graph& graph() const { return m_graph; } diff --git a/src/plugins/intel_cpu/src/nodes/concat.cpp b/src/plugins/intel_cpu/src/nodes/concat.cpp index 576361de7e692b..3385d958937e89 100644 --- a/src/plugins/intel_cpu/src/nodes/concat.cpp +++ b/src/plugins/intel_cpu/src/nodes/concat.cpp @@ -29,6 +29,10 @@ namespace { constexpr size_t channelAxis = 1lu; } +bool Concat::canBeSkipped() const { + return isInPlace() || getSelectedPrimitiveDescriptor()->hasZeroOutputDims(); +} + bool Concat::isExecutable() const { return !isInPlace() && !hasEmptyOutputTensors(); } diff --git a/src/plugins/intel_cpu/src/nodes/concat.h b/src/plugins/intel_cpu/src/nodes/concat.h index 9ed331bee4f16d..5dfc4f11fadbb1 100644 --- a/src/plugins/intel_cpu/src/nodes/concat.h +++ b/src/plugins/intel_cpu/src/nodes/concat.h @@ -27,6 +27,7 @@ class Concat : public Node { ov::element::Type getRuntimePrecision() const override; + bool canBeSkipped() const override; bool isExecutable() const override; bool needPrepareParams() const override; void prepareParams() override; diff --git a/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.cpp b/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.cpp index dc69892dabb2e4..e82146ad3a22b0 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.cpp +++ b/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.cpp @@ -140,6 +140,10 @@ void EmbeddingBagOffset::executeDynamicImpl(dnnl::stream strm) { execute(strm); } +bool EmbeddingBagOffset::canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0); +} + bool 
EmbeddingBagOffset::isExecutable() const { return !isInputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.h b/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.h index a31b518e7891a9..000cc86cce25c3 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.h +++ b/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.h @@ -20,6 +20,7 @@ class EmbeddingBagOffset : public Node, public EmbeddingBag { void execute(dnnl::stream strm) override; bool created() const override; + bool canBeSkipped() const override; bool isExecutable() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; diff --git a/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.cpp b/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.cpp index 0b490a28a81487..a9465f909fd37b 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.cpp +++ b/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.cpp @@ -107,6 +107,10 @@ void EmbeddingBagPacked::executeDynamicImpl(dnnl::stream strm) { execute(strm); } +bool EmbeddingBagPacked::canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0); +} + bool EmbeddingBagPacked::isExecutable() const { return !isInputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.h b/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.h index 6a9d33fe3afccb..98f8e117639f82 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.h +++ b/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.h @@ -21,6 +21,7 @@ class EmbeddingBagPacked : public Node, public EmbeddingBag { bool created() const override; bool isExecutable() const override; + bool canBeSkipped() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; protected: diff --git a/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.cpp 
b/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.cpp index 1b2e28e6039543..55159f71b6a6eb 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.cpp +++ b/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.cpp @@ -138,6 +138,10 @@ void EmbeddingSegmentsSum::executeDynamicImpl(dnnl::stream strm) { execute(strm); } +bool EmbeddingSegmentsSum::canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0); +} + bool EmbeddingSegmentsSum::isExecutable() const { return !isInputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.h b/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.h index bb312b4dd47246..60fa504e326767 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.h +++ b/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.h @@ -20,6 +20,7 @@ class EmbeddingSegmentsSum : public Node, public EmbeddingBag { void execute(dnnl::stream strm) override; bool created() const override; + bool canBeSkipped() const override; bool isExecutable() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; diff --git a/src/plugins/intel_cpu/src/nodes/gather.cpp b/src/plugins/intel_cpu/src/nodes/gather.cpp index d2629fe8fe6811..cebf57421982e7 100644 --- a/src/plugins/intel_cpu/src/nodes/gather.cpp +++ b/src/plugins/intel_cpu/src/nodes/gather.cpp @@ -923,6 +923,10 @@ bool Gather::created() const { return getType() == Type::Gather; } +bool Gather::canBeSkipped() const { + return isInPlace() || Node::canBeSkipped(); +} + bool Gather::isExecutable() const { return !isInPlace() && Node::isExecutable(); } diff --git a/src/plugins/intel_cpu/src/nodes/gather.h b/src/plugins/intel_cpu/src/nodes/gather.h index 6ee097e9a1fbab..99a22df010caf5 100644 --- a/src/plugins/intel_cpu/src/nodes/gather.h +++ b/src/plugins/intel_cpu/src/nodes/gather.h @@ -24,6 +24,7 @@ class Gather : public Node { void createPrimitive() override; 
void execute(dnnl::stream strm) override; bool created() const override; + bool canBeSkipped() const override; bool isExecutable() const override; void resolveInPlaceEdges(Edge::LOOK look) override; diff --git a/src/plugins/intel_cpu/src/nodes/if.h b/src/plugins/intel_cpu/src/nodes/if.h index f858c92b0b2651..3c279f028754b8 100644 --- a/src/plugins/intel_cpu/src/nodes/if.h +++ b/src/plugins/intel_cpu/src/nodes/if.h @@ -25,6 +25,7 @@ class If : public Node { void createPrimitive() override; bool created() const override; void execute(dnnl::stream strm) override; + bool canBeSkipped() const override { return false; } bool isExecutable() const override { return true; } protected: diff --git a/src/plugins/intel_cpu/src/nodes/input.cpp b/src/plugins/intel_cpu/src/nodes/input.cpp index 1f650bd8c5de17..2549194e6a3b02 100644 --- a/src/plugins/intel_cpu/src/nodes/input.cpp +++ b/src/plugins/intel_cpu/src/nodes/input.cpp @@ -479,7 +479,8 @@ void Input::selectOptimalPrimitiveDescriptor() { supportedPrimitiveDescriptors.clear(); // and just use parent memory descriptor for Output node to avoid reorders insertion - NodeConfig config({PortConfig(getParentOutputMemDesc(getParentEdgeAt(0)), BlockedMemoryDesc::FULL_MASK, 0)}, {}); + int inPlacePort = m_isInPlace ? 
0 : -1; + NodeConfig config({PortConfig(getParentOutputMemDesc(getParentEdgeAt(0)), BlockedMemoryDesc::FULL_MASK, inPlacePort)}, {}); supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown); selectPrimitiveDescriptorByIndex(0); @@ -543,6 +544,37 @@ void Input::initSupportedPdFromMemDesc() { supportedPrimitiveDescriptors.emplace_back(std::move(config), impl_desc_type::unknown); } +void Input::resolveInPlaceEdges(Edge::LOOK look) { + if (!m_isInPlace) + return Node::resolveInPlaceEdges(look); + + if (look & Edge::LOOK_UP) { + auto edges = getChildEdgesAtPort(0); + for (const auto& edge : edges) { + EdgePtr sharedEdge = edge; + + while (sharedEdge->getSharedEdge(std::nothrow)) { + sharedEdge = sharedEdge->getSharedEdge(std::nothrow); + } + + edge->reuse(sharedEdge->getMemoryPtr()); + } + } + + if (look & Edge::LOOK_DOWN) { + for (size_t i = 0; i < getParentEdges().size(); i++) { + auto edge = getParentEdgeAt(i); + EdgePtr sharedEdge = edge; + + while (sharedEdge->getSharedEdge(std::nothrow)) { + sharedEdge = sharedEdge->getSharedEdge(std::nothrow); + } + + edge->reuse(sharedEdge->getMemoryPtr()); + } + } +} + } // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/input.h b/src/plugins/intel_cpu/src/nodes/input.h index 4d7febb17ad4b7..f190a99d2a7530 100644 --- a/src/plugins/intel_cpu/src/nodes/input.h +++ b/src/plugins/intel_cpu/src/nodes/input.h @@ -56,15 +56,16 @@ class Input : public Node { void selectOptimalPrimitiveDescriptor() override; void createPrimitive() override; bool created() const override; + void resolveInPlaceEdges(Edge::LOOK look) override; void withMeanImage(); MemoryCPtr getMemoryPtr() const; void execute(dnnl::stream strm) override {} void executeDynamicImpl(dnnl::stream strm) override {} - bool isExecutable() const override { - return false; - } + + bool canBeSkipped() const override { return true; } + bool isExecutable() const override { return false; } bool needShapeInfer() 
const override { return false; } bool needPrepareParams() const override { return false; } diff --git a/src/plugins/intel_cpu/src/nodes/interaction.cpp b/src/plugins/intel_cpu/src/nodes/interaction.cpp index 6f604f4a9e278a..5a49675a77e2e1 100644 --- a/src/plugins/intel_cpu/src/nodes/interaction.cpp +++ b/src/plugins/intel_cpu/src/nodes/interaction.cpp @@ -356,6 +356,10 @@ void Interaction::executeDynamicImpl(dnnl::stream strm) { execute(strm); } +bool Interaction::canBeSkipped() const { + return false; +} + bool Interaction::isExecutable() const { return true; } diff --git a/src/plugins/intel_cpu/src/nodes/interaction.h b/src/plugins/intel_cpu/src/nodes/interaction.h index 448484a2512dd1..978f9785f8ee81 100644 --- a/src/plugins/intel_cpu/src/nodes/interaction.h +++ b/src/plugins/intel_cpu/src/nodes/interaction.h @@ -50,6 +50,7 @@ class Interaction : public Node { static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + bool canBeSkipped() const override; bool isExecutable() const override; void executeDynamicImpl(dnnl::stream strm) override; void prepareParams() override; diff --git a/src/plugins/intel_cpu/src/nodes/lora.cpp b/src/plugins/intel_cpu/src/nodes/lora.cpp index 52509cdfc44a13..13e3c533fb2215 100644 --- a/src/plugins/intel_cpu/src/nodes/lora.cpp +++ b/src/plugins/intel_cpu/src/nodes/lora.cpp @@ -51,17 +51,19 @@ void LoRA::selectOptimalPrimitiveDescriptor() { auto mainInputPrc = mainInputDesc->getPrecision(); // we have to align precision across all the inputs inConfs.emplace_back(mainInputDesc); - graphInputConfig.emplace_back(node::Input::InputConfig{mainInputDesc, true}); + // @todo should be always inplace after global memory reuse is fully supported by all the nodes + bool isInPlace = context->memoryReuseGlobal(); + graphInputConfig.emplace_back(node::Input::InputConfig{mainInputDesc, isInPlace}); for (size_t i = 1; i < getParentEdges().size(); i++) { auto desc = 
getParentOutputMemDesc(getParentEdgeAt(i))->cloneWithNewPrecision(mainInputPrc); inConfs.emplace_back(desc); - graphInputConfig.emplace_back(node::Input::InputConfig{desc, true}); + graphInputConfig.emplace_back(node::Input::InputConfig{desc, isInPlace}); } std::vector graphOutputConfig; // enforce the same memory descriptor on the output as on the input to allow inPlace memory - graphOutputConfig.emplace_back(node::Input::OutputConfig{inConfs.front().getMemDesc(), true}); + graphOutputConfig.emplace_back(node::Input::OutputConfig{inConfs.front().getMemDesc(), isInPlace}); // configure the inner graph to get the information about output memory descriptors m_graph.Init(m_body, context, graphInputConfig, graphOutputConfig); @@ -86,24 +88,40 @@ void LoRA::selectOptimalPrimitiveDescriptor() { selectPrimitiveDescriptorByIndex(0); } +int LoRA::registerToAllocationContext(int offset, AllocationContext& context) { + for (size_t i = 0; i < getOriginalInputsNumber(); i++) { + auto parentEdge = getParentEdgeAt(i); + auto inputEdges = m_graph.GetInputNodesMap().at(i)->getChildEdgesAtPort(0); + for (const auto& inputEdge : inputEdges) { + OPENVINO_ASSERT(inputEdge->getStatus() == Edge::Status::Uninitialized, + "Expected Uninitialized Edge instead of: ", static_cast(inputEdge->getStatus())); + inputEdge->sharedMemFrom(parentEdge); + } + } + + for (size_t i = 0; i < getOriginalOutputsNumber(); i++) { + auto childEdge = getChildEdgeAt(i); + auto outputEdge = m_graph.GetOutputNodesMap().at(i)->getParentEdgeAt(0); + outputEdge->sharedMemFrom(childEdge); + } + + return m_graph.RegisterToAllocationContext(offset, context); +} + // @todo add ascii diagram for memory mapping / reuse void LoRA::createPrimitive() { CPU_NODE_ASSERT(getOriginalInputsNumber() == m_graph.GetInputNodesMap().size(), "Number of node inputs must be equal the number of inner graph's inputs"); - - std::vector inputMemory; + // Workaround to avoid making LoRa node always executable (isExecutable()) true + // This 
way we update subgraph's input memory without performing an actual Infer() call for (size_t i = 0; i < getOriginalInputsNumber(); i++) { - auto srcEdgeMem = getSrcMemoryAtPort(i); - auto mem = std::make_shared(getEngine(), srcEdgeMem->getDescPtr(), srcEdgeMem->getMemoryBlock()); + const auto& subgraphInputNode = m_graph.GetInputNodesMap().at(i); + const auto& subgraphInputMemory = subgraphInputNode->getDstMemoryAtPort(0); + auto mem = std::make_shared(getEngine(), subgraphInputMemory->getDescPtr(), subgraphInputMemory->getMemoryBlock()); subgraphMemoryPtrs.push_back(mem); - inputMemory.emplace_back(std::move(mem)); } - CPU_NODE_ASSERT(getOriginalOutputsNumber() == m_graph.GetOutputNodesMap().size(), - "Number of node outputs must be equal the number of inner graph's outputs"); - - std::vector outputMemory{getDstMemoryAtPort(0)}; - m_graph.Activate(inputMemory, outputMemory); + m_graph.Activate(); } void LoRA::execute(dnnl::stream) { diff --git a/src/plugins/intel_cpu/src/nodes/lora.h b/src/plugins/intel_cpu/src/nodes/lora.h index 27701daf9034f2..3fd7892040fc14 100644 --- a/src/plugins/intel_cpu/src/nodes/lora.h +++ b/src/plugins/intel_cpu/src/nodes/lora.h @@ -23,6 +23,7 @@ class LoRA : public Node { void getSupportedDescriptors() override{}; void selectOptimalPrimitiveDescriptor() override; + int registerToAllocationContext(int offset, AllocationContext& context) override; void createPrimitive() override; void prepareParams() override; void execute(dnnl::stream) override; diff --git a/src/plugins/intel_cpu/src/nodes/matmul.cpp b/src/plugins/intel_cpu/src/nodes/matmul.cpp index 92d8f356728ed9..088844b0b8575c 100644 --- a/src/plugins/intel_cpu/src/nodes/matmul.cpp +++ b/src/plugins/intel_cpu/src/nodes/matmul.cpp @@ -708,6 +708,10 @@ const std::vector& MatMul::getDefaultImplPriority() { return priorities; } +bool MatMul::canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroOutputDims(); +} + bool MatMul::isExecutable() const { return 
!hasEmptyOutputTensors(); } diff --git a/src/plugins/intel_cpu/src/nodes/matmul.h b/src/plugins/intel_cpu/src/nodes/matmul.h index 2e487148d0ec0c..eccfc435ee55bc 100644 --- a/src/plugins/intel_cpu/src/nodes/matmul.h +++ b/src/plugins/intel_cpu/src/nodes/matmul.h @@ -43,6 +43,7 @@ class MatMul : public Node { const std::vector& getDefaultImplPriority() override; bool canBeExecutedInInt8() const override; + bool canBeSkipped() const override; bool isExecutable() const override; protected: diff --git a/src/plugins/intel_cpu/src/nodes/matrix_nms.cpp b/src/plugins/intel_cpu/src/nodes/matrix_nms.cpp index 354ac92f9272cb..8bc68ad2528e1d 100644 --- a/src/plugins/intel_cpu/src/nodes/matrix_nms.cpp +++ b/src/plugins/intel_cpu/src/nodes/matrix_nms.cpp @@ -287,6 +287,10 @@ void MatrixNms::prepareParams() { } } +bool MatrixNms::canBeSkipped() const { + return !isDynamicNode() && Node::canBeSkipped(); +} + bool MatrixNms::isExecutable() const { return isDynamicNode() || Node::isExecutable(); } diff --git a/src/plugins/intel_cpu/src/nodes/matrix_nms.h b/src/plugins/intel_cpu/src/nodes/matrix_nms.h index 6afa9c09c751c1..853bcfeaf07be8 100644 --- a/src/plugins/intel_cpu/src/nodes/matrix_nms.h +++ b/src/plugins/intel_cpu/src/nodes/matrix_nms.h @@ -29,6 +29,7 @@ class MatrixNms : public Node { static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + bool canBeSkipped() const override; bool isExecutable() const override; void executeDynamicImpl(dnnl::stream strm) override; diff --git a/src/plugins/intel_cpu/src/nodes/memory.cpp b/src/plugins/intel_cpu/src/nodes/memory.cpp index 565597bdcc2a9e..7d355181207edc 100644 --- a/src/plugins/intel_cpu/src/nodes/memory.cpp +++ b/src/plugins/intel_cpu/src/nodes/memory.cpp @@ -220,6 +220,10 @@ void MemoryOutputBase::assignState(MemStatePtr newState) { assignExtMemory(state->output_mem(), state->internal_desc()); } +bool MemoryOutputBase::canBeSkipped() const { + return false; +} + bool 
MemoryOutputBase::isExecutable() const { return true; } @@ -471,6 +475,10 @@ void MemoryInputBase::deregisterSibling(MemoryOutputBase* node) { if (node == outputNode) { outputNode = nullptr; } } +bool MemoryInputBase::canBeSkipped() const { + return false; +} + bool MemoryInputBase::isExecutable() const { return true; } diff --git a/src/plugins/intel_cpu/src/nodes/memory.hpp b/src/plugins/intel_cpu/src/nodes/memory.hpp index f503a8d58386a5..cedc2aaa394dde 100644 --- a/src/plugins/intel_cpu/src/nodes/memory.hpp +++ b/src/plugins/intel_cpu/src/nodes/memory.hpp @@ -64,6 +64,8 @@ class MemoryOutputBase : public Node, public MemoryNode { void execute(dnnl::stream strm) override final; // NOLINT void executeDynamicImpl(dnnl::stream strm) override final; // NOLINT + + bool canBeSkipped() const override final; // NOLINT bool isExecutable() const override final; // NOLINT void registerInputNode(MemoryInputBase* node); @@ -142,6 +144,7 @@ class MemoryInputBase : public Input, public MemoryStateNode { void executeDynamicImpl(dnnl::stream strm) override final; // NOLINT bool needShapeInfer() const override { return false; } bool needPrepareParams() const override { return false; } + bool canBeSkipped() const override final; // NOLINT bool isExecutable() const override final; // NOLINT void registerOutputNode(MemoryOutputBase* node); diff --git a/src/plugins/intel_cpu/src/nodes/multiclass_nms.cpp b/src/plugins/intel_cpu/src/nodes/multiclass_nms.cpp index e87fd69fd9c004..0470d9b32c60ca 100644 --- a/src/plugins/intel_cpu/src/nodes/multiclass_nms.cpp +++ b/src/plugins/intel_cpu/src/nodes/multiclass_nms.cpp @@ -211,6 +211,10 @@ void MultiClassNms::prepareParams() { m_numBoxOffset.resize(m_numBatches); } +bool MultiClassNms::canBeSkipped() const { + return !isDynamicNode() && Node::canBeSkipped(); +} + bool MultiClassNms::isExecutable() const { return isDynamicNode() || Node::isExecutable(); } diff --git a/src/plugins/intel_cpu/src/nodes/multiclass_nms.hpp 
b/src/plugins/intel_cpu/src/nodes/multiclass_nms.hpp index ea5d166351efb6..a6ade302908124 100644 --- a/src/plugins/intel_cpu/src/nodes/multiclass_nms.hpp +++ b/src/plugins/intel_cpu/src/nodes/multiclass_nms.hpp @@ -27,6 +27,7 @@ class MultiClassNms : public Node { static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + bool canBeSkipped() const override; bool isExecutable() const override; void executeDynamicImpl(dnnl::stream strm) override; diff --git a/src/plugins/intel_cpu/src/nodes/multinomial.cpp b/src/plugins/intel_cpu/src/nodes/multinomial.cpp index 24958b4e2b980d..e3da27b83f34e1 100644 --- a/src/plugins/intel_cpu/src/nodes/multinomial.cpp +++ b/src/plugins/intel_cpu/src/nodes/multinomial.cpp @@ -117,6 +117,11 @@ void Multinomial::prepareParams() { m_batches_samples_probs_count = m_output_elements_count * m_probs_count; } +bool Multinomial::canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(PROBS_PORT) || + getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(NUM_SAMPLES_PORT); +} + bool Multinomial::isExecutable() const { return !isInputTensorAtPortEmpty(PROBS_PORT) && !isInputTensorAtPortEmpty(NUM_SAMPLES_PORT); } diff --git a/src/plugins/intel_cpu/src/nodes/multinomial.hpp b/src/plugins/intel_cpu/src/nodes/multinomial.hpp index 611b70503f5dba..633671930f07ff 100644 --- a/src/plugins/intel_cpu/src/nodes/multinomial.hpp +++ b/src/plugins/intel_cpu/src/nodes/multinomial.hpp @@ -30,6 +30,7 @@ class Multinomial : public Node { void createPrimitive() override; + bool canBeSkipped() const override; bool isExecutable() const override; void execute(dnnl::stream strm) override; void executeDynamicImpl(dnnl::stream strm) override; diff --git a/src/plugins/intel_cpu/src/nodes/node_config.h b/src/plugins/intel_cpu/src/nodes/node_config.h index d814f0ee65df37..5e540ee685f51f 100644 --- a/src/plugins/intel_cpu/src/nodes/node_config.h +++ b/src/plugins/intel_cpu/src/nodes/node_config.h 
@@ -138,6 +138,10 @@ class PortConfig { _desc = createPortDesc(desc, cmpMask); } + bool hasZeroDims() const { + return getMemDesc()->getShape().hasZeroDims(); + } + private: PortDescBasePtr createPortDesc(MemoryDescPtr desc, BlockedMemoryDesc::CmpMask cmpMask) { if (desc->getType() & Blocked) diff --git a/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp b/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp index 944318a3c24ed6..61ab802ccbe4b4 100644 --- a/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp +++ b/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp @@ -900,6 +900,10 @@ void NonMaxSuppression::checkOutput(const Shape& shape, const std::string& name, THROW_CPU_NODE_ERR("has unsupported '", name, "' output 2nd dimension size: ", dim2str(shape.getDims()[1])); } +bool NonMaxSuppression::canBeSkipped() const { + return !isDynamicNode() && Node::canBeSkipped(); +} + bool NonMaxSuppression::isExecutable() const { return isDynamicNode() || Node::isExecutable(); } diff --git a/src/plugins/intel_cpu/src/nodes/non_max_suppression.h b/src/plugins/intel_cpu/src/nodes/non_max_suppression.h index 025c46f5799a3e..71b187b826aef7 100644 --- a/src/plugins/intel_cpu/src/nodes/non_max_suppression.h +++ b/src/plugins/intel_cpu/src/nodes/non_max_suppression.h @@ -50,6 +50,7 @@ class NonMaxSuppression : public Node { int suppress_begin_index; }; + bool canBeSkipped() const override; bool isExecutable() const override; bool needShapeInfer() const override { return false; } diff --git a/src/plugins/intel_cpu/src/nodes/non_zero.h b/src/plugins/intel_cpu/src/nodes/non_zero.h index 515ff965055bea..fd553610b712df 100644 --- a/src/plugins/intel_cpu/src/nodes/non_zero.h +++ b/src/plugins/intel_cpu/src/nodes/non_zero.h @@ -29,6 +29,7 @@ class NonZero : public Node { void executeDynamicImpl(dnnl::stream strm) override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + bool canBeSkipped() const override { return false; 
} bool isExecutable() const override { return true; } private: diff --git a/src/plugins/intel_cpu/src/nodes/normalize.cpp b/src/plugins/intel_cpu/src/nodes/normalize.cpp index ca52e572b73ea8..0337f462a99f70 100644 --- a/src/plugins/intel_cpu/src/nodes/normalize.cpp +++ b/src/plugins/intel_cpu/src/nodes/normalize.cpp @@ -912,6 +912,10 @@ void NormalizeL2::createPrimitive() { } } +bool NormalizeL2::canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0); +} + bool NormalizeL2::isExecutable() const { return !isInputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/normalize.h b/src/plugins/intel_cpu/src/nodes/normalize.h index a05925a15deb71..ce2bf6607b287a 100644 --- a/src/plugins/intel_cpu/src/nodes/normalize.h +++ b/src/plugins/intel_cpu/src/nodes/normalize.h @@ -94,6 +94,7 @@ class NormalizeL2 : public Node { void prepareParams() override; void executeDynamicImpl(dnnl::stream strm) override; + bool canBeSkipped() const override; bool isExecutable() const override; enum class NormEpsMode { diff --git a/src/plugins/intel_cpu/src/nodes/pad.cpp b/src/plugins/intel_cpu/src/nodes/pad.cpp index 10cdb2a19b771f..b96fd7f36d160b 100644 --- a/src/plugins/intel_cpu/src/nodes/pad.cpp +++ b/src/plugins/intel_cpu/src/nodes/pad.cpp @@ -201,6 +201,10 @@ void Pad::createPrimitive() { } } +bool Pad::canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroOutputDimsAtPort(0); +} + bool Pad::isExecutable() const { return !isOutputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/pad.h b/src/plugins/intel_cpu/src/nodes/pad.h index f2fcd9cc3c20a9..8493558bf52650 100644 --- a/src/plugins/intel_cpu/src/nodes/pad.h +++ b/src/plugins/intel_cpu/src/nodes/pad.h @@ -23,6 +23,7 @@ class Pad : public Node { void prepareParams() override; bool needShapeInfer() const override; + bool canBeSkipped() const override; bool isExecutable() const override; bool needPrepareParams() const override; diff --git 
a/src/plugins/intel_cpu/src/nodes/paged_attn.h b/src/plugins/intel_cpu/src/nodes/paged_attn.h index adc0f1b634c1b2..1cc698c5e8d63a 100644 --- a/src/plugins/intel_cpu/src/nodes/paged_attn.h +++ b/src/plugins/intel_cpu/src/nodes/paged_attn.h @@ -22,10 +22,19 @@ class PagedAttention : public Node { bool created() const override { return getType() == Type::PagedAttention; } + + // pastkv may have zero dimension + bool canBeSkipped() const override { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0) || + getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(1) || + getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(2); + } + // pastkv may have zero dimension bool isExecutable() const override { return !isInputTensorAtPortEmpty(0) && !isInputTensorAtPortEmpty(1) && !isInputTensorAtPortEmpty(2); } + bool needPrepareParams() const override { return false; } diff --git a/src/plugins/intel_cpu/src/nodes/random_uniform.cpp b/src/plugins/intel_cpu/src/nodes/random_uniform.cpp index 808ad10c440854..e1725d1e60020c 100644 --- a/src/plugins/intel_cpu/src/nodes/random_uniform.cpp +++ b/src/plugins/intel_cpu/src/nodes/random_uniform.cpp @@ -520,6 +520,10 @@ bool RandomUniform::needShapeInfer() const { return !m_const_inputs[SHAPE]; } +bool RandomUniform::canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(SHAPE); +} + bool RandomUniform::isExecutable() const { return !isInputTensorAtPortEmpty(SHAPE); } diff --git a/src/plugins/intel_cpu/src/nodes/random_uniform.hpp b/src/plugins/intel_cpu/src/nodes/random_uniform.hpp index 237480cd06a667..cf92cc4810dab6 100644 --- a/src/plugins/intel_cpu/src/nodes/random_uniform.hpp +++ b/src/plugins/intel_cpu/src/nodes/random_uniform.hpp @@ -39,6 +39,7 @@ class RandomUniform : public Node { void executeDynamicImpl(dnnl::stream strm) override; + bool canBeSkipped() const override; bool isExecutable() const override; void createPrimitive() override; diff --git 
a/src/plugins/intel_cpu/src/nodes/reduce.cpp b/src/plugins/intel_cpu/src/nodes/reduce.cpp index 6cfc94a02b9f3b..2ef2b4e3f0ab0c 100644 --- a/src/plugins/intel_cpu/src/nodes/reduce.cpp +++ b/src/plugins/intel_cpu/src/nodes/reduce.cpp @@ -2088,6 +2088,10 @@ void Reduce::initSupportedPrimitiveDescriptors() { } } +bool Reduce::canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(REDUCE_DATA); +} + bool Reduce::isExecutable() const { return !isInputTensorAtPortEmpty(REDUCE_DATA); } diff --git a/src/plugins/intel_cpu/src/nodes/reduce.h b/src/plugins/intel_cpu/src/nodes/reduce.h index 2464686edb1ee4..b9e274a1cb5cd9 100644 --- a/src/plugins/intel_cpu/src/nodes/reduce.h +++ b/src/plugins/intel_cpu/src/nodes/reduce.h @@ -102,6 +102,7 @@ class Reduce : public Node { return false; } + bool canBeSkipped() const override; bool isExecutable() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; @@ -195,4 +196,4 @@ class Reduce : public Node { } // namespace node } // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/reference.h b/src/plugins/intel_cpu/src/nodes/reference.h index 25a285a4e72709..e025cae9e9da93 100644 --- a/src/plugins/intel_cpu/src/nodes/reference.h +++ b/src/plugins/intel_cpu/src/nodes/reference.h @@ -22,6 +22,7 @@ class Reference : public Node { bool needShapeInfer() const override; bool needPrepareParams() const override { return false; } + bool canBeSkipped() const override { return false; } bool isExecutable() const override { return true; } void executeDynamicImpl(dnnl::stream strm) override; diff --git a/src/plugins/intel_cpu/src/nodes/reorder.cpp b/src/plugins/intel_cpu/src/nodes/reorder.cpp index 9b521cdb3b57c7..44b1e4547c8eda 100644 --- a/src/plugins/intel_cpu/src/nodes/reorder.cpp +++ b/src/plugins/intel_cpu/src/nodes/reorder.cpp @@ -32,8 +32,12 @@ namespace ov { namespace 
intel_cpu { namespace node { +bool Reorder::canBeSkipped() const { + return isOptimized || Node::canBeSkipped(); +} + bool Reorder::isExecutable() const { - return Node::isExecutable() && !isOptimized; + return !isOptimized && Node::isExecutable(); } Reorder::Reorder(const std::shared_ptr& op, const GraphContext::CPtr context) : diff --git a/src/plugins/intel_cpu/src/nodes/reorder.h b/src/plugins/intel_cpu/src/nodes/reorder.h index ab94b60b6a4a18..380668ac0ee5de 100644 --- a/src/plugins/intel_cpu/src/nodes/reorder.h +++ b/src/plugins/intel_cpu/src/nodes/reorder.h @@ -23,6 +23,7 @@ class Reorder : public Node { bool created() const override; const std::vector& getDefaultImplPriority() override; + bool canBeSkipped() const override; bool isExecutable() const override; void createPrimitive() override; diff --git a/src/plugins/intel_cpu/src/nodes/reshape.cpp b/src/plugins/intel_cpu/src/nodes/reshape.cpp index 6e3dea09db2a2f..e10e377e75f3dd 100644 --- a/src/plugins/intel_cpu/src/nodes/reshape.cpp +++ b/src/plugins/intel_cpu/src/nodes/reshape.cpp @@ -138,7 +138,7 @@ void Reshape::execute(dnnl::stream strm) { } } -bool Reshape::isExecutable() const { +bool Reshape::canBeSkipped() const { bool inPlaceEnabled = false; if (auto prim_desc = getSelectedPrimitiveDescriptor()) { auto& config = prim_desc->getConfig(); @@ -147,7 +147,11 @@ bool Reshape::isExecutable() const { inPlaceEnabled = true; } } - return !inPlaceEnabled; + return inPlaceEnabled; +} + +bool Reshape::isExecutable() const { + return !canBeSkipped(); } bool Reshape::created() const { diff --git a/src/plugins/intel_cpu/src/nodes/reshape.h b/src/plugins/intel_cpu/src/nodes/reshape.h index 887fc6f739bd80..3b8b9100048840 100644 --- a/src/plugins/intel_cpu/src/nodes/reshape.h +++ b/src/plugins/intel_cpu/src/nodes/reshape.h @@ -18,6 +18,7 @@ class Reshape : public Node { void getSupportedDescriptors() override; void initSupportedPrimitiveDescriptors() override; bool created() const override; + bool canBeSkipped() 
const override; bool isExecutable() const override; bool needShapeInfer() const override; diff --git a/src/plugins/intel_cpu/src/nodes/scaled_attn.h b/src/plugins/intel_cpu/src/nodes/scaled_attn.h index bbf12727478e43..065fc77dbe8481 100644 --- a/src/plugins/intel_cpu/src/nodes/scaled_attn.h +++ b/src/plugins/intel_cpu/src/nodes/scaled_attn.h @@ -21,6 +21,12 @@ class ScaledDotProductAttention : public Node { bool created() const override { return getType() == Type::ScaledDotProductAttention; } + + bool canBeSkipped() const override { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0) || + getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(1) || + getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(2); + } // pastkv may have zero dimension bool isExecutable() const override { return !isInputTensorAtPortEmpty(0) && !isInputTensorAtPortEmpty(1) && !isInputTensorAtPortEmpty(2); diff --git a/src/plugins/intel_cpu/src/nodes/scatter_update.cpp b/src/plugins/intel_cpu/src/nodes/scatter_update.cpp index 76c9acd218d9d1..3fa90e92f7b066 100644 --- a/src/plugins/intel_cpu/src/nodes/scatter_update.cpp +++ b/src/plugins/intel_cpu/src/nodes/scatter_update.cpp @@ -53,6 +53,10 @@ bool ScatterUpdate::isSupportedOperation(const std::shared_ptr& return true; } +bool ScatterUpdate::canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(DATA_ID); +} + bool ScatterUpdate::isExecutable() const { return !isInputTensorAtPortEmpty(DATA_ID); } diff --git a/src/plugins/intel_cpu/src/nodes/scatter_update.h b/src/plugins/intel_cpu/src/nodes/scatter_update.h index 87604efe745332..897485c76a9426 100644 --- a/src/plugins/intel_cpu/src/nodes/scatter_update.h +++ b/src/plugins/intel_cpu/src/nodes/scatter_update.h @@ -92,6 +92,7 @@ class ScatterUpdate : public Node { bool needPrepareParams() const override; void executeDynamicImpl(dnnl::stream strm) override; + bool canBeSkipped() const override; bool isExecutable() const override; static 
bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; diff --git a/src/plugins/intel_cpu/src/nodes/shapeof.cpp b/src/plugins/intel_cpu/src/nodes/shapeof.cpp index af5df8e2878b18..472f2e5ceaa34c 100644 --- a/src/plugins/intel_cpu/src/nodes/shapeof.cpp +++ b/src/plugins/intel_cpu/src/nodes/shapeof.cpp @@ -79,10 +79,6 @@ void ShapeOf::initOptimalPrimitiveDescriptor() { selected_pd->setConfig(config); } -bool ShapeOf::isExecutable() const { - return true; -} - void ShapeOf::execute(dnnl::stream strm) { auto inPtr = getSrcMemoryAtPort(0); auto outPtr = getDstMemoryAtPort(0); diff --git a/src/plugins/intel_cpu/src/nodes/shapeof.h b/src/plugins/intel_cpu/src/nodes/shapeof.h index fbdb689ed08cec..d6c0da93dfaa5c 100644 --- a/src/plugins/intel_cpu/src/nodes/shapeof.h +++ b/src/plugins/intel_cpu/src/nodes/shapeof.h @@ -23,10 +23,11 @@ class ShapeOf : public Node { void initOptimalPrimitiveDescriptor() override; void execute(dnnl::stream strm) override; bool created() const override; - bool needPrepareParams() const override {return false;}; + bool needPrepareParams() const override { return false; } void executeDynamicImpl(dnnl::stream strm) override { execute(strm); } - bool isExecutable() const override; + bool canBeSkipped() const override { return false; }; + bool isExecutable() const override { return true; } static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; diff --git a/src/plugins/intel_cpu/src/nodes/split.cpp b/src/plugins/intel_cpu/src/nodes/split.cpp index 72af54e619dbf3..19668241b76e79 100644 --- a/src/plugins/intel_cpu/src/nodes/split.cpp +++ b/src/plugins/intel_cpu/src/nodes/split.cpp @@ -276,6 +276,10 @@ void Split::prepareParams() { } } +bool Split::canBeSkipped() const { + return isInPlace() || getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0); +} + bool Split::isExecutable() const { return !isInPlace() && !isInputTensorAtPortEmpty(0); } diff --git 
a/src/plugins/intel_cpu/src/nodes/split.h b/src/plugins/intel_cpu/src/nodes/split.h index 0782594bcf9989..1a95f9817ab3e8 100644 --- a/src/plugins/intel_cpu/src/nodes/split.h +++ b/src/plugins/intel_cpu/src/nodes/split.h @@ -23,6 +23,7 @@ class Split : public Node { void initOptimalPrimitiveDescriptor() override; + bool canBeSkipped() const override; bool isExecutable() const override; bool needPrepareParams() const override; diff --git a/src/plugins/intel_cpu/src/nodes/strided_slice.cpp b/src/plugins/intel_cpu/src/nodes/strided_slice.cpp index 13671c22d102ae..fe050ef7ea3586 100644 --- a/src/plugins/intel_cpu/src/nodes/strided_slice.cpp +++ b/src/plugins/intel_cpu/src/nodes/strided_slice.cpp @@ -287,6 +287,11 @@ void StridedSlice::initSupportedPrimitiveDescriptors() { } } +bool StridedSlice::canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0) || + getSelectedPrimitiveDescriptor()->hasZeroOutputDimsAtPort(0); +} + bool StridedSlice::isExecutable() const { return !isInputTensorAtPortEmpty(0) && !isOutputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/strided_slice.h b/src/plugins/intel_cpu/src/nodes/strided_slice.h index bf698643271d7a..9f1cff78ab2b93 100644 --- a/src/plugins/intel_cpu/src/nodes/strided_slice.h +++ b/src/plugins/intel_cpu/src/nodes/strided_slice.h @@ -26,6 +26,7 @@ class StridedSlice : public Node { return false; } + bool canBeSkipped() const override; bool isExecutable() const override; bool needShapeInfer() const override; diff --git a/src/plugins/intel_cpu/src/nodes/tensoriterator.h b/src/plugins/intel_cpu/src/nodes/tensoriterator.h index f8a8110c3fae48..41c086288f0cdb 100644 --- a/src/plugins/intel_cpu/src/nodes/tensoriterator.h +++ b/src/plugins/intel_cpu/src/nodes/tensoriterator.h @@ -111,6 +111,7 @@ class TensorIterator : public Node { void createPrimitive() override; bool created() const override; void execute(dnnl::stream strm) override; + bool canBeSkipped() const override { return 
false; } bool isExecutable() const override { return true; } protected: diff --git a/src/plugins/intel_cpu/src/nodes/transpose.cpp b/src/plugins/intel_cpu/src/nodes/transpose.cpp index 38712e04c50719..2674aa85fa723f 100644 --- a/src/plugins/intel_cpu/src/nodes/transpose.cpp +++ b/src/plugins/intel_cpu/src/nodes/transpose.cpp @@ -125,8 +125,12 @@ void Transpose::initSupportedPrimitiveDescriptors() { } } +bool Transpose::canBeSkipped() const { + return isOptimized || getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0); +} + bool Transpose::isExecutable() const { - return !isInputTensorAtPortEmpty(0) && !isOptimized; + return !isOptimized && !isInputTensorAtPortEmpty(0); } bool Transpose::needPrepareParams() const { diff --git a/src/plugins/intel_cpu/src/nodes/transpose.h b/src/plugins/intel_cpu/src/nodes/transpose.h index 03b65c1333610c..7c9e1686645914 100644 --- a/src/plugins/intel_cpu/src/nodes/transpose.h +++ b/src/plugins/intel_cpu/src/nodes/transpose.h @@ -34,6 +34,7 @@ class Transpose : public Node { return order; } + bool canBeSkipped() const override; bool isExecutable() const override; bool needPrepareParams() const override; void prepareParams() override; diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index 5c88772eeedabc..e330cad845837c 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -521,7 +521,7 @@ ov::SupportedOpsMap Plugin::query_model(const std::shared_ptr& Config::ModelType modelType = getModelType(model); conf.readProperties(config, modelType); - auto context = std::make_shared(conf, fake_w_cache, false); + auto context = std::make_shared(conf, fake_w_cache, false, nullptr, nullptr); auto supported = ov::get_supported_nodes( model, diff --git a/src/plugins/intel_cpu/tests/functional/cmake/target_per_test.cmake b/src/plugins/intel_cpu/tests/functional/cmake/target_per_test.cmake index 057869a864d87b..9d7fa9f9d9a365 100644 --- 
a/src/plugins/intel_cpu/tests/functional/cmake/target_per_test.cmake +++ b/src/plugins/intel_cpu/tests/functional/cmake/target_per_test.cmake @@ -96,7 +96,8 @@ endif() endfunction() if(ENABLE_CPU_SPECIFIC_TARGET_PER_TEST) - create_target_per_test_for_directory(${CMAKE_CURRENT_SOURCE_DIR}/custom/subgraph_tests/src ov_cpu_func_subgraph) + # create_target_per_test_for_directory(${CMAKE_CURRENT_SOURCE_DIR}/custom/subgraph_tests/src ov_cpu_func_subgraph) + create_target_per_test_for_directory(${CMAKE_CURRENT_SOURCE_DIR}/custom/subgraph_tests/src/common ov_cpu_func_subgraph) create_target_per_test_for_directory(${CMAKE_CURRENT_SOURCE_DIR}/custom/single_layer_tests ov_cpu_func_slt) endif() diff --git a/src/plugins/intel_cpu/tests/unit/graph/inplace_resolve_io.cpp b/src/plugins/intel_cpu/tests/unit/graph/inplace_resolve_io.cpp index a41cb4c4300d42..96733ec115319a 100644 --- a/src/plugins/intel_cpu/tests/unit/graph/inplace_resolve_io.cpp +++ b/src/plugins/intel_cpu/tests/unit/graph/inplace_resolve_io.cpp @@ -6,6 +6,7 @@ #include "dummy_node.hpp" #include "graph.h" +#include "memory_control.hpp" #include "nodes/input.h" #include "nodes/concat.h" #include "nodes/rnn.h" @@ -42,7 +43,11 @@ class InplaceResolveIOCPUTestBase : public ::testing::Test { std::shared_ptr create_graph(const std::vector& input_shapes, const size_t num_consumers = 1) { Config conf; conf.rtCacheCapacity = 100; - const auto context = std::make_shared(conf, nullptr, false); + const auto context = std::make_shared(conf, + nullptr, + false, + networkMemoryControl->createMemoryControlUnit(), + networkMemoryControl); std::shared_ptr graph = std::shared_ptr(new Graph()); @@ -88,6 +93,7 @@ class InplaceResolveIOCPUTestBase : public ::testing::Test { std::vector nodes; std::vector edges; std::unordered_set nodesSet; + std::shared_ptr networkMemoryControl = std::make_shared(); }; class RNNConcatCPUTest : public InplaceResolveIOCPUTestBase { diff --git a/src/plugins/intel_cpu/tests/unit/graph/memory_state.cpp 
b/src/plugins/intel_cpu/tests/unit/graph/memory_state.cpp index 5b9468ffc35e6f..02a5940965fb6e 100644 --- a/src/plugins/intel_cpu/tests/unit/graph/memory_state.cpp +++ b/src/plugins/intel_cpu/tests/unit/graph/memory_state.cpp @@ -6,6 +6,7 @@ #include "dummy_node.hpp" #include "graph.h" +#include "memory_control.hpp" #include "nodes/memory.hpp" #include "nodes/softmax.h" #include "nodes/shapeof.h" @@ -82,7 +83,8 @@ TEST(MemStateGraphTest, smoke_Check_Memory_Modification_Guard) { Config conf; conf.rtCacheCapacity = 0; - auto context = std::make_shared(conf, nullptr, false); + std::shared_ptr networkMemoryControl = std::make_shared(); + auto context = std::make_shared(conf, nullptr, false, networkMemoryControl->createMemoryControlUnit(), networkMemoryControl); auto input_node = std::make_shared(param, context); auto memory_input = std::make_shared(read, context); @@ -281,7 +283,12 @@ TEST(MemStateGraphTest, smoke_ShapeOf_no_Inplace_Conflicts) { Config conf; conf.rtCacheCapacity = 0; - auto context = std::make_shared(conf, nullptr, false); + std::shared_ptr networkMemoryControl = std::make_shared(); + auto context = std::make_shared(conf, + nullptr, + false, + networkMemoryControl->createMemoryControlUnit(), + networkMemoryControl); auto input_node = std::make_shared(param, context); auto memory_input = std::make_shared(read, context); diff --git a/src/plugins/intel_cpu/tests/unit/graph/merge_transpose_reorder_test.cpp b/src/plugins/intel_cpu/tests/unit/graph/merge_transpose_reorder_test.cpp index 003aca979398fb..71bf2dc340855e 100644 --- a/src/plugins/intel_cpu/tests/unit/graph/merge_transpose_reorder_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/graph/merge_transpose_reorder_test.cpp @@ -9,6 +9,7 @@ #include "common_test_utils/node_builders/constant.hpp" #include "dummy_node.hpp" #include "graph.h" +#include "memory_control.hpp" #include "nodes/input.h" #include "nodes/reorder.h" #include "nodes/reshape.h" @@ -76,7 +77,7 @@ class MergeTransposeReorderCPUTest : 
public testing::WithParamInterface(conf, nullptr, false); + m_context = std::make_shared(conf, nullptr, false, networkMemoryControl->createMemoryControlUnit(), networkMemoryControl); const auto replication_result = CreateModelAndReplicate(shape, params.firstNodeLayout, params.firstNodeInplaceDirection, @@ -173,6 +174,7 @@ class MergeTransposeReorderCPUTest : public testing::WithParamInterface m_context; std::unique_ptr m_graph; + std::shared_ptr networkMemoryControl = std::make_shared(); }; // class MergeTransposeReorderCPUTest /* @@ -335,7 +337,8 @@ TEST(MergeTransposeReorder, smoke_InplaceConflict) { */ Config conf; conf.rtCacheCapacity = 100; - auto context = std::make_shared(conf, nullptr, false); + std::shared_ptr networkMemoryControl = std::make_shared(); + auto context = std::make_shared(conf, nullptr, false, networkMemoryControl->createMemoryControlUnit(), networkMemoryControl); std::unique_ptr graph = std::unique_ptr(new Graph()); diff --git a/src/plugins/intel_cpu/tests/unit/graph/resolve_edge_conflicts_test.cpp b/src/plugins/intel_cpu/tests/unit/graph/resolve_edge_conflicts_test.cpp index b44194a3d5806c..8e510f31f8066c 100644 --- a/src/plugins/intel_cpu/tests/unit/graph/resolve_edge_conflicts_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/graph/resolve_edge_conflicts_test.cpp @@ -5,6 +5,7 @@ #include "dummy_node.hpp" #include "graph.h" +#include "memory_control.hpp" #include "nodes/input.h" #include "nodes/concat.h" #include "openvino/op/concat.hpp" @@ -43,7 +44,12 @@ TEST(ResolveEdgeConflictsCPUTest, smoke_Run_ResolveEdgeConflicts) { */ Config conf; conf.rtCacheCapacity = 100; - auto context = std::make_shared(conf, nullptr, false); + std::shared_ptr networkMemoryControl = std::make_shared(); + auto context = std::make_shared(conf, + nullptr, + false, + networkMemoryControl->createMemoryControlUnit(), + networkMemoryControl); const dnnl::engine cpuEngine = context->getEngine(); std::unique_ptr graph = std::unique_ptr(new Graph()); @@ -104,7 +110,8 @@ 
TEST(ResolveEdgeConflictsCPUTest2, smoke_Run_ResolveEdgeConflicts2) { */ Config conf; conf.rtCacheCapacity = 100; - auto context = std::make_shared(conf, nullptr, false); + std::shared_ptr networkMemoryControl = std::make_shared(); + auto context = std::make_shared(conf, nullptr, false, networkMemoryControl->createMemoryControlUnit(), networkMemoryControl); std::unique_ptr graph = std::unique_ptr(new Graph()); diff --git a/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp b/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp index ea2994759e7036..63a44f5bea7075 100644 --- a/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp @@ -14,6 +14,7 @@ #include #include "common_test_utils/common_utils.hpp" +#include "memory_control.hpp" #include "nodes/input.h" using namespace ov::intel_cpu; @@ -108,7 +109,9 @@ class ReorderCPUTestGraph { conf.rtCacheCapacity = 100; auto context = std::make_shared(conf, std::make_shared(), - false); + false, + networkMemoryControl->createMemoryControlUnit(), + networkMemoryControl); const dnnl::engine cpuEngine = context->getEngine(); inputNode = std::make_shared(inputDesc.clone(), @@ -152,6 +155,7 @@ class ReorderCPUTestGraph { std::shared_ptr parentEdge; std::shared_ptr childEdge; ov::element::Type prec; + std::shared_ptr networkMemoryControl = std::make_shared(); }; }// namespace ReorderCPUTest