diff --git a/src/inference/dev_api/openvino/runtime/memory_solver.hpp b/src/inference/dev_api/openvino/runtime/memory_solver.hpp index b2e11d203ef50f..5a93004c5e20f2 100644 --- a/src/inference/dev_api/openvino/runtime/memory_solver.hpp +++ b/src/inference/dev_api/openvino/runtime/memory_solver.hpp @@ -52,6 +52,7 @@ class MemorySolver { struct Box { /** Execution order index of first use. The data will be produced here. */ int start; + // intel_cpu::GlobalExecutionIndex start; /** * The execution order index of last use. After that data will be released. @@ -59,6 +60,7 @@ class MemorySolver { * end of execution. */ int finish; + // intel_cpu::GlobalExecutionIndex finish; /** Size of data. In abstract unit of measure (byte, simd, cache line, ...) */ int64_t size; diff --git a/src/plugins/intel_cpu/src/allocation_context.hpp b/src/plugins/intel_cpu/src/allocation_context.hpp new file mode 100644 index 00000000000000..8affe814807004 --- /dev/null +++ b/src/plugins/intel_cpu/src/allocation_context.hpp @@ -0,0 +1,26 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +namespace ov { +namespace intel_cpu { + +class Node; +class Edge; + +using GlobalExecutionIndex = std::unordered_map, std::pair>; + +struct AllocationContext { + std::vector> edges; + GlobalExecutionIndex execIndex; + std::vector syncPoints; +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/compiled_model.cpp b/src/plugins/intel_cpu/src/compiled_model.cpp index bbee5d937be5d5..bbaa77b2d7cbc6 100644 --- a/src/plugins/intel_cpu/src/compiled_model.cpp +++ b/src/plugins/intel_cpu/src/compiled_model.cpp @@ -4,6 +4,7 @@ #include "compiled_model.h" #include "async_infer_request.h" +#include "graph.h" #include "infer_request.h" #include "itt.h" #include "low_precision/low_precision.hpp" @@ -19,6 +20,7 @@ #include "openvino/runtime/threading/cpu_streams_info.hpp" #include "openvino/runtime/threading/cpu_message.hpp" #include "utils/serialize.hpp" +#include "memory_control.hpp" #include "cpu/x64/cpu_isa_traits.hpp" #include @@ -52,7 +54,8 @@ CompiledModel::CompiledModel(const std::shared_ptr& model, m_cfg{cfg}, m_name{model->get_name()}, m_loaded_from_cache(loaded_from_cache), - m_sub_memory_manager(sub_memory_manager) { + m_sub_memory_manager(sub_memory_manager), + m_networkMemoryControl(std::make_shared()) { m_mutex = std::make_shared(); const auto& core = m_plugin->get_core(); if (!core) @@ -155,17 +158,23 @@ CompiledModel::GraphGuard::Lock CompiledModel::get_graph() const { GraphContext::Ptr ctx; { std::lock_guard lock{*m_mutex.get()}; + MemoryControl* memoryControl = m_networkMemoryControl->createMemoryControlUnit(); auto isQuantizedFlag = (m_cfg.lpTransformsMode == Config::On) && ov::pass::low_precision::LowPrecision::isFunctionQuantized(m_model); - ctx = std::make_shared(m_cfg, m_socketWeights[socketId], isQuantizedFlag, + memoryControl, + m_networkMemoryControl, streamsExecutor, m_sub_memory_manager); } + const std::shared_ptr model = m_model; - graphLock._graph.CreateGraph(model, ctx); + // @todo propagate input / output memory descriptors + graphLock._graph.Init(model, ctx); + // @todo pass input / output memory + graphLock._graph.Activate({}, {}, true); } catch (...) 
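// Illustrative sketch only (not part of the patch): the template arguments of the new
// allocation_context.hpp above are not visible in this rendering, so the aggregate is
// reconstructed here as an assumption, using placeholder forward declarations in place of
// the real Node/Edge classes.
#include <cstddef>
#include <memory>
#include <unordered_map>
#include <utility>
#include <vector>

namespace sketch {
class Node;
class Edge;
using NodePtr = std::shared_ptr<Node>;
using EdgePtr = std::shared_ptr<Edge>;

// node -> {global execution index of its first slot, global execution index of its last slot}
using GlobalExecutionIndex = std::unordered_map<NodePtr, std::pair<int, int>>;

struct AllocationContext {
    std::vector<EdgePtr> edges;      // edges of the outer graph plus all registered inner graphs
    GlobalExecutionIndex execIndex;  // execution order shared by outer and inner graphs
    std::vector<size_t> syncPoints;  // global indices of the synchronization nodes
};
}  // namespace sketch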
{ exception = std::current_exception(); } @@ -346,7 +355,7 @@ void CompiledModel::release_memory() { for (auto&& graph : m_graphs) { GraphGuard::Lock graph_lock{graph}; auto ctx = graph_lock._graph.getGraphContext(); - ctx->getNetworkMemoryControl()->releaseMemory(); + m_networkMemoryControl->releaseMemory(); } } diff --git a/src/plugins/intel_cpu/src/compiled_model.h b/src/plugins/intel_cpu/src/compiled_model.h index faedf1ae5a744c..cab50971f31a78 100644 --- a/src/plugins/intel_cpu/src/compiled_model.h +++ b/src/plugins/intel_cpu/src/compiled_model.h @@ -4,6 +4,7 @@ #pragma once +#include #include #include @@ -19,6 +20,8 @@ namespace ov { namespace intel_cpu { +class NetworkMemoryControl; + class CompiledModel : public ov::ICompiledModel { public: typedef std::shared_ptr Ptr; @@ -51,6 +54,10 @@ class CompiledModel : public ov::ICompiledModel { void release_memory() override; + std::shared_ptr get_network_memory_control() const { + return m_networkMemoryControl; + } + private: std::shared_ptr create_sync_infer_request() const override; friend class SyncInferRequest; @@ -91,6 +98,7 @@ class CompiledModel : public ov::ICompiledModel { std::vector> m_sub_compiled_models; std::shared_ptr m_sub_memory_manager = nullptr; + std::shared_ptr m_networkMemoryControl = nullptr; bool m_has_sub_compiled_models = false; }; diff --git a/src/plugins/intel_cpu/src/edge.cpp b/src/plugins/intel_cpu/src/edge.cpp index c314718bb82416..6c2f3144f15f9b 100644 --- a/src/plugins/intel_cpu/src/edge.cpp +++ b/src/plugins/intel_cpu/src/edge.cpp @@ -235,7 +235,7 @@ Edge::ReorderStatus Edge::needReorder() { } void Edge::reuse(MemoryPtr ptr) { - OPENVINO_ASSERT(ptr != nullptr, "Attempt to reuse initialized memory in " + name()); + OPENVINO_ASSERT(ptr != nullptr, "Attempt to reuse uninitialized memory in " + name()); memoryPtr = ptr; changeStatus(Status::Allocated); @@ -293,6 +293,11 @@ std::string Edge::name() const { std::stringstream result; result << parentPtr->getName() << " port " << parent_port << " <-> " << childPtr->getName() << " port " << child_port; + // result << parentPtr->getName()<< " port " << parent_port + // << " <-> " + // << childPtr->getName() << " port " << child_port + // << " status: " + // << static_cast(getStatus()); return result.str(); } @@ -441,7 +446,7 @@ void Edge::validate() { getChild(); if (status != Status::Allocated || !memoryPtr) { - OPENVINO_THROW("Error memory is not allocated!"); + OPENVINO_THROW("Error memory is not allocated for edge: ", name()); } status = Status::Validated; } diff --git a/src/plugins/intel_cpu/src/edge.h b/src/plugins/intel_cpu/src/edge.h index 29cb8113943cd3..e77a5cecf89aeb 100644 --- a/src/plugins/intel_cpu/src/edge.h +++ b/src/plugins/intel_cpu/src/edge.h @@ -82,6 +82,7 @@ class Edge { } std::string name() const; + const MemoryDesc& getDesc() const; private: std::weak_ptr parent; @@ -99,7 +100,6 @@ class Edge { PortDescBaseCPtr getInputPortDesc() const; PortDescBaseCPtr getOutputPortDesc() const; - const MemoryDesc& getDesc() const; bool enforceReorder(); void collectConsumers(std::vector>& result) const; diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp index f3f3a379fc2af7..b8f3a1ccbcd81b 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -6,6 +6,9 @@ #include #include +#include +#include +#include #include #include #include @@ -16,11 +19,14 @@ #include #include +#include "allocation_context.hpp" #include "edge.h" +#include "graph_context.h" #include "graph_dumper.h" #include 
"graph_optimizer.h" #include "infer_request.h" #include "itt.h" +#include "memory_control.hpp" #include "memory_desc/cpu_memory_desc_utils.h" #include "memory_desc/dnnl_blocked_memory_desc.h" #include "node.h" @@ -277,7 +283,9 @@ static std::tuple, std::vector> ExtractExecutableNo std::vector executableGraphNodes; for (size_t i = 0; i < graphNodes.size(); i++) { const auto& graphNode = graphNodes[i]; - if ((!graphNode->isConstant() && CPU_DEBUG_CAPS_ALWAYS_TRUE(graphNode->isExecutable())) || // non-constant executable or + // if ((!graphNode->isConstant() && CPU_DEBUG_CAPS_ALWAYS_TRUE(!graphNode->canBeSkipped())) || // non-constant executable or + // if ((!graphNode->isConstant()) || // non-constant executable or + if ((!graphNode->isConstant() && !graphNode->canBeSkipped()) || // non-constant executable or (graphNode->isDynamicNode() && !one_of(graphNode->getType(), Type::Input, Type::Output))) { // dynamic, except inputs / outputs /* @todo * Revise implementation. @@ -350,16 +358,23 @@ static void UseExternalOutputMemory(const std::map& output } void Graph::Activate(const std::vector& externalInputMemory, - const std::vector& externalOutputMemory) { - OPENVINO_ASSERT(status == Status::Initialized, "Invalid graph status"); + const std::vector& externalOutputMemory, + bool globalAllocation) { + // OPENVINO_ASSERT(status == Status::Initialized, "Invalid graph status"); - const bool hasDynNodes = ProcessDynNodes(); - const auto syncNodesInds = hasDynNodes ? IdentifySyncPoints(graphNodes) : std::vector{}; + // const bool hasDynNodes = ProcessDynNodes(); + // const auto syncNodesInds = hasDynNodes ? IdentifySyncPoints(graphNodes) : std::vector{}; UseExternalInputMemory(inputNodesMap, externalInputMemory); UseExternalOutputMemory(outputNodesMap, externalOutputMemory); - Allocate(syncNodesInds); + // std::tie(m_executableGraphNodes, m_executableSyncNodesInds) = ExtractExecutableNodesAndSyncPoints(syncNodesInds, graphNodes); + + // status = hasDynNodes ? (parallel_get_max_threads() > 1 ? Status::ReadyDynamic : Status::ReadyDynamicSeq) + // : Status::ReadyStatic; + + // CPU_DEBUG_CAP_ENABLE(serialize(*this)); + Allocate(globalAllocation); CreatePrimitivesAndExecConstants(); @@ -369,22 +384,6 @@ void Graph::Activate(const std::vector& externalInputMemory, } #endif - std::tie(m_executableGraphNodes, m_executableSyncNodesInds) = ExtractExecutableNodesAndSyncPoints(syncNodesInds, graphNodes); - - if (hasDynNodes) { - status = Status::ReadyDynamic; - // Here we use the following heuristic: if the number of sync nodes is less than 10 times of the number of exec - // nodes, it does make sense to use Sequential dynamic shapes processing due to the high overheads on context - // switching when the dynamic shapes are being processed in parallel and there are a lot of sync points. Also - // this rule works for short graphs (usually subgraphs) when the amount of nodes is to low to process them in - // parallel. 
- const auto exec2sync = m_executableGraphNodes.size() / m_executableSyncNodesInds.size(); - if (exec2sync < 10 || parallel_get_max_threads() < 2) { - status = Status::ReadyDynamicSeq; - } - } else { - status = Status::ReadyStatic; - } CPU_DEBUG_CAP_ENABLE(serialize(*this)); } @@ -713,206 +712,137 @@ void Graph::ResolveComplexInplaceConflicts() { } } -static inline bool isConstOutput(EdgePtr edge) { - return edge->getParent()->isConstant() && !edge->getChild()->isConstant(); -} - -void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { - edgeClusters edge_clusters = MemoryControl::findEdgeClusters(graphEdges); - - size_t remaining_edge_clusters_count = edge_clusters.size(); - - // Resolve special cases: - for (size_t i = 0; i < remaining_edge_clusters_count;) { - auto &cluster = edge_clusters[i]; - bool erase = false; - for (auto &edge : cluster) { - // Remove already allocated edges from the mem reuse algo - if (edge->getStatus() == Edge::Status::Allocated) { - erase = true; - break; - } - - // Special allocation for string tensors - if (edge->getDesc().getPrecision() == element::string && edge->getStatus() == Edge::Status::NeedAllocation) { - StringMemory::StringMemoryBlockPtr memBlcok; - if (edge->getParent()->isConstant()) { - if (edge->getParent()->getType() == Type::Input) { - auto constNode = static_cast(edge->getParent().get()); - edge->reuse(std::const_pointer_cast(constNode->getMemoryPtr())); - } else { - edge->externalAllocate(m_context->getWeightsCache()); - } - auto stringMemory = dynamic_cast(edge->getMemoryPtr().get()); - OPENVINO_ASSERT(stringMemory, "[CPU] Edge between nodes '", - edge->getParent()->getName(), "' and '", edge->getChild()->getName(), "' must have StringMemory."); - memBlcok = stringMemory->getStringMemoryBlockPtr(); - } else { - auto memory = std::make_shared(getEngine(), edge->getDesc()); - edge->reuse(memory); - memBlcok = memory->getStringMemoryBlockPtr(); - } - for (auto& edge_c : cluster) { - if (edge_c == edge) { - continue; - } - OPENVINO_ASSERT(edge_c->getDesc().getPrecision() == element::string, "All edges in the cluster must be string."); - if (edge_c->getStatus() == Edge::Status::NotAllocated) { - auto memory = std::make_shared(getEngine(), edge_c->getDesc(), memBlcok); - edge_c->reuse(memory); - } else { - OPENVINO_THROW("[CPU] String tensors allocation in the cluster. Edge between nodes '", edge_c->getParent()->getName(), "' and '", - edge_c->getChild()->getName(), "' has an unexpected status: ", static_cast(edge_c->getStatus())); - } - } - erase = true; - continue; - } - - // Special allocation for constants - if (edge->getStatus() != Edge::Status::NeedAllocation || !edge->getParent()->isConstant()) { - continue; - } +/** + * Partition the \clusters of Edges, by moving and allocating at the same time + * the clusters which cannot be handled as part of generic memory solver algorithm. 
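// A minimal, self-contained illustration (toy int data, not the real EdgeCluster type) of the
// std::partition + std::distance idiom that AllocateStringsAndConstants() below relies on:
// clusters needing special handling are moved behind the returned iterator and allocated on
// the spot, while the return value is the count of clusters left in the front partition for
// the generic memory solver.
#include <algorithm>
#include <cstddef>
#include <iterator>
#include <vector>

inline std::size_t partitionSketch() {
    std::vector<int> clusters{3, -1, 7, -2, 5};  // negative values stand in for "special" clusters
    auto firstSpecial = std::partition(clusters.begin(), clusters.end(),
                                       [](int cluster) { return cluster >= 0; });
    // the three non-negative clusters now precede firstSpecial (relative order not guaranteed)
    return static_cast<std::size_t>(std::distance(clusters.begin(), firstSpecial));  // returns 3
}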
+ * Such clusters meet one of the following criteria: + * - base edge of a cluster is already Allocated + * - base edge of a cluster is a "ov::element::string" type of edge + * - base edge of a cluster is a Constant edge + * + * @return a remaining number of clusters to process (left partition) + */ +static size_t AllocateStringsAndConstants(EdgeClusters& clusters, + const GraphContext::CPtr context) { + auto allocateStringMemory = [context](const EdgePtr& edge) { + if (edge->getParent()->isConstant()) { if (edge->getParent()->getType() == Type::Input) { - auto constNode = std::static_pointer_cast(edge->getParent()); + auto constNode = static_cast(edge->getParent().get()); edge->reuse(std::const_pointer_cast(constNode->getMemoryPtr())); } else { - edge->externalAllocate(m_context->getWeightsCache()); - } - erase = true; - } - - if (erase) { - std::swap(edge_clusters[i], edge_clusters[remaining_edge_clusters_count - 1]); - --remaining_edge_clusters_count; - } else { - ++i; - } - } - - // Markup the memory regions - std::vector memoryRegions; - memoryRegions.reserve(remaining_edge_clusters_count); - - for (size_t i = 0; i < remaining_edge_clusters_count; ++i) { - MemoryRegion reg = {std::numeric_limits::max(), - 0, - 0, - static_cast(i), - MemoryRegion::RegionType::VARIABLE, - MemoryRegion::AllocType::UNKNOWN}; - - int64_t boxSize = 0; - bool isConst = false, isOutput = false, isInput = false; - for (auto &edge : edge_clusters[i]) { - int e_start = edge->getParent()->getExecIndex(); - int e_finish = edge->getChild()->getExecIndex(); - - auto&& desc = edge->getDesc(); - - if (boxSize != -1 && desc.isDefined()) { - int64_t e_size = desc.getCurrentMemSize(); // size in bytes (from the beginning of data to the last element) - boxSize = std::max(e_size, boxSize); - } else { - boxSize = -1; - } - - reg.start = std::min(e_start, reg.start); - reg.finish = std::max(e_finish, reg.finish); - - auto allocType = - desc.getPrecision() == element::string ? 
MemoryRegion::AllocType::STRING : MemoryRegion::AllocType::POD; - - if (reg.alloc_type != allocType && MemoryRegion::AllocType::UNKNOWN != reg.alloc_type) { - OPENVINO_THROW("Different allocation types in the same memory region"); - } - reg.alloc_type = allocType; - - isConst |= isConstOutput(edge); - isOutput |= edge->getChild()->getType() == Type::Output; - isInput |= edge->getParent()->getType() == Type::Input; - } - - reg.size = boxSize; - - if (isConst) { - reg.type = MemoryRegion::RegionType::CONSTANT; - } else if (isInput) { - if (isOutput) { - reg.type = MemoryRegion::RegionType::IO; - } else { - reg.type = MemoryRegion::RegionType::INPUT; + edge->externalAllocate(context->getWeightsCache()); } - } else if (isOutput) { - reg.type = MemoryRegion::RegionType::OUTPUT; + auto stringMemory = dynamic_cast(edge->getMemoryPtr().get()); + OPENVINO_ASSERT(stringMemory, "[CPU] Edge between nodes '", + edge->getParent()->getName(), "' and '", edge->getChild()->getName(), "' must have StringMemory."); + return stringMemory->getStringMemoryBlockPtr(); } - memoryRegions.push_back(reg); - } + auto memory = std::make_shared(context->getEngine(), edge->getDesc()); + edge->reuse(memory); + return memory->getStringMemoryBlockPtr(); + }; - // special processing of the dynamic output edges - auto it = std::remove_if(memoryRegions.begin(), memoryRegions.end(), [&](const MemoryRegion& region) { - if (region.size >= 0 || !one_of(region.type, MemoryRegion::RegionType::OUTPUT, MemoryRegion::RegionType::IO)) { - return false; - } - bool result = false; - for (auto& edge : edge_clusters[region.id]) { - auto child = edge->getChild(); - if (child->getType() == Type::Output && edge->getStatus() == Edge::Status::NeedAllocation) { - auto proxyMemBlock = std::make_shared(); - DEBUG_LOG("ProxyMemoryBlock ", proxyMemBlock, " ", this); - edge->allocate(proxyMemBlock); - - // Store the output memory blocks. - // So that, the infer requests can be able to access them. - int count = 0; - for (auto& output : outputNodesMap) { - if (output.second == child) { - outputNodesMemBlocksMap[output.first] = proxyMemBlock; - count++; - } - } - // sometimes there are unused output ports. - OPENVINO_ASSERT(count <= 1, "CPU plugin cannot find output node. count ", count); - result = true; - } + auto allocateConstantEdge = [context](const EdgePtr& edge) { + // std::cout << "Allocating constant edge: " << edge->name() << " wc: " << context->getWeightsCache() << "\n"; + if (edge->getParent()->getType() == Type::Input) { + auto constNode = std::static_pointer_cast(edge->getParent()); + edge->reuse(std::const_pointer_cast(constNode->getMemoryPtr())); + } else { + edge->externalAllocate(context->getWeightsCache()); } - return result; - }); - - memoryRegions.erase(it, memoryRegions.end()); + }; - //Set up the memory control subsystem. 
- this->m_pMemoryControl = &(getGraphContext()->getNetworkMemoryControl()->createMemoryControlUnit(syncNodesInds)); - auto memoryBlocks = m_pMemoryControl->insert(memoryRegions); + auto endOfNotAllocatedPartition = + std::partition(clusters.begin(), clusters.end(), + [&allocateStringMemory, &allocateConstantEdge, &context](const EdgeCluster& cluster) { + if (cluster.empty()) return false; + + auto baseEdgeIt = std::find_if(cluster.begin(), cluster.end(), [](const EdgePtr& edge) { + return one_of(edge->getStatus(), Edge::Status::Allocated, Edge::Status::NeedAllocation); + }); + + OPENVINO_ASSERT(baseEdgeIt != cluster.end(), "Unexpected cluster state"); + + // const auto& baseEdge = cluster.front(); + const auto& baseEdge = *baseEdgeIt; + // Skip already allocated cluster + if (baseEdge->getStatus() == Edge::Status::Allocated) { + return false; + } + + // std::cout << "Processing string/const for base edge: " << baseEdge->name() << "\n"; + + // Skip if the baseEdge does not require allocation + if (baseEdge->getStatus() != Edge::Status::NeedAllocation) { + return true; + } + + // Allocate a string cluster + if (baseEdge->getDesc().getPrecision() == element::string) { + OPENVINO_ASSERT(std::all_of(cluster.begin(), cluster.end(), + [](const EdgePtr& edge) { + return edge->getDesc().getPrecision() == element::string; + }), "All edges in the cluster must be string."); + auto memBlock = allocateStringMemory(baseEdge); + for (auto &edge : cluster) { + if (edge->getStatus() == Edge::Status::NotAllocated) { + edge->reuse(std::make_shared(context->getEngine(), edge->getDesc(), memBlock)); + } + } + return false; + } + + // Allocate a constant cluster + if (baseEdge->getParent()->isConstant()) { + // @todo can we add some meaningful assert here? + for (auto &edge : cluster) { + if (edge->getParent()->isConstant() && edge->getStatus() == Edge::Status::NeedAllocation) { + allocateConstantEdge(edge); + } + } + return false; + } + + return true; + }); + + return std::distance(clusters.begin(), endOfNotAllocatedPartition); +} - // attach all the not yet allocated edges to the memory contol +static void attachEdgeToMemoryControl(const EdgeClusters& edgeClusters, + const MemoryControl::MemoryBlockMap& memoryBlocks) { + // attach all the not yet allocated edges to the memory control for (auto&& item : memoryBlocks) { int count = 0; - for (auto&& edge : edge_clusters[item.first]) { + // std::cout << "Processing cluster: " << item.first << "\n"; + for (auto&& edge : edgeClusters[item.first]) { + // std::cout << "Processing edge: " << edge->name() << "\n"; if (edge->getStatus() == Edge::Status::NeedAllocation) { + // std::cout << "Allocating edge: " << edge->name() << "\n"; + edge->allocate(item.second); // TODO: WA for some test (like strided_slice_test) which use tensors with // shapes {0}. And it is implicitly converted into {1} tensor. // Zeroing of input data allow pass tests. - if (edge->getParent()->type == Type::Input && edge->hasDefinedMaxSize()) + if (edge->getParent()->getType() == Type::Input && edge->hasDefinedMaxSize()) edge->getMemoryPtr()->nullify(); count++; } } - OPENVINO_ASSERT(count == 1); + OPENVINO_ASSERT(count == 1, "Expected exactly one allocation. 
Actual number of allocations: ", count); } +} - m_pMemoryControl->allocateMemory(); - - // Resolve all other edges with status NotAllocated and in-place - for (auto& cluster : edge_clusters) { +static void resolveInPlaceEdges(const EdgeClusters& clusters) { + for (auto& cluster : clusters) { for (auto& edge : cluster) { if (edge->getStatus() != Edge::Status::NotAllocated) { continue; } + std::vector edges_to_process; edges_to_process.push_back(edge); for (auto next_edge = edge->getSharedEdge(std::nothrow); @@ -920,7 +850,9 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { next_edge = next_edge->getSharedEdge(std::nothrow)) { edges_to_process.push_back(next_edge); } + std::for_each(edges_to_process.rbegin(), edges_to_process.rend(), [](const EdgePtr& edge) { + // std::cout << "Processing edge: " << edge->name() << "\n"; if (edge->getStatus() == Edge::Status::NotAllocated) { if (edge->inPlace(Edge::LOOK_DOWN)) { edge->getChild()->resolveInPlaceEdges(Edge::LOOK_DOWN); @@ -929,6 +861,7 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { } else { auto sharedEdge = edge->getSharedEdge(); auto sharedEdgeParent = sharedEdge->getParent(); + // std::cout << "Allocating edge: " << edge->name() << " Using shared edge: " << sharedEdge->name() << "\n"; edge->allocate(sharedEdge->getMemoryPtr()->getMemoryBlock()); DEBUG_LOG(*edge, " sharedEdge with ", *sharedEdge); } @@ -938,11 +871,37 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { } } -void Graph::Allocate(const std::vector& syncNodesInds) { - OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::Allocate"); +std::vector Graph::CreateExecutionGraph() { + const bool hasDynNodes = ProcessDynNodes(); + auto syncNodesInds = hasDynNodes ? IdentifySyncPoints(graphNodes) : std::vector{}; + + std::tie(m_executableGraphNodes, m_executableSyncNodesInds) = + ExtractExecutableNodesAndSyncPoints(syncNodesInds, graphNodes); + + status = hasDynNodes ? (parallel_get_max_threads() > 1 ? Status::ReadyDynamic : Status::ReadyDynamicSeq) + : Status::ReadyStatic; + + if (hasDynNodes) { + status = Status::ReadyDynamic; + // Here we use the following heuristic: if the number of sync nodes is less than 10 times of the number of exec + // nodes, it does make sense to use Sequential dynamic shapes processing due to the high overheads on context + // switching when the dynamic shapes are being processed in parallel and there are a lot of sync points. Also + // this rule works for short graphs (usually subgraphs) when the amount of nodes is to low to process them in + // parallel. 
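// A short worked example of the heuristic implemented just below (hypothetical counts,
// illustration only): with 40 executable nodes and 8 sync nodes, exec2sync == 40 / 8 == 5,
// which is < 10, so the graph falls back to Status::ReadyDynamicSeq; with 400 executable
// nodes and the same 8 sync nodes, exec2sync == 50 >= 10, so parallel dynamic processing
// (Status::ReadyDynamic) is kept, provided parallel_get_max_threads() >= 2.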
+ const auto exec2sync = m_executableGraphNodes.size() / m_executableSyncNodesInds.size(); + if (exec2sync < 10 || parallel_get_max_threads() < 2) { + status = Status::ReadyDynamicSeq; + } + } else { + status = Status::ReadyStatic; + } + + return syncNodesInds; +} - //resolve inplace dead end nodes - for (const auto& edge : graphEdges) { +static void ResolveInOutInPlaceEdgesLegacy(const std::vector& edges) { + for (const auto& edge : edges) { + // std::cout << edge->name() << "\n"; if (edge->getStatus() == Edge::Status::Uninitialized) { if (edge->getParent()->getParentEdges().empty() && one_of(edge->getParent()->getType(), Type::Input, Type::MemoryInput) && @@ -955,20 +914,124 @@ void Graph::Allocate(const std::vector& syncNodesInds) { } } } +} + +static void ResolveInOutInPlaceEdges(const std::vector& edges) { + for (const auto& edge : edges) { + if (edge->getStatus() == Edge::Status::Uninitialized) { + if (edge->getParent()->getParentEdges().empty() && + one_of(edge->getParent()->getType(), Type::MemoryInput) && + edge->inPlace(Edge::LOOK_UP)) { + edge->getParent()->resolveInPlaceEdges(Edge::LOOK_UP); + } else if (edge->getChild()->getChildEdges().empty() && + one_of(edge->getChild()->getType(), Type::MemoryOutput) && + edge->inPlace(Edge::LOOK_DOWN)) { + edge->getChild()->resolveInPlaceEdges(Edge::LOOK_DOWN); + } + } + } +} + +int Graph::RegisterToAllocationContext(int offset, AllocationContext& context) { + auto syncNodesInds = CreateExecutionGraph(); + + ResolveInOutInPlaceEdges(graphEdges); + + for (size_t i = 0, j = 0; i < graphNodes.size(); i++) { + const auto& node = graphNodes[i]; + const auto inputExecIndex = i + offset; + offset = node->registerToAllocationContext(offset, context) - 1; + const auto outputExecIndex = i + offset; + context.execIndex[node] = {inputExecIndex, outputExecIndex}; + + if (j < syncNodesInds.size() && syncNodesInds[j] == i) { + context.syncPoints.push_back(inputExecIndex); + j++; + } + } + + context.edges.insert(context.edges.end(), graphEdges.begin(), graphEdges.end()); + + return offset; +} + +AllocationContext Graph::CreateAllocationContext(bool global) { + AllocationContext allocationContext; + + if (global) { + RegisterToAllocationContext(0, allocationContext); + } else { // local allocation context. Used for the nodes with inner graph which are not updated yet + ResolveInOutInPlaceEdgesLegacy(graphEdges); + + auto syncNodesInds = CreateExecutionGraph(); + + for (size_t i = 0; i < graphNodes.size(); i++) { + const auto& node = graphNodes[i]; + const int inputExecIndex = i; + const int outputExecIndex = i; + allocationContext.execIndex[node] = {inputExecIndex, outputExecIndex}; + } + + allocationContext.edges = graphEdges; + allocationContext.syncPoints = syncNodesInds; + } - // resolve edges. Define which will be a view on others - // NeedAllocation - real blob - // NotAllocated - view on other blob, peer or in-place - for (auto& edge : graphEdges) edge->init(); + return allocationContext; +} + +void Graph::Allocate(bool globalAllocation) { + if (std::getenv("FORCE_LOCAL")) + globalAllocation = false; + // Set up the memory control subsystem. + auto memoryControl = globalAllocation ? 
m_context->getMemoryControl() : m_context->getNetworkMemoryControl()->createMemoryControlUnit(); + + if (memoryControl->allocated()) { + // std::cout << "Memory is already allocated for a subgraph: " << _name << "\n"; + return; + } + + // @todo collect syncNodesInds with respect to global context as well + auto allocationContext = CreateAllocationContext(globalAllocation); + const auto& edges = allocationContext.edges; + + // std::cout << "### Global edges:" << "\n"; + // for (const auto& edge : edges) { + // const auto& parent = edge->getParent(); + // const auto& child = edge->getChild(); + // std::cout << "[" << allocationContext.execIndex[parent].second << " - " << allocationContext.execIndex[child].first << "]" + // << edge->name() + // << "\n"; + // } + + // ResolveInOutInPlaceEdges(edges); + + for (auto& edge : edges) edge->init(); + + auto edgeClusters = MemoryControl::formEdgeClusters(edges); - // Allocate memory space for all edges marked with NeedAllocation - AllocateWithReuse(syncNodesInds); + const size_t remainingEdgeClustersCount = AllocateStringsAndConstants(edgeClusters, m_context); - // Check all getters. Should work. - for (auto& edge : graphEdges) edge->validate(); + // std::cout << "Edge clusters size: " << edgeClusters.size() << " remaining: " << remainingEdgeClustersCount << "\n"; + + auto memoryRegions = MemoryControl::formMemoryRegions(edgeClusters, + remainingEdgeClustersCount, + allocationContext.execIndex); + + m_outputNodesMemBlocks = MemoryControl::filterOutDynamicOutputEdges(memoryRegions, + edgeClusters, + outputNodesMap); + + memoryControl->insert(memoryRegions, allocationContext.syncPoints); + auto memoryBlocks = memoryControl->solve(); + + attachEdgeToMemoryControl(edgeClusters, memoryBlocks); + memoryControl->allocateMemory(); + resolveInPlaceEdges(edgeClusters); + + for (auto& edge : edges) edge->validate(); } -bool Graph::ProcessDynNodes() { +bool Graph::ProcessDynNodes() const { OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::ProcessDynNodes"); const bool containsDynamicNodes = std::any_of(graphNodes.begin(), graphNodes.end(), [](const NodePtr& node) { @@ -1395,14 +1458,6 @@ void Graph::Infer(SyncInferRequest* request) { DEBUG_LOG("Infer graph: ", GetName(), ". Status: ", static_cast(status)); const int numaId = GetNumaNodeId(m_context); - if (!m_pMemoryControl) { - OPENVINO_THROW("Memory control unit is not initilized in graph: ", GetName()); - } - - if (!m_pMemoryControl->allocated()) { - m_pMemoryControl->allocateMemory(); - } - switch (status) { case Status::ReadyDynamic: InferDynamic(request, numaId, UpdateNodes(m_executableGraphNodes)); diff --git a/src/plugins/intel_cpu/src/graph.h b/src/plugins/intel_cpu/src/graph.h index d50ccc152c9186..2e31c9f9243b0c 100644 --- a/src/plugins/intel_cpu/src/graph.h +++ b/src/plugins/intel_cpu/src/graph.h @@ -4,17 +4,17 @@ #pragma once +#include "allocation_context.hpp" #include "config.h" #include "cpu_memory.h" #include "nodes/input.h" -#include "openvino/core/node_vector.hpp" #include "openvino/runtime/profiling_info.hpp" #include "node.h" #include "edge.h" #include "graph_context.h" -#include "memory_control.hpp" #include "openvino/runtime/profiling_info.hpp" +#include #include #include #include @@ -31,6 +31,19 @@ namespace node { class MemoryStateNode; } // namespace node +struct MemoryRegion { + int start; // Execution order index of first use. + int finish; // Execution order index of last use. 
-1 means inf + int64_t size; // size in bytes + int64_t id; // ID unique for each region + + enum class RegionType : uint8_t { VARIABLE, CONSTANT, INPUT, OUTPUT, IO } type; + enum class AllocType : uint8_t { POD, STRING, UNKNOWN } alloc_type; +}; + +using MemoryRegions = std::vector; +using OutputMemoryBlocks = std::unordered_map; + class Graph { public: typedef std::shared_ptr Ptr; @@ -75,6 +88,9 @@ class Graph { void PushInputData(const std::size_t& index, const ov::SoPtr& input); void PullOutputData(std::unordered_map>& output); + // @todo pass as part of one of the graph configuration stages + // void SetGlobalExecutionIndex() { + // } // Returns Output nodes memory descriptors VecMemoryDescs getOutputMemoryDescriptors() const; @@ -213,12 +229,26 @@ class Graph { /** * Activate execution graph using \p externalInputMemory and \p externalOutputMemory + * 'globalAllocation' is a temporary flag indicating that the current graph is participaing in + * global memory reuse (together with all inner / outer graphs). + * The flag should be dropped after all the nodes with inner graphs participate in + * global memory reuse by default */ void Activate(const std::vector& externalInputMemory = {}, - const std::vector& externalOutputMemory = {}); + const std::vector& externalOutputMemory = {}, + bool globalAllocation = false); + + MemoryRegions RegisterExternalMemory(const std::vector& externalInputMemory = {}, + const std::vector& externalOutputMemory = {}); + + void Allocate(bool globalAllocation = false); + + AllocationContext CreateAllocationContext(bool global); + + int RegisterToAllocationContext(int offset, AllocationContext& context); const std::unordered_map& getOutputNodesMemBlocksMap() const { - return outputNodesMemBlocksMap; + return m_outputNodesMemBlocks; } protected: @@ -256,10 +286,10 @@ class Graph { void InitOptimalPrimitiveDescriptors(); void ResolveEdgeConflicts(); void ResolveComplexInplaceConflicts(); - bool ProcessDynNodes(); - void Allocate(const std::vector& syncNodesInds); - void AllocateWithReuse(const std::vector& syncNodesInds); + bool ProcessDynNodes() const; + void AllocateWithReuse(const std::vector& syncNodesInds, GlobalExecutionIndex globalExecIndex); void CreatePrimitivesAndExecConstants() const; + std::vector CreateExecutionGraph(); /** * Execute a given \p node within \p request using \p numaId @@ -300,7 +330,7 @@ class Graph { std::map inputNodesMap; std::map outputNodesMap; - std::unordered_map outputNodesMemBlocksMap; + OutputMemoryBlocks m_outputNodesMemBlocks; // these node pointers (from graphNodes) are to avoid regular checking for // constantness of nodes in Infer methods and calls of @@ -310,8 +340,6 @@ class Graph { GraphContext::CPtr m_context; dnnl::stream m_stream; - - MemoryControl* m_pMemoryControl = nullptr; }; using GraphPtr = std::shared_ptr; diff --git a/src/plugins/intel_cpu/src/graph_context.cpp b/src/plugins/intel_cpu/src/graph_context.cpp index e200766fa4791c..e4eb13ed58f53f 100644 --- a/src/plugins/intel_cpu/src/graph_context.cpp +++ b/src/plugins/intel_cpu/src/graph_context.cpp @@ -1,7 +1,6 @@ // Copyright (C) 2018-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // -#include "dnnl_types.h" #include "graph_context.h" #include "nodes/memory.hpp" #include "memory_control.hpp" @@ -12,6 +11,8 @@ namespace intel_cpu { GraphContext::GraphContext(const Config& config, WeightsSharing::Ptr w_cache, bool isGraphQuantized, + MemoryControl* memoryControl, + std::shared_ptr networkMemoryControl, ov::threading::IStreamsExecutor::Ptr 
streamExecutor, std::shared_ptr sub_memory_manager) : config(config), @@ -20,7 +21,8 @@ GraphContext::GraphContext(const Config& config, streamExecutor(streamExecutor), subMemoryManager(sub_memory_manager), memoryStatesRegister(std::make_shared()), - networkMemoryControl(std::make_shared()) { + memoryControl(memoryControl), + networkMemoryControl(networkMemoryControl) { rtParamsCache = std::make_shared(config.rtCacheCapacity); // primitive/executors can be shared across sub-stream // but scratch pad cannot be shared. diff --git a/src/plugins/intel_cpu/src/graph_context.h b/src/plugins/intel_cpu/src/graph_context.h index db2b126213978c..15f947d17788a9 100644 --- a/src/plugins/intel_cpu/src/graph_context.h +++ b/src/plugins/intel_cpu/src/graph_context.h @@ -18,6 +18,7 @@ namespace node { class MemoryStatesRegister; } // namespace node +class MemoryControl; class NetworkMemoryControl; class GraphContext { @@ -28,6 +29,8 @@ class GraphContext { GraphContext(const Config& config, WeightsSharing::Ptr w_cache, bool isGraphQuantized, + MemoryControl* memoryControl, + std::shared_ptr networkMemoryControl, // obsolete in favor of local memoryControl ov::threading::IStreamsExecutor::Ptr streamExecutor = nullptr, std::shared_ptr sub_memory_manager = nullptr); @@ -78,6 +81,10 @@ class GraphContext { return memoryStatesRegister; } + MemoryControl* getMemoryControl() const { + return memoryControl; + } + const std::shared_ptr& getNetworkMemoryControl() const { return networkMemoryControl; } @@ -103,6 +110,10 @@ class GraphContext { int numNumaNodes = 1; std::shared_ptr memoryStatesRegister; + MemoryControl* memoryControl; + // to be removed in favor of local memoryControl + // currently required for the nodes with inner graphs which + // do not participate in global memory reuse std::shared_ptr networkMemoryControl; }; diff --git a/src/plugins/intel_cpu/src/graph_dumper.cpp b/src/plugins/intel_cpu/src/graph_dumper.cpp index fab6e99dcf2550..6bcc46153cbf79 100644 --- a/src/plugins/intel_cpu/src/graph_dumper.cpp +++ b/src/plugins/intel_cpu/src/graph_dumper.cpp @@ -243,7 +243,6 @@ void serializeToXML(const Graph &graph, const std::string& path) { void serializeToCout(const Graph &graph) { for (const auto& node : graph.GetNodes()) { - std::cout << "name: " << node->getName() << " [ "; auto nodeDesc = node->getSelectedPrimitiveDescriptor(); if (nodeDesc) { auto& inConfs = nodeDesc->getConfig().inConfs; diff --git a/src/plugins/intel_cpu/src/infer_request.cpp b/src/plugins/intel_cpu/src/infer_request.cpp index f0b817dcda859c..2110d837ab7bc9 100644 --- a/src/plugins/intel_cpu/src/infer_request.cpp +++ b/src/plugins/intel_cpu/src/infer_request.cpp @@ -19,6 +19,7 @@ #include "utils/general_utils.h" #include "utils/ngraph_utils.hpp" #include "openvino/runtime/threading/cpu_message.hpp" +#include "memory_control.hpp" using OvString = ov::element_type_traits::value_type; @@ -135,6 +136,15 @@ void SyncInferRequest::infer() { push_input_data(); + MemoryControl* network_memory_control = m_graph->getGraphContext()->getMemoryControl(); + if (!network_memory_control) { + OPENVINO_THROW("Memory control unit is not initilized for graph: ", m_graph->GetName()); + } + + if (!network_memory_control->allocated()) { + network_memory_control->allocateMemory(); + } + m_graph->Infer(this); throw_if_canceled(); diff --git a/src/plugins/intel_cpu/src/memory_control.cpp b/src/plugins/intel_cpu/src/memory_control.cpp index 0f202c296891c1..9496493ffb7e9f 100644 --- a/src/plugins/intel_cpu/src/memory_control.cpp +++ 
b/src/plugins/intel_cpu/src/memory_control.cpp @@ -4,10 +4,15 @@ #include "memory_control.hpp" +#include +#include #include +#include "edge.h" #include "node.h" #include "openvino/runtime/memory_solver.hpp" +#include "proxy_mem_blk.h" +#include "utils/general_utils.h" namespace ov { namespace intel_cpu { @@ -84,7 +89,7 @@ class MemoryBlockWithRelease : public IMemoryBlockObserver { class IMemoryManager { public: virtual ~IMemoryManager() = default; - virtual void insert(const MemoryRegion& reg) = 0; + virtual void insert(const MemoryRegion& reg, const std::vector& syncInds) = 0; virtual const MemoryControl::MemoryBlockMap& lastSolution() = 0; virtual void allocate() = 0; virtual void release() = 0; @@ -99,7 +104,8 @@ std::shared_ptr makeDnnlMemoryBlock(Args&&... args) { class MemoryManagerIO : public IMemoryManager { public: - void insert(const MemoryRegion& reg) override { + void insert(const MemoryRegion& reg, const std::vector& syncInds) override { + (void) syncInds; m_blocks.insert({reg.id, makeDnnlMemoryBlock()}); } @@ -120,7 +126,8 @@ class MemoryManagerIO : public IMemoryManager { class MemoryManagerStatic : public IMemoryManager { public: - void insert(const MemoryRegion& reg) override { + void insert(const MemoryRegion& reg, const std::vector& syncInds) override { + (void) syncInds; m_boxes.emplace_back(MemorySolver::Box{reg.start, reg.finish, reg.size, reg.id}); } @@ -167,19 +174,18 @@ class MemoryManagerStatic : public IMemoryManager { class MemoryManageNonOverlapingSets : public IMemoryManager { public: - MemoryManageNonOverlapingSets(std::vector syncInds) : m_syncInds(std::move(syncInds)) {} - void insert(const MemoryRegion& reg) override { + void insert(const MemoryRegion& reg, const std::vector& syncInds) override { MemorySolver::Box box = {reg.start, reg.finish, reg.size, reg.id}; if (-1 != reg.finish) { //We have to extend the lifespan of tensors that are crossing a sync point border in order to save //the intermediate computation results from possible loss due to the tensor resize auto itr_upper = - std::upper_bound(m_syncInds.begin(), m_syncInds.end(), box.finish, [](int y, int x) { + std::upper_bound(syncInds.begin(), syncInds.end(), box.finish, [](int y, int x) { return y <= x; }); - auto itr_lower = std::lower_bound(m_syncInds.begin(), m_syncInds.end(), box.start); + auto itr_lower = std::lower_bound(syncInds.begin(), syncInds.end(), box.start); if (itr_lower != itr_upper) { // across sections - if (itr_upper == m_syncInds.end()) { + if (itr_upper == syncInds.end()) { box.finish = -1; } else { box.finish = *itr_upper; @@ -242,7 +248,6 @@ class MemoryManageNonOverlapingSets : public IMemoryManager { std::unordered_map> m_internalBlocks; std::vector m_boxes; - std::vector m_syncInds; }; } // namespace @@ -256,12 +261,12 @@ class MemoryControl::RegionHandler { : m_cond(std::move(cond)), m_memManager(std::move(memManager)) {} - bool insert(const MemoryRegion& reg) { + bool insert(const MemoryRegion& reg, const std::vector& syncInds) { if (!m_cond(reg)) { return false; } - m_memManager->insert(reg); + m_memManager->insert(reg, syncInds); return true; } @@ -292,9 +297,8 @@ MemoryControl::RegionHandlerPtr buildHandler(F&& f, Args&&... 
args) { } // namespace -MemoryControl::MemoryControl(std::vector syncInds) { +MemoryControl::MemoryControl() { // init handlers - // handler for dynamic tensors m_handlers.emplace_back(buildHandler([](const MemoryRegion& reg) { if (reg.size < 0 || MemoryRegion::RegionType::VARIABLE != reg.type || @@ -311,7 +315,7 @@ MemoryControl::MemoryControl(std::vector syncInds) { return false; } return true; - }, std::move(syncInds))); + })); //handler for I/O tensors, so far simply individual blocks m_handlers.emplace_back(buildHandler([](const MemoryRegion& reg) { @@ -322,22 +326,24 @@ MemoryControl::MemoryControl(std::vector syncInds) { })); } -void MemoryControl::insert(const MemoryRegion& region) { +void MemoryControl::insert(const MemoryRegion& region, const std::vector& syncInds) { for (auto&& handler : m_handlers) { - if (handler->insert(region)) { + if (handler->insert(region, syncInds)) { return; } } OPENVINO_THROW("No suitable hanlder was found for the given memory region"); } -MemoryControl::MemoryBlockMap MemoryControl::insert(const std::vector& regions) { +void MemoryControl::insert(const std::vector& regions, + const std::vector& syncInds) { for (auto&& region : regions) { - insert(region); + insert(region, syncInds); } +} +MemoryControl::MemoryBlockMap MemoryControl::solve() { MemoryControl::MemoryBlockMap blocksMap; - blocksMap.reserve(regions.size()); for (auto&& handler : m_handlers) { auto&& solution = handler->lastSolution(); @@ -364,52 +370,186 @@ void MemoryControl::releaseMemory() { m_allocated = false; } -edgeClusters MemoryControl::findEdgeClusters(const std::vector& graphEdges) { - typedef std::unordered_map edge_cluster_idx_map_t; - - edgeClusters edge_clusters; - edge_cluster_idx_map_t edge_cluster_indices; +// /** +// * Forms clusters of edges. +// * An edge cluster is a collection of edges, so: +// * - base edge is an edge with a Memory which other edges point to by means of inplace logic +// * - first edge of a cluster is a base edge with a status either NeedAllocation or Allocated +// * - rest of the edges in a cluster are NotAllocated ones, since they point to their base edge +// */ +// EdgeClusters MemoryControl::flattenEdgeClusters(const EdgeClusters& clusters) { +// } + +/** + * Forms clusters of edges. 
+ * An edge cluster is a collection of edges, so: + * - base edge is an edge with a Memory which other edges point to by means of inplace logic + * - first edge of a cluster is a base edge with a status either NeedAllocation or Allocated + * - rest of the edges in a cluster are NotAllocated ones, since they point to their base edge + */ +EdgeClusters MemoryControl::formEdgeClusters(const std::vector& graphEdges) { + typedef std::unordered_map EdgeClusterIdxMap; + EdgeClusters edgeClusters; + EdgeClusterIdxMap edgeClusterIndices; for (auto& edge : graphEdges) { - auto edge_it = edge_cluster_indices.find(edge); - if (edge_it != edge_cluster_indices.end()) - continue; // edge is visited + if (edgeClusterIndices.count(edge)) + continue; // edge is visited - size_t cluster_idx = edge_clusters.size(); - EdgePtr last_shared_edge = nullptr; + size_t clusterIdx = edgeClusters.size(); + EdgePtr lastSharedEdge = nullptr; // find cluster index for (auto shared_edge = edge->getSharedEdge(std::nothrow); shared_edge; shared_edge = shared_edge->getSharedEdge(std::nothrow)) { - auto shared_edge_it = edge_cluster_indices.find(shared_edge); - if (shared_edge_it != edge_cluster_indices.end()) { - cluster_idx = shared_edge_it->second; - last_shared_edge = shared_edge; + auto shared_edge_it = edgeClusterIndices.find(shared_edge); + if (shared_edge_it != edgeClusterIndices.end()) { + clusterIdx = shared_edge_it->second; + lastSharedEdge = shared_edge; break; } } - // add shared edges to cluster - edge_cluster_indices.emplace(edge, cluster_idx); + if (clusterIdx == edgeClusters.size()) + edgeClusters.emplace_back(EdgeCluster{edge}); - if (cluster_idx == edge_clusters.size()) - edge_clusters.emplace_back(edgeCluster{edge}); - else - edge_clusters[cluster_idx].emplace(edge); + // use recursive approach to ensure that the base edge is placed as a first entry of a cluster + std::function addToCluster; + addToCluster = [&addToCluster, &edgeClusterIndices, &clusterIdx, &edgeClusters, &lastSharedEdge](EdgePtr edge) { + if (edge == lastSharedEdge) + return; - for (auto shared_edge = edge->getSharedEdge(std::nothrow); shared_edge != last_shared_edge; - shared_edge = shared_edge->getSharedEdge(std::nothrow)) { - edge_cluster_indices.emplace(shared_edge, cluster_idx); - edge_clusters[cluster_idx].emplace(shared_edge); + addToCluster(edge->getSharedEdge(std::nothrow)); + + edgeClusterIndices.emplace(edge, clusterIdx); + edgeClusters[clusterIdx].push_back(edge); + }; + + addToCluster(edge); + } + + return edgeClusters; +} + +static inline bool isConstOutput(EdgePtr edge) { + return edge->getParent()->isConstant() && !edge->getChild()->isConstant(); +} + +MemoryRegions MemoryControl::formMemoryRegions(const EdgeClusters& clusters, + size_t remaining, + const GlobalExecutionIndex& globalExecIndex) { + // Markup the memory regions + MemoryRegions memoryRegions; + memoryRegions.reserve(remaining); + + for (size_t i = 0; i < remaining; ++i) { + MemoryRegion reg = {std::numeric_limits::max(), + 0, + 0, + static_cast(i), + MemoryRegion::RegionType::VARIABLE, + MemoryRegion::AllocType::UNKNOWN}; + + int64_t boxSize = 0; + bool isConst = false, isOutput = false, isInput = false; + // std::cout << "Form memory region for cluster: " << i << "\n"; + for (auto &edge : clusters[i]) { + const auto& parent = edge->getParent(); + const auto& child = edge->getChild(); + + // std::cout << "[" << globalExecIndex.at(parent).second << " - " << globalExecIndex.at(child).first << "]" + // << edge->name() + // << "\n"; + + int e_start = 
globalExecIndex.at(parent).second; + int e_finish = globalExecIndex.at(child).first; + // int e_finish = edge->getChild()->getExecIndex(); + + auto&& desc = edge->getDesc(); + + if (boxSize != -1 && desc.isDefined()) { + int64_t e_size = desc.getCurrentMemSize(); // size in bytes (from the beginning of data to the last element) + boxSize = std::max(e_size, boxSize); + } else { + boxSize = -1; + } + + reg.start = std::min(e_start, reg.start); + reg.finish = std::max(e_finish, reg.finish); + + auto allocType = + desc.getPrecision() == element::string ? MemoryRegion::AllocType::STRING : MemoryRegion::AllocType::POD; + + if (reg.alloc_type != allocType && MemoryRegion::AllocType::UNKNOWN != reg.alloc_type) { + OPENVINO_THROW("Different allocation types in the same memory region"); + } + reg.alloc_type = allocType; + + isConst |= isConstOutput(edge); + isOutput |= child->getType() == Type::Output; + isInput |= parent->getType() == Type::Input; + } + + reg.size = boxSize; + + if (isConst) { + reg.type = MemoryRegion::RegionType::CONSTANT; + } else if (isInput) { + if (isOutput) { + reg.type = MemoryRegion::RegionType::IO; + } else { + reg.type = MemoryRegion::RegionType::INPUT; + } + } else if (isOutput) { + reg.type = MemoryRegion::RegionType::OUTPUT; } + + memoryRegions.push_back(reg); } - return edge_clusters; + return memoryRegions; +} + +OutputMemoryBlocks MemoryControl::filterOutDynamicOutputEdges(MemoryRegions& memoryRegions, + const EdgeClusters& clusters, + const std::map& outputNodes) { + OutputMemoryBlocks outputMemBlocks; + memoryRegions.erase(std::remove_if(memoryRegions.begin(), memoryRegions.end(), [&](const MemoryRegion& region) { + if (region.size >= 0 || !one_of(region.type, MemoryRegion::RegionType::OUTPUT, MemoryRegion::RegionType::IO)) { + return false; + } + bool result = false; + for (auto& edge : clusters[region.id]) { + auto child = edge->getChild(); + if (child->getType() == Type::Output && edge->getStatus() == Edge::Status::NeedAllocation) { + auto proxyMemBlock = std::make_shared(); + DEBUG_LOG("ProxyMemoryBlock ", proxyMemBlock); + // std::cout << "Allocating output edge: " << edge->name() << "\n"; + edge->allocate(proxyMemBlock); + + // Store the output memory blocks. + // So that, the infer requests can be able to access them. + int count = 0; + for (auto& output : outputNodes) { + if (output.second == child) { + outputMemBlocks[output.first] = proxyMemBlock; + count++; + } + } + // sometimes there are unused output ports. + OPENVINO_ASSERT(count <= 1, "CPU plugin cannot find output node. 
count ", count); + result = true; + } + } + return result; + }), memoryRegions.end()); + + return outputMemBlocks; } -MemoryControl& NetworkMemoryControl::createMemoryControlUnit(std::vector syncInds) { - m_controlUnits.emplace_back(std::unique_ptr(new MemoryControl(syncInds))); - return *(m_controlUnits.back()); +MemoryControl* NetworkMemoryControl::createMemoryControlUnit() { + m_controlUnits.emplace_back(std::unique_ptr(new MemoryControl())); + return m_controlUnits.back().get(); } void NetworkMemoryControl::allocateMemory() { @@ -425,4 +565,4 @@ void NetworkMemoryControl::releaseMemory() { } } // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_control.hpp b/src/plugins/intel_cpu/src/memory_control.hpp index ce4dc90890f3fa..fc38cf8df2ccb0 100644 --- a/src/plugins/intel_cpu/src/memory_control.hpp +++ b/src/plugins/intel_cpu/src/memory_control.hpp @@ -5,22 +5,15 @@ #pragma once #include "edge.h" +#include "graph.h" +#include "node.h" +#include "proxy_mem_blk.h" namespace ov { namespace intel_cpu { -using edgeCluster = std::unordered_set; -using edgeClusters = std::vector; - -struct MemoryRegion { - int start; // Execution order index of first use. - int finish; // Execution order index of last use. -1 means inf - int64_t size; // size in bytes - int64_t id; // ID unique for each region - - enum class RegionType : uint8_t { VARIABLE, CONSTANT, INPUT, OUTPUT, IO } type; - enum class AllocType : uint8_t { POD, STRING, UNKNOWN } alloc_type; -}; +using EdgeCluster = std::vector; +using EdgeClusters = std::vector; class MemoryControl { public: @@ -30,9 +23,16 @@ class MemoryControl { using MemoryBlockMap = std::unordered_map; public: - static edgeClusters findEdgeClusters(const std::vector& graphEdges); + static EdgeClusters formEdgeClusters(const std::vector& graphEdges); + static MemoryRegions formMemoryRegions(const EdgeClusters& clusters, size_t remaining, const GlobalExecutionIndex& globalExecIndex); + static OutputMemoryBlocks filterOutDynamicOutputEdges(MemoryRegions& memoryRegions, + const EdgeClusters& clusters, + const std::map& outputNodes); + + void insert(const MemoryRegions& regions, + const std::vector& syncInds); - MemoryBlockMap insert(const std::vector& regions); + MemoryBlockMap solve(); bool allocated() const { return m_allocated; @@ -42,13 +42,12 @@ class MemoryControl { void releaseMemory(); private: - explicit MemoryControl(std::vector syncInds); - void insert(const MemoryRegion& region); + explicit MemoryControl(); + void insert(const MemoryRegion& region, const std::vector& syncInds); friend class NetworkMemoryControl; private: - std::vector m_syncInds; std::vector m_handlers; bool m_allocated = false; }; @@ -56,7 +55,8 @@ class MemoryControl { class NetworkMemoryControl { public: NetworkMemoryControl() = default; - MemoryControl& createMemoryControlUnit(std::vector syncInds); + // @todo return std::reference_wrapper instead? 
+ MemoryControl* createMemoryControlUnit(); void allocateMemory(); void releaseMemory(); @@ -69,4 +69,4 @@ class NetworkMemoryControl { }; } // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp index 7c23d55fc4147a..2c592a26bac7c9 100644 --- a/src/plugins/intel_cpu/src/node.cpp +++ b/src/plugins/intel_cpu/src/node.cpp @@ -1152,6 +1152,10 @@ bool Node::isConstant() { return getConstantType() == ConstantType::Const; } +bool Node::isConstantInput() { + return isConstant() && getType() == Type::Input; +} + void Node::updateConstantType() { if (constant == ConstantType::StrictNoConst) return; diff --git a/src/plugins/intel_cpu/src/node.h b/src/plugins/intel_cpu/src/node.h index 948bd6999ce27a..60304d504fb461 100644 --- a/src/plugins/intel_cpu/src/node.h +++ b/src/plugins/intel_cpu/src/node.h @@ -6,6 +6,7 @@ #include #include +#include "allocation_context.hpp" #include "cpu_memory.h" #include "cpu_shape.h" #include "cpu_types.h" @@ -43,6 +44,7 @@ using NodePtr = std::shared_ptr; using NodeConstPtr = std::shared_ptr; using NodeWeakPtr = std::weak_ptr; + class PortConfigurator { public: PortConfigurator(ov::intel_cpu::LayoutType blockedDescType, ov::element::Type prc, const Shape& shape, @@ -111,6 +113,34 @@ class NodeDesc { executorFactory = factory; } + bool hasZeroInputDims() const { + const auto& inputConfigs = getConfig().inConfs; + + return std::any_of(inputConfigs.begin(), inputConfigs.end(), [](const PortConfig& portConfig) { + return portConfig.hasZeroDims(); + }); + } + + bool hasZeroInputDimsAtPort(size_t portIdx) const { + const auto& inputConfigs = getConfig().inConfs; + OPENVINO_ASSERT(portIdx < inputConfigs.size(), "Attempt to get NodeDesc input configuration for port ", portIdx, ". Number of inputs is ", inputConfigs.size()); + return inputConfigs[portIdx].hasZeroDims(); + } + + bool hasZeroOutputDims() const { + const auto& outputConfigs = getConfig().outConfs; + + return std::any_of(outputConfigs.begin(), outputConfigs.end(), [](const PortConfig& portConfig) { + return portConfig.hasZeroDims(); + }); + } + + bool hasZeroOutputDimsAtPort(size_t portIdx) const { + const auto& outputConfigs = getConfig().outConfs; + OPENVINO_ASSERT(portIdx < outputConfigs.size(), "Attempt to get NodeDesc output configuration for port ", portIdx, ". 
Number of outputs is ", outputConfigs.size()); + return outputConfigs[portIdx].hasZeroDims(); + } + private: NodeConfig config; impl_desc_type implementationType; @@ -265,6 +295,9 @@ class Node { bool isInPlace() const; + virtual bool canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDims(); + } // must be called only after Graph::ResolveEdgeConflicts() virtual bool isExecutable() const { return !hasEmptyInputTensors(); @@ -278,6 +311,7 @@ class Node { ConstantType getConstantType() const; void updateConstantType(); bool isConstant(); + bool isConstantInput(); // return type int supports return -1 in overloading when channel axis doesn't exist virtual int getFusingAxis() const { @@ -483,6 +517,11 @@ class Node { return execIndex; } + virtual int registerToAllocationContext(int offset, AllocationContext& context) { + (void) context; + return offset + 1; + } + const std::string & getTypeStr() const { return typeStr; } diff --git a/src/plugins/intel_cpu/src/nodes/batch_to_space.h b/src/plugins/intel_cpu/src/nodes/batch_to_space.h index 1b583f74bd7905..db4d7985b6e322 100644 --- a/src/plugins/intel_cpu/src/nodes/batch_to_space.h +++ b/src/plugins/intel_cpu/src/nodes/batch_to_space.h @@ -17,6 +17,11 @@ class BatchToSpace : public Node { void getSupportedDescriptors() override {}; void initSupportedPrimitiveDescriptors() override; + bool canBeSkipped() const override { + const auto& spd = getSelectedPrimitiveDescriptor(); + return spd->hasZeroInputDims() || spd->hasZeroOutputDims(); + } + // output shape can potentially be empty bool isExecutable() const override { return !hasEmptyInputTensors() && !hasEmptyOutputTensors(); diff --git a/src/plugins/intel_cpu/src/nodes/broadcast.cpp b/src/plugins/intel_cpu/src/nodes/broadcast.cpp index ac8dd814ae9961..6d2045e473f9a9 100644 --- a/src/plugins/intel_cpu/src/nodes/broadcast.cpp +++ b/src/plugins/intel_cpu/src/nodes/broadcast.cpp @@ -180,6 +180,10 @@ bool Broadcast::needShapeInfer() const { return false; } +bool Broadcast::canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0); +} + bool Broadcast::isExecutable() const { return !isInputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/broadcast.h b/src/plugins/intel_cpu/src/nodes/broadcast.h index 1435314ee08776..15dd9c70297b6b 100644 --- a/src/plugins/intel_cpu/src/nodes/broadcast.h +++ b/src/plugins/intel_cpu/src/nodes/broadcast.h @@ -24,6 +24,7 @@ class Broadcast : public Node, public TileBroadcastCommon { void executeDynamicImpl(dnnl::stream strm) override; bool created() const override; + bool canBeSkipped() const override; bool isExecutable() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; diff --git a/src/plugins/intel_cpu/src/nodes/bucketize.cpp b/src/plugins/intel_cpu/src/nodes/bucketize.cpp index a71255c0d531e4..205d3afc8491af 100644 --- a/src/plugins/intel_cpu/src/nodes/bucketize.cpp +++ b/src/plugins/intel_cpu/src/nodes/bucketize.cpp @@ -216,6 +216,10 @@ void Bucketize::prepareParams() { std::accumulate(input_tensor_dims.begin(), input_tensor_dims.end(), size_t(1), std::multiplies()); } +bool Bucketize::canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0); +} + bool Bucketize::isExecutable() const { return !isInputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/bucketize.h b/src/plugins/intel_cpu/src/nodes/bucketize.h index c834921a38ce54..cd62a43a78eeed 100644 --- 
a/src/plugins/intel_cpu/src/nodes/bucketize.h +++ b/src/plugins/intel_cpu/src/nodes/bucketize.h @@ -24,6 +24,7 @@ class Bucketize : public Node { void prepareParams() override; + bool canBeSkipped() const override; bool isExecutable() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; diff --git a/src/plugins/intel_cpu/src/nodes/composite.cpp b/src/plugins/intel_cpu/src/nodes/composite.cpp index a1ceabd6942db1..d2d8ee66ad6323 100644 --- a/src/plugins/intel_cpu/src/nodes/composite.cpp +++ b/src/plugins/intel_cpu/src/nodes/composite.cpp @@ -4,6 +4,7 @@ #include "composite.h" +#include "compiled_model.h" #include "nodes/input.h" #include "cpu_memory.h" #include "transformations/cpu_opset/common/op/submodel.hpp" @@ -75,23 +76,46 @@ void Composite::selectOptimalPrimitiveDescriptor() { // @todo add ascii diagramm for memory mapping / reuse void Composite::createPrimitive() { - OPENVINO_ASSERT(getOriginalInputsNumber() == m_graph.GetInputNodesMap().size(), - "Number of node inputs must be equal the number of inner graph's inputs"); + // OPENVINO_ASSERT(getOriginalInputsNumber() == m_graph.GetInputNodesMap().size(), + // "Number of node inputs must be equal the number of inner graph's inputs"); - std::vector inputMemory; - for (size_t i = 0; i < getOriginalInputsNumber(); i++) { - inputMemory.emplace_back(getSrcMemoryAtPort(i)); - } + // std::vector inputMemory; + // for (size_t i = 0; i < getOriginalInputsNumber(); i++) { + // inputMemory.emplace_back(getSrcMemoryAtPort(i)); + // } - OPENVINO_ASSERT(getOriginalOutputsNumber() == m_graph.GetOutputNodesMap().size(), - "Number of node outputs must be equal the number of inner graph's outputs"); + // OPENVINO_ASSERT(getOriginalOutputsNumber() == m_graph.GetOutputNodesMap().size(), + // "Number of node outputs must be equal the number of inner graph's outputs"); - std::vector outputMemory; - for (size_t i = 0; i < getOriginalOutputsNumber(); i++) { - outputMemory.emplace_back(getDstMemoryAtPort(i)); + // std::vector outputMemory; + // for (size_t i = 0; i < getOriginalOutputsNumber(); i++) { + // outputMemory.emplace_back(getDstMemoryAtPort(i)); + // } + + // m_graph.Activate(inputMemory, outputMemory); + m_graph.Activate({}, {}, true); +} + +int Composite::registerToAllocationContext(int offset, AllocationContext& context) { + for (size_t i = 0; i < getParentEdges().size(); i++) { + auto parentEdge = getParentEdgeAt(i); + auto inputEdges = m_graph.GetInputNodesMap().at(i)->getChildEdgesAtPort(0); + for (const auto& inputEdge : inputEdges) { + OPENVINO_ASSERT(inputEdge->getStatus() == Edge::Status::Uninitialized, + "Expected Uninitialized state for edge: ", inputEdge->name()); + inputEdge->sharedMemFrom(parentEdge); + } + } + + for (size_t i = 0; i < getChildEdges().size(); i++) { + auto childEdge = getChildEdgeAt(i); + auto outputEdge = m_graph.GetOutputNodesMap().at(i)->getParentEdgeAt(0); + OPENVINO_ASSERT(outputEdge->getStatus() == Edge::Status::Uninitialized, + "Expected Uninitialized state for edge: ", outputEdge->name()); + outputEdge->sharedMemFrom(childEdge); } - m_graph.Activate(inputMemory, outputMemory); + return m_graph.RegisterToAllocationContext(offset, context); } void Composite::execute(dnnl::stream) { diff --git a/src/plugins/intel_cpu/src/nodes/composite.h b/src/plugins/intel_cpu/src/nodes/composite.h index 9f18a2ba68b769..816aa97b2aa5cc 100644 --- a/src/plugins/intel_cpu/src/nodes/composite.h +++ b/src/plugins/intel_cpu/src/nodes/composite.h @@ -4,7 +4,9 @@ #pragma once 
+#include #include +#include #include "graph.h" #include "node.h" @@ -31,6 +33,10 @@ class Composite : public Node { return false; } + bool canBeSkipped() const override { + return false; + } + bool isExecutable() const override { return true; } @@ -41,6 +47,8 @@ class Composite : public Node { void execute(dnnl::stream) override; void executeDynamicImpl(dnnl::stream strm) override; + int registerToAllocationContext(int offset, AllocationContext& context) override; + const Graph& graph() const { return m_graph; } diff --git a/src/plugins/intel_cpu/src/nodes/concat.cpp b/src/plugins/intel_cpu/src/nodes/concat.cpp index 576361de7e692b..3385d958937e89 100644 --- a/src/plugins/intel_cpu/src/nodes/concat.cpp +++ b/src/plugins/intel_cpu/src/nodes/concat.cpp @@ -29,6 +29,10 @@ namespace { constexpr size_t channelAxis = 1lu; } +bool Concat::canBeSkipped() const { + return isInPlace() || getSelectedPrimitiveDescriptor()->hasZeroOutputDims(); +} + bool Concat::isExecutable() const { return !isInPlace() && !hasEmptyOutputTensors(); } diff --git a/src/plugins/intel_cpu/src/nodes/concat.h b/src/plugins/intel_cpu/src/nodes/concat.h index 9ed331bee4f16d..5dfc4f11fadbb1 100644 --- a/src/plugins/intel_cpu/src/nodes/concat.h +++ b/src/plugins/intel_cpu/src/nodes/concat.h @@ -27,6 +27,7 @@ class Concat : public Node { ov::element::Type getRuntimePrecision() const override; + bool canBeSkipped() const override; bool isExecutable() const override; bool needPrepareParams() const override; void prepareParams() override; diff --git a/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.cpp b/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.cpp index dc69892dabb2e4..e82146ad3a22b0 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.cpp +++ b/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.cpp @@ -140,6 +140,10 @@ void EmbeddingBagOffset::executeDynamicImpl(dnnl::stream strm) { execute(strm); } +bool EmbeddingBagOffset::canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0); +} + bool EmbeddingBagOffset::isExecutable() const { return !isInputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.h b/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.h index a31b518e7891a9..000cc86cce25c3 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.h +++ b/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.h @@ -20,6 +20,7 @@ class EmbeddingBagOffset : public Node, public EmbeddingBag { void execute(dnnl::stream strm) override; bool created() const override; + bool canBeSkipped() const override; bool isExecutable() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; diff --git a/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.cpp b/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.cpp index 0b490a28a81487..a9465f909fd37b 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.cpp +++ b/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.cpp @@ -107,6 +107,10 @@ void EmbeddingBagPacked::executeDynamicImpl(dnnl::stream strm) { execute(strm); } +bool EmbeddingBagPacked::canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0); +} + bool EmbeddingBagPacked::isExecutable() const { return !isInputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.h b/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.h index 6a9d33fe3afccb..98f8e117639f82 100644 --- 
a/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.h +++ b/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.h @@ -21,6 +21,7 @@ class EmbeddingBagPacked : public Node, public EmbeddingBag { bool created() const override; bool isExecutable() const override; + bool canBeSkipped() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; protected: diff --git a/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.cpp b/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.cpp index 1b2e28e6039543..55159f71b6a6eb 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.cpp +++ b/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.cpp @@ -138,6 +138,10 @@ void EmbeddingSegmentsSum::executeDynamicImpl(dnnl::stream strm) { execute(strm); } +bool EmbeddingSegmentsSum::canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0); +} + bool EmbeddingSegmentsSum::isExecutable() const { return !isInputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.h b/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.h index bb312b4dd47246..60fa504e326767 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.h +++ b/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.h @@ -20,6 +20,7 @@ class EmbeddingSegmentsSum : public Node, public EmbeddingBag { void execute(dnnl::stream strm) override; bool created() const override; + bool canBeSkipped() const override; bool isExecutable() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; diff --git a/src/plugins/intel_cpu/src/nodes/gather.cpp b/src/plugins/intel_cpu/src/nodes/gather.cpp index d2629fe8fe6811..cebf57421982e7 100644 --- a/src/plugins/intel_cpu/src/nodes/gather.cpp +++ b/src/plugins/intel_cpu/src/nodes/gather.cpp @@ -923,6 +923,10 @@ bool Gather::created() const { return getType() == Type::Gather; } +bool Gather::canBeSkipped() const { + return isInPlace() || Node::canBeSkipped(); +} + bool Gather::isExecutable() const { return !isInPlace() && Node::isExecutable(); } diff --git a/src/plugins/intel_cpu/src/nodes/gather.h b/src/plugins/intel_cpu/src/nodes/gather.h index 6ee097e9a1fbab..99a22df010caf5 100644 --- a/src/plugins/intel_cpu/src/nodes/gather.h +++ b/src/plugins/intel_cpu/src/nodes/gather.h @@ -24,6 +24,7 @@ class Gather : public Node { void createPrimitive() override; void execute(dnnl::stream strm) override; bool created() const override; + bool canBeSkipped() const override; bool isExecutable() const override; void resolveInPlaceEdges(Edge::LOOK look) override; diff --git a/src/plugins/intel_cpu/src/nodes/if.h b/src/plugins/intel_cpu/src/nodes/if.h index f858c92b0b2651..3c279f028754b8 100644 --- a/src/plugins/intel_cpu/src/nodes/if.h +++ b/src/plugins/intel_cpu/src/nodes/if.h @@ -25,6 +25,7 @@ class If : public Node { void createPrimitive() override; bool created() const override; void execute(dnnl::stream strm) override; + bool canBeSkipped() const override { return false; } bool isExecutable() const override { return true; } protected: diff --git a/src/plugins/intel_cpu/src/nodes/input.cpp b/src/plugins/intel_cpu/src/nodes/input.cpp index 1f650bd8c5de17..b0d80700240fc0 100644 --- a/src/plugins/intel_cpu/src/nodes/input.cpp +++ b/src/plugins/intel_cpu/src/nodes/input.cpp @@ -543,6 +543,34 @@ void Input::initSupportedPdFromMemDesc() { supportedPrimitiveDescriptors.emplace_back(std::move(config), 
impl_desc_type::unknown); } +void Input::resolveInPlaceEdges(Edge::LOOK look) { + if (look & Edge::LOOK_UP) { + auto edges = getChildEdgesAtPort(0); + for (const auto& edge : edges) { + EdgePtr sharedEdge = edge; + + while (sharedEdge->getSharedEdge(std::nothrow)) { + sharedEdge = sharedEdge->getSharedEdge(std::nothrow); + } + + edge->reuse(sharedEdge->getMemoryPtr()); + } + } + + if (look & Edge::LOOK_DOWN) { + for (size_t i = 0; i < getParentEdges().size(); i++) { + auto edge = getParentEdgeAt(i); + EdgePtr sharedEdge = edge; + + while (sharedEdge->getSharedEdge(std::nothrow)) { + sharedEdge = sharedEdge->getSharedEdge(std::nothrow); + } + + edge->reuse(sharedEdge->getMemoryPtr()); + } + } +} + } // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/input.h b/src/plugins/intel_cpu/src/nodes/input.h index 4d7febb17ad4b7..f190a99d2a7530 100644 --- a/src/plugins/intel_cpu/src/nodes/input.h +++ b/src/plugins/intel_cpu/src/nodes/input.h @@ -56,15 +56,16 @@ class Input : public Node { void selectOptimalPrimitiveDescriptor() override; void createPrimitive() override; bool created() const override; + void resolveInPlaceEdges(Edge::LOOK look) override; void withMeanImage(); MemoryCPtr getMemoryPtr() const; void execute(dnnl::stream strm) override {} void executeDynamicImpl(dnnl::stream strm) override {} - bool isExecutable() const override { - return false; - } + + bool canBeSkipped() const override { return true; } + bool isExecutable() const override { return false; } bool needShapeInfer() const override { return false; } bool needPrepareParams() const override { return false; } diff --git a/src/plugins/intel_cpu/src/nodes/interaction.cpp b/src/plugins/intel_cpu/src/nodes/interaction.cpp index 6f604f4a9e278a..5a49675a77e2e1 100644 --- a/src/plugins/intel_cpu/src/nodes/interaction.cpp +++ b/src/plugins/intel_cpu/src/nodes/interaction.cpp @@ -356,6 +356,10 @@ void Interaction::executeDynamicImpl(dnnl::stream strm) { execute(strm); } +bool Interaction::canBeSkipped() const { + return false; +} + bool Interaction::isExecutable() const { return true; } diff --git a/src/plugins/intel_cpu/src/nodes/interaction.h b/src/plugins/intel_cpu/src/nodes/interaction.h index 448484a2512dd1..978f9785f8ee81 100644 --- a/src/plugins/intel_cpu/src/nodes/interaction.h +++ b/src/plugins/intel_cpu/src/nodes/interaction.h @@ -50,6 +50,7 @@ class Interaction : public Node { static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + bool canBeSkipped() const override; bool isExecutable() const override; void executeDynamicImpl(dnnl::stream strm) override; void prepareParams() override; diff --git a/src/plugins/intel_cpu/src/nodes/lora.cpp b/src/plugins/intel_cpu/src/nodes/lora.cpp index 52509cdfc44a13..460b909206f150 100644 --- a/src/plugins/intel_cpu/src/nodes/lora.cpp +++ b/src/plugins/intel_cpu/src/nodes/lora.cpp @@ -86,24 +86,40 @@ void LoRA::selectOptimalPrimitiveDescriptor() { selectPrimitiveDescriptorByIndex(0); } +int LoRA::registerToAllocationContext(int offset, AllocationContext& context) { + for (size_t i = 0; i < getOriginalInputsNumber(); i++) { + auto parentEdge = getParentEdgeAt(i); + auto inputEdges = m_graph.GetInputNodesMap().at(i)->getChildEdgesAtPort(0); + for (const auto& inputEdge : inputEdges) { + OPENVINO_ASSERT(inputEdge->getStatus() == Edge::Status::Uninitialized, + "Expected Uninitialized Edge instead of: ", static_cast(inputEdge->getStatus())); + inputEdge->sharedMemFrom(parentEdge); + } + } + + for 
(size_t i = 0; i < getOriginalOutputsNumber(); i++) { + auto childEdge = getChildEdgeAt(i); + auto outputEdge = m_graph.GetOutputNodesMap().at(i)->getParentEdgeAt(0); + outputEdge->sharedMemFrom(childEdge); + } + + return m_graph.RegisterToAllocationContext(offset, context); +} + // @todo add ascii diagram for memory mapping / reuse void LoRA::createPrimitive() { CPU_NODE_ASSERT(getOriginalInputsNumber() == m_graph.GetInputNodesMap().size(), "Number of node inputs must be equal the number of inner graph's inputs"); - std::vector inputMemory; - for (size_t i = 0; i < getOriginalInputsNumber(); i++) { - auto srcEdgeMem = getSrcMemoryAtPort(i); - auto mem = std::make_shared(getEngine(), srcEdgeMem->getDescPtr(), srcEdgeMem->getMemoryBlock()); - subgraphMemoryPtrs.push_back(mem); - inputMemory.emplace_back(std::move(mem)); - } - - CPU_NODE_ASSERT(getOriginalOutputsNumber() == m_graph.GetOutputNodesMap().size(), - "Number of node outputs must be equal the number of inner graph's outputs"); + // for (size_t i = 0; i < getOriginalInputsNumber(); i++) { + // const auto& subgraphInputNode = m_graph.GetInputNodesMap().at(i); + // const auto& subgraphInputMemory = subgraphInputNode->getDstMemoryAtPort(0); + // auto mem = std::make_shared(getEngine(), subgraphInputMemory->getDescPtr(), subgraphInputMemory->getMemoryBlock()); + // subgraphMemoryPtrs.push_back(mem); + // // inputMemory.emplace_back(std::move(mem)); + // } - std::vector outputMemory{getDstMemoryAtPort(0)}; - m_graph.Activate(inputMemory, outputMemory); + m_graph.Activate({}, {}, true); } void LoRA::execute(dnnl::stream) { @@ -115,10 +131,10 @@ void LoRA::executeDynamicImpl(dnnl::stream strm) { } void LoRA::prepareParams() { - for (size_t i = 0; i < getOriginalInputsNumber(); i++) { - // since the external and internal descriptors are compatible, we may pass the descriptor - subgraphMemoryPtrs[i]->redefineDesc(getSrcMemoryAtPort(i)->getDescPtr()); - } + // for (size_t i = 0; i < getOriginalInputsNumber(); i++) { + // // since the external and internal descriptors are compatible, we may pass the descriptor + // subgraphMemoryPtrs[i]->redefineDesc(getSrcMemoryAtPort(i)->getDescPtr()); + // } } } // namespace node diff --git a/src/plugins/intel_cpu/src/nodes/lora.h b/src/plugins/intel_cpu/src/nodes/lora.h index 27701daf9034f2..3fd7892040fc14 100644 --- a/src/plugins/intel_cpu/src/nodes/lora.h +++ b/src/plugins/intel_cpu/src/nodes/lora.h @@ -23,6 +23,7 @@ class LoRA : public Node { void getSupportedDescriptors() override{}; void selectOptimalPrimitiveDescriptor() override; + int registerToAllocationContext(int offset, AllocationContext& context) override; void createPrimitive() override; void prepareParams() override; void execute(dnnl::stream) override; diff --git a/src/plugins/intel_cpu/src/nodes/matmul.cpp b/src/plugins/intel_cpu/src/nodes/matmul.cpp index 92d8f356728ed9..088844b0b8575c 100644 --- a/src/plugins/intel_cpu/src/nodes/matmul.cpp +++ b/src/plugins/intel_cpu/src/nodes/matmul.cpp @@ -708,6 +708,10 @@ const std::vector& MatMul::getDefaultImplPriority() { return priorities; } +bool MatMul::canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroOutputDims(); +} + bool MatMul::isExecutable() const { return !hasEmptyOutputTensors(); } diff --git a/src/plugins/intel_cpu/src/nodes/matmul.h b/src/plugins/intel_cpu/src/nodes/matmul.h index 2e487148d0ec0c..eccfc435ee55bc 100644 --- a/src/plugins/intel_cpu/src/nodes/matmul.h +++ b/src/plugins/intel_cpu/src/nodes/matmul.h @@ -43,6 +43,7 @@ class MatMul : public Node { const 
std::vector& getDefaultImplPriority() override; bool canBeExecutedInInt8() const override; + bool canBeSkipped() const override; bool isExecutable() const override; protected: diff --git a/src/plugins/intel_cpu/src/nodes/matrix_nms.cpp b/src/plugins/intel_cpu/src/nodes/matrix_nms.cpp index 354ac92f9272cb..8bc68ad2528e1d 100644 --- a/src/plugins/intel_cpu/src/nodes/matrix_nms.cpp +++ b/src/plugins/intel_cpu/src/nodes/matrix_nms.cpp @@ -287,6 +287,10 @@ void MatrixNms::prepareParams() { } } +bool MatrixNms::canBeSkipped() const { + return !isDynamicNode() && Node::canBeSkipped(); +} + bool MatrixNms::isExecutable() const { return isDynamicNode() || Node::isExecutable(); } diff --git a/src/plugins/intel_cpu/src/nodes/matrix_nms.h b/src/plugins/intel_cpu/src/nodes/matrix_nms.h index 6afa9c09c751c1..853bcfeaf07be8 100644 --- a/src/plugins/intel_cpu/src/nodes/matrix_nms.h +++ b/src/plugins/intel_cpu/src/nodes/matrix_nms.h @@ -29,6 +29,7 @@ class MatrixNms : public Node { static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + bool canBeSkipped() const override; bool isExecutable() const override; void executeDynamicImpl(dnnl::stream strm) override; diff --git a/src/plugins/intel_cpu/src/nodes/memory.cpp b/src/plugins/intel_cpu/src/nodes/memory.cpp index 565597bdcc2a9e..7d355181207edc 100644 --- a/src/plugins/intel_cpu/src/nodes/memory.cpp +++ b/src/plugins/intel_cpu/src/nodes/memory.cpp @@ -220,6 +220,10 @@ void MemoryOutputBase::assignState(MemStatePtr newState) { assignExtMemory(state->output_mem(), state->internal_desc()); } +bool MemoryOutputBase::canBeSkipped() const { + return false; +} + bool MemoryOutputBase::isExecutable() const { return true; } @@ -471,6 +475,10 @@ void MemoryInputBase::deregisterSibling(MemoryOutputBase* node) { if (node == outputNode) { outputNode = nullptr; } } +bool MemoryInputBase::canBeSkipped() const { + return false; +} + bool MemoryInputBase::isExecutable() const { return true; } diff --git a/src/plugins/intel_cpu/src/nodes/memory.hpp b/src/plugins/intel_cpu/src/nodes/memory.hpp index f503a8d58386a5..cedc2aaa394dde 100644 --- a/src/plugins/intel_cpu/src/nodes/memory.hpp +++ b/src/plugins/intel_cpu/src/nodes/memory.hpp @@ -64,6 +64,8 @@ class MemoryOutputBase : public Node, public MemoryNode { void execute(dnnl::stream strm) override final; // NOLINT void executeDynamicImpl(dnnl::stream strm) override final; // NOLINT + + bool canBeSkipped() const override final; // NOLINT bool isExecutable() const override final; // NOLINT void registerInputNode(MemoryInputBase* node); @@ -142,6 +144,7 @@ class MemoryInputBase : public Input, public MemoryStateNode { void executeDynamicImpl(dnnl::stream strm) override final; // NOLINT bool needShapeInfer() const override { return false; } bool needPrepareParams() const override { return false; } + bool canBeSkipped() const override final; // NOLINT bool isExecutable() const override final; // NOLINT void registerOutputNode(MemoryOutputBase* node); diff --git a/src/plugins/intel_cpu/src/nodes/multiclass_nms.cpp b/src/plugins/intel_cpu/src/nodes/multiclass_nms.cpp index e87fd69fd9c004..0470d9b32c60ca 100644 --- a/src/plugins/intel_cpu/src/nodes/multiclass_nms.cpp +++ b/src/plugins/intel_cpu/src/nodes/multiclass_nms.cpp @@ -211,6 +211,10 @@ void MultiClassNms::prepareParams() { m_numBoxOffset.resize(m_numBatches); } +bool MultiClassNms::canBeSkipped() const { + return !isDynamicNode() && Node::canBeSkipped(); +} + bool MultiClassNms::isExecutable() const { return isDynamicNode() || 
Node::isExecutable(); } diff --git a/src/plugins/intel_cpu/src/nodes/multiclass_nms.hpp b/src/plugins/intel_cpu/src/nodes/multiclass_nms.hpp index ea5d166351efb6..a6ade302908124 100644 --- a/src/plugins/intel_cpu/src/nodes/multiclass_nms.hpp +++ b/src/plugins/intel_cpu/src/nodes/multiclass_nms.hpp @@ -27,6 +27,7 @@ class MultiClassNms : public Node { static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + bool canBeSkipped() const override; bool isExecutable() const override; void executeDynamicImpl(dnnl::stream strm) override; diff --git a/src/plugins/intel_cpu/src/nodes/multinomial.cpp b/src/plugins/intel_cpu/src/nodes/multinomial.cpp index 24958b4e2b980d..e3da27b83f34e1 100644 --- a/src/plugins/intel_cpu/src/nodes/multinomial.cpp +++ b/src/plugins/intel_cpu/src/nodes/multinomial.cpp @@ -117,6 +117,11 @@ void Multinomial::prepareParams() { m_batches_samples_probs_count = m_output_elements_count * m_probs_count; } +bool Multinomial::canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(PROBS_PORT) || + getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(NUM_SAMPLES_PORT); +} + bool Multinomial::isExecutable() const { return !isInputTensorAtPortEmpty(PROBS_PORT) && !isInputTensorAtPortEmpty(NUM_SAMPLES_PORT); } diff --git a/src/plugins/intel_cpu/src/nodes/multinomial.hpp b/src/plugins/intel_cpu/src/nodes/multinomial.hpp index 611b70503f5dba..633671930f07ff 100644 --- a/src/plugins/intel_cpu/src/nodes/multinomial.hpp +++ b/src/plugins/intel_cpu/src/nodes/multinomial.hpp @@ -30,6 +30,7 @@ class Multinomial : public Node { void createPrimitive() override; + bool canBeSkipped() const override; bool isExecutable() const override; void execute(dnnl::stream strm) override; void executeDynamicImpl(dnnl::stream strm) override; diff --git a/src/plugins/intel_cpu/src/nodes/node_config.h b/src/plugins/intel_cpu/src/nodes/node_config.h index d814f0ee65df37..5e540ee685f51f 100644 --- a/src/plugins/intel_cpu/src/nodes/node_config.h +++ b/src/plugins/intel_cpu/src/nodes/node_config.h @@ -138,6 +138,10 @@ class PortConfig { _desc = createPortDesc(desc, cmpMask); } + bool hasZeroDims() const { + return getMemDesc()->getShape().hasZeroDims(); + } + private: PortDescBasePtr createPortDesc(MemoryDescPtr desc, BlockedMemoryDesc::CmpMask cmpMask) { if (desc->getType() & Blocked) diff --git a/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp b/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp index 944318a3c24ed6..61ab802ccbe4b4 100644 --- a/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp +++ b/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp @@ -900,6 +900,10 @@ void NonMaxSuppression::checkOutput(const Shape& shape, const std::string& name, THROW_CPU_NODE_ERR("has unsupported '", name, "' output 2nd dimension size: ", dim2str(shape.getDims()[1])); } +bool NonMaxSuppression::canBeSkipped() const { + return !isDynamicNode() && Node::canBeSkipped(); +} + bool NonMaxSuppression::isExecutable() const { return isDynamicNode() || Node::isExecutable(); } diff --git a/src/plugins/intel_cpu/src/nodes/non_max_suppression.h b/src/plugins/intel_cpu/src/nodes/non_max_suppression.h index 025c46f5799a3e..71b187b826aef7 100644 --- a/src/plugins/intel_cpu/src/nodes/non_max_suppression.h +++ b/src/plugins/intel_cpu/src/nodes/non_max_suppression.h @@ -50,6 +50,7 @@ class NonMaxSuppression : public Node { int suppress_begin_index; }; + bool canBeSkipped() const override; bool isExecutable() const override; bool 
needShapeInfer() const override { return false; } diff --git a/src/plugins/intel_cpu/src/nodes/non_zero.h b/src/plugins/intel_cpu/src/nodes/non_zero.h index 515ff965055bea..fd553610b712df 100644 --- a/src/plugins/intel_cpu/src/nodes/non_zero.h +++ b/src/plugins/intel_cpu/src/nodes/non_zero.h @@ -29,6 +29,7 @@ class NonZero : public Node { void executeDynamicImpl(dnnl::stream strm) override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + bool canBeSkipped() const override { return false; } bool isExecutable() const override { return true; } private: diff --git a/src/plugins/intel_cpu/src/nodes/normalize.cpp b/src/plugins/intel_cpu/src/nodes/normalize.cpp index ca52e572b73ea8..0337f462a99f70 100644 --- a/src/plugins/intel_cpu/src/nodes/normalize.cpp +++ b/src/plugins/intel_cpu/src/nodes/normalize.cpp @@ -912,6 +912,10 @@ void NormalizeL2::createPrimitive() { } } +bool NormalizeL2::canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0); +} + bool NormalizeL2::isExecutable() const { return !isInputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/normalize.h b/src/plugins/intel_cpu/src/nodes/normalize.h index a05925a15deb71..ce2bf6607b287a 100644 --- a/src/plugins/intel_cpu/src/nodes/normalize.h +++ b/src/plugins/intel_cpu/src/nodes/normalize.h @@ -94,6 +94,7 @@ class NormalizeL2 : public Node { void prepareParams() override; void executeDynamicImpl(dnnl::stream strm) override; + bool canBeSkipped() const override; bool isExecutable() const override; enum class NormEpsMode { diff --git a/src/plugins/intel_cpu/src/nodes/pad.cpp b/src/plugins/intel_cpu/src/nodes/pad.cpp index 10cdb2a19b771f..b96fd7f36d160b 100644 --- a/src/plugins/intel_cpu/src/nodes/pad.cpp +++ b/src/plugins/intel_cpu/src/nodes/pad.cpp @@ -201,6 +201,10 @@ void Pad::createPrimitive() { } } +bool Pad::canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroOutputDimsAtPort(0); +} + bool Pad::isExecutable() const { return !isOutputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/pad.h b/src/plugins/intel_cpu/src/nodes/pad.h index f2fcd9cc3c20a9..8493558bf52650 100644 --- a/src/plugins/intel_cpu/src/nodes/pad.h +++ b/src/plugins/intel_cpu/src/nodes/pad.h @@ -23,6 +23,7 @@ class Pad : public Node { void prepareParams() override; bool needShapeInfer() const override; + bool canBeSkipped() const override; bool isExecutable() const override; bool needPrepareParams() const override; diff --git a/src/plugins/intel_cpu/src/nodes/paged_attn.h b/src/plugins/intel_cpu/src/nodes/paged_attn.h index adc0f1b634c1b2..1cc698c5e8d63a 100644 --- a/src/plugins/intel_cpu/src/nodes/paged_attn.h +++ b/src/plugins/intel_cpu/src/nodes/paged_attn.h @@ -22,10 +22,19 @@ class PagedAttention : public Node { bool created() const override { return getType() == Type::PagedAttention; } + + // pastkv may have zero dimension + bool canBeSkipped() const override { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0) || + getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(1) || + getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(2); + } + // pastkv may have zero dimension bool isExecutable() const override { return !isInputTensorAtPortEmpty(0) && !isInputTensorAtPortEmpty(1) && !isInputTensorAtPortEmpty(2); } + bool needPrepareParams() const override { return false; } diff --git a/src/plugins/intel_cpu/src/nodes/random_uniform.cpp b/src/plugins/intel_cpu/src/nodes/random_uniform.cpp index 
808ad10c440854..e1725d1e60020c 100644 --- a/src/plugins/intel_cpu/src/nodes/random_uniform.cpp +++ b/src/plugins/intel_cpu/src/nodes/random_uniform.cpp @@ -520,6 +520,10 @@ bool RandomUniform::needShapeInfer() const { return !m_const_inputs[SHAPE]; } +bool RandomUniform::canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(SHAPE); +} + bool RandomUniform::isExecutable() const { return !isInputTensorAtPortEmpty(SHAPE); } diff --git a/src/plugins/intel_cpu/src/nodes/random_uniform.hpp b/src/plugins/intel_cpu/src/nodes/random_uniform.hpp index 237480cd06a667..cf92cc4810dab6 100644 --- a/src/plugins/intel_cpu/src/nodes/random_uniform.hpp +++ b/src/plugins/intel_cpu/src/nodes/random_uniform.hpp @@ -39,6 +39,7 @@ class RandomUniform : public Node { void executeDynamicImpl(dnnl::stream strm) override; + bool canBeSkipped() const override; bool isExecutable() const override; void createPrimitive() override; diff --git a/src/plugins/intel_cpu/src/nodes/reduce.cpp b/src/plugins/intel_cpu/src/nodes/reduce.cpp index 6cfc94a02b9f3b..2ef2b4e3f0ab0c 100644 --- a/src/plugins/intel_cpu/src/nodes/reduce.cpp +++ b/src/plugins/intel_cpu/src/nodes/reduce.cpp @@ -2088,6 +2088,10 @@ void Reduce::initSupportedPrimitiveDescriptors() { } } +bool Reduce::canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(REDUCE_DATA); +} + bool Reduce::isExecutable() const { return !isInputTensorAtPortEmpty(REDUCE_DATA); } diff --git a/src/plugins/intel_cpu/src/nodes/reduce.h b/src/plugins/intel_cpu/src/nodes/reduce.h index 2464686edb1ee4..b9e274a1cb5cd9 100644 --- a/src/plugins/intel_cpu/src/nodes/reduce.h +++ b/src/plugins/intel_cpu/src/nodes/reduce.h @@ -102,6 +102,7 @@ class Reduce : public Node { return false; } + bool canBeSkipped() const override; bool isExecutable() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; @@ -195,4 +196,4 @@ class Reduce : public Node { } // namespace node } // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/reference.h b/src/plugins/intel_cpu/src/nodes/reference.h index 25a285a4e72709..e025cae9e9da93 100644 --- a/src/plugins/intel_cpu/src/nodes/reference.h +++ b/src/plugins/intel_cpu/src/nodes/reference.h @@ -22,6 +22,7 @@ class Reference : public Node { bool needShapeInfer() const override; bool needPrepareParams() const override { return false; } + bool canBeSkipped() const override { return false; } bool isExecutable() const override { return true; } void executeDynamicImpl(dnnl::stream strm) override; diff --git a/src/plugins/intel_cpu/src/nodes/reorder.cpp b/src/plugins/intel_cpu/src/nodes/reorder.cpp index 9b521cdb3b57c7..44b1e4547c8eda 100644 --- a/src/plugins/intel_cpu/src/nodes/reorder.cpp +++ b/src/plugins/intel_cpu/src/nodes/reorder.cpp @@ -32,8 +32,12 @@ namespace ov { namespace intel_cpu { namespace node { +bool Reorder::canBeSkipped() const { + return isOptimized || Node::canBeSkipped(); +} + bool Reorder::isExecutable() const { - return Node::isExecutable() && !isOptimized; + return !isOptimized && Node::isExecutable(); } Reorder::Reorder(const std::shared_ptr& op, const GraphContext::CPtr context) : diff --git a/src/plugins/intel_cpu/src/nodes/reorder.h b/src/plugins/intel_cpu/src/nodes/reorder.h index ab94b60b6a4a18..380668ac0ee5de 100644 --- a/src/plugins/intel_cpu/src/nodes/reorder.h +++ b/src/plugins/intel_cpu/src/nodes/reorder.h @@ -23,6 +23,7 @@ class Reorder 
: public Node { bool created() const override; const std::vector& getDefaultImplPriority() override; + bool canBeSkipped() const override; bool isExecutable() const override; void createPrimitive() override; diff --git a/src/plugins/intel_cpu/src/nodes/reshape.cpp b/src/plugins/intel_cpu/src/nodes/reshape.cpp index 6e3dea09db2a2f..e10e377e75f3dd 100644 --- a/src/plugins/intel_cpu/src/nodes/reshape.cpp +++ b/src/plugins/intel_cpu/src/nodes/reshape.cpp @@ -138,7 +138,7 @@ void Reshape::execute(dnnl::stream strm) { } } -bool Reshape::isExecutable() const { +bool Reshape::canBeSkipped() const { bool inPlaceEnabled = false; if (auto prim_desc = getSelectedPrimitiveDescriptor()) { auto& config = prim_desc->getConfig(); @@ -147,7 +147,11 @@ bool Reshape::isExecutable() const { inPlaceEnabled = true; } } - return !inPlaceEnabled; + return inPlaceEnabled; +} + +bool Reshape::isExecutable() const { + return !canBeSkipped(); } bool Reshape::created() const { diff --git a/src/plugins/intel_cpu/src/nodes/reshape.h b/src/plugins/intel_cpu/src/nodes/reshape.h index 887fc6f739bd80..3b8b9100048840 100644 --- a/src/plugins/intel_cpu/src/nodes/reshape.h +++ b/src/plugins/intel_cpu/src/nodes/reshape.h @@ -18,6 +18,7 @@ class Reshape : public Node { void getSupportedDescriptors() override; void initSupportedPrimitiveDescriptors() override; bool created() const override; + bool canBeSkipped() const override; bool isExecutable() const override; bool needShapeInfer() const override; diff --git a/src/plugins/intel_cpu/src/nodes/scaled_attn.h b/src/plugins/intel_cpu/src/nodes/scaled_attn.h index bbf12727478e43..065fc77dbe8481 100644 --- a/src/plugins/intel_cpu/src/nodes/scaled_attn.h +++ b/src/plugins/intel_cpu/src/nodes/scaled_attn.h @@ -21,6 +21,12 @@ class ScaledDotProductAttention : public Node { bool created() const override { return getType() == Type::ScaledDotProductAttention; } + + bool canBeSkipped() const override { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0) || + getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(1) || + getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(2); + } // pastkv may have zero dimension bool isExecutable() const override { return !isInputTensorAtPortEmpty(0) && !isInputTensorAtPortEmpty(1) && !isInputTensorAtPortEmpty(2); diff --git a/src/plugins/intel_cpu/src/nodes/scatter_update.cpp b/src/plugins/intel_cpu/src/nodes/scatter_update.cpp index 76c9acd218d9d1..3fa90e92f7b066 100644 --- a/src/plugins/intel_cpu/src/nodes/scatter_update.cpp +++ b/src/plugins/intel_cpu/src/nodes/scatter_update.cpp @@ -53,6 +53,10 @@ bool ScatterUpdate::isSupportedOperation(const std::shared_ptr& return true; } +bool ScatterUpdate::canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(DATA_ID); +} + bool ScatterUpdate::isExecutable() const { return !isInputTensorAtPortEmpty(DATA_ID); } diff --git a/src/plugins/intel_cpu/src/nodes/scatter_update.h b/src/plugins/intel_cpu/src/nodes/scatter_update.h index 87604efe745332..897485c76a9426 100644 --- a/src/plugins/intel_cpu/src/nodes/scatter_update.h +++ b/src/plugins/intel_cpu/src/nodes/scatter_update.h @@ -92,6 +92,7 @@ class ScatterUpdate : public Node { bool needPrepareParams() const override; void executeDynamicImpl(dnnl::stream strm) override; + bool canBeSkipped() const override; bool isExecutable() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; diff --git a/src/plugins/intel_cpu/src/nodes/shapeof.cpp 
b/src/plugins/intel_cpu/src/nodes/shapeof.cpp index af5df8e2878b18..472f2e5ceaa34c 100644 --- a/src/plugins/intel_cpu/src/nodes/shapeof.cpp +++ b/src/plugins/intel_cpu/src/nodes/shapeof.cpp @@ -79,10 +79,6 @@ void ShapeOf::initOptimalPrimitiveDescriptor() { selected_pd->setConfig(config); } -bool ShapeOf::isExecutable() const { - return true; -} - void ShapeOf::execute(dnnl::stream strm) { auto inPtr = getSrcMemoryAtPort(0); auto outPtr = getDstMemoryAtPort(0); diff --git a/src/plugins/intel_cpu/src/nodes/shapeof.h b/src/plugins/intel_cpu/src/nodes/shapeof.h index fbdb689ed08cec..d6c0da93dfaa5c 100644 --- a/src/plugins/intel_cpu/src/nodes/shapeof.h +++ b/src/plugins/intel_cpu/src/nodes/shapeof.h @@ -23,10 +23,11 @@ class ShapeOf : public Node { void initOptimalPrimitiveDescriptor() override; void execute(dnnl::stream strm) override; bool created() const override; - bool needPrepareParams() const override {return false;}; + bool needPrepareParams() const override { return false; } void executeDynamicImpl(dnnl::stream strm) override { execute(strm); } - bool isExecutable() const override; + bool canBeSkipped() const override { return false; }; + bool isExecutable() const override { return true; } static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; diff --git a/src/plugins/intel_cpu/src/nodes/split.cpp b/src/plugins/intel_cpu/src/nodes/split.cpp index 72af54e619dbf3..19668241b76e79 100644 --- a/src/plugins/intel_cpu/src/nodes/split.cpp +++ b/src/plugins/intel_cpu/src/nodes/split.cpp @@ -276,6 +276,10 @@ void Split::prepareParams() { } } +bool Split::canBeSkipped() const { + return isInPlace() || getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0); +} + bool Split::isExecutable() const { return !isInPlace() && !isInputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/split.h b/src/plugins/intel_cpu/src/nodes/split.h index 0782594bcf9989..1a95f9817ab3e8 100644 --- a/src/plugins/intel_cpu/src/nodes/split.h +++ b/src/plugins/intel_cpu/src/nodes/split.h @@ -23,6 +23,7 @@ class Split : public Node { void initOptimalPrimitiveDescriptor() override; + bool canBeSkipped() const override; bool isExecutable() const override; bool needPrepareParams() const override; diff --git a/src/plugins/intel_cpu/src/nodes/strided_slice.cpp b/src/plugins/intel_cpu/src/nodes/strided_slice.cpp index 13671c22d102ae..fe050ef7ea3586 100644 --- a/src/plugins/intel_cpu/src/nodes/strided_slice.cpp +++ b/src/plugins/intel_cpu/src/nodes/strided_slice.cpp @@ -287,6 +287,11 @@ void StridedSlice::initSupportedPrimitiveDescriptors() { } } +bool StridedSlice::canBeSkipped() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0) || + getSelectedPrimitiveDescriptor()->hasZeroOutputDimsAtPort(0); +} + bool StridedSlice::isExecutable() const { return !isInputTensorAtPortEmpty(0) && !isOutputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/strided_slice.h b/src/plugins/intel_cpu/src/nodes/strided_slice.h index bf698643271d7a..9f1cff78ab2b93 100644 --- a/src/plugins/intel_cpu/src/nodes/strided_slice.h +++ b/src/plugins/intel_cpu/src/nodes/strided_slice.h @@ -26,6 +26,7 @@ class StridedSlice : public Node { return false; } + bool canBeSkipped() const override; bool isExecutable() const override; bool needShapeInfer() const override; diff --git a/src/plugins/intel_cpu/src/nodes/tensoriterator.h b/src/plugins/intel_cpu/src/nodes/tensoriterator.h index f8a8110c3fae48..41c086288f0cdb 100644 --- 
a/src/plugins/intel_cpu/src/nodes/tensoriterator.h +++ b/src/plugins/intel_cpu/src/nodes/tensoriterator.h @@ -111,6 +111,7 @@ class TensorIterator : public Node { void createPrimitive() override; bool created() const override; void execute(dnnl::stream strm) override; + bool canBeSkipped() const override { return false; } bool isExecutable() const override { return true; } protected: diff --git a/src/plugins/intel_cpu/src/nodes/transpose.cpp b/src/plugins/intel_cpu/src/nodes/transpose.cpp index 38712e04c50719..2674aa85fa723f 100644 --- a/src/plugins/intel_cpu/src/nodes/transpose.cpp +++ b/src/plugins/intel_cpu/src/nodes/transpose.cpp @@ -125,8 +125,12 @@ void Transpose::initSupportedPrimitiveDescriptors() { } } +bool Transpose::canBeSkipped() const { + return isOptimized || getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0); +} + bool Transpose::isExecutable() const { - return !isInputTensorAtPortEmpty(0) && !isOptimized; + return !isOptimized && !isInputTensorAtPortEmpty(0); } bool Transpose::needPrepareParams() const { diff --git a/src/plugins/intel_cpu/src/nodes/transpose.h b/src/plugins/intel_cpu/src/nodes/transpose.h index 03b65c1333610c..7c9e1686645914 100644 --- a/src/plugins/intel_cpu/src/nodes/transpose.h +++ b/src/plugins/intel_cpu/src/nodes/transpose.h @@ -34,6 +34,7 @@ class Transpose : public Node { return order; } + bool canBeSkipped() const override; bool isExecutable() const override; bool needPrepareParams() const override; void prepareParams() override; diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index 5c88772eeedabc..e330cad845837c 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -521,7 +521,7 @@ ov::SupportedOpsMap Plugin::query_model(const std::shared_ptr& Config::ModelType modelType = getModelType(model); conf.readProperties(config, modelType); - auto context = std::make_shared(conf, fake_w_cache, false); + auto context = std::make_shared(conf, fake_w_cache, false, nullptr, nullptr); auto supported = ov::get_supported_nodes( model, diff --git a/src/plugins/intel_cpu/tests/functional/cmake/target_per_test.cmake b/src/plugins/intel_cpu/tests/functional/cmake/target_per_test.cmake index 057869a864d87b..9d7fa9f9d9a365 100644 --- a/src/plugins/intel_cpu/tests/functional/cmake/target_per_test.cmake +++ b/src/plugins/intel_cpu/tests/functional/cmake/target_per_test.cmake @@ -96,7 +96,8 @@ endif() endfunction() if(ENABLE_CPU_SPECIFIC_TARGET_PER_TEST) - create_target_per_test_for_directory(${CMAKE_CURRENT_SOURCE_DIR}/custom/subgraph_tests/src ov_cpu_func_subgraph) + # create_target_per_test_for_directory(${CMAKE_CURRENT_SOURCE_DIR}/custom/subgraph_tests/src ov_cpu_func_subgraph) + create_target_per_test_for_directory(${CMAKE_CURRENT_SOURCE_DIR}/custom/subgraph_tests/src/common ov_cpu_func_subgraph) create_target_per_test_for_directory(${CMAKE_CURRENT_SOURCE_DIR}/custom/single_layer_tests ov_cpu_func_slt) endif() diff --git a/src/plugins/intel_cpu/tests/unit/graph/inplace_resolve_io.cpp b/src/plugins/intel_cpu/tests/unit/graph/inplace_resolve_io.cpp index a41cb4c4300d42..96733ec115319a 100644 --- a/src/plugins/intel_cpu/tests/unit/graph/inplace_resolve_io.cpp +++ b/src/plugins/intel_cpu/tests/unit/graph/inplace_resolve_io.cpp @@ -6,6 +6,7 @@ #include "dummy_node.hpp" #include "graph.h" +#include "memory_control.hpp" #include "nodes/input.h" #include "nodes/concat.h" #include "nodes/rnn.h" @@ -42,7 +43,11 @@ class InplaceResolveIOCPUTestBase : public ::testing::Test { 
std::shared_ptr create_graph(const std::vector& input_shapes, const size_t num_consumers = 1) { Config conf; conf.rtCacheCapacity = 100; - const auto context = std::make_shared(conf, nullptr, false); + const auto context = std::make_shared(conf, + nullptr, + false, + networkMemoryControl->createMemoryControlUnit(), + networkMemoryControl); std::shared_ptr graph = std::shared_ptr(new Graph()); @@ -88,6 +93,7 @@ class InplaceResolveIOCPUTestBase : public ::testing::Test { std::vector nodes; std::vector edges; std::unordered_set nodesSet; + std::shared_ptr networkMemoryControl = std::make_shared(); }; class RNNConcatCPUTest : public InplaceResolveIOCPUTestBase { diff --git a/src/plugins/intel_cpu/tests/unit/graph/memory_state.cpp b/src/plugins/intel_cpu/tests/unit/graph/memory_state.cpp index 5b9468ffc35e6f..02a5940965fb6e 100644 --- a/src/plugins/intel_cpu/tests/unit/graph/memory_state.cpp +++ b/src/plugins/intel_cpu/tests/unit/graph/memory_state.cpp @@ -6,6 +6,7 @@ #include "dummy_node.hpp" #include "graph.h" +#include "memory_control.hpp" #include "nodes/memory.hpp" #include "nodes/softmax.h" #include "nodes/shapeof.h" @@ -82,7 +83,8 @@ TEST(MemStateGraphTest, smoke_Check_Memory_Modification_Guard) { Config conf; conf.rtCacheCapacity = 0; - auto context = std::make_shared(conf, nullptr, false); + std::shared_ptr networkMemoryControl = std::make_shared(); + auto context = std::make_shared(conf, nullptr, false, networkMemoryControl->createMemoryControlUnit(), networkMemoryControl); auto input_node = std::make_shared(param, context); auto memory_input = std::make_shared(read, context); @@ -281,7 +283,12 @@ TEST(MemStateGraphTest, smoke_ShapeOf_no_Inplace_Conflicts) { Config conf; conf.rtCacheCapacity = 0; - auto context = std::make_shared(conf, nullptr, false); + std::shared_ptr networkMemoryControl = std::make_shared(); + auto context = std::make_shared(conf, + nullptr, + false, + networkMemoryControl->createMemoryControlUnit(), + networkMemoryControl); auto input_node = std::make_shared(param, context); auto memory_input = std::make_shared(read, context); diff --git a/src/plugins/intel_cpu/tests/unit/graph/merge_transpose_reorder_test.cpp b/src/plugins/intel_cpu/tests/unit/graph/merge_transpose_reorder_test.cpp index 003aca979398fb..71bf2dc340855e 100644 --- a/src/plugins/intel_cpu/tests/unit/graph/merge_transpose_reorder_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/graph/merge_transpose_reorder_test.cpp @@ -9,6 +9,7 @@ #include "common_test_utils/node_builders/constant.hpp" #include "dummy_node.hpp" #include "graph.h" +#include "memory_control.hpp" #include "nodes/input.h" #include "nodes/reorder.h" #include "nodes/reshape.h" @@ -76,7 +77,7 @@ class MergeTransposeReorderCPUTest : public testing::WithParamInterface(conf, nullptr, false); + m_context = std::make_shared(conf, nullptr, false, networkMemoryControl->createMemoryControlUnit(), networkMemoryControl); const auto replication_result = CreateModelAndReplicate(shape, params.firstNodeLayout, params.firstNodeInplaceDirection, @@ -173,6 +174,7 @@ class MergeTransposeReorderCPUTest : public testing::WithParamInterface m_context; std::unique_ptr m_graph; + std::shared_ptr networkMemoryControl = std::make_shared(); }; // class MergeTransposeReorderCPUTest /* @@ -335,7 +337,8 @@ TEST(MergeTransposeReorder, smoke_InplaceConflict) { */ Config conf; conf.rtCacheCapacity = 100; - auto context = std::make_shared(conf, nullptr, false); + std::shared_ptr networkMemoryControl = std::make_shared(); + auto context = std::make_shared(conf, nullptr, 
false, networkMemoryControl->createMemoryControlUnit(), networkMemoryControl); std::unique_ptr graph = std::unique_ptr(new Graph()); diff --git a/src/plugins/intel_cpu/tests/unit/graph/resolve_edge_conflicts_test.cpp b/src/plugins/intel_cpu/tests/unit/graph/resolve_edge_conflicts_test.cpp index b44194a3d5806c..8e510f31f8066c 100644 --- a/src/plugins/intel_cpu/tests/unit/graph/resolve_edge_conflicts_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/graph/resolve_edge_conflicts_test.cpp @@ -5,6 +5,7 @@ #include "dummy_node.hpp" #include "graph.h" +#include "memory_control.hpp" #include "nodes/input.h" #include "nodes/concat.h" #include "openvino/op/concat.hpp" @@ -43,7 +44,12 @@ TEST(ResolveEdgeConflictsCPUTest, smoke_Run_ResolveEdgeConflicts) { */ Config conf; conf.rtCacheCapacity = 100; - auto context = std::make_shared(conf, nullptr, false); + std::shared_ptr networkMemoryControl = std::make_shared(); + auto context = std::make_shared(conf, + nullptr, + false, + networkMemoryControl->createMemoryControlUnit(), + networkMemoryControl); const dnnl::engine cpuEngine = context->getEngine(); std::unique_ptr graph = std::unique_ptr(new Graph()); @@ -104,7 +110,8 @@ TEST(ResolveEdgeConflictsCPUTest2, smoke_Run_ResolveEdgeConflicts2) { */ Config conf; conf.rtCacheCapacity = 100; - auto context = std::make_shared(conf, nullptr, false); + std::shared_ptr networkMemoryControl = std::make_shared(); + auto context = std::make_shared(conf, nullptr, false, networkMemoryControl->createMemoryControlUnit(), networkMemoryControl); std::unique_ptr graph = std::unique_ptr(new Graph()); diff --git a/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp b/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp index ea2994759e7036..63a44f5bea7075 100644 --- a/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp @@ -14,6 +14,7 @@ #include #include "common_test_utils/common_utils.hpp" +#include "memory_control.hpp" #include "nodes/input.h" using namespace ov::intel_cpu; @@ -108,7 +109,9 @@ class ReorderCPUTestGraph { conf.rtCacheCapacity = 100; auto context = std::make_shared(conf, std::make_shared(), - false); + false, + networkMemoryControl->createMemoryControlUnit(), + networkMemoryControl); const dnnl::engine cpuEngine = context->getEngine(); inputNode = std::make_shared(inputDesc.clone(), @@ -152,6 +155,7 @@ class ReorderCPUTestGraph { std::shared_ptr parentEdge; std::shared_ptr childEdge; ov::element::Type prec; + std::shared_ptr networkMemoryControl = std::make_shared(); }; }// namespace ReorderCPUTest
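
The recurring pattern in this patch is that every node now advances a running execution index via registerToAllocationContext(offset, context): the default Node implementation claims a single slot (offset + 1), while nodes that own an inner graph (Composite, LoRA) share memory between their outer and inner edges and then delegate to Graph::RegisterToAllocationContext, so the inner edges are solved in the same AllocationContext as the outer ones. The standalone C++ sketch below models only that offset-chaining idea; MiniNode, MiniGraph, MiniCompositeNode and MiniAllocationContext are simplified stand-ins for illustration, not the plugin's real classes.

// Standalone sketch of the offset-chaining idea behind registerToAllocationContext().
// All Mini* types are simplified stand-ins, not the intel_cpu plugin's real classes.
#include <iostream>
#include <memory>
#include <string>
#include <vector>

struct MiniAllocationContext {
    // Flattened list of "edges" collected from the outer graph and all inner graphs.
    std::vector<std::string> edges;
};

struct MiniNode {
    explicit MiniNode(std::string n) : name(std::move(n)) {}
    virtual ~MiniNode() = default;

    // Default behaviour: claim a single execution slot for this node.
    virtual int registerToAllocationContext(int offset, MiniAllocationContext& ctx) {
        ctx.edges.push_back(name + "@" + std::to_string(offset));
        return offset + 1;
    }

    std::string name;
};

struct MiniGraph {
    // Register every inner node in execution order, threading the running offset through.
    int registerToAllocationContext(int offset, MiniAllocationContext& ctx) {
        for (const auto& node : nodes)
            offset = node->registerToAllocationContext(offset, ctx);
        return offset;
    }

    std::vector<std::shared_ptr<MiniNode>> nodes;
};

// A node owning an inner graph forwards the offset into that graph, so the inner
// edges end up in the same allocation context as the outer ones.
struct MiniCompositeNode : MiniNode {
    explicit MiniCompositeNode(std::string n) : MiniNode(std::move(n)) {}

    int registerToAllocationContext(int offset, MiniAllocationContext& ctx) override {
        return inner.registerToAllocationContext(offset, ctx);
    }

    MiniGraph inner;
};

int main() {
    MiniGraph outer;
    outer.nodes.push_back(std::make_shared<MiniNode>("A"));

    auto composite = std::make_shared<MiniCompositeNode>("Composite");
    composite->inner.nodes.push_back(std::make_shared<MiniNode>("B"));
    composite->inner.nodes.push_back(std::make_shared<MiniNode>("C"));
    outer.nodes.push_back(composite);

    outer.nodes.push_back(std::make_shared<MiniNode>("D"));

    MiniAllocationContext ctx;
    const int next = outer.registerToAllocationContext(0, ctx);

    for (const auto& edge : ctx.edges)
        std::cout << edge << '\n';                                // A@0, B@1, C@2, D@3
    std::cout << "next free execution index: " << next << '\n';  // 4
    return 0;
}

With each subgraph registered this way, the inner edges carry globally consistent start/finish indices, which is presumably what the GlobalExecutionIndex introduced in allocation_context.hpp is keyed on when the memory solver runs over the combined edge list.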