diff --git a/src/plugins/intel_cpu/src/cpu_types.cpp b/src/plugins/intel_cpu/src/cpu_types.cpp index d3245312a16efc..f42d0f81a20269 100644 --- a/src/plugins/intel_cpu/src/cpu_types.cpp +++ b/src/plugins/intel_cpu/src/cpu_types.cpp @@ -235,6 +235,7 @@ static const TypeToNameMap& get_type_to_name_tbl() { {"Multinomial", Type::Multinomial}, {"Reference", Type::Reference}, {"Subgraph", Type::Subgraph}, + {"SubModel", Type::SubModel}, {"PriorBox", Type::PriorBox}, {"PriorBoxClustered", Type::PriorBoxClustered}, {"Interaction", Type::Interaction}, @@ -368,6 +369,7 @@ std::string NameFromType(const Type type) { CASE(Multinomial); CASE(Reference); CASE(Subgraph); + CASE(SubModel); CASE(PriorBox); CASE(PriorBoxClustered) CASE(MHA); diff --git a/src/plugins/intel_cpu/src/cpu_types.h b/src/plugins/intel_cpu/src/cpu_types.h index 1fb9af3982447f..133a6daab053e0 100644 --- a/src/plugins/intel_cpu/src/cpu_types.h +++ b/src/plugins/intel_cpu/src/cpu_types.h @@ -7,6 +7,7 @@ #include #include +#include "transformations/cpu_opset/common/op/submodel.hpp" #include "utils/caseless.hpp" namespace ov { @@ -116,6 +117,7 @@ enum class Type { MulticlassNms, Multinomial, Subgraph, + SubModel, PriorBox, PriorBoxClustered, Interaction, diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp index e0573e310ac86c..660f9e60694604 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -60,30 +60,22 @@ Graph::~Graph() { } template -void Graph::CreateGraph(NET &net, const GraphContext::CPtr ctx) { +void Graph::CreateGraph(NET &model, const GraphContext::CPtr context) { OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "CreateGraph"); - if (IsReady()) - ForgetGraphData(); - - context = ctx; - m_stream = dnnl::stream(getEngine()); - - Replicate(net); - - InitGraph(); + Init(model, context); - CPU_DEBUG_CAP_ENABLE(serialize(*this)); + Activate(); } void Graph::CreateGraph(const std::vector& graphNodes, const std::vector& 
graphEdges, - const GraphContext::CPtr ctx, + const GraphContext::CPtr context, std::string name) { if (IsReady()) ForgetGraphData(); - context = ctx; + m_context = context; m_stream = dnnl::stream(getEngine()); this->_name = std::move(name); @@ -103,14 +95,18 @@ void Graph::CreateGraph(const std::vector& graphNodes, } } - InitGraph(); + Configure(); - CPU_DEBUG_CAP_ENABLE(serialize(*this)); + Activate(); } template void Graph::CreateGraph(const std::shared_ptr&, const GraphContext::CPtr); -void Graph::Replicate(const std::shared_ptr &model) { + +void Graph::Replicate(const std::shared_ptr &model, + const std::vector& inputConfigs, + const std::vector& outputConfigs) { OV_ITT_SCOPE_CHAIN(FIRST_INFERENCE, taskChain, itt::domains::intel_cpu_LT, "Graph::Replicate", "ov::Model"); + this->_name = model->get_friendly_name(); // Map data object onto producer node @@ -132,31 +128,45 @@ void Graph::Replicate(const std::shared_ptr &model) { return -1; }; - for (const auto& op : model->get_ordered_ops()) { - const NodePtr node {Node::factory().create(op, context)}; - - AddNode(node); + auto createNode = [&](std::shared_ptr op) -> NodePtr { + // special handling for Parameters and Results if (op->get_type_info() == op::v0::Parameter::get_type_info_static()) { auto input_index = model->get_parameter_index(std::dynamic_pointer_cast(op)); OPENVINO_ASSERT(input_index >= 0, - "CPU plugin cannot find op: ", - op->get_friendly_name(), - " in model parameter list!"); + "CPU plugin cannot find op: ", op->get_friendly_name(), " in model parameter list!"); + + const auto& config = static_cast(input_index) < inputConfigs.size() ? 
inputConfigs[input_index] + : node::Input::InputConfig{}; + NodePtr node = std::make_shared(op, m_context, config); inputNodesMap[input_index] = node; + if (node->isDynamicNode()) { graphHasDynamicInput = true; } + + return node; } if (op->get_type_info() == op::v0::Result::get_type_info_static()) { auto output_index = model->get_result_index(std::dynamic_pointer_cast(op)); OPENVINO_ASSERT(output_index >= 0, - "CPU plugin cannot find op: ", - op->get_friendly_name(), - " in model result list!"); + "CPU plugin cannot find op: ", op->get_friendly_name(), " in model result list!"); + + const auto& config = static_cast(output_index) < outputConfigs.size() ? outputConfigs[output_index] + : node::Input::OutputConfig{}; + NodePtr node = std::make_shared(op, m_context, config); outputNodesMap[output_index] = node; + + return node; } + return NodePtr(Node::factory().create(op, m_context)); + }; + + for (const auto& op : model->get_ordered_ops()) { + const NodePtr node = createNode(op); + + AddNode(node); op2node[op] = node; for (size_t port = 0; port < op->get_input_size(); port++) { @@ -185,7 +195,7 @@ void Graph::Replicate(const std::shared_ptr &model) { const auto nodeName = std::string("stub_") + std::to_string(unusedOutput.get_index()) + "_" + parentNode->getName(); const NodePtr outNode = std::make_shared(parentNode->outputShapes[port], parentNode->getOriginalOutputPrecisionAtPort(port), - nodeName, "Result", context); + nodeName, "Result", m_context); CreateEdge(parentNode, outNode, port, 0); AddNode(outNode); } @@ -202,7 +212,8 @@ void Graph::Replicate(const std::shared_ptr &model) { // enforce must be performed after inputs and outputs info are taken into account EnforceInferencePrecision(); - // also we need to change input/output precisions for consumers/producers to avoid inserting reorder + + // update input precisions of consumers to avoid extra reorders for (auto &input : inputNodesMap) { const auto& inputNode = input.second; const auto precToSet = 
inputNode->getOriginalOutputPrecisionAtPort(0); @@ -217,12 +228,16 @@ void Graph::Replicate(const std::shared_ptr &model) { } } - for (auto &output : outputNodesMap) { - const auto& outputNode = output.second; - const auto precToSet = outputNode->getOriginalInputPrecisionAtPort(0); - const auto parentEdge = outputNode->getParentEdgeAt(0); - const auto parent = parentEdge->getParent(); - parent->setOriginalOutputPrecisionAtPort(parentEdge->getInputNum(), precToSet); + // update output precisions of producers to avoid extra reorders + // do this only in case output configuration is not provided explicitly + if (outputConfigs.empty()) { + for (auto &output : outputNodesMap) { + const auto& outputNode = output.second; + const auto precToSet = outputNode->getOriginalInputPrecisionAtPort(0); + const auto parentEdge = outputNode->getParentEdgeAt(0); + const auto parent = parentEdge->getParent(); + parent->setOriginalOutputPrecisionAtPort(parentEdge->getInputNum(), precToSet); + } } } @@ -292,8 +307,77 @@ static std::tuple, std::vector> ExtractExecutableNo std::move(executableSyncNodesInds)); } -void Graph::InitGraph(bool optimize) { - DEBUG_LOG("Initializing graph with name: ", GetName()); +void Graph::Init(const std::shared_ptr& model, + const GraphContext::CPtr context, + const std::vector& inputConfigs, + const std::vector& outputConfigs) { + if (IsReady()) + ForgetGraphData(); + + m_context = context; + m_stream = dnnl::stream(getEngine()); + + Replicate(model, inputConfigs, outputConfigs); + + Configure(); +} + +static void UseExternalInputMemory(const std::map& inputNodesMap, + const std::vector& memory) { + for (size_t i = 0; i < memory.size(); i++) { + const auto& node = inputNodesMap.at(i); + + auto childEdges = node->getChildEdgesAtPort(0); + for (const auto& childEdge : childEdges) { + OPENVINO_ASSERT(childEdge->getStatus() == Edge::Status::Uninitialized, "Unexpected edge status"); + + childEdge->reuse(memory[i]); + } + } +} + +static void 
UseExternalOutputMemory(const std::map& outputNodesMap, + const std::vector& memory) { + for (size_t i = 0; i < memory.size(); i++) { + const auto& node = outputNodesMap.at(i); + + const auto& parentEdge = node->getParentEdgeAt(0); + OPENVINO_ASSERT(parentEdge->getStatus() == Edge::Status::Uninitialized, "Unexpected edge status"); + + parentEdge->reuse(memory[i]); + } +} + +void Graph::Activate(const std::vector& externalInputMemory, + const std::vector& externalOutputMemory) { + OPENVINO_ASSERT(status == Status::Initialized, "Invalid graph status"); + + const bool hasDynNodes = ProcessDynNodes(); + const auto syncNodesInds = hasDynNodes ? IdentifySyncPoints(graphNodes) : std::vector{}; + + UseExternalInputMemory(inputNodesMap, externalInputMemory); + UseExternalOutputMemory(outputNodesMap, externalOutputMemory); + + Allocate(syncNodesInds); + + CreatePrimitivesAndExecConstants(); + +#ifndef CPU_DEBUG_CAPS + for (auto &graphNode : graphNodes) { + graphNode->cleanup(); + } +#endif + + std::tie(m_executableGraphNodes, m_executableSyncNodesInds) = ExtractExecutableNodesAndSyncPoints(syncNodesInds, graphNodes); + + status = hasDynNodes ? (parallel_get_max_threads() > 1 ? Status::ReadyDynamic : Status::ReadyDynamicSeq) + : Status::ReadyStatic; + + CPU_DEBUG_CAP_ENABLE(serialize(*this)); +} + +void Graph::Configure(bool optimize) { + OPENVINO_ASSERT(status == Status::NotReady, "Invalid graph status"); GraphOptimizer optimizer; @@ -327,25 +411,7 @@ void Graph::InitGraph(bool optimize) { SortTopologically(); - const bool hasDynNodes = ProcessDynNodes(); - const auto syncNodesInds = hasDynNodes ? IdentifySyncPoints(graphNodes) : std::vector{}; - - Allocate(syncNodesInds); - - CreatePrimitivesAndExecConstants(); - -#ifndef CPU_DEBUG_CAPS - for (auto &graphNode : graphNodes) { - graphNode->cleanup(); - } -#endif - - std::tie(m_executableGraphNodes, m_executableSyncNodesInds) = ExtractExecutableNodesAndSyncPoints(syncNodesInds, graphNodes); - - status = hasDynNodes ? 
(parallel_get_max_threads() > 1 ? Status::ReadyDynamic : Status::ReadyDynamicSeq) - : Status::ReadyStatic; - - CPU_DEBUG_CAP_ENABLE(serialize(*this)); + status = Status::Initialized; } void Graph::InitNodes() { @@ -443,7 +509,7 @@ void Graph::CreatePrimitivesAndExecConstants() const { auto edgePtr = node->getChildEdgeAt(i); if (edgePtr) { if (edgePtr->isUseExternalMemory()) { - auto ptr = context->getWeightsCache()->get(edgePtr->name()); + auto ptr = m_context->getWeightsCache()->get(edgePtr->name()); outputs.emplace_back(ptr); if (!ptr->isValid()) hasExternalInvalidEdges = true; @@ -467,7 +533,7 @@ void Graph::CreatePrimitivesAndExecConstants() const { continue; } - if (context->getWeightsCache()) { + if (m_context->getWeightsCache()) { auto sharedOutputs = acquireSharedOutputs(node); if (std::get<0>(sharedOutputs) || std::get<1>(sharedOutputs)) { @@ -535,7 +601,7 @@ void Graph::insertConvert(EdgePtr& edge) { inDesc.getPrecision().get_type_name() + "_" + outDesc.getPrecision().get_type_name(); auto convertNode = std::make_shared(inDesc.getShape(), inDesc.getPrecision(), outDesc.getPrecision(), - convertName, context); + convertName, m_context); convertNode->setDescs(inDesc, outDesc); InsertNode(edge, convertNode, true); } @@ -663,7 +729,7 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { auto constNode = static_cast(edge->getParent().get()); edge->reuse(std::const_pointer_cast(constNode->getMemoryPtr())); } else { - edge->externalAllocate(context->getWeightsCache()); + edge->externalAllocate(m_context->getWeightsCache()); } auto stringMemory = dynamic_cast(edge->getMemoryPtr().get()); OPENVINO_ASSERT(stringMemory, "[CPU] Edge between nodes '", @@ -699,7 +765,7 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { auto constNode = std::static_pointer_cast(edge->getParent()); edge->reuse(std::const_pointer_cast(constNode->getMemoryPtr())); } else { - edge->externalAllocate(context->getWeightsCache()); + 
edge->externalAllocate(m_context->getWeightsCache()); } erase = true; } @@ -1010,6 +1076,20 @@ void Graph::PullOutputData(std::unordered_map>& } } +VecMemoryDescs Graph::getOutputMemoryDescriptors() const { + OPENVINO_ASSERT(status == Status::Initialized, "Invalid graph status"); + + VecMemoryDescs result; + result.reserve(outputNodesMap.size()); + + for (const auto& output : outputNodesMap) { + const auto& node = output.second; + result.emplace_back(node->getBaseMemDescAtInputPort(0)); + } + + return result; +} + void Graph::InferStatic(SyncInferRequest* request, int numaId) { for (const auto& node : m_executableGraphNodes) { ExecuteNodeWithCatch(node, request, numaId); @@ -1270,7 +1350,7 @@ static int GetNumaNodeId(const GraphContext::CPtr& context) { void Graph::Infer(SyncInferRequest* request) { DEBUG_LOG("Infer graph: ", GetName(), ". Status: ", static_cast(status)); - const int numaId = GetNumaNodeId(context); + const int numaId = GetNumaNodeId(m_context); if (!m_pMemoryControl) { OPENVINO_THROW("Memory control unit is not initilized in graph: ", GetName()); @@ -1513,7 +1593,7 @@ NodePtr Graph::InsertReorder(EdgePtr edge, const MemoryDesc& outDesc, bool isOptimized, const std::vector & src_perm) { - auto reorder = std::make_shared(inDesc, outDesc, layerName, context); + auto reorder = std::make_shared(inDesc, outDesc, layerName, m_context); reorder->setOptimized(isOptimized); reorder->setSrcPermutation(src_perm); @@ -1721,7 +1801,7 @@ std::shared_ptr Graph::dump() const { } const std::unordered_map& Graph::getInternalStateNodes() const { - return context->getMemoryStatesRegister()->getMemoryStates(); + return m_context->getMemoryStatesRegister()->getMemoryStates(); } } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/graph.h b/src/plugins/intel_cpu/src/graph.h index 3f9debefe7e06c..97c11dba6f77c1 100644 --- a/src/plugins/intel_cpu/src/graph.h +++ b/src/plugins/intel_cpu/src/graph.h @@ -6,6 +6,8 @@ #include "config.h" #include "cpu_memory.h" 
+#include "nodes/input.h" +#include "openvino/core/node_vector.hpp" #include "openvino/runtime/profiling_info.hpp" #include "node.h" #include "edge.h" @@ -35,9 +37,10 @@ class Graph { enum class Status { NotReady = 0, - ReadyStatic = 1, - ReadyDynamic = 2, - ReadyDynamicSeq = 3, + Initialized = 1, + ReadyStatic = 2, + ReadyDynamic = 3, + ReadyDynamicSeq = 4, }; Graph() = default; @@ -47,24 +50,27 @@ class Graph { ~Graph(); bool IsReady() { - return (status != Status::NotReady); + return one_of(status, Status::ReadyStatic, Status::ReadyDynamic, Status::ReadyDynamicSeq); } const Config & getConfig() const { - return context->getConfig(); + return m_context->getConfig(); } template - void CreateGraph(NET &network, const GraphContext::CPtr ctx); + void CreateGraph(NET &model, const GraphContext::CPtr context); void CreateGraph(const std::vector &graphNodes, const std::vector &graphEdges, - const GraphContext::CPtr ctx, + const GraphContext::CPtr context, std::string name); void PushInputData(const std::size_t& index, const ov::SoPtr& input); void PullOutputData(std::unordered_map>& output); + // Returns Output nodes memory descriptors + VecMemoryDescs getOutputMemoryDescriptors() const; + void Infer(SyncInferRequest* request = nullptr); const std::vector& GetNodes() const { @@ -98,11 +104,11 @@ class Graph { } dnnl::engine getEngine() const { - return context->getEngine(); + return m_context->getEngine(); } GraphContext::CPtr getGraphContext() const { - return context; + return m_context; } void GetPerfData(std::vector &perfMap) const; @@ -189,7 +195,20 @@ class Graph { Status getStatus() const {return status;} const std::unordered_map& getInternalStateNodes() const; - void InitGraph(bool optimize = true); + + /** + * Init graph using \p model, \p context, \p inputConfigs and \p outputConfigs + */ + void Init(const std::shared_ptr& model, + const GraphContext::CPtr context, + const std::vector& inputConfigs = {}, + const std::vector& outputConfigs = {}); + + /** + * 
Activate execution graph using \p externalInputMemory and \p externalOutputMemory + */ + void Activate(const std::vector& externalInputMemory = {}, + const std::vector& externalOutputMemory = {}); protected: void ForgetGraphData() { @@ -214,7 +233,12 @@ class Graph { bool graphHasDynamicInput = false; - void Replicate(const std::shared_ptr &subgraph); + void Replicate(const std::shared_ptr &subgraph, + const std::vector& inputConfigs = {}, + const std::vector& outputConfigs = {}); + + void Configure(bool optimize = true); + void InitNodes(); void InitDescriptors(); void ResolveInplaceDirections(); @@ -274,7 +298,7 @@ class Graph { std::vector m_executableGraphNodes; std::vector m_executableSyncNodesInds; - GraphContext::CPtr context; + GraphContext::CPtr m_context; dnnl::stream m_stream; MemoryControl* m_pMemoryControl = nullptr; diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp index 31c4a0d2a5b54d..05c31da0623d45 100644 --- a/src/plugins/intel_cpu/src/node.cpp +++ b/src/plugins/intel_cpu/src/node.cpp @@ -417,6 +417,19 @@ MemoryDescPtr Node::getBaseMemDescAtOutputPort(size_t portNum) const { OPENVINO_THROW("Can't get output memory desc, primitive descriptor is not selected"); } +MemoryDescPtr Node::getParentOutputMemDesc(const EdgePtr& edge) { + const auto parentPtr = edge->getParent(); + const auto parentSpd = parentPtr->getSelectedPrimitiveDescriptor(); + OPENVINO_ASSERT(parentSpd, "Parent selected primitive descriptor is missed"); + + const auto& parentOutConfs = parentSpd->getConfig().outConfs; + OPENVINO_ASSERT(!parentOutConfs.empty(), "Parent output configuration is empty"); + + const int inNum = edge->getInputNum(); + + return parentSpd->getConfig().outConfs[inNum].getMemDesc(); +} + std::string Node::getPrimitiveDescriptorType() const { auto selectedPrimitiveDesc = getSelectedPrimitiveDescriptor(); diff --git a/src/plugins/intel_cpu/src/node.h b/src/plugins/intel_cpu/src/node.h index ff8bf87d993a74..df44056b023b9f 100644 
--- a/src/plugins/intel_cpu/src/node.h +++ b/src/plugins/intel_cpu/src/node.h @@ -10,6 +10,7 @@ #include "cpu_shape.h" #include "cpu_types.h" #include "edge.h" +#include "memory_desc/cpu_memory_desc.h" #include "selective_build.h" #include "memory_desc/dnnl_memory_desc.h" #include "onednn/dnnl.h" @@ -394,6 +395,13 @@ class Node { */ MemoryDescPtr getBaseMemDescAtOutputPort(size_t portNum) const; + /** + * @brief Returns parent output memory descriptor from given \p edge + * must be used after selectOptimalPrimitiveDescriptor stage + * @param edge + * @return pointer to parent output memory descriptor with type MemoryDesc + */ + static MemoryDescPtr getParentOutputMemDesc(const EdgePtr& edge); /** * @brief Returns input selected primitive descriptor on the specified port * must be used after selectOptimalPrimitiveDescriptor stage diff --git a/src/plugins/intel_cpu/src/nodes/composite.cpp b/src/plugins/intel_cpu/src/nodes/composite.cpp new file mode 100644 index 00000000000000..b38a56649bd60a --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/composite.cpp @@ -0,0 +1,115 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "composite.h" + +#include "nodes/input.h" +#include "cpu_memory.h" +#include "transformations/cpu_opset/common/op/submodel.hpp" +#include "utils/debug_capabilities.h" +#include "shape_inference/shape_inference_internal_dyn.hpp" + +namespace ov { +namespace intel_cpu { +namespace node { + +bool Composite::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + return ov::is_type(op); +} + +Composite::Composite(const std::shared_ptr& op, const GraphContext::CPtr& context) + : Node(op, context, InternalDynShapeInferFactory()) { + const auto& subModel = ov::as_type_ptr(op); + OPENVINO_ASSERT(subModel, "Attempt to create SubGraph node from an invalid op type: ", op); + + m_body = subModel->get_function(); +} + +void Composite::selectOptimalPrimitiveDescriptor() { + // for 
the input configuration, just always use the parent configuration + std::vector inConfs; + std::vector graphInputConfig; + + for (size_t i = 0; i < getParentEdges().size(); i++) { + auto desc = getParentOutputMemDesc(getParentEdgeAt(i)); + inConfs.emplace_back(desc); + graphInputConfig.emplace_back(node::Input::InputConfig{desc, true}); + } + + std::vector graphOutputConfig; + for (size_t i = 0; i < getParentEdges().size(); i++) { + graphOutputConfig.emplace_back(node::Input::OutputConfig{true, true}); + } + + // configure the inner graph to get the information about output memory descriptors + m_graph.Init(m_body, context, graphInputConfig, graphOutputConfig); + + // for the output descriptors, use the configuration of the graph's output nodes + auto outputDescriptors = m_graph.getOutputMemoryDescriptors(); + + std::vector outConfs; + for (const auto& desc : outputDescriptors) { + outConfs.emplace_back(desc); + } + + const NodeConfig config(inConfs, outConfs); + + supportedPrimitiveDescriptors.clear(); + supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::undef); + + selectPrimitiveDescriptorByIndex(0); +} + +// @todo add ascii diagram for memory mapping / reuse +void Composite::createPrimitive() { + OPENVINO_ASSERT(getOriginalInputsNumber() == m_graph.GetInputNodesMap().size(), + "Number of node inputs must be equal the number of inner graph's inputs"); + + std::vector inputMemory; + for (size_t i = 0; i < getOriginalInputsNumber(); i++) { + inputMemory.emplace_back(getSrcMemoryAtPort(i)); + } + + OPENVINO_ASSERT(getOriginalOutputsNumber() == m_graph.GetOutputNodesMap().size(), + "Number of node outputs must be equal the number of inner graph's outputs"); + + std::vector outputMemory; + for (size_t i = 0; i < getOriginalOutputsNumber(); i++) { + outputMemory.emplace_back(getDstMemoryAtPort(i)); + } + + m_graph.Activate(inputMemory, outputMemory); +} + +void Composite::execute(dnnl::stream) { + m_graph.Infer(); +} + +void 
Composite::executeDynamicImpl(dnnl::stream strm) { + execute(strm); + + if (!inputShapesModified()) + return; + + // since the shape inference is not performed for the composite node + // a memory of the extra child edges, attached to the output ports + // has to be updated after an inference of the inner graph finished + auto& childEdges = getChildEdges(); + for (size_t i = 0; i < getOriginalOutputsNumber(); i++) { + const auto mem = getDstMemoryAtPort(i); + for (size_t j = getOriginalOutputsNumber(); j < childEdges.size(); j++) { + auto& childEdge = childEdges[j]; + auto childEdgePtr = childEdge.lock(); + assert(childEdgePtr); + + if (childEdgePtr->getInputNum() == static_cast(i)) { + childEdgePtr->getMemoryPtr()->redefineDesc(mem->getDescPtr()); + } + } + } +} + +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/composite.h b/src/plugins/intel_cpu/src/nodes/composite.h new file mode 100644 index 00000000000000..9f18a2ba68b769 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/composite.h @@ -0,0 +1,56 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include "graph.h" +#include "node.h" + +namespace ov { +namespace intel_cpu { +namespace node { + +class Composite : public Node { +public: + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + + Composite(const std::shared_ptr& op, const GraphContext::CPtr& context); + + bool created() const override { + return getType() == Type::SubModel; + } + + bool needShapeInfer() const override { + return false; + } + + bool needPrepareParams() const override { + return false; + } + + bool isExecutable() const override { + return true; + } + + void getSupportedDescriptors() override{}; + void selectOptimalPrimitiveDescriptor() override; + void createPrimitive() override; + void execute(dnnl::stream) override; + void executeDynamicImpl(dnnl::stream 
strm) override; + + const Graph& graph() const { + return m_graph; + } + +private: + std::shared_ptr m_body; + Graph m_graph; + std::shared_ptr m_executor; +}; + +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/graph_emitter.hpp b/src/plugins/intel_cpu/src/nodes/executors/graph_emitter.hpp index 2004242d68523d..6aad18c793c8cf 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/graph_emitter.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/graph_emitter.hpp @@ -68,7 +68,7 @@ class GraphEmitter { } GraphPtr emit() { - graph->InitGraph(false); + OPENVINO_THROW("Not implemented yet!"); return graph; } diff --git a/src/plugins/intel_cpu/src/nodes/input.cpp b/src/plugins/intel_cpu/src/nodes/input.cpp index ea659ec1e31b84..c311c40714cb37 100644 --- a/src/plugins/intel_cpu/src/nodes/input.cpp +++ b/src/plugins/intel_cpu/src/nodes/input.cpp @@ -5,6 +5,7 @@ #include "input.h" #include "cpu/x64/jit_generator.hpp" +#include "nodes/node_config.h" #include "openvino/core/parallel.hpp" #include "shape_inference/shape_inference_pass_through.hpp" @@ -419,6 +420,22 @@ Input::Input(MemoryDescPtr memDesc, const std::string& name, const std::string& extMemDesc = memDesc; } +Input::Input(const std::shared_ptr& op, + const GraphContext::CPtr context, + InputConfig config) + : Input(op, context) { + extMemDesc = config.desc; + m_isInPlace = config.inPlace; +} + +Input::Input(const std::shared_ptr& op, + const GraphContext::CPtr context, + OutputConfig config) + : Input(op, context) { + m_useParentMemoryDescForOutput = config.useParentMemoryDescForOutput; + m_isInPlace = config.inPlace; +} + MemoryCPtr Input::getMemoryPtr() const { return memoryPtr; } @@ -448,6 +465,27 @@ void Input::initSupportedPrimitiveDescriptors() { } } +void Input::initOptimalPrimitiveDescriptor() { + if (m_useParentMemoryDescForOutput || extMemDesc) + return; + + Node::initOptimalPrimitiveDescriptor(); +} + +void 
Input::selectOptimalPrimitiveDescriptor() { + if (!(m_useParentMemoryDescForOutput && getType() == Type::Output)) + return Node::selectOptimalPrimitiveDescriptor(); + + // ignore previous configuration + supportedPrimitiveDescriptors.clear(); + + // and just use parent memory descriptor for Output node to avoid reorders insertion + NodeConfig config({PortConfig(getParentOutputMemDesc(getParentEdgeAt(0)), BlockedMemoryDesc::FULL_MASK, 0)}, {}); + + supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown); + selectPrimitiveDescriptorByIndex(0); +} + void Input::createPrimitive() { for (size_t i = 0; i < getChildEdges().size(); i++) { auto dstMemPtr = getDstMemoryAtPort(i); @@ -495,15 +533,14 @@ void Input::initSupportedPdDefault() { void Input::initSupportedPdFromMemDesc() { NodeConfig config; - PortConfig portConfig; - portConfig.inPlace(-1); - portConfig.constant(false); - portConfig.setMemDesc(extMemDesc); + PortConfig portConfig(extMemDesc, BlockedMemoryDesc::FULL_MASK, m_isInPlace ? 
0 : -1, false); + if (getType() == Type::Input || getType() == Type::MemoryInput) { config.outConfs.push_back(portConfig); } else if (getType() == Type::Output) { config.inConfs.push_back(portConfig); } + supportedPrimitiveDescriptors.emplace_back(std::move(config), impl_desc_type::unknown); } diff --git a/src/plugins/intel_cpu/src/nodes/input.h b/src/plugins/intel_cpu/src/nodes/input.h index 9b304e5a75a891..a954ce56665d61 100644 --- a/src/plugins/intel_cpu/src/nodes/input.h +++ b/src/plugins/intel_cpu/src/nodes/input.h @@ -13,16 +13,39 @@ namespace node { class Input : public Node { public: + struct InputConfig { + MemoryDescPtr desc; + bool inPlace; + }; + + struct OutputConfig { + // @todo better to use memory desc with any layout and undefined precision + bool useParentMemoryDescForOutput; + bool inPlace; + }; + Input(const std::shared_ptr& op, const GraphContext::CPtr context); + Input(const Shape& shape, const ov::element::Type& prc, const std::string& name, const std::string& type, const GraphContext::CPtr context); + Input(MemoryDescPtr memDesc, const std::string& name, const std::string& type, const GraphContext::CPtr context); + Input(const std::shared_ptr& op, + const GraphContext::CPtr context, + InputConfig config); + + Input(const std::shared_ptr& op, + const GraphContext::CPtr context, + OutputConfig config); + void getSupportedDescriptors() override; void initSupportedPrimitiveDescriptors() override; + void initOptimalPrimitiveDescriptor() override; + void selectOptimalPrimitiveDescriptor() override; void createPrimitive() override; bool created() const override; @@ -46,8 +69,10 @@ class Input : public Node { private: std::shared_ptr constOp; MemoryCPtr memoryPtr; - MemoryDescPtr extMemDesc = nullptr; bool isMeanImage = false; + MemoryDescPtr extMemDesc = nullptr; + bool m_useParentMemoryDescForOutput = false; + bool m_isInPlace = false; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/nodes_factory.cpp 
b/src/plugins/intel_cpu/src/nodes_factory.cpp index 21eb43d5f063dd..a7b7547cde1511 100644 --- a/src/plugins/intel_cpu/src/nodes_factory.cpp +++ b/src/plugins/intel_cpu/src/nodes_factory.cpp @@ -9,6 +9,7 @@ #include "nodes/bucketize.h" #include "nodes/col2im.h" #include "nodes/color_convert.h" +#include "nodes/composite.h" #include "nodes/concat.h" #include "nodes/conv.h" #include "nodes/convert.h" @@ -214,6 +215,7 @@ Node::NodesFactory::NodesFactory() : Factory("NodesFactory") { INTEL_CPU_NODE(RDFT, Type::RDFT); INTEL_CPU_NODE(ExtractImagePatches, Type::ExtractImagePatches); INTEL_CPU_NODE(Subgraph, Type::Subgraph); + INTEL_CPU_NODE(Composite, Type::SubModel); INTEL_CPU_NODE(ScaledDotProductAttention, Type::ScaledDotProductAttention); #if defined(OPENVINO_ARCH_X86_64) INTEL_CPU_NODE(FakeQuantize, Type::FakeQuantize); diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/submodel.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/submodel.cpp new file mode 100644 index 00000000000000..0f72baed2b1206 --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/submodel.cpp @@ -0,0 +1,65 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "submodel.hpp" + +namespace ov { +namespace intel_cpu { + +SubModel::SubModel(const std::shared_ptr& body) + : SubGraphOp() { + SubGraphOp::set_function(body); +} + +SubModel::SubModel(const ov::OutputVector& args, + const std::shared_ptr& body) + : SubGraphOp(args) { + SubGraphOp::set_function(body); + constructor_validate_and_infer_types(); + for (size_t i = 0; i < body->get_parameters().size(); ++i) + m_input_descriptions[0].push_back(std::make_shared(i, i)); + for (size_t i = 0; i < body->get_output_size(); ++i) + m_output_descriptions[0].push_back(std::make_shared(i, i)); +} + +SubModel::SubModel(const ov::NodeVector& args, + const std::shared_ptr& body) + : SubModel(as_output_vector(args), body) {} + 
+std::shared_ptr SubModel::clone_with_new_inputs(const ov::OutputVector& inputs) const { + return std::make_shared(inputs, body().clone()); +} + +void SubModel::validate_and_infer_types() { + ov::ParameterVector old_parameters = body_ptr()->get_parameters(); + + for (size_t i = 0; i < get_input_size(); ++i) { + body_ptr()->replace_parameter( + i, + std::make_shared(get_input_element_type(i), get_input_partial_shape(i))); + } + + body_ptr()->validate_nodes_and_infer_types(); + + for (size_t i = 0; i < body_ptr()->get_parameters().size(); i++) { + body_ptr()->get_parameters()[i]->set_friendly_name(old_parameters[i]->get_friendly_name()); + } + + set_output_size(body_ptr()->get_output_size()); + for (size_t i = 0; i < get_output_size(); ++i) { + set_output_type(i, body_ptr()->get_output_element_type(i), body_ptr()->get_output_partial_shape(i)); + } +} + +bool SubModel::visit_attributes(ov::AttributeVisitor& visitor) { + visitor.on_attribute("body", body_ptr()); + visitor.on_attribute("input_descriptions", m_input_descriptions[0]); + visitor.on_attribute("output_descriptions", m_output_descriptions[0]); + return true; +} + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/submodel.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/submodel.hpp new file mode 100644 index 00000000000000..03e5e19f3424a5 --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/op/submodel.hpp @@ -0,0 +1,55 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include "openvino/core/model.hpp" +#include "openvino/op/op.hpp" +#include "openvino/op/util/sub_graph_base.hpp" + +namespace ov { +namespace intel_cpu { + +/** + * @interface SubModel + * @brief An operation that is implemented by a model + */ +class SubModel : public ov::op::util::SubGraphOp { +public: + OPENVINO_OP("SubModel", "cpu_plugin_opset"); + 
SubModel() = default; + + SubModel(const std::shared_ptr& body); + + SubModel(const OutputVector& args, const std::shared_ptr& body); + + SubModel(const NodeVector& args, const std::shared_ptr& body); + + bool visit_attributes(AttributeVisitor& visitor) override; + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const OutputVector& inputs) const override; + + const ov::Model& body() const { + return *m_bodies[0]; + } + const std::shared_ptr& body_ptr() const { + return m_bodies[0]; + } + +private: + ov::Model& body() { + return *m_bodies[0]; + } + std::shared_ptr& body_ptr() { + return m_bodies[0]; + } +}; + +} // namespace intel_cpu +} // namespace ov