From 8aff79e5d46679249a5a81e3a32984abbbd5fba0 Mon Sep 17 00:00:00 2001 From: Bogdan Pereanu Date: Wed, 27 Nov 2024 21:40:27 +0200 Subject: [PATCH] [NPU] In-order async execution (#27696) ### Details: - In-order to activate it, the NPU_RUN_INFERENCES_SEQUENTIALLY property should be set to true when compiling a model. This feature works only for the inferences from a compiled model. - It doesn't support scheduling inferences from different compiled models. - Inferences should be run each time in the same order they were called the first time --------- Signed-off-by: Bogdan Pereanu Co-authored-by: Dmitry Matveev --- .../al/include/intel_npu/config/runtime.hpp | 18 ++ .../intel_npu/npu_private_properties.hpp | 9 + .../intel_npu/src/al/src/config/runtime.cpp | 1 + .../backend/include/zero_infer_request.hpp | 35 --- .../src/backend/include/zero_pipeline.hpp | 18 +- .../src/backend/src/zero_infer_request.cpp | 105 ++----- .../src/backend/src/zero_pipeline.cpp | 74 +++-- .../include/intel_npu/common/igraph.hpp | 111 ++++---- .../intel_npu/src/common/src/igraph.cpp | 159 +++++++++++ .../src/driver_compiler_adapter.cpp | 16 +- .../src/compiler_adapter/src/driver_graph.cpp | 12 +- .../src/compiler_adapter/src/plugin_graph.cpp | 12 +- .../src/plugin/npuw/compiled_model.cpp | 4 + .../src/plugin/src/compiled_model.cpp | 6 + .../intel_npu/src/plugin/src/plugin.cpp | 6 + .../intel_npu/utils/zero/zero_utils.hpp | 2 +- .../intel_npu/utils/zero/zero_wrappers.hpp | 3 +- .../src/utils/src/zero/zero_wrappers.cpp | 6 +- .../functional/behavior/infer_request_run.cpp | 12 + .../functional/behavior/infer_request_run.hpp | 260 +++++++++++++++++- 20 files changed, 655 insertions(+), 214 deletions(-) create mode 100644 src/plugins/intel_npu/src/common/src/igraph.cpp diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp index 510ab7fc43b0c8..1fc3a3e20965c6 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp @@ -270,4 +270,22 @@ struct BYPASS_UMD_CACHING final : OptionBase { return OptionMode::RunTime; } }; + +// +// RUN_INFERENCES_SEQUENTIALLY +// +struct RUN_INFERENCES_SEQUENTIALLY final : OptionBase { + static std::string_view key() { + return ov::intel_npu::run_inferences_sequentially.name(); + } + + static bool defaultValue() { + return false; + } + + static OptionMode mode() { + return OptionMode::RunTime; + } +}; + } // namespace intel_npu diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp index ec92e10a9f89c8..8aabd132e9431a 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp @@ -327,5 +327,14 @@ static constexpr ov::Property backend_n */ static constexpr ov::Property backend_compilation_params{"NPU_BACKEND_COMPILATION_PARAMS"}; +/** + * @brief [Only for NPU Plugin] + * Type: boolean, default is false. + * This option allows to run inferences sequentially, in the order in which they were created + * @note Experimental property, for now it only works in very specific scenarios. 
We need driver updates before we can + * implement a robust solution for in-order execution + */ +static constexpr ov::Property run_inferences_sequentially{"NPU_RUN_INFERENCES_SEQUENTIALLY"}; + } // namespace intel_npu } // namespace ov diff --git a/src/plugins/intel_npu/src/al/src/config/runtime.cpp b/src/plugins/intel_npu/src/al/src/config/runtime.cpp index 759956b6f597df..3da16796219332 100644 --- a/src/plugins/intel_npu/src/al/src/config/runtime.cpp +++ b/src/plugins/intel_npu/src/al/src/config/runtime.cpp @@ -27,6 +27,7 @@ void intel_npu::registerRunTimeOptions(OptionsDesc& desc) { desc.add(); desc.add(); desc.add(); + desc.add(); } // Heuristically obtained number. Varies depending on the values of PLATFORM and PERFORMANCE_HINT diff --git a/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp b/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp index 3efbdab666d1ba..1e8781b0afe820 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp @@ -38,25 +38,6 @@ class ZeroInferRequest final : public SyncInferRequest { std::vector get_profiling_info() const override; std::vector get_raw_profiling_data() const; - /** - * @brief Determines if batching can be addressed inside the plugin. In the positive case, the batch size used by - * the model will also be deduced and returned. - * @details Batching can be handled by the plugin only if: - * - The batch axis is the first axis. - * - The batch size received by the compiler takes the default value of 1. - * - The batch size found in the IR model matches for all inputs/outputs and takes a value different than the - * default one. - * - * If any of the previous conditions is not fulfilled, the functon will return the default batch size, thus no - * custom algorithm will be applied inside the plugin in order to address batching. - * - * @param metadata Metadata containing the shape values as seen by both the compiler and IR model. These will - * ultimately be used for determining the batch size. - * @returns The batch size deduced by the algorithm or the default value of 1 if batching cannot be performed inside - * the plugin. - */ - std::optional get_batch_size(const NetworkMetadata& metadata); - /** * @brief Check the received tensor and set the Level Zero tensor accordingly * @param tensor Reference to a tensor. @@ -106,22 +87,6 @@ class ZeroInferRequest final : public SyncInferRequest { std::shared_ptr _npuProfiling; std::unique_ptr _pipeline; - /** - * @brief Indicates how many command lists will be used inside the pipeline. - * @details Leveraging multiple command lists implies distributing the input/output buffers accross the batch axis - * between these lists. - * - * If batching is handled on compiler's side then a single command list shall be used, we don't do any - * specific operation inside the plugin in this case. - */ - size_t _numberOfCommandLists = 1; - - /** - * @brief The batch size used by the corresponding model. - * @details The attribute contains a value only if the plugin performs the batches splitting operation. 
- */ - std::optional _batchSize = std::nullopt; - bool _pipelineIsCreated = false; }; diff --git a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp index 5b7f488d3eb96a..de5e1ac81c4728 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp @@ -28,7 +28,6 @@ struct Pipeline { const std::shared_ptr& npu_profiling, const std::vector>>& inputTensorsData, const std::vector>& outputTensorsData, - size_t numberOfCommandLists, uint32_t group_ordinal); Pipeline(const Pipeline&) = delete; @@ -43,12 +42,25 @@ struct Pipeline { void updateCommandList(const TensorData& tensorsData, uint32_t index, size_t commandListIndex); protected: + std::shared_ptr _graph; const Config _config; + const uint32_t _id; + + /** + * @brief Indicates how many command lists will be used inside the pipeline. + * @details Leveraging multiple command lists implies distributing the input/output buffers accross the batch axis + * between these lists. + * + * If batching is handled on compiler's side then a single command list shall be used, we don't do any + * specific operation inside the plugin in this case. + */ + size_t _number_of_command_lists; + std::shared_ptr _command_queue; std::vector> _command_lists; std::vector> _fences; - EventPool _event_pool; - std::vector> _events; + std::shared_ptr _event_pool; + std::vector> _events; bool sync_output_with_fences_ = true; std::shared_ptr _npu_profiling; Logger _logger; diff --git a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp index 88dfaf944a8b34..a0e5d2d11c1fef 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp @@ -20,8 +20,6 @@ using namespace intel_npu; namespace { constexpr std::size_t SINGLE_TENSOR = 0; -constexpr std::size_t BATCH_AXIS = 0; -constexpr std::size_t DEFAULT_BATCH_SIZE = 1; constexpr bool INPUT = true; constexpr bool OUTPUT = false; @@ -96,64 +94,6 @@ bool memory_was_allocated_in_the_same_l0_context(ze_context_handle_t hContext, c } // namespace -std::optional ZeroInferRequest::get_batch_size(const NetworkMetadata& metadata) { - if (!metadata.outputs.at(0).shapeFromIRModel.has_value()) { - _logger.debug("Batching on the plugin is not used, batching is handled by the compiler"); - return std::nullopt; - } - - const ov::PartialShape& firstOutputShape = *metadata.outputs.at(0).shapeFromIRModel; - if (firstOutputShape.is_dynamic()) { - _logger.warning("Networks using dynamic shapes are not supported when batching is handled by the plugin"); - return std::nullopt; - } - if (firstOutputShape.rank().get_length() == 0) { - _logger.warning( - "Networks using rank 0 shapes for inputs/outputs are not supported when batching is handled by the plugin"); - return std::nullopt; - } - - const size_t candidateBatchSize = firstOutputShape[BATCH_AXIS].get_length(); - if (candidateBatchSize == 0 || candidateBatchSize == DEFAULT_BATCH_SIZE) { - _logger.debug("Batching on the plugin is not used, batching is handled by the compiler"); - return std::nullopt; - } - - auto checkDescriptorsUseCandidateBatchSize = [candidateBatchSize](const std::vector& descriptors) { - for (const IODescriptor& descriptor : descriptors) { - OPENVINO_ASSERT(descriptor.shapeFromIRModel.has_value(), - "Missing value for the \"shapeFromIRModel\" attribute, I/O descriptor"); - - const 
ov::PartialShape& shapeFromCompiler = descriptor.shapeFromCompiler; - const ov::PartialShape& shapeFromIRModel = *descriptor.shapeFromIRModel; - - if (shapeFromCompiler.is_dynamic() || shapeFromCompiler.rank().get_length() == 0 || - *shapeFromCompiler.begin() != DEFAULT_BATCH_SIZE) { - return false; - } - - if (!descriptor.isStateInput && !descriptor.isStateOutput && !descriptor.isShapeTensor) { - if (shapeFromIRModel.is_dynamic() || shapeFromIRModel.rank().get_length() == 0 || - *shapeFromIRModel.begin() != candidateBatchSize) { - return false; - } - } - } - - return true; - }; - - if (!checkDescriptorsUseCandidateBatchSize(metadata.inputs) || - !checkDescriptorsUseCandidateBatchSize(metadata.outputs)) { - _logger.debug("Batching on the plugin is not used, batching is handled by the compiler"); - return std::nullopt; - } - - _logger.debug("Batching is handled by the plugin"); - - return candidateBatchSize; -} - //------------------------------------------------------------------------------ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& initStructs, const std::shared_ptr& compiledModel, @@ -187,13 +127,6 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& _inputAllocator = std::make_shared(_initStructs, ZE_HOST_MEM_ALLOC_FLAG_BIAS_WRITE_COMBINED); - if (config.get() != ov::intel_npu::BatchMode::COMPILER) { - _batchSize = get_batch_size(_metadata); - } - if (_batchSize.has_value()) { - _numberOfCommandLists = *_batchSize; - } - _logger.debug("ZeroInferRequest::ZeroInferRequest - checking level zero attributes and allocating tensors"); size_t ioIndex = 0; @@ -205,7 +138,8 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& continue; } - get_level_zero_input(ioIndex) = allocate_tensor(inputDescriptor, ioIndex, INPUT, *_inputAllocator, _batchSize); + get_level_zero_input(ioIndex) = + allocate_tensor(inputDescriptor, ioIndex, INPUT, *_inputAllocator, _graph->get_batch_size()); get_input_tensor_data(ioIndex) = TensorData{get_level_zero_input(ioIndex)->data(), get_level_zero_input(ioIndex)->get_byte_size()}; @@ -222,7 +156,7 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& } _levelZeroOutputTensors.at(ioIndex) = - allocate_tensor(outputDescriptor, ioIndex, OUTPUT, *_outputAllocator, _batchSize); + allocate_tensor(outputDescriptor, ioIndex, OUTPUT, *_outputAllocator, _graph->get_batch_size()); _outputTensorsData.at(ioIndex) = std::optional(TensorData{_levelZeroOutputTensors.at(ioIndex)->data(), _levelZeroOutputTensors.at(ioIndex)->get_byte_size()}); @@ -236,7 +170,7 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& void ZeroInferRequest::create_pipeline() { for (size_t inputIndex = 0; inputIndex < _metadata.inputs.size(); ++inputIndex) { if (is_batched_input(inputIndex)) { - if (_batchSize.has_value()) { + if (_graph->get_batch_size().has_value()) { _logger.debug("ZeroInferRequest::create_pipeline - tensors %s were already allocated", _metadata.inputs.at(inputIndex).nodeFriendlyName.c_str()); continue; @@ -250,8 +184,11 @@ void ZeroInferRequest::create_pipeline() { } _logger.debug("ZeroInferRequest::create_pipeline - allocate new tensor"); - get_level_zero_input(inputIndex) = - allocate_tensor(_metadata.inputs.at(inputIndex), inputIndex, INPUT, *_inputAllocator, _batchSize); + get_level_zero_input(inputIndex) = allocate_tensor(_metadata.inputs.at(inputIndex), + inputIndex, + INPUT, + *_inputAllocator, + _graph->get_batch_size()); get_input_tensor_data(inputIndex) = std::optional( TensorData{get_level_zero_input(inputIndex)->data(), 
get_level_zero_input(inputIndex)->get_byte_size()}); } @@ -263,17 +200,20 @@ void ZeroInferRequest::create_pipeline() { continue; } _logger.debug("ZeroInferRequest::create_pipeline - allocate new tensor"); - _levelZeroOutputTensors.at(outputIndex) = - allocate_tensor(_metadata.outputs.at(outputIndex), outputIndex, OUTPUT, *_outputAllocator, _batchSize); + _levelZeroOutputTensors.at(outputIndex) = allocate_tensor(_metadata.outputs.at(outputIndex), + outputIndex, + OUTPUT, + *_outputAllocator, + _graph->get_batch_size()); _outputTensorsData.at(outputIndex) = std::optional(TensorData{_levelZeroOutputTensors.at(outputIndex)->data(), _levelZeroOutputTensors.at(outputIndex)->get_byte_size()}); } // Find the corresponding command queue group. - _logger.debug("ZeroDevice::ZeroDevice - findGroupOrdinal"); + _logger.debug("ZeroInferRequest::create_pipeline - findGroupOrdinal"); auto groupOrdinal = zeroUtils::findGroupOrdinal(_initStructs->getDevice(), _properties); - _logger.debug("ZeroDevice::ZeroDevice - init completed"); + _logger.debug("ZeroInferRequest::create_pipeline - init completed"); _logger.debug("ZeroInferRequest::create_pipeline - constructing pipeline"); @@ -286,7 +226,6 @@ void ZeroInferRequest::create_pipeline() { _npuProfiling, _inputTensorsData, _outputTensorsData, - _numberOfCommandLists, groupOrdinal); _logger.debug("ZeroInferRequest::create_pipeline - SyncInferRequest completed"); @@ -321,7 +260,7 @@ void ZeroInferRequest::set_tensor_data(const std::shared_ptr& tenso index, isInput, isInput ? *_inputAllocator : *_outputAllocator, - _batchSize); + _graph->get_batch_size()); setTensorData = true; levelZeroTensorCreatedLocally = true; @@ -444,7 +383,7 @@ void ZeroInferRequest::set_tensors(const ov::Output& port, get_user_inputs(foundPort.idx) = tensors; if (_initStructs->getMutableCommandListVersion()) { - if (_batchSize.has_value()) { + if (_graph->get_batch_size().has_value()) { for (size_t i = 0; i < tensors.size(); i++) { auto remoteTensor = std::dynamic_pointer_cast(tensors[i]._ptr); @@ -525,13 +464,17 @@ ov::SoPtr ZeroInferRequest::get_tensor(const ov::Outputget_batch_size()); tensorsData = std::optional(TensorData{levelZeroTensors->data(), levelZeroTensors->get_byte_size()}); return levelZeroTensors; } void ZeroInferRequest::infer() { + if (_config.get()) { + OPENVINO_THROW("Only start async is supported when RUN_INFERENCES_SEQUENTIALLY is enabled!"); + } + infer_async(); get_result(); } @@ -567,7 +510,7 @@ void ZeroInferRequest::infer_async() { } if (is_batched_input(inputIndex)) { - if (_batchSize.has_value()) { + if (_graph->get_batch_size().has_value()) { for (size_t i = 0; i < userTensor.size(); i++) { auto levelZeroBatchRemoteTensor = std::dynamic_pointer_cast(get_level_zero_input(inputIndex, i)); diff --git a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp index c782c3e0684f0d..d7f06b813810bb 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp @@ -8,6 +8,7 @@ #include #include "intel_npu/common/itt.hpp" +#include "intel_npu/config/runtime.hpp" #include "intel_npu/prefix.hpp" #include "intel_npu/utils/logger/logger.hpp" #include "intel_npu/utils/zero/zero_api.hpp" @@ -23,13 +24,15 @@ Pipeline::Pipeline(const Config& config, const std::shared_ptr& npu_profiling, const std::vector>>& inputTensorsData, const std::vector>& outputTensorsData, - size_t numberOfCommandLists, uint32_t group_ordinal) - : _config(config), - 
_command_queue(graph->get_command_queue()), - _event_pool{initStructs->getDevice(), - initStructs->getContext(), - numberOfCommandLists ? static_cast(numberOfCommandLists) : 1}, + : _graph(graph), + _config(config), + _id(_graph->get_unique_id()), + _number_of_command_lists(_graph->get_batch_size().has_value() ? *_graph->get_batch_size() : 1), + _event_pool{ + std::make_shared(initStructs->getDevice(), + initStructs->getContext(), + _number_of_command_lists ? static_cast(_number_of_command_lists) : 1)}, _npu_profiling(npu_profiling), _logger("Pipeline", _config.get()) { OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "Zero_infer_request::Pipeline::Pipeline"); @@ -39,20 +42,20 @@ Pipeline::Pipeline(const Config& config, profiling_query.create(profiling_pool._handle); } - _command_lists.reserve(numberOfCommandLists); - _events.reserve(numberOfCommandLists); - _fences.reserve(numberOfCommandLists); + _command_lists.reserve(_number_of_command_lists); + _events.reserve(_number_of_command_lists); + _fences.reserve(_number_of_command_lists); _logger.debug("Pipeline - emplace_back _event_pool and _command_queue"); - for (size_t i = 0; i < numberOfCommandLists; i++) { + for (size_t i = 0; i < _number_of_command_lists; i++) { _command_lists.emplace_back( std::make_unique(initStructs, group_ordinal, initStructs->getMutableCommandListVersion() ? true : false)); - _events.emplace_back(std::make_unique(_event_pool.handle(), static_cast(i))); - _fences.emplace_back(std::make_unique(*_command_queue)); + _events.emplace_back(std::make_shared(_event_pool, static_cast(i))); + _fences.emplace_back(std::make_unique(*_graph->get_command_queue())); } - for (size_t i = 0; i < numberOfCommandLists; i++) { + for (size_t i = 0; i < _number_of_command_lists; i++) { size_t ioIndex = 0; for (const auto& desc : graph->get_input_descriptors()) { if (inputTensorsData.at(ioIndex).size() > 1) { @@ -64,7 +67,7 @@ Pipeline::Pipeline(const Config& config, graph->set_argument_value(desc.idx, static_cast(inputTensorsData.at(ioIndex).at(0)->mem) + - (i * inputTensorsData.at(ioIndex).at(0)->size) / numberOfCommandLists); + (i * inputTensorsData.at(ioIndex).at(0)->size) / _number_of_command_lists); ++ioIndex; } @@ -73,10 +76,16 @@ Pipeline::Pipeline(const Config& config, for (const auto& desc : graph->get_output_descriptors()) { graph->set_argument_value(desc.idx, static_cast(outputTensorsData.at(ioIndex)->mem) + - (i * outputTensorsData.at(ioIndex)->size) / numberOfCommandLists); + (i * outputTensorsData.at(ioIndex)->size) / _number_of_command_lists); ++ioIndex; } + if (_config.get()) { + if (_graph->get_last_submitted_event(i)) { + _graph->get_last_submitted_event(i)->AppendWaitOnEvent(*_command_lists.at(i)); + } + } + /// append timestamp command if feature was activated if (_npu_profiling != nullptr) { _command_lists.at(i)->appendBarrier(); @@ -92,6 +101,15 @@ Pipeline::Pipeline(const Config& config, _command_lists.at(i)->appendNpuTimestamp(reinterpret_cast(_npu_profiling->npu_ts_infer_end)); } + if (_config.get()) { + if (_graph->get_last_submitted_event(i)) { + _graph->get_last_submitted_event(i)->AppendEventReset(*_command_lists.at(i)); + } + + _events.at(i)->AppendSignalEvent(*_command_lists.at(i)); + _graph->set_last_submitted_event(_events.at(i), i); + } + // appendBarrier used in L0 as well if (!sync_output_with_fences_) { _command_lists.at(i)->appendBarrier(); @@ -105,12 +123,24 @@ Pipeline::Pipeline(const Config& config, void Pipeline::push() { _logger.debug("Pipeline - push() started"); + if (_config.get()) { + if 
(_id) { + auto previousIndex = _graph->get_last_submitted_id(); + + if (_id != ++previousIndex) { + OPENVINO_THROW("Inferences should be called in the same order they were called the first time!"); + } + } + + _graph->set_last_submitted_id(_id); + } + for (size_t i = 0; i < _command_lists.size(); ++i) { OV_ITT_TASK_CHAIN(ZERO_PIPELINE_IP_PUSH, itt::domains::LevelZeroBackend, "Pipeline", "push"); if (sync_output_with_fences_) { - _command_queue->executeCommandList(*_command_lists.at(i), *_fences.at(i)); + _graph->get_command_queue()->executeCommandList(*_command_lists.at(i), *_fences.at(i)); } else { - _command_queue->executeCommandList(*_command_lists.at(i)); + _graph->get_command_queue()->executeCommandList(*_command_lists.at(i)); } } @@ -154,12 +184,12 @@ void Pipeline::updateCommandList(const TensorData& tensorsData, uint32_t index) OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_UMCL, itt::domains::LevelZeroBackend, "Pipeline", "updateCommandList"); _logger.debug("Pipeline - updateCommandList"); - const size_t numberOfCommandLists = _command_lists.size(); + const size_t _number_of_command_lists = _command_lists.size(); - for (size_t i = 0; i < numberOfCommandLists; i++) { + for (size_t i = 0; i < _number_of_command_lists; i++) { _command_lists.at(i)->updateMutableCommandList( index, - static_cast(tensorsData.mem) + (i * tensorsData.size) / numberOfCommandLists); + static_cast(tensorsData.mem) + (i * tensorsData.size) / _number_of_command_lists); _command_lists.at(i)->close(); } }; @@ -168,9 +198,9 @@ void Pipeline::updateCommandList(const TensorData& tensorsData, uint32_t index, OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_UMCL, itt::domains::LevelZeroBackend, "Pipeline", "updateCommandList"); _logger.debug("Pipeline - updateCommandList"); - const size_t numberOfCommandLists = _command_lists.size(); + const size_t _number_of_command_lists = _command_lists.size(); - OPENVINO_ASSERT(commandListIndex < numberOfCommandLists, + OPENVINO_ASSERT(commandListIndex < _number_of_command_lists, "Command list index is higgher than the number of Command lists ", commandListIndex); diff --git a/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp b/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp index 51c4a4cf26eafd..7e718d9172f4f7 100644 --- a/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp +++ b/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp @@ -9,6 +9,7 @@ #include #include "intel_npu/network_metadata.hpp" +#include "intel_npu/utils/zero/zero_init.hpp" #include "intel_npu/utils/zero/zero_utils.hpp" #include "intel_npu/utils/zero/zero_wrappers.hpp" #include "openvino/runtime/profiling_info.hpp" @@ -17,13 +18,10 @@ namespace intel_npu { class IGraph : public std::enable_shared_from_this { public: - IGraph(ze_graph_handle_t handle, NetworkMetadata metadata, std::optional> blob) - : _handle(handle), - _metadata(std::move(metadata)) { - if (blob.has_value()) { - _blob = std::move(*blob); - } - } + IGraph(ze_graph_handle_t handle, + NetworkMetadata metadata, + const Config& config, + std::optional> blob); virtual void export_blob(std::ostream& stream) const = 0; @@ -36,55 +34,48 @@ class IGraph : public std::enable_shared_from_this { virtual ~IGraph() = default; - const NetworkMetadata& get_metadata() const { - return _metadata; - } - - ze_graph_handle_t get_handle() const { - return _handle; - } - - void update_network_name(std::string_view name) { - _metadata.name = name; - } - - inline const std::vector& get_input_descriptors() const { - return 
_input_descriptors; - } - - inline const std::vector& get_output_descriptors() const { - return _output_descriptors; - } - - inline const std::shared_ptr& get_command_queue() const { - return _command_queue; - } - - void set_workload_type(const ov::WorkloadType workloadType) const { - if (_command_queue == nullptr) { - return; - } - - ze_command_queue_workload_type_t zeWorkloadType; - switch (workloadType) { - case ov::WorkloadType::DEFAULT: - zeWorkloadType = ze_command_queue_workload_type_t::ZE_WORKLOAD_TYPE_DEFAULT; - break; - case ov::WorkloadType::EFFICIENT: - zeWorkloadType = ze_command_queue_workload_type_t::ZE_WORKLOAD_TYPE_BACKGROUND; - break; - default: - OPENVINO_THROW("Unknown value for WorkloadType!"); - } - - _command_queue->setWorkloadType(zeWorkloadType); - } - - std::mutex& get_mutex() { - return _mutex; - } + const NetworkMetadata& get_metadata() const; + ze_graph_handle_t get_handle() const; + + void update_network_name(std::string_view name); + + const std::vector& get_input_descriptors() const; + const std::vector& get_output_descriptors() const; + const std::shared_ptr& get_command_queue() const; + + void set_workload_type(const ov::WorkloadType workloadType) const; + + std::mutex& get_mutex(); + + void set_last_submitted_event(const std::shared_ptr& event, size_t indexOfCommandList); + const std::shared_ptr& get_last_submitted_event(size_t indexOfCommandList) const; + + uint32_t get_unique_id(); + void set_last_submitted_id(uint32_t id_index); + const uint32_t get_last_submitted_id() const; + + const std::optional get_batch_size() const; protected: + /** + * @brief Determines if batching can be addressed inside the plugin. In the positive case, the batch size used by + * the model will also be deduced and returned. + * @details Batching can be handled by the plugin only if: + * - The batch axis is the first axis. + * - The batch size received by the compiler takes the default value of 1. + * - The batch size found in the IR model matches for all inputs/outputs and takes a value different than the + * default one. + * + * If any of the previous conditions is not fulfilled, the functon will return the default batch size, thus no + * custom algorithm will be applied inside the plugin in order to address batching. + * + * @param metadata Metadata containing the shape values as seen by both the compiler and IR model. These will + * ultimately be used for determining the batch size. + * @returns The batch size deduced by the algorithm or the default value of 1 if batching cannot be performed inside + * the plugin. + */ + std::optional get_batch_size(const NetworkMetadata& metadata); + ze_graph_handle_t _handle = nullptr; NetworkMetadata _metadata; @@ -92,12 +83,24 @@ class IGraph : public std::enable_shared_from_this { std::vector _output_descriptors; std::shared_ptr _command_queue; + std::vector> _last_submitted_event; // Used to protect zero pipeline creation in the graph. The pipeline should be created only once per graph when the // first inference starts running std::mutex _mutex; std::vector _blob; + + uint32_t _unique_id = 0; + uint32_t _last_submitted_id; + + /** + * @brief The batch size used by the corresponding model. + * @details The attribute contains a value only if the plugin performs the batches splitting operation. 
+ */ + std::optional _batch_size = std::nullopt; + + Logger _logger; }; } // namespace intel_npu diff --git a/src/plugins/intel_npu/src/common/src/igraph.cpp b/src/plugins/intel_npu/src/common/src/igraph.cpp new file mode 100644 index 00000000000000..fd5463af5eea3e --- /dev/null +++ b/src/plugins/intel_npu/src/common/src/igraph.cpp @@ -0,0 +1,159 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "intel_npu/common/igraph.hpp" + +#include "intel_npu/config/compiler.hpp" +#include "intel_npu/config/runtime.hpp" + +namespace { +constexpr std::size_t BATCH_AXIS = 0; +constexpr std::size_t DEFAULT_BATCH_SIZE = 1; +} // namespace + +namespace intel_npu { + +IGraph::IGraph(ze_graph_handle_t handle, + NetworkMetadata metadata, + const Config& config, + std::optional> blob) + : _handle(handle), + _metadata(std::move(metadata)), + _logger("IGraph", config.get()) { + if (blob.has_value()) { + _blob = std::move(*blob); + } +} + +const NetworkMetadata& IGraph::get_metadata() const { + return _metadata; +} + +ze_graph_handle_t IGraph::get_handle() const { + return _handle; +} + +void IGraph::update_network_name(std::string_view name) { + _metadata.name = name; +} + +const std::vector& IGraph::get_input_descriptors() const { + return _input_descriptors; +} + +const std::vector& IGraph::get_output_descriptors() const { + return _output_descriptors; +} + +const std::shared_ptr& IGraph::get_command_queue() const { + return _command_queue; +} + +void IGraph::set_workload_type(const ov::WorkloadType workloadType) const { + if (_command_queue == nullptr) { + return; + } + + ze_command_queue_workload_type_t zeWorkloadType; + switch (workloadType) { + case ov::WorkloadType::DEFAULT: + zeWorkloadType = ze_command_queue_workload_type_t::ZE_WORKLOAD_TYPE_DEFAULT; + break; + case ov::WorkloadType::EFFICIENT: + zeWorkloadType = ze_command_queue_workload_type_t::ZE_WORKLOAD_TYPE_BACKGROUND; + break; + default: + OPENVINO_THROW("Unknown value for WorkloadType!"); + } + + _command_queue->setWorkloadType(zeWorkloadType); +} + +std::mutex& IGraph::get_mutex() { + return _mutex; +} + +void IGraph::set_last_submitted_event(const std::shared_ptr& event, size_t indexOfCommandList) { + _last_submitted_event[indexOfCommandList] = event; +} + +const std::shared_ptr& IGraph::get_last_submitted_event(size_t indexOfCommandList) const { + return _last_submitted_event[indexOfCommandList]; +} + +uint32_t IGraph::get_unique_id() { + return _unique_id++; +} + +void IGraph::set_last_submitted_id(uint32_t id_index) { + _last_submitted_id = id_index; +} + +const uint32_t IGraph::get_last_submitted_id() const { + return _last_submitted_id; +} + +std::optional IGraph::get_batch_size(const NetworkMetadata& metadata) { + if (!metadata.outputs.at(0).shapeFromIRModel.has_value()) { + _logger.debug("Batching on the plugin is not used, batching is handled by the compiler"); + return std::nullopt; + } + + const ov::PartialShape& firstOutputShape = *metadata.outputs.at(0).shapeFromIRModel; + if (firstOutputShape.is_dynamic()) { + _logger.warning("Networks using dynamic shapes are not supported when batching is handled by the plugin"); + return std::nullopt; + } + if (firstOutputShape.rank().get_length() == 0) { + _logger.warning("Networks using rank 0 shapes for inputs/outputs are not supported when batching is " + "handled by the plugin"); + return std::nullopt; + } + + const size_t candidateBatchSize = firstOutputShape[BATCH_AXIS].get_length(); + if (candidateBatchSize == 0 || 
candidateBatchSize == DEFAULT_BATCH_SIZE) { + _logger.debug("Batching on the plugin is not used, batching is handled by the compiler"); + return std::nullopt; + } + + auto checkDescriptorsUseCandidateBatchSize = [candidateBatchSize](const std::vector& descriptors) { + for (const IODescriptor& descriptor : descriptors) { + OPENVINO_ASSERT(descriptor.shapeFromIRModel.has_value(), + "Missing value for the \"shapeFromIRModel\" attribute, I/O descriptor"); + + const ov::PartialShape& shapeFromCompiler = descriptor.shapeFromCompiler; + const ov::PartialShape& shapeFromIRModel = *descriptor.shapeFromIRModel; + + if (shapeFromCompiler.is_dynamic() || shapeFromCompiler.rank().get_length() == 0 || + *shapeFromCompiler.begin() != DEFAULT_BATCH_SIZE) { + return false; + } + + if (!descriptor.isStateInput && !descriptor.isStateOutput && !descriptor.isShapeTensor) { + if (shapeFromIRModel.is_dynamic() || shapeFromIRModel.rank().get_length() == 0 || + *shapeFromIRModel.begin() != candidateBatchSize) { + return false; + } + } + } + + return true; + }; + + if (!checkDescriptorsUseCandidateBatchSize(metadata.inputs) || + !checkDescriptorsUseCandidateBatchSize(metadata.outputs)) { + _logger.debug("Batching on the plugin is not used, batching is handled by the compiler"); + return std::nullopt; + } + + _logger.debug("Batching is handled by the plugin"); + + return candidateBatchSize; +} + +const std::optional IGraph::get_batch_size() const { + return _batch_size; +} + +} // namespace intel_npu diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp index f819ed73711cf2..9d634656db109a 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp +++ b/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp @@ -541,13 +541,21 @@ std::string DriverCompilerAdapter::serializeConfig(const Config& config, content = std::regex_replace(content, std::regex(batchstr.str()), ""); } - // NPU_DEFER_WEIGHTS_LOAD is not supported in versions < 6.2 - need to remove it - if ((compilerVersion.major < 6) || (compilerVersion.major == 6 && compilerVersion.minor < 2)) { + // NPU_DEFER_WEIGHTS_LOAD is needed at runtime only + { std::ostringstream batchstr; batchstr << ov::intel_npu::defer_weights_load.name() << KEY_VALUE_SEPARATOR << VALUE_DELIMITER << "\\S+" << VALUE_DELIMITER; - logger.warning( - "NPU_DEFER_WEIGHTS_LOAD property is not suppored by this compiler version. Removing from parameters"); + logger.info("NPU_DEFER_WEIGHTS_LOAD property is needed at runtime only. Removing from parameters"); + content = std::regex_replace(content, std::regex(batchstr.str()), ""); + } + + // NPU_RUN_INFERENCES_SEQUENTIALLY is needed at runtime only + { + std::ostringstream batchstr; + batchstr << ov::intel_npu::run_inferences_sequentially.name() << KEY_VALUE_SEPARATOR << VALUE_DELIMITER + << "\\S+" << VALUE_DELIMITER; + logger.info("NPU_RUN_INFERENCES_SEQUENTIALLY property is needed at runtime only. 
Removing from parameters"); content = std::regex_replace(content, std::regex(batchstr.str()), ""); } diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp index e1f3990b835e8d..0d180f983ad3a9 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp +++ b/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp @@ -16,7 +16,7 @@ DriverGraph::DriverGraph(const std::shared_ptr& zeGraphExt, NetworkMetadata metadata, const Config& config, std::optional> blob) - : IGraph(graphHandle, std::move(metadata), std::move(blob)), + : IGraph(graphHandle, std::move(metadata), config, std::move(blob)), _zeGraphExt(zeGraphExt), _zeroInitStruct(zeroInitStruct), _logger("DriverGraph", config.get()) { @@ -126,6 +126,16 @@ void DriverGraph::initialize(const Config& config) { // _zeGraphExt->initializeGraph(). The driver will not access the original blob from this moment on, so we are // releasing it here to avoid unnecessary memory usage. _blobIsReleased = release_blob(config); + + if (config.get() != ov::intel_npu::BatchMode::COMPILER) { + _batch_size = get_batch_size(_metadata); + } + + if (config.get()) { + auto number_of_command_lists = _batch_size.has_value() ? *_batch_size : 1; + + _last_submitted_event.resize(number_of_command_lists); + } } bool DriverGraph::release_blob(const Config& config) { diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp index c99069a0a9760f..b1658e7e0582e0 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp +++ b/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp @@ -17,7 +17,7 @@ PluginGraph::PluginGraph(const std::shared_ptr& zeGraphExt, NetworkMetadata metadata, std::vector blob, const Config& config) - : IGraph(graphHandle, std::move(metadata), std::optional>(std::move(blob))), + : IGraph(graphHandle, std::move(metadata), config, std::optional>(std::move(blob))), _zeGraphExt(zeGraphExt), _zeroInitStruct(zeroInitStruct), _compiler(compiler), @@ -115,6 +115,16 @@ void PluginGraph::initialize(const Config& config) { _zeGraphExt->initializeGraph(_handle, config); + if (config.get() != ov::intel_npu::BatchMode::COMPILER) { + _batch_size = get_batch_size(_metadata); + } + + if (config.get()) { + auto number_of_command_lists = _batch_size.has_value() ? 
*_batch_size : 1; + + _last_submitted_event.resize(number_of_command_lists); + } + _logger.debug("Graph initialize finish"); } diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index c6be2793fe6f70..be61fa4de081a6 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -668,6 +668,10 @@ ov::SoPtr ov::npuw::CompiledModel::compile_submodel(const st // NOTE(dm): Not sure if it is required for the NPUW plugin, but likely it is auto& device_config = m_meta_devices[device]; + if (ov::npuw::util::starts_with(device, "NPU") && m_cfg.get<::intel_npu::NPUW_UNFOLD_IREQS>()) { + device_config["NPU_RUN_INFERENCES_SEQUENTIALLY"] = "YES"; + } + const auto& cache_dir = m_cfg.get<::intel_npu::NPUW_CACHE_DIR>(); if (!cache_dir.empty()) { LOG_INFO("NPUW will try to utilize CACHE_DIR for " << submodel->get_friendly_name() << " submodel."); diff --git a/src/plugins/intel_npu/src/plugin/src/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/src/compiled_model.cpp index 4baf15d76718a8..4e86d32d2f72b1 100644 --- a/src/plugins/intel_npu/src/plugin/src/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/src/compiled_model.cpp @@ -311,6 +311,12 @@ void CompiledModel::initialize_properties() { [](const Config& config) { return config.getString(); }}}, + {ov::intel_npu::run_inferences_sequentially.name(), + {false, + ov::PropertyMutability::RO, + [](const Config& config) { + return config.get(); + }}}, }; for (auto& property : _properties) { diff --git a/src/plugins/intel_npu/src/plugin/src/plugin.cpp b/src/plugins/intel_npu/src/plugin/src/plugin.cpp index 9f77d952fd813b..18a96bff02fb80 100644 --- a/src/plugins/intel_npu/src/plugin/src/plugin.cpp +++ b/src/plugins/intel_npu/src/plugin/src/plugin.cpp @@ -568,6 +568,12 @@ Plugin::Plugin() [](const Config& config) { return config.getString(); }}}, + {ov::intel_npu::run_inferences_sequentially.name(), + {false, + ov::PropertyMutability::RW, + [](const Config& config) { + return config.get(); + }}}, {ov::intel_npu::batch_mode.name(), {false, ov::PropertyMutability::RW, [](const Config& config) { return config.getString(); }}}}; diff --git a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_utils.hpp b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_utils.hpp index 8883bb99dd178e..0df0c5d66169a4 100644 --- a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_utils.hpp +++ b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_utils.hpp @@ -188,7 +188,7 @@ static inline uint32_t findGroupOrdinal(ze_device_handle_t device_handle, const "zeDeviceGetCommandQueueGroupProperties", zeDeviceGetCommandQueueGroupProperties(device_handle, &command_queue_group_count, nullptr)); - log.debug("ZeroDevice::ZeroDevice - resize command_queue_group_count"); + log.debug("zero_utils::findGroupOrdinal - resize command_queue_group_count"); command_group_properties.resize(command_queue_group_count); for (auto& prop : command_group_properties) { diff --git a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_wrappers.hpp b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_wrappers.hpp index 9b5b1b4540fbe7..61999376680e90 100644 --- a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_wrappers.hpp +++ b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_wrappers.hpp @@ -37,7 +37,7 @@ class EventPool { class Event 
{ public: Event() = delete; - Event(const ze_event_pool_handle_t& event_pool, uint32_t event_index); + Event(const std::shared_ptr& event_pool, uint32_t event_index); Event(const Event&) = delete; Event(Event&&) = delete; Event& operator=(const Event&) = delete; @@ -51,6 +51,7 @@ class Event { ~Event(); private: + std::shared_ptr _event_pool; ze_event_handle_t _handle = nullptr; Logger _log; diff --git a/src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp b/src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp index 858e65d4b5e6ee..d95b0e172a7d64 100644 --- a/src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp +++ b/src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp @@ -24,9 +24,11 @@ EventPool::~EventPool() { } } -Event::Event(const ze_event_pool_handle_t& event_pool, uint32_t event_index) : _log("Event", Logger::global().level()) { +Event::Event(const std::shared_ptr& event_pool, uint32_t event_index) + : _event_pool(event_pool), + _log("Event", Logger::global().level()) { ze_event_desc_t event_desc = {ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr, event_index, 0, 0}; - THROW_ON_FAIL_FOR_LEVELZERO("zeEventCreate", zeEventCreate(event_pool, &event_desc, &_handle)); + THROW_ON_FAIL_FOR_LEVELZERO("zeEventCreate", zeEventCreate(_event_pool->handle(), &event_desc, &_handle)); } void Event::AppendSignalEvent(CommandList& command_list) const { THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListAppendSignalEvent", diff --git a/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.cpp b/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.cpp index 5d023fe9d0bee6..e4a49ce9b7ccdb 100644 --- a/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.cpp +++ b/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.cpp @@ -19,6 +19,12 @@ INSTANTIATE_TEST_SUITE_P(compatibility_smoke_BehaviorTest, ::testing::ValuesIn(configsInferRequestRunTests)), InferRequestRunTests::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTest, + RunSeqTests, + ::testing::Combine(::testing::Values(ov::test::utils::DEVICE_NPU), + ::testing::ValuesIn(configsInferRequestRunTests)), + InferRequestRunTests::getTestCaseName); + const std::vector batchingConfigs = { {ov::log::level(ov::log::Level::WARNING), ov::intel_npu::batch_mode(ov::intel_npu::BatchMode::PLUGIN)}, {ov::log::level(ov::log::Level::WARNING), ov::intel_npu::batch_mode(ov::intel_npu::BatchMode::COMPILER)}, @@ -29,3 +35,9 @@ INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTest, ::testing::Combine(::testing::Values(ov::test::utils::DEVICE_NPU), ::testing::ValuesIn(batchingConfigs)), InferRequestRunTests::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTest, + BatchingRunSeqTests, + ::testing::Combine(::testing::Values(ov::test::utils::DEVICE_NPU), + ::testing::ValuesIn(batchingConfigs)), + InferRequestRunTests::getTestCaseName); diff --git a/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp b/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp index 20be5ed25edd27..07466677b9d547 100644 --- a/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp +++ b/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp @@ -103,9 +103,7 @@ class InferRequestRunTests : public ov::test::behavior::OVPluginTestBase, APIBaseTest::TearDown(); } - std::shared_ptr createBatchingModel(element::Type type, - const PartialShape& shape, - const ov::Layout& layout) { + std::shared_ptr createModel(element::Type type, const PartialShape& shape, const ov::Layout& 
layout) { ResultVector res; ParameterVector params; @@ -352,7 +350,7 @@ TEST_P(BatchingRunTests, CheckBatchingSupportInfer) { ov::InferRequest inference_request; auto batch_shape = Shape{4, 2, 32, 32}; - std::shared_ptr ov_model_batch = createBatchingModel(element::f32, batch_shape, "N..."); + std::shared_ptr ov_model_batch = createModel(element::f32, batch_shape, "N..."); OV_ASSERT_NO_THROW(compiled_model = core->compile_model(ov_model_batch, target_device, configuration)); OV_ASSERT_NO_THROW(inference_request = compiled_model.create_infer_request()); @@ -365,7 +363,7 @@ TEST_P(BatchingRunTests, CheckBatchingSupportAsync) { ov::InferRequest inference_request; auto batch_shape = Shape{4, 2, 32, 32}; - std::shared_ptr ov_model_batch = createBatchingModel(element::f32, batch_shape, "N..."); + std::shared_ptr ov_model_batch = createModel(element::f32, batch_shape, "N..."); OV_ASSERT_NO_THROW(compiled_model = core->compile_model(ov_model_batch, target_device, configuration)); OV_ASSERT_NO_THROW(inference_request = compiled_model.create_infer_request()); @@ -396,7 +394,7 @@ TEST_P(BatchingRunTests, UseCompilerBatchingErrorPluginBatching) { TEST_P(BatchingRunTests, SetInputTensorInfer) { auto batch_shape = Shape{4, 2, 2, 2}; auto shape_size = ov::shape_size(batch_shape); - auto model = createBatchingModel(element::f32, batch_shape, "N..."); + auto model = createModel(element::f32, batch_shape, "N..."); float* buffer = new float[shape_size]; compiled_model = core->compile_model(model, target_device, configuration); @@ -422,7 +420,7 @@ TEST_P(BatchingRunTests, SetInputTensorInfer) { TEST_P(BatchingRunTests, SetInputTensorAsync) { auto batch_shape = Shape{4, 2, 2, 2}; auto shape_size = ov::shape_size(batch_shape); - auto model = createBatchingModel(element::f32, batch_shape, "N..."); + auto model = createModel(element::f32, batch_shape, "N..."); float* buffer = new float[shape_size]; compiled_model = core->compile_model(model, target_device, configuration); @@ -449,7 +447,7 @@ TEST_P(BatchingRunTests, SetInputTensorAsync) { TEST_P(BatchingRunTests, SetInputTensorInfer_Caching) { auto batch_shape = Shape{4, 2, 2, 2}; auto shape_size = ov::shape_size(batch_shape); - auto model = createBatchingModel(element::f32, batch_shape, "N..."); + auto model = createModel(element::f32, batch_shape, "N..."); float* buffer = new float[shape_size]; m_cache_dir = generateCacheDirName(GetTestName()); @@ -480,7 +478,7 @@ TEST_P(BatchingRunTests, SetInputTensorInfer_Caching) { TEST_P(BatchingRunTests, CheckTwoRunsInfer) { auto batch_shape = Shape{4, 2, 2, 2}; auto shape_size = ov::shape_size(batch_shape); - auto model = createBatchingModel(element::f32, batch_shape, "N..."); + auto model = createModel(element::f32, batch_shape, "N..."); float* buffer = new float[shape_size]; auto context = core->get_default_context(target_device); @@ -524,6 +522,250 @@ TEST_P(BatchingRunTests, CheckTwoRunsInfer) { delete[] buffer; } +using RunSeqTests = InferRequestRunTests; + +TEST_P(RunSeqTests, CheckMultipleRunsSeq0) { + auto shape = Shape{1, 64, 64, 256}; + auto shape_size = ov::shape_size(shape); + auto model = createModel(element::f32, shape, "N..."); + + auto context = core->get_default_context(target_device); + + configuration[ov::intel_npu::run_inferences_sequentially.name()] = true; + configuration[ov::intel_npu::tiles.name()] = 2; + compiled_model = core->compile_model(model, target_device, configuration); + + const uint32_t inferences = 32; + std::array inference_request; + ov::Tensor input_tensor; + std::array 
output_tensor; + + input_tensor = context.create_host_tensor(ov::element::f32, shape); + for (uint32_t i = 0; i < inferences; i++) { + inference_request[i] = compiled_model.create_infer_request(); + output_tensor[i] = context.create_host_tensor(ov::element::f32, shape); + } + + inference_request[0].set_input_tensor(input_tensor); + inference_request[0].set_output_tensor(output_tensor[0]); + + const uint32_t runs = 10; + for (uint32_t z = 0; z < runs; z++) { + auto* input_data = reinterpret_cast(input_tensor.data()); + for (size_t i = 0; i < shape_size; ++i) { + input_data[i] = static_cast(z); + } + + inference_request[0].start_async(); // Adds '1' to each element + + for (uint32_t i = 1; i < inferences; i++) { + inference_request[i].set_input_tensor(output_tensor[i - 1]); + inference_request[i].set_output_tensor(output_tensor[i]); + + inference_request[i].start_async(); // Adds '1' to each element + } + + inference_request[inferences - 1].wait(); + + float expected_result = static_cast(z) + 1.f; + + for (uint32_t i = 0; i < inferences; i++) { + auto* output_tensor_data = reinterpret_cast(output_tensor[i].data()); + for (size_t j = 0; j < shape_size; ++j) { + EXPECT_NEAR(output_tensor_data[j], expected_result, 1e-5) + << "Run=" << z << "Output=" << i << " Expected=" << expected_result + << ", actual=" << output_tensor_data[j] << " for index " << j; + } + expected_result++; + } + } +} + +TEST_P(RunSeqTests, CheckMultipleRunsSeq1) { + auto shape = Shape{1, 64, 64, 256}; + auto shape_size = ov::shape_size(shape); + auto model = createModel(element::f32, shape, "N..."); + + auto context = core->get_default_context(target_device); + + configuration[ov::intel_npu::run_inferences_sequentially.name()] = true; + configuration[ov::intel_npu::tiles.name()] = 2; + compiled_model = core->compile_model(model, target_device, configuration); + + const int inferences = 32; + std::array inference_request; + ov::Tensor input_tensor; + std::array output_tensor; + + input_tensor = context.create_host_tensor(ov::element::f32, shape); + + for (int i = 0; i < inferences; i++) { + inference_request[i] = compiled_model.create_infer_request(); + output_tensor[i] = context.create_host_tensor(ov::element::f32, shape); + } + + inference_request[inferences - 1].set_input_tensor(input_tensor); + inference_request[inferences - 1].set_output_tensor(output_tensor[inferences - 1]); + + const int runs = 10; + for (int z = 0; z < runs; z++) { + auto* input_data = reinterpret_cast(input_tensor.data()); + for (size_t i = 0; i < shape_size; ++i) { + input_data[i] = static_cast(z); + } + + inference_request[inferences - 1].start_async(); // Adds '1' to each element + + for (int i = inferences - 2; i >= 0; i--) { + inference_request[i].set_input_tensor(output_tensor[i + 1]); + inference_request[i].set_output_tensor(output_tensor[i]); + + inference_request[i].start_async(); // Adds '1' to each element + } + + inference_request[0].wait(); + + float expected_result = static_cast(z) + 1.f; + + for (int i = inferences - 1; i >= 0; i--) { + auto* output_tensor_data = reinterpret_cast(output_tensor[i].data()); + for (size_t j = 0; j < shape_size; ++j) { + EXPECT_NEAR(output_tensor_data[j], expected_result, 1e-5) + << "Run=" << z << "Output=" << i << " Expected=" << expected_result + << ", actual=" << output_tensor_data[j] << " for index " << j; + } + expected_result++; + } + } +} + +TEST_P(RunSeqTests, CheckMultipleRunsSeq2) { + auto shape = Shape{1, 64, 64, 256}; + auto shape_size = ov::shape_size(shape); + auto model = 
createModel(element::f32, shape, "N..."); + + auto context = core->get_default_context(target_device); + + configuration[ov::intel_npu::run_inferences_sequentially.name()] = true; + configuration[ov::intel_npu::tiles.name()] = 2; + compiled_model = core->compile_model(model, target_device, configuration); + + const int inferences = 32; + std::array inference_request; + ov::Tensor input_tensor; + std::array output_tensor; + + input_tensor = context.create_host_tensor(ov::element::f32, shape); + + for (int i = 0; i < inferences; i++) { + inference_request[i] = compiled_model.create_infer_request(); + output_tensor[i] = context.create_host_tensor(ov::element::f32, shape); + } + + inference_request[inferences - 1].set_input_tensor(input_tensor); + inference_request[inferences - 1].set_output_tensor(output_tensor[inferences - 1]); + + auto* input_data = reinterpret_cast(input_tensor.data()); + for (size_t i = 0; i < shape_size; ++i) { + input_data[i] = 1.f; + } + + inference_request[inferences - 1].start_async(); + + for (int i = inferences - 2; i >= 0; i--) { + inference_request[i].set_input_tensor(output_tensor[i + 1]); + inference_request[i].set_output_tensor(output_tensor[i]); + + inference_request[i].start_async(); + } + + inference_request[0].wait(); + + try { + inference_request[5].start_async(); + inference_request[5].wait(); + } catch (const std::exception& ex) { + ASSERT_FALSE(false) << ex.what(); + return; + } + + ASSERT_FALSE(true) << "Exception is expected but it didn't throw any exception!"; +} + +TEST_P(RunSeqTests, CheckMultipleRunsSeq3) { + auto shape = Shape{1, 64, 64, 256}; + auto model = createModel(element::f32, shape, "N..."); + + configuration[ov::intel_npu::run_inferences_sequentially.name()] = true; + configuration[ov::intel_npu::tiles.name()] = 2; + compiled_model = core->compile_model(model, target_device, configuration); + ov::InferRequest inference_request; + inference_request = compiled_model.create_infer_request(); + + OV_EXPECT_THROW(inference_request.infer(), + ov::Exception, + HasSubstr("Only start async is supported when RUN_INFERENCES_SEQUENTIALLY is enabled!")); +} + +using BatchingRunSeqTests = InferRequestRunTests; + +TEST_P(BatchingRunSeqTests, CheckMultipleBatchingRunsSeq) { + auto shape = Shape{4, 2, 64, 64}; + auto shape_size = ov::shape_size(shape); + auto model = createModel(element::f32, shape, "N..."); + + auto context = core->get_default_context(target_device); + + configuration[ov::intel_npu::run_inferences_sequentially.name()] = true; + configuration[ov::intel_npu::tiles.name()] = 2; + compiled_model = core->compile_model(model, target_device, configuration); + + const uint32_t inferences = 32; + std::array inference_request; + ov::Tensor input_tensor; + std::array output_tensor; + + input_tensor = context.create_host_tensor(ov::element::f32, shape); + for (uint32_t i = 0; i < inferences; i++) { + inference_request[i] = compiled_model.create_infer_request(); + output_tensor[i] = context.create_host_tensor(ov::element::f32, shape); + } + + inference_request[0].set_input_tensor(input_tensor); + inference_request[0].set_output_tensor(output_tensor[0]); + + const uint32_t runs = 10; + for (uint32_t z = 0; z < runs; z++) { + auto* input_data = reinterpret_cast(input_tensor.data()); + for (size_t i = 0; i < shape_size; ++i) { + input_data[i] = static_cast(z); + } + + inference_request[0].start_async(); // Adds '1' to each element + + for (uint32_t i = 1; i < inferences; i++) { + inference_request[i].set_input_tensor(output_tensor[i - 1]); + 
inference_request[i].set_output_tensor(output_tensor[i]); + + inference_request[i].start_async(); // Adds '1' to each element + } + + inference_request[inferences - 1].wait(); + + float expected_result = static_cast(z) + 1.f; + + for (uint32_t i = 0; i < inferences; i++) { + auto* output_tensor_data = reinterpret_cast(output_tensor[i].data()); + for (size_t j = 0; j < shape_size; ++j) { + EXPECT_NEAR(output_tensor_data[j], expected_result, 1e-5) + << "Run=" << z << "Output=" << i << " Expected=" << expected_result + << ", actual=" << output_tensor_data[j] << " for index " << j; + } + expected_result++; + } + } +} + } // namespace behavior } // namespace test } // namespace ov
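
Usage note (not part of the patch): the tests above exercise the feature through host tensors chained output-to-input; the sketch below is a minimal, hedged example of how an application might enable the new NPU_RUN_INFERENCES_SEQUENTIALLY property introduced by this change. The model path, device string, and request count are placeholders; the constraints it reflects (the property must be set at compile time, requests must be submitted with start_async() in the same order on every run, and infer() throws in this mode) come from the patch itself.

```cpp
// Minimal sketch, assuming an OpenVINO build with this patch and an NPU device.
// "model.xml" and the request count are illustrative placeholders.
#include <openvino/openvino.hpp>

#include <vector>

int main() {
    ov::Core core;

    // In-order async execution is requested at compile time; it only applies to
    // infer requests created from this particular compiled model.
    auto compiled_model = core.compile_model("model.xml",
                                             "NPU",
                                             {{"NPU_RUN_INFERENCES_SEQUENTIALLY", true}});

    const size_t request_count = 4;
    std::vector<ov::InferRequest> requests;
    for (size_t i = 0; i < request_count; ++i) {
        requests.push_back(compiled_model.create_infer_request());
    }

    // Submit asynchronously, always in the same order as the first submission.
    // Synchronous infer() would throw while this property is enabled.
    for (auto& request : requests) {
        request.start_async();
    }

    // Waiting on the last request is enough here: each pipeline waits on the
    // event signalled by the previously submitted one.
    requests.back().wait();

    return 0;
}
```

The ordering guarantee in the patch is implemented by chaining Level Zero events on the graph's shared command queue: each new pipeline appends a wait on the event signalled by the previously submitted command list and records its own event as the new tail, while a monotonically increasing per-graph id is used to reject submissions that arrive out of the original order.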