[NPU] In-order async execution (#27696)
### Details:
- To activate it, set the NPU_RUN_INFERENCES_SEQUENTIALLY property to true
when compiling a model (see the sketch below). The feature only applies to
inference requests created from that compiled model.
- Scheduling inferences from different compiled models is not supported.
- On every run, inferences must be started in the same order in which they
were started the first time.
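
A minimal sketch of the intended usage from the application side: the "NPU" device string, the model path, and the request count are illustrative assumptions, while the property key, the async-only restriction, and the fixed-order requirement come from this change.

```cpp
#include <openvino/openvino.hpp>

#include <vector>

int main() {
    ov::Core core;
    const auto model = core.read_model("model.xml");  // illustrative path

    // Opt in to in-order execution for the infer requests of this compiled model.
    auto compiled = core.compile_model(model, "NPU",
                                       ov::AnyMap{{"NPU_RUN_INFERENCES_SEQUENTIALLY", true}});

    // The first sequence of start_async() calls fixes the order; later runs
    // must start the requests in the same order.
    std::vector<ov::InferRequest> requests;
    for (int i = 0; i < 4; ++i) {
        requests.push_back(compiled.create_infer_request());
    }

    for (auto& request : requests) {
        request.start_async();  // the synchronous infer() throws in this mode
    }
    for (auto& request : requests) {
        request.wait();
    }
    return 0;
}
```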

---------

Signed-off-by: Bogdan Pereanu <[email protected]>
Co-authored-by: Dmitry Matveev <[email protected]>
pereanub and dmatveev authored Nov 27, 2024
1 parent a1920c4 commit 79493c2
Showing 20 changed files with 655 additions and 214 deletions.
18 changes: 18 additions & 0 deletions src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp
@@ -270,4 +270,22 @@ struct BYPASS_UMD_CACHING final : OptionBase<BYPASS_UMD_CACHING, bool> {
return OptionMode::RunTime;
}
};

//
// RUN_INFERENCES_SEQUENTIALLY
//
struct RUN_INFERENCES_SEQUENTIALLY final : OptionBase<RUN_INFERENCES_SEQUENTIALLY, bool> {
static std::string_view key() {
return ov::intel_npu::run_inferences_sequentially.name();
}

static bool defaultValue() {
return false;
}

static OptionMode mode() {
return OptionMode::RunTime;
}
};

} // namespace intel_npu
@@ -327,5 +327,14 @@ static constexpr ov::Property<std::string, ov::PropertyMutability::RO> backend_n
*/
static constexpr ov::Property<std::string> backend_compilation_params{"NPU_BACKEND_COMPILATION_PARAMS"};

/**
* @brief [Only for NPU Plugin]
* Type: boolean, default is false.
* This option allows running inferences sequentially, in the order in which they were created.
* @note Experimental property; for now it only works in very specific scenarios. We need driver updates before we can
* implement a robust solution for in-order execution.
*/
static constexpr ov::Property<bool> run_inferences_sequentially{"NPU_RUN_INFERENCES_SEQUENTIALLY"};

} // namespace intel_npu
} // namespace ov
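
For reference, a hedged sketch of passing the new typed property instead of the raw string key; the include path below is an assumption, since this excerpt does not show where the header declaring the property is installed.

```cpp
#include <openvino/openvino.hpp>
// Assumed include path for the header declaring ov::intel_npu::run_inferences_sequentially.
#include "intel_npu/npu_private_properties.hpp"

int main() {
    ov::Core core;
    const auto model = core.read_model("model.xml");  // illustrative path

    // Equivalent to passing {"NPU_RUN_INFERENCES_SEQUENTIALLY", true}.
    auto compiled = core.compile_model(model, "NPU",
                                       ov::intel_npu::run_inferences_sequentially(true));
    return 0;
}
```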
1 change: 1 addition & 0 deletions src/plugins/intel_npu/src/al/src/config/runtime.cpp
@@ -27,6 +27,7 @@ void intel_npu::registerRunTimeOptions(OptionsDesc& desc) {
desc.add<WORKLOAD_TYPE>();
desc.add<TURBO>();
desc.add<BYPASS_UMD_CACHING>();
desc.add<RUN_INFERENCES_SEQUENTIALLY>();
}

// Heuristically obtained number. Varies depending on the values of PLATFORM and PERFORMANCE_HINT
35 changes: 0 additions & 35 deletions src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp
@@ -38,25 +38,6 @@ class ZeroInferRequest final : public SyncInferRequest {
std::vector<ov::ProfilingInfo> get_profiling_info() const override;
std::vector<uint8_t> get_raw_profiling_data() const;

/**
* @brief Determines if batching can be addressed inside the plugin. In the positive case, the batch size used by
* the model will also be deduced and returned.
* @details Batching can be handled by the plugin only if:
* - The batch axis is the first axis.
* - The batch size received by the compiler takes the default value of 1.
* - The batch size found in the IR model matches for all inputs/outputs and takes a value different than the
* default one.
*
* If any of the previous conditions is not fulfilled, the function will return the default batch size, thus no
* custom algorithm will be applied inside the plugin in order to address batching.
*
* @param metadata Metadata containing the shape values as seen by both the compiler and IR model. These will
* ultimately be used for determining the batch size.
* @returns The batch size deduced by the algorithm or the default value of 1 if batching cannot be performed inside
* the plugin.
*/
std::optional<size_t> get_batch_size(const NetworkMetadata& metadata);

/**
* @brief Check the received tensor and set the Level Zero tensor accordingly
* @param tensor Reference to a tensor.
@@ -106,22 +87,6 @@ class ZeroInferRequest final : public SyncInferRequest {
std::shared_ptr<zeroProfiling::NpuInferProfiling> _npuProfiling;
std::unique_ptr<Pipeline> _pipeline;

/**
* @brief Indicates how many command lists will be used inside the pipeline.
* @details Leveraging multiple command lists implies distributing the input/output buffers across the batch axis
* between these lists.
*
* If batching is handled on compiler's side then a single command list shall be used, we don't do any
* specific operation inside the plugin in this case.
*/
size_t _numberOfCommandLists = 1;

/**
* @brief The batch size used by the corresponding model.
* @details The attribute contains a value only if the plugin performs the batches splitting operation.
*/
std::optional<std::size_t> _batchSize = std::nullopt;

bool _pipelineIsCreated = false;
};

18 changes: 15 additions & 3 deletions src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
@@ -28,7 +28,6 @@ struct Pipeline {
const std::shared_ptr<zeroProfiling::NpuInferProfiling>& npu_profiling,
const std::vector<std::vector<std::optional<TensorData>>>& inputTensorsData,
const std::vector<std::optional<TensorData>>& outputTensorsData,
size_t numberOfCommandLists,
uint32_t group_ordinal);

Pipeline(const Pipeline&) = delete;
@@ -43,12 +42,25 @@
void updateCommandList(const TensorData& tensorsData, uint32_t index, size_t commandListIndex);

protected:
std::shared_ptr<IGraph> _graph;
const Config _config;
const uint32_t _id;

/**
* @brief Indicates how many command lists will be used inside the pipeline.
* @details Leveraging multiple command lists implies distributing the input/output buffers across the batch axis
* between these lists.
*
* If batching is handled on compiler's side then a single command list shall be used, we don't do any
* specific operation inside the plugin in this case.
*/
size_t _number_of_command_lists;

std::shared_ptr<CommandQueue> _command_queue;
std::vector<std::unique_ptr<CommandList>> _command_lists;
std::vector<std::unique_ptr<Fence>> _fences;
EventPool _event_pool;
std::vector<std::unique_ptr<Event>> _events;
std::shared_ptr<EventPool> _event_pool;
std::vector<std::shared_ptr<Event>> _events;
bool sync_output_with_fences_ = true;
std::shared_ptr<zeroProfiling::NpuInferProfiling> _npu_profiling;
Logger _logger;
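
The Pipeline constructor body is outside this excerpt; a minimal sketch of how the command-list count is presumably derived, assuming it mirrors the logic removed from ZeroInferRequest (one command list per batch slice when the plugin splits the batch, otherwise a single list):

```cpp
#include <cstddef>
#include <optional>

// Hedged sketch only; the actual initialization of _number_of_command_lists
// is not part of the visible diff.
std::size_t derive_number_of_command_lists(const std::optional<std::size_t>& batch_size) {
    // Plugin-side batching: one command list per batch slice.
    // Compiler-side batching (no value): a single command list.
    return batch_size.has_value() ? *batch_size : 1;
}
```
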
105 changes: 24 additions & 81 deletions src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
@@ -20,8 +20,6 @@ using namespace intel_npu;
namespace {

constexpr std::size_t SINGLE_TENSOR = 0;
constexpr std::size_t BATCH_AXIS = 0;
constexpr std::size_t DEFAULT_BATCH_SIZE = 1;
constexpr bool INPUT = true;
constexpr bool OUTPUT = false;

@@ -96,64 +94,6 @@ bool memory_was_allocated_in_the_same_l0_context(ze_context_handle_t hContext, c

} // namespace

std::optional<size_t> ZeroInferRequest::get_batch_size(const NetworkMetadata& metadata) {
if (!metadata.outputs.at(0).shapeFromIRModel.has_value()) {
_logger.debug("Batching on the plugin is not used, batching is handled by the compiler");
return std::nullopt;
}

const ov::PartialShape& firstOutputShape = *metadata.outputs.at(0).shapeFromIRModel;
if (firstOutputShape.is_dynamic()) {
_logger.warning("Networks using dynamic shapes are not supported when batching is handled by the plugin");
return std::nullopt;
}
if (firstOutputShape.rank().get_length() == 0) {
_logger.warning(
"Networks using rank 0 shapes for inputs/outputs are not supported when batching is handled by the plugin");
return std::nullopt;
}

const size_t candidateBatchSize = firstOutputShape[BATCH_AXIS].get_length();
if (candidateBatchSize == 0 || candidateBatchSize == DEFAULT_BATCH_SIZE) {
_logger.debug("Batching on the plugin is not used, batching is handled by the compiler");
return std::nullopt;
}

auto checkDescriptorsUseCandidateBatchSize = [candidateBatchSize](const std::vector<IODescriptor>& descriptors) {
for (const IODescriptor& descriptor : descriptors) {
OPENVINO_ASSERT(descriptor.shapeFromIRModel.has_value(),
"Missing value for the \"shapeFromIRModel\" attribute, I/O descriptor");

const ov::PartialShape& shapeFromCompiler = descriptor.shapeFromCompiler;
const ov::PartialShape& shapeFromIRModel = *descriptor.shapeFromIRModel;

if (shapeFromCompiler.is_dynamic() || shapeFromCompiler.rank().get_length() == 0 ||
*shapeFromCompiler.begin() != DEFAULT_BATCH_SIZE) {
return false;
}

if (!descriptor.isStateInput && !descriptor.isStateOutput && !descriptor.isShapeTensor) {
if (shapeFromIRModel.is_dynamic() || shapeFromIRModel.rank().get_length() == 0 ||
*shapeFromIRModel.begin() != candidateBatchSize) {
return false;
}
}
}

return true;
};

if (!checkDescriptorsUseCandidateBatchSize(metadata.inputs) ||
!checkDescriptorsUseCandidateBatchSize(metadata.outputs)) {
_logger.debug("Batching on the plugin is not used, batching is handled by the compiler");
return std::nullopt;
}

_logger.debug("Batching is handled by the plugin");

return candidateBatchSize;
}

//------------------------------------------------------------------------------
ZeroInferRequest::ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>& initStructs,
const std::shared_ptr<const ICompiledModel>& compiledModel,
@@ -187,13 +127,6 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>&
_inputAllocator =
std::make_shared<const zeroMemory::HostMemAllocator>(_initStructs, ZE_HOST_MEM_ALLOC_FLAG_BIAS_WRITE_COMBINED);

if (config.get<BATCH_MODE>() != ov::intel_npu::BatchMode::COMPILER) {
_batchSize = get_batch_size(_metadata);
}
if (_batchSize.has_value()) {
_numberOfCommandLists = *_batchSize;
}

_logger.debug("ZeroInferRequest::ZeroInferRequest - checking level zero attributes and allocating tensors");

size_t ioIndex = 0;
@@ -205,7 +138,8 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>&
continue;
}

get_level_zero_input(ioIndex) = allocate_tensor(inputDescriptor, ioIndex, INPUT, *_inputAllocator, _batchSize);
get_level_zero_input(ioIndex) =
allocate_tensor(inputDescriptor, ioIndex, INPUT, *_inputAllocator, _graph->get_batch_size());
get_input_tensor_data(ioIndex) =
TensorData{get_level_zero_input(ioIndex)->data(), get_level_zero_input(ioIndex)->get_byte_size()};

@@ -222,7 +156,7 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>&
}

_levelZeroOutputTensors.at(ioIndex) =
allocate_tensor(outputDescriptor, ioIndex, OUTPUT, *_outputAllocator, _batchSize);
allocate_tensor(outputDescriptor, ioIndex, OUTPUT, *_outputAllocator, _graph->get_batch_size());
_outputTensorsData.at(ioIndex) =
std::optional(TensorData{_levelZeroOutputTensors.at(ioIndex)->data(),
_levelZeroOutputTensors.at(ioIndex)->get_byte_size()});
@@ -236,7 +170,7 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>&
void ZeroInferRequest::create_pipeline() {
for (size_t inputIndex = 0; inputIndex < _metadata.inputs.size(); ++inputIndex) {
if (is_batched_input(inputIndex)) {
if (_batchSize.has_value()) {
if (_graph->get_batch_size().has_value()) {
_logger.debug("ZeroInferRequest::create_pipeline - tensors %s were already allocated",
_metadata.inputs.at(inputIndex).nodeFriendlyName.c_str());
continue;
@@ -250,8 +184,11 @@ void ZeroInferRequest::create_pipeline() {
}

_logger.debug("ZeroInferRequest::create_pipeline - allocate new tensor");
get_level_zero_input(inputIndex) =
allocate_tensor(_metadata.inputs.at(inputIndex), inputIndex, INPUT, *_inputAllocator, _batchSize);
get_level_zero_input(inputIndex) = allocate_tensor(_metadata.inputs.at(inputIndex),
inputIndex,
INPUT,
*_inputAllocator,
_graph->get_batch_size());
get_input_tensor_data(inputIndex) = std::optional(
TensorData{get_level_zero_input(inputIndex)->data(), get_level_zero_input(inputIndex)->get_byte_size()});
}
@@ -263,17 +200,20 @@ void ZeroInferRequest::create_pipeline() {
continue;
}
_logger.debug("ZeroInferRequest::create_pipeline - allocate new tensor");
_levelZeroOutputTensors.at(outputIndex) =
allocate_tensor(_metadata.outputs.at(outputIndex), outputIndex, OUTPUT, *_outputAllocator, _batchSize);
_levelZeroOutputTensors.at(outputIndex) = allocate_tensor(_metadata.outputs.at(outputIndex),
outputIndex,
OUTPUT,
*_outputAllocator,
_graph->get_batch_size());
_outputTensorsData.at(outputIndex) =
std::optional(TensorData{_levelZeroOutputTensors.at(outputIndex)->data(),
_levelZeroOutputTensors.at(outputIndex)->get_byte_size()});
}

// Find the corresponding command queue group.
_logger.debug("ZeroDevice::ZeroDevice - findGroupOrdinal");
_logger.debug("ZeroInferRequest::create_pipeline - findGroupOrdinal");
auto groupOrdinal = zeroUtils::findGroupOrdinal(_initStructs->getDevice(), _properties);
_logger.debug("ZeroDevice::ZeroDevice - init completed");
_logger.debug("ZeroInferRequest::create_pipeline - init completed");

_logger.debug("ZeroInferRequest::create_pipeline - constructing pipeline");

@@ -286,7 +226,6 @@ void ZeroInferRequest::create_pipeline() {
_npuProfiling,
_inputTensorsData,
_outputTensorsData,
_numberOfCommandLists,
groupOrdinal);

_logger.debug("ZeroInferRequest::create_pipeline - SyncInferRequest completed");
@@ -321,7 +260,7 @@ void ZeroInferRequest::set_tensor_data(const std::shared_ptr<ov::ITensor>& tenso
index,
isInput,
isInput ? *_inputAllocator : *_outputAllocator,
_batchSize);
_graph->get_batch_size());

setTensorData = true;
levelZeroTensorCreatedLocally = true;
@@ -444,7 +383,7 @@ void ZeroInferRequest::set_tensors(const ov::Output<const ov::Node>& port,
get_user_inputs(foundPort.idx) = tensors;

if (_initStructs->getMutableCommandListVersion()) {
if (_batchSize.has_value()) {
if (_graph->get_batch_size().has_value()) {
for (size_t i = 0; i < tensors.size(); i++) {
auto remoteTensor = std::dynamic_pointer_cast<ZeroRemoteTensor>(tensors[i]._ptr);

@@ -525,13 +464,17 @@ ov::SoPtr<ov::ITensor> ZeroInferRequest::get_tensor(const ov::Output<const ov::N
ioIndex,
isInput,
isInput ? *_inputAllocator : *_outputAllocator,
_batchSize);
_graph->get_batch_size());
tensorsData = std::optional(TensorData{levelZeroTensors->data(), levelZeroTensors->get_byte_size()});

return levelZeroTensors;
}

void ZeroInferRequest::infer() {
if (_config.get<RUN_INFERENCES_SEQUENTIALLY>()) {
OPENVINO_THROW("Only start async is supported when RUN_INFERENCES_SEQUENTIALLY is enabled!");
}

infer_async();
get_result();
}
@@ -567,7 +510,7 @@ void ZeroInferRequest::infer_async() {
}

if (is_batched_input(inputIndex)) {
if (_batchSize.has_value()) {
if (_graph->get_batch_size().has_value()) {
for (size_t i = 0; i < userTensor.size(); i++) {
auto levelZeroBatchRemoteTensor =
std::dynamic_pointer_cast<ZeroRemoteTensor>(get_level_zero_input(inputIndex, i));
