[NPU] In-order async execution (#27696)
### Details:
- To activate it, the NPU_RUN_INFERENCES_SEQUENTIALLY property must be
set to true when compiling the model. The feature only orders the
inferences created from that single compiled model (see the usage
sketch after this list).
- Scheduling inferences from different compiled models is not
supported.
- Inferences must be submitted each time in the same order in which
they were first run.
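
As a usage sketch of the points above (not code from this PR; the model path and request count are placeholders, and error handling is omitted), the property is passed as a compile-time option and every inference is then submitted through the asynchronous API in a fixed order:

```cpp
#include <openvino/openvino.hpp>
#include <vector>

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");  // placeholder model path

    // Enable in-order execution at compile time (string key added by this PR).
    auto compiled = core.compile_model(model, "NPU",
                                       {{"NPU_RUN_INFERENCES_SEQUENTIALLY", true}});

    // All requests must come from this single compiled model; mixing requests
    // from different compiled models is not supported in this mode.
    std::vector<ov::InferRequest> requests;
    for (int i = 0; i < 4; ++i) {
        requests.push_back(compiled.create_infer_request());
    }

    // Only the asynchronous path is supported (the synchronous infer() throws
    // when the property is enabled). Input tensors would normally be filled
    // here. The first run establishes the submission order ...
    for (auto& req : requests) {
        req.start_async();
    }
    for (auto& req : requests) {
        req.wait();
    }

    // ... and every later run must reuse exactly the same order.
    for (auto& req : requests) {
        req.start_async();
    }
    for (auto& req : requests) {
        req.wait();
    }
    return 0;
}
```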

---------

Signed-off-by: Bogdan Pereanu <[email protected]>
Co-authored-by: Dmitry Matveev <[email protected]>
pereanub and dmatveev authored Nov 27, 2024
1 parent a1920c4 commit 8aff79e
Showing 20 changed files with 655 additions and 214 deletions.
18 changes: 18 additions & 0 deletions src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp
@@ -270,4 +270,22 @@ struct BYPASS_UMD_CACHING final : OptionBase<BYPASS_UMD_CACHING, bool> {
return OptionMode::RunTime;
}
};

//
// RUN_INFERENCES_SEQUENTIALLY
//
struct RUN_INFERENCES_SEQUENTIALLY final : OptionBase<RUN_INFERENCES_SEQUENTIALLY, bool> {
static std::string_view key() {
return ov::intel_npu::run_inferences_sequentially.name();
}

static bool defaultValue() {
return false;
}

static OptionMode mode() {
return OptionMode::RunTime;
}
};

} // namespace intel_npu
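
The new struct above follows the plugin's option pattern: each option exposes a string key (matching the public property name), a typed default, and a mode, and is later read back through a Config object, as in the `_config.get<RUN_INFERENCES_SEQUENTIALLY>()` call further down in this diff. The snippet below is a simplified, self-contained mock of that pattern for illustration only; the real intel_npu OptionBase, OptionsDesc, and Config types are richer than this.

```cpp
// Simplified mock of the option pattern -- illustrative only, not the
// intel_npu implementation.
#include <cassert>
#include <map>
#include <string>
#include <string_view>

// An option descriptor: a shared string key plus a typed default value.
template <typename Derived, typename T>
struct OptionBaseMock {
    using ValueType = T;
};

struct RUN_INFERENCES_SEQUENTIALLY_MOCK final
    : OptionBaseMock<RUN_INFERENCES_SEQUENTIALLY_MOCK, bool> {
    static std::string_view key() { return "NPU_RUN_INFERENCES_SEQUENTIALLY"; }
    static bool defaultValue() { return false; }
};

// A toy config: raw string values keyed by option name, parsed on access,
// falling back to the option's default when the key is absent.
class ConfigMock {
public:
    void set(std::string_view key, std::string value) {
        _raw[std::string(key)] = std::move(value);
    }

    template <typename Option>
    typename Option::ValueType get() const {
        const auto it = _raw.find(std::string(Option::key()));
        if (it == _raw.end()) {
            return Option::defaultValue();
        }
        return it->second == "YES" || it->second == "true";  // bool-only toy parser
    }

private:
    std::map<std::string, std::string> _raw;
};

int main() {
    ConfigMock config;
    assert(!config.get<RUN_INFERENCES_SEQUENTIALLY_MOCK>());  // default: false

    config.set(RUN_INFERENCES_SEQUENTIALLY_MOCK::key(), "YES");
    assert(config.get<RUN_INFERENCES_SEQUENTIALLY_MOCK>());   // now enabled
    return 0;
}
```

In the real code, the registration in `registerRunTimeOptions` (see the runtime.cpp change below) is what makes the option discoverable by its key at runtime.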
@@ -327,5 +327,14 @@ static constexpr ov::Property<std::string, ov::PropertyMutability::RO> backend_n
*/
static constexpr ov::Property<std::string> backend_compilation_params{"NPU_BACKEND_COMPILATION_PARAMS"};

/**
* @brief [Only for NPU Plugin]
* Type: boolean, default is false.
* This option allows running inferences sequentially, in the order in which they were created.
* @note Experimental property; for now it only works in very specific scenarios. We need driver updates before we can
* implement a robust solution for in-order execution.
*/
static constexpr ov::Property<bool> run_inferences_sequentially{"NPU_RUN_INFERENCES_SEQUENTIALLY"};

} // namespace intel_npu
} // namespace ov
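
Because the property is declared in namespace `ov::intel_npu`, applications can also pass it in typed form rather than as a raw string key. Which public header (if any) re-exports this experimental declaration is not shown in this diff, so the include below is an assumption:

```cpp
#include <memory>
#include <openvino/openvino.hpp>
#include <openvino/runtime/intel_npu/properties.hpp>  // assumed location of the declaration

ov::CompiledModel compile_in_order(ov::Core& core, const std::shared_ptr<ov::Model>& model) {
    // Typed equivalent of passing {"NPU_RUN_INFERENCES_SEQUENTIALLY", true} as a string pair.
    return core.compile_model(model, "NPU", ov::intel_npu::run_inferences_sequentially(true));
}
```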
1 change: 1 addition & 0 deletions src/plugins/intel_npu/src/al/src/config/runtime.cpp
@@ -27,6 +27,7 @@ void intel_npu::registerRunTimeOptions(OptionsDesc& desc) {
desc.add<WORKLOAD_TYPE>();
desc.add<TURBO>();
desc.add<BYPASS_UMD_CACHING>();
desc.add<RUN_INFERENCES_SEQUENTIALLY>();
}

// Heuristically obtained number. Varies depending on the values of PLATFORM and PERFORMANCE_HINT
35 changes: 0 additions & 35 deletions src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp
@@ -38,25 +38,6 @@ class ZeroInferRequest final : public SyncInferRequest {
std::vector<ov::ProfilingInfo> get_profiling_info() const override;
std::vector<uint8_t> get_raw_profiling_data() const;

/**
* @brief Determines if batching can be addressed inside the plugin. In the positive case, the batch size used by
* the model will also be deduced and returned.
* @details Batching can be handled by the plugin only if:
* - The batch axis is the first axis.
* - The batch size received by the compiler takes the default value of 1.
* - The batch size found in the IR model matches for all inputs/outputs and takes a value different than the
* default one.
*
* If any of the previous conditions is not fulfilled, the function will return the default batch size, thus no
* custom algorithm will be applied inside the plugin in order to address batching.
*
* @param metadata Metadata containing the shape values as seen by both the compiler and IR model. These will
* ultimately be used for determining the batch size.
* @returns The batch size deduced by the algorithm or the default value of 1 if batching cannot be performed inside
* the plugin.
*/
std::optional<size_t> get_batch_size(const NetworkMetadata& metadata);

/**
* @brief Check the received tensor and set the Level Zero tensor accordingly
* @param tensor Reference to a tensor.
@@ -106,22 +87,6 @@ class ZeroInferRequest final : public SyncInferRequest {
std::shared_ptr<zeroProfiling::NpuInferProfiling> _npuProfiling;
std::unique_ptr<Pipeline> _pipeline;

/**
* @brief Indicates how many command lists will be used inside the pipeline.
* @details Leveraging multiple command lists implies distributing the input/output buffers across the batch axis
* between these lists.
*
* If batching is handled on the compiler's side, then a single command list shall be used; we don't do any
* specific operation inside the plugin in this case.
*/
size_t _numberOfCommandLists = 1;

/**
* @brief The batch size used by the corresponding model.
* @details The attribute contains a value only if the plugin performs the batches splitting operation.
*/
std::optional<std::size_t> _batchSize = std::nullopt;

bool _pipelineIsCreated = false;
};

18 changes: 15 additions & 3 deletions src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
@@ -28,7 +28,6 @@ struct Pipeline {
const std::shared_ptr<zeroProfiling::NpuInferProfiling>& npu_profiling,
const std::vector<std::vector<std::optional<TensorData>>>& inputTensorsData,
const std::vector<std::optional<TensorData>>& outputTensorsData,
size_t numberOfCommandLists,
uint32_t group_ordinal);

Pipeline(const Pipeline&) = delete;
@@ -43,12 +42,25 @@
void updateCommandList(const TensorData& tensorsData, uint32_t index, size_t commandListIndex);

protected:
std::shared_ptr<IGraph> _graph;
const Config _config;
const uint32_t _id;

/**
* @brief Indicates how many command lists will be used inside the pipeline.
* @details Leveraging multiple command lists implies distributing the input/output buffers across the batch axis
* between these lists.
*
* If batching is handled on the compiler's side, then a single command list shall be used; we don't do any
* specific operation inside the plugin in this case.
*/
size_t _number_of_command_lists;

std::shared_ptr<CommandQueue> _command_queue;
std::vector<std::unique_ptr<CommandList>> _command_lists;
std::vector<std::unique_ptr<Fence>> _fences;
EventPool _event_pool;
std::vector<std::unique_ptr<Event>> _events;
std::shared_ptr<EventPool> _event_pool;
std::vector<std::shared_ptr<Event>> _events;
bool sync_output_with_fences_ = true;
std::shared_ptr<zeroProfiling::NpuInferProfiling> _npu_profiling;
Logger _logger;
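
The Pipeline changes above (a per-pipeline `_id`, the command-list count moving into the pipeline, and the event pool and events becoming shared) are the plumbing that in-order execution builds on; the exact mechanism is not visible in this excerpt. As a rough illustration only, the generic Level Zero pattern for forcing one command list to run after another on the same queue looks like this (handles are assumed to be already created, the lists still open, and error handling is omitted):

```cpp
// Generic Level Zero ordering sketch -- not necessarily how the NPU plugin
// implements this feature.
#include <cstdint>
#include <ze_api.h>

// Record an ordering dependency: work appended to `second` after this call
// will not start on the device until `first` has signalled `done`.
void chain_in_order(ze_command_list_handle_t first,
                    ze_command_list_handle_t second,
                    ze_event_handle_t done) {
    // ... the first inference's work was appended to `first` by the caller ...
    zeCommandListAppendSignalEvent(first, done);        // signal when `first` completes
    zeCommandListAppendWaitOnEvents(second, 1, &done);  // gate `second` on that signal
    // ... the second inference's work is appended to `second` after this ...
}

// Submit both lists to the same queue and block the host until the chain is done.
void submit_chain(ze_command_queue_handle_t queue,
                  ze_command_list_handle_t first,
                  ze_command_list_handle_t second,
                  ze_fence_handle_t fence) {
    zeCommandListClose(first);
    zeCommandListClose(second);
    zeCommandQueueExecuteCommandLists(queue, 1, &first, nullptr);
    zeCommandQueueExecuteCommandLists(queue, 1, &second, fence);
    zeFenceHostSynchronize(fence, UINT64_MAX);
}
```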
105 changes: 24 additions & 81 deletions src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
@@ -20,8 +20,6 @@ using namespace intel_npu;
namespace {

constexpr std::size_t SINGLE_TENSOR = 0;
constexpr std::size_t BATCH_AXIS = 0;
constexpr std::size_t DEFAULT_BATCH_SIZE = 1;
constexpr bool INPUT = true;
constexpr bool OUTPUT = false;

@@ -96,64 +94,6 @@ bool memory_was_allocated_in_the_same_l0_context(ze_context_handle_t hContext, c

} // namespace

std::optional<size_t> ZeroInferRequest::get_batch_size(const NetworkMetadata& metadata) {
if (!metadata.outputs.at(0).shapeFromIRModel.has_value()) {
_logger.debug("Batching on the plugin is not used, batching is handled by the compiler");
return std::nullopt;
}

const ov::PartialShape& firstOutputShape = *metadata.outputs.at(0).shapeFromIRModel;
if (firstOutputShape.is_dynamic()) {
_logger.warning("Networks using dynamic shapes are not supported when batching is handled by the plugin");
return std::nullopt;
}
if (firstOutputShape.rank().get_length() == 0) {
_logger.warning(
"Networks using rank 0 shapes for inputs/outputs are not supported when batching is handled by the plugin");
return std::nullopt;
}

const size_t candidateBatchSize = firstOutputShape[BATCH_AXIS].get_length();
if (candidateBatchSize == 0 || candidateBatchSize == DEFAULT_BATCH_SIZE) {
_logger.debug("Batching on the plugin is not used, batching is handled by the compiler");
return std::nullopt;
}

auto checkDescriptorsUseCandidateBatchSize = [candidateBatchSize](const std::vector<IODescriptor>& descriptors) {
for (const IODescriptor& descriptor : descriptors) {
OPENVINO_ASSERT(descriptor.shapeFromIRModel.has_value(),
"Missing value for the \"shapeFromIRModel\" attribute, I/O descriptor");

const ov::PartialShape& shapeFromCompiler = descriptor.shapeFromCompiler;
const ov::PartialShape& shapeFromIRModel = *descriptor.shapeFromIRModel;

if (shapeFromCompiler.is_dynamic() || shapeFromCompiler.rank().get_length() == 0 ||
*shapeFromCompiler.begin() != DEFAULT_BATCH_SIZE) {
return false;
}

if (!descriptor.isStateInput && !descriptor.isStateOutput && !descriptor.isShapeTensor) {
if (shapeFromIRModel.is_dynamic() || shapeFromIRModel.rank().get_length() == 0 ||
*shapeFromIRModel.begin() != candidateBatchSize) {
return false;
}
}
}

return true;
};

if (!checkDescriptorsUseCandidateBatchSize(metadata.inputs) ||
!checkDescriptorsUseCandidateBatchSize(metadata.outputs)) {
_logger.debug("Batching on the plugin is not used, batching is handled by the compiler");
return std::nullopt;
}

_logger.debug("Batching is handled by the plugin");

return candidateBatchSize;
}

//------------------------------------------------------------------------------
ZeroInferRequest::ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>& initStructs,
const std::shared_ptr<const ICompiledModel>& compiledModel,
@@ -187,13 +127,6 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>&
_inputAllocator =
std::make_shared<const zeroMemory::HostMemAllocator>(_initStructs, ZE_HOST_MEM_ALLOC_FLAG_BIAS_WRITE_COMBINED);

if (config.get<BATCH_MODE>() != ov::intel_npu::BatchMode::COMPILER) {
_batchSize = get_batch_size(_metadata);
}
if (_batchSize.has_value()) {
_numberOfCommandLists = *_batchSize;
}

_logger.debug("ZeroInferRequest::ZeroInferRequest - checking level zero attributes and allocating tensors");

size_t ioIndex = 0;
@@ -205,7 +138,8 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>&
continue;
}

get_level_zero_input(ioIndex) = allocate_tensor(inputDescriptor, ioIndex, INPUT, *_inputAllocator, _batchSize);
get_level_zero_input(ioIndex) =
allocate_tensor(inputDescriptor, ioIndex, INPUT, *_inputAllocator, _graph->get_batch_size());
get_input_tensor_data(ioIndex) =
TensorData{get_level_zero_input(ioIndex)->data(), get_level_zero_input(ioIndex)->get_byte_size()};

@@ -222,7 +156,7 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>&
}

_levelZeroOutputTensors.at(ioIndex) =
allocate_tensor(outputDescriptor, ioIndex, OUTPUT, *_outputAllocator, _batchSize);
allocate_tensor(outputDescriptor, ioIndex, OUTPUT, *_outputAllocator, _graph->get_batch_size());
_outputTensorsData.at(ioIndex) =
std::optional(TensorData{_levelZeroOutputTensors.at(ioIndex)->data(),
_levelZeroOutputTensors.at(ioIndex)->get_byte_size()});
@@ -236,7 +170,7 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>&
void ZeroInferRequest::create_pipeline() {
for (size_t inputIndex = 0; inputIndex < _metadata.inputs.size(); ++inputIndex) {
if (is_batched_input(inputIndex)) {
if (_batchSize.has_value()) {
if (_graph->get_batch_size().has_value()) {
_logger.debug("ZeroInferRequest::create_pipeline - tensors %s were already allocated",
_metadata.inputs.at(inputIndex).nodeFriendlyName.c_str());
continue;
@@ -250,8 +184,11 @@ void ZeroInferRequest::create_pipeline() {
}

_logger.debug("ZeroInferRequest::create_pipeline - allocate new tensor");
get_level_zero_input(inputIndex) =
allocate_tensor(_metadata.inputs.at(inputIndex), inputIndex, INPUT, *_inputAllocator, _batchSize);
get_level_zero_input(inputIndex) = allocate_tensor(_metadata.inputs.at(inputIndex),
inputIndex,
INPUT,
*_inputAllocator,
_graph->get_batch_size());
get_input_tensor_data(inputIndex) = std::optional(
TensorData{get_level_zero_input(inputIndex)->data(), get_level_zero_input(inputIndex)->get_byte_size()});
}
@@ -263,17 +200,20 @@ void ZeroInferRequest::create_pipeline() {
continue;
}
_logger.debug("ZeroInferRequest::create_pipeline - allocate new tensor");
_levelZeroOutputTensors.at(outputIndex) =
allocate_tensor(_metadata.outputs.at(outputIndex), outputIndex, OUTPUT, *_outputAllocator, _batchSize);
_levelZeroOutputTensors.at(outputIndex) = allocate_tensor(_metadata.outputs.at(outputIndex),
outputIndex,
OUTPUT,
*_outputAllocator,
_graph->get_batch_size());
_outputTensorsData.at(outputIndex) =
std::optional(TensorData{_levelZeroOutputTensors.at(outputIndex)->data(),
_levelZeroOutputTensors.at(outputIndex)->get_byte_size()});
}

// Find the corresponding command queue group.
_logger.debug("ZeroDevice::ZeroDevice - findGroupOrdinal");
_logger.debug("ZeroInferRequest::create_pipeline - findGroupOrdinal");
auto groupOrdinal = zeroUtils::findGroupOrdinal(_initStructs->getDevice(), _properties);
_logger.debug("ZeroDevice::ZeroDevice - init completed");
_logger.debug("ZeroInferRequest::create_pipeline - init completed");

_logger.debug("ZeroInferRequest::create_pipeline - constructing pipeline");

@@ -286,7 +226,6 @@ void ZeroInferRequest::create_pipeline() {
_npuProfiling,
_inputTensorsData,
_outputTensorsData,
_numberOfCommandLists,
groupOrdinal);

_logger.debug("ZeroInferRequest::create_pipeline - SyncInferRequest completed");
@@ -321,7 +260,7 @@ void ZeroInferRequest::set_tensor_data(const std::shared_ptr<ov::ITensor>& tenso
index,
isInput,
isInput ? *_inputAllocator : *_outputAllocator,
_batchSize);
_graph->get_batch_size());

setTensorData = true;
levelZeroTensorCreatedLocally = true;
@@ -444,7 +383,7 @@ void ZeroInferRequest::set_tensors(const ov::Output<const ov::Node>& port,
get_user_inputs(foundPort.idx) = tensors;

if (_initStructs->getMutableCommandListVersion()) {
if (_batchSize.has_value()) {
if (_graph->get_batch_size().has_value()) {
for (size_t i = 0; i < tensors.size(); i++) {
auto remoteTensor = std::dynamic_pointer_cast<ZeroRemoteTensor>(tensors[i]._ptr);

@@ -525,13 +464,17 @@ ov::SoPtr<ov::ITensor> ZeroInferRequest::get_tensor(const ov::Output<const ov::N
ioIndex,
isInput,
isInput ? *_inputAllocator : *_outputAllocator,
_batchSize);
_graph->get_batch_size());
tensorsData = std::optional(TensorData{levelZeroTensors->data(), levelZeroTensors->get_byte_size()});

return levelZeroTensors;
}

void ZeroInferRequest::infer() {
if (_config.get<RUN_INFERENCES_SEQUENTIALLY>()) {
OPENVINO_THROW("Only start async is supported when RUN_INFERENCES_SEQUENTIALLY is enabled!");
}

infer_async();
get_result();
}
@@ -567,7 +510,7 @@ void ZeroInferRequest::infer_async() {
}

if (is_batched_input(inputIndex)) {
if (_batchSize.has_value()) {
if (_graph->get_batch_size().has_value()) {
for (size_t i = 0; i < userTensor.size(); i++) {
auto levelZeroBatchRemoteTensor =
std::dynamic_pointer_cast<ZeroRemoteTensor>(get_level_zero_input(inputIndex, i));
Expand Down
