diff --git a/docs/articles_en/about-openvino/performance-benchmarks.rst b/docs/articles_en/about-openvino/performance-benchmarks.rst
index 8a58dc27df1f83..78a364c18ca4e6 100644
--- a/docs/articles_en/about-openvino/performance-benchmarks.rst
+++ b/docs/articles_en/about-openvino/performance-benchmarks.rst
@@ -64,7 +64,7 @@ implemented in your solutions. Click the buttons below to see the chosen benchma
:outline:
:expand:
- :material-regular:`bar_chart;1.4em` OVMS for GenAI (coming soon)
+ :material-regular:`bar_chart;1.4em` OVMS for GenAI
diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst
index ebd4667d544616..f18b66915fc3ce 100644
--- a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst
+++ b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst
@@ -218,6 +218,114 @@ Specify generation_config to use grouped beam search:
cout << pipe.generate("The Sun is yellow because", config);
}
+Efficient Text Generation via Speculative Decoding
+##################################################
+
+Speculative decoding (also known as assisted generation) enables faster token generation
+when a smaller draft model is used alongside the main model.
+The draft model predicts the next K tokens one by one in an autoregressive manner,
+while the main model validates these predictions and corrects them if necessary.
+
+The main model compares each of the draft model's predicted tokens with its own prediction.
+At the first mismatch, the token predicted by the main model is kept. The draft model
+then continues from this corrected token and attempts to predict the next K tokens,
+repeating the cycle.
+
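+The cycle can be illustrated with a short, framework-agnostic sketch of the greedy
+variant. ``draft_next_token`` and ``main_next_tokens`` below are hypothetical
+placeholders for the two models and are not part of the OpenVINO GenAI API:
+
+.. code-block:: python
+
+   # draft_next_token and main_next_tokens are placeholder callables standing in
+   # for the draft and main models; context and tokens are plain lists of token ids.
+   def speculative_step(context, k, draft_next_token, main_next_tokens):
+       # 1. The draft model proposes K tokens autoregressively.
+       draft = []
+       for _ in range(k):
+           draft.append(draft_next_token(context + draft))
+
+       # 2. The main model validates the whole proposed block in a single pass,
+       #    returning its own greedy token for each of the K positions.
+       main = main_next_tokens(context, draft)
+
+       # 3. Accept draft tokens up to the first mismatch; at the mismatch, the
+       #    main model's token is kept and the next cycle restarts from there.
+       accepted = []
+       for d, m in zip(draft, main):
+           accepted.append(m)
+           if d != m:
+               break
+       return context + accepted
+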
+This method reduces the number of infer requests to the main model,
+which results in increased performance. Its implementation in the pipeline is
+shown in the code samples below:
+
+.. tab-set::
+
+ .. tab-item:: Python
+ :sync: py
+
+ .. code-block:: python
+
+ import openvino_genai
+ import queue
+ import threading
+
+ def streamer(subword):
+ print(subword, end='', flush=True)
+ return False
+
+ def infer(model_dir: str, draft_model_dir: str, prompt: str):
+ main_device = 'CPU' # GPU can be used as well.
+ draft_device = 'CPU'
+
+ scheduler_config = openvino_genai.SchedulerConfig()
+ scheduler_config.cache_size = 2
+
+ draft_model = openvino_genai.draft_model(draft_model_dir, draft_device)
+
+ pipe = openvino_genai.LLMPipeline(model_dir, main_device, scheduler_config=scheduler_config, draft_model=draft_model)
+
+ config = openvino_genai.GenerationConfig()
+ config.max_new_tokens = 100
+ config.num_assistant_tokens = 5
+
+ pipe.generate(prompt, config, streamer)
+
+
+ For more information, refer to the
+ `Python sample `__.
+
+
+ .. tab-item:: C++
+ :sync: cpp
+
+ .. code-block:: cpp
+
+ #include <iostream>
+ #include <string>
+
+ #include "openvino/genai/llm_pipeline.hpp"
+
+ int main(int argc, char* argv[]) try {
+ if (4 != argc) {
+ throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> <DRAFT_MODEL_DIR> '<PROMPT>'");
+ }
+
+ ov::genai::GenerationConfig config;
+ config.max_new_tokens = 100;
+ config.num_assistant_tokens = 5;
+
+ std::string main_model_path = argv[1];
+ std::string draft_model_path = argv[2];
+ std::string prompt = argv[3];
+
+ std::string main_device = "CPU", draft_device = "CPU";
+
+ ov::genai::SchedulerConfig scheduler_config;
+ scheduler_config.cache_size = 5;
+
+ ov::genai::LLMPipeline pipe(
+ main_model_path,
+ main_device,
+ ov::genai::draft_model(draft_model_path, draft_device),
+ ov::genai::scheduler_config(scheduler_config));
+
+ auto streamer = [](std::string subword) {
+ std::cout << subword << std::flush;
+ return false;
+ };
+
+ pipe.generate(prompt, config, streamer);
+ } catch (const std::exception& error) {
+ try {
+ std::cerr << error.what() << '\n';
+ } catch (const std::ios_base::failure&) {}
+ return EXIT_FAILURE;
+ } catch (...) {
+ try {
+ std::cerr << "Non-exception object thrown\n";
+ } catch (const std::ios_base::failure&) {}
+ return EXIT_FAILURE;
+ }
+
+
+ For more information, refer to the
+ `C++ sample `__
Comparing with Hugging Face Results
#######################################
diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst
index 6033bd8ed96106..245a2648aab491 100644
--- a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst
+++ b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst
@@ -118,7 +118,7 @@ sample shows basic usage of the ``Text2ImagePipeline`` pipeline.
image_write("baseline.bmp", image)
For more information, refer to the
- `Python sample `__
+ `Python sample `__
.. tab-item:: C++
:sync: cpp
@@ -218,7 +218,7 @@ sample shows basic usage of the ``Text2ImagePipeline`` pipeline.
For more information, refer to the
- `C++ sample `__
+ `C++ sample `__
@@ -269,7 +269,7 @@ and use audio files in WAV format at a sampling rate of 16 kHz as input.
For more information, refer to the
- `Python sample `__.
+ `Python sample `__.
.. tab-item:: C++
:sync: cpp
@@ -322,7 +322,7 @@ and use audio files in WAV format at a sampling rate of 16 kHz as input.
For more information, refer to the
- `C++ sample `__.
+ `C++ sample `__.
Using GenAI in Chat Scenario
@@ -367,7 +367,7 @@ mark a conversation session, as shown in the samples below:
For more information, refer to the
- `Python sample `__.
+ `Python sample `__.
.. tab-item:: C++
:sync: cpp
@@ -415,7 +415,142 @@ mark a conversation session, as shown in the samples below:
For more information, refer to the
- `C++ sample `__
+ `C++ sample `__
+
+
+Using GenAI with Vision Language Models
+#######################################
+
+OpenVINO GenAI introduces the ``openvino_genai.VLMPipeline`` pipeline for
+inference of multimodal, text-generating Vision Language Models (VLMs).
+With a text prompt and an image as input, VLMPipeline can generate text using
+models such as LLaVA or MiniCPM-V. A minimal single-prompt sketch is shown below,
+followed by the chat scenario samples.
+
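+The sketch below assumes that ``model_dir`` points to an exported VLM and that
+``image`` is an ``openvino.Tensor`` holding ``uint8`` image data in NHWC layout,
+for example produced by the ``read_image()`` helper from the Python sample that follows:
+
+.. code-block:: python
+
+   import openvino_genai
+
+   # model_dir and image are assumed to be prepared as described above.
+   pipe = openvino_genai.VLMPipeline(model_dir, "CPU")
+
+   config = openvino_genai.GenerationConfig()
+   config.max_new_tokens = 100
+
+   result = pipe.generate("Describe this image.", images=[image], generation_config=config)
+   print(result)
+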
+.. tab-set::
+
+ .. tab-item:: Python
+ :sync: py
+
+ .. code-block:: python
+
+ import numpy as np
+ import openvino_genai
+ from PIL import Image
+ from openvino import Tensor
+ from pathlib import Path
+
+
+ def streamer(subword: str) -> bool:
+ print(subword, end='', flush=True)
+ return False
+
+
+ def read_image(path: str) -> Tensor:
+ pic = Image.open(path).convert("RGB")
+ image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.uint8)
+ return Tensor(image_data)
+
+
+ def read_images(path: str) -> list[Tensor]:
+ entry = Path(path)
+ if entry.is_dir():
+ return [read_image(str(file)) for file in sorted(entry.iterdir())]
+ return [read_image(path)]
+
+
+ def infer(model_dir: str, image_dir: str):
+ rgbs = read_images(image_dir)
+ device = 'CPU' # GPU can be used as well.
+ enable_compile_cache = dict()
+ if "GPU" == device:
+ enable_compile_cache["CACHE_DIR"] = "vlm_cache"
+ pipe = openvino_genai.VLMPipeline(model_dir, device, **enable_compile_cache)
+
+ config = openvino_genai.GenerationConfig()
+ config.max_new_tokens = 100
+
+ pipe.start_chat()
+ prompt = input('question:\n')
+ pipe.generate(prompt, images=rgbs, generation_config=config, streamer=streamer)
+
+ while True:
+ try:
+ prompt = input("\n----------\n"
+ "question:\n")
+ except EOFError:
+ break
+ pipe.generate(prompt, generation_config=config, streamer=streamer)
+ pipe.finish_chat()
+
+
+ For more information, refer to the
+ `Python sample `__.
+
+ .. tab-item:: C++
+ :sync: cpp
+
+ .. code-block:: cpp
+
+ #include "load_image.hpp"
+ #include <openvino/genai/visual_language/pipeline.hpp>
+ #include <iostream>
+
+ bool print_subword(std::string&& subword) {
+ return !(std::cout << subword << std::flush);
+ }
+
+ int main(int argc, char* argv[]) try {
+ if (3 != argc) {
+ throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> <IMAGE_FILE_OR_DIR>");
+ }
+
+ std::vector<ov::Tensor> rgbs = utils::load_images(argv[2]);
+
+ std::string device = "CPU"; // GPU can be used as well.
+ ov::AnyMap enable_compile_cache;
+ if ("GPU" == device) {
+ enable_compile_cache.insert({ov::cache_dir("vlm_cache")});
+ }
+ ov::genai::VLMPipeline pipe(argv[1], device, enable_compile_cache);
+
+ ov::genai::GenerationConfig generation_config;
+ generation_config.max_new_tokens = 100;
+
+ std::string prompt;
+
+ pipe.start_chat();
+ std::cout << "question:\n";
+
+ std::getline(std::cin, prompt);
+ pipe.generate(prompt,
+ ov::genai::images(rgbs),
+ ov::genai::generation_config(generation_config),
+ ov::genai::streamer(print_subword));
+ std::cout << "\n----------\n"
+ "question:\n";
+ while (std::getline(std::cin, prompt)) {
+ pipe.generate(prompt,
+ ov::genai::generation_config(generation_config),
+ ov::genai::streamer(print_subword));
+ std::cout << "\n----------\n"
+ "question:\n";
+ }
+ pipe.finish_chat();
+ } catch (const std::exception& error) {
+ try {
+ std::cerr << error.what() << '\n';
+ } catch (const std::ios_base::failure&) {}
+ return EXIT_FAILURE;
+ } catch (...) {
+ try {
+ std::cerr << "Non-exception object thrown\n";
+ } catch (const std::ios_base::failure&) {}
+ return EXIT_FAILURE;
+ }
+
+
+ For more information, refer to the
+ `C++ sample `__
Additional Resources
#####################
@@ -423,4 +558,6 @@ Additional Resources
* :doc:`Install OpenVINO GenAI <../../../get-started/install-openvino/install-openvino-genai>`
* `OpenVINO GenAI Repo `__
* `OpenVINO GenAI Samples `__
+* A Jupyter notebook demonstrating
+ `Visual-language assistant with MiniCPM-V2 and OpenVINO `__
* `OpenVINO Tokenizers `__
diff --git a/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ovms-genai.json b/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ovms-genai.json
index f96fb11e6b029d..0d53c3813542d2 100644
--- a/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ovms-genai.json
+++ b/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ovms-genai.json
@@ -1,45 +1,330 @@
[
+ {
+ "Platform": "Intel® Xeon® Platinum 8380",
+ "Model": "meta-llama/Llama-2-7b-chat-hf",
+ "featured_SKU": false,
+ "whats_new_model": false,
+ "PlatformType": "Server Platforms (Intel® Xeon®)",
+ "Parameters": {
+ "OpenVINO Model Server": {
+ "Precisions": [
+ {
+ "Throughput": {
+ "0.2": 94.97,
+ "0.4": 187.12,
+ "0.6": 271.85,
+ "0.8": 290.81,
+ "1.0": 291.39,
+ "2.0": 291.45,
+ "inf": 291.59
+ },
+ "Latency": {
+ "0.2": 74.35,
+ "0.4": 122.25,
+ "0.6": 467.49,
+ "0.8": 749.39,
+ "1.0": 771.39,
+ "2.0": 773.31,
+ "inf": 783.63
+ }
+ }
+ ]
+ },
+ "vLLM with OpenVINO backend": {
+ "Precisions": [
+ {
+ "Throughput": {
+ "0.2": 94.83,
+ "0.4": 187.83,
+ "0.6": 272.32,
+ "0.8": 284.07,
+ "1.0": 291.88,
+ "2.0": 291.91,
+ "inf": 288.62
+ },
+ "Latency": {
+ "0.2": 82.31,
+ "0.4": 134.38,
+ "0.6": 495.99,
+ "0.8": 794.41,
+ "1.0": 798.39,
+ "2.0": 800.33,
+ "inf": 809.56
+ }
+ }
+ ]
+ }
+ }
+ },
+ {
+ "Platform": "Intel® Xeon® Platinum 8480+",
+ "Model": "meta-llama/Llama-2-7b-chat-hf",
+ "featured_SKU": true,
+ "whats_new_model": false,
+ "PlatformType": "Server Platforms (Intel® Xeon®)",
+ "Parameters": {
+ "OpenVINO Model Server": {
+ "Precisions": [
+ {
+ "Throughput": {
+ "0.2": 95.15,
+ "0.4": 188.31,
+ "0.6": 279.3,
+ "0.8": 366.78,
+ "1.0": 454.27,
+ "2.0": 788.9,
+ "inf": 825.97
+ },
+ "Latency": {
+ "0.2": 60.88,
+ "0.4": 71.96,
+ "0.6": 83.45,
+ "0.8": 103.77,
+ "1.0": 128.12,
+ "2.0": 237.62,
+ "inf": 253.59
+ }
+ }
+ ]
+ },
+ "vLLM with OpenVINO backend": {
+ "Precisions": [
+ {
+ "Throughput": {
+ "0.2": 95.06,
+ "0.4": 188.47,
+ "0.6": 280.54,
+ "0.8": 367.47,
+ "1.0": 450.81,
+ "2.0": 774.57,
+ "inf": 793.78
+ },
+ "Latency": {
+ "0.2": 63.84,
+ "0.4": 76.22,
+ "0.6": 87.21,
+ "0.8": 104.75,
+ "1.0": 136.77,
+ "2.0": 259.2,
+ "inf": 273.58
+ }
+ }
+ ]
+ }
+ }
+ },
{
"Platform": "Intel® Xeon® Platinum 8580",
- "Model": "mistralai/Mistral-7B-v0.1",
- "PlatformType": "None",
+ "Model": "meta-llama/Llama-2-7b-chat-hf",
+ "featured_SKU": true,
+ "whats_new_model": false,
+ "PlatformType": "Server Platforms (Intel® Xeon®)",
"Parameters": {
- "Vllm": {
+ "OpenVINO Model Server": {
"Precisions": [
{
"Throughput": {
- "0.2": "350.06",
- "0.6": "486.89",
- "0.8": "575.92",
- "2.0": "778.07"
+ "0.2": 95.29,
+ "0.4": 188.33,
+ "0.6": 280.09,
+ "0.8": 367.29,
+ "1.0": 453.21,
+ "2.0": 780.05,
+ "inf": 751.34
+ },
+ "Latency": {
+ "0.2": 52.44,
+ "0.4": 70.06,
+ "0.6": 84.54,
+ "0.8": 108.91,
+ "1.0": 136.45,
+ "2.0": 253.55,
+ "inf": 281.85
}
- },
+ }
+ ]
+ },
+ "vLLM with OpenVINO backend": {
+ "Precisions": [
+ {
+ "Throughput": {
+ "0.2": 95.0,
+ "0.4": 188.26,
+ "0.6": 279.78,
+ "0.8": 366.69,
+ "1.0": 450.26,
+ "2.0": 770.74,
+ "inf": 794.39
+ },
+ "Latency": {
+ "0.2": 58.07,
+ "0.4": 77.65,
+ "0.6": 91.14,
+ "0.8": 113.61,
+ "1.0": 144.21,
+ "2.0": 269.13,
+ "inf": 273.27
+ }
+ }
+ ]
+ }
+ }
+ },
+ {
+ "Platform": "Intel® Xeon® Platinum 8380",
+ "Model": "meta-llama/Meta-Llama-3-8B-Instruct",
+ "featured_SKU": false,
+ "whats_new_model": true,
+ "PlatformType": "Server Platforms (Intel® Xeon®)",
+ "Parameters": {
+ "OpenVINO Model Server": {
+ "Precisions": [
{
+ "Throughput": {
+ "0.2": 82.46,
+ "0.4": 162.73,
+ "0.6": 240.08,
+ "0.8": 273.75,
+ "1.0": 275.85,
+ "2.0": 276.3,
+ "inf": 275.15
+ },
"Latency": {
- "0.2": "60.93",
- "0.6": "91.63",
- "0.8": "113.61",
- "2.0": "240.25"
+ "0.2": 76.49,
+ "0.4": 122.1,
+ "0.6": 318.14,
+ "0.8": 785.8,
+ "1.0": 805.58,
+ "2.0": 809.37,
+ "inf": 816.2
}
}
]
},
- "Ovms": {
+ "vLLM with OpenVINO backend": {
"Precisions": [
{
"Throughput": {
- "0.2": "90.98",
- "0.6": "266.24",
- "0.8": "351.63",
- "2.0": "195.16"
+ "0.2": 82.32,
+ "0.4": 162.98,
+ "0.6": 239.28,
+ "2.0": 270.37
+ },
+ "Latency": {
+ "0.2": 87.92,
+ "0.4": 142.3,
+ "0.6": 343.36,
+ "2.0": 873.0
}
- },
+ }
+ ]
+ }
+ }
+ },
+ {
+ "Platform": "Intel® Xeon® Platinum 8480+",
+ "Model": "meta-llama/Meta-Llama-3-8B-Instruct",
+ "featured_SKU": true,
+ "whats_new_model": true,
+ "PlatformType": "Server Platforms (Intel® Xeon®)",
+ "Parameters": {
+ "OpenVINO Model Server": {
+ "Precisions": [
{
+ "Throughput": {
+ "0.2": 82.61,
+ "0.4": 164.44,
+ "0.6": 244.92,
+ "0.8": 323.34,
+ "1.0": 400.78,
+ "2.0": 731.9,
+ "inf": 848.45
+ },
"Latency": {
- "0.2": "54.9",
- "0.6": "78.78",
- "0.8": "95.78",
- "2.0": "352.23"
+ "0.2": 60.77,
+ "0.4": 69.1,
+ "0.6": 74.36,
+ "0.8": 81.41,
+ "1.0": 100.17,
+ "2.0": 206.5,
+ "inf": 246.56
+ }
+ }
+ ]
+ },
+ "vLLM with OpenVINO backend": {
+ "Precisions": [
+ {
+ "Throughput": {
+ "0.2": 82.54,
+ "0.4": 163.66,
+ "0.6": 243.88,
+ "0.8": 322.75,
+ "1.0": 400.46,
+ "2.0": 727.1
+ },
+ "Latency": {
+ "0.2": 65.37,
+ "0.4": 75.87,
+ "0.6": 81.14,
+ "0.8": 93.91,
+ "1.0": 107.13,
+ "2.0": 229.57
+ }
+ }
+ ]
+ }
+ }
+ },
+ {
+ "Platform": "Intel® Xeon® Platinum 8580",
+ "Model": "meta-llama/Meta-Llama-3-8B-Instruct",
+ "featured_SKU": true,
+ "whats_new_model": true,
+ "PlatformType": "Server Platforms (Intel® Xeon®)",
+ "Parameters": {
+ "OpenVINO Model Server": {
+ "Precisions": [
+ {
+ "Throughput": {
+ "0.2": 82.55,
+ "0.4": 164.52,
+ "0.6": 243.96,
+ "0.8": 323.07,
+ "1.0": 399.68,
+ "2.0": 727.18,
+ "inf": 856.72
+ },
+ "Latency": {
+ "0.2": 54.57,
+ "0.4": 69.17,
+ "0.6": 80.32,
+ "0.8": 92.94,
+ "1.0": 111.06,
+ "2.0": 215.46,
+ "inf": 245.72
+ }
+ }
+ ]
+ },
+ "vLLM with OpenVINO backend": {
+ "Precisions": [
+ {
+ "Throughput": {
+ "0.2": 82.64,
+ "0.6": 243.81,
+ "0.8": 321.8,
+ "1.0": 398.78,
+ "2.0": 722.48,
+ "inf": 792.34
+ },
+ "Latency": {
+ "0.2": 61.49,
+ "0.6": 90.54,
+ "0.8": 106.25,
+ "1.0": 123.6,
+ "2.0": 245.91,
+ "inf": 279.21
}
}
]
@@ -47,46 +332,168 @@
}
},
{
- "Platform": "Intel® Xeon® Platinum 8530",
+ "Platform": "Intel® Xeon® Platinum 8380",
"Model": "mistralai/Mistral-7B-v0.1",
- "PlatformType": "None",
+ "featured_SKU": false,
+ "whats_new_model": false,
+ "PlatformType": "Server Platforms (Intel® Xeon®)",
"Parameters": {
- "Vllm": {
+ "OpenVINO Model Server": {
+ "Precisions": [
+ {
+ "Throughput": {
+ "0.2": 91.74,
+ "0.4": 180.4,
+ "0.6": 262.97,
+ "0.8": 287.36,
+ "1.0": 289.08,
+ "2.0": 289.06,
+ "inf": 290.69
+ },
+ "Latency": {
+ "0.2": 74.84,
+ "0.4": 115.4,
+ "0.6": 345.64,
+ "0.8": 757.42,
+ "1.0": 776.6,
+ "2.0": 778.29,
+ "inf": 784.42
+ }
+ }
+ ]
+ },
+ "vLLM with OpenVINO backend": {
"Precisions": [
{
"Throughput": {
- "0.2": "350.06",
- "0.6": "486.89",
- "0.8": "575.92",
- "2.0": "778.07"
+ "0.2": 97.21,
+ "0.4": 192.46,
+ "0.6": 265.82,
+ "0.8": 273.24,
+ "1.0": 272.65,
+ "inf": 274.0
+ },
+ "Latency": {
+ "0.2": 166.77,
+ "0.4": 161.76,
+ "0.6": 666.89,
+ "0.8": 802.15,
+ "1.0": 810.26,
+ "inf": 807.71
}
- },
+ }
+ ]
+ }
+ }
+ },
+ {
+ "Platform": "Intel® Xeon® Platinum 8480+",
+ "Model": "mistralai/Mistral-7B-v0.1",
+ "featured_SKU": true,
+ "whats_new_model": false,
+ "PlatformType": "Server Platforms (Intel® Xeon®)",
+ "Parameters": {
+ "OpenVINO Model Server": {
+ "Precisions": [
{
+ "Throughput": {
+ "0.2": 90.95,
+ "0.4": 181.06,
+ "0.6": 267.29,
+ "0.8": 351.62,
+ "1.0": 431.45,
+ "2.0": 751.85,
+ "inf": 596.0
+ },
"Latency": {
- "0.2": "60.93",
- "0.6": "91.63",
- "0.8": "113.61",
- "2.0": "240.25"
+ "0.2": 59.95,
+ "0.4": 63.41,
+ "0.6": 73.42,
+ "0.8": 85.99,
+ "1.0": 98.67,
+ "2.0": 205.2,
+ "inf": 205.97
}
}
]
},
- "Ovms": {
+ "vLLM with OpenVINO backend": {
+ "Precisions": [
+ {
+ "Throughput": {
+ "0.2": 98.18,
+ "0.4": 194.35,
+ "0.6": 287.28,
+ "0.8": 376.31,
+ "1.0": 460.32,
+ "2.0": 771.81,
+ "inf": 789.38
+ },
+ "Latency": {
+ "0.2": 64.88,
+ "0.4": 73.3,
+ "0.6": 84.37,
+ "0.8": 100.8,
+ "1.0": 133.98,
+ "2.0": 240.99,
+ "inf": 251.55
+ }
+ }
+ ]
+ }
+ }
+ },
+ {
+ "Platform": "Intel® Xeon® Platinum 8580",
+ "Model": "mistralai/Mistral-7B-v0.1",
+ "featured_SKU": true,
+ "whats_new_model": false,
+ "PlatformType": "Server Platforms (Intel® Xeon®)",
+ "Parameters": {
+ "OpenVINO Model Server": {
"Precisions": [
{
"Throughput": {
- "0.2": "90.98",
- "0.6": "266.24",
- "0.8": "351.63",
- "2.0": "195.16"
+ "0.2": 91.2,
+ "0.4": 180.14,
+ "0.6": 267.75,
+ "0.8": 351.12,
+ "1.0": 428.31,
+ "2.0": 744.99,
+ "inf": 852.05
+ },
+ "Latency": {
+ "0.2": 54.31,
+ "0.4": 67.14,
+ "0.6": 77.59,
+ "0.8": 92.17,
+ "1.0": 112.75,
+ "2.0": 225.48,
+ "inf": 241.49
}
- },
+ }
+ ]
+ },
+ "vLLM with OpenVINO backend": {
+ "Precisions": [
{
+ "Throughput": {
+ "0.2": 98.1,
+ "0.4": 194.47,
+ "0.6": 286.97,
+ "0.8": 375.84,
+ "1.0": 460.21,
+ "2.0": 764.54,
+ "inf": 787.97
+ },
"Latency": {
- "0.2": "54.9",
- "0.6": "78.78",
- "0.8": "95.78",
- "2.0": "352.23"
+ "0.2": 62.26,
+ "0.4": 78.08,
+ "0.6": 91.61,
+ "0.8": 116.71,
+ "1.0": 141.76,
+ "2.0": 250.38,
+ "inf": 254.25
}
}
]
diff --git a/src/inference/include/openvino/runtime/infer_request.hpp b/src/inference/include/openvino/runtime/infer_request.hpp
index ed4dcd67797b84..10a606a2b6c535 100644
--- a/src/inference/include/openvino/runtime/infer_request.hpp
+++ b/src/inference/include/openvino/runtime/infer_request.hpp
@@ -255,7 +255,7 @@ class OPENVINO_RUNTIME_API InferRequest {
/**
* @brief Infers specified input(s) in synchronous mode.
* @note It blocks all methods of InferRequest while request is ongoing (running or waiting in a queue).
- * Calling any method leads to throwning the ov::Busy exception.
+ * Calling any method leads to throwing the ov::Busy exception.
*/
void infer();
@@ -274,7 +274,7 @@ class OPENVINO_RUNTIME_API InferRequest {
/**
* @brief Starts inference of specified input(s) in asynchronous mode.
* @note It returns immediately. Inference starts also immediately.
- * Calling any method while the request in a running state leads to throwning the ov::Busy exception.
+ * Calling any method while the request is in a running state leads to throwing the ov::Busy exception.
*/
void start_async();
diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp
index 510ab7fc43b0c8..1fc3a3e20965c6 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp
@@ -270,4 +270,22 @@ struct BYPASS_UMD_CACHING final : OptionBase {
return OptionMode::RunTime;
}
};
+
+//
+// RUN_INFERENCES_SEQUENTIALLY
+//
+struct RUN_INFERENCES_SEQUENTIALLY final : OptionBase<RUN_INFERENCES_SEQUENTIALLY, bool> {
+ static std::string_view key() {
+ return ov::intel_npu::run_inferences_sequentially.name();
+ }
+
+ static bool defaultValue() {
+ return false;
+ }
+
+ static OptionMode mode() {
+ return OptionMode::RunTime;
+ }
+};
+
} // namespace intel_npu
diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp
index ec92e10a9f89c8..8aabd132e9431a 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp
@@ -327,5 +327,14 @@ static constexpr ov::Property backend_n
*/
static constexpr ov::Property backend_compilation_params{"NPU_BACKEND_COMPILATION_PARAMS"};
+/**
+ * @brief [Only for NPU Plugin]
+ * Type: boolean, default is false.
+ * This option allows running inferences sequentially, in the order in which they were created.
+ * @note Experimental property; for now it only works in very specific scenarios. Driver updates are needed before a
+ * robust solution for in-order execution can be implemented.
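+ *
+ * A minimal usage sketch (illustrative only):
+ * @code
+ * ov::Core core;
+ * // "model" is an ov::Model obtained earlier, e.g. via core.read_model()
+ * auto compiled_model = core.compile_model(model, "NPU", ov::intel_npu::run_inferences_sequentially(true));
+ * @endcode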
+ */
+static constexpr ov::Property<bool> run_inferences_sequentially{"NPU_RUN_INFERENCES_SEQUENTIALLY"};
+
} // namespace intel_npu
} // namespace ov
diff --git a/src/plugins/intel_npu/src/al/src/config/runtime.cpp b/src/plugins/intel_npu/src/al/src/config/runtime.cpp
index 759956b6f597df..3da16796219332 100644
--- a/src/plugins/intel_npu/src/al/src/config/runtime.cpp
+++ b/src/plugins/intel_npu/src/al/src/config/runtime.cpp
@@ -27,6 +27,7 @@ void intel_npu::registerRunTimeOptions(OptionsDesc& desc) {
desc.add();
desc.add();
desc.add();
+    desc.add<RUN_INFERENCES_SEQUENTIALLY>();
}
// Heuristically obtained number. Varies depending on the values of PLATFORM and PERFORMANCE_HINT
diff --git a/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp b/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp
index 3efbdab666d1ba..1e8781b0afe820 100644
--- a/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp
+++ b/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp
@@ -38,25 +38,6 @@ class ZeroInferRequest final : public SyncInferRequest {
std::vector get_profiling_info() const override;
std::vector get_raw_profiling_data() const;
- /**
- * @brief Determines if batching can be addressed inside the plugin. In the positive case, the batch size used by
- * the model will also be deduced and returned.
- * @details Batching can be handled by the plugin only if:
- * - The batch axis is the first axis.
- * - The batch size received by the compiler takes the default value of 1.
- * - The batch size found in the IR model matches for all inputs/outputs and takes a value different than the
- * default one.
- *
- * If any of the previous conditions is not fulfilled, the functon will return the default batch size, thus no
- * custom algorithm will be applied inside the plugin in order to address batching.
- *
- * @param metadata Metadata containing the shape values as seen by both the compiler and IR model. These will
- * ultimately be used for determining the batch size.
- * @returns The batch size deduced by the algorithm or the default value of 1 if batching cannot be performed inside
- * the plugin.
- */
- std::optional get_batch_size(const NetworkMetadata& metadata);
-
/**
* @brief Check the received tensor and set the Level Zero tensor accordingly
* @param tensor Reference to a tensor.
@@ -106,22 +87,6 @@ class ZeroInferRequest final : public SyncInferRequest {
std::shared_ptr _npuProfiling;
std::unique_ptr _pipeline;
- /**
- * @brief Indicates how many command lists will be used inside the pipeline.
- * @details Leveraging multiple command lists implies distributing the input/output buffers accross the batch axis
- * between these lists.
- *
- * If batching is handled on compiler's side then a single command list shall be used, we don't do any
- * specific operation inside the plugin in this case.
- */
- size_t _numberOfCommandLists = 1;
-
- /**
- * @brief The batch size used by the corresponding model.
- * @details The attribute contains a value only if the plugin performs the batches splitting operation.
- */
- std::optional _batchSize = std::nullopt;
-
bool _pipelineIsCreated = false;
};
diff --git a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
index 5b7f488d3eb96a..de5e1ac81c4728 100644
--- a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
+++ b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
@@ -28,7 +28,6 @@ struct Pipeline {
const std::shared_ptr& npu_profiling,
const std::vector>>& inputTensorsData,
const std::vector>& outputTensorsData,
- size_t numberOfCommandLists,
uint32_t group_ordinal);
Pipeline(const Pipeline&) = delete;
@@ -43,12 +42,25 @@ struct Pipeline {
void updateCommandList(const TensorData& tensorsData, uint32_t index, size_t commandListIndex);
protected:
+    std::shared_ptr<IGraph> _graph;
const Config _config;
+ const uint32_t _id;
+
+ /**
+ * @brief Indicates how many command lists will be used inside the pipeline.
+     * @details Leveraging multiple command lists implies distributing the input/output buffers across the batch axis
+ * between these lists.
+ *
+     * If batching is handled on the compiler's side, a single command list shall be used; the plugin does not
+     * perform any batch-specific operation in this case.
+ */
+ size_t _number_of_command_lists;
+
std::shared_ptr _command_queue;
std::vector> _command_lists;
std::vector> _fences;
- EventPool _event_pool;
- std::vector> _events;
+    std::shared_ptr<EventPool> _event_pool;
+    std::vector<std::shared_ptr<Event>> _events;
bool sync_output_with_fences_ = true;
std::shared_ptr _npu_profiling;
Logger _logger;
diff --git a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
index 88dfaf944a8b34..a0e5d2d11c1fef 100644
--- a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
+++ b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
@@ -20,8 +20,6 @@ using namespace intel_npu;
namespace {
constexpr std::size_t SINGLE_TENSOR = 0;
-constexpr std::size_t BATCH_AXIS = 0;
-constexpr std::size_t DEFAULT_BATCH_SIZE = 1;
constexpr bool INPUT = true;
constexpr bool OUTPUT = false;
@@ -96,64 +94,6 @@ bool memory_was_allocated_in_the_same_l0_context(ze_context_handle_t hContext, c
} // namespace
-std::optional ZeroInferRequest::get_batch_size(const NetworkMetadata& metadata) {
- if (!metadata.outputs.at(0).shapeFromIRModel.has_value()) {
- _logger.debug("Batching on the plugin is not used, batching is handled by the compiler");
- return std::nullopt;
- }
-
- const ov::PartialShape& firstOutputShape = *metadata.outputs.at(0).shapeFromIRModel;
- if (firstOutputShape.is_dynamic()) {
- _logger.warning("Networks using dynamic shapes are not supported when batching is handled by the plugin");
- return std::nullopt;
- }
- if (firstOutputShape.rank().get_length() == 0) {
- _logger.warning(
- "Networks using rank 0 shapes for inputs/outputs are not supported when batching is handled by the plugin");
- return std::nullopt;
- }
-
- const size_t candidateBatchSize = firstOutputShape[BATCH_AXIS].get_length();
- if (candidateBatchSize == 0 || candidateBatchSize == DEFAULT_BATCH_SIZE) {
- _logger.debug("Batching on the plugin is not used, batching is handled by the compiler");
- return std::nullopt;
- }
-
- auto checkDescriptorsUseCandidateBatchSize = [candidateBatchSize](const std::vector& descriptors) {
- for (const IODescriptor& descriptor : descriptors) {
- OPENVINO_ASSERT(descriptor.shapeFromIRModel.has_value(),
- "Missing value for the \"shapeFromIRModel\" attribute, I/O descriptor");
-
- const ov::PartialShape& shapeFromCompiler = descriptor.shapeFromCompiler;
- const ov::PartialShape& shapeFromIRModel = *descriptor.shapeFromIRModel;
-
- if (shapeFromCompiler.is_dynamic() || shapeFromCompiler.rank().get_length() == 0 ||
- *shapeFromCompiler.begin() != DEFAULT_BATCH_SIZE) {
- return false;
- }
-
- if (!descriptor.isStateInput && !descriptor.isStateOutput && !descriptor.isShapeTensor) {
- if (shapeFromIRModel.is_dynamic() || shapeFromIRModel.rank().get_length() == 0 ||
- *shapeFromIRModel.begin() != candidateBatchSize) {
- return false;
- }
- }
- }
-
- return true;
- };
-
- if (!checkDescriptorsUseCandidateBatchSize(metadata.inputs) ||
- !checkDescriptorsUseCandidateBatchSize(metadata.outputs)) {
- _logger.debug("Batching on the plugin is not used, batching is handled by the compiler");
- return std::nullopt;
- }
-
- _logger.debug("Batching is handled by the plugin");
-
- return candidateBatchSize;
-}
-
//------------------------------------------------------------------------------
ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& initStructs,
const std::shared_ptr& compiledModel,
@@ -187,13 +127,6 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr&
_inputAllocator =
std::make_shared(_initStructs, ZE_HOST_MEM_ALLOC_FLAG_BIAS_WRITE_COMBINED);
- if (config.get() != ov::intel_npu::BatchMode::COMPILER) {
- _batchSize = get_batch_size(_metadata);
- }
- if (_batchSize.has_value()) {
- _numberOfCommandLists = *_batchSize;
- }
-
_logger.debug("ZeroInferRequest::ZeroInferRequest - checking level zero attributes and allocating tensors");
size_t ioIndex = 0;
@@ -205,7 +138,8 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr&
continue;
}
- get_level_zero_input(ioIndex) = allocate_tensor(inputDescriptor, ioIndex, INPUT, *_inputAllocator, _batchSize);
+ get_level_zero_input(ioIndex) =
+ allocate_tensor(inputDescriptor, ioIndex, INPUT, *_inputAllocator, _graph->get_batch_size());
get_input_tensor_data(ioIndex) =
TensorData{get_level_zero_input(ioIndex)->data(), get_level_zero_input(ioIndex)->get_byte_size()};
@@ -222,7 +156,7 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr&
}
_levelZeroOutputTensors.at(ioIndex) =
- allocate_tensor(outputDescriptor, ioIndex, OUTPUT, *_outputAllocator, _batchSize);
+ allocate_tensor(outputDescriptor, ioIndex, OUTPUT, *_outputAllocator, _graph->get_batch_size());
_outputTensorsData.at(ioIndex) =
std::optional(TensorData{_levelZeroOutputTensors.at(ioIndex)->data(),
_levelZeroOutputTensors.at(ioIndex)->get_byte_size()});
@@ -236,7 +170,7 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr&
void ZeroInferRequest::create_pipeline() {
for (size_t inputIndex = 0; inputIndex < _metadata.inputs.size(); ++inputIndex) {
if (is_batched_input(inputIndex)) {
- if (_batchSize.has_value()) {
+ if (_graph->get_batch_size().has_value()) {
_logger.debug("ZeroInferRequest::create_pipeline - tensors %s were already allocated",
_metadata.inputs.at(inputIndex).nodeFriendlyName.c_str());
continue;
@@ -250,8 +184,11 @@ void ZeroInferRequest::create_pipeline() {
}
_logger.debug("ZeroInferRequest::create_pipeline - allocate new tensor");
- get_level_zero_input(inputIndex) =
- allocate_tensor(_metadata.inputs.at(inputIndex), inputIndex, INPUT, *_inputAllocator, _batchSize);
+ get_level_zero_input(inputIndex) = allocate_tensor(_metadata.inputs.at(inputIndex),
+ inputIndex,
+ INPUT,
+ *_inputAllocator,
+ _graph->get_batch_size());
get_input_tensor_data(inputIndex) = std::optional(
TensorData{get_level_zero_input(inputIndex)->data(), get_level_zero_input(inputIndex)->get_byte_size()});
}
@@ -263,17 +200,20 @@ void ZeroInferRequest::create_pipeline() {
continue;
}
_logger.debug("ZeroInferRequest::create_pipeline - allocate new tensor");
- _levelZeroOutputTensors.at(outputIndex) =
- allocate_tensor(_metadata.outputs.at(outputIndex), outputIndex, OUTPUT, *_outputAllocator, _batchSize);
+ _levelZeroOutputTensors.at(outputIndex) = allocate_tensor(_metadata.outputs.at(outputIndex),
+ outputIndex,
+ OUTPUT,
+ *_outputAllocator,
+ _graph->get_batch_size());
_outputTensorsData.at(outputIndex) =
std::optional(TensorData{_levelZeroOutputTensors.at(outputIndex)->data(),
_levelZeroOutputTensors.at(outputIndex)->get_byte_size()});
}
// Find the corresponding command queue group.
- _logger.debug("ZeroDevice::ZeroDevice - findGroupOrdinal");
+ _logger.debug("ZeroInferRequest::create_pipeline - findGroupOrdinal");
auto groupOrdinal = zeroUtils::findGroupOrdinal(_initStructs->getDevice(), _properties);
- _logger.debug("ZeroDevice::ZeroDevice - init completed");
+ _logger.debug("ZeroInferRequest::create_pipeline - init completed");
_logger.debug("ZeroInferRequest::create_pipeline - constructing pipeline");
@@ -286,7 +226,6 @@ void ZeroInferRequest::create_pipeline() {
_npuProfiling,
_inputTensorsData,
_outputTensorsData,
- _numberOfCommandLists,
groupOrdinal);
_logger.debug("ZeroInferRequest::create_pipeline - SyncInferRequest completed");
@@ -321,7 +260,7 @@ void ZeroInferRequest::set_tensor_data(const std::shared_ptr& tenso
index,
isInput,
isInput ? *_inputAllocator : *_outputAllocator,
- _batchSize);
+ _graph->get_batch_size());
setTensorData = true;
levelZeroTensorCreatedLocally = true;
@@ -444,7 +383,7 @@ void ZeroInferRequest::set_tensors(const ov::Output& port,
get_user_inputs(foundPort.idx) = tensors;
if (_initStructs->getMutableCommandListVersion()) {
- if (_batchSize.has_value()) {
+ if (_graph->get_batch_size().has_value()) {
for (size_t i = 0; i < tensors.size(); i++) {
auto remoteTensor = std::dynamic_pointer_cast(tensors[i]._ptr);
@@ -525,13 +464,17 @@ ov::SoPtr ZeroInferRequest::get_tensor(const ov::Outputget_batch_size());
tensorsData = std::optional(TensorData{levelZeroTensors->data(), levelZeroTensors->get_byte_size()});
return levelZeroTensors;
}
void ZeroInferRequest::infer() {
+    if (_config.get<RUN_INFERENCES_SEQUENTIALLY>()) {
+ OPENVINO_THROW("Only start async is supported when RUN_INFERENCES_SEQUENTIALLY is enabled!");
+ }
+
infer_async();
get_result();
}
@@ -567,7 +510,7 @@ void ZeroInferRequest::infer_async() {
}
if (is_batched_input(inputIndex)) {
- if (_batchSize.has_value()) {
+ if (_graph->get_batch_size().has_value()) {
for (size_t i = 0; i < userTensor.size(); i++) {
auto levelZeroBatchRemoteTensor =
std::dynamic_pointer_cast(get_level_zero_input(inputIndex, i));
diff --git a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp
index c782c3e0684f0d..d7f06b813810bb 100644
--- a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp
+++ b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp
@@ -8,6 +8,7 @@
#include
#include "intel_npu/common/itt.hpp"
+#include "intel_npu/config/runtime.hpp"
#include "intel_npu/prefix.hpp"
#include "intel_npu/utils/logger/logger.hpp"
#include "intel_npu/utils/zero/zero_api.hpp"
@@ -23,13 +24,15 @@ Pipeline::Pipeline(const Config& config,
const std::shared_ptr& npu_profiling,
const std::vector>>& inputTensorsData,
const std::vector>& outputTensorsData,
- size_t numberOfCommandLists,
uint32_t group_ordinal)
- : _config(config),
- _command_queue(graph->get_command_queue()),
- _event_pool{initStructs->getDevice(),
- initStructs->getContext(),
- numberOfCommandLists ? static_cast(numberOfCommandLists) : 1},
+ : _graph(graph),
+ _config(config),
+ _id(_graph->get_unique_id()),
+ _number_of_command_lists(_graph->get_batch_size().has_value() ? *_graph->get_batch_size() : 1),
+ _event_pool{
+          std::make_shared<EventPool>(initStructs->getDevice(),
+ initStructs->getContext(),
+              _number_of_command_lists ? static_cast<uint32_t>(_number_of_command_lists) : 1)},
_npu_profiling(npu_profiling),
_logger("Pipeline", _config.get()) {
OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "Zero_infer_request::Pipeline::Pipeline");
@@ -39,20 +42,20 @@ Pipeline::Pipeline(const Config& config,
profiling_query.create(profiling_pool._handle);
}
- _command_lists.reserve(numberOfCommandLists);
- _events.reserve(numberOfCommandLists);
- _fences.reserve(numberOfCommandLists);
+ _command_lists.reserve(_number_of_command_lists);
+ _events.reserve(_number_of_command_lists);
+ _fences.reserve(_number_of_command_lists);
_logger.debug("Pipeline - emplace_back _event_pool and _command_queue");
- for (size_t i = 0; i < numberOfCommandLists; i++) {
+ for (size_t i = 0; i < _number_of_command_lists; i++) {
_command_lists.emplace_back(
std::make_unique(initStructs,
group_ordinal,
initStructs->getMutableCommandListVersion() ? true : false));
- _events.emplace_back(std::make_unique(_event_pool.handle(), static_cast(i)));
- _fences.emplace_back(std::make_unique(*_command_queue));
+        _events.emplace_back(std::make_shared<Event>(_event_pool, static_cast<uint32_t>(i)));
+        _fences.emplace_back(std::make_unique<Fence>(*_graph->get_command_queue()));
}
- for (size_t i = 0; i < numberOfCommandLists; i++) {
+ for (size_t i = 0; i < _number_of_command_lists; i++) {
size_t ioIndex = 0;
for (const auto& desc : graph->get_input_descriptors()) {
if (inputTensorsData.at(ioIndex).size() > 1) {
@@ -64,7 +67,7 @@ Pipeline::Pipeline(const Config& config,
graph->set_argument_value(desc.idx,
static_cast(inputTensorsData.at(ioIndex).at(0)->mem) +
- (i * inputTensorsData.at(ioIndex).at(0)->size) / numberOfCommandLists);
+ (i * inputTensorsData.at(ioIndex).at(0)->size) / _number_of_command_lists);
++ioIndex;
}
@@ -73,10 +76,16 @@ Pipeline::Pipeline(const Config& config,
for (const auto& desc : graph->get_output_descriptors()) {
graph->set_argument_value(desc.idx,
static_cast(outputTensorsData.at(ioIndex)->mem) +
- (i * outputTensorsData.at(ioIndex)->size) / numberOfCommandLists);
+ (i * outputTensorsData.at(ioIndex)->size) / _number_of_command_lists);
++ioIndex;
}
+        if (_config.get<RUN_INFERENCES_SEQUENTIALLY>()) {
+ if (_graph->get_last_submitted_event(i)) {
+ _graph->get_last_submitted_event(i)->AppendWaitOnEvent(*_command_lists.at(i));
+ }
+ }
+
/// append timestamp command if feature was activated
if (_npu_profiling != nullptr) {
_command_lists.at(i)->appendBarrier();
@@ -92,6 +101,15 @@ Pipeline::Pipeline(const Config& config,
_command_lists.at(i)->appendNpuTimestamp(reinterpret_cast(_npu_profiling->npu_ts_infer_end));
}
+        if (_config.get<RUN_INFERENCES_SEQUENTIALLY>()) {
+ if (_graph->get_last_submitted_event(i)) {
+ _graph->get_last_submitted_event(i)->AppendEventReset(*_command_lists.at(i));
+ }
+
+ _events.at(i)->AppendSignalEvent(*_command_lists.at(i));
+ _graph->set_last_submitted_event(_events.at(i), i);
+ }
+
// appendBarrier used in L0 as well
if (!sync_output_with_fences_) {
_command_lists.at(i)->appendBarrier();
@@ -105,12 +123,24 @@ Pipeline::Pipeline(const Config& config,
void Pipeline::push() {
_logger.debug("Pipeline - push() started");
+    if (_config.get<RUN_INFERENCES_SEQUENTIALLY>()) {
+ if (_id) {
+ auto previousIndex = _graph->get_last_submitted_id();
+
+ if (_id != ++previousIndex) {
+ OPENVINO_THROW("Inferences should be called in the same order they were called the first time!");
+ }
+ }
+
+ _graph->set_last_submitted_id(_id);
+ }
+
for (size_t i = 0; i < _command_lists.size(); ++i) {
OV_ITT_TASK_CHAIN(ZERO_PIPELINE_IP_PUSH, itt::domains::LevelZeroBackend, "Pipeline", "push");
if (sync_output_with_fences_) {
- _command_queue->executeCommandList(*_command_lists.at(i), *_fences.at(i));
+ _graph->get_command_queue()->executeCommandList(*_command_lists.at(i), *_fences.at(i));
} else {
- _command_queue->executeCommandList(*_command_lists.at(i));
+ _graph->get_command_queue()->executeCommandList(*_command_lists.at(i));
}
}
@@ -154,12 +184,12 @@ void Pipeline::updateCommandList(const TensorData& tensorsData, uint32_t index)
OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_UMCL, itt::domains::LevelZeroBackend, "Pipeline", "updateCommandList");
_logger.debug("Pipeline - updateCommandList");
- const size_t numberOfCommandLists = _command_lists.size();
+ const size_t _number_of_command_lists = _command_lists.size();
- for (size_t i = 0; i < numberOfCommandLists; i++) {
+ for (size_t i = 0; i < _number_of_command_lists; i++) {
_command_lists.at(i)->updateMutableCommandList(
index,
+            static_cast<unsigned char*>(tensorsData.mem) + (i * tensorsData.size) / _number_of_command_lists);
+ static_cast(tensorsData.mem) + (i * tensorsData.size) / _number_of_command_lists);
_command_lists.at(i)->close();
}
};
@@ -168,9 +198,9 @@ void Pipeline::updateCommandList(const TensorData& tensorsData, uint32_t index,
OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_UMCL, itt::domains::LevelZeroBackend, "Pipeline", "updateCommandList");
_logger.debug("Pipeline - updateCommandList");
- const size_t numberOfCommandLists = _command_lists.size();
+ const size_t _number_of_command_lists = _command_lists.size();
- OPENVINO_ASSERT(commandListIndex < numberOfCommandLists,
+ OPENVINO_ASSERT(commandListIndex < _number_of_command_lists,
"Command list index is higgher than the number of Command lists ",
commandListIndex);
diff --git a/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp b/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp
index 51c4a4cf26eafd..7e718d9172f4f7 100644
--- a/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp
+++ b/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp
@@ -9,6 +9,7 @@
#include
#include "intel_npu/network_metadata.hpp"
+#include "intel_npu/utils/zero/zero_init.hpp"
#include "intel_npu/utils/zero/zero_utils.hpp"
#include "intel_npu/utils/zero/zero_wrappers.hpp"
#include "openvino/runtime/profiling_info.hpp"
@@ -17,13 +18,10 @@ namespace intel_npu {
class IGraph : public std::enable_shared_from_this {
public:
- IGraph(ze_graph_handle_t handle, NetworkMetadata metadata, std::optional> blob)
- : _handle(handle),
- _metadata(std::move(metadata)) {
- if (blob.has_value()) {
- _blob = std::move(*blob);
- }
- }
+ IGraph(ze_graph_handle_t handle,
+ NetworkMetadata metadata,
+ const Config& config,
+           std::optional<std::vector<uint8_t>> blob);
virtual void export_blob(std::ostream& stream) const = 0;
@@ -36,55 +34,48 @@ class IGraph : public std::enable_shared_from_this {
virtual ~IGraph() = default;
- const NetworkMetadata& get_metadata() const {
- return _metadata;
- }
-
- ze_graph_handle_t get_handle() const {
- return _handle;
- }
-
- void update_network_name(std::string_view name) {
- _metadata.name = name;
- }
-
- inline const std::vector& get_input_descriptors() const {
- return _input_descriptors;
- }
-
- inline const std::vector& get_output_descriptors() const {
- return _output_descriptors;
- }
-
- inline const std::shared_ptr& get_command_queue() const {
- return _command_queue;
- }
-
- void set_workload_type(const ov::WorkloadType workloadType) const {
- if (_command_queue == nullptr) {
- return;
- }
-
- ze_command_queue_workload_type_t zeWorkloadType;
- switch (workloadType) {
- case ov::WorkloadType::DEFAULT:
- zeWorkloadType = ze_command_queue_workload_type_t::ZE_WORKLOAD_TYPE_DEFAULT;
- break;
- case ov::WorkloadType::EFFICIENT:
- zeWorkloadType = ze_command_queue_workload_type_t::ZE_WORKLOAD_TYPE_BACKGROUND;
- break;
- default:
- OPENVINO_THROW("Unknown value for WorkloadType!");
- }
-
- _command_queue->setWorkloadType(zeWorkloadType);
- }
-
- std::mutex& get_mutex() {
- return _mutex;
- }
+ const NetworkMetadata& get_metadata() const;
+ ze_graph_handle_t get_handle() const;
+
+ void update_network_name(std::string_view name);
+
+    const std::vector<ArgumentDescriptor>& get_input_descriptors() const;
+    const std::vector<ArgumentDescriptor>& get_output_descriptors() const;
+    const std::shared_ptr<CommandQueue>& get_command_queue() const;
+
+ void set_workload_type(const ov::WorkloadType workloadType) const;
+
+ std::mutex& get_mutex();
+
+    void set_last_submitted_event(const std::shared_ptr<Event>& event, size_t indexOfCommandList);
+    const std::shared_ptr<Event>& get_last_submitted_event(size_t indexOfCommandList) const;
+
+ uint32_t get_unique_id();
+ void set_last_submitted_id(uint32_t id_index);
+ const uint32_t get_last_submitted_id() const;
+
+    const std::optional<std::size_t> get_batch_size() const;
protected:
+ /**
+ * @brief Determines if batching can be addressed inside the plugin. In the positive case, the batch size used by
+ * the model will also be deduced and returned.
+ * @details Batching can be handled by the plugin only if:
+ * - The batch axis is the first axis.
+ * - The batch size received by the compiler takes the default value of 1.
+ * - The batch size found in the IR model matches for all inputs/outputs and takes a value different than the
+ * default one.
+ *
+     * If any of the previous conditions is not fulfilled, the function will return std::nullopt and no
+     * custom algorithm will be applied inside the plugin in order to address batching.
+ *
+ * @param metadata Metadata containing the shape values as seen by both the compiler and IR model. These will
+ * ultimately be used for determining the batch size.
+     * @returns The batch size deduced by the algorithm, or std::nullopt if batching cannot be performed inside
+     * the plugin.
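+     *
+     * Example (illustrative shapes only): if the IR model reports an input/output shape of [4, 3, 224, 224] while
+     * the compiler reports [1, 3, 224, 224], the deduced batch size is 4 and the plugin distributes execution
+     * across that many command lists.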
+ */
+    std::optional<std::size_t> get_batch_size(const NetworkMetadata& metadata);
+
ze_graph_handle_t _handle = nullptr;
NetworkMetadata _metadata;
@@ -92,12 +83,24 @@ class IGraph : public std::enable_shared_from_this {
std::vector _output_descriptors;
std::shared_ptr _command_queue;
+    std::vector<std::shared_ptr<Event>> _last_submitted_event;
// Used to protect zero pipeline creation in the graph. The pipeline should be created only once per graph when the
// first inference starts running
std::mutex _mutex;
std::vector _blob;
+
+ uint32_t _unique_id = 0;
+ uint32_t _last_submitted_id;
+
+ /**
+ * @brief The batch size used by the corresponding model.
+ * @details The attribute contains a value only if the plugin performs the batches splitting operation.
+ */
+    std::optional<std::size_t> _batch_size = std::nullopt;
+
+ Logger _logger;
};
} // namespace intel_npu
diff --git a/src/plugins/intel_npu/src/common/src/igraph.cpp b/src/plugins/intel_npu/src/common/src/igraph.cpp
new file mode 100644
index 00000000000000..fd5463af5eea3e
--- /dev/null
+++ b/src/plugins/intel_npu/src/common/src/igraph.cpp
@@ -0,0 +1,159 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "intel_npu/common/igraph.hpp"
+
+#include "intel_npu/config/compiler.hpp"
+#include "intel_npu/config/runtime.hpp"
+
+namespace {
+constexpr std::size_t BATCH_AXIS = 0;
+constexpr std::size_t DEFAULT_BATCH_SIZE = 1;
+} // namespace
+
+namespace intel_npu {
+
+IGraph::IGraph(ze_graph_handle_t handle,
+ NetworkMetadata metadata,
+ const Config& config,
+               std::optional<std::vector<uint8_t>> blob)
+ : _handle(handle),
+ _metadata(std::move(metadata)),
+ _logger("IGraph", config.get()) {
+ if (blob.has_value()) {
+ _blob = std::move(*blob);
+ }
+}
+
+const NetworkMetadata& IGraph::get_metadata() const {
+ return _metadata;
+}
+
+ze_graph_handle_t IGraph::get_handle() const {
+ return _handle;
+}
+
+void IGraph::update_network_name(std::string_view name) {
+ _metadata.name = name;
+}
+
+const std::vector<ArgumentDescriptor>& IGraph::get_input_descriptors() const {
+ return _input_descriptors;
+}
+
+const std::vector<ArgumentDescriptor>& IGraph::get_output_descriptors() const {
+ return _output_descriptors;
+}
+
+const std::shared_ptr<CommandQueue>& IGraph::get_command_queue() const {
+ return _command_queue;
+}
+
+void IGraph::set_workload_type(const ov::WorkloadType workloadType) const {
+ if (_command_queue == nullptr) {
+ return;
+ }
+
+ ze_command_queue_workload_type_t zeWorkloadType;
+ switch (workloadType) {
+ case ov::WorkloadType::DEFAULT:
+ zeWorkloadType = ze_command_queue_workload_type_t::ZE_WORKLOAD_TYPE_DEFAULT;
+ break;
+ case ov::WorkloadType::EFFICIENT:
+ zeWorkloadType = ze_command_queue_workload_type_t::ZE_WORKLOAD_TYPE_BACKGROUND;
+ break;
+ default:
+ OPENVINO_THROW("Unknown value for WorkloadType!");
+ }
+
+ _command_queue->setWorkloadType(zeWorkloadType);
+}
+
+std::mutex& IGraph::get_mutex() {
+ return _mutex;
+}
+
+void IGraph::set_last_submitted_event(const std::shared_ptr<Event>& event, size_t indexOfCommandList) {
+ _last_submitted_event[indexOfCommandList] = event;
+}
+
+const std::shared_ptr<Event>& IGraph::get_last_submitted_event(size_t indexOfCommandList) const {
+ return _last_submitted_event[indexOfCommandList];
+}
+
+uint32_t IGraph::get_unique_id() {
+ return _unique_id++;
+}
+
+void IGraph::set_last_submitted_id(uint32_t id_index) {
+ _last_submitted_id = id_index;
+}
+
+const uint32_t IGraph::get_last_submitted_id() const {
+ return _last_submitted_id;
+}
+
+std::optional<std::size_t> IGraph::get_batch_size(const NetworkMetadata& metadata) {
+ if (!metadata.outputs.at(0).shapeFromIRModel.has_value()) {
+ _logger.debug("Batching on the plugin is not used, batching is handled by the compiler");
+ return std::nullopt;
+ }
+
+ const ov::PartialShape& firstOutputShape = *metadata.outputs.at(0).shapeFromIRModel;
+ if (firstOutputShape.is_dynamic()) {
+ _logger.warning("Networks using dynamic shapes are not supported when batching is handled by the plugin");
+ return std::nullopt;
+ }
+ if (firstOutputShape.rank().get_length() == 0) {
+ _logger.warning("Networks using rank 0 shapes for inputs/outputs are not supported when batching is "
+ "handled by the plugin");
+ return std::nullopt;
+ }
+
+ const size_t candidateBatchSize = firstOutputShape[BATCH_AXIS].get_length();
+ if (candidateBatchSize == 0 || candidateBatchSize == DEFAULT_BATCH_SIZE) {
+ _logger.debug("Batching on the plugin is not used, batching is handled by the compiler");
+ return std::nullopt;
+ }
+
+    auto checkDescriptorsUseCandidateBatchSize = [candidateBatchSize](const std::vector<IODescriptor>& descriptors) {
+ for (const IODescriptor& descriptor : descriptors) {
+ OPENVINO_ASSERT(descriptor.shapeFromIRModel.has_value(),
+ "Missing value for the \"shapeFromIRModel\" attribute, I/O descriptor");
+
+ const ov::PartialShape& shapeFromCompiler = descriptor.shapeFromCompiler;
+ const ov::PartialShape& shapeFromIRModel = *descriptor.shapeFromIRModel;
+
+ if (shapeFromCompiler.is_dynamic() || shapeFromCompiler.rank().get_length() == 0 ||
+ *shapeFromCompiler.begin() != DEFAULT_BATCH_SIZE) {
+ return false;
+ }
+
+ if (!descriptor.isStateInput && !descriptor.isStateOutput && !descriptor.isShapeTensor) {
+ if (shapeFromIRModel.is_dynamic() || shapeFromIRModel.rank().get_length() == 0 ||
+ *shapeFromIRModel.begin() != candidateBatchSize) {
+ return false;
+ }
+ }
+ }
+
+ return true;
+ };
+
+ if (!checkDescriptorsUseCandidateBatchSize(metadata.inputs) ||
+ !checkDescriptorsUseCandidateBatchSize(metadata.outputs)) {
+ _logger.debug("Batching on the plugin is not used, batching is handled by the compiler");
+ return std::nullopt;
+ }
+
+ _logger.debug("Batching is handled by the plugin");
+
+ return candidateBatchSize;
+}
+
+const std::optional<std::size_t> IGraph::get_batch_size() const {
+ return _batch_size;
+}
+
+} // namespace intel_npu
diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp
index f819ed73711cf2..9d634656db109a 100644
--- a/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp
+++ b/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp
@@ -541,13 +541,21 @@ std::string DriverCompilerAdapter::serializeConfig(const Config& config,
content = std::regex_replace(content, std::regex(batchstr.str()), "");
}
- // NPU_DEFER_WEIGHTS_LOAD is not supported in versions < 6.2 - need to remove it
- if ((compilerVersion.major < 6) || (compilerVersion.major == 6 && compilerVersion.minor < 2)) {
+ // NPU_DEFER_WEIGHTS_LOAD is needed at runtime only
+ {
std::ostringstream batchstr;
batchstr << ov::intel_npu::defer_weights_load.name() << KEY_VALUE_SEPARATOR << VALUE_DELIMITER << "\\S+"
<< VALUE_DELIMITER;
- logger.warning(
- "NPU_DEFER_WEIGHTS_LOAD property is not suppored by this compiler version. Removing from parameters");
+ logger.info("NPU_DEFER_WEIGHTS_LOAD property is needed at runtime only. Removing from parameters");
+ content = std::regex_replace(content, std::regex(batchstr.str()), "");
+ }
+
+ // NPU_RUN_INFERENCES_SEQUENTIALLY is needed at runtime only
+ {
+ std::ostringstream batchstr;
+ batchstr << ov::intel_npu::run_inferences_sequentially.name() << KEY_VALUE_SEPARATOR << VALUE_DELIMITER
+ << "\\S+" << VALUE_DELIMITER;
+ logger.info("NPU_RUN_INFERENCES_SEQUENTIALLY property is needed at runtime only. Removing from parameters");
content = std::regex_replace(content, std::regex(batchstr.str()), "");
}
diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp
index e1f3990b835e8d..0d180f983ad3a9 100644
--- a/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp
+++ b/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp
@@ -16,7 +16,7 @@ DriverGraph::DriverGraph(const std::shared_ptr& zeGraphExt,
NetworkMetadata metadata,
const Config& config,
std::optional> blob)
- : IGraph(graphHandle, std::move(metadata), std::move(blob)),
+ : IGraph(graphHandle, std::move(metadata), config, std::move(blob)),
_zeGraphExt(zeGraphExt),
_zeroInitStruct(zeroInitStruct),
_logger("DriverGraph", config.get()) {
@@ -126,6 +126,16 @@ void DriverGraph::initialize(const Config& config) {
// _zeGraphExt->initializeGraph(). The driver will not access the original blob from this moment on, so we are
// releasing it here to avoid unnecessary memory usage.
_blobIsReleased = release_blob(config);
+
+    if (config.get<BATCH_MODE>() != ov::intel_npu::BatchMode::COMPILER) {
+ _batch_size = get_batch_size(_metadata);
+ }
+
+    if (config.get<RUN_INFERENCES_SEQUENTIALLY>()) {
+ auto number_of_command_lists = _batch_size.has_value() ? *_batch_size : 1;
+
+ _last_submitted_event.resize(number_of_command_lists);
+ }
}
bool DriverGraph::release_blob(const Config& config) {
diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp
index c99069a0a9760f..b1658e7e0582e0 100644
--- a/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp
+++ b/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp
@@ -17,7 +17,7 @@ PluginGraph::PluginGraph(const std::shared_ptr& zeGraphExt,
NetworkMetadata metadata,
std::vector blob,
const Config& config)
- : IGraph(graphHandle, std::move(metadata), std::optional>(std::move(blob))),
+ : IGraph(graphHandle, std::move(metadata), config, std::optional>(std::move(blob))),
_zeGraphExt(zeGraphExt),
_zeroInitStruct(zeroInitStruct),
_compiler(compiler),
@@ -115,6 +115,16 @@ void PluginGraph::initialize(const Config& config) {
_zeGraphExt->initializeGraph(_handle, config);
+    if (config.get<BATCH_MODE>() != ov::intel_npu::BatchMode::COMPILER) {
+ _batch_size = get_batch_size(_metadata);
+ }
+
+    if (config.get<RUN_INFERENCES_SEQUENTIALLY>()) {
+ auto number_of_command_lists = _batch_size.has_value() ? *_batch_size : 1;
+
+ _last_submitted_event.resize(number_of_command_lists);
+ }
+
_logger.debug("Graph initialize finish");
}
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index c6be2793fe6f70..b9cdad9f4879db 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -434,6 +434,7 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model,
// Finalize memory in closures and weight banks
finalize_weights_bank();
+ detach_memory();
// Print stats report when possible
{
@@ -499,6 +500,23 @@ void ov::npuw::CompiledModel::finalize_weights_bank() {
LOG_INFO("Done.");
}
+void ov::npuw::CompiledModel::detach_memory() {
+ LOG_INFO("Detaching model & weight memory...");
+ LOG_BLOCK();
+ for (size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
+ auto& comp_model_desc = m_compiled_submodels[idx];
+ auto& proto_comp_model_desc = m_compiled_submodels[comp_model_desc.replaced_by.value_or(idx)];
+ if (!proto_comp_model_desc.model || !proto_comp_model_desc.compiled_model) {
+ continue; // optimized-out OR already cleared - skip
+ }
+ if (proto_comp_model_desc.device_it + 1 == m_dev_list.end()) {
+ LOG_INFO("No fallback expected - clear the OV model for Subgraph[" << idx << "]");
+ proto_comp_model_desc.model.reset();
+ }
+ }
+ LOG_INFO("Done");
+}
+
std::string ov::npuw::CompiledModel::global_mem_device() const {
// Force globally set device if set
const std::string device_alloc = m_cfg.get<::intel_npu::NPUW_WEIGHTS_BANK_ALLOC>();
@@ -668,6 +686,10 @@ ov::SoPtr ov::npuw::CompiledModel::compile_submodel(const st
// NOTE(dm): Not sure if it is required for the NPUW plugin, but likely it is
auto& device_config = m_meta_devices[device];
+ if (ov::npuw::util::starts_with(device, "NPU") && m_cfg.get<::intel_npu::NPUW_UNFOLD_IREQS>()) {
+ device_config["NPU_RUN_INFERENCES_SEQUENTIALLY"] = "YES";
+ }
+
const auto& cache_dir = m_cfg.get<::intel_npu::NPUW_CACHE_DIR>();
if (!cache_dir.empty()) {
LOG_INFO("NPUW will try to utilize CACHE_DIR for " << submodel->get_friendly_name() << " submodel.");
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
index ece1bc78fb5bf5..8ccb1f83349e47 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
@@ -78,6 +78,7 @@ class CompiledModel : public ov::ICompiledModel {
void implement_properties();
void finalize_weights_bank();
+ void detach_memory();
std::string global_mem_device() const;
std::string funcall_mem_device(const std::size_t idx) const;
diff --git a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp
index 81521222ae6fae..133101da8b7d38 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp
@@ -19,23 +19,34 @@ namespace npuw {
namespace weights {
namespace op {
struct Const {
-    std::shared_ptr<ov::op::v0::Constant> node;
-
+    std::shared_ptr<ov::op::v0::Constant> m_node;
+ ov::element::Type m_cached_type;
+ ov::Shape m_cached_shape;
+ const void* m_cached_ptr = nullptr;
+
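+    // Cache the element type, shape and data pointer at construction time so that hash() and
+    // operator==() keep working after detach() releases the Constant node.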
+    explicit Const(std::shared_ptr<ov::op::v0::Constant> n) : m_node(n) {
+ m_cached_type = m_node->get_element_type();
+ m_cached_shape = m_node->get_shape();
+ m_cached_ptr = m_node->get_data_ptr();
+ }
std::size_t hash() const {
-        std::size_t seed = std::hash<const void*>()(node->get_data_ptr()) + 0x9e3779b9;
-        seed ^= node->get_element_type().hash() + 0x9e3779b9;
-        for (const auto& dim : node->get_shape()) {
+        std::size_t seed = std::hash<const void*>()(m_cached_ptr) + 0x9e3779b9;
+        seed ^= m_cached_type.hash() + 0x9e3779b9;
+        for (const auto& dim : m_cached_shape) {
            seed ^= std::hash<std::size_t>()(dim) + 0x9e3779b9;
}
return seed;
}
bool operator==(const Const& other) const {
- return (node->get_shape() == other.node->get_shape() &&
- node->get_element_type() == other.node->get_element_type() &&
- node->get_data_ptr() == other.node->get_data_ptr());
+ return (m_cached_type == other.m_cached_type && m_cached_shape == other.m_cached_shape &&
+ m_cached_ptr == other.m_cached_ptr);
}
ov::Tensor eval() const {
- return ov::npuw::util::tensor_from_const(node);
+ NPUW_ASSERT(m_node && "Const::eval() can only happen before detach");
+ return ov::npuw::util::tensor_from_const(m_node);
+ }
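+    // eval() is the only user of the node itself (hence the assert above); after detach() only
+    // the cached type/shape/pointer remain valid.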
+ void detach() {
+ m_node.reset();
}
};
struct Concat {
@@ -59,6 +70,11 @@ struct Concat {
}
return ov::npuw::util::concat(to_concat, axis);
}
+ void detach() {
+ for (auto&& lt : tensors) {
+ lt.detach();
+ }
+ }
};
struct Unpack {
@@ -95,6 +111,11 @@ struct Unpack {
}
return dst;
}
+ void detach() {
+ w.detach();
+ z.detach();
+ s.detach();
+ }
};
struct Permute {
LazyTensor tensor;
@@ -113,6 +134,9 @@ struct Permute {
ov::Tensor eval() const {
return ov::npuw::util::permute(tensor.eval(), axes);
}
+ void detach() {
+ tensor.detach();
+ }
};
struct Convert {
LazyTensor tensor;
@@ -130,6 +154,9 @@ struct Convert {
NPUW_ASSERT(ov::element::f16 == type);
return ov::npuw::util::to_f16(tensor.eval());
}
+ void detach() {
+ tensor.detach();
+ }
};
} // namespace op
@@ -137,16 +164,16 @@ using Transform = std::variant
overloaded(Ts...) -> overloaded<Ts...>;
-std::size_t LazyTensorImpl::get_hash() const {
- // Already calculated
- if (m_hash != 0) {
- return m_hash;
- }
-
- // Get hash
- std::size_t seed = 0;
- std::visit(overloaded{[&seed](const auto& op) {
- seed ^= op.hash();
- }},
- m_transform);
-
- return seed;
-}
-
-LazyTensorImpl::LazyTensorImpl(Transform&& t) {
- m_transform = std::move(t);
- m_hash = get_hash();
-}
+LazyTensorImpl::LazyTensorImpl(Transform&& t)
+ : m_transform(std::move(t)),
+ m_hash(std::visit(overloaded{[](const auto& op) {
+ return op.hash();
+ }},
+ m_transform)) {}
bool LazyTensorImpl::operator==(const LazyTensorImpl& other) const {
return m_hash == other.m_hash && m_transform == other.m_transform;
@@ -200,17 +213,25 @@ ov::Tensor LazyTensorImpl::eval() const {
some kind of indicator that the only difference is concat and we should look for an existing ov::Tensor.
Perhaps it should be done after model compilation and not handled here.
*/
+ return std::visit(overloaded{[](const auto& op) {
+ return op.eval();
+ }},
+ m_transform);
+}
+
+std::size_t LazyTensorImpl::get_hash() const {
+ return m_hash;
+}
- ov::Tensor result = std::visit(overloaded{[](const auto& op) {
- return op.eval();
- }},
- m_transform);
- NPUW_ASSERT(result);
- return result;
+void LazyTensorImpl::detach() {
+ std::visit(overloaded{[](auto& op) {
+ op.detach();
+ }},
+ m_transform);
}
LazyTensor::LazyTensor(const std::shared_ptr<ov::op::v0::Constant>& const_ptr)
-    : m_impl(std::make_shared<LazyTensorImpl>(op::Const{const_ptr})) {}
+    : m_impl(std::make_shared<LazyTensorImpl>(op::Const(const_ptr))) {}
LazyTensor::LazyTensor(const std::vector<LazyTensor>& to_concat, const std::size_t axis)
    : m_impl(std::make_shared<LazyTensorImpl>(op::Concat{to_concat, axis})) {}
LazyTensor::LazyTensor(const LazyTensor& cw,
@@ -233,11 +254,17 @@ LazyTensor LazyTensor::convert(const ov::element::Type& type) {
}
bool LazyTensor::operator==(const LazyTensor& other) const {
+ if (!m_impl && !other.m_impl) {
+ return true;
+ }
+ if ((!m_impl && other.m_impl) || (m_impl && !other.m_impl)) {
+ return false;
+ }
return *m_impl.get() == *other.m_impl.get();
}
bool LazyTensor::operator!=(const LazyTensor& other) const {
- return !(*m_impl.get() == *other.m_impl.get());
+ return !(*this == other);
}
ov::Tensor LazyTensor::eval() const {
@@ -254,6 +281,12 @@ std::size_t LazyTensor::get_hash() const {
return m_impl->get_hash();
}
+void LazyTensor::detach() {
+ if (m_impl) {
+ m_impl->detach();
+ }
+}
+
std::size_t LazyTensor::Hash::operator()(const LazyTensor& lt) const {
return lt.get_hash();
}
diff --git a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp
index 365d9d636872b8..840e22dcedad83 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp
@@ -39,8 +39,8 @@ class LazyTensor {
bool operator!=(const LazyTensor& other) const;
ov::Tensor eval() const;
-
std::size_t get_hash() const;
+ void detach();
private:
    std::shared_ptr<LazyTensorImpl> m_impl = nullptr;
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp
index 2b2878481f1330..3e712574606679 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp
@@ -23,7 +23,7 @@ using ov::npuw::online::detail::isOp;
Group::Group(const std::shared_ptr<ov::Node>& node,
             size_t gid,
             own::ade::NodeHandle nh,
-             const std::shared_ptr<own::ade::Graph>& g,
+             const std::weak_ptr<own::ade::Graph>& g,
             const std::weak_ptr<Snapshot>& snapshot)
: m_nh(std::move(nh)),
m_id(gid),
@@ -36,7 +36,7 @@ Group::Group(const std::shared_ptr& node,
Group::Group(size_t gid,
own::ade::NodeHandle nh,
-             const std::shared_ptr<own::ade::Graph>& g,
+             const std::weak_ptr<own::ade::Graph>& g,
             const std::weak_ptr<Snapshot>& snapshot)
: m_nh(std::move(nh)),
m_id(gid),
@@ -214,14 +214,16 @@ void Group::relinkGraph(const Group::GPtr& gptr_other) {
auto consumers = gptr_other->dstNodes();
// Remove gptr_other node from the graph. Note: also removes all it's edges
- m_graph->remove(gptr_other->getHandle());
+ auto&& graph = m_graph.lock();
+ NPUW_ASSERT(graph);
+ graph->remove(gptr_other->getHandle());
for (const auto& nh : producers) {
if (m_nh == nh) {
continue;
}
// relink the graph
- if (!m_graph->linked(nh, m_nh)) {
- m_graph->link(nh, m_nh);
+ if (!graph->linked(nh, m_nh)) {
+ graph->link(nh, m_nh);
}
}
for (const auto& nh : consumers) {
@@ -229,8 +231,8 @@ void Group::relinkGraph(const Group::GPtr& gptr_other) {
continue;
}
// relink the graph
- if (!m_graph->linked(m_nh, nh)) {
- m_graph->link(m_nh, nh);
+ if (!graph->linked(m_nh, nh)) {
+ graph->link(m_nh, nh);
}
}
}
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.hpp
index 17527033173a82..1d354542e135a8 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.hpp
@@ -33,11 +33,11 @@ class Group : public std::enable_shared_from_this {
    Group(const std::shared_ptr<ov::Node>& node,
          size_t gid,
          own::ade::NodeHandle nh,
-          const std::shared_ptr<own::ade::Graph>& g,
+          const std::weak_ptr<own::ade::Graph>& g,
          const std::weak_ptr<Snapshot>& snapshot);
Group(size_t gid,
own::ade::NodeHandle nh,
-          const std::shared_ptr<own::ade::Graph>& g,
+          const std::weak_ptr<own::ade::Graph>& g,
          const std::weak_ptr<Snapshot>& snapshot);
// After we formed a final structure of partitioning,
@@ -100,7 +100,7 @@ class Group : public std::enable_shared_from_this {
own::ade::NodeHandle m_nh;
size_t m_id; // used for utility prints only
-    std::shared_ptr<own::ade::Graph> m_graph;
+    std::weak_ptr<own::ade::Graph> m_graph;
    std::weak_ptr<Snapshot> m_snapshot;
bool m_frozen = false;
bool m_nofold = false;
diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.hpp b/src/plugins/intel_npu/src/plugin/npuw/util.hpp
index 7a942f0b6c6351..616aff53128292 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/util.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/util.hpp
@@ -141,6 +141,14 @@ Impl _(std::shared_ptr pM) {
} // namespace at
+// Written here to be a drop-in replacement for ov::parallel_for for the debug purposes
+template <typename F>
+void non_parallel_for(std::size_t count, F&& f) {
+ for (std::size_t idx = 0u; idx < count; idx++) {
+ f(idx);
+ }
+}
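+// Usage matches ov::parallel_for, e.g.:
+//   ov::npuw::util::non_parallel_for(n, [&](std::size_t idx) { /* work on idx */ });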
+
} // namespace util
} // namespace npuw
} // namespace ov
diff --git a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp
index 51cf76020d81a1..2b4be1a759c17c 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp
@@ -40,16 +40,22 @@ ov::Tensor Bank::get(const LazyTensor& tensor, const std::string& device) {
    std::lock_guard<std::mutex> guard(m_mutex);
- auto& device_bank = m_device_bank[device_for_alloc];
- auto iter_device = device_bank.find(tensor);
+ auto& device_bank = m_device_banks[device_for_alloc];
- if (iter_device != device_bank.end() && iter_device->second) {
+    std::unique_lock<std::mutex> dev_guard(device_bank.mutex);
+ auto iter_device = device_bank.storage.find(tensor);
+
+ if (iter_device != device_bank.storage.end() && iter_device->second) {
// Already allocated
+ // tensor (the key) may be coming from a 2nd (3rd, ...) model
+ // detach it here just in case
+        const_cast<LazyTensor&>(tensor).detach();
return iter_device->second;
}
+ dev_guard.unlock();
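+    // Unlock before evaluation: eval_and_alloc() re-acquires the per-device mutex only when it
+    // updates the bank, so the potentially expensive evaluation can run concurrently.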
// Allocation and evaluation needed
- return unsafe_eval_and_alloc(tensor, device_for_alloc);
+ return eval_and_alloc(tensor, device_bank, device_for_alloc);
}
void Bank::registerLT(const LazyTensor& tensor, const std::string& device) {
@@ -57,64 +63,75 @@ void Bank::registerLT(const LazyTensor& tensor, const std::string& device) {
    std::lock_guard<std::mutex> guard(m_mutex);
- auto& device_bank = m_device_bank[device_for_alloc];
- if (device_bank.find(tensor) == device_bank.end()) {
- device_bank[tensor] = ov::Tensor();
+ auto& device_bank = m_device_banks[device_for_alloc];
+ if (device_bank.storage.find(tensor) == device_bank.storage.end()) {
+ device_bank.storage[tensor] = ov::Tensor();
}
}
void Bank::evaluate_and_allocate() {
    std::lock_guard<std::mutex> guard(m_mutex);
- for (auto&& bank : m_device_bank) {
+ for (auto&& bank : m_device_banks) {
const auto& device_for_alloc = bank.first;
auto& device_bank = bank.second;
+
std::vector vec;
- for (const auto& el : device_bank) {
+ vec.reserve(device_bank.storage.size());
+ for (const auto& el : device_bank.storage) {
vec.push_back(el.first);
}
ov::parallel_for(vec.size(), [&](std::size_t idx) {
const auto& lt = vec[idx];
- auto iter_device = device_bank.find(lt);
- if (iter_device != device_bank.end() && iter_device->second) {
+            std::unique_lock<std::mutex> dev_guard(device_bank.mutex);
+ auto iter_device = device_bank.storage.find(lt);
+ if (iter_device != device_bank.storage.end() && iter_device->second) {
// Already allocated
return;
}
+ dev_guard.unlock();
// Allocation and evaluation needed
- unsafe_eval_and_alloc(lt, device_for_alloc);
+ eval_and_alloc(lt, device_bank, device_for_alloc);
});
}
}
-ov::Tensor Bank::unsafe_eval_and_alloc(const LazyTensor& tensor, const std::string& device_for_alloc) {
- // Note: private method used inside other methods with already locked mutex
+ov::Tensor Bank::eval_and_alloc(const LazyTensor& tensor,
+ Bank::DeviceBank& dbank,
+ const std::string& device_for_alloc) {
+ // Evaluate concurrently (see evaluate_and_allocate), lock the device
+ // mutex only to update the device bank (& allocate on-device memory, if needed)
const auto& transformed_tensor = tensor.eval();
+
+    std::unique_lock<std::mutex> guard(dbank.mutex);
if (device_for_alloc == "CPU") {
- m_device_bank[device_for_alloc][tensor] = transformed_tensor;
+ dbank.storage[tensor] = transformed_tensor;
return transformed_tensor;
}
+ // Non-CPU case: detach the evaluated LazyTensor from its memory
+    const_cast<LazyTensor&>(tensor).detach();
+
ov::SoPtr remote_tensor;
ov::Tensor allocated_tensor;
- {
- // FIXME: L0 allocation may crash when run in parallel
- std::lock_guard guard(m_alloc_mutex);
- m_remote_ctx = m_core->get_default_context(device_for_alloc)._ptr;
- remote_tensor =
- m_remote_ctx->create_host_tensor(transformed_tensor.get_element_type(), transformed_tensor.get_shape());
- allocated_tensor = ov::make_tensor(remote_tensor);
- }
+
+ auto remote_ctx = m_core->get_default_context(device_for_alloc)._ptr;
+ remote_tensor =
+ remote_ctx->create_host_tensor(transformed_tensor.get_element_type(), transformed_tensor.get_shape());
+ allocated_tensor = ov::make_tensor(remote_tensor);
+ dbank.storage[tensor] = allocated_tensor;
+ guard.unlock(); // Unlock the guard, map update is done - copy can continue in parallel
+
transformed_tensor.copy_to(allocated_tensor);
- m_device_bank[device_for_alloc][tensor] = allocated_tensor;
return allocated_tensor;
}
bool Bank::is_remote(const LazyTensor& tensor) const {
// FIXME: make generic
- auto npu_bank = m_device_bank.find("NPU");
- if (npu_bank != m_device_bank.end() && npu_bank->second.find(tensor) != npu_bank->second.end()) {
- // Found in NPU bank
+ auto npu_bank = m_device_banks.find("NPU");
+ if (npu_bank != m_device_banks.end() && npu_bank->second.storage.find(tensor) != npu_bank->second.storage.end()) {
+ // Found in NPU bank so considered remote (utterly wrong for the generic case)
return true;
}
return false;
diff --git a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp
index b9d8d21143c851..491e962a58b438 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp
@@ -35,13 +35,17 @@ class Bank {
bool is_remote(const LazyTensor& tensor) const;
private:
- ov::Tensor unsafe_eval_and_alloc(const LazyTensor& tensor, const std::string& device);
// Bank for specified device and their allocated memory
-    std::unordered_map<std::string, std::unordered_map<LazyTensor, ov::Tensor, LazyTensor::Hash>> m_device_bank;
+    struct DeviceBank {
+        std::unordered_map<LazyTensor, ov::Tensor, LazyTensor::Hash> storage;
+        std::mutex mutex;
+    };
+    std::unordered_map<std::string, DeviceBank> m_device_banks;
+
+ ov::Tensor eval_and_alloc(const LazyTensor& tensor, DeviceBank& dbank, const std::string& device);
+
std::mutex m_mutex;
- std::mutex m_alloc_mutex;
std::shared_ptr m_core = nullptr;
- std::shared_ptr m_remote_ctx = nullptr;
std::string m_alloc_device;
};
diff --git a/src/plugins/intel_npu/src/plugin/src/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/src/compiled_model.cpp
index 4baf15d76718a8..4e86d32d2f72b1 100644
--- a/src/plugins/intel_npu/src/plugin/src/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/src/compiled_model.cpp
@@ -311,6 +311,12 @@ void CompiledModel::initialize_properties() {
[](const Config& config) {
return config.getString();
}}},
+ {ov::intel_npu::run_inferences_sequentially.name(),
+ {false,
+ ov::PropertyMutability::RO,
+ [](const Config& config) {
+              return config.get<RUN_INFERENCES_SEQUENTIALLY>();
+ }}},
};
for (auto& property : _properties) {
diff --git a/src/plugins/intel_npu/src/plugin/src/plugin.cpp b/src/plugins/intel_npu/src/plugin/src/plugin.cpp
index 9f77d952fd813b..18a96bff02fb80 100644
--- a/src/plugins/intel_npu/src/plugin/src/plugin.cpp
+++ b/src/plugins/intel_npu/src/plugin/src/plugin.cpp
@@ -568,6 +568,12 @@ Plugin::Plugin()
[](const Config& config) {
return config.getString();
}}},
+ {ov::intel_npu::run_inferences_sequentially.name(),
+ {false,
+ ov::PropertyMutability::RW,
+ [](const Config& config) {
+              return config.get<RUN_INFERENCES_SEQUENTIALLY>();
+ }}},
{ov::intel_npu::batch_mode.name(), {false, ov::PropertyMutability::RW, [](const Config& config) {
return config.getString();
}}}};
diff --git a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_api.hpp b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_api.hpp
index 6cb9e23d203c11..1e1b50fb925916 100644
--- a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_api.hpp
+++ b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_api.hpp
@@ -4,6 +4,7 @@
#pragma once
+#include
#include
#include
@@ -57,12 +58,14 @@ namespace intel_npu {
symbol_statement(zeMemAllocDevice) \
symbol_statement(zeMemAllocHost) \
symbol_statement(zeMemFree) \
- symbol_statement(zeMemGetAllocProperties)
+ symbol_statement(zeMemGetAllocProperties) \
+ symbol_statement(zelLoaderGetVersions)
//unsupported symbols with older ze_loader versions
#define weak_symbols_list() \
symbol_statement(zeCommandListGetNextCommandIdExp) \
- symbol_statement(zeCommandListUpdateMutableCommandsExp)
+ symbol_statement(zeCommandListUpdateMutableCommandsExp) \
+ symbol_statement(zeInitDrivers)
// clang-format on
class ZeroApi {
diff --git a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_init.hpp b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_init.hpp
index 01b2de868e7572..25ceb018cdc243 100644
--- a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_init.hpp
+++ b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_init.hpp
@@ -67,6 +67,8 @@ class ZeroInitStructsHolder final {
}
private:
+ void initNpuDriver();
+
static const ze_driver_uuid_t uuid;
Logger log;
diff --git a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_utils.hpp b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_utils.hpp
index 8883bb99dd178e..0df0c5d66169a4 100644
--- a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_utils.hpp
+++ b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_utils.hpp
@@ -188,7 +188,7 @@ static inline uint32_t findGroupOrdinal(ze_device_handle_t device_handle, const
"zeDeviceGetCommandQueueGroupProperties",
zeDeviceGetCommandQueueGroupProperties(device_handle, &command_queue_group_count, nullptr));
- log.debug("ZeroDevice::ZeroDevice - resize command_queue_group_count");
+ log.debug("zero_utils::findGroupOrdinal - resize command_queue_group_count");
command_group_properties.resize(command_queue_group_count);
for (auto& prop : command_group_properties) {
diff --git a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_wrappers.hpp b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_wrappers.hpp
index 9b5b1b4540fbe7..61999376680e90 100644
--- a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_wrappers.hpp
+++ b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_wrappers.hpp
@@ -37,7 +37,7 @@ class EventPool {
class Event {
public:
Event() = delete;
- Event(const ze_event_pool_handle_t& event_pool, uint32_t event_index);
+    Event(const std::shared_ptr<EventPool>& event_pool, uint32_t event_index);
Event(const Event&) = delete;
Event(Event&&) = delete;
Event& operator=(const Event&) = delete;
@@ -51,6 +51,7 @@ class Event {
~Event();
private:
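+    // Keeps the pool alive for as long as the event exists; the event is created from the
+    // pool handle in the constructor.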
+    std::shared_ptr<EventPool> _event_pool;
ze_event_handle_t _handle = nullptr;
Logger _log;
diff --git a/src/plugins/intel_npu/src/utils/src/zero/zero_init.cpp b/src/plugins/intel_npu/src/utils/src/zero/zero_init.cpp
index e87f8db788b9b8..b069bd64244142 100644
--- a/src/plugins/intel_npu/src/utils/src/zero/zero_init.cpp
+++ b/src/plugins/intel_npu/src/utils/src/zero/zero_init.cpp
@@ -4,6 +4,7 @@
#include "intel_npu/utils/zero/zero_init.hpp"
+#include
#include
#include
@@ -53,30 +54,93 @@ static std::tuple queryDriverExtensionVersion(
return std::make_tuple(targetVersion, functionExtName ? functionExtName : "");
}
-ZeroInitStructsHolder::ZeroInitStructsHolder() : log("NPUZeroInitStructsHolder", Logger::global().level()) {
- log.debug("ZeroInitStructsHolder - performing zeInit on VPU only");
- THROW_ON_FAIL_FOR_LEVELZERO("zeInit", zeInit(ZE_INIT_FLAG_VPU_ONLY));
-
- uint32_t drivers = 0;
- THROW_ON_FAIL_FOR_LEVELZERO("zeDriverGet", zeDriverGet(&drivers, nullptr));
+void ZeroInitStructsHolder::initNpuDriver() {
+    auto setNpuDriver = [&](uint32_t drivers_count, std::vector<ze_driver_handle_t> all_drivers) {
+ driver_properties.stype = ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES;
+ log.debug("ZeroInitStructsHolder::initNpuDriver - setting driver properties to "
+ "ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES");
+ for (uint32_t i = 0; i < drivers_count; ++i) {
+ zeDriverGetProperties(all_drivers[i], &driver_properties);
+
+ if (memcmp(&driver_properties.uuid, &uuid, sizeof(uuid)) == 0) {
+ driver_handle = all_drivers[i];
+ break;
+ }
+ }
+ if (driver_handle == nullptr) {
+ OPENVINO_THROW("NPU driver wasn't found!");
+ }
+ };
+
+ auto fallbackToZeDriverGet = [&]() {
+ log.debug("ZeroInitStructsHolder - zeInitDrivers not supported, fallback to zeDriverGet");
+
+ uint32_t drivers_count = 0;
+ THROW_ON_FAIL_FOR_LEVELZERO("zeDriverGet", zeDriverGet(&drivers_count, nullptr));
+
+        std::vector<ze_driver_handle_t> all_drivers(drivers_count);
+ THROW_ON_FAIL_FOR_LEVELZERO("zeDriverGet", zeDriverGet(&drivers_count, all_drivers.data()));
+
+ // Get our target driver
+ setNpuDriver(drivers_count, std::move(all_drivers));
+ };
+
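+    // Query the ze_loader version first: zeInitDrivers is only available starting with loader
+    // 1.18.5, so older loaders take the legacy zeDriverGet fallback above.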
+ zel_version_t loader_version = {};
+    size_t num_components = 0;
+    auto result = zelLoaderGetVersions(&num_components, nullptr);
+    if (result == ZE_RESULT_SUCCESS) {
+        std::vector<zel_component_version_t> versions(num_components);
+        result = zelLoaderGetVersions(&num_components, versions.data());
+
+ if (result == ZE_RESULT_SUCCESS) {
+ for (size_t i = 0; i < num_components; ++i) {
+ if (strncmp(versions[i].component_name, "loader", strlen("loader")) == 0) {
+ loader_version = versions[i].component_lib_version;
+
+ log.debug("ZeroInitStructsHolder - ze_loader.dll version: %d.%d.%d",
+ loader_version.major,
+ loader_version.minor,
+ loader_version.patch);
+ }
+ }
+ }
-    std::vector<ze_driver_handle_t> all_drivers(drivers);
- THROW_ON_FAIL_FOR_LEVELZERO("zeDriverGet", zeDriverGet(&drivers, all_drivers.data()));
+ }
- // Get our target driver
- driver_properties.stype = ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES;
- log.debug("ZeroInitStructsHolder - setting driver properties to ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES");
- for (uint32_t i = 0; i < drivers; ++i) {
- zeDriverGetProperties(all_drivers[i], &driver_properties);
+ if (loader_version.major > 1 || (loader_version.major == 1 && loader_version.minor > 18) ||
+ (loader_version.major == 1 && loader_version.minor == 18 && loader_version.patch >= 5)) {
+ uint32_t drivers_count = 0;
+ ze_init_driver_type_desc_t desc = {};
+ desc.flags = ZE_INIT_DRIVER_TYPE_FLAG_NPU;
+ auto result = zeInitDrivers(&drivers_count, nullptr, &desc);
+ if (result != ZE_RESULT_SUCCESS) {
+ fallbackToZeDriverGet();
+ return;
+ }
- if (memcmp(&driver_properties.uuid, &uuid, sizeof(uuid)) == 0) {
- driver_handle = all_drivers[i];
- break;
+        std::vector<ze_driver_handle_t> all_drivers(drivers_count);
+ result = zeInitDrivers(&drivers_count, all_drivers.data(), &desc);
+ if (result != ZE_RESULT_SUCCESS) {
+ fallbackToZeDriverGet();
+ return;
}
+
+ // Get our target driver
+ setNpuDriver(drivers_count, std::move(all_drivers));
+
+ return;
}
- if (driver_handle == nullptr) {
- OPENVINO_THROW("zeDriverGet failed to return NPU driver");
- }
+
+ fallbackToZeDriverGet();
+}
+
+ZeroInitStructsHolder::ZeroInitStructsHolder() : log("NPUZeroInitStructsHolder", Logger::global().level()) {
+ log.debug("ZeroInitStructsHolder - performing zeInit on NPU only");
+ THROW_ON_FAIL_FOR_LEVELZERO("zeInit", zeInit(ZE_INIT_FLAG_VPU_ONLY));
+
+ log.debug("ZeroInitStructsHolder - initialize NPU Driver");
+ initNpuDriver();
// Check L0 API version
THROW_ON_FAIL_FOR_LEVELZERO("zeDriverGetApiVersion", zeDriverGetApiVersion(driver_handle, &ze_drv_api_version));
diff --git a/src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp b/src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp
index 858e65d4b5e6ee..d95b0e172a7d64 100644
--- a/src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp
+++ b/src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp
@@ -24,9 +24,11 @@ EventPool::~EventPool() {
}
}
-Event::Event(const ze_event_pool_handle_t& event_pool, uint32_t event_index) : _log("Event", Logger::global().level()) {
+Event::Event(const std::shared_ptr<EventPool>& event_pool, uint32_t event_index)
+ : _event_pool(event_pool),
+ _log("Event", Logger::global().level()) {
ze_event_desc_t event_desc = {ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr, event_index, 0, 0};
- THROW_ON_FAIL_FOR_LEVELZERO("zeEventCreate", zeEventCreate(event_pool, &event_desc, &_handle));
+ THROW_ON_FAIL_FOR_LEVELZERO("zeEventCreate", zeEventCreate(_event_pool->handle(), &event_desc, &_handle));
}
void Event::AppendSignalEvent(CommandList& command_list) const {
THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListAppendSignalEvent",
diff --git a/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.cpp b/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.cpp
index 5d023fe9d0bee6..e4a49ce9b7ccdb 100644
--- a/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.cpp
+++ b/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.cpp
@@ -19,6 +19,12 @@ INSTANTIATE_TEST_SUITE_P(compatibility_smoke_BehaviorTest,
::testing::ValuesIn(configsInferRequestRunTests)),
InferRequestRunTests::getTestCaseName);
+INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTest,
+ RunSeqTests,
+ ::testing::Combine(::testing::Values(ov::test::utils::DEVICE_NPU),
+ ::testing::ValuesIn(configsInferRequestRunTests)),
+ InferRequestRunTests::getTestCaseName);
+
const std::vector batchingConfigs = {
{ov::log::level(ov::log::Level::WARNING), ov::intel_npu::batch_mode(ov::intel_npu::BatchMode::PLUGIN)},
{ov::log::level(ov::log::Level::WARNING), ov::intel_npu::batch_mode(ov::intel_npu::BatchMode::COMPILER)},
@@ -29,3 +35,9 @@ INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTest,
::testing::Combine(::testing::Values(ov::test::utils::DEVICE_NPU),
::testing::ValuesIn(batchingConfigs)),
InferRequestRunTests::getTestCaseName);
+
+INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTest,
+ BatchingRunSeqTests,
+ ::testing::Combine(::testing::Values(ov::test::utils::DEVICE_NPU),
+ ::testing::ValuesIn(batchingConfigs)),
+ InferRequestRunTests::getTestCaseName);
diff --git a/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp b/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp
index 20be5ed25edd27..07466677b9d547 100644
--- a/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp
+++ b/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp
@@ -103,9 +103,7 @@ class InferRequestRunTests : public ov::test::behavior::OVPluginTestBase,
APIBaseTest::TearDown();
}
-    std::shared_ptr<ov::Model> createBatchingModel(element::Type type,
- const PartialShape& shape,
- const ov::Layout& layout) {
+    std::shared_ptr<ov::Model> createModel(element::Type type, const PartialShape& shape, const ov::Layout& layout) {
ResultVector res;
ParameterVector params;
@@ -352,7 +350,7 @@ TEST_P(BatchingRunTests, CheckBatchingSupportInfer) {
ov::InferRequest inference_request;
auto batch_shape = Shape{4, 2, 32, 32};
-    std::shared_ptr<ov::Model> ov_model_batch = createBatchingModel(element::f32, batch_shape, "N...");
+    std::shared_ptr<ov::Model> ov_model_batch = createModel(element::f32, batch_shape, "N...");
OV_ASSERT_NO_THROW(compiled_model = core->compile_model(ov_model_batch, target_device, configuration));
OV_ASSERT_NO_THROW(inference_request = compiled_model.create_infer_request());
@@ -365,7 +363,7 @@ TEST_P(BatchingRunTests, CheckBatchingSupportAsync) {
ov::InferRequest inference_request;
auto batch_shape = Shape{4, 2, 32, 32};
-    std::shared_ptr<ov::Model> ov_model_batch = createBatchingModel(element::f32, batch_shape, "N...");
+    std::shared_ptr<ov::Model> ov_model_batch = createModel(element::f32, batch_shape, "N...");
OV_ASSERT_NO_THROW(compiled_model = core->compile_model(ov_model_batch, target_device, configuration));
OV_ASSERT_NO_THROW(inference_request = compiled_model.create_infer_request());
@@ -396,7 +394,7 @@ TEST_P(BatchingRunTests, UseCompilerBatchingErrorPluginBatching) {
TEST_P(BatchingRunTests, SetInputTensorInfer) {
auto batch_shape = Shape{4, 2, 2, 2};
auto shape_size = ov::shape_size(batch_shape);
- auto model = createBatchingModel(element::f32, batch_shape, "N...");
+ auto model = createModel(element::f32, batch_shape, "N...");
float* buffer = new float[shape_size];
compiled_model = core->compile_model(model, target_device, configuration);
@@ -422,7 +420,7 @@ TEST_P(BatchingRunTests, SetInputTensorInfer) {
TEST_P(BatchingRunTests, SetInputTensorAsync) {
auto batch_shape = Shape{4, 2, 2, 2};
auto shape_size = ov::shape_size(batch_shape);
- auto model = createBatchingModel(element::f32, batch_shape, "N...");
+ auto model = createModel(element::f32, batch_shape, "N...");
float* buffer = new float[shape_size];
compiled_model = core->compile_model(model, target_device, configuration);
@@ -449,7 +447,7 @@ TEST_P(BatchingRunTests, SetInputTensorAsync) {
TEST_P(BatchingRunTests, SetInputTensorInfer_Caching) {
auto batch_shape = Shape{4, 2, 2, 2};
auto shape_size = ov::shape_size(batch_shape);
- auto model = createBatchingModel(element::f32, batch_shape, "N...");
+ auto model = createModel(element::f32, batch_shape, "N...");
float* buffer = new float[shape_size];
m_cache_dir = generateCacheDirName(GetTestName());
@@ -480,7 +478,7 @@ TEST_P(BatchingRunTests, SetInputTensorInfer_Caching) {
TEST_P(BatchingRunTests, CheckTwoRunsInfer) {
auto batch_shape = Shape{4, 2, 2, 2};
auto shape_size = ov::shape_size(batch_shape);
- auto model = createBatchingModel(element::f32, batch_shape, "N...");
+ auto model = createModel(element::f32, batch_shape, "N...");
float* buffer = new float[shape_size];
auto context = core->get_default_context(target_device);
@@ -524,6 +522,250 @@ TEST_P(BatchingRunTests, CheckTwoRunsInfer) {
delete[] buffer;
}
+using RunSeqTests = InferRequestRunTests;
+
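+// The tests below exercise RUN_INFERENCES_SEQUENTIALLY: a chain of infer requests sharing
+// input/output tensors must produce results as if executed strictly in submission order.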
+TEST_P(RunSeqTests, CheckMultipleRunsSeq0) {
+ auto shape = Shape{1, 64, 64, 256};
+ auto shape_size = ov::shape_size(shape);
+ auto model = createModel(element::f32, shape, "N...");
+
+ auto context = core->get_default_context(target_device);
+
+ configuration[ov::intel_npu::run_inferences_sequentially.name()] = true;
+ configuration[ov::intel_npu::tiles.name()] = 2;
+ compiled_model = core->compile_model(model, target_device, configuration);
+
+ const uint32_t inferences = 32;
+    std::array<ov::InferRequest, inferences> inference_request;
+    ov::Tensor input_tensor;
+    std::array<ov::Tensor, inferences> output_tensor;
+
+ input_tensor = context.create_host_tensor(ov::element::f32, shape);
+ for (uint32_t i = 0; i < inferences; i++) {
+ inference_request[i] = compiled_model.create_infer_request();
+ output_tensor[i] = context.create_host_tensor(ov::element::f32, shape);
+ }
+
+ inference_request[0].set_input_tensor(input_tensor);
+ inference_request[0].set_output_tensor(output_tensor[0]);
+
+ const uint32_t runs = 10;
+ for (uint32_t z = 0; z < runs; z++) {
+        auto* input_data = reinterpret_cast<float*>(input_tensor.data());
+ for (size_t i = 0; i < shape_size; ++i) {
+ input_data[i] = static_cast(z);
+ }
+
+ inference_request[0].start_async(); // Adds '1' to each element
+
+ for (uint32_t i = 1; i < inferences; i++) {
+ inference_request[i].set_input_tensor(output_tensor[i - 1]);
+ inference_request[i].set_output_tensor(output_tensor[i]);
+
+ inference_request[i].start_async(); // Adds '1' to each element
+ }
+
+ inference_request[inferences - 1].wait();
+
+ float expected_result = static_cast(z) + 1.f;
+
+ for (uint32_t i = 0; i < inferences; i++) {
+            auto* output_tensor_data = reinterpret_cast<float*>(output_tensor[i].data());
+            for (size_t j = 0; j < shape_size; ++j) {
+                EXPECT_NEAR(output_tensor_data[j], expected_result, 1e-5)
+                    << "Run=" << z << " Output=" << i << " Expected=" << expected_result
+ << ", actual=" << output_tensor_data[j] << " for index " << j;
+ }
+ expected_result++;
+ }
+ }
+}
+
+TEST_P(RunSeqTests, CheckMultipleRunsSeq1) {
+ auto shape = Shape{1, 64, 64, 256};
+ auto shape_size = ov::shape_size(shape);
+ auto model = createModel(element::f32, shape, "N...");
+
+ auto context = core->get_default_context(target_device);
+
+ configuration[ov::intel_npu::run_inferences_sequentially.name()] = true;
+ configuration[ov::intel_npu::tiles.name()] = 2;
+ compiled_model = core->compile_model(model, target_device, configuration);
+
+ const int inferences = 32;
+    std::array<ov::InferRequest, inferences> inference_request;
+    ov::Tensor input_tensor;
+    std::array<ov::Tensor, inferences> output_tensor;
+
+ input_tensor = context.create_host_tensor(ov::element::f32, shape);
+
+ for (int i = 0; i < inferences; i++) {
+ inference_request[i] = compiled_model.create_infer_request();
+ output_tensor[i] = context.create_host_tensor(ov::element::f32, shape);
+ }
+
+ inference_request[inferences - 1].set_input_tensor(input_tensor);
+ inference_request[inferences - 1].set_output_tensor(output_tensor[inferences - 1]);
+
+ const int runs = 10;
+ for (int z = 0; z < runs; z++) {
+        auto* input_data = reinterpret_cast<float*>(input_tensor.data());
+ for (size_t i = 0; i < shape_size; ++i) {
+ input_data[i] = static_cast(z);
+ }
+
+ inference_request[inferences - 1].start_async(); // Adds '1' to each element
+
+ for (int i = inferences - 2; i >= 0; i--) {
+ inference_request[i].set_input_tensor(output_tensor[i + 1]);
+ inference_request[i].set_output_tensor(output_tensor[i]);
+
+ inference_request[i].start_async(); // Adds '1' to each element
+ }
+
+ inference_request[0].wait();
+
+ float expected_result = static_cast(z) + 1.f;
+
+ for (int i = inferences - 1; i >= 0; i--) {
+            auto* output_tensor_data = reinterpret_cast<float*>(output_tensor[i].data());
+            for (size_t j = 0; j < shape_size; ++j) {
+                EXPECT_NEAR(output_tensor_data[j], expected_result, 1e-5)
+                    << "Run=" << z << " Output=" << i << " Expected=" << expected_result
+ << ", actual=" << output_tensor_data[j] << " for index " << j;
+ }
+ expected_result++;
+ }
+ }
+}
+
+TEST_P(RunSeqTests, CheckMultipleRunsSeq2) {
+ auto shape = Shape{1, 64, 64, 256};
+ auto shape_size = ov::shape_size(shape);
+ auto model = createModel(element::f32, shape, "N...");
+
+ auto context = core->get_default_context(target_device);
+
+ configuration[ov::intel_npu::run_inferences_sequentially.name()] = true;
+ configuration[ov::intel_npu::tiles.name()] = 2;
+ compiled_model = core->compile_model(model, target_device, configuration);
+
+ const int inferences = 32;
+    std::array<ov::InferRequest, inferences> inference_request;
+    ov::Tensor input_tensor;
+    std::array<ov::Tensor, inferences> output_tensor;
+
+ input_tensor = context.create_host_tensor(ov::element::f32, shape);
+
+ for (int i = 0; i < inferences; i++) {
+ inference_request[i] = compiled_model.create_infer_request();
+ output_tensor[i] = context.create_host_tensor(ov::element::f32, shape);
+ }
+
+ inference_request[inferences - 1].set_input_tensor(input_tensor);
+ inference_request[inferences - 1].set_output_tensor(output_tensor[inferences - 1]);
+
+    auto* input_data = reinterpret_cast<float*>(input_tensor.data());
+ for (size_t i = 0; i < shape_size; ++i) {
+ input_data[i] = 1.f;
+ }
+
+ inference_request[inferences - 1].start_async();
+
+ for (int i = inferences - 2; i >= 0; i--) {
+ inference_request[i].set_input_tensor(output_tensor[i + 1]);
+ inference_request[i].set_output_tensor(output_tensor[i]);
+
+ inference_request[i].start_async();
+ }
+
+ inference_request[0].wait();
+
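+    // Re-submitting a request out of the original submission order is expected to be rejected
+    // once the sequential chain has completed.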
+ try {
+ inference_request[5].start_async();
+ inference_request[5].wait();
+ } catch (const std::exception& ex) {
+        SUCCEED() << ex.what();
+ return;
+ }
+
+    FAIL() << "Exception is expected but it didn't throw any exception!";
+}
+
+TEST_P(RunSeqTests, CheckMultipleRunsSeq3) {
+ auto shape = Shape{1, 64, 64, 256};
+ auto model = createModel(element::f32, shape, "N...");
+
+ configuration[ov::intel_npu::run_inferences_sequentially.name()] = true;
+ configuration[ov::intel_npu::tiles.name()] = 2;
+ compiled_model = core->compile_model(model, target_device, configuration);
+ ov::InferRequest inference_request;
+ inference_request = compiled_model.create_infer_request();
+
+ OV_EXPECT_THROW(inference_request.infer(),
+ ov::Exception,
+ HasSubstr("Only start async is supported when RUN_INFERENCES_SEQUENTIALLY is enabled!"));
+}
+
+using BatchingRunSeqTests = InferRequestRunTests;
+
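+// Same sequential-execution check as above, this time with a batched model shape.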
+TEST_P(BatchingRunSeqTests, CheckMultipleBatchingRunsSeq) {
+ auto shape = Shape{4, 2, 64, 64};
+ auto shape_size = ov::shape_size(shape);
+ auto model = createModel(element::f32, shape, "N...");
+
+ auto context = core->get_default_context(target_device);
+
+ configuration[ov::intel_npu::run_inferences_sequentially.name()] = true;
+ configuration[ov::intel_npu::tiles.name()] = 2;
+ compiled_model = core->compile_model(model, target_device, configuration);
+
+ const uint32_t inferences = 32;
+    std::array<ov::InferRequest, inferences> inference_request;
+    ov::Tensor input_tensor;
+    std::array<ov::Tensor, inferences> output_tensor;
+
+ input_tensor = context.create_host_tensor(ov::element::f32, shape);
+ for (uint32_t i = 0; i < inferences; i++) {
+ inference_request[i] = compiled_model.create_infer_request();
+ output_tensor[i] = context.create_host_tensor(ov::element::f32, shape);
+ }
+
+ inference_request[0].set_input_tensor(input_tensor);
+ inference_request[0].set_output_tensor(output_tensor[0]);
+
+ const uint32_t runs = 10;
+ for (uint32_t z = 0; z < runs; z++) {
+        auto* input_data = reinterpret_cast<float*>(input_tensor.data());
+ for (size_t i = 0; i < shape_size; ++i) {
+ input_data[i] = static_cast(z);
+ }
+
+ inference_request[0].start_async(); // Adds '1' to each element
+
+ for (uint32_t i = 1; i < inferences; i++) {
+ inference_request[i].set_input_tensor(output_tensor[i - 1]);
+ inference_request[i].set_output_tensor(output_tensor[i]);
+
+ inference_request[i].start_async(); // Adds '1' to each element
+ }
+
+ inference_request[inferences - 1].wait();
+
+ float expected_result = static_cast(z) + 1.f;
+
+ for (uint32_t i = 0; i < inferences; i++) {
+            auto* output_tensor_data = reinterpret_cast<float*>(output_tensor[i].data());
+            for (size_t j = 0; j < shape_size; ++j) {
+                EXPECT_NEAR(output_tensor_data[j], expected_result, 1e-5)
+                    << "Run=" << z << " Output=" << i << " Expected=" << expected_result
+ << ", actual=" << output_tensor_data[j] << " for index " << j;
+ }
+ expected_result++;
+ }
+ }
+}
+
} // namespace behavior
} // namespace test
} // namespace ov