diff --git a/docs/articles_en/about-openvino/performance-benchmarks.rst b/docs/articles_en/about-openvino/performance-benchmarks.rst index 8a58dc27df1f83..78a364c18ca4e6 100644 --- a/docs/articles_en/about-openvino/performance-benchmarks.rst +++ b/docs/articles_en/about-openvino/performance-benchmarks.rst @@ -64,7 +64,7 @@ implemented in your solutions. Click the buttons below to see the chosen benchma :outline: :expand: - :material-regular:`bar_chart;1.4em` OVMS for GenAI (coming soon) + :material-regular:`bar_chart;1.4em` OVMS for GenAI diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst index ebd4667d544616..f18b66915fc3ce 100644 --- a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst +++ b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst @@ -218,6 +218,114 @@ Specify generation_config to use grouped beam search: cout << pipe.generate("The Sun is yellow because", config); } +Efficient Text Generation via Speculative Decoding +################################################## + +Speculative decoding (or assisted-generation) enables faster token generation +when an additional smaller draft model is used alongside the main model. +The draft model predicts the next K tokens one by one in an autoregressive manner, +while the main model validates these predictions and corrects them if necessary. + +Each predicted token is compared, and when there is a difference between the draft and +main model, the last token predicted by the main model is kept. Then, the draft +model acquires this token and tries prediction of the next K tokens, +thus repeating the cycle. + +This method eliminates the need for multiple infer requests to the main model, +which results in increased performance. Its implementation in the pipeline is +shown in the code samples below: + +.. tab-set:: + + .. tab-item:: Python + :sync: py + + .. code-block:: python + + import openvino_genai + import queue + import threading + + def streamer(subword): + print(subword, end='', flush=True) + return False + + def infer(model_dir: str, draft_model_dir: str, prompt: str): + main_device = 'CPU' # GPU can be used as well. + draft_device = 'CPU' + + scheduler_config = openvino_genai.SchedulerConfig() + scheduler_config.cache_size = 2 + + draft_model = openvino_genai.draft_model(draft_model_dir, draft_device) + + pipe = openvino_genai.LLMPipeline(model_dir, main_device, scheduler_config=scheduler_config, draft_model=draft_model) + + config = openvino_genai.GenerationConfig() + config.max_new_tokens = 100 + config.num_assistant_tokens = 5 + + pipe.generate(prompt, config, streamer) + + + For more information, refer to the + `Python sample `__. + + + .. tab-item:: C++ + :sync: cpp + + .. 
code-block:: cpp + + #include + + #include "openvino/genai/llm_pipeline.hpp" + + int main(int argc, char* argv[]) try { + if (4 != argc) { + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " ''"); + } + + ov::genai::GenerationConfig config; + config.max_new_tokens = 100; + config.num_assistant_tokens = 5; + + std::string main_model_path = argv[1]; + std::string draft_model_path = argv[2]; + std::string prompt = argv[3]; + + std::string main_device = "CPU", draft_device = "CPU"; + + ov::genai::SchedulerConfig scheduler_config; + scheduler_config.cache_size = 5; + + ov::genai::LLMPipeline pipe( + main_model_path, + main_device, + ov::genai::draft_model(draft_model_path, draft_device), + ov::genai::scheduler_config(scheduler_config)); + + auto streamer = [](std::string subword) { + std::cout << subword << std::flush; + return false; + }; + + pipe.generate(prompt, config, streamer); + } catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } + + + For more information, refer to the + `C++ sample `__ Comparing with Hugging Face Results ####################################### diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst index 6033bd8ed96106..245a2648aab491 100644 --- a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst +++ b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst @@ -118,7 +118,7 @@ sample shows basic usage of the ``Text2ImagePipeline`` pipeline. image_write("baseline.bmp", image) For more information, refer to the - `Python sample `__ + `Python sample `__ .. tab-item:: C++ :sync: cpp @@ -218,7 +218,7 @@ sample shows basic usage of the ``Text2ImagePipeline`` pipeline. For more information, refer to the - `C++ sample `__ + `C++ sample `__ @@ -269,7 +269,7 @@ and use audio files in WAV format at a sampling rate of 16 kHz as input. For more information, refer to the - `Python sample `__. + `Python sample `__. .. tab-item:: C++ :sync: cpp @@ -322,7 +322,7 @@ and use audio files in WAV format at a sampling rate of 16 kHz as input. For more information, refer to the - `C++ sample `__. + `C++ sample `__. Using GenAI in Chat Scenario @@ -367,7 +367,7 @@ mark a conversation session, as shown in the samples below: For more information, refer to the - `Python sample `__. + `Python sample `__. .. tab-item:: C++ :sync: cpp @@ -415,7 +415,142 @@ mark a conversation session, as shown in the samples below: For more information, refer to the - `C++ sample `__ + `C++ sample `__ + + +Using GenAI with Vision Language Models +####################################### + +OpenVINO GenAI introduces the ``openvino_genai.VLMPipeline`` pipeline for +inference of multimodal text-generation Vision Language Models (VLMs). +With a text prompt and an image as input, VLMPipeline can generate text using +models such as LLava or MiniCPM-V. See the chat scenario presented +in the samples below: + +.. tab-set:: + + .. tab-item:: Python + :sync: py + + .. 
code-block:: python + + import numpy as np + import openvino_genai + from PIL import Image + from openvino import Tensor + from pathlib import Path + + + def streamer(subword: str) -> bool: + print(subword, end='', flush=True) + + + def read_image(path: str) -> Tensor: + pic = Image.open(path).convert("RGB") + image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.uint8) + return Tensor(image_data) + + + def read_images(path: str) -> list[Tensor]: + entry = Path(path) + if entry.is_dir(): + return [read_image(str(file)) for file in sorted(entry.iterdir())] + return [read_image(path)] + + + def infer(model_dir: str, image_dir: str): + rgbs = read_images(image_dir) + device = 'CPU' # GPU can be used as well. + enable_compile_cache = dict() + if "GPU" == device: + enable_compile_cache["CACHE_DIR"] = "vlm_cache" + pipe = openvino_genai.VLMPipeline(model_dir, device, **enable_compile_cache) + + config = openvino_genai.GenerationConfig() + config.max_new_tokens = 100 + + pipe.start_chat() + prompt = input('question:\n') + pipe.generate(prompt, images=rgbs, generation_config=config, streamer=streamer) + + while True: + try: + prompt = input("\n----------\n" + "question:\n") + except EOFError: + break + pipe.generate(prompt, generation_config=config, streamer=streamer) + pipe.finish_chat() + + + For more information, refer to the + `Python sample `__. + + .. tab-item:: C++ + :sync: cpp + + .. code-block:: cpp + + #include "load_image.hpp" + #include + #include + + bool print_subword(std::string&& subword) { + return !(std::cout << subword << std::flush); + } + + int main(int argc, char* argv[]) try { + if (3 != argc) { + throw std::runtime_error(std::string{"Usage "} + argv[0] + " "); + } + + std::vector rgbs = utils::load_images(argv[2]); + + std::string device = "CPU"; // GPU can be used as well. + ov::AnyMap enable_compile_cache; + if ("GPU" == device) { + enable_compile_cache.insert({ov::cache_dir("vlm_cache")}); + } + ov::genai::VLMPipeline pipe(argv[1], device, enable_compile_cache); + + ov::genai::GenerationConfig generation_config; + generation_config.max_new_tokens = 100; + + std::string prompt; + + pipe.start_chat(); + std::cout << "question:\n"; + + std::getline(std::cin, prompt); + pipe.generate(prompt, + ov::genai::images(rgbs), + ov::genai::generation_config(generation_config), + ov::genai::streamer(print_subword)); + std::cout << "\n----------\n" + "question:\n"; + while (std::getline(std::cin, prompt)) { + pipe.generate(prompt, + ov::genai::generation_config(generation_config), + ov::genai::streamer(print_subword)); + std::cout << "\n----------\n" + "question:\n"; + } + pipe.finish_chat(); + } catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } catch (...) 
{ + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } + + + For more information, refer to the + `C++ sample `__ Additional Resources ##################### @@ -423,4 +558,6 @@ Additional Resources * :doc:`Install OpenVINO GenAI <../../../get-started/install-openvino/install-openvino-genai>` * `OpenVINO GenAI Repo `__ * `OpenVINO GenAI Samples `__ +* A Jupyter notebook demonstrating + `Visual-language assistant with MiniCPM-V2 and OpenVINO `__ * `OpenVINO Tokenizers `__ diff --git a/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ovms-genai.json b/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ovms-genai.json index f96fb11e6b029d..0d53c3813542d2 100644 --- a/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ovms-genai.json +++ b/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ovms-genai.json @@ -1,45 +1,330 @@ [ + { + "Platform": "Intel® Xeon® Platinum 8380", + "Model": "meta-llama/Llama-2-7b-chat-hf", + "featured_SKU": false, + "whats_new_model": false, + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "OpenVINO Model Server": { + "Precisions": [ + { + "Throughput": { + "0.2": 94.97, + "0.4": 187.12, + "0.6": 271.85, + "0.8": 290.81, + "1.0": 291.39, + "2.0": 291.45, + "inf": 291.59 + }, + "Latency": { + "0.2": 74.35, + "0.4": 122.25, + "0.6": 467.49, + "0.8": 749.39, + "1.0": 771.39, + "2.0": 773.31, + "inf": 783.63 + } + } + ] + }, + "vLLM with OpenVINO backend": { + "Precisions": [ + { + "Throughput": { + "0.2": 94.83, + "0.4": 187.83, + "0.6": 272.32, + "0.8": 284.07, + "1.0": 291.88, + "2.0": 291.91, + "inf": 288.62 + }, + "Latency": { + "0.2": 82.31, + "0.4": 134.38, + "0.6": 495.99, + "0.8": 794.41, + "1.0": 798.39, + "2.0": 800.33, + "inf": 809.56 + } + } + ] + } + } + }, + { + "Platform": "Intel® Xeon® Platinum 8480+", + "Model": "meta-llama/Llama-2-7b-chat-hf", + "featured_SKU": true, + "whats_new_model": false, + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "OpenVINO Model Server": { + "Precisions": [ + { + "Throughput": { + "0.2": 95.15, + "0.4": 188.31, + "0.6": 279.3, + "0.8": 366.78, + "1.0": 454.27, + "2.0": 788.9, + "inf": 825.97 + }, + "Latency": { + "0.2": 60.88, + "0.4": 71.96, + "0.6": 83.45, + "0.8": 103.77, + "1.0": 128.12, + "2.0": 237.62, + "inf": 253.59 + } + } + ] + }, + "vLLM with OpenVINO backend": { + "Precisions": [ + { + "Throughput": { + "0.2": 95.06, + "0.4": 188.47, + "0.6": 280.54, + "0.8": 367.47, + "1.0": 450.81, + "2.0": 774.57, + "inf": 793.78 + }, + "Latency": { + "0.2": 63.84, + "0.4": 76.22, + "0.6": 87.21, + "0.8": 104.75, + "1.0": 136.77, + "2.0": 259.2, + "inf": 273.58 + } + } + ] + } + } + }, { "Platform": "Intel® Xeon® Platinum 8580", - "Model": "mistralai/Mistral-7B-v0.1", - "PlatformType": "None", + "Model": "meta-llama/Llama-2-7b-chat-hf", + "featured_SKU": true, + "whats_new_model": false, + "PlatformType": "Server Platforms (Intel® Xeon®)", "Parameters": { - "Vllm": { + "OpenVINO Model Server": { "Precisions": [ { "Throughput": { - "0.2": "350.06", - "0.6": "486.89", - "0.8": "575.92", - "2.0": "778.07" + "0.2": 95.29, + "0.4": 188.33, + "0.6": 280.09, + "0.8": 367.29, + "1.0": 453.21, + "2.0": 780.05, + "inf": 751.34 + }, + "Latency": { + "0.2": 52.44, + "0.4": 70.06, + "0.6": 84.54, + "0.8": 108.91, + "1.0": 136.45, + "2.0": 253.55, + "inf": 281.85 } - }, + } + ] + }, + "vLLM with OpenVINO backend": { + "Precisions": [ + { + "Throughput": { + "0.2": 95.0, + "0.4": 
188.26, + "0.6": 279.78, + "0.8": 366.69, + "1.0": 450.26, + "2.0": 770.74, + "inf": 794.39 + }, + "Latency": { + "0.2": 58.07, + "0.4": 77.65, + "0.6": 91.14, + "0.8": 113.61, + "1.0": 144.21, + "2.0": 269.13, + "inf": 273.27 + } + } + ] + } + } + }, + { + "Platform": "Intel® Xeon® Platinum 8380", + "Model": "meta-llama/Meta-Llama-3-8B-Instruct", + "featured_SKU": false, + "whats_new_model": true, + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "OpenVINO Model Server": { + "Precisions": [ { + "Throughput": { + "0.2": 82.46, + "0.4": 162.73, + "0.6": 240.08, + "0.8": 273.75, + "1.0": 275.85, + "2.0": 276.3, + "inf": 275.15 + }, "Latency": { - "0.2": "60.93", - "0.6": "91.63", - "0.8": "113.61", - "2.0": "240.25" + "0.2": 76.49, + "0.4": 122.1, + "0.6": 318.14, + "0.8": 785.8, + "1.0": 805.58, + "2.0": 809.37, + "inf": 816.2 } } ] }, - "Ovms": { + "vLLM with OpenVINO backend": { "Precisions": [ { "Throughput": { - "0.2": "90.98", - "0.6": "266.24", - "0.8": "351.63", - "2.0": "195.16" + "0.2": 82.32, + "0.4": 162.98, + "0.6": 239.28, + "2.0": 270.37 + }, + "Latency": { + "0.2": 87.92, + "0.4": 142.3, + "0.6": 343.36, + "2.0": 873.0 } - }, + } + ] + } + } + }, + { + "Platform": "Intel® Xeon® Platinum 8480+", + "Model": "meta-llama/Meta-Llama-3-8B-Instruct", + "featured_SKU": true, + "whats_new_model": true, + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "OpenVINO Model Server": { + "Precisions": [ { + "Throughput": { + "0.2": 82.61, + "0.4": 164.44, + "0.6": 244.92, + "0.8": 323.34, + "1.0": 400.78, + "2.0": 731.9, + "inf": 848.45 + }, "Latency": { - "0.2": "54.9", - "0.6": "78.78", - "0.8": "95.78", - "2.0": "352.23" + "0.2": 60.77, + "0.4": 69.1, + "0.6": 74.36, + "0.8": 81.41, + "1.0": 100.17, + "2.0": 206.5, + "inf": 246.56 + } + } + ] + }, + "vLLM with OpenVINO backend": { + "Precisions": [ + { + "Throughput": { + "0.2": 82.54, + "0.4": 163.66, + "0.6": 243.88, + "0.8": 322.75, + "1.0": 400.46, + "2.0": 727.1 + }, + "Latency": { + "0.2": 65.37, + "0.4": 75.87, + "0.6": 81.14, + "0.8": 93.91, + "1.0": 107.13, + "2.0": 229.57 + } + } + ] + } + } + }, + { + "Platform": "Intel® Xeon® Platinum 8580", + "Model": "meta-llama/Meta-Llama-3-8B-Instruct", + "featured_SKU": true, + "whats_new_model": true, + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "OpenVINO Model Server": { + "Precisions": [ + { + "Throughput": { + "0.2": 82.55, + "0.4": 164.52, + "0.6": 243.96, + "0.8": 323.07, + "1.0": 399.68, + "2.0": 727.18, + "inf": 856.72 + }, + "Latency": { + "0.2": 54.57, + "0.4": 69.17, + "0.6": 80.32, + "0.8": 92.94, + "1.0": 111.06, + "2.0": 215.46, + "inf": 245.72 + } + } + ] + }, + "vLLM with OpenVINO backend": { + "Precisions": [ + { + "Throughput": { + "0.2": 82.64, + "0.6": 243.81, + "0.8": 321.8, + "1.0": 398.78, + "2.0": 722.48, + "inf": 792.34 + }, + "Latency": { + "0.2": 61.49, + "0.6": 90.54, + "0.8": 106.25, + "1.0": 123.6, + "2.0": 245.91, + "inf": 279.21 } } ] @@ -47,46 +332,168 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8530", + "Platform": "Intel® Xeon® Platinum 8380", "Model": "mistralai/Mistral-7B-v0.1", - "PlatformType": "None", + "featured_SKU": false, + "whats_new_model": false, + "PlatformType": "Server Platforms (Intel® Xeon®)", "Parameters": { - "Vllm": { + "OpenVINO Model Server": { + "Precisions": [ + { + "Throughput": { + "0.2": 91.74, + "0.4": 180.4, + "0.6": 262.97, + "0.8": 287.36, + "1.0": 289.08, + "2.0": 289.06, + "inf": 290.69 + }, + "Latency": { + "0.2": 74.84, + "0.4": 115.4, + "0.6": 
345.64, + "0.8": 757.42, + "1.0": 776.6, + "2.0": 778.29, + "inf": 784.42 + } + } + ] + }, + "vLLM with OpenVINO backend": { "Precisions": [ { "Throughput": { - "0.2": "350.06", - "0.6": "486.89", - "0.8": "575.92", - "2.0": "778.07" + "0.2": 97.21, + "0.4": 192.46, + "0.6": 265.82, + "0.8": 273.24, + "1.0": 272.65, + "inf": 274.0 + }, + "Latency": { + "0.2": 166.77, + "0.4": 161.76, + "0.6": 666.89, + "0.8": 802.15, + "1.0": 810.26, + "inf": 807.71 } - }, + } + ] + } + } + }, + { + "Platform": "Intel® Xeon® Platinum 8480+", + "Model": "mistralai/Mistral-7B-v0.1", + "featured_SKU": true, + "whats_new_model": false, + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "OpenVINO Model Server": { + "Precisions": [ { + "Throughput": { + "0.2": 90.95, + "0.4": 181.06, + "0.6": 267.29, + "0.8": 351.62, + "1.0": 431.45, + "2.0": 751.85, + "inf": 596.0 + }, "Latency": { - "0.2": "60.93", - "0.6": "91.63", - "0.8": "113.61", - "2.0": "240.25" + "0.2": 59.95, + "0.4": 63.41, + "0.6": 73.42, + "0.8": 85.99, + "1.0": 98.67, + "2.0": 205.2, + "inf": 205.97 } } ] }, - "Ovms": { + "vLLM with OpenVINO backend": { + "Precisions": [ + { + "Throughput": { + "0.2": 98.18, + "0.4": 194.35, + "0.6": 287.28, + "0.8": 376.31, + "1.0": 460.32, + "2.0": 771.81, + "inf": 789.38 + }, + "Latency": { + "0.2": 64.88, + "0.4": 73.3, + "0.6": 84.37, + "0.8": 100.8, + "1.0": 133.98, + "2.0": 240.99, + "inf": 251.55 + } + } + ] + } + } + }, + { + "Platform": "Intel® Xeon® Platinum 8580", + "Model": "mistralai/Mistral-7B-v0.1", + "featured_SKU": true, + "whats_new_model": false, + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "OpenVINO Model Server": { "Precisions": [ { "Throughput": { - "0.2": "90.98", - "0.6": "266.24", - "0.8": "351.63", - "2.0": "195.16" + "0.2": 91.2, + "0.4": 180.14, + "0.6": 267.75, + "0.8": 351.12, + "1.0": 428.31, + "2.0": 744.99, + "inf": 852.05 + }, + "Latency": { + "0.2": 54.31, + "0.4": 67.14, + "0.6": 77.59, + "0.8": 92.17, + "1.0": 112.75, + "2.0": 225.48, + "inf": 241.49 } - }, + } + ] + }, + "vLLM with OpenVINO backend": { + "Precisions": [ { + "Throughput": { + "0.2": 98.1, + "0.4": 194.47, + "0.6": 286.97, + "0.8": 375.84, + "1.0": 460.21, + "2.0": 764.54, + "inf": 787.97 + }, "Latency": { - "0.2": "54.9", - "0.6": "78.78", - "0.8": "95.78", - "2.0": "352.23" + "0.2": 62.26, + "0.4": 78.08, + "0.6": 91.61, + "0.8": 116.71, + "1.0": 141.76, + "2.0": 250.38, + "inf": 254.25 } } ] diff --git a/src/inference/include/openvino/runtime/infer_request.hpp b/src/inference/include/openvino/runtime/infer_request.hpp index ed4dcd67797b84..10a606a2b6c535 100644 --- a/src/inference/include/openvino/runtime/infer_request.hpp +++ b/src/inference/include/openvino/runtime/infer_request.hpp @@ -255,7 +255,7 @@ class OPENVINO_RUNTIME_API InferRequest { /** * @brief Infers specified input(s) in synchronous mode. * @note It blocks all methods of InferRequest while request is ongoing (running or waiting in a queue). - * Calling any method leads to throwning the ov::Busy exception. + * Calling any method leads to throwing the ov::Busy exception. */ void infer(); @@ -274,7 +274,7 @@ class OPENVINO_RUNTIME_API InferRequest { /** * @brief Starts inference of specified input(s) in asynchronous mode. * @note It returns immediately. Inference starts also immediately. - * Calling any method while the request in a running state leads to throwning the ov::Busy exception. + * Calling any method while the request in a running state leads to throwing the ov::Busy exception. 
*/ void start_async(); diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp index 510ab7fc43b0c8..1fc3a3e20965c6 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp @@ -270,4 +270,22 @@ struct BYPASS_UMD_CACHING final : OptionBase { return OptionMode::RunTime; } }; + +// +// RUN_INFERENCES_SEQUENTIALLY +// +struct RUN_INFERENCES_SEQUENTIALLY final : OptionBase { + static std::string_view key() { + return ov::intel_npu::run_inferences_sequentially.name(); + } + + static bool defaultValue() { + return false; + } + + static OptionMode mode() { + return OptionMode::RunTime; + } +}; + } // namespace intel_npu diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp index ec92e10a9f89c8..8aabd132e9431a 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp @@ -327,5 +327,14 @@ static constexpr ov::Property backend_n */ static constexpr ov::Property backend_compilation_params{"NPU_BACKEND_COMPILATION_PARAMS"}; +/** + * @brief [Only for NPU Plugin] + * Type: boolean, default is false. + * This option allows to run inferences sequentially, in the order in which they were created + * @note Experimental property, for now it only works in very specific scenarios. We need driver updates before we can + * implement a robust solution for in-order execution + */ +static constexpr ov::Property run_inferences_sequentially{"NPU_RUN_INFERENCES_SEQUENTIALLY"}; + } // namespace intel_npu } // namespace ov diff --git a/src/plugins/intel_npu/src/al/src/config/runtime.cpp b/src/plugins/intel_npu/src/al/src/config/runtime.cpp index 759956b6f597df..3da16796219332 100644 --- a/src/plugins/intel_npu/src/al/src/config/runtime.cpp +++ b/src/plugins/intel_npu/src/al/src/config/runtime.cpp @@ -27,6 +27,7 @@ void intel_npu::registerRunTimeOptions(OptionsDesc& desc) { desc.add(); desc.add(); desc.add(); + desc.add(); } // Heuristically obtained number. Varies depending on the values of PLATFORM and PERFORMANCE_HINT diff --git a/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp b/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp index 3efbdab666d1ba..1e8781b0afe820 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp @@ -38,25 +38,6 @@ class ZeroInferRequest final : public SyncInferRequest { std::vector get_profiling_info() const override; std::vector get_raw_profiling_data() const; - /** - * @brief Determines if batching can be addressed inside the plugin. In the positive case, the batch size used by - * the model will also be deduced and returned. - * @details Batching can be handled by the plugin only if: - * - The batch axis is the first axis. - * - The batch size received by the compiler takes the default value of 1. - * - The batch size found in the IR model matches for all inputs/outputs and takes a value different than the - * default one. - * - * If any of the previous conditions is not fulfilled, the functon will return the default batch size, thus no - * custom algorithm will be applied inside the plugin in order to address batching. 
-     *
-     * @param metadata Metadata containing the shape values as seen by both the compiler and IR model. These will
-     * ultimately be used for determining the batch size.
-     * @returns The batch size deduced by the algorithm or the default value of 1 if batching cannot be performed inside
-     * the plugin.
-     */
-    std::optional get_batch_size(const NetworkMetadata& metadata);
-
     /**
      * @brief Check the received tensor and set the Level Zero tensor accordingly
      * @param tensor Reference to a tensor.
@@ -106,22 +87,6 @@ class ZeroInferRequest final : public SyncInferRequest {
     std::shared_ptr _npuProfiling;
     std::unique_ptr _pipeline;

-    /**
-     * @brief Indicates how many command lists will be used inside the pipeline.
-     * @details Leveraging multiple command lists implies distributing the input/output buffers accross the batch axis
-     * between these lists.
-     *
-     * If batching is handled on compiler's side then a single command list shall be used, we don't do any
-     * specific operation inside the plugin in this case.
-     */
-    size_t _numberOfCommandLists = 1;
-
-    /**
-     * @brief The batch size used by the corresponding model.
-     * @details The attribute contains a value only if the plugin performs the batches splitting operation.
-     */
-    std::optional _batchSize = std::nullopt;
-
     bool _pipelineIsCreated = false;
 };

diff --git a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
index 5b7f488d3eb96a..de5e1ac81c4728 100644
--- a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
+++ b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
@@ -28,7 +28,6 @@ struct Pipeline {
              const std::shared_ptr& npu_profiling,
              const std::vector>>& inputTensorsData,
              const std::vector>& outputTensorsData,
-             size_t numberOfCommandLists,
              uint32_t group_ordinal);

     Pipeline(const Pipeline&) = delete;
@@ -43,12 +42,25 @@ struct Pipeline {
     void updateCommandList(const TensorData& tensorsData, uint32_t index, size_t commandListIndex);

 protected:
+    std::shared_ptr _graph;
     const Config _config;
+    const uint32_t _id;
+
+    /**
+     * @brief Indicates how many command lists will be used inside the pipeline.
+     * @details Leveraging multiple command lists implies distributing the input/output buffers across the batch axis
+     * between these lists.
+     *
+     * If batching is handled on compiler's side then a single command list shall be used, we don't do any
+     * specific operation inside the plugin in this case.
+ */ + size_t _number_of_command_lists; + std::shared_ptr _command_queue; std::vector> _command_lists; std::vector> _fences; - EventPool _event_pool; - std::vector> _events; + std::shared_ptr _event_pool; + std::vector> _events; bool sync_output_with_fences_ = true; std::shared_ptr _npu_profiling; Logger _logger; diff --git a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp index 88dfaf944a8b34..a0e5d2d11c1fef 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp @@ -20,8 +20,6 @@ using namespace intel_npu; namespace { constexpr std::size_t SINGLE_TENSOR = 0; -constexpr std::size_t BATCH_AXIS = 0; -constexpr std::size_t DEFAULT_BATCH_SIZE = 1; constexpr bool INPUT = true; constexpr bool OUTPUT = false; @@ -96,64 +94,6 @@ bool memory_was_allocated_in_the_same_l0_context(ze_context_handle_t hContext, c } // namespace -std::optional ZeroInferRequest::get_batch_size(const NetworkMetadata& metadata) { - if (!metadata.outputs.at(0).shapeFromIRModel.has_value()) { - _logger.debug("Batching on the plugin is not used, batching is handled by the compiler"); - return std::nullopt; - } - - const ov::PartialShape& firstOutputShape = *metadata.outputs.at(0).shapeFromIRModel; - if (firstOutputShape.is_dynamic()) { - _logger.warning("Networks using dynamic shapes are not supported when batching is handled by the plugin"); - return std::nullopt; - } - if (firstOutputShape.rank().get_length() == 0) { - _logger.warning( - "Networks using rank 0 shapes for inputs/outputs are not supported when batching is handled by the plugin"); - return std::nullopt; - } - - const size_t candidateBatchSize = firstOutputShape[BATCH_AXIS].get_length(); - if (candidateBatchSize == 0 || candidateBatchSize == DEFAULT_BATCH_SIZE) { - _logger.debug("Batching on the plugin is not used, batching is handled by the compiler"); - return std::nullopt; - } - - auto checkDescriptorsUseCandidateBatchSize = [candidateBatchSize](const std::vector& descriptors) { - for (const IODescriptor& descriptor : descriptors) { - OPENVINO_ASSERT(descriptor.shapeFromIRModel.has_value(), - "Missing value for the \"shapeFromIRModel\" attribute, I/O descriptor"); - - const ov::PartialShape& shapeFromCompiler = descriptor.shapeFromCompiler; - const ov::PartialShape& shapeFromIRModel = *descriptor.shapeFromIRModel; - - if (shapeFromCompiler.is_dynamic() || shapeFromCompiler.rank().get_length() == 0 || - *shapeFromCompiler.begin() != DEFAULT_BATCH_SIZE) { - return false; - } - - if (!descriptor.isStateInput && !descriptor.isStateOutput && !descriptor.isShapeTensor) { - if (shapeFromIRModel.is_dynamic() || shapeFromIRModel.rank().get_length() == 0 || - *shapeFromIRModel.begin() != candidateBatchSize) { - return false; - } - } - } - - return true; - }; - - if (!checkDescriptorsUseCandidateBatchSize(metadata.inputs) || - !checkDescriptorsUseCandidateBatchSize(metadata.outputs)) { - _logger.debug("Batching on the plugin is not used, batching is handled by the compiler"); - return std::nullopt; - } - - _logger.debug("Batching is handled by the plugin"); - - return candidateBatchSize; -} - //------------------------------------------------------------------------------ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& initStructs, const std::shared_ptr& compiledModel, @@ -187,13 +127,6 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& _inputAllocator = std::make_shared(_initStructs, 
ZE_HOST_MEM_ALLOC_FLAG_BIAS_WRITE_COMBINED); - if (config.get() != ov::intel_npu::BatchMode::COMPILER) { - _batchSize = get_batch_size(_metadata); - } - if (_batchSize.has_value()) { - _numberOfCommandLists = *_batchSize; - } - _logger.debug("ZeroInferRequest::ZeroInferRequest - checking level zero attributes and allocating tensors"); size_t ioIndex = 0; @@ -205,7 +138,8 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& continue; } - get_level_zero_input(ioIndex) = allocate_tensor(inputDescriptor, ioIndex, INPUT, *_inputAllocator, _batchSize); + get_level_zero_input(ioIndex) = + allocate_tensor(inputDescriptor, ioIndex, INPUT, *_inputAllocator, _graph->get_batch_size()); get_input_tensor_data(ioIndex) = TensorData{get_level_zero_input(ioIndex)->data(), get_level_zero_input(ioIndex)->get_byte_size()}; @@ -222,7 +156,7 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& } _levelZeroOutputTensors.at(ioIndex) = - allocate_tensor(outputDescriptor, ioIndex, OUTPUT, *_outputAllocator, _batchSize); + allocate_tensor(outputDescriptor, ioIndex, OUTPUT, *_outputAllocator, _graph->get_batch_size()); _outputTensorsData.at(ioIndex) = std::optional(TensorData{_levelZeroOutputTensors.at(ioIndex)->data(), _levelZeroOutputTensors.at(ioIndex)->get_byte_size()}); @@ -236,7 +170,7 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& void ZeroInferRequest::create_pipeline() { for (size_t inputIndex = 0; inputIndex < _metadata.inputs.size(); ++inputIndex) { if (is_batched_input(inputIndex)) { - if (_batchSize.has_value()) { + if (_graph->get_batch_size().has_value()) { _logger.debug("ZeroInferRequest::create_pipeline - tensors %s were already allocated", _metadata.inputs.at(inputIndex).nodeFriendlyName.c_str()); continue; @@ -250,8 +184,11 @@ void ZeroInferRequest::create_pipeline() { } _logger.debug("ZeroInferRequest::create_pipeline - allocate new tensor"); - get_level_zero_input(inputIndex) = - allocate_tensor(_metadata.inputs.at(inputIndex), inputIndex, INPUT, *_inputAllocator, _batchSize); + get_level_zero_input(inputIndex) = allocate_tensor(_metadata.inputs.at(inputIndex), + inputIndex, + INPUT, + *_inputAllocator, + _graph->get_batch_size()); get_input_tensor_data(inputIndex) = std::optional( TensorData{get_level_zero_input(inputIndex)->data(), get_level_zero_input(inputIndex)->get_byte_size()}); } @@ -263,17 +200,20 @@ void ZeroInferRequest::create_pipeline() { continue; } _logger.debug("ZeroInferRequest::create_pipeline - allocate new tensor"); - _levelZeroOutputTensors.at(outputIndex) = - allocate_tensor(_metadata.outputs.at(outputIndex), outputIndex, OUTPUT, *_outputAllocator, _batchSize); + _levelZeroOutputTensors.at(outputIndex) = allocate_tensor(_metadata.outputs.at(outputIndex), + outputIndex, + OUTPUT, + *_outputAllocator, + _graph->get_batch_size()); _outputTensorsData.at(outputIndex) = std::optional(TensorData{_levelZeroOutputTensors.at(outputIndex)->data(), _levelZeroOutputTensors.at(outputIndex)->get_byte_size()}); } // Find the corresponding command queue group. 
- _logger.debug("ZeroDevice::ZeroDevice - findGroupOrdinal"); + _logger.debug("ZeroInferRequest::create_pipeline - findGroupOrdinal"); auto groupOrdinal = zeroUtils::findGroupOrdinal(_initStructs->getDevice(), _properties); - _logger.debug("ZeroDevice::ZeroDevice - init completed"); + _logger.debug("ZeroInferRequest::create_pipeline - init completed"); _logger.debug("ZeroInferRequest::create_pipeline - constructing pipeline"); @@ -286,7 +226,6 @@ void ZeroInferRequest::create_pipeline() { _npuProfiling, _inputTensorsData, _outputTensorsData, - _numberOfCommandLists, groupOrdinal); _logger.debug("ZeroInferRequest::create_pipeline - SyncInferRequest completed"); @@ -321,7 +260,7 @@ void ZeroInferRequest::set_tensor_data(const std::shared_ptr& tenso index, isInput, isInput ? *_inputAllocator : *_outputAllocator, - _batchSize); + _graph->get_batch_size()); setTensorData = true; levelZeroTensorCreatedLocally = true; @@ -444,7 +383,7 @@ void ZeroInferRequest::set_tensors(const ov::Output& port, get_user_inputs(foundPort.idx) = tensors; if (_initStructs->getMutableCommandListVersion()) { - if (_batchSize.has_value()) { + if (_graph->get_batch_size().has_value()) { for (size_t i = 0; i < tensors.size(); i++) { auto remoteTensor = std::dynamic_pointer_cast(tensors[i]._ptr); @@ -525,13 +464,17 @@ ov::SoPtr ZeroInferRequest::get_tensor(const ov::Outputget_batch_size()); tensorsData = std::optional(TensorData{levelZeroTensors->data(), levelZeroTensors->get_byte_size()}); return levelZeroTensors; } void ZeroInferRequest::infer() { + if (_config.get()) { + OPENVINO_THROW("Only start async is supported when RUN_INFERENCES_SEQUENTIALLY is enabled!"); + } + infer_async(); get_result(); } @@ -567,7 +510,7 @@ void ZeroInferRequest::infer_async() { } if (is_batched_input(inputIndex)) { - if (_batchSize.has_value()) { + if (_graph->get_batch_size().has_value()) { for (size_t i = 0; i < userTensor.size(); i++) { auto levelZeroBatchRemoteTensor = std::dynamic_pointer_cast(get_level_zero_input(inputIndex, i)); diff --git a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp index c782c3e0684f0d..d7f06b813810bb 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp @@ -8,6 +8,7 @@ #include #include "intel_npu/common/itt.hpp" +#include "intel_npu/config/runtime.hpp" #include "intel_npu/prefix.hpp" #include "intel_npu/utils/logger/logger.hpp" #include "intel_npu/utils/zero/zero_api.hpp" @@ -23,13 +24,15 @@ Pipeline::Pipeline(const Config& config, const std::shared_ptr& npu_profiling, const std::vector>>& inputTensorsData, const std::vector>& outputTensorsData, - size_t numberOfCommandLists, uint32_t group_ordinal) - : _config(config), - _command_queue(graph->get_command_queue()), - _event_pool{initStructs->getDevice(), - initStructs->getContext(), - numberOfCommandLists ? static_cast(numberOfCommandLists) : 1}, + : _graph(graph), + _config(config), + _id(_graph->get_unique_id()), + _number_of_command_lists(_graph->get_batch_size().has_value() ? *_graph->get_batch_size() : 1), + _event_pool{ + std::make_shared(initStructs->getDevice(), + initStructs->getContext(), + _number_of_command_lists ? 
static_cast(_number_of_command_lists) : 1)}, _npu_profiling(npu_profiling), _logger("Pipeline", _config.get()) { OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "Zero_infer_request::Pipeline::Pipeline"); @@ -39,20 +42,20 @@ Pipeline::Pipeline(const Config& config, profiling_query.create(profiling_pool._handle); } - _command_lists.reserve(numberOfCommandLists); - _events.reserve(numberOfCommandLists); - _fences.reserve(numberOfCommandLists); + _command_lists.reserve(_number_of_command_lists); + _events.reserve(_number_of_command_lists); + _fences.reserve(_number_of_command_lists); _logger.debug("Pipeline - emplace_back _event_pool and _command_queue"); - for (size_t i = 0; i < numberOfCommandLists; i++) { + for (size_t i = 0; i < _number_of_command_lists; i++) { _command_lists.emplace_back( std::make_unique(initStructs, group_ordinal, initStructs->getMutableCommandListVersion() ? true : false)); - _events.emplace_back(std::make_unique(_event_pool.handle(), static_cast(i))); - _fences.emplace_back(std::make_unique(*_command_queue)); + _events.emplace_back(std::make_shared(_event_pool, static_cast(i))); + _fences.emplace_back(std::make_unique(*_graph->get_command_queue())); } - for (size_t i = 0; i < numberOfCommandLists; i++) { + for (size_t i = 0; i < _number_of_command_lists; i++) { size_t ioIndex = 0; for (const auto& desc : graph->get_input_descriptors()) { if (inputTensorsData.at(ioIndex).size() > 1) { @@ -64,7 +67,7 @@ Pipeline::Pipeline(const Config& config, graph->set_argument_value(desc.idx, static_cast(inputTensorsData.at(ioIndex).at(0)->mem) + - (i * inputTensorsData.at(ioIndex).at(0)->size) / numberOfCommandLists); + (i * inputTensorsData.at(ioIndex).at(0)->size) / _number_of_command_lists); ++ioIndex; } @@ -73,10 +76,16 @@ Pipeline::Pipeline(const Config& config, for (const auto& desc : graph->get_output_descriptors()) { graph->set_argument_value(desc.idx, static_cast(outputTensorsData.at(ioIndex)->mem) + - (i * outputTensorsData.at(ioIndex)->size) / numberOfCommandLists); + (i * outputTensorsData.at(ioIndex)->size) / _number_of_command_lists); ++ioIndex; } + if (_config.get()) { + if (_graph->get_last_submitted_event(i)) { + _graph->get_last_submitted_event(i)->AppendWaitOnEvent(*_command_lists.at(i)); + } + } + /// append timestamp command if feature was activated if (_npu_profiling != nullptr) { _command_lists.at(i)->appendBarrier(); @@ -92,6 +101,15 @@ Pipeline::Pipeline(const Config& config, _command_lists.at(i)->appendNpuTimestamp(reinterpret_cast(_npu_profiling->npu_ts_infer_end)); } + if (_config.get()) { + if (_graph->get_last_submitted_event(i)) { + _graph->get_last_submitted_event(i)->AppendEventReset(*_command_lists.at(i)); + } + + _events.at(i)->AppendSignalEvent(*_command_lists.at(i)); + _graph->set_last_submitted_event(_events.at(i), i); + } + // appendBarrier used in L0 as well if (!sync_output_with_fences_) { _command_lists.at(i)->appendBarrier(); @@ -105,12 +123,24 @@ Pipeline::Pipeline(const Config& config, void Pipeline::push() { _logger.debug("Pipeline - push() started"); + if (_config.get()) { + if (_id) { + auto previousIndex = _graph->get_last_submitted_id(); + + if (_id != ++previousIndex) { + OPENVINO_THROW("Inferences should be called in the same order they were called the first time!"); + } + } + + _graph->set_last_submitted_id(_id); + } + for (size_t i = 0; i < _command_lists.size(); ++i) { OV_ITT_TASK_CHAIN(ZERO_PIPELINE_IP_PUSH, itt::domains::LevelZeroBackend, "Pipeline", "push"); if (sync_output_with_fences_) { - 
_command_queue->executeCommandList(*_command_lists.at(i), *_fences.at(i)); + _graph->get_command_queue()->executeCommandList(*_command_lists.at(i), *_fences.at(i)); } else { - _command_queue->executeCommandList(*_command_lists.at(i)); + _graph->get_command_queue()->executeCommandList(*_command_lists.at(i)); } } @@ -154,12 +184,12 @@ void Pipeline::updateCommandList(const TensorData& tensorsData, uint32_t index) OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_UMCL, itt::domains::LevelZeroBackend, "Pipeline", "updateCommandList"); _logger.debug("Pipeline - updateCommandList"); - const size_t numberOfCommandLists = _command_lists.size(); + const size_t _number_of_command_lists = _command_lists.size(); - for (size_t i = 0; i < numberOfCommandLists; i++) { + for (size_t i = 0; i < _number_of_command_lists; i++) { _command_lists.at(i)->updateMutableCommandList( index, - static_cast(tensorsData.mem) + (i * tensorsData.size) / numberOfCommandLists); + static_cast(tensorsData.mem) + (i * tensorsData.size) / _number_of_command_lists); _command_lists.at(i)->close(); } }; @@ -168,9 +198,9 @@ void Pipeline::updateCommandList(const TensorData& tensorsData, uint32_t index, OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_UMCL, itt::domains::LevelZeroBackend, "Pipeline", "updateCommandList"); _logger.debug("Pipeline - updateCommandList"); - const size_t numberOfCommandLists = _command_lists.size(); + const size_t _number_of_command_lists = _command_lists.size(); - OPENVINO_ASSERT(commandListIndex < numberOfCommandLists, + OPENVINO_ASSERT(commandListIndex < _number_of_command_lists, "Command list index is higgher than the number of Command lists ", commandListIndex); diff --git a/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp b/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp index 51c4a4cf26eafd..7e718d9172f4f7 100644 --- a/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp +++ b/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp @@ -9,6 +9,7 @@ #include #include "intel_npu/network_metadata.hpp" +#include "intel_npu/utils/zero/zero_init.hpp" #include "intel_npu/utils/zero/zero_utils.hpp" #include "intel_npu/utils/zero/zero_wrappers.hpp" #include "openvino/runtime/profiling_info.hpp" @@ -17,13 +18,10 @@ namespace intel_npu { class IGraph : public std::enable_shared_from_this { public: - IGraph(ze_graph_handle_t handle, NetworkMetadata metadata, std::optional> blob) - : _handle(handle), - _metadata(std::move(metadata)) { - if (blob.has_value()) { - _blob = std::move(*blob); - } - } + IGraph(ze_graph_handle_t handle, + NetworkMetadata metadata, + const Config& config, + std::optional> blob); virtual void export_blob(std::ostream& stream) const = 0; @@ -36,55 +34,48 @@ class IGraph : public std::enable_shared_from_this { virtual ~IGraph() = default; - const NetworkMetadata& get_metadata() const { - return _metadata; - } - - ze_graph_handle_t get_handle() const { - return _handle; - } - - void update_network_name(std::string_view name) { - _metadata.name = name; - } - - inline const std::vector& get_input_descriptors() const { - return _input_descriptors; - } - - inline const std::vector& get_output_descriptors() const { - return _output_descriptors; - } - - inline const std::shared_ptr& get_command_queue() const { - return _command_queue; - } - - void set_workload_type(const ov::WorkloadType workloadType) const { - if (_command_queue == nullptr) { - return; - } - - ze_command_queue_workload_type_t zeWorkloadType; - switch (workloadType) { - case 
ov::WorkloadType::DEFAULT:
-            zeWorkloadType = ze_command_queue_workload_type_t::ZE_WORKLOAD_TYPE_DEFAULT;
-            break;
-        case ov::WorkloadType::EFFICIENT:
-            zeWorkloadType = ze_command_queue_workload_type_t::ZE_WORKLOAD_TYPE_BACKGROUND;
-            break;
-        default:
-            OPENVINO_THROW("Unknown value for WorkloadType!");
-        }
-
-        _command_queue->setWorkloadType(zeWorkloadType);
-    }
-
-    std::mutex& get_mutex() {
-        return _mutex;
-    }
+    const NetworkMetadata& get_metadata() const;
+    ze_graph_handle_t get_handle() const;
+
+    void update_network_name(std::string_view name);
+
+    const std::vector& get_input_descriptors() const;
+    const std::vector& get_output_descriptors() const;
+    const std::shared_ptr& get_command_queue() const;
+
+    void set_workload_type(const ov::WorkloadType workloadType) const;
+
+    std::mutex& get_mutex();
+
+    void set_last_submitted_event(const std::shared_ptr& event, size_t indexOfCommandList);
+    const std::shared_ptr& get_last_submitted_event(size_t indexOfCommandList) const;
+
+    uint32_t get_unique_id();
+    void set_last_submitted_id(uint32_t id_index);
+    const uint32_t get_last_submitted_id() const;
+
+    const std::optional get_batch_size() const;

 protected:
+    /**
+     * @brief Determines if batching can be addressed inside the plugin. In the positive case, the batch size used by
+     * the model will also be deduced and returned.
+     * @details Batching can be handled by the plugin only if:
+     * - The batch axis is the first axis.
+     * - The batch size received by the compiler takes the default value of 1.
+     * - The batch size found in the IR model matches for all inputs/outputs and takes a value different than the
+     * default one.
+     *
+     * If any of the previous conditions is not fulfilled, the function will return the default batch size, thus no
+     * custom algorithm will be applied inside the plugin in order to address batching.
+     *
+     * @param metadata Metadata containing the shape values as seen by both the compiler and IR model. These will
+     * ultimately be used for determining the batch size.
+     * @returns The batch size deduced by the algorithm or the default value of 1 if batching cannot be performed inside
+     * the plugin.
+     */
+    std::optional get_batch_size(const NetworkMetadata& metadata);
+
     ze_graph_handle_t _handle = nullptr;
     NetworkMetadata _metadata;

@@ -92,12 +83,24 @@ class IGraph : public std::enable_shared_from_this {
     std::vector _output_descriptors;

     std::shared_ptr _command_queue;
+    std::vector> _last_submitted_event;

     // Used to protect zero pipeline creation in the graph. The pipeline should be created only once per graph when the
     // first inference starts running
     std::mutex _mutex;

     std::vector _blob;
+
+    uint32_t _unique_id = 0;
+    uint32_t _last_submitted_id;
+
+    /**
+     * @brief The batch size used by the corresponding model.
+     * @details The attribute contains a value only if the plugin performs the batches splitting operation.
+ */ + std::optional _batch_size = std::nullopt; + + Logger _logger; }; } // namespace intel_npu diff --git a/src/plugins/intel_npu/src/common/src/igraph.cpp b/src/plugins/intel_npu/src/common/src/igraph.cpp new file mode 100644 index 00000000000000..fd5463af5eea3e --- /dev/null +++ b/src/plugins/intel_npu/src/common/src/igraph.cpp @@ -0,0 +1,159 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "intel_npu/common/igraph.hpp" + +#include "intel_npu/config/compiler.hpp" +#include "intel_npu/config/runtime.hpp" + +namespace { +constexpr std::size_t BATCH_AXIS = 0; +constexpr std::size_t DEFAULT_BATCH_SIZE = 1; +} // namespace + +namespace intel_npu { + +IGraph::IGraph(ze_graph_handle_t handle, + NetworkMetadata metadata, + const Config& config, + std::optional> blob) + : _handle(handle), + _metadata(std::move(metadata)), + _logger("IGraph", config.get()) { + if (blob.has_value()) { + _blob = std::move(*blob); + } +} + +const NetworkMetadata& IGraph::get_metadata() const { + return _metadata; +} + +ze_graph_handle_t IGraph::get_handle() const { + return _handle; +} + +void IGraph::update_network_name(std::string_view name) { + _metadata.name = name; +} + +const std::vector& IGraph::get_input_descriptors() const { + return _input_descriptors; +} + +const std::vector& IGraph::get_output_descriptors() const { + return _output_descriptors; +} + +const std::shared_ptr& IGraph::get_command_queue() const { + return _command_queue; +} + +void IGraph::set_workload_type(const ov::WorkloadType workloadType) const { + if (_command_queue == nullptr) { + return; + } + + ze_command_queue_workload_type_t zeWorkloadType; + switch (workloadType) { + case ov::WorkloadType::DEFAULT: + zeWorkloadType = ze_command_queue_workload_type_t::ZE_WORKLOAD_TYPE_DEFAULT; + break; + case ov::WorkloadType::EFFICIENT: + zeWorkloadType = ze_command_queue_workload_type_t::ZE_WORKLOAD_TYPE_BACKGROUND; + break; + default: + OPENVINO_THROW("Unknown value for WorkloadType!"); + } + + _command_queue->setWorkloadType(zeWorkloadType); +} + +std::mutex& IGraph::get_mutex() { + return _mutex; +} + +void IGraph::set_last_submitted_event(const std::shared_ptr& event, size_t indexOfCommandList) { + _last_submitted_event[indexOfCommandList] = event; +} + +const std::shared_ptr& IGraph::get_last_submitted_event(size_t indexOfCommandList) const { + return _last_submitted_event[indexOfCommandList]; +} + +uint32_t IGraph::get_unique_id() { + return _unique_id++; +} + +void IGraph::set_last_submitted_id(uint32_t id_index) { + _last_submitted_id = id_index; +} + +const uint32_t IGraph::get_last_submitted_id() const { + return _last_submitted_id; +} + +std::optional IGraph::get_batch_size(const NetworkMetadata& metadata) { + if (!metadata.outputs.at(0).shapeFromIRModel.has_value()) { + _logger.debug("Batching on the plugin is not used, batching is handled by the compiler"); + return std::nullopt; + } + + const ov::PartialShape& firstOutputShape = *metadata.outputs.at(0).shapeFromIRModel; + if (firstOutputShape.is_dynamic()) { + _logger.warning("Networks using dynamic shapes are not supported when batching is handled by the plugin"); + return std::nullopt; + } + if (firstOutputShape.rank().get_length() == 0) { + _logger.warning("Networks using rank 0 shapes for inputs/outputs are not supported when batching is " + "handled by the plugin"); + return std::nullopt; + } + + const size_t candidateBatchSize = firstOutputShape[BATCH_AXIS].get_length(); + if (candidateBatchSize == 0 || 
candidateBatchSize == DEFAULT_BATCH_SIZE) { + _logger.debug("Batching on the plugin is not used, batching is handled by the compiler"); + return std::nullopt; + } + + auto checkDescriptorsUseCandidateBatchSize = [candidateBatchSize](const std::vector& descriptors) { + for (const IODescriptor& descriptor : descriptors) { + OPENVINO_ASSERT(descriptor.shapeFromIRModel.has_value(), + "Missing value for the \"shapeFromIRModel\" attribute, I/O descriptor"); + + const ov::PartialShape& shapeFromCompiler = descriptor.shapeFromCompiler; + const ov::PartialShape& shapeFromIRModel = *descriptor.shapeFromIRModel; + + if (shapeFromCompiler.is_dynamic() || shapeFromCompiler.rank().get_length() == 0 || + *shapeFromCompiler.begin() != DEFAULT_BATCH_SIZE) { + return false; + } + + if (!descriptor.isStateInput && !descriptor.isStateOutput && !descriptor.isShapeTensor) { + if (shapeFromIRModel.is_dynamic() || shapeFromIRModel.rank().get_length() == 0 || + *shapeFromIRModel.begin() != candidateBatchSize) { + return false; + } + } + } + + return true; + }; + + if (!checkDescriptorsUseCandidateBatchSize(metadata.inputs) || + !checkDescriptorsUseCandidateBatchSize(metadata.outputs)) { + _logger.debug("Batching on the plugin is not used, batching is handled by the compiler"); + return std::nullopt; + } + + _logger.debug("Batching is handled by the plugin"); + + return candidateBatchSize; +} + +const std::optional IGraph::get_batch_size() const { + return _batch_size; +} + +} // namespace intel_npu diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp index f819ed73711cf2..9d634656db109a 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp +++ b/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp @@ -541,13 +541,21 @@ std::string DriverCompilerAdapter::serializeConfig(const Config& config, content = std::regex_replace(content, std::regex(batchstr.str()), ""); } - // NPU_DEFER_WEIGHTS_LOAD is not supported in versions < 6.2 - need to remove it - if ((compilerVersion.major < 6) || (compilerVersion.major == 6 && compilerVersion.minor < 2)) { + // NPU_DEFER_WEIGHTS_LOAD is needed at runtime only + { std::ostringstream batchstr; batchstr << ov::intel_npu::defer_weights_load.name() << KEY_VALUE_SEPARATOR << VALUE_DELIMITER << "\\S+" << VALUE_DELIMITER; - logger.warning( - "NPU_DEFER_WEIGHTS_LOAD property is not suppored by this compiler version. Removing from parameters"); + logger.info("NPU_DEFER_WEIGHTS_LOAD property is needed at runtime only. Removing from parameters"); + content = std::regex_replace(content, std::regex(batchstr.str()), ""); + } + + // NPU_RUN_INFERENCES_SEQUENTIALLY is needed at runtime only + { + std::ostringstream batchstr; + batchstr << ov::intel_npu::run_inferences_sequentially.name() << KEY_VALUE_SEPARATOR << VALUE_DELIMITER + << "\\S+" << VALUE_DELIMITER; + logger.info("NPU_RUN_INFERENCES_SEQUENTIALLY property is needed at runtime only. 
Removing from parameters"); content = std::regex_replace(content, std::regex(batchstr.str()), ""); } diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp index e1f3990b835e8d..0d180f983ad3a9 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp +++ b/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp @@ -16,7 +16,7 @@ DriverGraph::DriverGraph(const std::shared_ptr& zeGraphExt, NetworkMetadata metadata, const Config& config, std::optional> blob) - : IGraph(graphHandle, std::move(metadata), std::move(blob)), + : IGraph(graphHandle, std::move(metadata), config, std::move(blob)), _zeGraphExt(zeGraphExt), _zeroInitStruct(zeroInitStruct), _logger("DriverGraph", config.get()) { @@ -126,6 +126,16 @@ void DriverGraph::initialize(const Config& config) { // _zeGraphExt->initializeGraph(). The driver will not access the original blob from this moment on, so we are // releasing it here to avoid unnecessary memory usage. _blobIsReleased = release_blob(config); + + if (config.get() != ov::intel_npu::BatchMode::COMPILER) { + _batch_size = get_batch_size(_metadata); + } + + if (config.get()) { + auto number_of_command_lists = _batch_size.has_value() ? *_batch_size : 1; + + _last_submitted_event.resize(number_of_command_lists); + } } bool DriverGraph::release_blob(const Config& config) { diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp index c99069a0a9760f..b1658e7e0582e0 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp +++ b/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp @@ -17,7 +17,7 @@ PluginGraph::PluginGraph(const std::shared_ptr& zeGraphExt, NetworkMetadata metadata, std::vector blob, const Config& config) - : IGraph(graphHandle, std::move(metadata), std::optional>(std::move(blob))), + : IGraph(graphHandle, std::move(metadata), config, std::optional>(std::move(blob))), _zeGraphExt(zeGraphExt), _zeroInitStruct(zeroInitStruct), _compiler(compiler), @@ -115,6 +115,16 @@ void PluginGraph::initialize(const Config& config) { _zeGraphExt->initializeGraph(_handle, config); + if (config.get() != ov::intel_npu::BatchMode::COMPILER) { + _batch_size = get_batch_size(_metadata); + } + + if (config.get()) { + auto number_of_command_lists = _batch_size.has_value() ? 
*_batch_size : 1; + + _last_submitted_event.resize(number_of_command_lists); + } + _logger.debug("Graph initialize finish"); } diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index c6be2793fe6f70..b9cdad9f4879db 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -434,6 +434,7 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, // Finalize memory in closures and weight banks finalize_weights_bank(); + detach_memory(); // Print stats report when possible { @@ -499,6 +500,23 @@ void ov::npuw::CompiledModel::finalize_weights_bank() { LOG_INFO("Done."); } +void ov::npuw::CompiledModel::detach_memory() { + LOG_INFO("Detaching model & weight memory..."); + LOG_BLOCK(); + for (size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) { + auto& comp_model_desc = m_compiled_submodels[idx]; + auto& proto_comp_model_desc = m_compiled_submodels[comp_model_desc.replaced_by.value_or(idx)]; + if (!proto_comp_model_desc.model || !proto_comp_model_desc.compiled_model) { + continue; // optimized-out OR already cleared - skip + } + if (proto_comp_model_desc.device_it + 1 == m_dev_list.end()) { + LOG_INFO("No fallback expected - clear the OV model for Subgraph[" << idx << "]"); + proto_comp_model_desc.model.reset(); + } + } + LOG_INFO("Done"); +} + std::string ov::npuw::CompiledModel::global_mem_device() const { // Force globally set device if set const std::string device_alloc = m_cfg.get<::intel_npu::NPUW_WEIGHTS_BANK_ALLOC>(); @@ -668,6 +686,10 @@ ov::SoPtr ov::npuw::CompiledModel::compile_submodel(const st // NOTE(dm): Not sure if it is required for the NPUW plugin, but likely it is auto& device_config = m_meta_devices[device]; + if (ov::npuw::util::starts_with(device, "NPU") && m_cfg.get<::intel_npu::NPUW_UNFOLD_IREQS>()) { + device_config["NPU_RUN_INFERENCES_SEQUENTIALLY"] = "YES"; + } + const auto& cache_dir = m_cfg.get<::intel_npu::NPUW_CACHE_DIR>(); if (!cache_dir.empty()) { LOG_INFO("NPUW will try to utilize CACHE_DIR for " << submodel->get_friendly_name() << " submodel."); diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp index ece1bc78fb5bf5..8ccb1f83349e47 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp @@ -78,6 +78,7 @@ class CompiledModel : public ov::ICompiledModel { void implement_properties(); void finalize_weights_bank(); + void detach_memory(); std::string global_mem_device() const; std::string funcall_mem_device(const std::size_t idx) const; diff --git a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp index 81521222ae6fae..133101da8b7d38 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp @@ -19,23 +19,34 @@ namespace npuw { namespace weights { namespace op { struct Const { - std::shared_ptr node; - + std::shared_ptr m_node; + ov::element::Type m_cached_type; + ov::Shape m_cached_shape; + const void* m_cached_ptr = nullptr; + + explicit Const(std::shared_ptr n) : m_node(n) { + m_cached_type = m_node->get_element_type(); + m_cached_shape = m_node->get_shape(); + m_cached_ptr = m_node->get_data_ptr(); + } std::size_t hash() const { - std::size_t seed = std::hash()(node->get_data_ptr()) + 0x9e3779b9; - seed ^= 
node->get_element_type().hash() + 0x9e3779b9; - for (const auto& dim : node->get_shape()) { + std::size_t seed = std::hash()(m_cached_ptr) + 0x9e3779b9; + seed ^= m_cached_type.hash() + 0x9e3779b9; + for (const auto& dim : m_cached_shape) { seed ^= std::hash()(dim) + 0x9e3779b9; } return seed; } bool operator==(const Const& other) const { - return (node->get_shape() == other.node->get_shape() && - node->get_element_type() == other.node->get_element_type() && - node->get_data_ptr() == other.node->get_data_ptr()); + return (m_cached_type == other.m_cached_type && m_cached_shape == other.m_cached_shape && + m_cached_ptr == other.m_cached_ptr); } ov::Tensor eval() const { - return ov::npuw::util::tensor_from_const(node); + NPUW_ASSERT(m_node && "Const::eval() can only happen before detach"); + return ov::npuw::util::tensor_from_const(m_node); + } + void detach() { + m_node.reset(); } }; struct Concat { @@ -59,6 +70,11 @@ struct Concat { } return ov::npuw::util::concat(to_concat, axis); } + void detach() { + for (auto&& lt : tensors) { + lt.detach(); + } + } }; struct Unpack { @@ -95,6 +111,11 @@ struct Unpack { } return dst; } + void detach() { + w.detach(); + z.detach(); + s.detach(); + } }; struct Permute { LazyTensor tensor; @@ -113,6 +134,9 @@ struct Permute { ov::Tensor eval() const { return ov::npuw::util::permute(tensor.eval(), axes); } + void detach() { + tensor.detach(); + } }; struct Convert { LazyTensor tensor; @@ -130,6 +154,9 @@ struct Convert { NPUW_ASSERT(ov::element::f16 == type); return ov::npuw::util::to_f16(tensor.eval()); } + void detach() { + tensor.detach(); + } }; } // namespace op @@ -137,16 +164,16 @@ using Transform = std::variant overloaded(Ts...) -> overloaded; -std::size_t LazyTensorImpl::get_hash() const { - // Already calculated - if (m_hash != 0) { - return m_hash; - } - - // Get hash - std::size_t seed = 0; - std::visit(overloaded{[&seed](const auto& op) { - seed ^= op.hash(); - }}, - m_transform); - - return seed; -} - -LazyTensorImpl::LazyTensorImpl(Transform&& t) { - m_transform = std::move(t); - m_hash = get_hash(); -} +LazyTensorImpl::LazyTensorImpl(Transform&& t) + : m_transform(std::move(t)), + m_hash(std::visit(overloaded{[](const auto& op) { + return op.hash(); + }}, + m_transform)) {} bool LazyTensorImpl::operator==(const LazyTensorImpl& other) const { return m_hash == other.m_hash && m_transform == other.m_transform; @@ -200,17 +213,25 @@ ov::Tensor LazyTensorImpl::eval() const { some kind of indicator that the only difference is concat and we should look for an existing ov::Tensor. Perhaps it should be done after model compilation and not handled here. 
*/ + return std::visit(overloaded{[](const auto& op) { + return op.eval(); + }}, + m_transform); +} + +std::size_t LazyTensorImpl::get_hash() const { + return m_hash; +} - ov::Tensor result = std::visit(overloaded{[](const auto& op) { - return op.eval(); - }}, - m_transform); - NPUW_ASSERT(result); - return result; +void LazyTensorImpl::detach() { + std::visit(overloaded{[](auto& op) { + op.detach(); + }}, + m_transform); } LazyTensor::LazyTensor(const std::shared_ptr& const_ptr) - : m_impl(std::make_shared(op::Const{const_ptr})) {} + : m_impl(std::make_shared(op::Const(const_ptr))) {} LazyTensor::LazyTensor(const std::vector& to_concat, const std::size_t axis) : m_impl(std::make_shared(op::Concat{to_concat, axis})) {} LazyTensor::LazyTensor(const LazyTensor& cw, @@ -233,11 +254,17 @@ LazyTensor LazyTensor::convert(const ov::element::Type& type) { } bool LazyTensor::operator==(const LazyTensor& other) const { + if (!m_impl && !other.m_impl) { + return true; + } + if ((!m_impl && other.m_impl) || (m_impl && !other.m_impl)) { + return false; + } return *m_impl.get() == *other.m_impl.get(); } bool LazyTensor::operator!=(const LazyTensor& other) const { - return !(*m_impl.get() == *other.m_impl.get()); + return !(*this == other); } ov::Tensor LazyTensor::eval() const { @@ -254,6 +281,12 @@ std::size_t LazyTensor::get_hash() const { return m_impl->get_hash(); } +void LazyTensor::detach() { + if (m_impl) { + m_impl->detach(); + } +} + std::size_t LazyTensor::Hash::operator()(const LazyTensor& lt) const { return lt.get_hash(); } diff --git a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp index 365d9d636872b8..840e22dcedad83 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp @@ -39,8 +39,8 @@ class LazyTensor { bool operator!=(const LazyTensor& other) const; ov::Tensor eval() const; - std::size_t get_hash() const; + void detach(); private: std::shared_ptr m_impl = nullptr; diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp index 2b2878481f1330..3e712574606679 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp @@ -23,7 +23,7 @@ using ov::npuw::online::detail::isOp; Group::Group(const std::shared_ptr& node, size_t gid, own::ade::NodeHandle nh, - const std::shared_ptr& g, + const std::weak_ptr& g, const std::weak_ptr& snapshot) : m_nh(std::move(nh)), m_id(gid), @@ -36,7 +36,7 @@ Group::Group(const std::shared_ptr& node, Group::Group(size_t gid, own::ade::NodeHandle nh, - const std::shared_ptr& g, + const std::weak_ptr& g, const std::weak_ptr& snapshot) : m_nh(std::move(nh)), m_id(gid), @@ -214,14 +214,16 @@ void Group::relinkGraph(const Group::GPtr& gptr_other) { auto consumers = gptr_other->dstNodes(); // Remove gptr_other node from the graph. 
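
The `LazyTensor` changes above follow one pattern: each transform caches whatever it needs for hashing and equality (element type, shape, data pointer) at construction time, the hash is computed once in the `LazyTensorImpl` constructor, and `detach()` drops the reference to the original constant so its memory can be released once the weight has been evaluated and allocated on the device. A small self-contained sketch of that idea, using a plain `std::string` payload instead of OpenVINO constants (the names here are illustrative, not the plugin's API):

    #include <cstddef>
    #include <functional>
    #include <iostream>
    #include <memory>
    #include <stdexcept>
    #include <string>

    // A lazily evaluated value that stays hashable/comparable after detach().
    class LazyValue {
    public:
        explicit LazyValue(std::shared_ptr<std::string> source)
            : m_source(std::move(source)),
              // Cache everything needed for identity while the source is still alive.
              m_cached_size(m_source->size()),
              m_cached_ptr(m_source->data()),
              m_hash(std::hash<const void*>()(m_cached_ptr) ^ (m_cached_size + 0x9e3779b9)) {}

        std::size_t hash() const { return m_hash; }

        bool operator==(const LazyValue& other) const {
            // Compares cached identity only; the pointer is never dereferenced here.
            return m_cached_ptr == other.m_cached_ptr && m_cached_size == other.m_cached_size;
        }

        std::string eval() const {
            // Evaluation is only legal before detach(), mirroring Const::eval().
            if (!m_source) throw std::runtime_error("eval() after detach()");
            return *m_source;
        }

        void detach() { m_source.reset(); }  // release the heavy payload, keep identity

    private:
        std::shared_ptr<std::string> m_source;
        std::size_t m_cached_size;
        const void* m_cached_ptr;
        std::size_t m_hash;
    };

    int main() {
        auto src = std::make_shared<std::string>("weights");
        LazyValue lv(src);
        std::cout << lv.eval() << " hash=" << lv.hash() << "\n";
        lv.detach();                     // the payload can now be freed
        std::cout << lv.hash() << "\n";  // hash and equality still work after detach
    }
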
Note: also removes all it's edges - m_graph->remove(gptr_other->getHandle()); + auto&& graph = m_graph.lock(); + NPUW_ASSERT(graph); + graph->remove(gptr_other->getHandle()); for (const auto& nh : producers) { if (m_nh == nh) { continue; } // relink the graph - if (!m_graph->linked(nh, m_nh)) { - m_graph->link(nh, m_nh); + if (!graph->linked(nh, m_nh)) { + graph->link(nh, m_nh); } } for (const auto& nh : consumers) { @@ -229,8 +231,8 @@ void Group::relinkGraph(const Group::GPtr& gptr_other) { continue; } // relink the graph - if (!m_graph->linked(m_nh, nh)) { - m_graph->link(m_nh, nh); + if (!graph->linked(m_nh, nh)) { + graph->link(m_nh, nh); } } } diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.hpp index 17527033173a82..1d354542e135a8 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.hpp @@ -33,11 +33,11 @@ class Group : public std::enable_shared_from_this { Group(const std::shared_ptr& node, size_t gid, own::ade::NodeHandle nh, - const std::shared_ptr& g, + const std::weak_ptr& g, const std::weak_ptr& snapshot); Group(size_t gid, own::ade::NodeHandle nh, - const std::shared_ptr& g, + const std::weak_ptr& g, const std::weak_ptr& snapshot); // After we formed a final structure of partitioning, @@ -100,7 +100,7 @@ class Group : public std::enable_shared_from_this { own::ade::NodeHandle m_nh; size_t m_id; // used for utility prints only - std::shared_ptr m_graph; + std::weak_ptr m_graph; std::weak_ptr m_snapshot; bool m_frozen = false; bool m_nofold = false; diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.hpp b/src/plugins/intel_npu/src/plugin/npuw/util.hpp index 7a942f0b6c6351..616aff53128292 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/util.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/util.hpp @@ -141,6 +141,14 @@ Impl _(std::shared_ptr pM) { } // namespace at +// Written here to be a drop-in replacement for ov::parallel_for for the debug purposes +template +void non_parallel_for(std::size_t count, F&& f) { + for (std::size_t idx = 0u; idx < count; idx++) { + f(idx); + } +} + } // namespace util } // namespace npuw } // namespace ov diff --git a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp index 51cf76020d81a1..2b4be1a759c17c 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp @@ -40,16 +40,22 @@ ov::Tensor Bank::get(const LazyTensor& tensor, const std::string& device) { std::lock_guard guard(m_mutex); - auto& device_bank = m_device_bank[device_for_alloc]; - auto iter_device = device_bank.find(tensor); + auto& device_bank = m_device_banks[device_for_alloc]; - if (iter_device != device_bank.end() && iter_device->second) { + std::unique_lock dev_guard(device_bank.mutex); + auto iter_device = device_bank.storage.find(tensor); + + if (iter_device != device_bank.storage.end() && iter_device->second) { // Already allocated + // tensor (the key) may be coming from a 2nd (3rd, ...) 
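
Switching `Group::m_graph` from `shared_ptr` to `weak_ptr` breaks the ownership cycle between groups and the partitioning graph, so every use now has to lock the weak pointer first, as `relinkGraph()` does above. The pattern in isolation, with illustrative types only:

    #include <cassert>
    #include <iostream>
    #include <memory>

    struct Graph {
        void remove(int node) { std::cout << "remove node " << node << "\n"; }
    };

    struct Group {
        std::weak_ptr<Graph> m_graph;  // non-owning back-reference, breaks the cycle

        void relink(int node) {
            auto graph = m_graph.lock();  // promote to shared_ptr for the duration of the call
            assert(graph && "graph must outlive the group while partitioning runs");
            graph->remove(node);
        }
    };

    int main() {
        auto graph = std::make_shared<Graph>();
        Group g{graph};
        g.relink(42);
    }
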
model + // detach it here just in case + const_cast(tensor).detach(); return iter_device->second; } + dev_guard.unlock(); // Allocation and evaluation needed - return unsafe_eval_and_alloc(tensor, device_for_alloc); + return eval_and_alloc(tensor, device_bank, device_for_alloc); } void Bank::registerLT(const LazyTensor& tensor, const std::string& device) { @@ -57,64 +63,75 @@ void Bank::registerLT(const LazyTensor& tensor, const std::string& device) { std::lock_guard guard(m_mutex); - auto& device_bank = m_device_bank[device_for_alloc]; - if (device_bank.find(tensor) == device_bank.end()) { - device_bank[tensor] = ov::Tensor(); + auto& device_bank = m_device_banks[device_for_alloc]; + if (device_bank.storage.find(tensor) == device_bank.storage.end()) { + device_bank.storage[tensor] = ov::Tensor(); } } void Bank::evaluate_and_allocate() { std::lock_guard guard(m_mutex); - for (auto&& bank : m_device_bank) { + for (auto&& bank : m_device_banks) { const auto& device_for_alloc = bank.first; auto& device_bank = bank.second; + std::vector vec; - for (const auto& el : device_bank) { + vec.reserve(device_bank.storage.size()); + for (const auto& el : device_bank.storage) { vec.push_back(el.first); } ov::parallel_for(vec.size(), [&](std::size_t idx) { const auto& lt = vec[idx]; - auto iter_device = device_bank.find(lt); - if (iter_device != device_bank.end() && iter_device->second) { + std::unique_lock dev_guard(device_bank.mutex); + auto iter_device = device_bank.storage.find(lt); + if (iter_device != device_bank.storage.end() && iter_device->second) { // Already allocated return; } + dev_guard.unlock(); // Allocation and evaluation needed - unsafe_eval_and_alloc(lt, device_for_alloc); + eval_and_alloc(lt, device_bank, device_for_alloc); }); } } -ov::Tensor Bank::unsafe_eval_and_alloc(const LazyTensor& tensor, const std::string& device_for_alloc) { - // Note: private method used inside other methods with already locked mutex +ov::Tensor Bank::eval_and_alloc(const LazyTensor& tensor, + Bank::DeviceBank& dbank, + const std::string& device_for_alloc) { + // Evaluate concurrently (see evaluate_and_allocate), lock the device + // mutex only to update the device bank (& allocate on-device memory, if needed) const auto& transformed_tensor = tensor.eval(); + + std::unique_lock guard(dbank.mutex); if (device_for_alloc == "CPU") { - m_device_bank[device_for_alloc][tensor] = transformed_tensor; + dbank.storage[tensor] = transformed_tensor; return transformed_tensor; } + // Non-CPU case: detach the evaluated LazyTensor from its memory + const_cast(tensor).detach(); + ov::SoPtr remote_tensor; ov::Tensor allocated_tensor; - { - // FIXME: L0 allocation may crash when run in parallel - std::lock_guard guard(m_alloc_mutex); - m_remote_ctx = m_core->get_default_context(device_for_alloc)._ptr; - remote_tensor = - m_remote_ctx->create_host_tensor(transformed_tensor.get_element_type(), transformed_tensor.get_shape()); - allocated_tensor = ov::make_tensor(remote_tensor); - } + + auto remote_ctx = m_core->get_default_context(device_for_alloc)._ptr; + remote_tensor = + remote_ctx->create_host_tensor(transformed_tensor.get_element_type(), transformed_tensor.get_shape()); + allocated_tensor = ov::make_tensor(remote_tensor); + dbank.storage[tensor] = allocated_tensor; + guard.unlock(); // Unlock the guard, map update is done - copy can continue in parallel + transformed_tensor.copy_to(allocated_tensor); - m_device_bank[device_for_alloc][tensor] = allocated_tensor; return allocated_tensor; } bool Bank::is_remote(const 
LazyTensor& tensor) const { // FIXME: make generic - auto npu_bank = m_device_bank.find("NPU"); - if (npu_bank != m_device_bank.end() && npu_bank->second.find(tensor) != npu_bank->second.end()) { - // Found in NPU bank + auto npu_bank = m_device_banks.find("NPU"); + if (npu_bank != m_device_banks.end() && npu_bank->second.storage.find(tensor) != npu_bank->second.storage.end()) { + // Found in NPU bank so considered remote (utterly wrong for the generic case) return true; } return false; diff --git a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp index b9d8d21143c851..491e962a58b438 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp @@ -35,13 +35,17 @@ class Bank { bool is_remote(const LazyTensor& tensor) const; private: - ov::Tensor unsafe_eval_and_alloc(const LazyTensor& tensor, const std::string& device); // Bank for specified device and their allocated memory - std::unordered_map> m_device_bank; + struct DeviceBank { + std::unordered_map storage; + std::mutex mutex; + }; + std::unordered_map m_device_banks; + + ov::Tensor eval_and_alloc(const LazyTensor& tensor, DeviceBank& dbank, const std::string& device); + std::mutex m_mutex; - std::mutex m_alloc_mutex; std::shared_ptr m_core = nullptr; - std::shared_ptr m_remote_ctx = nullptr; std::string m_alloc_device; }; diff --git a/src/plugins/intel_npu/src/plugin/src/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/src/compiled_model.cpp index 4baf15d76718a8..4e86d32d2f72b1 100644 --- a/src/plugins/intel_npu/src/plugin/src/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/src/compiled_model.cpp @@ -311,6 +311,12 @@ void CompiledModel::initialize_properties() { [](const Config& config) { return config.getString(); }}}, + {ov::intel_npu::run_inferences_sequentially.name(), + {false, + ov::PropertyMutability::RO, + [](const Config& config) { + return config.get(); + }}}, }; for (auto& property : _properties) { diff --git a/src/plugins/intel_npu/src/plugin/src/plugin.cpp b/src/plugins/intel_npu/src/plugin/src/plugin.cpp index 9f77d952fd813b..18a96bff02fb80 100644 --- a/src/plugins/intel_npu/src/plugin/src/plugin.cpp +++ b/src/plugins/intel_npu/src/plugin/src/plugin.cpp @@ -568,6 +568,12 @@ Plugin::Plugin() [](const Config& config) { return config.getString(); }}}, + {ov::intel_npu::run_inferences_sequentially.name(), + {false, + ov::PropertyMutability::RW, + [](const Config& config) { + return config.get(); + }}}, {ov::intel_npu::batch_mode.name(), {false, ov::PropertyMutability::RW, [](const Config& config) { return config.getString(); }}}}; diff --git a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_api.hpp b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_api.hpp index 6cb9e23d203c11..1e1b50fb925916 100644 --- a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_api.hpp +++ b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_api.hpp @@ -4,6 +4,7 @@ #pragma once +#include #include #include @@ -57,12 +58,14 @@ namespace intel_npu { symbol_statement(zeMemAllocDevice) \ symbol_statement(zeMemAllocHost) \ symbol_statement(zeMemFree) \ - symbol_statement(zeMemGetAllocProperties) + symbol_statement(zeMemGetAllocProperties) \ + symbol_statement(zelLoaderGetVersions) //unsupported symbols with older ze_loader versions #define weak_symbols_list() \ symbol_statement(zeCommandListGetNextCommandIdExp) \ - 
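
The weights-bank rework above replaces one global allocation mutex with a per-device `DeviceBank { storage, mutex }`, and deliberately evaluates the tensor outside the lock so that the `ov::parallel_for` in `evaluate_and_allocate()` can run evaluations concurrently; the device mutex is only held to look up and to publish results. A condensed sketch of that locking discipline with standard containers (the names below are stand-ins, not the plugin's API):

    #include <iostream>
    #include <mutex>
    #include <string>
    #include <unordered_map>

    struct DeviceBank {
        std::unordered_map<std::string, std::string> storage;  // key -> allocated value
        std::mutex mutex;
    };

    // Expensive work happens outside the lock; the lock only guards the map.
    // Two threads may occasionally evaluate the same key; the second publish
    // simply overwrites with an identical result, which is tolerated.
    std::string get_or_create(DeviceBank& bank, const std::string& key) {
        {
            std::unique_lock<std::mutex> guard(bank.mutex);
            auto it = bank.storage.find(key);
            if (it != bank.storage.end() && !it->second.empty()) {
                return it->second;  // already allocated
            }
        }  // unlock before the expensive evaluation

        std::string evaluated = "evaluated(" + key + ")";  // stand-in for LazyTensor::eval()

        std::unique_lock<std::mutex> guard(bank.mutex);
        bank.storage[key] = evaluated;  // publish under the lock
        guard.unlock();                 // any follow-up copy could continue unlocked
        return evaluated;
    }

    int main() {
        DeviceBank bank;
        std::cout << get_or_create(bank, "w0") << "\n";
        std::cout << get_or_create(bank, "w0") << "\n";  // second call hits the cache
    }
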
symbol_statement(zeCommandListUpdateMutableCommandsExp) + symbol_statement(zeCommandListUpdateMutableCommandsExp) \ + symbol_statement(zeInitDrivers) // clang-format on class ZeroApi { diff --git a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_init.hpp b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_init.hpp index 01b2de868e7572..25ceb018cdc243 100644 --- a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_init.hpp +++ b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_init.hpp @@ -67,6 +67,8 @@ class ZeroInitStructsHolder final { } private: + void initNpuDriver(); + static const ze_driver_uuid_t uuid; Logger log; diff --git a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_utils.hpp b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_utils.hpp index 8883bb99dd178e..0df0c5d66169a4 100644 --- a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_utils.hpp +++ b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_utils.hpp @@ -188,7 +188,7 @@ static inline uint32_t findGroupOrdinal(ze_device_handle_t device_handle, const "zeDeviceGetCommandQueueGroupProperties", zeDeviceGetCommandQueueGroupProperties(device_handle, &command_queue_group_count, nullptr)); - log.debug("ZeroDevice::ZeroDevice - resize command_queue_group_count"); + log.debug("zero_utils::findGroupOrdinal - resize command_queue_group_count"); command_group_properties.resize(command_queue_group_count); for (auto& prop : command_group_properties) { diff --git a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_wrappers.hpp b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_wrappers.hpp index 9b5b1b4540fbe7..61999376680e90 100644 --- a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_wrappers.hpp +++ b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_wrappers.hpp @@ -37,7 +37,7 @@ class EventPool { class Event { public: Event() = delete; - Event(const ze_event_pool_handle_t& event_pool, uint32_t event_index); + Event(const std::shared_ptr& event_pool, uint32_t event_index); Event(const Event&) = delete; Event(Event&&) = delete; Event& operator=(const Event&) = delete; @@ -51,6 +51,7 @@ class Event { ~Event(); private: + std::shared_ptr _event_pool; ze_event_handle_t _handle = nullptr; Logger _log; diff --git a/src/plugins/intel_npu/src/utils/src/zero/zero_init.cpp b/src/plugins/intel_npu/src/utils/src/zero/zero_init.cpp index e87f8db788b9b8..b069bd64244142 100644 --- a/src/plugins/intel_npu/src/utils/src/zero/zero_init.cpp +++ b/src/plugins/intel_npu/src/utils/src/zero/zero_init.cpp @@ -4,6 +4,7 @@ #include "intel_npu/utils/zero/zero_init.hpp" +#include #include #include @@ -53,30 +54,93 @@ static std::tuple queryDriverExtensionVersion( return std::make_tuple(targetVersion, functionExtName ? 
functionExtName : ""); } -ZeroInitStructsHolder::ZeroInitStructsHolder() : log("NPUZeroInitStructsHolder", Logger::global().level()) { - log.debug("ZeroInitStructsHolder - performing zeInit on VPU only"); - THROW_ON_FAIL_FOR_LEVELZERO("zeInit", zeInit(ZE_INIT_FLAG_VPU_ONLY)); - - uint32_t drivers = 0; - THROW_ON_FAIL_FOR_LEVELZERO("zeDriverGet", zeDriverGet(&drivers, nullptr)); +void ZeroInitStructsHolder::initNpuDriver() { + auto setNpuDriver = [&](uint32_t drivers_count, std::vector all_drivers) { + driver_properties.stype = ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES; + log.debug("ZeroInitStructsHolder::initNpuDriver - setting driver properties to " + "ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES"); + for (uint32_t i = 0; i < drivers_count; ++i) { + zeDriverGetProperties(all_drivers[i], &driver_properties); + + if (memcmp(&driver_properties.uuid, &uuid, sizeof(uuid)) == 0) { + driver_handle = all_drivers[i]; + break; + } + } + if (driver_handle == nullptr) { + OPENVINO_THROW("NPU driver wasn't found!"); + } + }; + + auto fallbackToZeDriverGet = [&]() { + log.debug("ZeroInitStructsHolder - zeInitDrivers not supported, fallback to zeDriverGet"); + + uint32_t drivers_count = 0; + THROW_ON_FAIL_FOR_LEVELZERO("zeDriverGet", zeDriverGet(&drivers_count, nullptr)); + + std::vector all_drivers(drivers_count); + THROW_ON_FAIL_FOR_LEVELZERO("zeDriverGet", zeDriverGet(&drivers_count, all_drivers.data())); + + // Get our target driver + setNpuDriver(drivers_count, std::move(all_drivers)); + }; + + zel_version_t loader_version = {}; + size_t num_components; + auto result = zelLoaderGetVersions(&num_components, nullptr); + if (result == ZE_RESULT_SUCCESS) { + zel_component_version_t* versions = new zel_component_version_t[num_components]; + result = zelLoaderGetVersions(&num_components, versions); + + if (result == ZE_RESULT_SUCCESS) { + for (size_t i = 0; i < num_components; ++i) { + if (strncmp(versions[i].component_name, "loader", strlen("loader")) == 0) { + loader_version = versions[i].component_lib_version; + + log.debug("ZeroInitStructsHolder - ze_loader.dll version: %d.%d.%d", + loader_version.major, + loader_version.minor, + loader_version.patch); + } + } + } - std::vector all_drivers(drivers); - THROW_ON_FAIL_FOR_LEVELZERO("zeDriverGet", zeDriverGet(&drivers, all_drivers.data())); + delete[] versions; + } - // Get our target driver - driver_properties.stype = ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES; - log.debug("ZeroInitStructsHolder - setting driver properties to ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES"); - for (uint32_t i = 0; i < drivers; ++i) { - zeDriverGetProperties(all_drivers[i], &driver_properties); + if (loader_version.major > 1 || (loader_version.major == 1 && loader_version.minor > 18) || + (loader_version.major == 1 && loader_version.minor == 18 && loader_version.patch >= 5)) { + uint32_t drivers_count = 0; + ze_init_driver_type_desc_t desc = {}; + desc.flags = ZE_INIT_DRIVER_TYPE_FLAG_NPU; + auto result = zeInitDrivers(&drivers_count, nullptr, &desc); + if (result != ZE_RESULT_SUCCESS) { + fallbackToZeDriverGet(); + return; + } - if (memcmp(&driver_properties.uuid, &uuid, sizeof(uuid)) == 0) { - driver_handle = all_drivers[i]; - break; + std::vector all_drivers(drivers_count); + result = zeInitDrivers(&drivers_count, all_drivers.data(), &desc); + if (result != ZE_RESULT_SUCCESS) { + fallbackToZeDriverGet(); + return; } + + // Get our target driver + setNpuDriver(drivers_count, std::move(all_drivers)); + + return; } - if (driver_handle == nullptr) { - OPENVINO_THROW("zeDriverGet failed to return NPU 
driver"); - } + + fallbackToZeDriverGet(); +} + +ZeroInitStructsHolder::ZeroInitStructsHolder() : log("NPUZeroInitStructsHolder", Logger::global().level()) { + log.debug("ZeroInitStructsHolder - performing zeInit on NPU only"); + THROW_ON_FAIL_FOR_LEVELZERO("zeInit", zeInit(ZE_INIT_FLAG_VPU_ONLY)); + + log.debug("ZeroInitStructsHolder - initialize NPU Driver"); + initNpuDriver(); // Check L0 API version THROW_ON_FAIL_FOR_LEVELZERO("zeDriverGetApiVersion", zeDriverGetApiVersion(driver_handle, &ze_drv_api_version)); diff --git a/src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp b/src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp index 858e65d4b5e6ee..d95b0e172a7d64 100644 --- a/src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp +++ b/src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp @@ -24,9 +24,11 @@ EventPool::~EventPool() { } } -Event::Event(const ze_event_pool_handle_t& event_pool, uint32_t event_index) : _log("Event", Logger::global().level()) { +Event::Event(const std::shared_ptr& event_pool, uint32_t event_index) + : _event_pool(event_pool), + _log("Event", Logger::global().level()) { ze_event_desc_t event_desc = {ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr, event_index, 0, 0}; - THROW_ON_FAIL_FOR_LEVELZERO("zeEventCreate", zeEventCreate(event_pool, &event_desc, &_handle)); + THROW_ON_FAIL_FOR_LEVELZERO("zeEventCreate", zeEventCreate(_event_pool->handle(), &event_desc, &_handle)); } void Event::AppendSignalEvent(CommandList& command_list) const { THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListAppendSignalEvent", diff --git a/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.cpp b/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.cpp index 5d023fe9d0bee6..e4a49ce9b7ccdb 100644 --- a/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.cpp +++ b/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.cpp @@ -19,6 +19,12 @@ INSTANTIATE_TEST_SUITE_P(compatibility_smoke_BehaviorTest, ::testing::ValuesIn(configsInferRequestRunTests)), InferRequestRunTests::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTest, + RunSeqTests, + ::testing::Combine(::testing::Values(ov::test::utils::DEVICE_NPU), + ::testing::ValuesIn(configsInferRequestRunTests)), + InferRequestRunTests::getTestCaseName); + const std::vector batchingConfigs = { {ov::log::level(ov::log::Level::WARNING), ov::intel_npu::batch_mode(ov::intel_npu::BatchMode::PLUGIN)}, {ov::log::level(ov::log::Level::WARNING), ov::intel_npu::batch_mode(ov::intel_npu::BatchMode::COMPILER)}, @@ -29,3 +35,9 @@ INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTest, ::testing::Combine(::testing::Values(ov::test::utils::DEVICE_NPU), ::testing::ValuesIn(batchingConfigs)), InferRequestRunTests::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTest, + BatchingRunSeqTests, + ::testing::Combine(::testing::Values(ov::test::utils::DEVICE_NPU), + ::testing::ValuesIn(batchingConfigs)), + InferRequestRunTests::getTestCaseName); diff --git a/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp b/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp index 20be5ed25edd27..07466677b9d547 100644 --- a/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp +++ b/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp @@ -103,9 +103,7 @@ class InferRequestRunTests : public ov::test::behavior::OVPluginTestBase, APIBaseTest::TearDown(); } - std::shared_ptr createBatchingModel(element::Type type, - const PartialShape& shape, - 
const ov::Layout& layout) { + std::shared_ptr createModel(element::Type type, const PartialShape& shape, const ov::Layout& layout) { ResultVector res; ParameterVector params; @@ -352,7 +350,7 @@ TEST_P(BatchingRunTests, CheckBatchingSupportInfer) { ov::InferRequest inference_request; auto batch_shape = Shape{4, 2, 32, 32}; - std::shared_ptr ov_model_batch = createBatchingModel(element::f32, batch_shape, "N..."); + std::shared_ptr ov_model_batch = createModel(element::f32, batch_shape, "N..."); OV_ASSERT_NO_THROW(compiled_model = core->compile_model(ov_model_batch, target_device, configuration)); OV_ASSERT_NO_THROW(inference_request = compiled_model.create_infer_request()); @@ -365,7 +363,7 @@ TEST_P(BatchingRunTests, CheckBatchingSupportAsync) { ov::InferRequest inference_request; auto batch_shape = Shape{4, 2, 32, 32}; - std::shared_ptr ov_model_batch = createBatchingModel(element::f32, batch_shape, "N..."); + std::shared_ptr ov_model_batch = createModel(element::f32, batch_shape, "N..."); OV_ASSERT_NO_THROW(compiled_model = core->compile_model(ov_model_batch, target_device, configuration)); OV_ASSERT_NO_THROW(inference_request = compiled_model.create_infer_request()); @@ -396,7 +394,7 @@ TEST_P(BatchingRunTests, UseCompilerBatchingErrorPluginBatching) { TEST_P(BatchingRunTests, SetInputTensorInfer) { auto batch_shape = Shape{4, 2, 2, 2}; auto shape_size = ov::shape_size(batch_shape); - auto model = createBatchingModel(element::f32, batch_shape, "N..."); + auto model = createModel(element::f32, batch_shape, "N..."); float* buffer = new float[shape_size]; compiled_model = core->compile_model(model, target_device, configuration); @@ -422,7 +420,7 @@ TEST_P(BatchingRunTests, SetInputTensorInfer) { TEST_P(BatchingRunTests, SetInputTensorAsync) { auto batch_shape = Shape{4, 2, 2, 2}; auto shape_size = ov::shape_size(batch_shape); - auto model = createBatchingModel(element::f32, batch_shape, "N..."); + auto model = createModel(element::f32, batch_shape, "N..."); float* buffer = new float[shape_size]; compiled_model = core->compile_model(model, target_device, configuration); @@ -449,7 +447,7 @@ TEST_P(BatchingRunTests, SetInputTensorAsync) { TEST_P(BatchingRunTests, SetInputTensorInfer_Caching) { auto batch_shape = Shape{4, 2, 2, 2}; auto shape_size = ov::shape_size(batch_shape); - auto model = createBatchingModel(element::f32, batch_shape, "N..."); + auto model = createModel(element::f32, batch_shape, "N..."); float* buffer = new float[shape_size]; m_cache_dir = generateCacheDirName(GetTestName()); @@ -480,7 +478,7 @@ TEST_P(BatchingRunTests, SetInputTensorInfer_Caching) { TEST_P(BatchingRunTests, CheckTwoRunsInfer) { auto batch_shape = Shape{4, 2, 2, 2}; auto shape_size = ov::shape_size(batch_shape); - auto model = createBatchingModel(element::f32, batch_shape, "N..."); + auto model = createModel(element::f32, batch_shape, "N..."); float* buffer = new float[shape_size]; auto context = core->get_default_context(target_device); @@ -524,6 +522,250 @@ TEST_P(BatchingRunTests, CheckTwoRunsInfer) { delete[] buffer; } +using RunSeqTests = InferRequestRunTests; + +TEST_P(RunSeqTests, CheckMultipleRunsSeq0) { + auto shape = Shape{1, 64, 64, 256}; + auto shape_size = ov::shape_size(shape); + auto model = createModel(element::f32, shape, "N..."); + + auto context = core->get_default_context(target_device); + + configuration[ov::intel_npu::run_inferences_sequentially.name()] = true; + configuration[ov::intel_npu::tiles.name()] = 2; + compiled_model = core->compile_model(model, target_device, 
configuration); + + const uint32_t inferences = 32; + std::array inference_request; + ov::Tensor input_tensor; + std::array output_tensor; + + input_tensor = context.create_host_tensor(ov::element::f32, shape); + for (uint32_t i = 0; i < inferences; i++) { + inference_request[i] = compiled_model.create_infer_request(); + output_tensor[i] = context.create_host_tensor(ov::element::f32, shape); + } + + inference_request[0].set_input_tensor(input_tensor); + inference_request[0].set_output_tensor(output_tensor[0]); + + const uint32_t runs = 10; + for (uint32_t z = 0; z < runs; z++) { + auto* input_data = reinterpret_cast(input_tensor.data()); + for (size_t i = 0; i < shape_size; ++i) { + input_data[i] = static_cast(z); + } + + inference_request[0].start_async(); // Adds '1' to each element + + for (uint32_t i = 1; i < inferences; i++) { + inference_request[i].set_input_tensor(output_tensor[i - 1]); + inference_request[i].set_output_tensor(output_tensor[i]); + + inference_request[i].start_async(); // Adds '1' to each element + } + + inference_request[inferences - 1].wait(); + + float expected_result = static_cast(z) + 1.f; + + for (uint32_t i = 0; i < inferences; i++) { + auto* output_tensor_data = reinterpret_cast(output_tensor[i].data()); + for (size_t j = 0; j < shape_size; ++j) { + EXPECT_NEAR(output_tensor_data[j], expected_result, 1e-5) + << "Run=" << z << "Output=" << i << " Expected=" << expected_result + << ", actual=" << output_tensor_data[j] << " for index " << j; + } + expected_result++; + } + } +} + +TEST_P(RunSeqTests, CheckMultipleRunsSeq1) { + auto shape = Shape{1, 64, 64, 256}; + auto shape_size = ov::shape_size(shape); + auto model = createModel(element::f32, shape, "N..."); + + auto context = core->get_default_context(target_device); + + configuration[ov::intel_npu::run_inferences_sequentially.name()] = true; + configuration[ov::intel_npu::tiles.name()] = 2; + compiled_model = core->compile_model(model, target_device, configuration); + + const int inferences = 32; + std::array inference_request; + ov::Tensor input_tensor; + std::array output_tensor; + + input_tensor = context.create_host_tensor(ov::element::f32, shape); + + for (int i = 0; i < inferences; i++) { + inference_request[i] = compiled_model.create_infer_request(); + output_tensor[i] = context.create_host_tensor(ov::element::f32, shape); + } + + inference_request[inferences - 1].set_input_tensor(input_tensor); + inference_request[inferences - 1].set_output_tensor(output_tensor[inferences - 1]); + + const int runs = 10; + for (int z = 0; z < runs; z++) { + auto* input_data = reinterpret_cast(input_tensor.data()); + for (size_t i = 0; i < shape_size; ++i) { + input_data[i] = static_cast(z); + } + + inference_request[inferences - 1].start_async(); // Adds '1' to each element + + for (int i = inferences - 2; i >= 0; i--) { + inference_request[i].set_input_tensor(output_tensor[i + 1]); + inference_request[i].set_output_tensor(output_tensor[i]); + + inference_request[i].start_async(); // Adds '1' to each element + } + + inference_request[0].wait(); + + float expected_result = static_cast(z) + 1.f; + + for (int i = inferences - 1; i >= 0; i--) { + auto* output_tensor_data = reinterpret_cast(output_tensor[i].data()); + for (size_t j = 0; j < shape_size; ++j) { + EXPECT_NEAR(output_tensor_data[j], expected_result, 1e-5) + << "Run=" << z << "Output=" << i << " Expected=" << expected_result + << ", actual=" << output_tensor_data[j] << " for index " << j; + } + expected_result++; + } + } +} + +TEST_P(RunSeqTests, 
CheckMultipleRunsSeq2) {
+    auto shape = Shape{1, 64, 64, 256};
+    auto shape_size = ov::shape_size(shape);
+    auto model = createModel(element::f32, shape, "N...");
+
+    auto context = core->get_default_context(target_device);
+
+    configuration[ov::intel_npu::run_inferences_sequentially.name()] = true;
+    configuration[ov::intel_npu::tiles.name()] = 2;
+    compiled_model = core->compile_model(model, target_device, configuration);
+
+    const int inferences = 32;
+    std::array<ov::InferRequest, inferences> inference_request;
+    ov::Tensor input_tensor;
+    std::array<ov::Tensor, inferences> output_tensor;
+
+    input_tensor = context.create_host_tensor(ov::element::f32, shape);
+
+    for (int i = 0; i < inferences; i++) {
+        inference_request[i] = compiled_model.create_infer_request();
+        output_tensor[i] = context.create_host_tensor(ov::element::f32, shape);
+    }
+
+    inference_request[inferences - 1].set_input_tensor(input_tensor);
+    inference_request[inferences - 1].set_output_tensor(output_tensor[inferences - 1]);
+
+    auto* input_data = reinterpret_cast<float*>(input_tensor.data());
+    for (size_t i = 0; i < shape_size; ++i) {
+        input_data[i] = 1.f;
+    }
+
+    inference_request[inferences - 1].start_async();
+
+    for (int i = inferences - 2; i >= 0; i--) {
+        inference_request[i].set_input_tensor(output_tensor[i + 1]);
+        inference_request[i].set_output_tensor(output_tensor[i]);
+
+        inference_request[i].start_async();
+    }
+
+    inference_request[0].wait();
+
+    try {
+        inference_request[5].start_async();
+        inference_request[5].wait();
+    } catch (const std::exception& ex) {
+        ASSERT_FALSE(false) << ex.what();
+        return;
+    }
+
+    ASSERT_FALSE(true) << "An exception was expected, but none was thrown!";
+}
+
+TEST_P(RunSeqTests, CheckMultipleRunsSeq3) {
+    auto shape = Shape{1, 64, 64, 256};
+    auto model = createModel(element::f32, shape, "N...");
+
+    configuration[ov::intel_npu::run_inferences_sequentially.name()] = true;
+    configuration[ov::intel_npu::tiles.name()] = 2;
+    compiled_model = core->compile_model(model, target_device, configuration);
+    ov::InferRequest inference_request;
+    inference_request = compiled_model.create_infer_request();
+
+    OV_EXPECT_THROW(inference_request.infer(),
+                    ov::Exception,
+                    HasSubstr("Only start async is supported when RUN_INFERENCES_SEQUENTIALLY is enabled!"));
+}
+
+using BatchingRunSeqTests = InferRequestRunTests;
+
+TEST_P(BatchingRunSeqTests, CheckMultipleBatchingRunsSeq) {
+    auto shape = Shape{4, 2, 64, 64};
+    auto shape_size = ov::shape_size(shape);
+    auto model = createModel(element::f32, shape, "N...");
+
+    auto context = core->get_default_context(target_device);
+
+    configuration[ov::intel_npu::run_inferences_sequentially.name()] = true;
+    configuration[ov::intel_npu::tiles.name()] = 2;
+    compiled_model = core->compile_model(model, target_device, configuration);
+
+    const uint32_t inferences = 32;
+    std::array<ov::InferRequest, inferences> inference_request;
+    ov::Tensor input_tensor;
+    std::array<ov::Tensor, inferences> output_tensor;
+
+    input_tensor = context.create_host_tensor(ov::element::f32, shape);
+    for (uint32_t i = 0; i < inferences; i++) {
+        inference_request[i] = compiled_model.create_infer_request();
+        output_tensor[i] = context.create_host_tensor(ov::element::f32, shape);
+    }
+
+    inference_request[0].set_input_tensor(input_tensor);
+    inference_request[0].set_output_tensor(output_tensor[0]);
+
+    const uint32_t runs = 10;
+    for (uint32_t z = 0; z < runs; z++) {
+        auto* input_data = reinterpret_cast<float*>(input_tensor.data());
+        for (size_t i = 0; i < shape_size; ++i) {
+            input_data[i] = static_cast<float>(z);
+        }
+
+        inference_request[0].start_async();  // Adds '1' to each element
+
+        for (uint32_t i = 1; i < inferences; i++) {
+            inference_request[i].set_input_tensor(output_tensor[i - 1]);
+            inference_request[i].set_output_tensor(output_tensor[i]);
+
+            inference_request[i].start_async();  // Adds '1' to each element
+        }
+
+        inference_request[inferences - 1].wait();
+
+        float expected_result = static_cast<float>(z) + 1.f;
+
+        for (uint32_t i = 0; i < inferences; i++) {
+            auto* output_tensor_data = reinterpret_cast<float*>(output_tensor[i].data());
+            for (size_t j = 0; j < shape_size; ++j) {
+                EXPECT_NEAR(output_tensor_data[j], expected_result, 1e-5)
+                    << "Run=" << z << " Output=" << i << " Expected=" << expected_result
+                    << ", actual=" << output_tensor_data[j] << " for index " << j;
+            }
+            expected_result++;
+        }
+    }
+}
+
 }  // namespace behavior
 }  // namespace test
 }  // namespace ov
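
Taken together, the new `RUN_INFERENCES_SEQUENTIALLY` property and the tests above describe one usage model: create all infer requests up front, chain each request's output tensor into the next request's input, submit with `start_async()` only (plain `infer()` throws in this mode), and `wait()` on the last request in the chain. Below is a hedged usage sketch built on the public OpenVINO 2.0 C++ API; the property key string is taken from this diff, the model path and chain length are placeholders, and it assumes a model whose output shape matches its input shape, as in the tests above:

    #include <iostream>
    #include <vector>

    #include "openvino/openvino.hpp"

    int main() {
        ov::Core core;

        // "NPU_RUN_INFERENCES_SEQUENTIALLY" is the key introduced by this change;
        // "model.xml" is a placeholder path.
        auto compiled = core.compile_model("model.xml", "NPU",
                                           ov::AnyMap{{"NPU_RUN_INFERENCES_SEQUENTIALLY", true}});

        const size_t n = 4;  // arbitrary chain length for illustration
        std::vector<ov::InferRequest> requests;
        for (size_t i = 0; i < n; ++i) {
            requests.push_back(compiled.create_infer_request());
        }

        // Chain the requests: request i consumes the output tensor of request i - 1.
        for (size_t i = 1; i < n; ++i) {
            requests[i].set_input_tensor(requests[i - 1].get_output_tensor());
        }

        // Only asynchronous submission is allowed in this mode; submit in order...
        for (auto& rq : requests) {
            rq.start_async();
        }
        // ...and wait on the last request, which completes once the whole chain has run.
        requests.back().wait();

        std::cout << "chain finished\n";
        return 0;
    }
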