From a1920c43df76a9a06a2602ffede63ddc514d940f Mon Sep 17 00:00:00 2001
From: Sebastian Golebiewski
Date: Wed, 27 Nov 2024 12:10:02 +0100
Subject: [PATCH] [DOCS] Adding info on VLMS and Speculative Decoding (#27771)

Porting: https://github.com/openvinotoolkit/openvino/pull/27760

Signed-off-by: sgolebiewski-intel
---
 .../llm_inference_guide/genai-guide.rst       | 108 +++++++++++++
 .../genai-guide/genai-use-cases.rst           | 149 +++++++++++++++++-
 2 files changed, 251 insertions(+), 6 deletions(-)

diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst
index ebd4667d544616..f18b66915fc3ce 100644
--- a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst
+++ b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst
@@ -218,6 +218,114 @@ Specify generation_config to use grouped beam search:
         cout << pipe.generate("The Sun is yellow because", config);
     }
 
+Efficient Text Generation via Speculative Decoding
+##################################################
+
+Speculative decoding (or assisted generation) enables faster token generation
+when an additional, smaller draft model is used alongside the main model.
+The draft model predicts the next K tokens one by one in an autoregressive manner,
+while the main model validates these predictions and corrects them if necessary.
+
+Each predicted token is compared with the main model's output; at the first
+mismatch, the token predicted by the main model is kept. The draft model then
+continues from this token and predicts the next K tokens, repeating the cycle.
+
+This approach reduces the number of infer requests to the main model: several
+draft tokens are validated in a single request instead of being generated one
+by one, which increases performance. Its use in the pipeline is shown in the
+code samples below:
+
+.. tab-set::
+
+   .. tab-item:: Python
+      :sync: py
+
+      .. code-block:: python
+
+         import openvino_genai
+         import queue
+         import threading
+
+         def streamer(subword):
+             print(subword, end='', flush=True)
+             return False
+
+         def infer(model_dir: str, draft_model_dir: str, prompt: str):
+             main_device = 'CPU'  # GPU can be used as well.
+             draft_device = 'CPU'
+
+             scheduler_config = openvino_genai.SchedulerConfig()
+             scheduler_config.cache_size = 2
+
+             draft_model = openvino_genai.draft_model(draft_model_dir, draft_device)
+
+             pipe = openvino_genai.LLMPipeline(
+                 model_dir,
+                 main_device,
+                 scheduler_config=scheduler_config,
+                 draft_model=draft_model)
+
+             config = openvino_genai.GenerationConfig()
+             config.max_new_tokens = 100
+             config.num_assistant_tokens = 5
+
+             pipe.generate(prompt, config, streamer)
+
+
+      For more information, refer to the
+      `Python sample `__.
+
+
+   .. tab-item:: C++
+      :sync: cpp
+
+      .. code-block:: cpp
+
+         #include <iostream>
+
+         #include "openvino/genai/llm_pipeline.hpp"
+
+         int main(int argc, char* argv[]) try {
+             if (4 != argc) {
+                 throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> <DRAFT_MODEL_DIR> '<PROMPT>'");
+             }
+
+             ov::genai::GenerationConfig config;
+             config.max_new_tokens = 100;
+             config.num_assistant_tokens = 5;
+
+             std::string main_model_path = argv[1];
+             std::string draft_model_path = argv[2];
+             std::string prompt = argv[3];
+
+             std::string main_device = "CPU", draft_device = "CPU";
+
+             ov::genai::SchedulerConfig scheduler_config;
+             scheduler_config.cache_size = 5;
+
+             ov::genai::LLMPipeline pipe(
+                 main_model_path,
+                 main_device,
+                 ov::genai::draft_model(draft_model_path, draft_device),
+                 ov::genai::scheduler_config(scheduler_config));
+
+             auto streamer = [](std::string subword) {
+                 std::cout << subword << std::flush;
+                 return false;
+             };
+
+             pipe.generate(prompt, config, streamer);
+         } catch (const std::exception& error) {
+             try {
+                 std::cerr << error.what() << '\n';
+             } catch (const std::ios_base::failure&) {}
+             return EXIT_FAILURE;
+         } catch (...) {
+             try {
+                 std::cerr << "Non-exception object thrown\n";
+             } catch (const std::ios_base::failure&) {}
+             return EXIT_FAILURE;
+         }
+
+
+      For more information, refer to the
+      `C++ sample `__
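+
+The gain from speculative decoding depends on how often the main model accepts
+the draft model's predictions, so it is worth measuring it for your own pair of
+models. The following is a minimal, illustrative sketch rather than a complete
+sample: the model directories and the prompt are placeholders, and only the API
+shown above is used. It times the same request with and without a draft model:
+
+.. code-block:: python
+
+   import time
+
+   import openvino_genai
+
+   def time_generation(pipe, prompt, config):
+       # Run one generation and return the elapsed wall-clock time in seconds.
+       start = time.perf_counter()
+       pipe.generate(prompt, config)
+       return time.perf_counter() - start
+
+   prompt = "The Sun is yellow because"                  # placeholder prompt
+   main_dir, draft_dir = "main_model", "draft_model"     # placeholder directories
+
+   config = openvino_genai.GenerationConfig()
+   config.max_new_tokens = 100
+
+   # Baseline: main model only.
+   baseline_pipe = openvino_genai.LLMPipeline(main_dir, "CPU")
+   baseline_time = time_generation(baseline_pipe, prompt, config)
+
+   # Speculative decoding: the draft model proposes tokens, the main model validates them.
+   scheduler_config = openvino_genai.SchedulerConfig()
+   scheduler_config.cache_size = 2
+   config.num_assistant_tokens = 5
+   assisted_pipe = openvino_genai.LLMPipeline(
+       main_dir,
+       "CPU",
+       scheduler_config=scheduler_config,
+       draft_model=openvino_genai.draft_model(draft_dir, "CPU"))
+   assisted_time = time_generation(assisted_pipe, prompt, config)
+
+   print(f"main model only: {baseline_time:.2f} s, with draft model: {assisted_time:.2f} s")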
 
 Comparing with Hugging Face Results
 #######################################
 
diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst
index 6033bd8ed96106..245a2648aab491 100644
--- a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst
+++ b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst
@@ -118,7 +118,7 @@ sample shows basic usage of the ``Text2ImagePipeline`` pipeline.
          image_write("baseline.bmp", image)
 
       For more information, refer to the
-      `Python sample `__
+      `Python sample `__
 
    .. tab-item:: C++
       :sync: cpp
@@ -218,7 +218,7 @@ sample shows basic usage of the ``Text2ImagePipeline`` pipeline.
 
 
       For more information, refer to the
-      `C++ sample `__
+      `C++ sample `__
 
 
 
@@ -269,7 +269,7 @@ and use audio files in WAV format at a sampling rate of 16 kHz as input.
 
 
      For more information, refer to the
-     `Python sample `__.
+     `Python sample `__.
 
    .. tab-item:: C++
      :sync: cpp
@@ -322,7 +322,7 @@ and use audio files in WAV format at a sampling rate of 16 kHz as input.
 
 
      For more information, refer to the
-     `C++ sample `__.
+     `C++ sample `__.
 
 
 Using GenAI in Chat Scenario
@@ -367,7 +367,7 @@ mark a conversation session, as shown in the samples below:
 
 
      For more information, refer to the
-     `Python sample `__.
+     `Python sample `__.
 
    .. tab-item:: C++
      :sync: cpp
@@ -415,7 +415,142 @@ mark a conversation session, as shown in the samples below:
 
 
      For more information, refer to the
-     `C++ sample `__
+     `C++ sample `__
+
+
+Using GenAI with Vision Language Models
+#######################################
+
+OpenVINO GenAI introduces the ``openvino_genai.VLMPipeline`` pipeline for
+inference of multimodal, text-generating Vision Language Models (VLMs).
+With a text prompt and an image as input, VLMPipeline can generate text using
+models such as LLaVA or MiniCPM-V. See the chat scenario presented
+in the samples below:
+
+.. tab-set::
+
+   .. tab-item:: Python
+      :sync: py
+
+      .. code-block:: python
+
+         import numpy as np
+         import openvino_genai
+         from PIL import Image
+         from openvino import Tensor
+         from pathlib import Path
+
+
+         def streamer(subword: str) -> bool:
+             print(subword, end='', flush=True)
+             return False
+
+
+         def read_image(path: str) -> Tensor:
+             pic = Image.open(path).convert("RGB")
+             image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.uint8)
+             return Tensor(image_data)
+
+
+         def read_images(path: str) -> list[Tensor]:
+             entry = Path(path)
+             if entry.is_dir():
+                 return [read_image(str(file)) for file in sorted(entry.iterdir())]
+             return [read_image(path)]
+
+
+         def infer(model_dir: str, image_dir: str):
+             rgbs = read_images(image_dir)
+             device = 'CPU'  # GPU can be used as well.
+             enable_compile_cache = dict()
+             if "GPU" == device:
+                 enable_compile_cache["CACHE_DIR"] = "vlm_cache"
+             pipe = openvino_genai.VLMPipeline(model_dir, device, **enable_compile_cache)
+
+             config = openvino_genai.GenerationConfig()
+             config.max_new_tokens = 100
+
+             pipe.start_chat()
+             prompt = input('question:\n')
+             pipe.generate(prompt, images=rgbs, generation_config=config, streamer=streamer)
+
+             while True:
+                 try:
+                     prompt = input("\n----------\nquestion:\n")
+                 except EOFError:
+                     break
+                 pipe.generate(prompt, generation_config=config, streamer=streamer)
+             pipe.finish_chat()
+
+
+      For more information, refer to the
+      `Python sample `__.
+
+   .. tab-item:: C++
+      :sync: cpp
+
+      .. code-block:: cpp
+
+         #include "load_image.hpp"
+         #include <openvino/genai/visual_language/pipeline.hpp>
+         #include <iostream>
+
+         bool print_subword(std::string&& subword) {
+             return !(std::cout << subword << std::flush);
+         }
+
+         int main(int argc, char* argv[]) try {
+             if (3 != argc) {
+                 throw std::runtime_error(std::string{"Usage "} + argv[0] + " <MODEL_DIR> <IMAGE_FILE OR DIR_WITH_IMAGES>");
+             }
+
+             std::vector<ov::Tensor> rgbs = utils::load_images(argv[2]);
+
+             std::string device = "CPU";  // GPU can be used as well.
+             ov::AnyMap enable_compile_cache;
+             if ("GPU" == device) {
+                 enable_compile_cache.insert({ov::cache_dir("vlm_cache")});
+             }
+             ov::genai::VLMPipeline pipe(argv[1], device, enable_compile_cache);
+
+             ov::genai::GenerationConfig generation_config;
+             generation_config.max_new_tokens = 100;
+
+             std::string prompt;
+
+             pipe.start_chat();
+             std::cout << "question:\n";
+
+             std::getline(std::cin, prompt);
+             pipe.generate(prompt,
+                           ov::genai::images(rgbs),
+                           ov::genai::generation_config(generation_config),
+                           ov::genai::streamer(print_subword));
+             std::cout << "\n----------\n"
+                          "question:\n";
+             while (std::getline(std::cin, prompt)) {
+                 pipe.generate(prompt,
+                               ov::genai::generation_config(generation_config),
+                               ov::genai::streamer(print_subword));
+                 std::cout << "\n----------\n"
+                              "question:\n";
+             }
+             pipe.finish_chat();
+         } catch (const std::exception& error) {
+             try {
+                 std::cerr << error.what() << '\n';
+             } catch (const std::ios_base::failure&) {}
+             return EXIT_FAILURE;
+         } catch (...) {
+             try {
+                 std::cerr << "Non-exception object thrown\n";
+             } catch (const std::ios_base::failure&) {}
+             return EXIT_FAILURE;
+         }
+
+
+      For more information, refer to the
+      `C++ sample `__
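+
+A chat session is not required: for a one-off question about a single image,
+one ``generate()`` call is enough. The snippet below is an illustrative sketch
+only; the model directory and the image path are placeholders, and the image
+is loaded the same way as in the Python sample above:
+
+.. code-block:: python
+
+   import numpy as np
+   import openvino_genai
+   from openvino import Tensor
+   from PIL import Image
+
+   # Load one image as a uint8 NHWC tensor, as in the Python sample above.
+   pic = Image.open("cat.png").convert("RGB")  # placeholder image path
+   image = Tensor(np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.uint8))
+
+   pipe = openvino_genai.VLMPipeline("MiniCPM-V-2_6-ov", "CPU")  # placeholder model directory
+
+   config = openvino_genai.GenerationConfig()
+   config.max_new_tokens = 100
+
+   # A single prompt with a single image; no chat session is started.
+   result = pipe.generate("Describe this image.", images=[image], generation_config=config)
+   print(result)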
 
 Additional Resources
 #####################
 
@@ -423,4 +558,6 @@ Additional Resources
 
 * :doc:`Install OpenVINO GenAI <../../../get-started/install-openvino/install-openvino-genai>`
 * `OpenVINO GenAI Repo `__
 * `OpenVINO GenAI Samples `__
+* A Jupyter notebook demonstrating
+  `Visual-language assistant with MiniCPM-V2 and OpenVINO `__
 * `OpenVINO Tokenizers `__