fill prompt for sampler analysis with real tokens in VLM pipeline #1247

Draft · wants to merge 1 commit into base: master
13 changes: 13 additions & 0 deletions src/cpp/src/visual_language/inputs_embedder.cpp
@@ -36,6 +36,8 @@ class InputsEmbedder::IInputsEmbedder {
ChatHistory m_history;
// Templated chat history
std::string m_templated_chat_history;
// Tokenized chat history
ov::Tensor m_tokenized_chat_history = ov::Tensor(ov::element::i64, {0, 0});
// Whether we have computed some inputs already
bool m_is_cache_empty = true;

@@ -50,12 +52,17 @@ class InputsEmbedder::IInputsEmbedder {
return m_tokenizer;
}

ov::Tensor get_tokenized_chat_history() const {
return m_tokenized_chat_history;
}

virtual void start_chat(const std::string& system_message) {
m_is_chat_conversation = true;
if (!m_is_cache_empty) {
m_history.clear();
m_templated_chat_history.clear();
m_is_cache_empty = true;
m_tokenized_chat_history = ov::Tensor(ov::element::i64, {0, 0});
}
if (system_message.empty()) {
return;
@@ -78,6 +85,7 @@

m_history.clear();
m_templated_chat_history.clear();
m_tokenized_chat_history = ov::Tensor(ov::element::i64, {0, 0});
}

protected:
@@ -125,6 +133,7 @@ class InputsEmbedder::IInputsEmbedder {
).input_ids;
}
m_templated_chat_history = std::move(new_templated_chat_history);
m_tokenized_chat_history = new_chat_tokens;
} else {
encoded_input_ids = m_tokenizer.encode(prompt).input_ids;
}
@@ -1031,6 +1040,10 @@ EmbeddingsModel InputsEmbedder::get_embedding_model() const {
return m_impl->get_embedding_model();
}

ov::Tensor InputsEmbedder::get_tokenized_chat_history() const {
return m_impl->get_tokenized_chat_history();
}

Tokenizer InputsEmbedder::get_tokenizer() const {
return m_impl->get_tokenizer();
}
3 changes: 3 additions & 0 deletions src/cpp/src/visual_language/inputs_embedder.hpp
@@ -28,6 +28,9 @@ class InputsEmbedder {
// returns embedding model which converts token_id(s) to embedding vectors
EmbeddingsModel get_embedding_model() const;

// returns tokenized text part of chat history
ov::Tensor get_tokenized_chat_history() const;

// returns tokenizer
Tokenizer get_tokenizer() const;

13 changes: 9 additions & 4 deletions src/cpp/src/visual_language/pipeline.cpp
@@ -93,16 +93,19 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {

ov::Tensor inputs_embeds = m_inputs_embedder->get_inputs_embeds(prompt, rgbs);

Sampler sampler = Sampler(m_tokenizer);

std::vector<SequenceGroup::Ptr> requests;
size_t request_id = 0;
size_t block_size = 1; // not used
bool enable_prefix_caching = false;
size_t history_size = m_language.get_tensor("attention_mask").get_shape().at(1);
size_t inputs_embeds_size = inputs_embeds.get_shape().at(1);
ov::Tensor prompt_ids(ov::element::i64, { history_size + inputs_embeds_size });
std::fill_n(prompt_ids.data<int64_t>(), prompt_ids.get_size(), 0);

auto chat_history = m_inputs_embedder->get_tokenized_chat_history();
size_t chat_history_size = std::max(chat_history.get_shape().at(1), history_size + inputs_embeds_size);
Contributor comment: Looks like we have the same case as for LLMs, where decode(encode(X)) can yield a shorter sequence than X?
In that case we need to partially re-compute the history.

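To make that concern concrete, below is a minimal standalone sketch of the partial re-compute idea, not the pipeline's actual code: plain std::vector<int64_t> stands in for ov::Tensor, and the divergence point between the cached tokenized history and the freshly re-tokenized history is found with a simple prefix comparison. All names and values are illustrative.

// Sketch only: find how much of the cached tokenized history still matches the
// re-tokenized history; everything past that point would need its KV-cache
// entries dropped and its tokens re-processed, because decode(encode(X)) does
// not always round-trip to the same token ids.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Length of the common prefix of the cached and the freshly re-tokenized history.
std::size_t common_prefix_len(const std::vector<int64_t>& cached,
                              const std::vector<int64_t>& fresh) {
    std::size_t n = std::min(cached.size(), fresh.size());
    std::size_t i = 0;
    while (i < n && cached[i] == fresh[i]) {
        ++i;
    }
    return i;
}

int main() {
    // Hypothetical example: re-tokenization produced a shorter, slightly
    // different sequence than the cached one.
    std::vector<int64_t> cached_history = {1, 15, 27, 89, 42, 7};
    std::vector<int64_t> fresh_history  = {1, 15, 27, 90, 5};

    std::size_t keep = common_prefix_len(cached_history, fresh_history);
    std::cout << "KV-cache entries to keep: " << keep << "\n"
              << "tokens to re-compute: " << fresh_history.size() - keep << "\n";
    return 0;
}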
Contributor comment: In general, I would consider merging the VLM and LLM pipelines' generate functions to keep all this history magic in one place,
or at least creating a helper function similar to get_lm_encoded_results.

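Purely as an illustration of that suggestion, such a shared helper might look roughly like the declaration below. The name run_generation_loop and the stub types are hypothetical and do not exist in openvino.genai; the sketch only shows where the duplicated bookkeeping could live.

// Hypothetical interface sketch: both the LLM and the VLM generate() paths
// would delegate the prompt_ids / SequenceGroup construction, sampling loop,
// and history updates to one shared helper, similar in spirit to
// get_lm_encoded_results.
#include <cstdint>
#include <memory>
#include <utility>
#include <vector>

namespace sketch {

struct EncodedResults {};                                   // stand-in for ov::genai::EncodedResults
struct SequenceGroup { using Ptr = std::shared_ptr<SequenceGroup>; };
struct Sampler {};
struct LanguageModel {};                                    // stand-in for the language model infer request

// Shared generation loop; returns the encoded results and the selected beam.
std::pair<EncodedResults, int32_t> run_generation_loop(
    LanguageModel& language_model,
    std::vector<SequenceGroup::Ptr>& requests,
    Sampler& sampler);

}  // namespace sketch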
ov::Tensor prompt_ids(ov::element::i64, { chat_history_size });
std::fill_n(prompt_ids.data<int64_t>(), prompt_ids.get_size(), 1);
Contributor comment: Why is 1 used as the default value? Maybe pad_token?

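A small standalone sketch of what that suggestion would look like: fill the placeholder tail of prompt_ids with a pad token (assumed here to be obtainable from the tokenizer, e.g. via something like get_pad_token_id()) instead of a hardcoded 1, then copy the real chat-history tokens over the front. Plain vectors replace ov::Tensor and all values are illustrative.

// Sketch only, not the actual pipeline code.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

int main() {
    std::vector<int64_t> chat_history = {1, 529, 29989, 1792, 29989, 29958};  // example token ids
    std::size_t history_size = 4;          // tokens already covered by the attention mask
    std::size_t inputs_embeds_size = 8;    // embeddings produced for the new prompt
    int64_t pad_token_id = 0;              // assumed to be queried from the tokenizer

    std::size_t prompt_len = std::max(chat_history.size(), history_size + inputs_embeds_size);
    std::vector<int64_t> prompt_ids(prompt_len, pad_token_id);          // pad instead of a hardcoded 1
    std::copy(chat_history.begin(), chat_history.end(), prompt_ids.begin());
    return 0;
}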
auto chat_history_data = chat_history.data<int64_t>();
std::copy(chat_history_data, chat_history_data + chat_history.get_size(), prompt_ids.data<int64_t>());

SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(request_id, prompt_ids, generation_config, block_size, enable_prefix_caching);
sequence_group->set_sequence_group_ptr(sequence_group);
@@ -131,6 +134,8 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
ov::Tensor position_ids = ov::Tensor{ov::element::i64, { 1, inputs_embeds.get_shape()[1] }};
std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), history_size);

Sampler sampler = Sampler(m_tokenizer);

ov::genai::EncodedResults encoded_result;
int32_t m_selected_beam = 0;
std::tie(encoded_result, m_selected_beam) = ov::genai::get_lm_encoded_results(m_language, inputs_embeds, new_atten_mask, streamer_ptr, sampler, requests,