Commit: Automatically apply chat template in non-chat scenarios
sbalandi committed Jan 13, 2025
1 parent 4ac98b8 commit 5a0e10d
Showing 6 changed files with 66 additions and 7 deletions.
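
In practice, a plain `generate()` call now runs the prompt through the model's chat template even when no chat session is started, which is why the VLM README example below no longer needs `pipe.start_chat()`. Below is a minimal C++ sketch of the new default behavior, modeled on the existing greedy-generation sample; the model directory and generation length are placeholders, not part of this commit.

```cpp
#include <iostream>
#include "openvino/genai/llm_pipeline.hpp"

int main() {
    // Placeholder model directory; any LLM exported for OpenVINO GenAI behaves the same way.
    ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "CPU");

    // No start_chat()/finish_chat() here: after this commit the single prompt is still
    // wrapped with the chat_template from tokenizer_config.json before tokenization.
    std::cout << pipe.generate("Why is the Sun yellow?", ov::genai::max_new_tokens(64)) << '\n';
    return 0;
}
```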
README.md: 1 change (0 additions, 1 deletion)
@@ -133,7 +133,6 @@ from PIL import Image

 # Choose GPU instead of CPU in the line below to run the model on Intel integrated or discrete GPU
 pipe = openvino_genai.VLMPipeline("./InternVL2-1B", "CPU")
-pipe.start_chat()

 image = Image.open("dog.jpg")
 image_data = np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8)
src/README.md: 2 changes (2 additions, 0 deletions)
@@ -73,6 +73,8 @@ output:
 'it is made up of carbon atoms. The carbon atoms are arranged in a linear pattern, which gives the yellow color. The arrangement of carbon atoms in'
 ```

+>**Note**: The `chat_template` from `tokenizer_config.json` is applied to the prompt automatically at the generation stage. To disable it, call `pipe.get_tokenizer().set_chat_template("")`.
+
 A simple chat in Python:
 ```python
 import openvino_genai as ov_genai
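
The note above shows the Python opt-out. Assuming the C++ `Tokenizer::set_chat_template` mirrors that call (it is not shown in this diff), the equivalent opt-out from C++ could look like the sketch below; the model directory is a placeholder.

```cpp
#include <iostream>
#include "openvino/genai/llm_pipeline.hpp"

int main() {
    ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "CPU");  // placeholder model directory

    // Clearing the chat template disables the automatic templating introduced by this commit,
    // so the raw prompt is tokenized as-is. This assumes the returned Tokenizer shares state
    // with the pipeline, as the Python call pipe.get_tokenizer().set_chat_template("") implies.
    auto tokenizer = pipe.get_tokenizer();
    tokenizer.set_chat_template("");

    std::cout << pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(64)) << '\n';
    return 0;
}
```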
src/cpp/src/icontinuous_batching.cpp: 12 changes (11 additions, 1 deletion)
@@ -50,7 +50,17 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
     input_ids.reserve(prompts.size());
     timer.start();
     for (const std::string& prompt : prompts) {
-        input_ids.push_back(m_tokenizer.encode(prompt).input_ids);
+        ov::Tensor encoded_inputs;
+        try {
+            ChatHistory history({{{"role", "user"}, {"content", prompt}}});
+            constexpr bool add_generation_prompt = true;
+            auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
+            encoded_inputs = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false)).input_ids;
+        } catch (const std::exception& error) {
+            // in case chat_template was not found in tokenizer_config.json or was not set
+            encoded_inputs = m_tokenizer.encode(prompt).input_ids;
+        }
+        input_ids.push_back(encoded_inputs);
     }
     timer.end();
 }
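
The same try/fallback pattern is repeated in llm_pipeline_stateful.cpp and llm_pipeline_static.cpp below. The standalone sketch here shows what it does at the tokenizer level, using only the calls visible in this diff; the header path, model directory, and prompt are assumptions rather than part of the commit.

```cpp
#include <iostream>
#include <string>
#include "openvino/genai/tokenizer.hpp"

// Wrap a raw prompt as a single user turn, apply the model's chat template if it has one,
// and otherwise fall back to encoding the prompt unchanged.
ov::genai::TokenizedInputs encode_with_chat_template(ov::genai::Tokenizer& tokenizer, const std::string& prompt) {
    try {
        ov::genai::ChatHistory history({{{"role", "user"}, {"content", prompt}}});
        constexpr bool add_generation_prompt = true;
        auto templated_prompt = tokenizer.apply_chat_template(history, add_generation_prompt);
        // Special tokens are skipped because the chat template is expected to insert them itself.
        return tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false));
    } catch (const std::exception&) {
        // No usable chat_template in tokenizer_config.json (or it was cleared): encode the raw prompt.
        return tokenizer.encode(prompt);
    }
}

int main() {
    ov::genai::Tokenizer tokenizer("TinyLlama-1.1B-Chat-v1.0");  // placeholder model directory
    auto tokens = encode_with_chat_template(tokenizer, "Why is the Sun yellow?");
    std::cout << "prompt length in tokens: " << tokens.input_ids.get_shape().back() << '\n';
    return 0;
}
```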
src/cpp/src/llm_pipeline_stateful.cpp: 25 changes (23 additions, 2 deletions)
@@ -88,7 +88,19 @@ DecodedResults StatefulLLMPipeline::generate(

 if (auto input_vector = std::get_if<std::vector<std::string>>(&inputs)) {
     OPENVINO_ASSERT(!is_chat_conversation, "Can't chat with multiple prompts");
-    encoded_input = m_tokenizer.encode(*input_vector);
+    std::vector<std::string> templated_input_vector;
+    for (auto& input : *input_vector) {
+        try {
+            ChatHistory history({{{"role", "user"}, {"content", input}}});
+            constexpr bool add_generation_prompt = true;
+            auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
+            templated_input_vector.push_back(templated_prompt);
+        } catch (const std::exception& error) {
+            // in case chat_template was not found in tokenizer_config.json or was not set
+            templated_input_vector.push_back(input);
+        }
+    }
+    encoded_input = m_tokenizer.encode(templated_input_vector, ov::genai::add_special_tokens(false));
 } else if (auto input_prompt = std::get_if<std::string>(&inputs)) {
     std::string& prompt = *input_prompt;

@@ -157,7 +169,16 @@ DecodedResults StatefulLLMPipeline::generate(

     // TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied
 } else {
-    encoded_input = m_tokenizer.encode(prompt);
+    std::string& prompt = *input_prompt;
+    try {
+        ChatHistory history({{{"role", "user"}, {"content", prompt}}});
+        constexpr bool add_generation_prompt = true;
+        auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
+        encoded_input = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false));
+    } catch (const std::exception& error) {
+        // in case chat_template was not found in tokenizer_config.json or was not set
+        encoded_input = m_tokenizer.encode(prompt);
+    }
 }
 }

src/cpp/src/llm_pipeline_static.cpp: 20 changes (18 additions, 2 deletions)
@@ -770,7 +770,15 @@ DecodedResults StatefulLLMPipeline::generate(
     // for chat ov::genai::add_special_tokens(false) is aligned with stateful pipeline and HF
     tokenized_input = m_tokenizer.encode(prompt, ov::genai::add_special_tokens(false));
 } else {
-    tokenized_input = m_tokenizer.encode(prompt);
+    try {
+        ChatHistory history({{{"role", "user"}, {"content", prompt}}});
+        constexpr bool add_generation_prompt = true;
+        auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
+        tokenized_input = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false));
+    } catch (const std::exception& error) {
+        // in case chat_template was not found in tokenizer_config.json or was not set
+        tokenized_input = m_tokenizer.encode(prompt);
+    }
 }

 auto encode_stop_time = std::chrono::steady_clock::now();
@@ -1202,7 +1210,15 @@ DecodedResults StatelessLLMPipeline::generate(
     // for chat ov::genai::add_special_tokens(false) is aligned with stateful pipeline and HF
     tokenized_input = m_tokenizer.encode(prompt, ov::genai::add_special_tokens(false));
 } else {
-    tokenized_input = m_tokenizer.encode(prompt);
+    try {
+        ChatHistory history({{{"role", "user"}, {"content", prompt}}});
+        constexpr bool add_generation_prompt = true;
+        auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
+        tokenized_input = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false));
+    } catch (const std::exception& error) {
+        // in case chat_template was not found in tokenizer_config.json or was not set
+        tokenized_input = m_tokenizer.encode(prompt);
+    }
 }

 auto encode_stop_time = std::chrono::steady_clock::now();
src/cpp/src/visual_language/inputs_embedder.cpp: 13 changes (12 additions, 1 deletion)
@@ -217,8 +217,19 @@ class InputsEmbedder::IInputsEmbedder {
     m_tokenized_history.clear();
     std::copy_n(new_chat_tokens.data<int64_t>(), new_chat_tokens.get_size(), std::back_inserter(m_tokenized_history));
 } else {
+    std::string templated_prompt;
+    ChatHistory history({{{"role", "user"}, {"content", prompt}}});
+    constexpr bool add_generation_prompt = true;
+
+    try {
+        templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
+    } catch (const std::exception& error) {
+        // Use the fallback chat template if none was found in tokenizer_config.json
+        templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt, chat_template_fallback);
+    }
+
     auto start_tokenizer_time = std::chrono::steady_clock::now();
-    encoded_input_ids = m_tokenizer.encode(prompt).input_ids;
+    encoded_input_ids = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false)).input_ids;
     auto end_tokenizer_time = std::chrono::steady_clock::now();
     metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time));
     m_tokenized_history.clear();
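
Unlike the LLM pipelines above, the VLM inputs embedder does not fall back to encoding the raw prompt; it retries `apply_chat_template` with an explicit fallback template. The sketch below exercises that three-argument overload in isolation; the model directory and the fallback template string are illustrative only, and real VLM configurations supply a model-specific `chat_template_fallback`.

```cpp
#include <iostream>
#include <string>
#include "openvino/genai/tokenizer.hpp"

int main() {
    ov::genai::Tokenizer tokenizer("InternVL2-1B");  // placeholder model directory

    ov::genai::ChatHistory history({{{"role", "user"}, {"content", "Describe the image."}}});
    constexpr bool add_generation_prompt = true;

    // Illustrative minimal Jinja-style template, used only when tokenizer_config.json provides none.
    const std::string chat_template_fallback =
        "{% for message in messages %}{{ message['content'] }}{% endfor %}";

    std::string templated_prompt;
    try {
        templated_prompt = tokenizer.apply_chat_template(history, add_generation_prompt);
    } catch (const std::exception&) {
        templated_prompt = tokenizer.apply_chat_template(history, add_generation_prompt, chat_template_fallback);
    }

    std::cout << templated_prompt << '\n';
    return 0;
}
```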
