From 94f1c4812487328c6fbfa1d0d1776e0155c5e89a Mon Sep 17 00:00:00 2001 From: Cyber-Var Date: Thu, 9 Jan 2025 13:19:34 +0500 Subject: [PATCH] Added pre-commit logic that checks for trailing whitespaces, and non-ASCII chars in filenames and file content --- .github/workflows/causal_lm_cpp.yml | 12 + .pre-commit-config.yaml | 20 + .../check_non_ascii_filenames.sh | 20 + .../check_non_ascii_in_files.sh | 17 + requirements-build.txt | 2 +- .../beam_search_causal_lm.cpp | 2 +- .../cpp/benchmark_genai/benchmark_genai.cpp | 6 +- samples/cpp/chat_sample/chat_sample.cpp | 6 +- samples/cpp/image_generation/CMakeLists.txt | 4 +- .../prompt_lookup_decoding_lm/CMakeLists.txt | 4 +- .../prompt_lookup_decoding_lm.cpp | 2 +- .../speculative_decoding_lm/CMakeLists.txt | 4 +- .../speculative_decoding_lm.cpp | 2 +- .../encrypted_model_causal_lm.cpp | 2 +- .../cpp/visual_language_chat/CMakeLists.txt | 2 +- .../visual_language_chat/benchmark_vlm.cpp | 6 +- samples/export-requirements.txt | 2 +- .../python/benchmark_genai/benchmark_genai.py | 12 +- samples/python/image_generation/text2image.py | 2 +- .../multinomial_causal_lm.py | 34 +- .../prompt_lookup_decoding_lm.py | 12 +- .../speculative_decoding_lm.py | 12 +- .../whisper_speech_recognition/recorder.py | 4 +- .../genai/continuous_batching_pipeline.hpp | 2 +- .../openvino/genai/generation_handle.hpp | 2 +- .../image_generation/text2image_pipeline.hpp | 6 +- .../openvino/genai/scheduler_config.hpp | 4 +- .../genai/visual_language/perf_metrics.hpp | 2 +- .../genai/visual_language/pipeline.hpp | 2 +- src/cpp/src/cache_manager.hpp | 12 +- src/cpp/src/continuous_batching_adapter.hpp | 2 +- src/cpp/src/generation_config.cpp | 2 +- src/cpp/src/icontinuous_batching.hpp | 6 +- .../src/image_generation/flux_pipeline.hpp | 2 +- .../image_generation/inpainting_pipeline.cpp | 2 +- .../models/unet_inference.hpp | 2 +- .../models/unet_inference_dynamic.hpp | 2 +- .../models/unet_inference_static_bs1.hpp | 4 +- .../src/image_generation/schedulers/ddim.cpp | 2 +- .../schedulers/euler_ancestral_discrete.cpp | 4 +- .../schedulers/lms_discrete.cpp | 8 +- .../src/image_generation/schedulers/pndm.cpp | 2 +- .../stable_diffusion_3_pipeline.hpp | 2 +- .../stable_diffusion_pipeline.hpp | 2 +- .../image_generation/text2image_pipeline.cpp | 2 +- src/cpp/src/llm_pipeline.cpp | 28 +- src/cpp/src/llm_pipeline_stateful.hpp | 2 +- src/cpp/src/logit_processor.hpp | 26 +- src/cpp/src/lora_helper.cpp | 2 +- src/cpp/src/lora_helper.hpp | 2 +- src/cpp/src/lora_names_mapping.cpp | 2 +- src/cpp/src/make_tokenizer_stateful.cpp | 12 +- src/cpp/src/make_tokenizer_stateful.hpp | 8 +- src/cpp/src/model_runner.hpp | 2 +- src/cpp/src/perf_metrics.cpp | 18 +- .../continuous_batching_for_prompt_lookup.cpp | 2 +- .../continuous_batching_for_prompt_lookup.hpp | 4 +- .../src/prompt_lookup/prompt_lookup_impl.cpp | 6 +- .../src/prompt_lookup/prompt_lookup_impl.hpp | 2 +- src/cpp/src/safetensors.c | 2 +- src/cpp/src/sampler.cpp | 18 +- src/cpp/src/sampler.hpp | 2 +- src/cpp/src/scheduler.hpp | 2 +- src/cpp/src/sequence_group.cpp | 8 +- src/cpp/src/sequence_group.hpp | 8 +- ...batching_for_speculative_decoding_impl.cpp | 10 +- ...batching_for_speculative_decoding_impl.hpp | 2 +- .../speculative_decoding_impl.cpp | 6 +- .../speculative_decoding_impl.hpp | 6 +- .../speculative_decoding_metrics.cpp | 2 +- .../speculative_decoding_metrics.hpp | 2 +- src/cpp/src/text_callback_streamer.cpp | 2 +- src/cpp/src/text_callback_streamer.hpp | 2 +- src/cpp/src/utils.cpp | 2 +- src/cpp/src/utils.hpp | 2 +- 
.../utils/paged_attention_transformations.cpp | 2 +- src/cpp/src/visual_language/clip.cpp | 4 +- src/cpp/src/visual_language/clip.hpp | 2 +- .../src/visual_language/embedding_model.cpp | 2 +- .../src/visual_language/embedding_model.hpp | 2 +- .../src/visual_language/inputs_embedder.cpp | 18 +- .../src/visual_language/inputs_embedder.hpp | 2 +- src/cpp/src/visual_language/perf_metrics.cpp | 2 +- src/cpp/src/visual_language/pipeline.cpp | 6 +- .../src/visual_language/processor_config.cpp | 2 +- .../src/visual_language/vision_encoder.cpp | 8 +- src/cpp/src/visual_language/vlm_config.hpp | 2 +- .../src/visual_language/vlm_model_type.hpp | 2 +- src/cpp/src/whisper_pipeline_static.cpp | 2 +- src/docs/DEBUG_LOG.md | 2 +- src/python/openvino_genai/__init__.py | 2 +- .../openvino_genai/py_openvino_genai.pyi | 418 +++++++++--------- .../py_continuous_batching_pipeline.cpp | 6 +- src/python/py_llm_pipeline.cpp | 10 +- src/python/py_lora_adapter.cpp | 16 +- src/python/py_openvino_genai.cpp | 6 +- src/python/py_perf_metrics.cpp | 10 +- src/python/py_tokenizer.cpp | 4 +- src/python/py_utils.cpp | 2 +- tests/cpp/block_manager.cpp | 2 +- tests/cpp/cache_manager.cpp | 4 +- tests/cpp/logit_filtering.cpp | 1 - tests/cpp/sampler.cpp | 6 +- tests/cpp/speculative_decoding.cpp | 29 +- tests/cpp/utils.cpp | 2 +- tests/python_tests/common.py | 4 +- tests/python_tests/conftest.py | 1 - tests/python_tests/data/long_prompts.txt | 2 +- tests/python_tests/models/nightly | 2 +- tests/python_tests/models/precommit | 2 +- tests/python_tests/models/real_models | 2 +- tests/python_tests/test_generation_config.py | 2 +- tests/python_tests/test_kv_cache_eviction.py | 3 +- tests/python_tests/test_llm_pipeline.py | 2 +- tests/python_tests/test_tokenizer.py | 7 +- tools/cacheviz/__init__.py | 1 - tools/cacheviz/cacheviz.py | 3 - tools/cacheviz/requirements.txt | 2 +- .../accuracy/continuous_batching_accuracy.cpp | 2 +- ...ntinuous_batching_speculative_decoding.cpp | 6 +- .../continuous_batching_benchmark.cpp | 14 +- tools/llm_bench/doc/NOTES.md | 2 +- tools/llm_bench/doc/PROMPT.md | 2 +- .../llm_bench_utils/hook_beam_search.py | 2 +- .../llm_bench/llm_bench_utils/hook_common.py | 2 +- .../llm_bench_utils/hook_greedy_search.py | 6 +- .../llm_bench/llm_bench_utils/hook_sample.py | 2 +- .../llm_bench_utils/hook_sample_v43.py | 2 +- .../llm_bench_utils/hook_sample_v45.py | 2 +- .../llm_bench_utils/ov_model_classes.py | 4 +- .../llm_bench/prompts/llama-2-7b-chat_l.jsonl | 2 +- tools/llm_bench/requirements.txt | 2 +- 132 files changed, 587 insertions(+), 527 deletions(-) create mode 100644 .pre-commit-config.yaml create mode 100755 pre_commit_scripts/check_non_ascii_filenames.sh create mode 100755 pre_commit_scripts/check_non_ascii_in_files.sh diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index fb0c9c4b0b..72977b30b7 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -21,6 +21,18 @@ env: m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17726-9ab2c1a18e7/m_openvino_toolkit_macos_12_6_2025.0.0.dev20241230_x86_64.tgz w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17726-9ab2c1a18e7/w_openvino_toolkit_windows_2025.0.0.dev20241230_x86_64.zip jobs: + code-quality-checks: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Install pre-commit + run: pip install pre-commit + - name: 
Run pre-commit (checks for trailing whitespace and non-ASCII symbols in filenames and file content)
+        run: pre-commit run --all-files --show-diff-on-failure
+
   cpp-multinomial-greedy_causal_lm-ubuntu:
     runs-on: ubuntu-20.04-8-cores
     defaults:
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000000..7d724ba222
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,20 @@
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.0.1
+    hooks:
+      - id: trailing-whitespace # checks for files with trailing whitespace, excluding .md and Git-related hidden files
+        exclude: '\.md$|.*\.git.*'
+      - id: check-merge-conflict # checks for files that contain merge conflict strings (such as <<<<<<<, =======, and >>>>>>>)
+      - id: check-json # ensures that JSON files are syntactically correct
+      - id: end-of-file-fixer # ensures that each file ends with a single trailing newline, excluding Git-related hidden files
+        exclude: '.*\.git.*'
+  - repo: local
+    hooks:
+      - id: forbid-non-ascii-filenames # runs the script that prohibits non-ASCII characters in file names
+        name: Prohibit non-ASCII characters in file names
+        entry: ./pre_commit_scripts/check_non_ascii_filenames.sh
+        language: script
+      - id: forbid-non-ascii-in-files # checks for non-ASCII characters in files (excluding Markdown and hidden files), with characters ± and ? allowed
+        name: Check for non-ASCII characters in files (excluding Markdown and hidden files), with characters ± and ? allowed
+        entry: ./pre_commit_scripts/check_non_ascii_in_files.sh
+        language: script
diff --git a/pre_commit_scripts/check_non_ascii_filenames.sh b/pre_commit_scripts/check_non_ascii_filenames.sh
new file mode 100755
index 0000000000..2bd4a5deb7
--- /dev/null
+++ b/pre_commit_scripts/check_non_ascii_filenames.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Hash of the empty tree, used as a baseline so every tracked file counts as newly added:
+empty_tree=$(git hash-object -t tree /dev/null)
+
+# Get a list of new files that might have non-ASCII characters:
+problem_files=$(git diff --name-only --diff-filter=A -z "$empty_tree" | LC_ALL=C grep -P "[^\x00-\x7F]")
+
+# Count the number of problematic files:
+count=$(echo "$problem_files" | wc -w)
+
+# Print necessary info based on the result:
+if [ "$count" -ne 0 ]; then
+    echo "Error: Non-ASCII characters found in filenames of new files:"
+    echo "$problem_files"
+    exit 1
+else
+    echo "Success: No non-ASCII filenames found."
+fi
+exit 0
diff --git a/pre_commit_scripts/check_non_ascii_in_files.sh b/pre_commit_scripts/check_non_ascii_in_files.sh
new file mode 100755
index 0000000000..18206cbdb2
--- /dev/null
+++ b/pre_commit_scripts/check_non_ascii_in_files.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# Define the list of files to check, excluding .md, hidden, and a number of specific files:
+files_to_check=$(git ls-files | grep -vE "^\." | grep -vE "\.md$" | grep -vE "^(tests/python_tests|tools/who_what_benchmark/(tests|whowhatbench))" | grep -v "tools/llm_bench/llm_bench_utils/ov_model_classes.py")
+
+# Run git grep to find non-ASCII characters in the selected files and store the results:
+results=$(LC_ALL=C git grep -n "[^ -~±�\”\“]" -- $files_to_check)
+
+# Print the results:
+if [ -n "$results" ]; then
+    echo "Error: Non-ASCII characters found in files:"
+    echo "$results"
+    exit 1
+else
+    echo "Success: No non-ASCII characters found in files."
+fi +exit 0 diff --git a/requirements-build.txt b/requirements-build.txt index 6da3919e91..b6a8339285 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -1,2 +1,2 @@ cmake~=3.23.0 -pybind11-stubgen==2.5.1 \ No newline at end of file +pybind11-stubgen==2.5.1 diff --git a/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp b/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp index fc18fa8e0c..3f45ec30e2 100644 --- a/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp +++ b/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp @@ -19,7 +19,7 @@ int main(int argc, char* argv[]) try { config.num_beams = 15; config.diversity_penalty = 1.0f; config.num_return_sequences = config.num_beams; - + // Since the streamer is set, the results will // be printed each time a new token is generated. auto beams = pipe.generate(prompts, config); diff --git a/samples/cpp/benchmark_genai/benchmark_genai.cpp b/samples/cpp/benchmark_genai/benchmark_genai.cpp index d389e94432..5ac1f08ffc 100644 --- a/samples/cpp/benchmark_genai/benchmark_genai.cpp +++ b/samples/cpp/benchmark_genai/benchmark_genai.cpp @@ -35,15 +35,15 @@ int main(int argc, char* argv[]) try { std::string device = result["device"].as(); size_t num_warmup = result["num_warmup"].as(); size_t num_iter = result["num_iter"].as(); - + ov::genai::GenerationConfig config; config.max_new_tokens = result["max_new_tokens"].as(); ov::genai::LLMPipeline pipe(models_path, device); - + for (size_t i = 0; i < num_warmup; i++) pipe.generate(prompt, config); - + ov::genai::DecodedResults res = pipe.generate(prompt, config); ov::genai::PerfMetrics metrics = res.perf_metrics; for (size_t i = 0; i < num_iter - 1; i++) { diff --git a/samples/cpp/chat_sample/chat_sample.cpp b/samples/cpp/chat_sample/chat_sample.cpp index 41d63fc0f1..691f7ce956 100644 --- a/samples/cpp/chat_sample/chat_sample.cpp +++ b/samples/cpp/chat_sample/chat_sample.cpp @@ -12,14 +12,14 @@ int main(int argc, char* argv[]) try { std::string device = "CPU"; // GPU, NPU can be used as well ov::genai::LLMPipeline pipe(models_path, device); - + ov::genai::GenerationConfig config; config.max_new_tokens = 100; - std::function streamer = [](std::string word) { + std::function streamer = [](std::string word) { std::cout << word << std::flush; // Return flag corresponds whether generation should be stopped. // false means continue generation. 
- return false; + return false; }; pipe.start_chat(); diff --git a/samples/cpp/image_generation/CMakeLists.txt b/samples/cpp/image_generation/CMakeLists.txt index 004b305088..f558258e24 100644 --- a/samples/cpp/image_generation/CMakeLists.txt +++ b/samples/cpp/image_generation/CMakeLists.txt @@ -80,7 +80,7 @@ install(TARGETS image2image RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin EXCLUDE_FROM_ALL) - + # create LoRA sample executable add_executable(inpainting inpainting.cpp load_image.cpp imwrite.cpp) @@ -96,4 +96,4 @@ set_target_properties(inpainting PROPERTIES install(TARGETS inpainting RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin - EXCLUDE_FROM_ALL) \ No newline at end of file + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt index b0ce8b1b60..b781192836 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt +++ b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt @@ -13,11 +13,11 @@ add_executable(${TARGET_NAME} ${TARGET_NAME}.cpp) target_link_libraries(${TARGET_NAME} PRIVATE openvino::genai) set_target_properties(${TARGET_NAME} PROPERTIES - COMPILE_PDB_NAME ${TARGET_NAME} + COMPILE_PDB_NAME ${TARGET_NAME} # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) -install(TARGETS ${TARGET_NAME} +install(TARGETS ${TARGET_NAME} RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin EXCLUDE_FROM_ALL) diff --git a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp index 8b48dbade0..854b1f5b12 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp +++ b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp @@ -19,7 +19,7 @@ int main(int argc, char* argv[]) try { std::string model_path = argv[1]; std::string prompt = argv[2]; - + std::string device = "CPU"; ov::genai::LLMPipeline pipe( diff --git a/samples/cpp/speculative_decoding_lm/CMakeLists.txt b/samples/cpp/speculative_decoding_lm/CMakeLists.txt index 7c48b6cc0b..fe90e00211 100644 --- a/samples/cpp/speculative_decoding_lm/CMakeLists.txt +++ b/samples/cpp/speculative_decoding_lm/CMakeLists.txt @@ -13,11 +13,11 @@ add_executable(${TARGET_NAME} ${TARGET_NAME}.cpp) target_link_libraries(${TARGET_NAME} PRIVATE openvino::genai) set_target_properties(${TARGET_NAME} PROPERTIES - COMPILE_PDB_NAME ${TARGET_NAME} + COMPILE_PDB_NAME ${TARGET_NAME} # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) -install(TARGETS ${TARGET_NAME} +install(TARGETS ${TARGET_NAME} RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin EXCLUDE_FROM_ALL) diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp index e10228863f..4da8309f3a 100644 --- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp +++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp @@ -21,7 +21,7 @@ int main(int argc, char* argv[]) try { std::string main_model_path = argv[1]; std::string draft_model_path = argv[2]; std::string prompt = argv[3]; - + // User can run main and draft model on different devices. // Please, set device for main model in `LLMPipeline` constructor and in in `ov::genai::draft_model` for draft. 
std::string main_device = "CPU", draft_device = "CPU"; diff --git a/samples/cpp/text_generation/encrypted_model_causal_lm.cpp b/samples/cpp/text_generation/encrypted_model_causal_lm.cpp index 3ea94d605f..ab3ca97d43 100644 --- a/samples/cpp/text_generation/encrypted_model_causal_lm.cpp +++ b/samples/cpp/text_generation/encrypted_model_causal_lm.cpp @@ -41,7 +41,7 @@ int main(int argc, char* argv[]) try { auto [model_str, model_weights] = decrypt_model(models_path + "/openvino_model.xml", models_path + "/openvino_model.bin"); ov::genai::Tokenizer tokenizer = decrypt_tokenizer(models_path); - + ov::genai::LLMPipeline pipe(model_str, model_weights, tokenizer, device); std::string result = pipe.generate(prompt, ov::genai::max_new_tokens(100)); diff --git a/samples/cpp/visual_language_chat/CMakeLists.txt b/samples/cpp/visual_language_chat/CMakeLists.txt index 9d3dea68a3..2cf56033fc 100644 --- a/samples/cpp/visual_language_chat/CMakeLists.txt +++ b/samples/cpp/visual_language_chat/CMakeLists.txt @@ -42,4 +42,4 @@ set_target_properties(benchmark_vlm PROPERTIES install(TARGETS benchmark_vlm RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin - EXCLUDE_FROM_ALL) \ No newline at end of file + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/visual_language_chat/benchmark_vlm.cpp b/samples/cpp/visual_language_chat/benchmark_vlm.cpp index 858a626fd5..e7a62ca3cf 100644 --- a/samples/cpp/visual_language_chat/benchmark_vlm.cpp +++ b/samples/cpp/visual_language_chat/benchmark_vlm.cpp @@ -42,15 +42,15 @@ int main(int argc, char* argv[]) try { size_t num_warmup = result["num_warmup"].as(); size_t num_iter = result["num_iter"].as(); ov::Tensor image = utils::load_image(image_path); - + ov::genai::GenerationConfig config; config.max_new_tokens = result["max_new_tokens"].as(); ov::genai::VLMPipeline pipe(models_path, device); - + for (size_t i = 0; i < num_warmup; i++) pipe.generate(prompt, ov::genai::image(image), ov::genai::generation_config(config)); - + auto res = pipe.generate(prompt, ov::genai::image(image), ov::genai::generation_config(config)); auto metrics = res.perf_metrics; for (size_t i = 0; i < num_iter - 1; i++) { diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt index af38558656..2df67fe558 100644 --- a/samples/export-requirements.txt +++ b/samples/export-requirements.txt @@ -10,4 +10,4 @@ diffusers==0.32.1 # For image generation pipelines timm==1.0.12 # For exporting InternVL2 torchvision # For visual language models transformers>=4.43 # For Whisper -hf_transfer # for faster models download, should used with env var HF_HUB_ENABLE_HF_TRANSFER=1 \ No newline at end of file +hf_transfer # for faster models download, should used with env var HF_HUB_ENABLE_HF_TRANSFER=1 diff --git a/samples/python/benchmark_genai/benchmark_genai.py b/samples/python/benchmark_genai/benchmark_genai.py index d2cc91e04d..d137a27707 100755 --- a/samples/python/benchmark_genai/benchmark_genai.py +++ b/samples/python/benchmark_genai/benchmark_genai.py @@ -12,31 +12,31 @@ def main(): parser.add_argument("-n", "--num_iter", type=int, default=2, help="Number of iterations") parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens") parser.add_argument("-d", "--device", type=str, default="CPU", help="Device") - + args = parser.parse_args() - # Perf metrics is stored in DecodedResults. + # Perf metrics is stored in DecodedResults. # In order to get DecodedResults instead of a string input should be a list. 
prompt = [args.prompt] models_path = args.model device = args.device num_warmup = args.num_warmup num_iter = args.num_iter - + config = ov_genai.GenerationConfig() config.max_new_tokens = args.max_new_tokens pipe = ov_genai.LLMPipeline(models_path, device) - + for _ in range(num_warmup): pipe.generate(prompt, config) - + res = pipe.generate(prompt, config) perf_metrics = res.perf_metrics for _ in range(num_iter - 1): res = pipe.generate(prompt, config) perf_metrics += res.perf_metrics - + print(f"Load time: {perf_metrics.get_load_time():.2f} ms") print(f"Generate time: {perf_metrics.get_generate_duration().mean:.2f} ± {perf_metrics.get_generate_duration().std:.2f} ms") print(f"Tokenization time: {perf_metrics.get_tokenization_duration().mean:.2f} ± {perf_metrics.get_tokenization_duration().std:.2f} ms") diff --git a/samples/python/image_generation/text2image.py b/samples/python/image_generation/text2image.py index cba1eefd1d..d7126cbe2e 100644 --- a/samples/python/image_generation/text2image.py +++ b/samples/python/image_generation/text2image.py @@ -29,4 +29,4 @@ def main(): if '__main__' == __name__: - main() \ No newline at end of file + main() diff --git a/samples/python/multinomial_causal_lm/multinomial_causal_lm.py b/samples/python/multinomial_causal_lm/multinomial_causal_lm.py index 5ec9d54601..2c7669f3ce 100755 --- a/samples/python/multinomial_causal_lm/multinomial_causal_lm.py +++ b/samples/python/multinomial_causal_lm/multinomial_causal_lm.py @@ -11,18 +11,18 @@ class IterableStreamer(openvino_genai.StreamerBase): """ A custom streamer class for handling token streaming and detokenization with buffering. - + Attributes: tokenizer (Tokenizer): The tokenizer used for encoding and decoding tokens. tokens_cache (list): A buffer to accumulate tokens for detokenization. text_queue (Queue): A synchronized queue for storing decoded text chunks. print_len (int): The length of the printed text to manage incremental decoding. """ - + def __init__(self, tokenizer): """ Initializes the IterableStreamer with the given tokenizer. - + Args: tokenizer (Tokenizer): The tokenizer to use for encoding and decoding tokens. """ @@ -37,14 +37,14 @@ def __iter__(self): Returns the iterator object itself. """ return self - + def __next__(self): """ Returns the next value from the text queue. - + Returns: str: The next decoded text chunk. - + Raises: StopIteration: If there are no more elements in the queue. """ @@ -52,20 +52,20 @@ def __next__(self): if value is None: raise StopIteration return value - + def get_stop_flag(self): """ Checks whether the generation process should be stopped. - + Returns: bool: Always returns False in this implementation. """ return False - + def put_word(self, word: str): """ Puts a word into the text queue. - + Args: word (str): The word to put into the queue. """ @@ -74,20 +74,20 @@ def put_word(self, word: str): def put(self, token_id: int) -> bool: """ Processes a token and manages the decoding buffer. Adds decoded text to the queue. - + Args: token_id (int): The token_id to process. - + Returns: bool: True if generation should be stopped, False otherwise. - """ + """ self.tokens_cache.append(token_id) text = self.tokenizer.decode(self.tokens_cache) word = '' if len(text) > self.print_len and '\n' == text[-1]: # Flush the cache after the new line symbol. 
- word = text[self.print_len:] + word = text[self.print_len:] self.tokens_cache = [] self.print_len = 0 elif len(text) >= 3 and text[-1] == chr(65533): @@ -98,8 +98,8 @@ def put(self, token_id: int) -> bool: # Print to output only if text length is increaesed. word = text[self.print_len:] self.print_len = len(text) - self.put_word(word) - + self.put_word(word) + if self.get_stop_flag(): # When generation is stopped from streamer then end is not called, need to call it here manually. self.end() @@ -161,7 +161,7 @@ def token_printer(): config.top_p = 0.9 config.top_k = 30 - # Since the streamer is set, the results will be printed + # Since the streamer is set, the results will be printed # every time a new token is generated and put into the streamer queue. pipe.generate(args.prompt, config, text_print_streamer) printer_thread.join() diff --git a/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py b/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py index 726391ba9b..ea06510db1 100755 --- a/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py +++ b/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py @@ -5,10 +5,10 @@ import argparse import openvino_genai -def streamer(subword): - print(subword, end='', flush=True) - # Return flag corresponds whether generation should be stopped. - # False means continue generation. +def streamer(subword): + print(subword, end='', flush=True) + # Return flag corresponds whether generation should be stopped. + # False means continue generation. return False def main(): @@ -20,7 +20,7 @@ def main(): device = 'CPU' pipe = openvino_genai.LLMPipeline(args.model_dir, device, prompt_lookup=True) - + config = openvino_genai.GenerationConfig() config.max_new_tokens = 100 # add parameter to enable prompt lookup decoding to generate `num_assistant_tokens` candidates per iteration @@ -28,7 +28,7 @@ def main(): # Define max_ngram_size config.max_ngram_size = 3 - # Since the streamer is set, the results will be printed + # Since the streamer is set, the results will be printed # every time a new token is generated and put into the streamer queue. pipe.generate(args.prompt, config, streamer) diff --git a/samples/python/speculative_decoding_lm/speculative_decoding_lm.py b/samples/python/speculative_decoding_lm/speculative_decoding_lm.py index 217b8a2730..96d01550fe 100755 --- a/samples/python/speculative_decoding_lm/speculative_decoding_lm.py +++ b/samples/python/speculative_decoding_lm/speculative_decoding_lm.py @@ -7,10 +7,10 @@ import queue import threading -def streamer(subword): - print(subword, end='', flush=True) - # Return flag corresponds whether generation should be stopped. - # False means continue generation. +def streamer(subword): + print(subword, end='', flush=True) + # Return flag corresponds whether generation should be stopped. + # False means continue generation. 
return False def main(): @@ -28,7 +28,7 @@ def main(): draft_model = openvino_genai.draft_model(args.draft_model_dir, draft_device) pipe = openvino_genai.LLMPipeline(args.model_dir, main_device, draft_model=draft_model) - + config = openvino_genai.GenerationConfig() config.max_new_tokens = 100 # Speculative decoding generation parameters like `num_assistant_tokens` and `assistant_confidence_threshold` are mutually excluded @@ -37,7 +37,7 @@ def main(): # add parameter to enable speculative decoding to generate candidates by draft_model while candidate probability is higher than `assistant_confidence_threshold` # config.assistant_confidence_threshold = 0.4 - # Since the streamer is set, the results will be printed + # Since the streamer is set, the results will be printed # every time a new token is generated and put into the streamer queue. pipe.generate(args.prompt, config, streamer) diff --git a/samples/python/whisper_speech_recognition/recorder.py b/samples/python/whisper_speech_recognition/recorder.py index e79f1f9008..7202c98357 100644 --- a/samples/python/whisper_speech_recognition/recorder.py +++ b/samples/python/whisper_speech_recognition/recorder.py @@ -15,7 +15,7 @@ sample_format = pyaudio.paInt16 # 16 bits per sample channels = 1 fs = 16000 # Record at 16k samples per second -seconds = 5 +seconds = 5 filename = "output.wav" p = pyaudio.PyAudio() # Create an interface to PortAudio @@ -34,7 +34,7 @@ data = stream.read(chunk) frames.append(data) -# Stop and close the stream +# Stop and close the stream stream.stop_stream() stream.close() # Terminate the PortAudio interface diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp index ed9fc3a30d..bac26639ce 100644 --- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp +++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp @@ -95,7 +95,7 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { /** * @brief Constructs a ContinuousBatchingPipeline from already existing model and tokenizer. - * + * * This constructor allows for the creation of a ContinuousBatchingPipeline using an existing model * represented as a string and a weights tensor, along with a manually initialized tokenizer. * This is useful when the model and tokenizer are already loaded or created in memory and do not diff --git a/src/cpp/include/openvino/genai/generation_handle.hpp b/src/cpp/include/openvino/genai/generation_handle.hpp index 7ff172e645..dcfb6dda2d 100644 --- a/src/cpp/include/openvino/genai/generation_handle.hpp +++ b/src/cpp/include/openvino/genai/generation_handle.hpp @@ -68,7 +68,7 @@ class OPENVINO_GENAI_EXPORTS GenerationHandleImpl { ov::genai::GenerationConfig m_sampling_params; bool is_dropped(); - + public: GenerationHandleImpl(std::shared_ptr generation_stream, const ov::genai::GenerationConfig& sampling_params) : m_generation_stream(std::move(generation_stream)), diff --git a/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp b/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp index 3dc1fc0803..3cbf177db0 100644 --- a/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp +++ b/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp @@ -18,7 +18,7 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { /** * Initializes text to image generation pipeline from a folder with models. 
* Note, such pipeline is not ready to use as models are not compiled internally. - * + * * Typical scenario is to initialize models using this constructor and then reshape pipeline * with 'reshape()' method and then perform compilation using 'compile()' method. * @param models_path A models path to read models and config files from @@ -30,7 +30,7 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { * @param models_path A models path to read models and config files from * @param device A single device used for all models * @param properties Properties to pass to 'compile_model' or other pipeline properties like LoRA adapters - * @note If you want to compile each model on a dedicated device or with specific properties, you can create + * @note If you want to compile each model on a dedicated device or with specific properties, you can create * models individually and then combine a final pipeline using static methods like 'latent_consistency_model' or * 'stable_diffusion_3'. See 'samples/cpp/image_generation/heterogeneous_stable_diffusion.cpp' for example */ @@ -172,7 +172,7 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { * @param guidance_scale A guidance scale. Note, that it's important whether guidance_scale > 1, which affects whether negative prompts * are used or not. For example, all values > 1 are the same for reshape perspective and may vary in subsequent 'generate()' calls. * @note If pipeline has been already compiled, it cannot be reshaped and an exception is thrown. - * + * * Example how to reshape SD3 or Flux models for specific max sequence length: * @code * ov::genai::Text2ImagePipeline pipe("/path"); diff --git a/src/cpp/include/openvino/genai/scheduler_config.hpp b/src/cpp/include/openvino/genai/scheduler_config.hpp index 45d98c51bb..54f5dd3699 100644 --- a/src/cpp/include/openvino/genai/scheduler_config.hpp +++ b/src/cpp/include/openvino/genai/scheduler_config.hpp @@ -46,8 +46,8 @@ struct SchedulerConfig { // Enable caching of KV-blocks. // When turned on all previously calculated KV-caches are kept in memory for future usages. // KV-caches can be rewritten if KV-cache limit is reached, but blocks are not released. - // This results in more RAM usage, maximum RAM usage is determined by cache_size or num_kv_blocks parameters. - // When turend off only KV-cache required for batch calculation is kept in memory and + // This results in more RAM usage, maximum RAM usage is determined by cache_size or num_kv_blocks parameters. + // When turend off only KV-cache required for batch calculation is kept in memory and // when a sequence has finished genegartion its cache is released. 
bool enable_prefix_caching = false; diff --git a/src/cpp/include/openvino/genai/visual_language/perf_metrics.hpp b/src/cpp/include/openvino/genai/visual_language/perf_metrics.hpp index 18476a5e7f..165eeed568 100644 --- a/src/cpp/include/openvino/genai/visual_language/perf_metrics.hpp +++ b/src/cpp/include/openvino/genai/visual_language/perf_metrics.hpp @@ -31,4 +31,4 @@ struct OPENVINO_GENAI_EXPORTS VLMPerfMetrics : public PerfMetrics { VLMRawPerfMetrics vlm_raw_metrics; }; -} \ No newline at end of file +} diff --git a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp index 43f8a9b8b3..7e4ebf4ce3 100644 --- a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp +++ b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp @@ -19,7 +19,7 @@ class OPENVINO_GENAI_EXPORTS VLMDecodedResults : public DecodedResults{ VLMPerfMetrics perf_metrics; }; -/// @brief A map of models for VLMPipeline constructor. +/// @brief A map of models for VLMPipeline constructor. /// Key is model name (e.g. "vision_embeddings", "text_embeddings", "language", "resampler") /// and value is a pair of model IR as string and weights as tensor. using ModelsMap = std::map>; diff --git a/src/cpp/src/cache_manager.hpp b/src/cpp/src/cache_manager.hpp index 20d4c0c51c..f57d3e1723 100644 --- a/src/cpp/src/cache_manager.hpp +++ b/src/cpp/src/cache_manager.hpp @@ -72,7 +72,7 @@ class CacheManager { value_roi_size_byte = m_value_cache[decoder_layer_id].get_byte_size(); key_cache_roi_end = static_cast(key_cache.data()) + key_roi_size_byte; value_cache_roi_end = static_cast(value_cache.data()) + value_roi_size_byte; - + // copy current cache data ov::Tensor dst_key_roi(key_cache, start_key, end_key); ov::Tensor dst_value_roi(value_cache, start_value, end_value); @@ -82,13 +82,13 @@ class CacheManager { } - // Some optimizations like AVX2, AVX512, AMX require a minimal shape and - // perform multiplying by zero on the excess data. Uninitialized tensor data contain NAN's, + // Some optimizations like AVX2, AVX512, AMX require a minimal shape and + // perform multiplying by zero on the excess data. Uninitialized tensor data contain NAN's, // so NAN * 0 returns non-zero invalid data. // So we need to set zeros to all newly allocated tensors data. 
std::memset(key_cache_roi_end, 0, key_cache.get_byte_size() - key_roi_size_byte); std::memset(value_cache_roi_end, 0, value_cache.get_byte_size() - value_roi_size_byte); - + // set new cache tensors if (m_key_cache.size() > decoder_layer_id) { m_key_cache[decoder_layer_id] = key_cache; @@ -110,7 +110,7 @@ class CacheManager { key_cache_shape); ov::Tensor value_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), value_cache_shape); - + if (m_key_cache.size() > decoder_layer_id) { ov::Coordinate end_key = m_key_cache[decoder_layer_id].get_shape(); ov::Coordinate end_value = m_value_cache[decoder_layer_id].get_shape(); @@ -155,7 +155,7 @@ class CacheManager { ov::Coordinate key_src_end_roi = key_shape; ov::Coordinate key_dst_start_roi(key_shape.size(), 0); ov::Coordinate key_dst_end_roi = key_shape; - + ov::Coordinate value_src_start_roi(value_shape.size(), 0); ov::Coordinate value_src_end_roi = value_shape; ov::Coordinate value_dst_start_roi(value_shape.size(), 0); diff --git a/src/cpp/src/continuous_batching_adapter.hpp b/src/cpp/src/continuous_batching_adapter.hpp index 0b0065aa1f..bb537ae709 100644 --- a/src/cpp/src/continuous_batching_adapter.hpp +++ b/src/cpp/src/continuous_batching_adapter.hpp @@ -50,7 +50,7 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase { const ov::AnyMap& plugin_config, const ov::genai::GenerationConfig& generation_config ): LLMPipelineImplBase{tokenizer, GenerationConfig()}, m_impl{ - model_str, + model_str, weights_tensor, tokenizer, scheduler_config, diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index 25402e22e7..046573a88d 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -210,7 +210,7 @@ void GenerationConfig::validate() const { // Sampling strategies - OPENVINO_ASSERT(num_return_sequences == 1 || (is_multinomial() || is_beam_search()), + OPENVINO_ASSERT(num_return_sequences == 1 || (is_multinomial() || is_beam_search()), "'num_return_sequences' can be more than 1 only in case of beam search or multinomial sampling, but got ", num_return_sequences); // generic penalties, but not supported by beam search currently diff --git a/src/cpp/src/icontinuous_batching.hpp b/src/cpp/src/icontinuous_batching.hpp index 12030f06f7..a44ac63051 100644 --- a/src/cpp/src/icontinuous_batching.hpp +++ b/src/cpp/src/icontinuous_batching.hpp @@ -61,7 +61,7 @@ class ContinuousBatchingPipeline::IContinuousBatchingPipeline { virtual GenerationHandle add_request(uint64_t request_id, const std::string& prompt, ov::genai::GenerationConfig sampling_params) = 0; - + /** * Checks whether server (pipeline) has non-finished requests and step() should be called within a loop */ @@ -90,7 +90,7 @@ class ContinuousBatchingPipeline::IContinuousBatchingPipeline { /** * Starts chat with a given system prompt - * + * * In chat scenario prompts passed to `generate` method are accumulated inside the pipeline until `finish_chat` is called */ void start_chat(const std::string& system_message); @@ -100,4 +100,4 @@ class ContinuousBatchingPipeline::IContinuousBatchingPipeline { */ void finish_chat(); }; -} \ No newline at end of file +} diff --git a/src/cpp/src/image_generation/flux_pipeline.hpp b/src/cpp/src/image_generation/flux_pipeline.hpp index e74cd441ce..458caf001b 100644 --- a/src/cpp/src/image_generation/flux_pipeline.hpp +++ b/src/cpp/src/image_generation/flux_pipeline.hpp @@ -252,7 +252,7 @@ class FluxPipeline : public DiffusionPipeline { m_vae->compile(device, properties); 
m_transformer->compile(device, properties); } - + void compute_hidden_states(const std::string& positive_prompt, const ImageGenerationConfig& generation_config) override { // encode_prompt std::string prompt_2_str = generation_config.prompt_2 != std::nullopt ? *generation_config.prompt_2 : positive_prompt; diff --git a/src/cpp/src/image_generation/inpainting_pipeline.cpp b/src/cpp/src/image_generation/inpainting_pipeline.cpp index a510be0a57..0bbd1ca7ef 100644 --- a/src/cpp/src/image_generation/inpainting_pipeline.cpp +++ b/src/cpp/src/image_generation/inpainting_pipeline.cpp @@ -19,7 +19,7 @@ namespace genai { InpaintingPipeline::InpaintingPipeline(const std::filesystem::path& root_dir) { const std::string class_name = get_class_name(root_dir); - if (class_name == "StableDiffusionPipeline" || + if (class_name == "StableDiffusionPipeline" || class_name == "LatentConsistencyModelPipeline" || class_name == "StableDiffusionInpaintPipeline") { m_impl = std::make_shared(PipelineType::INPAINTING, root_dir); diff --git a/src/cpp/src/image_generation/models/unet_inference.hpp b/src/cpp/src/image_generation/models/unet_inference.hpp index ae928aac30..7671f1457d 100644 --- a/src/cpp/src/image_generation/models/unet_inference.hpp +++ b/src/cpp/src/image_generation/models/unet_inference.hpp @@ -65,4 +65,4 @@ class UNet2DConditionModel::UNetInference { }; } // namespace genai -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp b/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp index dd265e3eca..f8d36b1d30 100644 --- a/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp +++ b/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp @@ -44,4 +44,4 @@ class UNet2DConditionModel::UNetInferenceDynamic : public UNet2DConditionModel:: }; } // namespace genai -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp b/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp index f63a8ea237..b2199c1a66 100644 --- a/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp +++ b/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp @@ -17,7 +17,7 @@ class UNet2DConditionModel::UNetInferenceStaticBS1 : public UNet2DConditionModel const std::string& device, const ov::AnyMap& properties) override { - // All shapes for input/output tensors should be static. + // All shapes for input/output tensors should be static. // Double check this and throw runtime error if it's not the case. 
for (auto& input : model->inputs()) { OPENVINO_ASSERT(!input.get_partial_shape().is_dynamic(), @@ -142,4 +142,4 @@ class UNet2DConditionModel::UNetInferenceStaticBS1 : public UNet2DConditionModel }; } // namespace genai -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/cpp/src/image_generation/schedulers/ddim.cpp b/src/cpp/src/image_generation/schedulers/ddim.cpp index 768ba56837..a8c107fbfc 100644 --- a/src/cpp/src/image_generation/schedulers/ddim.cpp +++ b/src/cpp/src/image_generation/schedulers/ddim.cpp @@ -36,7 +36,7 @@ DDIMScheduler::Config::Config(const std::filesystem::path& scheduler_config_path read_json_param(data, "rescale_betas_zero_snr", rescale_betas_zero_snr); } -DDIMScheduler::DDIMScheduler(const std::filesystem::path& scheduler_config_path) +DDIMScheduler::DDIMScheduler(const std::filesystem::path& scheduler_config_path) : DDIMScheduler(Config(scheduler_config_path)) { } diff --git a/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp b/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp index a63a073cfc..7b8ab245a5 100644 --- a/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp +++ b/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp @@ -18,7 +18,7 @@ EulerAncestralDiscreteScheduler::Config::Config(const std::filesystem::path& sch nlohmann::json data = nlohmann::json::parse(file); using utils::read_json_param; - + read_json_param(data, "num_train_timesteps", num_train_timesteps); read_json_param(data, "beta_start", beta_start); read_json_param(data, "beta_end", beta_end); @@ -30,7 +30,7 @@ EulerAncestralDiscreteScheduler::Config::Config(const std::filesystem::path& sch read_json_param(data, "rescale_betas_zero_snr", rescale_betas_zero_snr); } -EulerAncestralDiscreteScheduler::EulerAncestralDiscreteScheduler(const std::filesystem::path& scheduler_config_path) +EulerAncestralDiscreteScheduler::EulerAncestralDiscreteScheduler(const std::filesystem::path& scheduler_config_path) : EulerAncestralDiscreteScheduler(Config(scheduler_config_path)) { } diff --git a/src/cpp/src/image_generation/schedulers/lms_discrete.cpp b/src/cpp/src/image_generation/schedulers/lms_discrete.cpp index d8c3c23745..767c14311d 100644 --- a/src/cpp/src/image_generation/schedulers/lms_discrete.cpp +++ b/src/cpp/src/image_generation/schedulers/lms_discrete.cpp @@ -108,7 +108,7 @@ LMSDiscreteScheduler::Config::Config(const std::filesystem::path& scheduler_conf read_json_param(data, "steps_offset", steps_offset); } -LMSDiscreteScheduler::LMSDiscreteScheduler(const std::filesystem::path& scheduler_config_path) +LMSDiscreteScheduler::LMSDiscreteScheduler(const std::filesystem::path& scheduler_config_path) : LMSDiscreteScheduler(Config(scheduler_config_path)) { } @@ -146,7 +146,7 @@ float LMSDiscreteScheduler::get_init_noise_sigma() const { m_config.timestep_spacing == TimestepSpacing::TRAILING) { return max_sigma; } - + return std::sqrt(max_sigma * max_sigma + 1); } @@ -175,7 +175,7 @@ void LMSDiscreteScheduler::set_timesteps(size_t num_inference_steps, float stren } m_sigmas.push_back(0.f); - + // initialize timesteps for (size_t i = 0; i < num_inference_steps; ++i) { int64_t timestep = _sigma_to_t(m_sigmas[i]); @@ -206,7 +206,7 @@ std::map LMSDiscreteScheduler::step(ov::Tensor noise_pr break; case PredictionType::V_PREDICTION: // pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) - pred_latent = noise_pred.data()[j] * (-sigma / std::sqrt(sigma * sigma + 1.0f) 
+ + pred_latent = noise_pred.data()[j] * (-sigma / std::sqrt(sigma * sigma + 1.0f) + latents.data()[j] / (sigma * sigma + 1.0f)); break; default: diff --git a/src/cpp/src/image_generation/schedulers/pndm.cpp b/src/cpp/src/image_generation/schedulers/pndm.cpp index 4ddc099d0e..3900fd6b92 100644 --- a/src/cpp/src/image_generation/schedulers/pndm.cpp +++ b/src/cpp/src/image_generation/schedulers/pndm.cpp @@ -31,7 +31,7 @@ PNDMScheduler::Config::Config(const std::filesystem::path& scheduler_config_path read_json_param(data, "timestep_spacing", timestep_spacing); } -PNDMScheduler::PNDMScheduler(const std::filesystem::path& scheduler_config_path) +PNDMScheduler::PNDMScheduler(const std::filesystem::path& scheduler_config_path) : PNDMScheduler(Config(scheduler_config_path)) { } diff --git a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp index e3e720109d..04f0f5f9e0 100644 --- a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp @@ -356,7 +356,7 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { ov::Tensor negative_pooled_prompt_2_embed_out = get_tensor_batch(text_encoder_2_output, 0); ov::Tensor negative_prompt_2_embed_out = get_tensor_batch(text_encoder_2_hidden_state, 0); ov::Tensor negative_t5_prompt_embed_out = get_tensor_batch(text_encoder_3_output, 0); - + ov::Tensor negative_pooled_prompt_embed, negative_prompt_embed, negative_pooled_prompt_2_embed, negative_prompt_2_embed, negative_t5_prompt_embed; if (generation_config.num_images_per_prompt == 1) { diff --git a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp index 3801c855fd..b789b1e7e0 100644 --- a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp @@ -371,7 +371,7 @@ class StableDiffusionPipeline : public DiffusionPipeline { ov::Shape noise_pred_shape = noise_pred_tensor.get_shape(); noise_pred_shape[0] /= batch_size_multiplier; - + if (batch_size_multiplier > 1) { noisy_residual_tensor.set_shape(noise_pred_shape); diff --git a/src/cpp/src/image_generation/text2image_pipeline.cpp b/src/cpp/src/image_generation/text2image_pipeline.cpp index 56b02a2e10..87af2c1dbd 100644 --- a/src/cpp/src/image_generation/text2image_pipeline.cpp +++ b/src/cpp/src/image_generation/text2image_pipeline.cpp @@ -20,7 +20,7 @@ namespace genai { Text2ImagePipeline::Text2ImagePipeline(const std::filesystem::path& root_dir) { const std::string class_name = get_class_name(root_dir); - if (class_name == "StableDiffusionPipeline" || + if (class_name == "StableDiffusionPipeline" || class_name == "LatentConsistencyModelPipeline") { m_impl = std::make_shared(PipelineType::TEXT_2_IMAGE, root_dir); } else if (class_name == "StableDiffusionXLPipeline") { diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 11efed8b32..9a0d3da21c 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -18,7 +18,7 @@ namespace genai { namespace { -/* +/* * NPU reads some properties from the config file, but when LLMPipeline is initialized * from the model_str and weights_tensor, there are no files. * In the later case ModelDesc is stored in properties. 
@@ -37,7 +37,7 @@ std::pair split_model_descr( pop_property(main_properties, "name_or_path", model_descr.name_or_path); pop_property(main_properties, "type", model_descr.type); pop_property(main_properties, "num_key_value_heads", model_descr.num_key_value_heads); - + return {main_properties, model_descr}; } @@ -62,7 +62,7 @@ std::pair draft_model( const std::string& device, const ov::AnyMap& properties) { auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties); - + std::filesystem::path openvino_model_name = "openvino_model.xml"; auto model = utils::singleton_core().read_model(models_path / openvino_model_name, {}, plugin_config); auto generation_config = utils::from_config_json_if_exists(models_path); @@ -99,8 +99,8 @@ ov::genai::LLMPipeline::LLMPipeline( const std::string& device, const ov::AnyMap& properties) { auto start_time = std::chrono::steady_clock::now(); - if (properties.find(ov::genai::scheduler_config.name()) != properties.end() || - properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || + if (properties.find(ov::genai::scheduler_config.name()) != properties.end() || + properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || properties.find(ov::genai::prompt_lookup.name()) != properties.end()) { auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties); m_pimpl = std::make_unique(models_path, tokenizer, scheduler_config, device, plugin_config); @@ -118,8 +118,8 @@ ov::genai::LLMPipeline::LLMPipeline( const ov::AnyMap& properties) { auto start_time = std::chrono::steady_clock::now(); - if (properties.find(ov::genai::scheduler_config.name()) != properties.end() || - properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || + if (properties.find(ov::genai::scheduler_config.name()) != properties.end() || + properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || properties.find(ov::genai::prompt_lookup.name()) != properties.end()) { auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties); m_pimpl = std::make_unique(models_path, scheduler_config, device, device_properties); @@ -141,8 +141,8 @@ ov::genai::LLMPipeline::LLMPipeline( const ov::genai::GenerationConfig& generation_config) { auto start_time = std::chrono::steady_clock::now(); - if (properties.find(ov::genai::scheduler_config.name()) != properties.end() || - properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || + if (properties.find(ov::genai::scheduler_config.name()) != properties.end() || + properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || properties.find(ov::genai::prompt_lookup.name()) != properties.end()){ auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties); @@ -150,20 +150,20 @@ ov::genai::LLMPipeline::LLMPipeline( tokenizer, scheduler_config, device, device_properties, generation_config); } else if (device == "NPU") { // TODO: CVS-158771 Currently, it's a workaround. Probably there is a better solution. - // NPU reads some properties from the config file, but when LLMPipeline is initialized - // from the model_str and weights_tensor, there is no files. + // NPU reads some properties from the config file, but when LLMPipeline is initialized + // from the model_str and weights_tensor, there is no files. // Therefore, we need to pass these properties manually. // This is necessary only for NPU, for other plugins can be ommited. 
// Example of usage: - // ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"}, - // {"type", "llama"}, + // ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"}, + // {"type", "llama"}, // {"num_key_value_heads", 32}}; // ov::genai::LLMPipeline pipe(model_str,..., model_descr_properties); // This will convert from AnyMap to ModelDesc. auto [filtered_properties, model_descr] = split_model_descr(properties); m_pimpl = static_llm::LLMPipelineFactory::create( - utils::singleton_core().read_model(model_str, weights_tensor), + utils::singleton_core().read_model(model_str, weights_tensor), model_descr, tokenizer, device, diff --git a/src/cpp/src/llm_pipeline_stateful.hpp b/src/cpp/src/llm_pipeline_stateful.hpp index dbf8d89391..28f5bc1b43 100644 --- a/src/cpp/src/llm_pipeline_stateful.hpp +++ b/src/cpp/src/llm_pipeline_stateful.hpp @@ -22,7 +22,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { // Tail of previous output in chat mode is missing in KV cache, let's keep it std::optional m_last_disappeared_token = std::nullopt; // If sequence contains some symbols, which could be ambiguously encoded by tokenizer, we need to trim kv cache - // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add best answer to history + // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add best answer to history // so, let's keep info about amount of tokens to trim from kv cache and amount of tokens to keep in history ov::genai::utils::HistoryRemoveManager m_kv_history_manager = {0, 0}; size_t m_kv_cache_seq_length_axis = 2; diff --git a/src/cpp/src/logit_processor.hpp b/src/cpp/src/logit_processor.hpp index a81a33017d..6bbe0e896b 100644 --- a/src/cpp/src/logit_processor.hpp +++ b/src/cpp/src/logit_processor.hpp @@ -29,7 +29,7 @@ struct Logits { OPENVINO_ASSERT(m_vector.size() == 0, "Logits vector already initialized"); m_vector.reserve(m_size); for (size_t i = 0; i < m_size; i++) - m_vector.emplace_back(m_data[i], i); + m_vector.emplace_back(m_data[i], i); } bool is_vector_initialized() const { @@ -59,8 +59,8 @@ class TopPFilter : public ILogitTransformer { TopPFilter(double top_p) : m_top_p(top_p) {} bool partial_sort_and_resize(Logits& logits) { - // Since most of the time huge part of logits vector contains minimal values - // expensive sorting of entire vector might be unnecessary, especially for low values of top_p. + // Since most of the time huge part of logits vector contains minimal values + // expensive sorting of entire vector might be unnecessary, especially for low values of top_p. // This method partially sorts vector finding M top elements and stops when top_p condition is met. // It iterates a few times starting with M = 16 and multiplying it by 2 each iteration until M = 1024. // If top_p is found in considered scope it resizes logits vector and returns true. Otherwise it returns false. 
@@ -111,9 +111,9 @@ class TopKFilter : public ILogitTransformer { // If this transform is used along with top_p, it should be applied after it since top_p sorts entire vector and top_k does it only partially void apply(Logits& logits) override { - if (m_top_k >= logits.m_size) + if (m_top_k >= logits.m_size) return; - + // If top_p is also used vector is already initialized and sorted if (!logits.is_vector_initialized()) { // Initialize and partially sort vector @@ -234,7 +234,7 @@ class RepetitionPenaltyTransform : public IPenaltyTransformer { class EOSPenaltyTransform : public ILogitTransformer { public: - EOSPenaltyTransform(const std::set& stop_token_ids, size_t min_generated_tokens) : + EOSPenaltyTransform(const std::set& stop_token_ids, size_t min_generated_tokens) : m_stop_token_ids(stop_token_ids), m_applicable_tensor_len(min_generated_tokens) {} void apply(Logits& logits) override { @@ -243,7 +243,7 @@ class EOSPenaltyTransform : public ILogitTransformer { for (auto stop_token_id: m_stop_token_ids) logits.m_data[stop_token_id] = 0.f; } - + bool is_applicable(size_t generated_tokens_cnt = 0) override { return generated_tokens_cnt < m_applicable_tensor_len; @@ -310,7 +310,7 @@ class PresencePenaltyTransform : public IPenaltyTransformer { class LogitProcessor { protected: std::vector> m_logit_transformers; - + std::shared_ptr> m_unique_generated_token_ids = std::shared_ptr>(new std::map); std::shared_ptr> m_unique_prompt_token_ids = std::shared_ptr>(new std::set); size_t m_generated_tokens = 0; @@ -334,21 +334,21 @@ class LogitProcessor { if (sampling_params.is_multinomial() || sampling_params.is_greedy_decoding()) { if (sampling_params.repetition_penalty != 1.0f) { - std::shared_ptr transformer = + std::shared_ptr transformer = std::shared_ptr(new LogitTransformers::RepetitionPenaltyTransform(sampling_params.repetition_penalty)); transformer->set_unique_prompt_token_ids(m_unique_prompt_token_ids); transformer->set_unique_generated_token_ids(m_unique_generated_token_ids); m_logit_transformers.push_back(transformer); } if (sampling_params.presence_penalty != 0.0f) { - std::shared_ptr transformer = - std::shared_ptr(new LogitTransformers::PresencePenaltyTransform(sampling_params.presence_penalty)); + std::shared_ptr transformer = + std::shared_ptr(new LogitTransformers::PresencePenaltyTransform(sampling_params.presence_penalty)); transformer->set_unique_generated_token_ids(m_unique_generated_token_ids); m_logit_transformers.push_back(transformer); - + } if (sampling_params.frequency_penalty != 0.0f) { - std::shared_ptr transformer = + std::shared_ptr transformer = std::shared_ptr(new LogitTransformers::FrequencyPenaltyTransform(sampling_params.frequency_penalty)); transformer->set_unique_generated_token_ids(m_unique_generated_token_ids); m_logit_transformers.push_back(transformer); diff --git a/src/cpp/src/lora_helper.cpp b/src/cpp/src/lora_helper.cpp index 5d836832dd..1a8c49a281 100644 --- a/src/cpp/src/lora_helper.cpp +++ b/src/cpp/src/lora_helper.cpp @@ -28,4 +28,4 @@ bool update_adapters_from_properties (const AnyMap& properties, std::optional extract_adapters_from_properties (const AnyMap& properties bool update_adapters_from_properties (const AnyMap& properties, std::optional& adapter_config); } -} \ No newline at end of file +} diff --git a/src/cpp/src/lora_names_mapping.cpp b/src/cpp/src/lora_names_mapping.cpp index 0be3e740b4..b63beb0319 100644 --- a/src/cpp/src/lora_names_mapping.cpp +++ b/src/cpp/src/lora_names_mapping.cpp @@ -336,4 +336,4 @@ NameMap 
maybe_map_non_diffusers_lora_to_diffusers(const std::set& k } -} \ No newline at end of file +} diff --git a/src/cpp/src/make_tokenizer_stateful.cpp b/src/cpp/src/make_tokenizer_stateful.cpp index 4685b0e715..f6a6022454 100644 --- a/src/cpp/src/make_tokenizer_stateful.cpp +++ b/src/cpp/src/make_tokenizer_stateful.cpp @@ -24,12 +24,12 @@ bool ov::genai::MakeCombineSegmentsSatateful::run_on_model(const std::shared_ptr if (!combine_seg_node || combine_seg_node->input_value(1).get_element_type() != ov::element::i32) { return false; } - + std::shared_ptr input_1_const = std::dynamic_pointer_cast(combine_seg_node->get_input_node_shared_ptr(1)); if (!input_1_const) { return false; } - + op::util::VariableInfo var_info{ov::Shape{}, ov::element::boolean, ADD_SPECIAL_TOKENS_VAR_ID}; auto variable = std::make_shared(var_info); @@ -41,7 +41,7 @@ bool ov::genai::MakeCombineSegmentsSatateful::run_on_model(const std::shared_ptr combine_seg_node->input(1).replace_source_output(select_node->output(0)); auto assign = std::make_shared(read_value, variable); - + model->add_sinks({assign}); model->add_variables({variable}); return true; @@ -58,7 +58,7 @@ bool ov::genai::MakeVocabDecoderSatateful::run_on_model(const std::shared_ptrinput_value(4).get_element_type().is_integral_number()) return false; - + std::shared_ptr skip_tokens_const = std::dynamic_pointer_cast(vocab_decoder_node->get_input_node_shared_ptr(4)); std::shared_ptr skip_tokens_slice = std::dynamic_pointer_cast(vocab_decoder_node->get_input_node_shared_ptr(4)); if (!skip_tokens_const && !skip_tokens_slice) @@ -67,7 +67,7 @@ bool ov::genai::MakeVocabDecoderSatateful::run_on_model(const std::shared_ptr(ov::element::i32, ov::Shape{1}, std::vector{0}); auto int_max_const = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{std::numeric_limits::max()}); auto one_const = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{1}); - + // By default, INT_MAX will multiply with 1 and all skip_tokens will be selected. op::util::VariableInfo var_info{ov::Shape{1}, ov::element::i32, SKIP_SPECIAL_TOKENS_VAR_ID}; auto variable = std::make_shared(var_info); @@ -82,7 +82,7 @@ bool ov::genai::MakeVocabDecoderSatateful::run_on_model(const std::shared_ptr slice_node = std::make_shared(skip_tokens_const, start_const, stop, one_const); vocab_decoder_node->input(4).replace_source_output(slice_node->output(0)); } - + auto assign = std::make_shared(read_value, variable); model->add_sinks({assign}); model->add_variables({variable}); diff --git a/src/cpp/src/make_tokenizer_stateful.hpp b/src/cpp/src/make_tokenizer_stateful.hpp index 307c6199c8..1530b50b04 100644 --- a/src/cpp/src/make_tokenizer_stateful.hpp +++ b/src/cpp/src/make_tokenizer_stateful.hpp @@ -7,10 +7,10 @@ namespace ov { namespace genai { -/** +/** * @brief This pass modifies tokenizer ov::Model so that special tokens adding will be * enabled or disabled depending on stateful value. - * + * * +--------------+ * | DefaultMode | * +--------------+ @@ -38,10 +38,10 @@ class MakeCombineSegmentsSatateful : public ov::pass::ModelPass { bool run_on_model(const std::shared_ptr& model) override; }; -/** +/** * @brief This pass modifies tokenizer ov::Model so that special tokens adding will be * enabled or disabled depending on stateful value. 
- * + * * +--------------+ * | DefaultMode | * +--------------+ diff --git a/src/cpp/src/model_runner.hpp b/src/cpp/src/model_runner.hpp index 27eee9e27d..64d1647351 100644 --- a/src/cpp/src/model_runner.hpp +++ b/src/cpp/src/model_runner.hpp @@ -105,7 +105,7 @@ class ModelRunner { int64_t * input_ids_data = input_ids.data(), * position_ids_data = position_ids.data(); - int32_t + int32_t * past_lens_data = past_lens.data(), * subsequence_begins_data = subsequence_begins.data(), * block_indices_begins_data = block_indices_begins.data(); diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp index 3725dc0cfc..c326046e3c 100644 --- a/src/cpp/src/perf_metrics.cpp +++ b/src/cpp/src/perf_metrics.cpp @@ -15,12 +15,12 @@ ov::genai::MeanStdPair calc_mean_and_std(const std::vector float { return acc + duration.count() / 1000.0f; }); mean /= durations.size(); - + float sum_square_durations = std::accumulate(durations.begin(), durations.end(), 0.0f, [](const float& acc, const ov::genai::MicroSeconds& duration) -> float { auto d = duration.count() / 1000.0f; @@ -103,10 +103,10 @@ void PerfMetrics::evaluate_statistics(std::optional start_time) { raw_metrics.m_times_to_first_token = std::vector(); raw_metrics.m_times_to_first_token.emplace_back(ttft); num_generated_tokens = batch_sizes[0]; - + // The very first infer request (prefill stage) is slower than subsequent ones since we process a sequence of tokens. - // To have a clearer TPOT number, the time taken to generate the very first token at the prefill stage - // must not be included in the TPOT calculation. The first duration used for TPOT is from the first token + // To have a clearer TPOT number, the time taken to generate the very first token at the prefill stage + // must not be included in the TPOT calculation. The first duration used for TPOT is from the first token // to the second token, not from the start time to the first token. for (size_t i = 1; i < tok_times.size(); ++i) { // If in 10 ms a batch of 5 new tokens is generated then TPOT is 10 / 5 = 2 tok/ms. @@ -114,7 +114,7 @@ void PerfMetrics::evaluate_statistics(std::optional start_time) { num_generated_tokens += batch_sizes[i]; } } - + // calc_mean_and_std will convert microsecond to milliseconds. tpot = calc_mean_and_std(raw_metrics.m_durations); ipot = calc_mean_and_std(raw_metrics.m_token_infer_durations); @@ -132,7 +132,7 @@ void PerfMetrics::evaluate_statistics(std::optional start_time) { PerfMetrics PerfMetrics::operator+(const PerfMetrics& right) const { OPENVINO_ASSERT(right.load_time == load_time, "generation metrics can be accumulated only for the same pipeline"); - + // Copy left value to res. 
PerfMetrics res = *this; @@ -143,7 +143,7 @@ PerfMetrics PerfMetrics::operator+(const PerfMetrics& right) const { auto& right_durations = right.raw_metrics.m_durations; auto& right_batch_sizes = right.raw_metrics.m_batch_sizes; auto& right_times_to_first_token = right.raw_metrics.m_times_to_first_token; - + new_durations.insert(new_durations.end(), right_durations.begin(), right_durations.end()); new_times_to_first_token.insert(new_times_to_first_token.end(), right_times_to_first_token.begin(), right_times_to_first_token.end()); new_batch_sizes.insert(new_batch_sizes.end(), right_batch_sizes.begin(), right_batch_sizes.end()); @@ -155,7 +155,7 @@ PerfMetrics PerfMetrics::operator+(const PerfMetrics& right) const { auto& right_tok_durations = right.raw_metrics.tokenization_durations; auto& right_detok_durations = right.raw_metrics.detokenization_durations; auto& right_gen_durations = right.raw_metrics.generate_durations; - + new_tok_durations.insert(new_tok_durations.end(), right_tok_durations.begin(), right_tok_durations.end()); new_detok_durations.insert(new_detok_durations.end(), right_detok_durations.begin(), right_detok_durations.end()); new_gen_durations.insert(new_gen_durations.end(), right_gen_durations.begin(), right_gen_durations.end()); diff --git a/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.cpp b/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.cpp index ffc8a8aab2..0482c5de11 100644 --- a/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.cpp +++ b/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.cpp @@ -83,4 +83,4 @@ void ContinuousBatchingPipeline::ContinuousBatchingForPromptLookupImpl::generate } } -} \ No newline at end of file +} diff --git a/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.hpp b/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.hpp index 8962aba0f2..3f1b66003e 100644 --- a/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.hpp +++ b/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.hpp @@ -27,7 +27,7 @@ class ContinuousBatchingPipeline::ContinuousBatchingForPromptLookupImpl : public properties, generation_config, true } {}; - + void generate_candidates(); // { generated_len, validation_len } @@ -37,4 +37,4 @@ class ContinuousBatchingPipeline::ContinuousBatchingForPromptLookupImpl : public protected: TokenIds generate_candidates(const TokenIds& input_ids, size_t num_pred_tokens, size_t max_ngram_size); }; -} \ No newline at end of file +} diff --git a/src/cpp/src/prompt_lookup/prompt_lookup_impl.cpp b/src/cpp/src/prompt_lookup/prompt_lookup_impl.cpp index 7a893a2603..a11db29d29 100644 --- a/src/cpp/src/prompt_lookup/prompt_lookup_impl.cpp +++ b/src/cpp/src/prompt_lookup/prompt_lookup_impl.cpp @@ -58,7 +58,7 @@ void ContinuousBatchingPipeline::PromptLookupImpl::step() { num_matches = (present_req_len - prev_full_req_len - 1); acceptance_rate = static_cast(num_matches) / static_cast(prev_validation_len); - } + } m_sd_metrics.update_acceptance_rate(request_id, acceptance_rate * 100); m_sd_metrics.update_draft_accepted_tokens(request_id, num_matches); } @@ -103,8 +103,8 @@ ContinuousBatchingPipeline::PromptLookupImpl::generate(const std::vector main_generations; for (size_t request_id = 0; request_id < input_ids.size(); ++request_id) { - OPENVINO_ASSERT(1 == input_ids[request_id].get_shape().at(0), "Use multiple tensors to pass a batch."); - OPENVINO_ASSERT(sampling_params[request_id].is_prompt_lookup(), "`max_ngram_size` && 
`num_assistant_tokens` should be specified for `prompt lookup decoding`"); + OPENVINO_ASSERT(1 == input_ids[request_id].get_shape().at(0), "Use multiple tensors to pass a batch."); + OPENVINO_ASSERT(sampling_params[request_id].is_prompt_lookup(), "`max_ngram_size` && `num_assistant_tokens` should be specified for `prompt lookup decoding`"); main_generations.push_back(m_pipeline->add_request(request_id, input_ids[request_id], sampling_params[request_id])); } diff --git a/src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp b/src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp index 0c05c2afd0..d3c67853a1 100644 --- a/src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp +++ b/src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp @@ -46,4 +46,4 @@ class ContinuousBatchingPipeline::PromptLookupImpl : public ContinuousBatchingPi SpeculativeDecodingMetrics get_metrics(); }; -} \ No newline at end of file +} diff --git a/src/cpp/src/safetensors.c b/src/cpp/src/safetensors.c index 61559882c6..d128eb1bee 100644 --- a/src/cpp/src/safetensors.c +++ b/src/cpp/src/safetensors.c @@ -1,2 +1,2 @@ #define SAFETENSORS_IMPLEMENTATION -#include "safetensors.h" \ No newline at end of file +#include "safetensors.h" diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index 54850f657b..6a0b1f4465 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -4,7 +4,7 @@ #include "sampler.hpp" namespace ov::genai { -// Modified Knuth–Morris–Pratt algorithm which returns tokens following after every needle occurrence in haystack +// Modified Knuth-Morris-Pratt algorithm which returns tokens following after every needle occurrence in haystack std::vector kmp_search(const std::vector& haystack, const std::vector& needle) { if (needle.empty()) { // no_repeat_ngram_size == 1, ban every token return {haystack.begin(), haystack.end()}; @@ -159,7 +159,7 @@ int match_stop_string2(Tokenizer & tokenizer, const TokenIds & generated_tokens, std::vector last_generated_tokens(generated_tokens.end()-num_tokens, generated_tokens.end()); if (stop_tokens == last_generated_tokens) return num_tokens; - + // Continue checking chunks of 4 tokens num_tokens += 4; while (num_tokens <= generated_tokens.size()) { @@ -188,7 +188,7 @@ void Sampler::GroupBeamSearcher::finalize(SamplerOutput& sampler_output) { // mark current sequence as finished beam.m_sequence->set_status(SequenceStatus::FINISHED); - // Setting length since this function is used when sequence generated tokens number reaches max_new_tokens + // Setting length since this function is used when sequence generated tokens number reaches max_new_tokens beam.m_sequence->set_finish_reason(GenerationFinishReason::LENGTH); // we also need to drop add ongoing / forked sequences from scheduler sampler_output.m_dropped_sequences.push_back(sequence_id); @@ -548,7 +548,7 @@ std::vector Sampler::_try_finish_generation(SequenceGroup::Ptr & sequen std::vector dropped_seq_ids; for (auto& running_sequence : sequence_group->get_running_sequences()) { const auto generated_len = running_sequence->get_generated_len(); - if (sampling_params.max_new_tokens <= generated_len || + if (sampling_params.max_new_tokens <= generated_len || is_stop_token_id_hit(running_sequence->get_generated_ids().back(), sampling_params.stop_token_ids) && !sampling_params.ignore_eos) { // stop sequence by max_new_tokens or stop token (eos included) running_sequence->set_status(SequenceStatus::FINISHED); @@ -679,7 +679,7 @@ bool Sampler::validate_candidate( float p_i = std::exp(*it_log_prob), q_i = 
std::exp(sampled_token.m_log_prob), probability_ratio = p_i / q_i; - + auto dist = std::uniform_int_distribution<>(0, 100); // equivalent to multinomial with number of trials == 1 float r_i = dist(rng_engine); r_i /= 100; @@ -722,7 +722,7 @@ float get_p_prime(Sequence::Ptr& running_sequence, if (cumulative_prob == 0.f) { return 1.f; } - + float p_n = std::exp(sampled_token.m_log_prob), q_n = std::exp(*it_log_prob), p_prime = std::max(0.f, (p_n - q_n)) / std::log(cumulative_prob); @@ -804,7 +804,7 @@ SamplerOutput Sampler::sample(const std::vector & sequence_g stop_sample_tokens(running_sequence, token_offset, max_num_sampled_token, max_removed_tokens_per_request); break; } - + // do sampling only for token validation/generation. // continue in case of extending draft model sequences by main model generated tokens which // should be taken to KV cache without validation @@ -890,7 +890,7 @@ SamplerOutput Sampler::sample(const std::vector & sequence_g m_beam_search_info.at(request_id).finalize(sampler_output); } } - // Notify handle after sampling is done. + // Notify handle after sampling is done. // For non-streaming this is effective only when the generation is finished. OPENVINO_ASSERT(num_tokens_to_process >= max_removed_tokens_per_request); sequence_group->notify_handle(); @@ -929,7 +929,7 @@ void Sampler::create_logit_processor(uint64_t request_id, const GenerationConfig m_logit_processors.insert({request_id, LogitProcessor(sampling_params, prompt)}); } -void Sampler::clear_request_info(uint64_t request_id) { +void Sampler::clear_request_info(uint64_t request_id) { m_beam_search_info.erase(request_id); m_logit_processors.erase(request_id); m_stop_strings.erase(request_id); diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index 7796f93d1e..ca8937cb60 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -101,7 +101,7 @@ class Sampler::GroupBeamSearcher { return m_sequence->get_generated_len(); } }; - + static bool greater(const Beam& left, const Beam& right) { return left.m_score > right.m_score; } diff --git a/src/cpp/src/scheduler.hpp b/src/cpp/src/scheduler.hpp index 0057b19329..85ab4b778f 100644 --- a/src/cpp/src/scheduler.hpp +++ b/src/cpp/src/scheduler.hpp @@ -50,7 +50,7 @@ class Scheduler { m_can_use_partial_preemption(can_use_partial_preemption), m_config(config), m_block_manager(m_config.num_kv_blocks, m_config.enable_prefix_caching, block_size, num_layers) { - + OPENVINO_ASSERT(num_layers != 0, "num_layers must be non-zero"); } diff --git a/src/cpp/src/sequence_group.cpp b/src/cpp/src/sequence_group.cpp index 854fc85777..5b8f94e62a 100644 --- a/src/cpp/src/sequence_group.cpp +++ b/src/cpp/src/sequence_group.cpp @@ -20,7 +20,7 @@ size_t Sequence::_make_hash(size_t content_length) { // hash of current block depends on prefix hashes std::vector content; size_t prefix_hashes_needed_count = block_start_idx / block_size; - OPENVINO_ASSERT(prefix_hashes_needed_count <= m_prefix_hashes.size()); + OPENVINO_ASSERT(prefix_hashes_needed_count <= m_prefix_hashes.size()); content.insert(content.end(), m_prefix_hashes.begin(), m_prefix_hashes.begin() + prefix_hashes_needed_count); // get tokens corresponding to current block @@ -38,7 +38,7 @@ size_t Sequence::_make_hash(size_t content_length) { return std::hash{}(std::string_view(data, size)); } -// Each KV block can be uniquely identified by +// Each KV block can be uniquely identified by // the tokens within the block and the tokens in the prefix before the block. 
// hash(prefix tokens + block tokens) <--> KV Block size_t Sequence::get_hash(size_t content_length) { @@ -56,8 +56,8 @@ size_t Sequence::get_hash(size_t content_length) { if (content_len % block_size == 0) { return m_prefix_hashes[content_len / block_size - 1]; } - + return _make_hash(content_len); } } // namespace genai -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index b6bcc83530..33e73ebfb6 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -210,7 +210,7 @@ class SequenceGroup : public std::enable_shared_from_this { bool m_has_echoed = false; uint64_t m_next_sequence_id = 0; - + // amount of processed tokens, e.g. prompt can be processed using multiple consequence inferences // so, we need to track which part of the prompt we have already processed size_t m_num_processed_tokens = 0; @@ -469,7 +469,7 @@ class SequenceGroup : public std::enable_shared_from_this { size_t get_num_tokens_to_validate() { return m_num_validation_tokens; } - + void set_stream_window_size(size_t k) { m_stream_window_size = k; } @@ -659,7 +659,7 @@ class SequenceGroup : public std::enable_shared_from_this { } } - + // Special notification path for max_new_tokens == 0 where we don't expect to return any new tokens, but only process prompt void notify_handle_echo_only() { // This method is called after scheduling and before sampling, @@ -682,7 +682,7 @@ class SequenceGroup : public std::enable_shared_from_this { GenerationOutputs outputs; outputs.emplace(0, output); m_generation_stream->push(std::move(outputs)); - } + } }; inline std::shared_ptr Sequence::get_sequence_group_ptr() const { diff --git a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp index a1d0e85f17..331094b0bb 100644 --- a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp +++ b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp @@ -81,7 +81,7 @@ get_prefix_len( const size_t candidate_sequence_gen_len = candidate_token_ids.size(), running_sequence_gen_len = running_sequence->get_generated_len(); - + // to find the len of prefix size_t sequence_prefix_len = std::min(candidate_sequence_gen_len, running_sequence_gen_len); for (size_t i = 0; i < sequence_prefix_len; ++i) { @@ -101,7 +101,7 @@ size_t remove_tokens_from_sequence(Sequence::Ptr& sequence, size_t min_generated_tokens, LogitProcessor& logit_proccessor) { - const auto generated_token_ids = sequence->get_generated_ids(); + const auto generated_token_ids = sequence->get_generated_ids(); const auto sequence_generated_len = generated_token_ids.size(); OPENVINO_ASSERT(sequence_generated_len >= min_generated_tokens); @@ -187,7 +187,7 @@ init_request( return min_candidate_len; } -UpdateRequestResult +UpdateRequestResult ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::init_request_by_candidate( uint64_t request_id, const GeneratedSequences& candidates) { @@ -195,7 +195,7 @@ ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::init_r if (request->get_request_id() != request_id) { continue; } - + UpdateRequestResult result; m_sampler->create_logit_processor(request_id, request->get_sampling_parameters(), request->get_prompt_ids()); auto& logit_processor = m_sampler->get_logit_processor(request_id); @@ -322,4 +322,4 @@ void 
ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::m } } } -} \ No newline at end of file +} diff --git a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.hpp b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.hpp index 1c98969d36..fdbf4690cd 100644 --- a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.hpp +++ b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.hpp @@ -36,4 +36,4 @@ class ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl : void finish_request(SequenceGroup::Ptr request); void _pull_awaiting_requests() override {}; }; -} \ No newline at end of file +} diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp index 526c5df2d4..555a732873 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp @@ -15,7 +15,7 @@ bool are_tokenizers_equal(Tokenizer& lhs, Tokenizer& rhs) { std::string test_string = "Could you please tell me something about OpenVINO.GenAI?"; ov::Tensor encoded_string_lhs = lhs.encode(test_string).input_ids, encoded_string_rhs = rhs.encode(test_string).input_ids; - + ov::Shape shape_lhs = encoded_string_lhs.get_shape(), shape_rhs = encoded_string_rhs.get_shape(); @@ -23,7 +23,7 @@ bool are_tokenizers_equal(Tokenizer& lhs, Tokenizer& rhs) { lhs.get_bos_token_id() == rhs.get_bos_token_id() && lhs.get_pad_token_id() == rhs.get_pad_token_id(); } -ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(const ov::genai::ModelDesc& main_model_desc, +ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(const ov::genai::ModelDesc& main_model_desc, const ov::genai::ModelDesc& draft_model_desc) { auto main_model = main_model_desc.model; auto draft_model = draft_model_desc.model; @@ -75,7 +75,7 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(con // todo: remove this condition after support of CVS-154103 OPENVINO_ASSERT(are_tokenizers_equal(main_model_tokenizer, draft_model_tokenizer), "Tokenizers for draft and main models are different!"); - + m_tokenizer = main_model_tokenizer; // to create `main_pipeline` with enabled validation_mode and `draft_pipeline` with disabled validation mode diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp index 2f8067cbab..96a17edc5f 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp @@ -30,7 +30,7 @@ struct ModelDesc { properties(properties), scheduler_config(scheduler_config), generation_config(generation_config) {} - + ModelDesc() = default; }; @@ -41,7 +41,7 @@ class ContinuousBatchingPipeline::SpeculativeDecodingImpl : public ContinuousBat // Mutex protecting access to m_draft_generations, so add_request and step methods can be called from different threads std::mutex m_draft_generations_mutex; std::map m_draft_generations; - + public: SpeculativeDecodingImpl(const ov::genai::ModelDesc& main_model_desc, const ov::genai::ModelDesc& draft_model_desc); @@ -64,4 +64,4 @@ class ContinuousBatchingPipeline::SpeculativeDecodingImpl : public ContinuousBat SpeculativeDecodingMetrics get_speculative_decoding_metrics(); }; -} \ No newline at end of 
file +} diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_metrics.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_metrics.cpp index 4e5602482a..75b1493407 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_metrics.cpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_metrics.cpp @@ -154,4 +154,4 @@ void SpeculativeDecodingMetrics::clean_up() { total_duration = 0; } -} \ No newline at end of file +} diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_metrics.hpp b/src/cpp/src/speculative_decoding/speculative_decoding_metrics.hpp index 0d9173b99f..33204247ff 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_metrics.hpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_metrics.hpp @@ -43,4 +43,4 @@ class SpeculativeDecodingMetrics { void clean_up(); }; -} \ No newline at end of file +} diff --git a/src/cpp/src/text_callback_streamer.cpp b/src/cpp/src/text_callback_streamer.cpp index 5938b55f6c..314a7ffa4d 100644 --- a/src/cpp/src/text_callback_streamer.cpp +++ b/src/cpp/src/text_callback_streamer.cpp @@ -52,4 +52,4 @@ void TextCallbackStreamer::end() { ov::genai::StreamerBase::~StreamerBase() = default; } // namespace genai -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/cpp/src/text_callback_streamer.hpp b/src/cpp/src/text_callback_streamer.hpp index 6f0872ad1b..a03b0deccb 100644 --- a/src/cpp/src/text_callback_streamer.hpp +++ b/src/cpp/src/text_callback_streamer.hpp @@ -25,4 +25,4 @@ class TextCallbackStreamer: public StreamerBase { }; } // namespace genai -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index 9261aa7a4a..899a90bcec 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -203,7 +203,7 @@ ProcessorConfig from_any_map( } /** - * scheduler_config is a separate config for continuous batching pipeline. + * scheduler_config is a separate config for continuous batching pipeline. * This routine splits scheduler_config from plugin_config. 
*/ std::pair split_scheduler_config(const ov::AnyMap& properties) { diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index ad0e1a05d4..0966da280d 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -16,7 +16,7 @@ namespace utils { // Variable template that checks if a type has begin() and end() member functions template constexpr bool is_container = false; - + template constexpr bool is_container().begin()), diff --git a/src/cpp/src/utils/paged_attention_transformations.cpp b/src/cpp/src/utils/paged_attention_transformations.cpp index f564be8f19..0ed4b29b67 100644 --- a/src/cpp/src/utils/paged_attention_transformations.cpp +++ b/src/cpp/src/utils/paged_attention_transformations.cpp @@ -80,4 +80,4 @@ void apply_paged_attention_transformations(std::shared_ptr model, Dev } // namespace utils } // namespace genai -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/cpp/src/visual_language/clip.cpp b/src/cpp/src/visual_language/clip.cpp index fba8b10d4f..c9e3e51cb5 100644 --- a/src/cpp/src/visual_language/clip.cpp +++ b/src/cpp/src/visual_language/clip.cpp @@ -258,7 +258,7 @@ clip_image_f32 clip_image_preprocess(clip_ctx& ctx, const clip_image_u8& img) { } std::vector get_image_patches( - const clip_image_u8& image, + const clip_image_u8& image, const std::vector>& image_grid_pinpoints, const std::pair& size, int patch_size @@ -274,7 +274,7 @@ std::vector get_image_patches( int base_patch_height = size.second; clip_image_u8 base_patch; bicubic_resize(image, base_patch, base_patch_width, base_patch_height); - + patches.push_back(base_patch); // Select best resolution for patching diff --git a/src/cpp/src/visual_language/clip.hpp b/src/cpp/src/visual_language/clip.hpp index 55cf03a49f..7d019e423f 100644 --- a/src/cpp/src/visual_language/clip.hpp +++ b/src/cpp/src/visual_language/clip.hpp @@ -36,7 +36,7 @@ void bicubic_resize(const clip_image_u8& img, clip_image_u8& dst, int target_wid clip_image_f32 clip_image_preprocess(struct clip_ctx& ctx, const clip_image_u8& img); std::vector get_image_patches( - const clip_image_u8& image, + const clip_image_u8& image, const std::vector>& image_grid_pinpoints, const std::pair& size, int patch_size diff --git a/src/cpp/src/visual_language/embedding_model.cpp b/src/cpp/src/visual_language/embedding_model.cpp index a2a9750c33..3150b0ce15 100644 --- a/src/cpp/src/visual_language/embedding_model.cpp +++ b/src/cpp/src/visual_language/embedding_model.cpp @@ -64,4 +64,4 @@ void EmbeddingsModel::merge_postprocess(std::shared_ptr model, float } } // namespace genai -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/cpp/src/visual_language/embedding_model.hpp b/src/cpp/src/visual_language/embedding_model.hpp index 5e85e03026..5d675405b0 100644 --- a/src/cpp/src/visual_language/embedding_model.hpp +++ b/src/cpp/src/visual_language/embedding_model.hpp @@ -46,4 +46,4 @@ class EmbeddingsModel { }; } // namespace genai -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index e53be4e1cd..7cdfae0172 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -45,7 +45,7 @@ class InputsEmbedder::IInputsEmbedder { // Tail of previous output for LM in chat mode is missing in KV cache. 
std::optional m_last_disappeared_token = std::nullopt; // If sequence contains some symbols, which could be ambiguous encoded by tokenizer, we need to trim kv cache - // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add best answer to history + // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add best answer to history // so, let's keep info about amount of tokens to trim from kv cache and amount of tokens to keep in history ov::genai::utils::HistoryRemoveManager m_kv_history_manager = {0, 0}; @@ -77,7 +77,7 @@ class InputsEmbedder::IInputsEmbedder { } m_last_disappeared_token = last_disappeared_token; - + std::copy(encoded_result.begin(), encoded_result.end(), std::back_inserter(m_tokenized_history)); } @@ -123,7 +123,7 @@ class InputsEmbedder::IInputsEmbedder { m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config), m_embedding(model_dir, m_vlm_config.scale_emb, device, device_config), m_tokenizer{model_dir, device_config} { } - + IInputsEmbedder( const VLMConfig& vlm_config, const ModelsMap& models_map, @@ -616,7 +616,7 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { std::string image_token = m_vlm_config.im_start; // Adapted from llava-1.5-7b-hf chat_template.json std::string chat_template_fallback = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' }}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"; - + std::vector single_images = to_single_image_tensors(images); std::string formatted_prompt; @@ -735,7 +735,7 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { std::string formatted_prompt; std::vector image_embeds; image_embeds.reserve(single_images.size()); - + ov::Tensor image_newline; for (const auto& image : single_images) { @@ -1056,20 +1056,20 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { std::string image_start_token = m_vlm_config.image_start_token; std::string image_context_token = m_vlm_config.image_context_token; std::string image_end_token = m_vlm_config.image_end_token; - + std::vector single_images = to_single_image_tensors(images); std::string formatted_prompt; std::vector image_embeds; image_embeds.reserve(single_images.size()); - + for (const auto& image : single_images) { EncodedImage encoded_image = m_vision_encoder.encode(image); ov::Tensor single_image_embeds = encoded_image.resized_source; const size_t num_patches = single_image_embeds.get_shape().at(0); const size_t num_image_tokens = single_image_embeds.get_shape().at(1); - + formatted_prompt += image_start_token; for (int i = 0; i < num_patches * num_image_tokens; ++i) { formatted_prompt += image_context_token; @@ -1140,7 +1140,7 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { std::copy_n(image_embeds_data + image_context_token_idx * embed_dim, embed_dim, merged_embeds_data + offset); - + ++image_context_token_idx; if (image_context_token_idx == num_all_image_tokens) { diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp index 1d72b742ab..73357e6500 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -55,7 +55,7 @@ class InputsEmbedder { // adds currently generated text to chat history void 
update_chat_history(const std::string& decoded_results); - // finishes chat and clears a chat history + // finishes chat and clears a chat history void finish_chat(); private: class IInputsEmbedder; diff --git a/src/cpp/src/visual_language/perf_metrics.cpp b/src/cpp/src/visual_language/perf_metrics.cpp index a3afb83641..f83e5e2ec6 100644 --- a/src/cpp/src/visual_language/perf_metrics.cpp +++ b/src/cpp/src/visual_language/perf_metrics.cpp @@ -33,4 +33,4 @@ VLMPerfMetrics VLMPerfMetrics::operator+(const VLMPerfMetrics& right) const { right_prepare_embeddings_durations.end()); return result; } -} \ No newline at end of file +} diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index ebc5c3b5dd..708beeb10b 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -21,7 +21,7 @@ using namespace ov::genai; namespace { - + template struct overloaded : Ts... {using Ts::operator()...;}; template overloaded(Ts...) -> overloaded; @@ -127,7 +127,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { }, m_generation_config{generation_config}, m_is_chat_conversation{false} { - + m_inputs_embedder = std::make_shared( m_vlm_config, models_map, tokenizer, config_dir_path, device, properties); @@ -249,7 +249,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { res_raw_counters.generate_durations.emplace_back(PerfMetrics::get_microsec(generate_end_time - generate_start_time)); res_raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_microsec(decode_end_time - decode_start_time)); res_raw_counters.tokenization_durations.insert(res_raw_counters.tokenization_durations.end(), raw_counters.tokenization_durations.begin(), raw_counters.tokenization_durations.end()); - + // VLM specific perf metrics decoded.perf_metrics.vlm_raw_metrics.prepare_embeddings_durations.emplace_back(PerfMetrics::get_microsec(end_get_inputs_embeds - start_get_inputs_embeds)); diff --git a/src/cpp/src/visual_language/processor_config.cpp b/src/cpp/src/visual_language/processor_config.cpp index fc524fce9c..db439d3ace 100644 --- a/src/cpp/src/visual_language/processor_config.cpp +++ b/src/cpp/src/visual_language/processor_config.cpp @@ -20,7 +20,7 @@ ov::genai::ProcessorConfig::ProcessorConfig(const std::filesystem::path& json_pa if (parsed.contains("norm_std")) { norm_std = parsed.at("norm_std").get>(); } - + // Setting llava config params if (parsed.contains("image_mean")) { image_mean = parsed.at("image_mean").get>(); diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp index 9f8f9b0498..70b3e7c0f1 100644 --- a/src/cpp/src/visual_language/vision_encoder.cpp +++ b/src/cpp/src/visual_language/vision_encoder.cpp @@ -331,7 +331,7 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o ov::Tensor pixel_values{ov::element::f32, {n_images, channels, patch_size, max_size / patch_size}}; size_t d3_all_pixel = pixel_values.get_shape().at(3); float* pixel_value_data = pixel_values.data(); - + //image chw to 1*c*kernel*hw/kernel and padding zero clip_image_f32& resized_preprocessed = preprocessed.at(0).at(0); size_t img_h = resized_preprocessed.ny; @@ -346,7 +346,7 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o for (size_t k_idx = 0; k_idx < patch_size; k_idx++) { std::copy(clip_value_data, clip_value_data + d3_clip_pixel, pixel_value_data); clip_value_data += d3_clip_pixel; - pixel_value_data += d3_all_pixel; + pixel_value_data 
+= d3_all_pixel; } } @@ -359,7 +359,7 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o img_w = elem.nx; ov::Tensor clip_img{ov::element::f32, {1, channels, img_h, img_w}, elem.buf.data()}; ov::Tensor clip_pixel_values = preprocess_for_encoder(clip_img, patch_size); - + d3_clip_pixel = clip_pixel_values.get_shape().at(3); clip_value_data = clip_pixel_values.data(); pixel_value_data = pixel_values.data() + batch_pixel * channels * patch_size * d3_all_pixel; @@ -473,7 +473,7 @@ clip_image_f32 preprocess_clip_image_llava(const clip_image_u8& image, const Pro for (int y = 0; y < crop_height; ++y) { for (int x = 0; x < crop_width; ++x) { for (int c = 0; c < 3; ++c) { - cropped_image.buf[(y * crop_width + x) * 3 + c] = + cropped_image.buf[(y * crop_width + x) * 3 + c] = resized_image.buf[((start_y + y) * resized_image.nx + (start_x + x)) * 3 + c]; } } diff --git a/src/cpp/src/visual_language/vlm_config.hpp b/src/cpp/src/visual_language/vlm_config.hpp index c126d5495e..b717c5f1df 100644 --- a/src/cpp/src/visual_language/vlm_config.hpp +++ b/src/cpp/src/visual_language/vlm_config.hpp @@ -54,7 +54,7 @@ class VLMConfig { std::string image_context_token = ""; /// @brief A string token denoting end of image embeddings for InternVL2 model. std::string image_end_token = ""; - + /// @brief Default constructor. VLMConfig() = default; /// @brief Construct VLMConfig from values in json_path. diff --git a/src/cpp/src/visual_language/vlm_model_type.hpp b/src/cpp/src/visual_language/vlm_model_type.hpp index e4b5e823b6..5d5ef3a83c 100644 --- a/src/cpp/src/visual_language/vlm_model_type.hpp +++ b/src/cpp/src/visual_language/vlm_model_type.hpp @@ -32,4 +32,4 @@ inline VLMModelType to_vlm_model_type(const std::string& value) { } OPENVINO_THROW("Unsupported '", value, "' VLM model type"); } -} \ No newline at end of file +} diff --git a/src/cpp/src/whisper_pipeline_static.cpp b/src/cpp/src/whisper_pipeline_static.cpp index 01fe882187..c1c6614aca 100644 --- a/src/cpp/src/whisper_pipeline_static.cpp +++ b/src/cpp/src/whisper_pipeline_static.cpp @@ -212,7 +212,7 @@ void zero_past_key_values(ov::InferRequest& request) { void prepare_decoder_with_past(ov::InferRequest& decoder_with_past, ov::InferRequest& decoder) { // NB: Prepare attetion mask to be in a format [0, 0, 0, 1, 1, 1, 1, ..., 0, 1] - // Mask should be inverted for decoder_with_past + // Mask should be inverted for decoder_with_past auto attention_mask = decoder_with_past.get_tensor("attention_mask"); auto* attention_mask_ptr = attention_mask.data(); std::fill(attention_mask_ptr, attention_mask_ptr + 3u, 0); diff --git a/src/docs/DEBUG_LOG.md b/src/docs/DEBUG_LOG.md index 5ed3f35d17..146072a6c4 100644 --- a/src/docs/DEBUG_LOG.md +++ b/src/docs/DEBUG_LOG.md @@ -40,4 +40,4 @@ the properties of the compiled model are printed as follows: AFFINITY: CORE EXECUTION_DEVICES: CPU: Intel(R) Xeon(R) Platinum 8468 -``` \ No newline at end of file +``` diff --git a/src/python/openvino_genai/__init__.py b/src/python/openvino_genai/__init__.py index 0ad7ba3f12..ff14ecc331 100644 --- a/src/python/openvino_genai/__init__.py +++ b/src/python/openvino_genai/__init__.py @@ -28,7 +28,7 @@ # LLM pipeline from .py_openvino_genai import ( - LLMPipeline, + LLMPipeline, draft_model, ) diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index 5adde32db4..505d4fb48c 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -28,15 
+28,15 @@ class AdapterConfig: class Mode: """ Members: - + MODE_AUTO - + MODE_DYNAMIC - + MODE_STATIC_RANK - + MODE_STATIC - + MODE_FUSE """ MODE_AUTO: typing.ClassVar[AdapterConfig.Mode] # value = @@ -107,11 +107,11 @@ class AggregationMode: Represents the mode of per-token score aggregation when determining least important tokens for eviction from cache :param AggregationMode.SUM: In this mode the importance scores of each token will be summed after each step of generation :param AggregationMode.NORM_SUM: Same as SUM, but the importance scores are additionally divided by the lifetime (in tokens generated) of a given token in cache - + Members: - + SUM - + NORM_SUM """ NORM_SUM: typing.ClassVar[AggregationMode] # value = @@ -312,17 +312,17 @@ class CLIPTextModelWithProjection: ... class CacheEvictionConfig: """ - + Configuration struct for the cache eviction algorithm. :param start_size: Number of tokens in the *beginning* of KV cache that should be retained in the KV cache for this sequence during generation. Must be non-zero and a multiple of the KV cache block size for this pipeline. :type start_size: int - + :param recent_size: Number of tokens in the *end* of KV cache that should be retained in the KV cache for this sequence during generation. Must be non-zero and a multiple of the KV cache block size for this pipeline. :type recent_size: int - + :param max_cache_size: Maximum number of tokens that should be kept in the KV cache. The evictable block area will be located between the "start" and "recent" blocks and its size will be calculated as (`max_cache_size` - `start_size` - `recent_size`). Must be non-zero, larger than (`start_size` + `recent_size`), and a multiple of the KV cache block size for this pipeline. Note that since only the completely filled blocks are evicted, the actual maximum per-sequence KV cache size in tokens may be up to (`max_cache_size` + `SchedulerConfig.block_size - 1`). :type max_cache_size: int - + :param aggregation_mode: The mode used to compute the importance of tokens for eviction :type aggregation_mode: openvino_genai.AggregationMode """ @@ -339,7 +339,7 @@ class CacheEvictionConfig: ... class ChunkStreamerBase: """ - + Base class for chunk streamers. In order to use inherit from from this class. """ def __init__(self) -> None: @@ -402,11 +402,11 @@ class CppStdGenerator(Generator): ... class DecodedResults: """ - + Structure to store resulting batched text outputs and scores for each batch. The first num_return_sequences elements correspond to the first batch element. - - Parameters: + + Parameters: texts: vector of resulting sequences. scores: scores for each sequence. metrics: performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics. @@ -426,10 +426,10 @@ class DecodedResults: ... class EncodedGenerationResult: """ - + GenerationResult stores resulting batched tokens and scores. - - Parameters: + + Parameters: request_id: obsolete when handle API is approved as handle will connect results with prompts. generation_ids: in a generic case we have multiple generation results per initial prompt depending on sampling parameters (e.g. beam search or parallel sampling). @@ -440,7 +440,7 @@ class EncodedGenerationResult: IGNORED = 2 - Status set when generation run into out-of-memory condition and could not be continued. DROPPED_BY_PIPELINE = 3 - Currently not used, TODO: implement abort functionality. DROPPED_BY_HANDLE = 4 - Status set when generation handle is dropped. 
- + """ m_generation_ids: list[list[int]] m_scores: list[float] @@ -451,14 +451,14 @@ class EncodedGenerationResult: ... class EncodedResults: """ - + Structure to store resulting batched tokens and scores for each batch sequence. The first num_return_sequences elements correspond to the first batch element. In the case if results decoded with beam search and random sampling scores contain sum of logarithmic probabilities for each token in the sequence. In the case of greedy decoding scores are filled with zeros. - - Parameters: + + Parameters: tokens: sequence of resulting tokens. scores: sum of logarithmic probabilities of all tokens in the sequence. metrics: performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics. @@ -521,11 +521,11 @@ class FluxTransformer2DModel: ... class GenerationConfig: """ - + Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will be used while greedy and beam search parameters will not affect decoding at all. - + Parameters: max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. @@ -539,11 +539,11 @@ class GenerationConfig: echo: if set to true, the model will echo the prompt in the output. logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned. Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0). - + repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. presence_penalty: reduces absolute log prob if the token was generated at least once. frequency_penalty: reduces absolute log prob as many times as the token was generated. - + Beam search specific parameters: num_beams: number of beams for beam search. 1 disables beam search. num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. @@ -558,7 +558,7 @@ class GenerationConfig: "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). - + Random sampling parameters: temperature: the value used to modulate token probabilities for random sampling. top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. @@ -622,11 +622,11 @@ class GenerationConfig: class GenerationFinishReason: """ Members: - + NONE - + STOP - + LENGTH """ LENGTH: typing.ClassVar[GenerationFinishReason] # value = @@ -679,10 +679,10 @@ class GenerationOutput: score: float class GenerationResult: """ - + GenerationResult stores resulting batched tokens and scores. - - Parameters: + + Parameters: request_id: obsolete when handle API is approved as handle will connect results with prompts. generation_ids: in a generic case we have multiple generation results per initial prompt depending on sampling parameters (e.g. beam search or parallel sampling). 
@@ -693,7 +693,7 @@ class GenerationResult: IGNORED = 2 - Status set when generation run into out-of-memory condition and could not be continued. DROPPED_BY_PIPELINE = 3 - Currently not used, TODO: implement abort functionality. DROPPED_BY_HANDLE = 4 - Status set when generation handle is dropped. - + """ m_generation_ids: list[str] m_scores: list[float] @@ -710,15 +710,15 @@ class GenerationResult: class GenerationStatus: """ Members: - + RUNNING - + FINISHED - + IGNORED - + DROPPED_BY_PIPELINE - + DROPPED_BY_HANDLE """ DROPPED_BY_HANDLE: typing.ClassVar[GenerationStatus] # value = @@ -800,12 +800,12 @@ class Image2ImagePipeline: def generate(self, prompt: str, image: openvino._pyopenvino.Tensor, **kwargs) -> openvino._pyopenvino.Tensor: """ Generates images for text-to-image models. - + :param prompt: input prompt :type prompt: str - + :param kwargs: arbitrary keyword arguments with keys corresponding to generate params. - + Expected parameters list: prompt_2: str - second prompt, prompt_3: str - third prompt, @@ -823,7 +823,7 @@ class Image2ImagePipeline: adapters: LoRA adapters, strength: strength for image to image generation. 1.0f means initial image is fully noised, max_sequence_length: int - length of t5_encoder_model input - + :return: ov.Tensor with resulting images :rtype: ov.Tensor """ @@ -901,12 +901,12 @@ class InpaintingPipeline: def generate(self, prompt: str, image: openvino._pyopenvino.Tensor, mask_image: openvino._pyopenvino.Tensor, **kwargs) -> openvino._pyopenvino.Tensor: """ Generates images for text-to-image models. - + :param prompt: input prompt :type prompt: str - + :param kwargs: arbitrary keyword arguments with keys corresponding to generate params. - + Expected parameters list: prompt_2: str - second prompt, prompt_3: str - third prompt, @@ -924,7 +924,7 @@ class InpaintingPipeline: adapters: LoRA adapters, strength: strength for image to image generation. 1.0f means initial image is fully noised, max_sequence_length: int - length of t5_encoder_model input - + :return: ov.Tensor with resulting images :rtype: ov.Tensor """ @@ -943,27 +943,27 @@ class LLMPipeline: def __call__(self, inputs: openvino._pyopenvino.Tensor | TokenizedInputs | str | list[str], generation_config: GenerationConfig | None = None, streamer: typing.Callable[[str], bool] | StreamerBase | None = None, **kwargs) -> EncodedResults | DecodedResults: """ Generates sequences or tokens for LLMs. If input is a string or list of strings then resulting sequences will be already detokenized. - + :param inputs: inputs in the form of string, list of strings or tokenized input_ids :type inputs: str, List[str], ov.genai.TokenizedInputs, or ov.Tensor - + :param generation_config: generation_config :type generation_config: GenerationConfig or a Dict - + :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped :type : Callable[[str], bool], ov.genai.StreamerBase - + :param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields. :type : Dict - + :return: return results in encoded, or decoded form depending on inputs type :rtype: DecodedResults, EncodedResults, str - - + + Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will be used while greedy and beam search parameters will not affect decoding at all. 
- + Parameters: max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. @@ -977,11 +977,11 @@ class LLMPipeline: echo: if set to true, the model will echo the prompt in the output. logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned. Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0). - + repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. presence_penalty: reduces absolute log prob if the token was generated at least once. frequency_penalty: reduces absolute log prob as many times as the token was generated. - + Beam search specific parameters: num_beams: number of beams for beam search. 1 disables beam search. num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. @@ -996,7 +996,7 @@ class LLMPipeline: "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). - + Random sampling parameters: temperature: the value used to modulate token probabilities for random sampling. top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. @@ -1028,27 +1028,27 @@ class LLMPipeline: def generate(self, inputs: openvino._pyopenvino.Tensor | TokenizedInputs | str | list[str], generation_config: GenerationConfig | None = None, streamer: typing.Callable[[str], bool] | StreamerBase | None = None, **kwargs) -> EncodedResults | DecodedResults: """ Generates sequences or tokens for LLMs. If input is a string or list of strings then resulting sequences will be already detokenized. - + :param inputs: inputs in the form of string, list of strings or tokenized input_ids :type inputs: str, List[str], ov.genai.TokenizedInputs, or ov.Tensor - + :param generation_config: generation_config :type generation_config: GenerationConfig or a Dict - + :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped :type : Callable[[str], bool], ov.genai.StreamerBase - + :param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields. :type : Dict - + :return: return results in encoded, or decoded form depending on inputs type :rtype: DecodedResults, EncodedResults, str - - + + Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will be used while greedy and beam search parameters will not affect decoding at all. - + Parameters: max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. @@ -1062,11 +1062,11 @@ class LLMPipeline: echo: if set to true, the model will echo the prompt in the output. 
logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned. Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0). - + repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. presence_penalty: reduces absolute log prob if the token was generated at least once. frequency_penalty: reduces absolute log prob as many times as the token was generated. - + Beam search specific parameters: num_beams: number of beams for beam search. 1 disables beam search. num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. @@ -1081,7 +1081,7 @@ class LLMPipeline: "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). - + Random sampling parameters: temperature: the value used to modulate token probabilities for random sampling. top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. @@ -1110,9 +1110,9 @@ class MeanStdPair: ... class PerfMetrics: """ - + Holds performance metrics for each generate call. - + PerfMetrics holds fields with mean and standard deviations for the following metrics: - Time To the First Token (TTFT), ms - Time per Output Token (TPOT), ms/token @@ -1120,42 +1120,42 @@ class PerfMetrics: - Tokenization duration, ms - Detokenization duration, ms - Throughput, tokens/s - + Additional fields include: - Load time, ms - Number of generated tokens - Number of tokens in the input prompt - + Preferable way to access values is via get functions. Getters calculate mean and std values from raw_metrics and return pairs. If mean and std were already calculated, getters return cached values. - + :param get_load_time: Returns the load time in milliseconds. :type get_load_time: float - + :param get_num_generated_tokens: Returns the number of generated tokens. :type get_num_generated_tokens: int - + :param get_num_input_tokens: Returns the number of tokens in the input prompt. :type get_num_input_tokens: int - + :param get_ttft: Returns the mean and standard deviation of TTFT in milliseconds. :type get_ttft: MeanStdPair - + :param get_tpot: Returns the mean and standard deviation of TPOT in milliseconds. :type get_tpot: MeanStdPair - + :param get_throughput: Returns the mean and standard deviation of throughput in tokens per second. :type get_throughput: MeanStdPair - + :param get_generate_duration: Returns the mean and standard deviation of generate durations in milliseconds. :type get_generate_duration: MeanStdPair - + :param get_tokenization_duration: Returns the mean and standard deviation of tokenization durations in milliseconds. :type get_tokenization_duration: MeanStdPair - + :param get_detokenization_duration: Returns the mean and standard deviation of detokenization durations in milliseconds. :type get_detokenization_duration: MeanStdPair - + :param raw_metrics: A structure of RawPerfMetrics type that holds raw metrics. :type raw_metrics: RawPerfMetrics """ @@ -1192,23 +1192,23 @@ class PerfMetrics: ... 
class PipelineMetrics: """ - + Contains general pipeline metrics, either aggregated throughout the lifetime of the generation pipeline or measured at the previous generation step. - + :param requests: Number of requests to be processed by the pipeline. :type requests: int - + :param scheduled_requests: Number of requests that were scheduled for processing at the previous step of the pipeline. :type scheduled_requests: int - + :param cache_usage: Percentage of KV cache usage in the last generation step. :type cache_usage: float - + :param max_cache_usage: Max KV cache usage during the lifetime of the pipeline in % :type max_cache_usage: float - - + + :param avg_cache_usage: Running average of the KV cache usage (in %) during the lifetime of the pipeline, with max window size of 1000 steps :type avg_cache_usage: float """ @@ -1231,33 +1231,33 @@ class PipelineMetrics: ... class RawPerfMetrics: """ - + Structure with raw performance metrics for each generation before any statistics are calculated. - + :param generate_durations: Durations for each generate call in milliseconds. :type generate_durations: List[float] - + :param tokenization_durations: Durations for the tokenization process in milliseconds. :type tokenization_durations: List[float] - + :param detokenization_durations: Durations for the detokenization process in milliseconds. :type detokenization_durations: List[float] - + :param m_times_to_first_token: Times to the first token for each call in milliseconds. :type m_times_to_first_token: List[float] - + :param m_new_token_times: Timestamps of generation every token or batch of tokens in milliseconds. :type m_new_token_times: List[double] - + :param token_infer_durations : Inference time for each token in milliseconds. :type batch_sizes: List[float] - + :param m_batch_sizes: Batch sizes for each generate call. :type m_batch_sizes: List[int] - + :param m_durations: Total durations for each generate call in milliseconds. :type m_durations: List[float] - + :param inference_durations : Total inference duration for each generate call in milliseconds. :type batch_sizes: List[float] """ @@ -1346,21 +1346,21 @@ class Scheduler: class Type: """ Members: - + AUTO - + LCM - + LMS_DISCRETE - + DDIM - + EULER_DISCRETE - + FLOW_MATCH_EULER_DISCRETE - + PNDM - + EULER_ANCESTRAL_DISCRETE """ AUTO: typing.ClassVar[Scheduler.Type] # value = @@ -1403,17 +1403,17 @@ class Scheduler: ... class SchedulerConfig: """ - + SchedulerConfig to construct ContinuousBatchingPipeline - - Parameters: + + Parameters: max_num_batched_tokens: a maximum number of tokens to batch (in contrast to max_batch_size which combines independent sequences, we consider total amount of tokens in a batch). num_kv_blocks: total number of KV blocks available to scheduler logic. cache_size: total size of KV cache in GB. block_size: block size for KV cache. dynamic_split_fuse: whether to split prompt / generate to different scheduling phases. - + vLLM-like settings: max_num_seqs: max number of scheduled sequences (you can think of it as "max batch size"). enable_prefix_caching: Enable caching of KV-blocks. @@ -1435,21 +1435,21 @@ class SchedulerConfig: ... class StopCriteria: """ - + StopCriteria controls the stopping condition for grouped beam search. - + The following values are possible: "openvino_genai.StopCriteria.EARLY" stops as soon as there are `num_beams` complete candidates. "openvino_genai.StopCriteria.HEURISTIC" stops when is it unlikely to find better candidates. 
"openvino_genai.StopCriteria.NEVER" stops when there cannot be better candidates. - - + + Members: - + EARLY - + HEURISTIC - + NEVER """ EARLY: typing.ClassVar[StopCriteria] # value = @@ -1484,7 +1484,7 @@ class StopCriteria: ... class StreamerBase: """ - + Base class for streamers. In order to use inherit from from this class and implement put, and methods. """ def __init__(self) -> None: @@ -1589,12 +1589,12 @@ class Text2ImagePipeline: def generate(self, prompt: str, **kwargs) -> openvino._pyopenvino.Tensor: """ Generates images for text-to-image models. - + :param prompt: input prompt :type prompt: str - + :param kwargs: arbitrary keyword arguments with keys corresponding to generate params. - + Expected parameters list: prompt_2: str - second prompt, prompt_3: str - third prompt, @@ -1612,7 +1612,7 @@ class Text2ImagePipeline: adapters: LoRA adapters, strength: strength for image to image generation. 1.0f means initial image is fully noised, max_sequence_length: int - length of t5_encoder_model input - + :return: ov.Tensor with resulting images :rtype: ov.Tensor """ @@ -1747,10 +1747,10 @@ class UNet2DConditionModel: ... class VLMDecodedResults: """ - + Structure to store resulting batched text outputs and scores for each batch. The first num_return_sequences elements correspond to the first batch element. - + Parameters: texts: vector of resulting sequences. scores: scores for each sequence. @@ -1771,12 +1771,12 @@ class VLMDecodedResults: ... class VLMPerfMetrics(PerfMetrics): """ - + Structure with raw performance metrics for each generation before any statistics are calculated. - + :param get_prepare_embeddings_duration: Returns mean and standard deviation of embeddings preparation duration in milliseconds :type get_prepare_embeddings_duration: MeanStdPair - + :param vlm_raw_metrics: VLM specific raw metrics :type VLMRawPerfMetrics: """ @@ -1805,22 +1805,22 @@ class VLMPipeline: def generate(self, prompt: str, images: list[openvino._pyopenvino.Tensor], generation_config: GenerationConfig, streamer: typing.Callable[[str], bool] | StreamerBase | None = None, **kwargs) -> VLMDecodedResults: """ Generates sequences for VLMs. - + :param prompt: input prompt :type prompt: str - + :param images: image or list of images :type images: List[ov.Tensor] or ov.Tensor - + :param generation_config: generation_config :type generation_config: GenerationConfig or a Dict - + :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped :type : Callable[[str], bool], ov.genai.StreamerBase - + :param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields. :type : Dict - + :return: return results in decoded form :rtype: VLMDecodedResults """ @@ -1828,22 +1828,22 @@ class VLMPipeline: def generate(self, prompt: str, images: openvino._pyopenvino.Tensor, generation_config: GenerationConfig, streamer: typing.Callable[[str], bool] | StreamerBase | None = None, **kwargs) -> VLMDecodedResults: """ Generates sequences for VLMs. - + :param prompt: input prompt :type prompt: str - + :param images: image or list of images :type images: List[ov.Tensor] or ov.Tensor - + :param generation_config: generation_config :type generation_config: GenerationConfig or a Dict - + :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped :type : Callable[[str], bool], ov.genai.StreamerBase - + :param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields. 
:type : Dict - + :return: return results in decoded form :rtype: VLMDecodedResults """ @@ -1851,18 +1851,18 @@ class VLMPipeline: def generate(self, prompt: str, **kwargs) -> VLMDecodedResults: """ Generates sequences for VLMs. - + :param prompt: input prompt :type prompt: str - + :param kwargs: arbitrary keyword arguments with keys corresponding to generate params. - + Expected parameters list: image: ov.Tensor - input image, images: List[ov.Tensor] - input images, generation_config: GenerationConfig, streamer: Callable[[str], bool], ov.genai.StreamerBase - streamer either as a lambda with a boolean returning flag whether generation should be stopped - + :return: return results in decoded form :rtype: VLMDecodedResults """ @@ -1878,9 +1878,9 @@ class VLMPipeline: ... class VLMRawPerfMetrics: """ - + Structure with VLM specific raw performance metrics for each generation before any statistics are calculated. - + :param prepare_embeddings_durations: Durations of embeddings preparation. :type prepare_embeddings_durations: List[MicroSeconds] """ @@ -1891,9 +1891,9 @@ class VLMRawPerfMetrics: ... class WhisperDecodedResultChunk: """ - + Structure to store decoded text with corresponding timestamps - + :param start_ts chunk start time in seconds :param end_ts chunk end time in seconds :param text chunk text @@ -1911,9 +1911,9 @@ class WhisperDecodedResultChunk: ... class WhisperDecodedResults: """ - + Structure to store resulting text outputs and scores. - + Parameters: texts: vector of resulting sequences. scores: scores for each sequence. @@ -1936,57 +1936,57 @@ class WhisperDecodedResults: ... class WhisperGenerationConfig: """ - + WhisperGenerationConfig :param max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set. :type max_length: int - + :param max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. :type max_new_tokens: int - + :param eos_token_id: End of stream token id. :type eos_token_id: int - + Whisper specific parameters: - + :param decoder_start_token_id: Corresponds to the ”<|startoftranscript|>” token. :type decoder_start_token_id: int - + :param pad_token_id: Padding token id. :type pad_token_id: int - + :param translate_token_id: Translate token id. :type translate_token_id: int - + :param transcribe_token_id: Transcribe token id. :type transcribe_token_id: int - + :param no_timestamps_token_id: No timestamps token id. :type no_timestamps_token_id: int - + :param prev_sot_token_id: Corresponds to the ”<|startofprev|>” token. :type prev_sot_token_id: int - + :param is_multilingual: :type is_multilingual: bool - + :param begin_suppress_tokens: A list containing tokens that will be suppressed at the beginning of the sampling process. :type begin_suppress_tokens: list[int] - + :param suppress_tokens: A list containing the non-speech tokens that will be suppressed during generation. :type suppress_tokens: list[int] - + :param language: Language token to use for generation in the form of <|en|>. You can find all the possible language tokens in the generation_config.json lang_to_id dictionary. :type language: Optional[str] - + :param lang_to_id: Language token to token_id map. Initialized from the generation_config.json lang_to_id dictionary. 
:type lang_to_id: Dict[str, int] - + :param task: Task to use for generation, either “translate” or “transcribe” :type task: int - + :param return_timestamps: If `true` the pipeline will return timestamps along the text for *segments* of words in the text. For instance, if you get WhisperDecodedResultChunk @@ -1996,25 +1996,25 @@ class WhisperGenerationConfig: then it means the model predicts that the segment "Hi there!" was spoken after `0.5` and before `1.5` seconds. Note that a segment of text refers to a sequence of one or more words, rather than individual words. :type return_timestamps: bool - + :param initial_prompt: Initial prompt tokens passed as a previous transcription (after `<|startofprev|>` token) to the first processing window. Can be used to steer the model to use particular spellings or styles. - + Example: auto result = pipeline.generate(raw_speech); // He has gone and gone for good answered Paul Icrom who... - + auto result = pipeline.generate(raw_speech, ov::genai::initial_prompt("Polychrome")); // He has gone and gone for good answered Polychrome who... :type initial_prompt: Optional[str] - + :param hotwords: Hotwords tokens passed as a previous transcription (after `<|startofprev|>` token) to the all processing windows. Can be used to steer the model to use particular spellings or styles. - + Example: auto result = pipeline.generate(raw_speech); // He has gone and gone for good answered Paul Icrom who... - + auto result = pipeline.generate(raw_speech, ov::genai::hotwords("Polychrome")); // He has gone and gone for good answered Polychrome who... :type hotwords: Optional[str] @@ -2052,12 +2052,12 @@ class WhisperGenerationConfig: ... class WhisperPerfMetrics(PerfMetrics): """ - + Structure with raw performance metrics for each generation before any statistics are calculated. - + :param get_features_extraction_duration: Returns mean and standard deviation of features extraction duration in milliseconds :type get_features_extraction_duration: MeanStdPair - + :param whisper_raw_metrics: Whisper specific raw metrics :type WhisperRawPerfMetrics: """ @@ -2081,74 +2081,74 @@ class WhisperPipeline: def generate(self, raw_speech_input: list[float], generation_config: WhisperGenerationConfig | None = None, streamer: typing.Callable[[str], bool] | ChunkStreamerBase | None = None, **kwargs) -> WhisperDecodedResults: """ High level generate that receives raw speech as a vector of floats and returns decoded output. - + :param raw_speech_input: inputs in the form of list of floats. Required to be normalized to near [-1, 1] range and have 16k Hz sampling rate. :type raw_speech_input: List[float] - + :param generation_config: generation_config :type generation_config: WhisperGenerationConfig or a Dict - + :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped. Streamer supported for short-form audio (< 30 seconds) with `return_timestamps=False` only :type : Callable[[str], bool], ov.genai.StreamerBase - + :param kwargs: arbitrary keyword arguments with keys corresponding to WhisperGenerationConfig fields. :type : Dict - + :return: return results in decoded form :rtype: WhisperDecodedResults - - + + WhisperGenerationConfig :param max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set. 
:type max_length: int - + :param max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. :type max_new_tokens: int - + :param eos_token_id: End of stream token id. :type eos_token_id: int - + Whisper specific parameters: - + :param decoder_start_token_id: Corresponds to the ”<|startoftranscript|>” token. :type decoder_start_token_id: int - + :param pad_token_id: Padding token id. :type pad_token_id: int - + :param translate_token_id: Translate token id. :type translate_token_id: int - + :param transcribe_token_id: Transcribe token id. :type transcribe_token_id: int - + :param no_timestamps_token_id: No timestamps token id. :type no_timestamps_token_id: int - + :param prev_sot_token_id: Corresponds to the ”<|startofprev|>” token. :type prev_sot_token_id: int - + :param is_multilingual: :type is_multilingual: bool - + :param begin_suppress_tokens: A list containing tokens that will be suppressed at the beginning of the sampling process. :type begin_suppress_tokens: list[int] - + :param suppress_tokens: A list containing the non-speech tokens that will be suppressed during generation. :type suppress_tokens: list[int] - + :param language: Language token to use for generation in the form of <|en|>. You can find all the possible language tokens in the generation_config.json lang_to_id dictionary. :type language: Optional[str] - + :param lang_to_id: Language token to token_id map. Initialized from the generation_config.json lang_to_id dictionary. :type lang_to_id: Dict[str, int] - + :param task: Task to use for generation, either “translate” or “transcribe” :type task: int - + :param return_timestamps: If `true` the pipeline will return timestamps along the text for *segments* of words in the text. For instance, if you get WhisperDecodedResultChunk @@ -2158,25 +2158,25 @@ class WhisperPipeline: then it means the model predicts that the segment "Hi there!" was spoken after `0.5` and before `1.5` seconds. Note that a segment of text refers to a sequence of one or more words, rather than individual words. :type return_timestamps: bool - + :param initial_prompt: Initial prompt tokens passed as a previous transcription (after `<|startofprev|>` token) to the first processing window. Can be used to steer the model to use particular spellings or styles. - + Example: auto result = pipeline.generate(raw_speech); // He has gone and gone for good answered Paul Icrom who... - + auto result = pipeline.generate(raw_speech, ov::genai::initial_prompt("Polychrome")); // He has gone and gone for good answered Polychrome who... :type initial_prompt: Optional[str] - + :param hotwords: Hotwords tokens passed as a previous transcription (after `<|startofprev|>` token) to the all processing windows. Can be used to steer the model to use particular spellings or styles. - + Example: auto result = pipeline.generate(raw_speech); // He has gone and gone for good answered Paul Icrom who... - + auto result = pipeline.generate(raw_speech, ov::genai::hotwords("Polychrome")); // He has gone and gone for good answered Polychrome who... :type hotwords: Optional[str] @@ -2189,9 +2189,9 @@ class WhisperPipeline: ... class WhisperRawPerfMetrics: """ - + Structure with whisper specific raw performance metrics for each generation before any statistics are calculated. - + :param features_extraction_durations: Duration for each features extraction call. 
:type features_extraction_durations: List[MicroSeconds] """ diff --git a/src/python/py_continuous_batching_pipeline.cpp b/src/python/py_continuous_batching_pipeline.cpp index 975100cb11..6fe2db588a 100644 --- a/src/python/py_continuous_batching_pipeline.cpp +++ b/src/python/py_continuous_batching_pipeline.cpp @@ -48,7 +48,7 @@ auto cache_eviction_config_docstring = R"( auto scheduler_config_docstring = R"( SchedulerConfig to construct ContinuousBatchingPipeline - Parameters: + Parameters: max_num_batched_tokens: a maximum number of tokens to batch (in contrast to max_batch_size which combines independent sequences, we consider total amount of tokens in a batch). num_kv_blocks: total number of KV blocks available to scheduler logic. @@ -69,7 +69,7 @@ auto scheduler_config_docstring = R"( auto generation_result_docstring = R"( GenerationResult stores resulting batched tokens and scores. - Parameters: + Parameters: request_id: obsolete when handle API is approved as handle will connect results with prompts. generation_ids: in a generic case we have multiple generation results per initial prompt depending on sampling parameters (e.g. beam search or parallel sampling). @@ -149,7 +149,7 @@ void init_continuous_batching_pipeline(py::module_& m) { [](GenerationResult &r) -> py::typing::List { return pyutils::handle_utf8(r.m_generation_ids); }); - + py::class_(m, "EncodedGenerationResult", generation_result_docstring) .def(py::init<>()) .def_readonly("m_request_id", &EncodedGenerationResult::m_request_id) diff --git a/src/python/py_llm_pipeline.cpp b/src/python/py_llm_pipeline.cpp index 2d5e5e6abc..f8e8cb06d3 100644 --- a/src/python/py_llm_pipeline.cpp +++ b/src/python/py_llm_pipeline.cpp @@ -91,7 +91,7 @@ extern char generation_config_docstring[]; void init_llm_pipeline(py::module_& m) { py::class_(m, "LLMPipeline", "This class is used for generation with LLMs") - // init(model_path, tokenizer, device, config, kwargs) should be defined before init(model_path, device, config, kwargs) + // init(model_path, tokenizer, device, config, kwargs) should be defined before init(model_path, device, config, kwargs) // to prevent tokenizer treated as kwargs argument .def(py::init([]( const std::filesystem::path& models_path, @@ -103,8 +103,8 @@ void init_llm_pipeline(py::module_& m) { ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); ov::AnyMap properties = pyutils::kwargs_to_any_map(kwargs); if (config.size()) { - PyErr_WarnEx(PyExc_DeprecationWarning, - "'config' parameters is deprecated, please use kwargs to pass config properties instead.", + PyErr_WarnEx(PyExc_DeprecationWarning, + "'config' parameters is deprecated, please use kwargs to pass config properties instead.", 1); auto config_properties = pyutils::properties_to_any_map(config); properties.insert(config_properties.begin(), config_properties.end()); @@ -133,8 +133,8 @@ void init_llm_pipeline(py::module_& m) { ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); ov::AnyMap properties = pyutils::kwargs_to_any_map(kwargs); if (config.size()) { - PyErr_WarnEx(PyExc_DeprecationWarning, - "'config' parameters is deprecated, please use kwargs to pass config properties instead.", + PyErr_WarnEx(PyExc_DeprecationWarning, + "'config' parameters is deprecated, please use kwargs to pass config properties instead.", 1); auto config_properties = pyutils::properties_to_any_map(config); properties.insert(config_properties.begin(), config_properties.end()); diff --git a/src/python/py_lora_adapter.cpp b/src/python/py_lora_adapter.cpp index 
7f98b67064..93e241bec6 100644 --- a/src/python/py_lora_adapter.cpp +++ b/src/python/py_lora_adapter.cpp @@ -26,7 +26,7 @@ void init_lora_adapter(py::module_& m) { path (os.PathLike): Path to adapter file in safetensors format. )") .def( - "__bool__", + "__bool__", [](ov::genai::Adapter& self ) { return bool(self); @@ -47,7 +47,7 @@ void init_lora_adapter(py::module_& m) { py::arg_v("mode", ov::genai::AdapterConfig::Mode::MODE_AUTO, "AdapterConfig.Mode.MODE_AUTO")); adapter_config.def(py::init([]( - const ov::genai::Adapter& adapter, + const ov::genai::Adapter& adapter, float alpha, ov::genai::AdapterConfig::Mode mode) { return std::make_unique(adapter, alpha, mode); @@ -55,17 +55,17 @@ void init_lora_adapter(py::module_& m) { py::arg("adapter"), py::arg("alpha"), py::arg_v("mode", ov::genai::AdapterConfig::Mode::MODE_AUTO, "AdapterConfig.Mode.MODE_AUTO")); - + adapter_config.def(py::init([]( - const ov::genai::Adapter& adapter, + const ov::genai::Adapter& adapter, ov::genai::AdapterConfig::Mode mode) { return std::make_unique(adapter, mode); }), py::arg("adapter"), py::arg_v("mode", ov::genai::AdapterConfig::Mode::MODE_AUTO, "AdapterConfig.Mode.MODE_AUTO")); - + adapter_config.def(py::init([]( - const std::vector& adapters, + const std::vector& adapters, ov::genai::AdapterConfig::Mode mode) { return std::make_unique(adapters, mode); }), @@ -73,14 +73,14 @@ void init_lora_adapter(py::module_& m) { py::arg_v("mode", ov::genai::AdapterConfig::Mode::MODE_AUTO, "AdapterConfig.Mode.MODE_AUTO")); adapter_config.def(py::init([]( - const std::vector>& adapters, + const std::vector>& adapters, ov::genai::AdapterConfig::Mode mode) { return std::make_unique(adapters, mode); }), py::arg("adapters"), py::arg_v("mode", ov::genai::AdapterConfig::Mode::MODE_AUTO, "AdapterConfig.Mode.MODE_AUTO")); adapter_config.def( - "__bool__", + "__bool__", [](ov::genai::AdapterConfig& self ) { return bool(self); diff --git a/src/python/py_openvino_genai.cpp b/src/python/py_openvino_genai.cpp index f8e577d5c8..71cf93490e 100644 --- a/src/python/py_openvino_genai.cpp +++ b/src/python/py_openvino_genai.cpp @@ -41,7 +41,7 @@ auto decoded_results_docstring = R"( Structure to store resulting batched text outputs and scores for each batch. The first num_return_sequences elements correspond to the first batch element. - Parameters: + Parameters: texts: vector of resulting sequences. scores: scores for each sequence. metrics: performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics. @@ -54,7 +54,7 @@ auto encoded_results_docstring = R"( sum of logarithmic probabilities for each token in the sequence. In the case of greedy decoding scores are filled with zeros. - Parameters: + Parameters: tokens: sequence of resulting tokens. scores: sum of logarithmic probabilities of all tokens in the sequence. metrics: performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics. 
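Editorial note, not part of the patch: the scheduler_config and generation_result docstrings above describe how ContinuousBatchingPipeline results are surfaced to Python. A hedged sketch follows, mirroring the constructor and generate() calls used in tests/python_tests/common.py later in this patch; "model_dir" is a placeholder, SchedulerConfig defaults are assumed to be usable as-is, and the `m_generation_ids` property name is assumed to match the underlying field referenced in the binding above.

```python
# Illustrative sketch only; path, defaults and property names are assumptions.
import openvino_genai as ov_genai

scheduler_config = ov_genai.SchedulerConfig()
scheduler_config.cache_size = 1  # total KV cache size in GB, per the docstring above

pipe = ov_genai.ContinuousBatchingPipeline("model_dir",
                                           scheduler_config=scheduler_config,
                                           device="CPU",
                                           properties={},
                                           tokenizer_properties={})

config = ov_genai.GenerationConfig()
config.max_new_tokens = 32

results = pipe.generate(["What is OpenVINO?"], [config])
for result in results:
    # A GenerationResult may carry several sequences per prompt
    # (e.g. beam search or parallel sampling), as the docstring notes.
    for text in result.m_generation_ids:
        print(text)
```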
@@ -100,7 +100,7 @@ PYBIND11_MODULE(py_openvino_genai, m) { py::str res; if (valid_utf8_strings.size() == 1) return valid_utf8_strings[0]; - + for (size_t i = 0; i < valid_utf8_strings.size() - 1; i++) { res += py::str(std::to_string(dr.scores[i])) + py::str(": ") + valid_utf8_strings[i] + py::str("\n"); } diff --git a/src/python/py_perf_metrics.cpp b/src/python/py_perf_metrics.cpp index 1d17e34905..6f745587c9 100644 --- a/src/python/py_perf_metrics.cpp +++ b/src/python/py_perf_metrics.cpp @@ -111,8 +111,8 @@ std::vector timestamp_to_ms(const T& instance, U T::*member) { const auto& timestamps = instance.*member; res.reserve(timestamps.size()); std::transform(timestamps.begin(), timestamps.end(), std::back_inserter(res), - [](const auto& timestamp) { - return std::chrono::duration(timestamp.time_since_epoch()).count(); + [](const auto& timestamp) { + return std::chrono::duration(timestamp.time_since_epoch()).count(); }); return res; } @@ -125,11 +125,11 @@ void init_perf_metrics(py::module_& m) { .def_property_readonly("generate_durations", [](const RawPerfMetrics &rw) { return pyutils::get_ms(rw, &RawPerfMetrics::generate_durations); }) - .def_property_readonly("tokenization_durations", [](const RawPerfMetrics &rw) { + .def_property_readonly("tokenization_durations", [](const RawPerfMetrics &rw) { return pyutils::get_ms(rw, &RawPerfMetrics::tokenization_durations); }) - .def_property_readonly("detokenization_durations", [](const RawPerfMetrics &rw) { - return pyutils::get_ms(rw, &RawPerfMetrics::detokenization_durations); + .def_property_readonly("detokenization_durations", [](const RawPerfMetrics &rw) { + return pyutils::get_ms(rw, &RawPerfMetrics::detokenization_durations); }) .def_property_readonly("m_times_to_first_token", [](const RawPerfMetrics &rw) { return pyutils::get_ms(rw, &RawPerfMetrics::m_times_to_first_token); diff --git a/src/python/py_tokenizer.cpp b/src/python/py_tokenizer.cpp index db4643a65c..84b4e5c5b9 100644 --- a/src/python/py_tokenizer.cpp +++ b/src/python/py_tokenizer.cpp @@ -34,8 +34,8 @@ void init_tokenizer(py::module_& m) { ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); auto kwargs_properties = pyutils::kwargs_to_any_map(kwargs); if (properties.size()) { - PyErr_WarnEx(PyExc_DeprecationWarning, - "'properties' parameters is deprecated, please use kwargs to pass config properties instead.", + PyErr_WarnEx(PyExc_DeprecationWarning, + "'properties' parameters is deprecated, please use kwargs to pass config properties instead.", 1); auto map_properties = pyutils::properties_to_any_map(properties); kwargs_properties.insert(map_properties.begin(), map_properties.end()); diff --git a/src/python/py_utils.cpp b/src/python/py_utils.cpp index 1fc34a36d2..94a0f053e0 100644 --- a/src/python/py_utils.cpp +++ b/src/python/py_utils.cpp @@ -87,7 +87,7 @@ ov::Any py_object_to_any(const py::object& py_obj, std::string property_name) { "num_inference_steps", "max_sequence_length" }; - // These properties should be casted to ov::AnyMap, instead of std::map. + // These properties should be casted to ov::AnyMap, instead of std::map. 
std::set any_map_properties = { "GENERATE_CONFIG", "PREFILL_CONFIG", diff --git a/tests/cpp/block_manager.cpp b/tests/cpp/block_manager.cpp index 46c2fdddd7..1e17d2523c 100644 --- a/tests/cpp/block_manager.cpp +++ b/tests/cpp/block_manager.cpp @@ -103,4 +103,4 @@ TEST(TestBlockManager, CanFreeBlocksFromSequence) { size_t seq_id = sequence_group->get_sequences()[0]->get_id(); bm.free_blocks_from_sequence(seq_id, { {0}, {1}, {2} }); EXPECT_EQ(bm.num_free_blocks(), 6); -} \ No newline at end of file +} diff --git a/tests/cpp/cache_manager.cpp b/tests/cpp/cache_manager.cpp index 095cc39f09..d400b3ef1d 100644 --- a/tests/cpp/cache_manager.cpp +++ b/tests/cpp/cache_manager.cpp @@ -64,7 +64,7 @@ TEST(TestCacheManager, test_cache_size_param) { auto cache_manager = std::make_shared(device_config, request, core); auto block_manager = BlockManager(device_config.get_num_kv_blocks(), false, device_config.get_block_size(), device_config.get_num_layers()); cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks()); - + ASSERT_EQ(get_total_allocated_bytes(cache_manager, num_decoder_layers), 2146959360); } @@ -133,4 +133,4 @@ TEST(TestCacheManager, test_dynamic_cache_increase) { // check that cache does not increase if new blocks were not allocated cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks()); OPENVINO_ASSERT(get_total_allocated_bytes(cache_manager, num_decoder_layers), 200 * block_size_in_bytes); -} \ No newline at end of file +} diff --git a/tests/cpp/logit_filtering.cpp b/tests/cpp/logit_filtering.cpp index dcb98113f3..06cc3e7b5b 100644 --- a/tests/cpp/logit_filtering.cpp +++ b/tests/cpp/logit_filtering.cpp @@ -342,4 +342,3 @@ const std::vector EOS_PENALTY_TRANSFORM_TEST_CASE INSTANTIATE_TEST_SUITE_P(VariousInputs, EOSPenaltyTransformTest, testing::ValuesIn(EOS_PENALTY_TRANSFORM_TEST_CASES)); - diff --git a/tests/cpp/sampler.cpp b/tests/cpp/sampler.cpp index 3741880827..19201f4fd1 100644 --- a/tests/cpp/sampler.cpp +++ b/tests/cpp/sampler.cpp @@ -42,7 +42,7 @@ TEST(SamplerValidationMode, gen_phase_to_cut_whole_seq) { }; // to emulate processed prompt and add next token [ 0 ] - sequence_groups.front()->get_sequences().front()->append_token(0, 1.f); + sequence_groups.front()->get_sequences().front()->append_token(0, 1.f); sequence_groups.front()->update_processed_tokens_num(5); // append candidates [ 2, 3, 4 ] @@ -86,7 +86,7 @@ TEST(SamplerValidationMode, gen_phase_to_cut_part_seq) { }; // to emulate processed prompt and add next token [ 0 ] - sequence_groups.front()->get_sequences().front()->append_token(0, 1.f); + sequence_groups.front()->get_sequences().front()->append_token(0, 1.f); sequence_groups.front()->update_processed_tokens_num(5); // append candidates [ 1, 2, 2 ] @@ -131,7 +131,7 @@ TEST(SamplerValidationMode, gen_phase) { }; // to emulate processed prompt and add next token [ 0 ] - sequence_groups.front()->get_sequences().front()->append_token(0, 1.f); + sequence_groups.front()->get_sequences().front()->append_token(0, 1.f); sequence_groups.front()->update_processed_tokens_num(5); // append candidates [ 1, 2, 3 ] diff --git a/tests/cpp/speculative_decoding.cpp b/tests/cpp/speculative_decoding.cpp index 1cf8db0fab..16cd75563f 100644 --- a/tests/cpp/speculative_decoding.cpp +++ b/tests/cpp/speculative_decoding.cpp @@ -19,7 +19,7 @@ class CBForSDTest : public testing::Test, public ov::genai::ContinuousBatchingPi sampling_params.num_assistant_tokens = 1; ov::genai::SequenceGroup::Ptr sequence_group = 
std::make_shared(request_id, input_ids, - sampling_params, + sampling_params, 32); { @@ -43,7 +43,7 @@ TEST_F(CBForSDTest, init_sequence_by_not_empty__one_sequence) { std::vector tokens = { 0, 1, 2 }; std::vector log_probs = { 0.1f, 0.2f, 0.3f }; ov::genai::GeneratedSequences candidate{{ 0, ov::genai::GeneratedSequence(tokens, log_probs) }}; - + auto before = m_pipeline.get_generated_requests(); auto update_result = m_pipeline.update_request(0, candidate, true); ASSERT_EQ(update_result.removed_tokens_cnt, 0); @@ -64,7 +64,7 @@ TEST_F(CBForSDTest, init_sequence_by_empty__one_sequence) { std::vector tokens = {}; std::vector log_probs = {}; ov::genai::GeneratedSequences candidate{{ 0, ov::genai::GeneratedSequence(tokens, log_probs) }}; - + auto before = m_pipeline.get_generated_requests(); auto update_result = m_pipeline.update_request(0, candidate, true); ASSERT_EQ(update_result.removed_tokens_cnt, 0); @@ -85,7 +85,7 @@ TEST_F(CBForSDTest, no_updated_tokens__one_sequence) { std::vector tokens = { 0, 1, 2 }; std::vector log_probs = { 0.1f, 0.2f, 0.3f }; ov::genai::GeneratedSequences candidate{{ 0, ov::genai::GeneratedSequence(tokens, log_probs) }}; - + auto update_result = m_pipeline.update_request(0, candidate, true); ASSERT_EQ(update_result.removed_tokens_cnt, 0); ASSERT_EQ(update_result.inserted_tokens_cnt, 3); @@ -112,7 +112,7 @@ TEST_F(CBForSDTest, remove_tokens__one_sequence) { std::vector tokens = { 0, 1, 2 }; std::vector log_probs = { 0.1f, 0.2f, 0.3f }; ov::genai::GeneratedSequences candidate{{ 0, ov::genai::GeneratedSequence(tokens, log_probs) }}; - + auto update_result = m_pipeline.update_request(0, candidate, true); ASSERT_EQ(update_result.removed_tokens_cnt, 0); ASSERT_EQ(update_result.inserted_tokens_cnt, 3); @@ -141,7 +141,7 @@ TEST_F(CBForSDTest, remove_and_replace_tokens__one_sequence) { std::vector tokens = { 0, 1, 2 }; std::vector log_probs = { 0.1f, 0.2f, 0.3f }; ov::genai::GeneratedSequences candidate{{ 0, ov::genai::GeneratedSequence(tokens, log_probs) }}; - + auto update_result = m_pipeline.update_request(0, candidate, true); ASSERT_EQ(update_result.removed_tokens_cnt, 0); ASSERT_EQ(update_result.inserted_tokens_cnt, 3); @@ -170,7 +170,7 @@ TEST_F(CBForSDTest, add_tokens__one_sequence) { std::vector tokens = { 0, 1, 2 }; std::vector log_probs = { 0.1f, 0.2f, 0.3f }; ov::genai::GeneratedSequences candidate{{ 0, ov::genai::GeneratedSequence(tokens, log_probs) }}; - + auto update_result = m_pipeline.update_request(0, candidate, true); ASSERT_EQ(update_result.removed_tokens_cnt, 0); ASSERT_EQ(update_result.inserted_tokens_cnt, 3); @@ -204,7 +204,7 @@ TEST_F(CBForSDTest, update_empty_sequence_by_not_empty__two_sequence) { { 0, ov::genai::GeneratedSequence(tokens_0, log_probs_0) }, { 1, ov::genai::GeneratedSequence(tokens_1, log_probs_1) } }; - + auto before = m_pipeline.get_generated_requests(); auto update_result = m_pipeline.update_request(0, candidate, true); ASSERT_EQ(update_result.removed_tokens_cnt, 0); @@ -233,7 +233,7 @@ TEST_F(CBForSDTest, init_sequence_by_not_empty__two_sequence) { { 0, ov::genai::GeneratedSequence(tokens_0, log_probs_0) }, { 1, ov::genai::GeneratedSequence(tokens_1, log_probs_1) } }; - + auto before = m_pipeline.get_generated_requests(); auto update_result = m_pipeline.init_request_by_candidate(0, candidate); ASSERT_EQ(update_result.removed_tokens_cnt, 0); @@ -260,7 +260,7 @@ TEST_F(CBForSDTest, init_sequence_by_empty__two_sequence) { { 0, ov::genai::GeneratedSequence(tokens, log_probs) }, { 1, ov::genai::GeneratedSequence(tokens, log_probs) }, }; 
- + auto before = m_pipeline.get_generated_requests(); auto update_result = m_pipeline.init_request_by_candidate(0, candidate); ASSERT_EQ(update_result.removed_tokens_cnt, 0); @@ -286,7 +286,7 @@ TEST_F(CBForSDTest, no_updated_tokens__two_sequence) { { 0, ov::genai::GeneratedSequence(tokens_0, log_probs_0) }, { 1, ov::genai::GeneratedSequence(tokens_1, log_probs_1) }, }; - + auto update_result = m_pipeline.init_request_by_candidate(0, candidate); ASSERT_EQ(update_result.removed_tokens_cnt, 0); ASSERT_EQ(update_result.inserted_tokens_cnt, 2); @@ -319,7 +319,7 @@ TEST_F(CBForSDTest, remove_tokens__two_sequence) { { 0, ov::genai::GeneratedSequence(tokens, log_probs) }, { 1, ov::genai::GeneratedSequence(tokens, log_probs) }, }; - + auto update_result = m_pipeline.init_request_by_candidate(0, candidate); ASSERT_EQ(update_result.removed_tokens_cnt, 0); ASSERT_EQ(update_result.inserted_tokens_cnt, 3); @@ -358,7 +358,7 @@ TEST_F(CBForSDTest, remove_and_replace_tokens__two_sequence) { { 0, ov::genai::GeneratedSequence(tokens, log_probs) }, { 1, ov::genai::GeneratedSequence(tokens, log_probs) }, }; - + auto update_result = m_pipeline.init_request_by_candidate(0, candidate); ASSERT_EQ(update_result.removed_tokens_cnt, 0); ASSERT_EQ(update_result.inserted_tokens_cnt, 3); @@ -397,7 +397,7 @@ TEST_F(CBForSDTest, add_tokens__two_sequence) { { 0, ov::genai::GeneratedSequence(tokens, log_probs) }, { 1, ov::genai::GeneratedSequence(tokens, log_probs) }, }; - + auto update_result = m_pipeline.init_request_by_candidate(0, candidate); ASSERT_EQ(update_result.removed_tokens_cnt, 0); ASSERT_EQ(update_result.inserted_tokens_cnt, 3); @@ -426,4 +426,3 @@ TEST_F(CBForSDTest, add_tokens__two_sequence) { ASSERT_EQ(after.at(0).at(1).token_ids, tokens); ASSERT_EQ(after.at(0).at(1).log_probs, log_probs); } - diff --git a/tests/cpp/utils.cpp b/tests/cpp/utils.cpp index d00edae6fb..87f6e43efe 100644 --- a/tests/cpp/utils.cpp +++ b/tests/cpp/utils.cpp @@ -18,4 +18,4 @@ TEST(TestIsContainer, test_is_container) { EXPECT_EQ(is_container>, true); EXPECT_EQ(is_container, true); EXPECT_EQ(is_container>, true); -} \ No newline at end of file +} diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index 2fca58a959..46d9bc5512 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -301,11 +301,11 @@ def run_continuous_batching( models_path : Path, scheduler_config : SchedulerConfig, prompts: List[str], - generation_configs : List[GenerationConfig] | GenerationConfig + generation_configs : List[GenerationConfig] | GenerationConfig ) -> List[GenerationResult]: if type(generation_configs) is not list: generation_configs = [generation_configs] * len(prompts) - + cb_pipe = ContinuousBatchingPipeline(models_path, scheduler_config=scheduler_config, device='CPU', tokenizer_properties={}, properties=get_default_properties()) output = cb_pipe.generate(prompts, generation_configs) diff --git a/tests/python_tests/conftest.py b/tests/python_tests/conftest.py index e159045601..bd623f1f5a 100644 --- a/tests/python_tests/conftest.py +++ b/tests/python_tests/conftest.py @@ -22,4 +22,3 @@ def pytest_configure(config: pytest.Config): marker = 'precommit' if config.getoption('-m') == 'precommit' else 'nightly' pytest.run_marker = marker pytest.selected_model_ids = config.getoption('--model_ids', default=None) - diff --git a/tests/python_tests/data/long_prompts.txt b/tests/python_tests/data/long_prompts.txt index 470f22f301..a668987ad1 100644 --- a/tests/python_tests/data/long_prompts.txt +++ 
b/tests/python_tests/data/long_prompts.txt @@ -12,4 +12,4 @@ Quantum entanglement is one of the most intriguing phenomena in the realm of qua The crystal became her talisman, reminding her of her promise and the magic of storytelling—a bridge between the ordinary and the extraordinary, where dreams take flight and every book waited to be opened . ### The Fascinating World of Bioluminescence #### Introduction Bioluminescence is a natural phenomenon that occurs in various organisms, characterized by the ability to emit light . This incredible adaptation can be found in a range of living beings, including certain species of fungi, bacteria, and marine animals . The light produced can serve various purposes such as predation, communication, and camouflage . This article explores the mechanisms, examples, and ecological significance of bioluminescence, shedding light on its role in the natural world . The process of bioluminescence involves a biochemical reaction between a light-emitting molecule known as luciferin and an enzyme called luciferase . This reaction occurs within specialized cells or organelles and typically requires oxygen . The specific structure of luciferin varies among different organisms, leading to a wide range of colors emitted, from blue and green to red and yellow . The basic biochemical reaction can be summarized as follows: 1 . **Formation of Luciferin-Oxygen Complex**: When luciferin reacts with oxygen in the presence of luciferase, it forms an unstable complex . 2 . The implications of quantum entanglement extend beyond fundamental physics. They intersect with various fields, including thermodynamics, information theory, and even biology. Researchers are exploring the possibility that quantum entanglement plays a role in biological processes, such as photosynthesis and avian navigation. For example, certain birds are thought to navigate using quantum coherence in their eyes. This intriguing intersection of quantum phenomena and biological systems suggests that entanglement may be a universal principle, manifesting in diverse contexts across nature. -The study of entanglement has also led to the exploration of quantum teleportation—the process of transferring quantum states from one location to another without physically moving the particle itself. By creating a pair of entangled particles, where one remains at point A and the other is sent to point B, the state of the particle at point A can be "teleported" to point B through a classical communication channel. This concept is not merely science fiction; researchers have successfully demonstrated teleportation of quantum states in laboratory settings, paving the way for potential advancements in quantum networks. \ No newline at end of file +The study of entanglement has also led to the exploration of quantum teleportation—the process of transferring quantum states from one location to another without physically moving the particle itself. By creating a pair of entangled particles, where one remains at point A and the other is sent to point B, the state of the particle at point A can be "teleported" to point B through a classical communication channel. This concept is not merely science fiction; researchers have successfully demonstrated teleportation of quantum states in laboratory settings, paving the way for potential advancements in quantum networks. 
diff --git a/tests/python_tests/models/nightly b/tests/python_tests/models/nightly index 72b707bd63..13749a1991 100644 --- a/tests/python_tests/models/nightly +++ b/tests/python_tests/models/nightly @@ -48,4 +48,4 @@ hf-internal-testing/tiny-random-StableLmForCausalLM hf-internal-testing/tiny-random-PhiForCausalLM hf-internal-testing/tiny-random-CodeGenForCausalLM hf-internal-testing/tiny-random-Starcoder2ForCausalLM -hf-internal-testing/tiny-random-OPTForCausalLM \ No newline at end of file +hf-internal-testing/tiny-random-OPTForCausalLM diff --git a/tests/python_tests/models/precommit b/tests/python_tests/models/precommit index 0b913d3b01..8adac460e0 100644 --- a/tests/python_tests/models/precommit +++ b/tests/python_tests/models/precommit @@ -1,3 +1,3 @@ hf-tiny-model-private/tiny-random-CodeGenForCausalLM hf-tiny-model-private/tiny-random-GPT2LMHeadModel -hf-tiny-model-private/tiny-random-OPTForCausalLM \ No newline at end of file +hf-tiny-model-private/tiny-random-OPTForCausalLM diff --git a/tests/python_tests/models/real_models b/tests/python_tests/models/real_models index 5fd8fe0500..a15878f63e 100644 --- a/tests/python_tests/models/real_models +++ b/tests/python_tests/models/real_models @@ -127,4 +127,4 @@ tiiuae/falcon-rw-7b togethercomputer/RedPajama-INCITE-Chat-3B-v1 # xverse/XVERSE-7B-Chat: Transformers - Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3 # xverse/XVERSE-MoE-A4.2B: Transformers - Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3 -Deci/DeciLM-7B \ No newline at end of file +Deci/DeciLM-7B diff --git a/tests/python_tests/test_generation_config.py b/tests/python_tests/test_generation_config.py index 0a42685b05..1d25180a04 100644 --- a/tests/python_tests/test_generation_config.py +++ b/tests/python_tests/test_generation_config.py @@ -64,7 +64,7 @@ def test_valid_configs(generation_config_kwargs): dict(eos_token_id=1), # 'stop_token_ids' does not contain 'eos_token_id' dict(eos_token_id=1, stop_token_ids={2}), # 'stop_token_ids' is not empty, but does not contain 'eos_token_id' dict(ignore_eos=True), # no 'max_new_tokens', no 'max_length' with 'ignore_eos' - dict(stop_token_ids={-1}), # value in 'stop_token_ids' must be non-negative + dict(stop_token_ids={-1}), # value in 'stop_token_ids' must be non-negative dict(max_new_tokens=0), # max new tokens cannot be empty (only when 'echo' is True) dict(max_new_tokens=10, min_new_tokens=20), # 'max_new_tokens' must be >= 'min_new_tokens' # penalties diff --git a/tests/python_tests/test_kv_cache_eviction.py b/tests/python_tests/test_kv_cache_eviction.py index 428047ea28..2ef168c374 100644 --- a/tests/python_tests/test_kv_cache_eviction.py +++ b/tests/python_tests/test_kv_cache_eviction.py @@ -97,7 +97,7 @@ class CacheOptTestStruct: @pytest.mark.parametrize("enable_prefix_caching", [True, False]) # prefix caching shouldn't impact similarity def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, test_struct, enable_prefix_caching): import whowhatbench - + seqs_per_request = 32 scheduler_config = get_scheduler_config(test_struct.num_kv_blocks) @@ -171,4 +171,3 @@ def get_beam_search_seq_len_300() -> GenerationConfig: @pytest.mark.precommit def test_dynamic_memory_allocation(tmp_path, params): run_cb_pipeline_with_ref(tmp_path, "facebook/opt-125m", scheduler_params=params[0], generation_config=params[1]) - diff --git a/tests/python_tests/test_llm_pipeline.py 
b/tests/python_tests/test_llm_pipeline.py index 5278f4424f..492f8742b9 100644 --- a/tests/python_tests/test_llm_pipeline.py +++ b/tests/python_tests/test_llm_pipeline.py @@ -475,5 +475,5 @@ def test_left_pad(): } models[2].pad_token = models[2].eos_token - + run_llm_pipeline_with_ref(model_id=models[0], prompts=prompts, generation_config=generation_config_dict, tmp_path=models[1]) diff --git a/tests/python_tests/test_tokenizer.py b/tests/python_tests/test_tokenizer.py index 8129298763..578b240826 100644 --- a/tests/python_tests/test_tokenizer.py +++ b/tests/python_tests/test_tokenizer.py @@ -34,14 +34,14 @@ def load_genai_tokenizer_with_configs(configs: List[Tuple], temp_path): def get_chat_templates(): - # Returns chat templates saved in tokenizer_configs.py, + # Returns chat templates saved in tokenizer_configs.py, # but skips some models that currently are not processed correctly. skipped_models = { # TODO: openchat/openchat_3.5 and berkeley-nest/Starling-LM-7B-alpha have the same template. # Need to enable and unskip, since it's preset in continuous batching and has >100 000 downloads. "openchat/openchat-3.5-0106", - + # These models fail even on HF so no need to check if applying chat matches. "vibhorag101/llama-2-13b-chat-hf-phr_mental_therapy", "codellama/CodeLlama-34b-Instruct-hf", @@ -80,7 +80,7 @@ def get_chat_templates(): "shenzhi-wang/Llama3-8B-Chinese-Chat", # AssertionError "nlpai-lab/KULLM3", "HuggingFaceH4/zephyr-7b-gemma-sft-v0.1", - "MediaTek-Research/Breeze-7B-Instruct-v0_1", + "MediaTek-Research/Breeze-7B-Instruct-v0_1", "shanchen/llama3-8B-slerp-biomed-chat-chinese", # AssertionError "MLP-KTLim/llama-3-Korean-Bllossom-8B", "aloobun/CosmicBun-8B", # Chat template is not supported by Jinja2Cpp @@ -362,4 +362,3 @@ def test_load_special_tokens_from_special_tokens_map_json_with_string_repr(model assert tok.get_bos_token_id() == token_str_int_map['bos_token'] if 'eos_token' in token_str_int_map: assert tok.get_eos_token_id() == token_str_int_map['eos_token'] - diff --git a/tools/cacheviz/__init__.py b/tools/cacheviz/__init__.py index 88b5a71df7..88d510f769 100644 --- a/tools/cacheviz/__init__.py +++ b/tools/cacheviz/__init__.py @@ -1,3 +1,2 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 - diff --git a/tools/cacheviz/cacheviz.py b/tools/cacheviz/cacheviz.py index b1ccf55098..2c752cc396 100644 --- a/tools/cacheviz/cacheviz.py +++ b/tools/cacheviz/cacheviz.py @@ -316,6 +316,3 @@ def on_press(event): if __name__ == "__main__": main() - - - diff --git a/tools/cacheviz/requirements.txt b/tools/cacheviz/requirements.txt index 9af70e35fa..e272c6488f 100644 --- a/tools/cacheviz/requirements.txt +++ b/tools/cacheviz/requirements.txt @@ -1,2 +1,2 @@ argparse -matplotlib \ No newline at end of file +matplotlib diff --git a/tools/continuous_batching/accuracy/continuous_batching_accuracy.cpp b/tools/continuous_batching/accuracy/continuous_batching_accuracy.cpp index 8139193779..c4b3f84417 100644 --- a/tools/continuous_batching/accuracy/continuous_batching_accuracy.cpp +++ b/tools/continuous_batching/accuracy/continuous_batching_accuracy.cpp @@ -120,7 +120,7 @@ int main(int argc, char* argv[]) try { std::cout << "Partial result:" << std::endl; print_generation_result(generation_result); } - break; + break; default: break; } diff --git a/tools/continuous_batching/accuracy/continuous_batching_speculative_decoding.cpp b/tools/continuous_batching/accuracy/continuous_batching_speculative_decoding.cpp index 7112030432..7f30ad6a9b 100644 --- 
a/tools/continuous_batching/accuracy/continuous_batching_speculative_decoding.cpp +++ b/tools/continuous_batching/accuracy/continuous_batching_speculative_decoding.cpp @@ -13,7 +13,7 @@ void print_cb_generation_result(const ov::genai::GenerationResult& generation_re } std::vector get_spec_decoding_generation_config_examples() { - + // sampling param for speulative decoding ov::genai::GenerationConfig generation_config_greedy_constant = ov::genai::greedy(); { @@ -105,7 +105,7 @@ int main(int argc, char* argv[]) try { scheduler_config.dynamic_split_fuse = dynamic_split_fuse; // vLLM specific params scheduler_config.max_num_seqs = 2; - + ov::genai::ContinuousBatchingPipeline pipe(models_path, scheduler_config, device, {ov::genai::draft_model(draft_models_path, device)}); std::vector generation_results = pipe.generate(prompts, generation_config); @@ -130,7 +130,7 @@ int main(int argc, char* argv[]) try { std::cout << "Partial result:" << std::endl; print_cb_generation_result(generation_result); } - break; + break; default: break; } diff --git a/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp b/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp index e0c50cda02..9f25dd68e9 100644 --- a/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp +++ b/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp @@ -146,7 +146,7 @@ class GenerationInfo { std::chrono::milliseconds cumulated_tpot; std::chrono::milliseconds mean_tpot; size_t num_output_tokens; - + std::chrono::steady_clock::time_point start_time; std::chrono::steady_clock::time_point last_read_time; @@ -269,7 +269,7 @@ class GenerationInfoCollector { for (GenerationInfo& generation_info : generations_info) { if (!generation_info.is_active()) continue; - + if (generation_info.is_finished()) { num_finished++; generation_info.set_inactive(); @@ -287,8 +287,8 @@ class GenerationInfoCollector { std::chrono::milliseconds mean_tpot = std::chrono::milliseconds::zero(); size_t total_input_len = 0; size_t total_output_len = 0; - - + + for (GenerationInfo& generation_info : generations_info){ auto generation_metrics = generation_info.get_metrics(); mean_ttft += generation_metrics.mean_ttft; @@ -304,7 +304,7 @@ class GenerationInfoCollector { std::cout << "Input throughput: " << total_input_len / total_duration.count() << " tokens / s" << std::endl; std::cout << "Output throughput: " << total_output_len / total_duration.count() << " tokens / s" << std::endl; std::cout << "Mean TTFT: " << mean_ttft.count() << " ms" << std::endl; - std::cout << "Mean TPOT: " << mean_tpot.count() << " ms" << std::endl; + std::cout << "Mean TPOT: " << mean_tpot.count() << " ms" << std::endl; } }; @@ -512,7 +512,7 @@ int main(int argc, char* argv[]) try { std::cout << "ERROR: Wrong json parameter in device_config." << std::endl; return EXIT_FAILURE; } - + // Benchmarking std::cout << "Loading models, creating pipelines, preparing environment..." 
<< std::endl; ov::genai::ContinuousBatchingPipeline pipe(models_path, scheduler_config, device, device_config_map); @@ -526,7 +526,7 @@ int main(int argc, char* argv[]) try { std::thread trafficSimulatorThread(trafficSimulator, &pipe, &dataset, request_rate, &generation_info_collector, is_speculative_decoding_enabled); trafficSimulatorThread.join(); } - + std::thread lmmEngineThread(llmEngineLoop, &pipe, &dataset, &finishGenerationThread); std::thread statisticsReporterThread(statisticsReporter, &generation_info_collector, num_prompts); if (request_rate != "inf") { diff --git a/tools/llm_bench/doc/NOTES.md b/tools/llm_bench/doc/NOTES.md index 8d84b4e8c8..96f936cd76 100644 --- a/tools/llm_bench/doc/NOTES.md +++ b/tools/llm_bench/doc/NOTES.md @@ -71,4 +71,4 @@ ConnectionError: Couldn't reach 'wikitext' on the Hub (SSLError) ``` root cause: The wikitext data set was not downloaded correctly, or the Hugging Face Hub network could not be connected normally.
Solution:
-Refer to https://huggingface.co/docs/datasets/loading#arrow , copy wikitext data set to ~/.cache/huggingface/datasets/ folder, set the environment variable HF_DATASETS_OFFLINE to 1.
\ No newline at end of file
+Refer to https://huggingface.co/docs/datasets/loading#arrow , copy wikitext data set to ~/.cache/huggingface/datasets/ folder, set the environment variable HF_DATASETS_OFFLINE to 1.
diff --git a/tools/llm_bench/doc/PROMPT.md b/tools/llm_bench/doc/PROMPT.md
index 5418bf0bb5..0613dd0073 100644
--- a/tools/llm_bench/doc/PROMPT.md
+++ b/tools/llm_bench/doc/PROMPT.md
@@ -41,4 +41,4 @@ Prompt file example:
 ## 5. Visual Language Models
 Supported parameters that can be set are:
 * `media` - imge file path
-* `prompt`- input text prompt
\ No newline at end of file
+* `prompt`- input text prompt
diff --git a/tools/llm_bench/llm_bench_utils/hook_beam_search.py b/tools/llm_bench/llm_bench_utils/hook_beam_search.py
index d933acc3a5..e078116e5a 100644
--- a/tools/llm_bench/llm_bench_utils/hook_beam_search.py
+++ b/tools/llm_bench/llm_bench_utils/hook_beam_search.py
@@ -509,4 +509,4 @@ def new_forward(self, model):
 
 def new_get_multimodal_embeddings(self, model):
     model._orig_get_multimodal_embeddings = model.get_multimodal_embeddings
-    model.get_multimodal_embeddings = types.MethodType(new_get_multimodal_embeddings, model)
\ No newline at end of file
+    model.get_multimodal_embeddings = types.MethodType(new_get_multimodal_embeddings, model)
diff --git a/tools/llm_bench/llm_bench_utils/hook_common.py b/tools/llm_bench/llm_bench_utils/hook_common.py
index c805680cee..062fe4d551 100644
--- a/tools/llm_bench/llm_bench_utils/hook_common.py
+++ b/tools/llm_bench/llm_bench_utils/hook_common.py
@@ -26,4 +26,4 @@ def get_bench_hook(num_beams, ov_model):
     else:
         log.warning(f'The minimum version of transformers to get 1st and 2nd tokens latency of {search_type} is: {min_version}')
         bench_hook = None
-    return bench_hook
\ No newline at end of file
+    return bench_hook
diff --git a/tools/llm_bench/llm_bench_utils/hook_greedy_search.py b/tools/llm_bench/llm_bench_utils/hook_greedy_search.py
index 9039a99e69..bef03edf2b 100644
--- a/tools/llm_bench/llm_bench_utils/hook_greedy_search.py
+++ b/tools/llm_bench/llm_bench_utils/hook_greedy_search.py
@@ -385,8 +385,8 @@ def new_forward(self, model):
     if trans_version >= version.parse('4.45.0'):
         model._sample = hook_sample_v45.new_sample.__get__(model, model.__class__)
     elif trans_version >= version.parse('4.43.0'):
-        model._sample = hook_sample_v43.new_sample.__get__(model, model.__class__) 
-    
+        model._sample = hook_sample_v43.new_sample.__get__(model, model.__class__)
+
 def new_get_multimodal_embeddings(self, model):
     model._orig_get_multimodal_embeddings = model.get_multimodal_embeddings
-    model.get_multimodal_embeddings = types.MethodType(new_get_multimodal_embeddings, model)
\ No newline at end of file
+    model.get_multimodal_embeddings = types.MethodType(new_get_multimodal_embeddings, model)
diff --git a/tools/llm_bench/llm_bench_utils/hook_sample.py b/tools/llm_bench/llm_bench_utils/hook_sample.py
index 22111c1a3f..dd7759a81d 100644
--- a/tools/llm_bench/llm_bench_utils/hook_sample.py
+++ b/tools/llm_bench/llm_bench_utils/hook_sample.py
@@ -226,4 +226,4 @@ def new_sample(
                 past_key_values=model_kwargs.get("past_key_values"),
             )
     else:
-        return input_ids
\ No newline at end of file
+        return input_ids
diff --git a/tools/llm_bench/llm_bench_utils/hook_sample_v43.py b/tools/llm_bench/llm_bench_utils/hook_sample_v43.py
index 7dce578dac..d8a7fa11f1 100644
--- a/tools/llm_bench/llm_bench_utils/hook_sample_v43.py
+++ b/tools/llm_bench/llm_bench_utils/hook_sample_v43.py
@@ -230,4 +230,4 @@ def new_sample(
                 past_key_values=model_kwargs.get("past_key_values"),
             )
     else:
-        return input_ids
\ No newline at end of file
+        return input_ids
diff --git a/tools/llm_bench/llm_bench_utils/hook_sample_v45.py b/tools/llm_bench/llm_bench_utils/hook_sample_v45.py
index 1644c63a4f..e20b34e98d 100644
--- a/tools/llm_bench/llm_bench_utils/hook_sample_v45.py
+++ b/tools/llm_bench/llm_bench_utils/hook_sample_v45.py
@@ -222,4 +222,4 @@ def new_sample(
                 past_key_values=model_kwargs.get("past_key_values"),
            )
     else:
-        return input_ids
\ No newline at end of file
+        return input_ids
diff --git a/tools/llm_bench/llm_bench_utils/ov_model_classes.py b/tools/llm_bench/llm_bench_utils/ov_model_classes.py
index 0ade0f1299..6e9f6743f2 100644
--- a/tools/llm_bench/llm_bench_utils/ov_model_classes.py
+++ b/tools/llm_bench/llm_bench_utils/ov_model_classes.py
@@ -78,7 +78,7 @@ def forward(
             past_key_values = tuple(
                 past_key_value for pkv_per_layer in past_key_values for past_key_value in pkv_per_layer
             )
-            
+
             # Add the past_key_values to the decoder inputs
             inputs = dict(zip(self.key_value_input_names, past_key_values))
 
@@ -440,7 +440,7 @@ def forward(
         past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         **kwargs,
     ) -> CausalLMOutputWithPast:
-        
+
         if not self.is_v1:
             return super().forward(input_ids=input_ids, attention_mask=attention_mask, past_key_values=past_key_values, **kwargs)
         self.compile()
diff --git a/tools/llm_bench/prompts/llama-2-7b-chat_l.jsonl b/tools/llm_bench/prompts/llama-2-7b-chat_l.jsonl
index 4bf82b10b6..3a08db69dd 100644
--- a/tools/llm_bench/prompts/llama-2-7b-chat_l.jsonl
+++ b/tools/llm_bench/prompts/llama-2-7b-chat_l.jsonl
@@ -1 +1 @@
-{"prompt": "[INST] <> A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. <> You will act as a Christian, and fully summarize following text:\nSometimes it's nice to take a minute in the pew by yourself beforehand. You have this beautiful church probably almost all to yourself. Can you feel its energy resonating through you? Can you feel the majesty of the Lord's kingdom and how you're a part of it? Take a moment to kneel and pray with your head down and hands clasped together. Reflect on your faith and how you feel currently. Think about how you've been responding to God's call and how you've been living in the light of his love. When the priest is ready for you, of course. You'll probably see him there by his lonesome or someone else walk out just before you. Sit down either across from him or behind the screen -- it's totally up to you whether or not you prefer to remain anonymous. He won't treat you any differently either way. Make the sign of the cross upon his prompt, saying, \"Bless me, Father, for I have sinned. It has been 10 years since my last confession.\" This is your standard, traditional phrasing. However, if you just sit down and say hello, that's fine, too. The priest knows what he's doing. The Byzantine Rite is a bit different. The priest may sit to your side and put his epitrachelion on your head. He may then also do the Prayer of Absolution. But the idea remains the exact same -- just go wherever he takes you. Once you sit down and you've made the sign of the cross, just sit back and follow the priest's lead. He'll ask you how long it's been since your last confession (if you don't voluntarily offer that information), how you are feeling, maybe how your faith is going, and then ask you what sins you would like to talk about with him and God. It's just a casual conversation! Do not fret. There is absolutely zero pressure on your part. Again, as long as you come there with the intention of leaving with a clean heart, you're more than welcome in the church. There is no wrong way to go about confession! This part is intimidating, but think about it this way: the priest you're talking to has probably heard just about everything before. Whatever you have to say will not blow his mind. So when he asks, start rattling them off, from the most serious to the least. If he asks any questions, answer them, but do not feel the need to go into detail. A simple, \"I did so and so,\" will suffice. Your priest is going to be very understanding. If you don't remember the exact timeframe, that's fine. If you don't remember your motivation, that's fine. All your priest cares about is that you're being as honest as possible and that your heart is in the right place. He'll talk you through everything, possibly asking about your intentions, but mainly just letting you know that God loves you, sin and all. If he has any ideas to bring you closer to God, he may suggest them at this juncture. He's there to help, after all. He will then ask you to make an Act of Contrition. That goes like this: My God, I am sorry for my sins with all my heart.In choosing to do wrong and failing to do good,I have sinned against You whom I should loveabove all things. I firmly intend, with your help,to do penance, to sin no more, andto avoid whatever leads me to sin.Our Savior Jesus Christ suffered and died for us.In his name, my God, have mercy.If you are a Roman Catholic, your act of contrition will go like this: Oh my God, I am very sorry for having offended thee. But most of all, because they offend you, my God, who is all good and deserving of all my love. I firmly resolve with the help of thy grace, to sin no more, and to avoid the near occasion of sin. Amen. Don't worry! It won't be anything huge. Take the absolution to heart -- you now have a brand new, clean slate to work with. \"Penance\" is your expression of regret and repentance, showing God that you're truly sorry and that you wish for nothing more than to be forgiven. Thanks. [/INST]"}
\ No newline at end of file
+{"prompt": "[INST] <> A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. <> You will act as a Christian, and fully summarize following text:\nSometimes it's nice to take a minute in the pew by yourself beforehand. You have this beautiful church probably almost all to yourself. Can you feel its energy resonating through you? Can you feel the majesty of the Lord's kingdom and how you're a part of it? Take a moment to kneel and pray with your head down and hands clasped together. Reflect on your faith and how you feel currently. Think about how you've been responding to God's call and how you've been living in the light of his love. When the priest is ready for you, of course. You'll probably see him there by his lonesome or someone else walk out just before you. Sit down either across from him or behind the screen -- it's totally up to you whether or not you prefer to remain anonymous. He won't treat you any differently either way. Make the sign of the cross upon his prompt, saying, \"Bless me, Father, for I have sinned. It has been 10 years since my last confession.\" This is your standard, traditional phrasing. However, if you just sit down and say hello, that's fine, too. The priest knows what he's doing. The Byzantine Rite is a bit different. The priest may sit to your side and put his epitrachelion on your head. He may then also do the Prayer of Absolution. But the idea remains the exact same -- just go wherever he takes you. Once you sit down and you've made the sign of the cross, just sit back and follow the priest's lead. He'll ask you how long it's been since your last confession (if you don't voluntarily offer that information), how you are feeling, maybe how your faith is going, and then ask you what sins you would like to talk about with him and God. It's just a casual conversation! Do not fret. There is absolutely zero pressure on your part. Again, as long as you come there with the intention of leaving with a clean heart, you're more than welcome in the church. There is no wrong way to go about confession! This part is intimidating, but think about it this way: the priest you're talking to has probably heard just about everything before. Whatever you have to say will not blow his mind. So when he asks, start rattling them off, from the most serious to the least. If he asks any questions, answer them, but do not feel the need to go into detail. A simple, \"I did so and so,\" will suffice. Your priest is going to be very understanding. If you don't remember the exact timeframe, that's fine. If you don't remember your motivation, that's fine. All your priest cares about is that you're being as honest as possible and that your heart is in the right place. He'll talk you through everything, possibly asking about your intentions, but mainly just letting you know that God loves you, sin and all. If he has any ideas to bring you closer to God, he may suggest them at this juncture. He's there to help, after all. He will then ask you to make an Act of Contrition. That goes like this: My God, I am sorry for my sins with all my heart.In choosing to do wrong and failing to do good,I have sinned against You whom I should loveabove all things. I firmly intend, with your help,to do penance, to sin no more, andto avoid whatever leads me to sin.Our Savior Jesus Christ suffered and died for us.In his name, my God, have mercy.If you are a Roman Catholic, your act of contrition will go like this: Oh my God, I am very sorry for having offended thee. But most of all, because they offend you, my God, who is all good and deserving of all my love. I firmly resolve with the help of thy grace, to sin no more, and to avoid the near occasion of sin. Amen. Don't worry! It won't be anything huge. Take the absolution to heart -- you now have a brand new, clean slate to work with. \"Penance\" is your expression of regret and repentance, showing God that you're truly sorry and that you wish for nothing more than to be forgiven. Thanks. [/INST]"}
diff --git a/tools/llm_bench/requirements.txt b/tools/llm_bench/requirements.txt
index 6bf8d8cddf..48e64804dc 100644
--- a/tools/llm_bench/requirements.txt
+++ b/tools/llm_bench/requirements.txt
@@ -10,7 +10,7 @@ pillow
 torch
 transformers>=4.40.0
 diffusers>=0.22.0
-#optimum is in dependency list of optimum-intel 
+#optimum is in dependency list of optimum-intel
 git+https://github.com/huggingface/optimum-intel.git@main#egg=optimum-intel
 git+https://github.com/openvinotoolkit/nncf.git@develop#egg=nncf
 packaging