diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 00a88bc4fb..e1ea686d4b 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -58,7 +58,7 @@ jobs: PYTHONPATH: "./build" - run: > . ./ov/setupvars.sh - && timeout 25s ./samples/python/multinomial_causal_lm/multinomial_causal_lm.py ./open_llama_3b_v2/ b + && timeout 25s ./samples/python/text_generation/multinomial_causal_lm.py ./open_llama_3b_v2/ b env: PYTHONPATH: "./build" - run: > @@ -79,7 +79,7 @@ jobs: executable: [ ./build/samples/cpp/text_generation/beam_search_causal_lm, - python ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py, + python ./samples/python/text_generation/beam_search_causal_lm.py, ] runs-on: ubuntu-20.04 defaults: @@ -339,7 +339,7 @@ jobs: - run: > . ./ov/setupvars.sh && timeout 50s ./build/samples/cpp/text_generation/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好!" - | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./Qwen1.5-7B-Chat/ "你好!") - + | diff <(timeout 50s ./samples/python/text_generation/beam_search_causal_lm.py ./Qwen1.5-7B-Chat/ "你好!") - env: PYTHONPATH: "./build" @@ -374,7 +374,7 @@ jobs: - run: > . ./ov/setupvars.sh && timeout 50s ./build/samples/cpp/text_generation/beam_search_causal_lm ./phi-2/ 69 - | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./phi-2/ 69) - + | diff <(timeout 50s ./samples/python/text_generation/beam_search_causal_lm.py ./phi-2/ 69) - env: PYTHONPATH: "./build" @@ -409,7 +409,7 @@ jobs: - run: > . 
./ov/setupvars.sh && timeout 50s ./build/samples/cpp/text_generation/beam_search_causal_lm ./notus-7b-v1/ 69 - | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./notus-7b-v1/ 69) - + | diff <(timeout 50s ./samples/python/text_generation/beam_search_causal_lm.py ./notus-7b-v1/ 69) - env: PYTHONPATH: "./build" @@ -447,7 +447,7 @@ jobs: source ./ov/setupvars.sh ./build/samples/cpp/text_generation/speculative_decoding_lm ./dolly-v2-7b/ ./dolly-v2-3b/ "Alan Turing was a" > predictions_speculative.txt ./build/samples/cpp/text_generation/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt - python ./samples/python/speculative_decoding_lm/speculative_decoding_lm.py ./dolly-v2-7b/ ./dolly-v2-3b/ "Alan Turing was a" > predictions_py.txt + python ./samples/python/text_generation/speculative_decoding_lm.py ./dolly-v2-7b/ ./dolly-v2-3b/ "Alan Turing was a" > predictions_py.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() @@ -504,7 +504,7 @@ jobs: ./build/samples/cpp/text_generation/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_prompt_lookup.txt ./build/samples/cpp/text_generation/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_greedy.txt - python ./samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_py.txt + python ./samples/python/text_generation/prompt_lookup_decoding_lm.py ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_py.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() @@ -693,7 +693,7 @@ jobs: " diff pred.txt ref.txt echo "Chat sample cpp" passed - timeout 30s ./samples/python/chat_sample/chat_sample.py ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred2.txt + timeout 30s ./samples/python/text_generation/chat_sample.py ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred2.txt diff pred2.txt ref.txt echo "Chat sample python" passed 
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 8cfc99847b..d32eb832a6 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -22,13 +22,8 @@ install(DIRECTORY DESTINATION samples/cpp COMPONENT cpp_samples_genai) install(DIRECTORY - python/beam_search_causal_lm - python/benchmark_genai - python/chat_sample python/text_generation python/image_generation - python/multinomial_causal_lm - python/speculative_decoding_lm python/visual_language_chat python/whisper_speech_recognition DESTINATION samples/python COMPONENT cpp_samples_genai diff --git a/samples/python/beam_search_causal_lm/README.md b/samples/python/beam_search_causal_lm/README.md deleted file mode 100644 index fac6a26e8e..0000000000 --- a/samples/python/beam_search_causal_lm/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# Text generation Python sample that supports most popular models like LLaMA 3 - -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. It's only possible to change the device for inference to a different one, GPU for example, from the command line interface. The sample fearures `openvino_genai.LLMPipeline` and configures it to use multiple beam grops. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. 
- -```sh -pip install --upgrade-strategy eager -r ../../export-requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - -## Run - -Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: - -`python beam_search_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` - - -Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. - -See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. - -### Troubleshooting - -#### Unicode characters encoding error on Windows - -Example error: -``` -UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to -``` - -If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: -1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. -2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/python/benchmark_genai/README.md b/samples/python/benchmark_genai/README.md deleted file mode 100644 index 95f24b6eca..0000000000 --- a/samples/python/benchmark_genai/README.md +++ /dev/null @@ -1,50 +0,0 @@ -# LLMs benchmarking sample - -This sample script demonstrates how to benchmark an LLMs in OpenVINO GenAI. 
The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. - -```sh -pip install --upgrade-strategy eager -r ../../export-requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - - -## Usage - -Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: - -```sh -python benchmark_genai.py [OPTIONS] -``` - -### Options - -- `-m, --model`: Path to the model and tokenizers base directory. -- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text. -- `-nw, --num_warmup` (default: `1`): Number of warmup iterations. -- `-n, --num_iter` (default: `3`): Number of iterations. -- `-mt, --max_new_tokens` (default: `20`): Number of warmup iterations. -- `-d, --device` (default: `"CPU"`): Device to run the model on. - -### Output: - -``` -python benchmark_genai.py -m TinyLlama-1.1B-Chat-v1.0 -n 10 -``` - -``` -Load time: 3405.69 ms -Generate time: 1430.77 ± 3.04 ms -Tokenization time: 0.51 ± 0.02 ms -Detokenization time: 0.37 ± 0.01 ms -TTFT: 81.60 ± 0.54 ms -TPOT: 71.52 ± 2.72 ms -Throughput tokens/s: 13.98 ± 0.53 -``` - -For more information on how performance metrics are calculated, see [performance metrics readme](../../../src/README.md#performance-metrics). 
diff --git a/samples/python/chat_sample/README.md b/samples/python/chat_sample/README.md deleted file mode 100644 index 7e3c206431..0000000000 --- a/samples/python/chat_sample/README.md +++ /dev/null @@ -1,46 +0,0 @@ -# Python chat_sample that supports most popular models like LLaMA 3 - -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `openvino_genai.LLMPipeline` and configures it for the chat scenario. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. - -```sh -pip install --upgrade-strategy eager -r ../../export-requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - -## Run: - -Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: - -`python chat_sample.py TinyLlama-1.1B-Chat-v1.0` - - -Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. 
- -See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. - -### Troubleshooting - -#### Unicode characters encoding error on Windows - -Example error: -``` -UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to -``` - -If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: -1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. -2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. - -#### Missing chat template - -If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model. 
-The following template can be used as a default, but it may not work properly with every model: -``` -"chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n<|im_start|>assistant\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>\n'}}{% endif %}{% endfor %}", -``` diff --git a/samples/python/multinomial_causal_lm/README.md b/samples/python/multinomial_causal_lm/README.md deleted file mode 100644 index c1afc08a8d..0000000000 --- a/samples/python/multinomial_causal_lm/README.md +++ /dev/null @@ -1,48 +0,0 @@ -# Text generation Python multinomial_causal_lm that supports most popular models like LLaMA 3 - -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `ov::genai::LLMPipeline` and configures it to run random sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. - -This sample also contains example implementation of an iterable streamer with bufferisation. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. 
- -```sh -pip install --upgrade-strategy eager -r ../../export-requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - -## Run - -Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: - -`python multinomial_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` - - -Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. - -See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. - -## Streaming - -This Python example demonstrates custom detokenization with bufferization. The streamer receives integer tokens corresponding to each word or subword, one by one. If tokens are decoded individually, the resulting text misses necessary spaces because of detokenize(tokenize(" a")) == "a". - -To address this, the detokenizer needs a larger context. We accumulate tokens in a tokens_cache buffer and decode multiple tokens together, adding the text to the streaming queue only when a complete decoded chunk is ready. We run a separate thread to print all new elements arriving in this queue from the generation pipeline. Each generated chunk of text is put into a synchronized queue, ensuring that all put and get operations are thread-safe and blocked until they can proceed. - -At the same time, in order to optimize the performance in streaming mode, we provide the Chuck Streaming. Chunk streaming has significant benefits to very small LLM for streaming generate token rate improvement. It does sampling once after several token generation. 
We can use the tokens_len parameter to control the number of tokens in the token_cache before sampling. - -### Troubleshooting - -#### Unicode characters encoding error on Windows - -Example error: -``` -UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to -``` - -If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: -1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. -2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/python/prompt_lookup_decoding_lm/README.md b/samples/python/prompt_lookup_decoding_lm/README.md deleted file mode 100644 index 1e5f4003d4..0000000000 --- a/samples/python/prompt_lookup_decoding_lm/README.md +++ /dev/null @@ -1,41 +0,0 @@ -# prompt_lookup_decoding_lm Python sample that supports most popular models like LLaMA 3 - -[Prompt Lookup decoding](https://github.com/apoorvumang/prompt-lookup-decoding) is [assested-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) technique where the draft model is replaced with simple string matching the prompt to generate candidate token sequences. This method highly effective for input grounded generation (summarization, document QA, multi-turn chat, code editing), where there is high n-gram overlap between LLM input (prompt) and LLM output. This could be entity names, phrases, or code chunks that the LLM directly copies from the input while generating the output. Prompt lookup exploits this pattern to speed up autoregressive decoding in LLMs. This results in significant speedups with no effect on output quality. 
- -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. Loading `openvino_tokenizers` to `ov::Core` enables tokenization. Run `optimum-cli` to generate IRs for the samples. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -It's not required to install [../../export-requirements.txt](../../export requirements.txt) for deployment if the model has already been exported. - -```sh -source /setupvars.sh -pip install --upgrade-strategy eager -r ../../requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - -## Run - -Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: - -`python prompt_lookup_decoding_lm.py ./TinyLlama-1.1B-Chat-v1.0/ "return 0;"` - - -Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. - -See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. 
- -### Troubleshooting - -#### Unicode characters encoding error on Windows - -Example error: -``` -UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to -``` - -If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: -1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. -2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/python/speculative_decoding_lm/README.md b/samples/python/speculative_decoding_lm/README.md deleted file mode 100644 index 7d2656c0a3..0000000000 --- a/samples/python/speculative_decoding_lm/README.md +++ /dev/null @@ -1,50 +0,0 @@ -# speculative_decoding_lm Python sample that supports most popular models like LLaMA 3 and other - -Speculative decoding (or [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) in HF terminology) is a recent technique, that allows to speed up token generation when an additional smaller draft model is used alongside with the main model. - -Speculative decoding works the following way. The draft model predicts the next K tokens one by one in an autoregressive manner, while the main model validates these predictions and corrects them if necessary. We go through each predicted token, and if a difference is detected between the draft and main model, we stop and keep the last token predicted by the main model. Then the draft model gets the latest main prediction and again tries to predict the next K tokens, repeating the cycle. - -This approach reduces the need for multiple infer requests to the main model, enhancing performance. 
For instance, in more predictable parts of text generation, the draft model can, in best-case scenarios, generate the next K tokens that exactly match the target. In that case they are validated in a single inference request to the main model (which is bigger, more accurate but slower) instead of running K subsequent requests. More details can be found in the original paper https://arxiv.org/pdf/2211.17192.pdf, https://arxiv.org/pdf/2302.01318.pdf - -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. Run `optimum-cli` to generate IRs for the samples. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. - -Download assisting and main model to run speculative decoding sample. - -```sh -pip install --upgrade-strategy eager -r ../../export-requirements.txt -optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b -optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b -``` - -## Run - -Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: - -`python speculative_decoding_lm.py ./dolly-v2-7b ./dolly-v2-3b "Why is the Sun yellow?"` - - -Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. 
- -See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. - - -> *_NOTE:_* User can run speculative decoding on different devices. Please, specify `device` in `LLMPipeline` constructor to run main model and `device` for `draft_model` in the constructor. - -### Troubleshooting - -#### Unicode characters encoding error on Windows - -Example error: -``` -UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to -``` - -If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: -1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. -2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/python/text_generation/README.md b/samples/python/text_generation/README.md index a634e21cb0..132dfc27f4 100644 --- a/samples/python/text_generation/README.md +++ b/samples/python/text_generation/README.md @@ -1,48 +1,129 @@ -# Text generation Python greedy_causal_lm that supports most popular models like LLaMA 3 +# OpenVINO GenAI Text Generation Python Samples -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `openvino_genai.LLMPipeline` and configures it to run the simplest deterministic greedy sampling algorithm. 
There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. +These samples showcase the use of OpenVINO's inference capabilities for text generation tasks, including different decoding strategies such as beam search, multinomial sampling, and speculative decoding. Each sample has a specific focus and demonstrates a unique aspect of text generation. +The applications don't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. +There are also Jupyter notebooks for some samples. You can find links to them in the appropriate sample descriptions. -There are two sample files: - - [`greedy_causal_lm.py`](./greedy_causal_lm.py) demonstrates basic usage of the LLM pipeline - - [`lora.py`](./lora.py) shows how to apply LoRA adapters to the pipeline +## Table of Contents +1. [Download and Convert the Model and Tokenizers](#download-and-convert-the-model-and-tokenizers) +2. [Sample Descriptions](#sample-descriptions) +3. [Troubleshooting](#troubleshooting) +4. [Support and Contribution](#support-and-contribution) ## Download and convert the model and tokenizers The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. -Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. +It's not required to install [../../export-requirements.txt](../../export-requirements.txt) for deployment if the model has already been exported. 
```sh -pip install --upgrade-strategy eager -r ../../export-requirements.txt +pip install --upgrade-strategy eager -r ../../export-requirements.txt optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 ``` -## Run - -Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: - -`python greedy_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` +Model examples to use for different samples: +- chat_sample - meta-llama/Llama-2-7b-chat-hf +- speculative_decoding_lm - meta-llama/Llama-2-13b-hf as main model and TinyLlama/TinyLlama-1.1B-Chat-v1.0 as draft model +- other samples - meta-llama/Llama-2-7b-hf +## Sample Descriptions +### Common information +Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to get common information about OpenVINO samples. Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. -## Run with optional LoRA adapters - -LoRA adapters can be connected to the pipeline and modify generated text. Adapters are supported in Safetensors format and can be downloaded from public sources like [Civitai](https://civitai.com) or [HuggingFace](https://huggingface.co/models) or trained by the user. Adapters compatible with a base model should be used only. A weighted blend of multiple adapters can be applied by specifying multiple adapter files with corresponding alpha parameters in command line. 
Check `lora.py` source code to learn how to enable adapters and specify them in each `generate` call. - -Here is an example how to run the sample with a single adapter. First download adapter file from TODO page manually and save it as TODO. Or download it from command line: - -#TODO command to download adapter - -Then run `lora.py`: - -#TODO command to run lora.py with adapter - -### Troubleshooting +### 1. Greedy Causal LM (`greedy_causal_lm`) +- **Description:** +Basic text generation using a causal language model. +Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-question-answering) that provides an example of LLM-powered text generation in Python. +- **Main Feature:** Demonstrates simple text continuation. +- **Run Command:** + ```bash + python greedy_causal_lm.py [-h] model_dir prompt + ``` + +### 2. Beam Search Causal LM (`beam_search_causal_lm`) +- **Description:** +Uses beam search for more coherent text generation. +Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-question-answering) that provides an example of LLM-powered text generation in Python. +- **Main Feature:** Improves text quality with beam search. +- **Run Command:** + ```bash + python beam_search_causal_lm.py model_dir prompts [prompts ...] + ``` + +### 3. Chat Sample (`chat_sample`) +- **Description:** +Interactive chat interface powered by OpenVINO. +Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) that provides an example of an LLM-powered chatbot in Python. +- **Main Feature:** Real-time chat-like text generation. +- **Run Command:** + ```bash + python chat_sample.py model_dir + ``` +#### Missing chat template +If you encounter an exception indicating a missing "chat template" when launching the `openvino_genai.LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. 
To work around this, manually add the chat template to tokenizer_config.json of your model. +The following template can be used as a default, but it may not work properly with every model: +``` +"chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n<|im_start|>assistant\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>\n'}}{% endif %}{% endfor %}", +``` -#### Unicode characters encoding error on Windows +### 4. Multinomial Causal LM (`multinomial_causal_lm`) +- **Description:** Text generation with multinomial sampling for diversity. +- **Main Feature:** Introduces randomness for creative outputs. +- **Run Command:** + ```bash + python multinomial_causal_lm.py model_dir prompt + ``` + +### 5. Prompt Lookup Decoding LM (`prompt_lookup_decoding_lm`) +- **Description:** +[Prompt Lookup decoding](https://github.com/apoorvumang/prompt-lookup-decoding) is an [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) technique where the draft model is replaced with simple string matching in the prompt to generate candidate token sequences. This method is highly effective for input-grounded generation (summarization, document QA, multi-turn chat, code editing), where there is high n-gram overlap between LLM input (prompt) and LLM output. This could be entity names, phrases, or code chunks that the LLM directly copies from the input while generating the output. Prompt lookup exploits this pattern to speed up autoregressive decoding in LLMs. This results in significant speedups with no effect on output quality. +- **Main Feature:** Specialized prompt-based inference. +- **Run Command:** + ```bash + python prompt_lookup_decoding_lm.py model_dir prompt + ``` + +### 6. 
Speculative Decoding LM (`speculative_decoding_lm`) +- **Description:** +Speculative decoding (or [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) in HF terminology) is a recent technique that speeds up token generation when an additional smaller draft model is used alongside the main model. + +Speculative decoding works the following way. The draft model predicts the next K tokens one by one in an autoregressive manner, while the main model validates these predictions and corrects them if necessary. We go through each predicted token, and if a difference is detected between the draft and main model, we stop and keep the last token predicted by the main model. Then the draft model gets the latest main prediction and again tries to predict the next K tokens, repeating the cycle. + +This approach reduces the need for multiple inference requests to the main model, enhancing performance. For instance, in more predictable parts of text generation, the draft model can, in best-case scenarios, generate the next K tokens that exactly match the target. In that case they are validated in a single inference request to the main model (which is bigger, more accurate but slower) instead of running K subsequent requests. More details can be found in the original papers: https://arxiv.org/pdf/2211.17192.pdf, https://arxiv.org/pdf/2302.01318.pdf + +Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/speculative-sampling) that provides an example of LLM-powered text generation in Python. +- **Main Feature:** Reduces latency while generating high-quality text. +- **Run Command:** + ```bash + python speculative_decoding_lm.py model_dir draft_model_dir prompt + ``` + +### 7. LLMs benchmarking sample (`benchmark_genai`) +- **Description:** +This sample script demonstrates how to benchmark LLMs in OpenVINO GenAI. 
The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. + +For more information on how performance metrics are calculated, see the [performance-metrics tutorial](../../../src/README.md#performance-metrics). +- **Main Feature:** Benchmark model via GenAI +- **Run Command:** + ```bash + python benchmark_genai.py [-m MODEL] [-p PROMPT] [-nw NUM_WARMUP] [-n NUM_ITER] [-mt MAX_NEW_TOKENS] [-d DEVICE] + ``` + #### Options +- `-m, --model`: Path to the model and tokenizers base directory. +- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text. +- `-nw, --num_warmup` (default: `1`): Number of warmup iterations. +- `-mt, --max_new_tokens` (default: `20`): Maximal number of new tokens. +- `-n, --num_iter` (default: `3`): Number of iterations. +- `-d, --device` (default: `"CPU"`): Device to run the model on. + + +## Troubleshooting + +### Unicode characters encoding error on Windows Example error: ``` @@ -52,3 +133,7 @@ UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: 1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. 2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. + +## Support and Contribution +- For troubleshooting, consult the [OpenVINO documentation](https://docs.openvino.ai). +- To report issues or contribute, visit the [GitHub repository](https://github.com/openvinotoolkit/openvino.genai). 
diff --git a/samples/python/beam_search_causal_lm/beam_search_causal_lm.py b/samples/python/text_generation/beam_search_causal_lm.py similarity index 100% rename from samples/python/beam_search_causal_lm/beam_search_causal_lm.py rename to samples/python/text_generation/beam_search_causal_lm.py diff --git a/samples/python/benchmark_genai/benchmark_genai.py b/samples/python/text_generation/benchmark_genai.py similarity index 100% rename from samples/python/benchmark_genai/benchmark_genai.py rename to samples/python/text_generation/benchmark_genai.py diff --git a/samples/python/chat_sample/chat_sample.py b/samples/python/text_generation/chat_sample.py similarity index 100% rename from samples/python/chat_sample/chat_sample.py rename to samples/python/text_generation/chat_sample.py diff --git a/samples/python/multinomial_causal_lm/multinomial_causal_lm.py b/samples/python/text_generation/multinomial_causal_lm.py similarity index 100% rename from samples/python/multinomial_causal_lm/multinomial_causal_lm.py rename to samples/python/text_generation/multinomial_causal_lm.py diff --git a/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py b/samples/python/text_generation/prompt_lookup_decoding_lm.py similarity index 100% rename from samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py rename to samples/python/text_generation/prompt_lookup_decoding_lm.py diff --git a/samples/python/speculative_decoding_lm/speculative_decoding_lm.py b/samples/python/text_generation/speculative_decoding_lm.py similarity index 100% rename from samples/python/speculative_decoding_lm/speculative_decoding_lm.py rename to samples/python/text_generation/speculative_decoding_lm.py diff --git a/src/README.md b/src/README.md index d8c15b1c77..028277d4db 100644 --- a/src/README.md +++ b/src/README.md @@ -394,7 +394,7 @@ durations = np.array(raw_metrics.m_new_token_times[1:]) - np.array(raw_metrics.m print(f'Median from token to token duration: 
{np.median(durations):.2f} ms') ``` -For more examples of how metrics are used, please refer to the Python [benchmark_genai.py](../samples/python/benchmark_genai/README.md) and C++ [benchmark_genai](../samples/cpp/text_generation/README.md) samples. +For more examples of how metrics are used, please refer to the Python [benchmark_genai.py](../samples/python/text_generation/README.md) and C++ [benchmark_genai](../samples/cpp/text_generation/README.md) samples. ## How It Works