Add code quality checks to GitHub Actions #1453

Open: wants to merge 1 commit into master
12 changes: 12 additions & 0 deletions .github/workflows/causal_lm_cpp.yml
@@ -21,6 +21,18 @@ env:
m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17726-9ab2c1a18e7/m_openvino_toolkit_macos_12_6_2025.0.0.dev20241230_x86_64.tgz
w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17726-9ab2c1a18e7/w_openvino_toolkit_windows_2025.0.0.dev20241230_x86_64.zip
jobs:
  code-quality-checks:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v4
        with:
          python-version: 3.9
      - name: Install pre-commit
        run: pip install pre-commit
      - name: Run pre-commit (checks for trailing whitespace and non-ASCII symbols in filenames and file content)
        run: pre-commit run --all-files --show-diff-on-failure

  cpp-multinomial-greedy_causal_lm-ubuntu:
    runs-on: ubuntu-20.04-8-cores
    defaults:
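The job above simply runs pre-commit against the whole repository. As a rough sketch (not part of the diff), the same check can be reproduced locally before pushing:

```sh
# Install pre-commit and run every configured hook over the entire repository,
# mirroring what the code-quality-checks job does in CI.
pip install pre-commit
pre-commit run --all-files --show-diff-on-failure
```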
20 changes: 20 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,20 @@
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.0.1
    hooks:
      - id: trailing-whitespace # checks for files with trailing whitespace, excluding .md and Git-related hidden files
        exclude: '\.md$|.*\.git.*'
      - id: check-merge-conflict # checks for files that contain merge conflict strings (such as <<<<<<<, =======, and >>>>>>>)
      - id: check-json # ensures that JSON files are syntactically correct
      - id: end-of-file-fixer # ensures that each file ends with exactly one trailing newline, excluding Git-related hidden files
        exclude: '.*\.git.*'
  - repo: local
    hooks:
      - id: forbid-non-ascii-filenames # runs the script that prohibits non-ASCII characters in file names
        name: Prohibit non-ASCII characters in file names
        entry: ./pre_commit_scripts/check_non_ascii_filenames.sh
        language: script
      - id: forbid-non-ascii-in-files # checks for non-ASCII characters in files (excluding Markdown and hidden files), with characters ± and ? allowed
        name: Check for non-ASCII characters in files (excluding Markdown and hidden files), with characters ± and ? allowed
        entry: ./pre_commit_scripts/check_non_ascii_in_files.sh
        language: script
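For local development, the same configuration can also be registered as a Git hook so the checks run automatically on each commit. A minimal usage sketch (hook ids taken from the config above, everything else assumed):

```sh
# Register the hooks from .pre-commit-config.yaml as a Git pre-commit hook.
pre-commit install

# Run a single hook on demand, e.g. only the trailing-whitespace check.
pre-commit run trailing-whitespace --all-files
```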
20 changes: 20 additions & 0 deletions pre_commit_scripts/check_non_ascii_filenames.sh
@@ -0,0 +1,20 @@
#!/bin/bash

# Hash of the empty tree, used as the baseline to diff against:
empty_tree=$(git hash-object -t tree /dev/null)

# Get the list of added files whose names contain non-ASCII characters:
problem_files=$(git diff --name-only --diff-filter=A -z "$empty_tree" | LC_ALL=C grep -P "[^\x00-\x7F]")

# Count the number of problematic files:
count=$(echo "$problem_files" | wc -w)

# Report the result:
if [ "$count" -ne 0 ]; then
    echo "Error: Non-ASCII characters found in filenames of new files:"
    echo "$problem_files"
    exit 1
else
    echo "Success: No non-ASCII filenames found."
fi
exit 0
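As an illustration of what this hook guards against (not part of the PR), a similar pattern can be applied to every tracked file rather than only newly added ones:

```sh
# List every tracked file whose path contains a byte outside the ASCII range.
# LC_ALL=C keeps the byte-range pattern locale-independent (GNU grep assumed).
git ls-files | LC_ALL=C grep -P "[^\x00-\x7F]" || echo "No non-ASCII filenames found."
```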
17 changes: 17 additions & 0 deletions pre_commit_scripts/check_non_ascii_in_files.sh
@@ -0,0 +1,17 @@
#!/bin/bash

# Define the list of files to check, excluding .md, hidden, and a number of specific files:
files_to_check=$(git ls-files | grep -vE "^\." | grep -vE "\.md$" | grep -vE "^(tests/python_tests|tools/who_what_benchmark/(tests|whowhatbench))" | grep -v "tools/llm_bench/llm_bench_utils/ov_model_classes.py")

# Run git grep to find non-ASCII characters in the selected files and store the results:
results=$(LC_ALL=C git grep -n "[^ -~±�\”\“]" -- $files_to_check)

# Report the result:
if [ -n "$results" ]; then
    echo "Error: Non-ASCII characters found in files:"
    echo "$results"
    exit 1
else
    echo "Success: No non-ASCII characters found in files."
fi
exit 0
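To preview locally what this hook would flag, a simpler variant of the same git grep can be run by hand; this sketch excludes Markdown via a Git pathspec instead of the script's grep filters and is illustrative only:

```sh
# Search all tracked files except Markdown for characters outside the printable
# ASCII range (space through tilde); -n prints the matching line numbers.
LC_ALL=C git grep -n "[^ -~]" -- ':!*.md' || echo "No non-ASCII content found."
```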
2 changes: 1 addition & 1 deletion requirements-build.txt
@@ -1,2 +1,2 @@
cmake~=3.23.0
pybind11-stubgen==2.5.1
pybind11-stubgen==2.5.1
@@ -19,7 +19,7 @@ int main(int argc, char* argv[]) try {
config.num_beams = 15;
config.diversity_penalty = 1.0f;
config.num_return_sequences = config.num_beams;

// Since the streamer is set, the results will
// be printed each time a new token is generated.
auto beams = pipe.generate(prompts, config);
6 changes: 3 additions & 3 deletions samples/cpp/benchmark_genai/benchmark_genai.cpp
@@ -35,15 +35,15 @@ int main(int argc, char* argv[]) try {
std::string device = result["device"].as<std::string>();
size_t num_warmup = result["num_warmup"].as<size_t>();
size_t num_iter = result["num_iter"].as<size_t>();

ov::genai::GenerationConfig config;
config.max_new_tokens = result["max_new_tokens"].as<size_t>();

ov::genai::LLMPipeline pipe(models_path, device);

for (size_t i = 0; i < num_warmup; i++)
pipe.generate(prompt, config);

ov::genai::DecodedResults res = pipe.generate(prompt, config);
ov::genai::PerfMetrics metrics = res.perf_metrics;
for (size_t i = 0; i < num_iter - 1; i++) {
6 changes: 3 additions & 3 deletions samples/cpp/chat_sample/chat_sample.cpp
@@ -12,14 +12,14 @@ int main(int argc, char* argv[]) try {

std::string device = "CPU"; // GPU, NPU can be used as well
ov::genai::LLMPipeline pipe(models_path, device);

ov::genai::GenerationConfig config;
config.max_new_tokens = 100;
std::function<bool(std::string)> streamer = [](std::string word) {
std::function<bool(std::string)> streamer = [](std::string word) {
std::cout << word << std::flush;
// The returned flag indicates whether generation should be stopped.
// false means continue generation.
return false;
return false;
};

pipe.start_chat();
4 changes: 2 additions & 2 deletions samples/cpp/image_generation/CMakeLists.txt
@@ -80,7 +80,7 @@ install(TARGETS image2image
RUNTIME DESTINATION samples_bin/
COMPONENT samples_bin
EXCLUDE_FROM_ALL)

# create LoRA sample executable

add_executable(inpainting inpainting.cpp load_image.cpp imwrite.cpp)
@@ -96,4 +96,4 @@ set_target_properties(inpainting PROPERTIES
install(TARGETS inpainting
RUNTIME DESTINATION samples_bin/
COMPONENT samples_bin
EXCLUDE_FROM_ALL)
EXCLUDE_FROM_ALL)
4 changes: 2 additions & 2 deletions samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt
@@ -13,11 +13,11 @@ add_executable(${TARGET_NAME} ${TARGET_NAME}.cpp)
target_link_libraries(${TARGET_NAME} PRIVATE openvino::genai)

set_target_properties(${TARGET_NAME} PROPERTIES
COMPILE_PDB_NAME ${TARGET_NAME}
COMPILE_PDB_NAME ${TARGET_NAME}
# Ensure out of box LC_RPATH on macOS with SIP
INSTALL_RPATH_USE_LINK_PATH ON)

install(TARGETS ${TARGET_NAME}
install(TARGETS ${TARGET_NAME}
RUNTIME DESTINATION samples_bin/
COMPONENT samples_bin
EXCLUDE_FROM_ALL)
@@ -19,7 +19,7 @@ int main(int argc, char* argv[]) try {

std::string model_path = argv[1];
std::string prompt = argv[2];

std::string device = "CPU";

ov::genai::LLMPipeline pipe(
4 changes: 2 additions & 2 deletions samples/cpp/speculative_decoding_lm/CMakeLists.txt
@@ -13,11 +13,11 @@ add_executable(${TARGET_NAME} ${TARGET_NAME}.cpp)
target_link_libraries(${TARGET_NAME} PRIVATE openvino::genai)

set_target_properties(${TARGET_NAME} PROPERTIES
COMPILE_PDB_NAME ${TARGET_NAME}
COMPILE_PDB_NAME ${TARGET_NAME}
# Ensure out of box LC_RPATH on macOS with SIP
INSTALL_RPATH_USE_LINK_PATH ON)

install(TARGETS ${TARGET_NAME}
install(TARGETS ${TARGET_NAME}
RUNTIME DESTINATION samples_bin/
COMPONENT samples_bin
EXCLUDE_FROM_ALL)
@@ -21,7 +21,7 @@ int main(int argc, char* argv[]) try {
std::string main_model_path = argv[1];
std::string draft_model_path = argv[2];
std::string prompt = argv[3];

// User can run main and draft model on different devices.
// Please set the device for the main model in the `LLMPipeline` constructor and for the draft model in `ov::genai::draft_model`.
std::string main_device = "CPU", draft_device = "CPU";
2 changes: 1 addition & 1 deletion samples/cpp/text_generation/encrypted_model_causal_lm.cpp
@@ -41,7 +41,7 @@ int main(int argc, char* argv[]) try {

auto [model_str, model_weights] = decrypt_model(models_path + "/openvino_model.xml", models_path + "/openvino_model.bin");
ov::genai::Tokenizer tokenizer = decrypt_tokenizer(models_path);

ov::genai::LLMPipeline pipe(model_str, model_weights, tokenizer, device);

std::string result = pipe.generate(prompt, ov::genai::max_new_tokens(100));
2 changes: 1 addition & 1 deletion samples/cpp/visual_language_chat/CMakeLists.txt
@@ -42,4 +42,4 @@ set_target_properties(benchmark_vlm PROPERTIES
install(TARGETS benchmark_vlm
RUNTIME DESTINATION samples_bin/
COMPONENT samples_bin
EXCLUDE_FROM_ALL)
EXCLUDE_FROM_ALL)
6 changes: 3 additions & 3 deletions samples/cpp/visual_language_chat/benchmark_vlm.cpp
@@ -42,15 +42,15 @@ int main(int argc, char* argv[]) try {
size_t num_warmup = result["num_warmup"].as<size_t>();
size_t num_iter = result["num_iter"].as<size_t>();
ov::Tensor image = utils::load_image(image_path);

ov::genai::GenerationConfig config;
config.max_new_tokens = result["max_new_tokens"].as<size_t>();

ov::genai::VLMPipeline pipe(models_path, device);

for (size_t i = 0; i < num_warmup; i++)
pipe.generate(prompt, ov::genai::image(image), ov::genai::generation_config(config));

auto res = pipe.generate(prompt, ov::genai::image(image), ov::genai::generation_config(config));
auto metrics = res.perf_metrics;
for (size_t i = 0; i < num_iter - 1; i++) {
2 changes: 1 addition & 1 deletion samples/export-requirements.txt
@@ -10,4 +10,4 @@ diffusers==0.32.1 # For image generation pipelines
timm==1.0.12 # For exporting InternVL2
torchvision # For visual language models
transformers>=4.43 # For Whisper
hf_transfer # for faster models download, should used with env var HF_HUB_ENABLE_HF_TRANSFER=1
hf_transfer # for faster models download, should used with env var HF_HUB_ENABLE_HF_TRANSFER=1
12 changes: 6 additions & 6 deletions samples/python/benchmark_genai/benchmark_genai.py
@@ -12,31 +12,31 @@ def main():
parser.add_argument("-n", "--num_iter", type=int, default=2, help="Number of iterations")
parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens")
parser.add_argument("-d", "--device", type=str, default="CPU", help="Device")

args = parser.parse_args()

# Perf metrics is stored in DecodedResults.
# Perf metrics is stored in DecodedResults.
# In order to get DecodedResults instead of a string, the input should be a list.
prompt = [args.prompt]
models_path = args.model
device = args.device
num_warmup = args.num_warmup
num_iter = args.num_iter

config = ov_genai.GenerationConfig()
config.max_new_tokens = args.max_new_tokens

pipe = ov_genai.LLMPipeline(models_path, device)

for _ in range(num_warmup):
pipe.generate(prompt, config)

res = pipe.generate(prompt, config)
perf_metrics = res.perf_metrics
for _ in range(num_iter - 1):
res = pipe.generate(prompt, config)
perf_metrics += res.perf_metrics

print(f"Load time: {perf_metrics.get_load_time():.2f} ms")
print(f"Generate time: {perf_metrics.get_generate_duration().mean:.2f} ± {perf_metrics.get_generate_duration().std:.2f} ms")
print(f"Tokenization time: {perf_metrics.get_tokenization_duration().mean:.2f} ± {perf_metrics.get_tokenization_duration().std:.2f} ms")
2 changes: 1 addition & 1 deletion samples/python/image_generation/text2image.py
@@ -29,4 +29,4 @@ def main():


if '__main__' == __name__:
main()
main()
34 changes: 17 additions & 17 deletions samples/python/multinomial_causal_lm/multinomial_causal_lm.py
@@ -11,18 +11,18 @@
class IterableStreamer(openvino_genai.StreamerBase):
"""
A custom streamer class for handling token streaming and detokenization with buffering.

Attributes:
tokenizer (Tokenizer): The tokenizer used for encoding and decoding tokens.
tokens_cache (list): A buffer to accumulate tokens for detokenization.
text_queue (Queue): A synchronized queue for storing decoded text chunks.
print_len (int): The length of the printed text to manage incremental decoding.
"""

def __init__(self, tokenizer):
"""
Initializes the IterableStreamer with the given tokenizer.

Args:
tokenizer (Tokenizer): The tokenizer to use for encoding and decoding tokens.
"""
@@ -37,35 +37,35 @@ def __iter__(self):
Returns the iterator object itself.
"""
return self

def __next__(self):
"""
Returns the next value from the text queue.

Returns:
str: The next decoded text chunk.

Raises:
StopIteration: If there are no more elements in the queue.
"""
value = self.text_queue.get() # get() will be blocked until a token is available.
if value is None:
raise StopIteration
return value

def get_stop_flag(self):
"""
Checks whether the generation process should be stopped.

Returns:
bool: Always returns False in this implementation.
"""
return False

def put_word(self, word: str):
"""
Puts a word into the text queue.

Args:
word (str): The word to put into the queue.
"""
@@ -74,20 +74,20 @@ def put_word(self, word: str):
def put(self, token_id: int) -> bool:
"""
Processes a token and manages the decoding buffer. Adds decoded text to the queue.

Args:
token_id (int): The token_id to process.

Returns:
bool: True if generation should be stopped, False otherwise.
"""
"""
self.tokens_cache.append(token_id)
text = self.tokenizer.decode(self.tokens_cache)

word = ''
if len(text) > self.print_len and '\n' == text[-1]:
# Flush the cache after the new line symbol.
word = text[self.print_len:]
word = text[self.print_len:]
self.tokens_cache = []
self.print_len = 0
elif len(text) >= 3 and text[-1] == chr(65533):
@@ -98,8 +98,8 @@ def put(self, token_id: int) -> bool:
# Print to output only if text length is increased.
word = text[self.print_len:]
self.print_len = len(text)
self.put_word(word)
self.put_word(word)

if self.get_stop_flag():
# When generation is stopped from streamer then end is not called, need to call it here manually.
self.end()
@@ -161,7 +161,7 @@ def token_printer():
config.top_p = 0.9
config.top_k = 30

# Since the streamer is set, the results will be printed
# Since the streamer is set, the results will be printed
# every time a new token is generated and put into the streamer queue.
pipe.generate(args.prompt, config, text_print_streamer)
printer_thread.join()