Change prints in build_chunks_from_docling_json to debug messages
Signed-off-by: Khaled Sulayman <[email protected]>
khaledsulayman committed Nov 7, 2024
commit f06e7f4 · 1 parent e7b1666
Showing 2 changed files with 7 additions and 15 deletions.
12 changes: 6 additions & 6 deletions src/instructlab/sdg/utils/chunkers.py
@@ -3,7 +3,7 @@
 from collections import defaultdict
 from enum import Enum
 from pathlib import Path
-from typing import DefaultDict, Iterable, List, Tuple, Optional
+from typing import DefaultDict, Iterable, List, Optional, Tuple
 import json
 import logging
 import re
@@ -50,7 +50,7 @@ def __new__(
         cls,
         leaf_node,
         taxonomy_path,
-        output_dir: Optional[Path],
+        output_dir: Path,
         server_ctx_size=4096,
         chunk_word_count=1024,
         tokenizer_model_name: str | None = None,
@@ -238,7 +238,7 @@ def _path_validator(self, path) -> Path:
             raise FileNotFoundError(f"{path} does not exist.")
         return path
 
-    def _load_qna_yaml(self, qna_yaml_path: Path | None) -> dict:
+    def _load_qna_yaml(self, qna_yaml_path: Optional[Path]) -> dict:
         """
         Load the qna YAML file.
         Args:
@@ -503,7 +503,7 @@ def build_chunks_from_docling_json(
                     and len(current_buffer) > 1
                 ):
                     chunk_text = "\n\n".join(current_buffer[:-1])
-                    print(
+                    logger.debug(
                         f"Current chunk size {self.get_token_count(chunk_text, tokenizer)} and max is {max_token_per_chunk}"
                     )
 
@@ -513,8 +513,8 @@ def build_chunks_from_docling_json(
                         self.get_token_count(current_buffer[-1], tokenizer)
                         >= max_token_per_chunk
                     ):
-                        print(
-                            f"This is too big a document to be left in the current buffer {self.get_token_count(current_buffer[-1], tokenizer)}"
+                        logger.debug(
+                            f"The following text was dropped from the document because it was too long to fit into a single context for synthetic data generation: {current_buffer[-1]}"
                        )
                         document_chunks.append(current_buffer[-1])
                         current_buffer = []
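The hunks above carry the point of the commit: the chunk-size bookkeeping in build_chunks_from_docling_json now goes through logger.debug() instead of print(), so it stays silent unless the host application opts in. Below is a minimal sketch of that bookkeeping outside the class. The body of get_token_count is an assumption (this diff only shows its call sites); counting with a Hugging Face tokenizer is the usual approach, and the model name is taken from the test file below.

import logging

from transformers import AutoTokenizer

logger = logging.getLogger(__name__)


def get_token_count(text: str, tokenizer) -> int:
    # Assumption: chunks are budgeted in tokenizer tokens, not words.
    return len(tokenizer.tokenize(text))


# Requires network or cache access to the model's tokenizer files.
tokenizer = AutoTokenizer.from_pretrained("instructlab/merlinite-7b-lab")
max_token_per_chunk = 1024

buffer_tail = "a very long docling element ..."
if get_token_count(buffer_tail, tokenizer) >= max_token_per_chunk:
    # With logger.debug() this message is discarded by default instead of
    # being written to stdout for every oversized element, as print() was.
    logger.debug(
        "The following text was dropped from the document because it was too "
        "long to fit into a single context for synthetic data generation: %s",
        buffer_tail,
    )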
10 changes: 1 addition & 9 deletions tests/test_chunkers.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Standard
-import tempfile
 from pathlib import Path
+import tempfile
 
 # Third Party
 from docling.datamodel.base_models import PipelineOptions
@@ -44,16 +44,12 @@ def test_chunker_factory(filepaths, chunker_type, documents_dir):
         }
     ]
     with tempfile.TemporaryDirectory() as temp_dir:
-<<<<<<< HEAD
-        chunker = DocumentChunker(leaf_node=leaf_node, taxonomy_path=documents_dir, output_dir=temp_dir, tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1")
-=======
         chunker = DocumentChunker(
             leaf_node=leaf_node,
             taxonomy_path=documents_dir,
             output_dir=temp_dir,
             tokenizer_model_name="instructlab/merlinite-7b-lab",
         )
->>>>>>> eaf7434 (wip)
         assert isinstance(chunker, chunker_type)
 
 
@@ -68,13 +64,9 @@ def test_chunker_factory_unsupported_filetype(documents_dir):
     ]
     with pytest.raises(ValueError):
         with tempfile.TemporaryDirectory() as temp_dir:
-<<<<<<< HEAD
-            _ = DocumentChunker(leaf_node=leaf_node, taxonomy_path=documents_dir, output_dir=temp_dir, tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1")
-=======
             _ = DocumentChunker(
                 leaf_node=leaf_node,
                 taxonomy_path=documents_dir,
                 output_dir=temp_dir,
                 tokenizer_model_name="instructlab/merlinite-7b-lab",
             )
->>>>>>> eaf7434 (wip)

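Because the messages are now emitted at DEBUG level, they no longer appear by default. A consumer who wants to see them during chunking can opt in with the standard library alone. The logger name below is an assumption: it presumes the module uses logging.getLogger(__name__), which for src/instructlab/sdg/utils/chunkers.py resolves to instructlab.sdg.utils.chunkers.

import logging

# Show DEBUG output process-wide ...
logging.basicConfig(level=logging.DEBUG)

# ... or enable it only for the chunker module (name derived from the
# file path; an assumption, since the diff does not show the logger setup).
logging.getLogger("instructlab.sdg.utils.chunkers").setLevel(logging.DEBUG)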