diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index 5e46216d..97cb9084 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -3,7 +3,7 @@
 from collections import defaultdict
 from enum import Enum
 from pathlib import Path
-from typing import DefaultDict, Iterable, List, Tuple, Optional
+from typing import DefaultDict, Iterable, List, Optional, Tuple
 import json
 import logging
 import re
@@ -50,7 +50,7 @@ def __new__(
         cls,
         leaf_node,
         taxonomy_path,
-        output_dir: Optional[Path],
+        output_dir: Path,
         server_ctx_size=4096,
         chunk_word_count=1024,
         tokenizer_model_name: str | None = None,
@@ -238,7 +238,7 @@ def _path_validator(self, path) -> Path:
             raise FileNotFoundError(f"{path} does not exist.")
         return path

-    def _load_qna_yaml(self, qna_yaml_path: Path | None) -> dict:
+    def _load_qna_yaml(self, qna_yaml_path: Optional[Path]) -> dict:
         """
         Load the qna YAML file.
         Args:
@@ -503,7 +503,7 @@ def build_chunks_from_docling_json(
                     and len(current_buffer) > 1
                 ):
                     chunk_text = "\n\n".join(current_buffer[:-1])
-                    print(
+                    logger.debug(
                         f"Current chunk size {self.get_token_count(chunk_text, tokenizer)} and max is {max_token_per_chunk}"
                     )

@@ -513,8 +513,8 @@
                     self.get_token_count(current_buffer[-1], tokenizer)
                     >= max_token_per_chunk
                 ):
-                    print(
-                        f"This is too big a document to be left in the current buffer {self.get_token_count(current_buffer[-1], tokenizer)}"
+                    logger.debug(
+                        f"The following text was dropped from the document because it was too long to fit into a single context for synthetic data generation: {current_buffer[-1]}"
                     )
                     document_chunks.append(current_buffer[-1])
                     current_buffer = []
diff --git a/tests/test_chunkers.py b/tests/test_chunkers.py
index 2270a8a8..7d2923d9 100644
--- a/tests/test_chunkers.py
+++ b/tests/test_chunkers.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0

 # Standard
-import tempfile
 from pathlib import Path
+import tempfile

 # Third Party
 from docling.datamodel.base_models import PipelineOptions
@@ -44,16 +44,12 @@ def test_chunker_factory(filepaths, chunker_type, documents_dir):
         }
     ]
     with tempfile.TemporaryDirectory() as temp_dir:
-<<<<<<< HEAD
-        chunker = DocumentChunker(leaf_node=leaf_node, taxonomy_path=documents_dir, output_dir=temp_dir, tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1")
-=======
         chunker = DocumentChunker(
             leaf_node=leaf_node,
             taxonomy_path=documents_dir,
             output_dir=temp_dir,
             tokenizer_model_name="instructlab/merlinite-7b-lab",
         )
->>>>>>> eaf7434 (wip)

     assert isinstance(chunker, chunker_type)

@@ -68,13 +64,9 @@ def test_chunker_factory_unsupported_filetype(documents_dir):
     ]
     with pytest.raises(ValueError):
         with tempfile.TemporaryDirectory() as temp_dir:
-<<<<<<< HEAD
-            _ = DocumentChunker(leaf_node=leaf_node, taxonomy_path=documents_dir, output_dir=temp_dir, tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1")
-=======
             _ = DocumentChunker(
                 leaf_node=leaf_node,
                 taxonomy_path=documents_dir,
                 output_dir=temp_dir,
                 tokenizer_model_name="instructlab/merlinite-7b-lab",
             )
->>>>>>> eaf7434 (wip)
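
A minimal usage sketch of the DocumentChunker constructor after this change, mirroring the revised test above. The leaf_node field names and the documents_dir path are illustrative assumptions for this sketch, not values confirmed by the diff; enabling DEBUG logging surfaces the chunk-size messages that this change moves from print() to logger.debug().

    # Standard
    import logging
    import tempfile
    from pathlib import Path

    # First Party
    from instructlab.sdg.utils.chunkers import DocumentChunker

    # Surface the logger.debug() chunking messages introduced by this change.
    logging.basicConfig(level=logging.DEBUG)

    documents_dir = Path("tests/testdata/sample_documents")  # hypothetical path
    leaf_node = [
        {
            # Field names assumed from the test fixtures; adjust to the real
            # leaf-node schema in your checkout.
            "documents": ["some text"],
            "taxonomy_path": "knowledge",
            "filepaths": [documents_dir / "document.pdf"],
        }
    ]

    with tempfile.TemporaryDirectory() as temp_dir:
        chunker = DocumentChunker(
            leaf_node=leaf_node,
            taxonomy_path=documents_dir,
            output_dir=Path(temp_dir),  # now a required Path, not Optional[Path]
            tokenizer_model_name="instructlab/merlinite-7b-lab",
        )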