Change prints in build_chunks_from_docling_json to debug messages
Signed-off-by: Khaled Sulayman <[email protected]>
khaledsulayman committed Nov 7, 2024
commit f06e7f4 · 1 parent e7b1666
Showing 2 changed files with 7 additions and 15 deletions.
12 changes: 6 additions & 6 deletions src/instructlab/sdg/utils/chunkers.py
@@ -3,7 +3,7 @@
 from collections import defaultdict
 from enum import Enum
 from pathlib import Path
-from typing import DefaultDict, Iterable, List, Tuple, Optional
+from typing import DefaultDict, Iterable, List, Optional, Tuple
 import json
 import logging
 import re
@@ -50,7 +50,7 @@ def __new__(
         cls,
         leaf_node,
         taxonomy_path,
-        output_dir: Optional[Path],
+        output_dir: Path,
         server_ctx_size=4096,
         chunk_word_count=1024,
         tokenizer_model_name: str | None = None,
@@ -238,7 +238,7 @@ def _path_validator(self, path) -> Path:
             raise FileNotFoundError(f"{path} does not exist.")
         return path
 
-    def _load_qna_yaml(self, qna_yaml_path: Path | None) -> dict:
+    def _load_qna_yaml(self, qna_yaml_path: Optional[Path]) -> dict:
         """
         Load the qna YAML file.
         Args:
@@ -503,7 +503,7 @@ def build_chunks_from_docling_json(
                     and len(current_buffer) > 1
                 ):
                     chunk_text = "\n\n".join(current_buffer[:-1])
-                    print(
+                    logger.debug(
                         f"Current chunk size {self.get_token_count(chunk_text, tokenizer)} and max is {max_token_per_chunk}"
                     )
 
@@ -513,8 +513,8 @@ def build_chunks_from_docling_json(
                         self.get_token_count(current_buffer[-1], tokenizer)
                         >= max_token_per_chunk
                     ):
-                        print(
-                            f"This is too big a document to be left in the current buffer {self.get_token_count(current_buffer[-1], tokenizer)}"
+                        logger.debug(
+                            f"The following text was dropped from the document because it was too long to fit into a single context for synthetic data generation: {current_buffer[-1]}"
                        )
                         document_chunks.append(current_buffer[-1])
                         current_buffer = []
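The hunks above carry the point of the commit: the chunk-size bookkeeping in build_chunks_from_docling_json now goes through logger.debug() instead of print(), so it stays silent unless the host application opts in. Below is a minimal sketch of that bookkeeping outside the class. The body of get_token_count is an assumption (this diff only shows its call sites); counting with a Hugging Face tokenizer is the usual approach, and the model name is taken from the test file below.

import logging

from transformers import AutoTokenizer

logger = logging.getLogger(__name__)


def get_token_count(text: str, tokenizer) -> int:
    # Assumption: chunks are budgeted in tokenizer tokens, not words.
    return len(tokenizer.tokenize(text))


# Requires network or cache access to the model's tokenizer files.
tokenizer = AutoTokenizer.from_pretrained("instructlab/merlinite-7b-lab")
max_token_per_chunk = 1024

buffer_tail = "a very long docling element ..."
if get_token_count(buffer_tail, tokenizer) >= max_token_per_chunk:
    # With logger.debug() this message is discarded by default instead of
    # being written to stdout for every oversized element, as print() was.
    logger.debug(
        "The following text was dropped from the document because it was too "
        "long to fit into a single context for synthetic data generation: %s",
        buffer_tail,
    )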
10 changes: 1 addition & 9 deletions tests/test_chunkers.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Standard
-import tempfile
 from pathlib import Path
+import tempfile
 
 # Third Party
 from docling.datamodel.base_models import PipelineOptions
@@ -44,16 +44,12 @@ def test_chunker_factory(filepaths, chunker_type, documents_dir):
         }
     ]
     with tempfile.TemporaryDirectory() as temp_dir:
-<<<<<<< HEAD
-        chunker = DocumentChunker(leaf_node=leaf_node, taxonomy_path=documents_dir, output_dir=temp_dir, tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1")
-=======
         chunker = DocumentChunker(
             leaf_node=leaf_node,
             taxonomy_path=documents_dir,
             output_dir=temp_dir,
             tokenizer_model_name="instructlab/merlinite-7b-lab",
         )
->>>>>>> eaf7434 (wip)
         assert isinstance(chunker, chunker_type)
 
 
@@ -68,13 +64,9 @@ def test_chunker_factory_unsupported_filetype(documents_dir):
     ]
     with pytest.raises(ValueError):
         with tempfile.TemporaryDirectory() as temp_dir:
-<<<<<<< HEAD
-            _ = DocumentChunker(leaf_node=leaf_node, taxonomy_path=documents_dir, output_dir=temp_dir, tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1")
-=======
             _ = DocumentChunker(
                 leaf_node=leaf_node,
                 taxonomy_path=documents_dir,
                 output_dir=temp_dir,
                 tokenizer_model_name="instructlab/merlinite-7b-lab",
             )
->>>>>>> eaf7434 (wip)

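Because the messages are now emitted at DEBUG level, they no longer appear by default. A consumer who wants to see them during chunking can opt in with the standard library alone. The logger name below is an assumption: it presumes the module uses logging.getLogger(__name__), which for src/instructlab/sdg/utils/chunkers.py resolves to instructlab.sdg.utils.chunkers.

import logging

# Show DEBUG output process-wide ...
logging.basicConfig(level=logging.DEBUG)

# ... or enable it only for the chunker module (name derived from the
# file path; an assumption, since the diff does not show the logger setup).
logging.getLogger("instructlab.sdg.utils.chunkers").setLevel(logging.DEBUG)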