Skip to content

Commit

Permalink
Update chunkers.py to add docling model path instead of downloading from hugging face
Browse files Browse the repository at this point in the history

Signed-off-by: Aakanksha Duggal <[email protected]>
  • Loading branch information
aakankshaduggal authored Nov 8, 2024
1 parent 0ea8481 commit 3359ba4
Showing 1 changed file with 9 additions and 2 deletions.
11 changes: 9 additions & 2 deletions src/instructlab/sdg/utils/chunkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def __new__(
server_ctx_size=4096,
chunk_word_count=1024,
tokenizer_model_name: str | None = None,
docling_model_path: str | None = None,
):
"""Insantiate the appropriate chunker for the provided document
Expand Down Expand Up @@ -115,6 +116,7 @@ def __new__(
output_dir,
chunk_word_count,
tokenizer_model_name,
docling_model_path=docling_model_path,
)

@staticmethod
Expand Down Expand Up @@ -189,6 +191,7 @@ def __init__(
output_dir: Path,
chunk_word_count: int,
tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
docling_model_path=None
):
self.document_paths = document_paths
self.filepaths = filepaths
Expand All @@ -201,6 +204,7 @@ def __init__(
)

self.tokenizer = self.create_tokenizer(tokenizer_model_name)
self.docling_model_path = docling_model_path

def chunk_documents(self) -> List:
"""Semantically chunk PDF documents.
Expand All @@ -211,8 +215,11 @@ def chunk_documents(self) -> List:
if self.document_paths == []:
return []

model_artifacts_path = StandardPdfPipeline.download_models_hf()
pipeline_options = PdfPipelineOptions(artifacts_path=model_artifacts_path)
if not self.docling_model_path.exists():
raise FileNotFoundError(f"Docling model path not found: {self.docling_model_path}")
print("docling_model_path", docling_model_path)

Check failure on line 220 in src/instructlab/sdg/utils/chunkers.py

View workflow job for this annotation

GitHub Actions / pylint

E0602: Undefined variable 'docling_model_path' (undefined-variable)
pipeline_options = PdfPipelineOptions(artifacts_path=docling_model_path)

Check failure on line 221 in src/instructlab/sdg/utils/chunkers.py

View workflow job for this annotation

GitHub Actions / pylint

E0602: Undefined variable 'docling_model_path' (undefined-variable)

converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
Expand Down

0 comments on commit 3359ba4

Please sign in to comment.