Merge pull request #284 from khaledsulayman/ks-integrate-docprocessor
Integrate Context-Aware Chunking and PDF Support
mergify[bot] authored Nov 7, 2024
2 parents f0d8a6f + f06e7f4 commit 4c82c05
Showing 10 changed files with 907 additions and 208 deletions.
6 changes: 5 additions & 1 deletion requirements.txt
@@ -1,14 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 click>=8.1.7,<9.0.0
 datasets>=2.18.0,<3.0.0
+docling>=1.15.0,<2.0.0
 GitPython>=3.1.42,<4.0.0
 httpx>=0.25.0,<1.0.0
 instructlab-schema>=0.4.0
 langchain-text-splitters
-openai>=1.13.3,<2.0.0
+# Note: this dependency goes along with langchain-text-splitters and may be
+# removed once that one is removed.
 # do not use 8.4.0 due to a bug in the library
 # https://github.com/instructlab/instructlab/issues/1389
+openai>=1.13.3,<2.0.0
+tabulate>=0.9.0
 tenacity>=8.3.0,!=8.4.0
 torch>=2.3.0,<2.5.0
 transformers>=4.41.2
 xdg-base-dirs>=6.0.1
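
The new docling pin is the PDF-support half of this PR: taxonomy knowledge documents can now be PDFs that are converted to structured text before chunking. A minimal usage sketch against the docling 1.x API implied by the pin above; the convert_single/render_as_markdown calls and the file path are assumptions from that version range, not code from this diff:

# Sketch only: assumes the docling 1.x API implied by the pin above;
# "sample_knowledge_doc.pdf" is a hypothetical input path.
from docling.document_converter import DocumentConverter

converter = DocumentConverter()
converted = converter.convert_single("sample_knowledge_doc.pdf")  # parse one PDF
print(converted.render_as_markdown()[:200])  # structured text, ready for chunking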
31 changes: 20 additions & 11 deletions src/instructlab/sdg/generate_data.py
@@ -13,7 +13,6 @@
 
 # Third Party
 # instructlab - All of these need to go away (other than sdg) - issue #6
-from datasets import Dataset
 from xdg_base_dirs import xdg_data_dirs, xdg_data_home
 import openai
 

@@ -268,7 +267,7 @@ def generate_data(
     model_name: Optional[str] = None,
     num_cpus: Optional[int] = None,
     num_instructions_to_generate: Optional[int] = 30,
-    taxonomy: Optional[str] = None,
+    taxonomy: Optional[str] = None,  # TODO rename to taxonomy_path to match config
     taxonomy_base: Optional[str] = None,
     output_dir: Optional[str] = None,
     # TODO - not used and should be removed from the CLI
@@ -309,12 +308,16 @@ def generate_data(
     if not (taxonomy and os.path.exists(taxonomy)):
         raise GenerateException(f"Error: taxonomy ({taxonomy}) does not exist.")
 
-    leaf_nodes = read_taxonomy_leaf_nodes(taxonomy, taxonomy_base, yaml_rules)
+    date_suffix = datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
+    document_output_dir = Path(output_dir) / f"documents-{date_suffix}"
+
+    leaf_nodes = read_taxonomy_leaf_nodes(
+        taxonomy, taxonomy_base, yaml_rules, document_output_dir
+    )
     if not leaf_nodes:
         raise GenerateException("Error: No new leaf nodes found in the taxonomy.")
 
     name = Path(model_name).stem  # Just in case it is a file path
-    date_suffix = datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
     output_file_messages = f"messages_{name}_{date_suffix}.jsonl"
     output_file_test = f"test_{name}_{date_suffix}.jsonl"
     output_file_train = f"train_{name}_{date_suffix}.jsonl"
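
Hoisting date_suffix above the taxonomy read lets one timestamp name both the per-run document dump directory and the output JSONL files. A quick illustration of the values this produces; "/tmp/output" stands in for the real output_dir:

# Illustration of the timestamped paths built above; "/tmp/output" is an example.
from datetime import datetime
from pathlib import Path

date_suffix = datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
print(date_suffix)  # e.g. 2024-11-07T14_30_05 (colons are unsafe in some filesystems)
document_output_dir = Path("/tmp/output") / f"documents-{date_suffix}"
print(document_output_dir)  # e.g. /tmp/output/documents-2024-11-07T14_30_05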
@@ -362,25 +365,31 @@ def generate_data(
     for leaf_node in leaf_nodes.values():
         is_knowledge = False
         leaf_node_path = leaf_node[0]["taxonomy_path"].replace("->", "_")
-        samples = leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count)
+        samples = leaf_node_to_samples(
+            leaf_node,
+            taxonomy,
+            server_ctx_size,
+            chunk_word_count,
+            document_output_dir,
+            model_name,
+        )
 
         if not samples:
             raise GenerateException("Error: No samples found in leaf node.")
 
-        if samples[0].get("document"):
+        if "document" in samples.column_names:
             pipe = knowledge_pipe
             is_knowledge = True
 
-        elif samples[0].get("seed_context"):
+        elif "seed_context" in samples.column_names:
             pipe = grounded_skills_pipe
 
         else:
             pipe = freeform_skills_pipe
 
         logger.debug("Samples: %s", samples)
-        ds = Dataset.from_list(samples)
-        logger.debug("Dataset: %s", ds)
-        new_generated_data = pipe.generate(ds, leaf_node_path)
-
+
+        new_generated_data = pipe.generate(samples, leaf_node_path)
         if len(new_generated_data) == 0:
             empty_sdg_leaf_nodes.append(leaf_node_path)
             logger.warning("Empty dataset for qna node: %s", leaf_node_path)
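
leaf_node_to_samples now returns a datasets.Dataset rather than a list of dicts, so pipeline selection switches from samples[0].get(...) to column_names checks, which also works when a dataset has a schema but no rows. A toy, self-contained illustration of the dispatch; the row contents are placeholders, not real SDG samples:

# Toy datasets showing the column_names dispatch; field values are placeholders.
from datasets import Dataset

knowledge = Dataset.from_list([{"document": "a chunk", "question": "q?"}])
freeform = Dataset.from_list([{"task_description": "write a haiku"}])

print("document" in knowledge.column_names)      # True  -> knowledge pipeline
print("document" in freeform.column_names)       # False
print("seed_context" in freeform.column_names)   # False -> freeform skills pipeline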
@@ -398,7 +407,7 @@ def generate_data(
             generate_eval_task_data(
                 mmlu_bench_pipe,
                 leaf_node_path,
-                ds,
+                samples,
                 output_dir,
                 date_suffix,
             )
1 change: 0 additions & 1 deletion src/instructlab/sdg/pipeline.py
@@ -195,7 +195,6 @@ def _generate_single(self, dataset) -> Dataset:
             drop_duplicates_cols = block_prop.get("drop_duplicates", False)
             block = block_type(self.ctx, self, block_name, **block_config)
             logger.info("Running block: %s", block_name)
-            logger.info(dataset)
 
             # Execute the block and wrap errors with the block name/type
             dataset = block.generate(dataset)
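
The deleted logger.info(dataset) printed an entire dataset at INFO level on every block; the block-name log on the previous line carries the useful signal. For reference, the retained comment describes a wrap-and-rethrow pattern along these lines — a sketch with a toy block and a generic exception, not the pipeline's actual error type:

# Sketch of "wrap errors with the block name/type"; ToyBlock and RuntimeError
# are stand-ins, not the pipeline's real block or exception classes.
class ToyBlock:
    def generate(self, dataset):
        raise ValueError("bad input")

block, block_name = ToyBlock(), "toy_block"
try:
    block.generate([])
except Exception as err:
    # Re-raise with the block's name/type attached, chaining the original error.
    raise RuntimeError(f"block {block_name} ({type(block).__name__}) failed") from err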
