Skip to content

Commit

Permalink
Ensure knowledge docs are cloned into unique dirs
Browse files Browse the repository at this point in the history
Previously, we were attempting to clone multiple knowledge documents
into the same destination directory, leading to failures generating
data for any run that contained 2+ knowledge leaf nodes.

Now, we clone docs into a guaranteed unique (via `tempfile.mkdtemp`)
subdirectory per knowledge leaf node. Just using a subdirectory per
leaf node could still have led to collisions if the user ran data
generation twice within one minute, which is why this goes the extra
step of using `mkdtemp` for guaranteed uniqueness.

Fixes instructlab#404

Signed-off-by: Ben Browning <[email protected]>
  • Loading branch information
bbrowning committed Nov 27, 2024
1 parent eb8119c commit 823c279
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 17 deletions.
9 changes: 8 additions & 1 deletion src/instructlab/sdg/utils/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

# Standard
from pathlib import Path
from tempfile import mkdtemp
from typing import Dict, List, Tuple, Union
import glob
import logging
Expand Down Expand Up @@ -257,14 +258,20 @@ def _read_taxonomy_file(
try:
# get seed instruction data
tax_path = "->".join(taxonomy.path.parent.parts)
leaf_node_path = tax_path.replace("->", "_")
contents = taxonomy.contents
task_description = contents.get("task_description", None)
domain = contents.get("domain")
documents = contents.get("document")
document_contents, doc_filepaths = None, None
if documents:
os.makedirs(document_output_dir, exist_ok=True)
unique_output_dir = mkdtemp(
prefix=f"{leaf_node_path}_", dir=document_output_dir
)
document_contents, doc_filepaths = _get_documents(
source=documents, document_output_dir=document_output_dir
source=documents,
document_output_dir=unique_output_dir,
)
logger.debug("Content from git repo fetched")

Expand Down
44 changes: 28 additions & 16 deletions tests/test_generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,10 +119,10 @@ def validate_phase_leaf_node_dataset(dataset_file_name):
assert ds.features["messages"][0]["role"].dtype == "string"


def validate_recipe(recipe_file_name):
def validate_recipe(recipe_file_name, num_datasets):
with open(recipe_file_name, encoding="utf-8") as fp:
yaml_contents = yaml.safe_load(fp)
assert len(yaml_contents["datasets"]) == 1
assert len(yaml_contents["datasets"]) == num_datasets
assert yaml_contents["datasets"][0]["path"].endswith(".jsonl")
assert "sampling_size" in yaml_contents["datasets"][0]
assert yaml_contents["metadata"]["sys_prompt"] == TEST_SYS_PROMPT
Expand Down Expand Up @@ -344,7 +344,7 @@ def test_generate(self):
if name.endswith("compositional_skills_new.jsonl"):
validate_skill_leaf_node_dataset(matches[0])
elif name.startswith("skills_recipe_"):
validate_recipe(matches[0])
validate_recipe(matches[0], 1)
elif name.startswith("skills_train_msgs_"):
validate_mixed_dataset(matches[0])

Expand Down Expand Up @@ -374,13 +374,19 @@ def setUp(self):
TEST_DATA_DIR, "test_valid_knowledge_skill.yaml"
)
tracked_knowledge_file = os.path.join("knowledge ", "tracked", "qna.yaml")
untracked_knowledge_file = os.path.join("knowledge", "new", "qna.yaml")
# Explicitly add 2 files here to ensure multiple knowledge leaf nodes
# don't conflict in anything like document_output_dir for knowledge docs
untracked_knowledge_file1 = os.path.join("knowledge", "new1", "qna.yaml")
untracked_knowledge_file2 = os.path.join("knowledge", "new2", "qna.yaml")
test_valid_knowledge_skill = load_test_skills(test_valid_knowledge_skill_file)
self.test_taxonomy.add_tracked(
tracked_knowledge_file, test_valid_knowledge_skill
)
self.test_taxonomy.create_untracked(
untracked_knowledge_file, test_valid_knowledge_skill
untracked_knowledge_file1, test_valid_knowledge_skill
)
self.test_taxonomy.create_untracked(
untracked_knowledge_file2, test_valid_knowledge_skill
)
self.expected_test_samples = generate_test_samples(test_valid_knowledge_skill)
self.expected_train_samples = generate_train_samples(test_valid_knowledge_skill)
Expand Down Expand Up @@ -412,40 +418,46 @@ def test_generate(self):
elif name.startswith("messages_"):
validate_messages_dataset(matches[0], self.expected_train_samples)

node_p07_file = os.path.join("node_datasets_*", "knowledge_new_p07.jsonl")
node_p10_file = os.path.join("node_datasets_*", "knowledge_new_p10.jsonl")
node1_p07_file = os.path.join("node_datasets_*", "knowledge_new1_p07.jsonl")
node1_p10_file = os.path.join("node_datasets_*", "knowledge_new1_p10.jsonl")
node2_p07_file = os.path.join("node_datasets_*", "knowledge_new2_p07.jsonl")
node2_p10_file = os.path.join("node_datasets_*", "knowledge_new2_p10.jsonl")
for name in [
"skills_recipe_*.yaml",
"skills_train_*.jsonl",
"knowledge_recipe_*.yaml",
"knowledge_train_msgs_*.jsonl",
node_p07_file,
node_p10_file,
node1_p07_file,
node1_p10_file,
node2_p07_file,
node2_p10_file,
]:
matches = glob.glob(os.path.join(self.tmp_path, name))
assert len(matches) == 1
if name.endswith("knowledge_new_p07.jsonl") or name.endswith(
"knowledge_new_p10.jsonl"
if name.endswith("knowledge_new1_p07.jsonl") or name.endswith(
"knowledge_new1_p10.jsonl"
):
validate_phase_leaf_node_dataset(matches[0])
elif name.startswith("skills_recipe_") or name.startswith(
"knowledge_recipe_"
):
validate_recipe(matches[0])
validate_recipe(matches[0], 2)
elif name.startswith("skills_train_msgs_") or name.startswith(
"knowledge_train_msgs_"
):
validate_mixed_dataset(matches[0])

for name in [
"knowledge_new_task.yaml",
"mmlubench_knowledge_new.jsonl",
"knowledge_new1_task.yaml",
"mmlubench_knowledge_new1.jsonl",
"knowledge_new2_task.yaml",
"mmlubench_knowledge_new2.jsonl",
]:
matches = glob.glob(os.path.join(self.tmp_path, "node_datasets_*", name))
assert len(matches) == 1
if name == "knowledge_new_task.yaml":
if name == "knowledge_new1_task.yaml":
validate_lm_eval_task(matches[0])
elif name == "mmlubench_knowledge_new.jsonl":
elif name == "mmlubench_knowledge_new1.jsonl":
validate_mmlubench_dataset(matches[0])

def teardown(self) -> None:
Expand Down

0 comments on commit 823c279

Please sign in to comment.