Skip to content

Commit

Permalink
Merge pull request #57 from russellb/chunking
Browse files Browse the repository at this point in the history
Fix dataset formatting for pipeline differences
  • Loading branch information
russellb authored Jul 1, 2024
2 parents 1f71fb6 + e606811 commit 45ecc73
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 71 deletions.
12 changes: 6 additions & 6 deletions src/instructlab/sdg/configs/knowledge/simple_generate_qa.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@ Here are the requirements:
examples: |
Here are some examples to help you understand the type of questions that are asked for this document:
{question_1}
{response_1}
{icl_query_1}
{icl_response_1}
{question_2}
{response_2}
{icl_query_2}
{icl_response_2}
{question_3}
{response_3}
{icl_query_3}
{icl_response_3}
Here is the document:
{document}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,10 @@ Here are the requirements:
examples: |
The task is {task_description}.
Here are some examples to help you understand the type of questions that are asked for:
Here is an example to help you understand the type of questions that are asked for:
{question_1}
{response_1}
{question_2}
{response_2}
{question_3}
{response_3}
{seed_question}
{seed_response}
generation: |
Provide a single question and answer pair based on the examples.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,17 @@ Here are the requirements:
examples: |
The task is {task_description}.
Here is some context for the example questions:
Here is some context for the example question:
{context}
{seed_context}
Here are some examples to help you understand the type of questions that are asked for:
Here is an example to help you understand the type of questions that are asked for:
{question_1}
{response_1}
{question_2}
{response_2}
{question_3}
{response_3}
{seed_question}
{seed_response}
generation: |
Provide a single question and answer pair based on the examples.
Provide a single question and answer pair based on the example.
start_tags: [""]
end_tags: [""]
14 changes: 2 additions & 12 deletions src/instructlab/sdg/generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
SynthSkillsFlow,
)
from instructlab.sdg.pipeline import Pipeline
from instructlab.sdg.utils import chunking, models
from instructlab.sdg.utils import models
from instructlab.sdg.utils.taxonomy import (
leaf_node_to_samples,
read_taxonomy_leaf_nodes,
Expand Down Expand Up @@ -270,7 +270,7 @@ def generate_data(

generated_data = None
for leaf_node in leaf_nodes.values():
samples = leaf_node_to_samples(leaf_node)
samples = leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count)

if not samples:
raise utils.GenerateException("Error: No samples found in leaf node.")
Expand All @@ -290,16 +290,6 @@ def generate_data(
"Error: No SDG pipeline for this leaf node type: %s" % samples[0]
)

# TODO this is broken, just trying to get initial integration to run
# pylint: disable=consider-using-enumerate
if samples[0].get("document"):
for i in range(len(samples)):
samples[i]["document"] = chunking.chunk_document(
documents=samples[i]["document"],
server_ctx_size=server_ctx_size,
chunk_word_count=chunk_word_count,
)[0]

# TODO -- there is a parameter for how many samples to generate, but we ignore it so far

logger.debug("Samples: %s" % samples)
Expand Down
104 changes: 72 additions & 32 deletions src/instructlab/sdg/utils/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

# First Party
from instructlab.sdg import utils
from instructlab.sdg.utils import chunking

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -415,42 +416,81 @@ def read_taxonomy_leaf_nodes(taxonomy, taxonomy_base, yaml_rules):
return leaf_nodes


def leaf_node_to_samples(leaf_node):
def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count):
samples = [{}]

# document is the same for the whole leaf node
chunks = (
chunking.chunk_document(
documents=leaf_node[0]["document"],
server_ctx_size=server_ctx_size,
chunk_word_count=chunk_word_count,
)
if leaf_node[0].get("document")
else []
)

# domain is the same for the whole leaf node
domain = leaf_node[0].get("domain")

for chunk in chunks:
# pylint: disable=consider-using-enumerate
for i in range(len(leaf_node)):
samples[-1].setdefault("task_description", leaf_node[i]["task_description"])
samples[-1].setdefault("domain", domain)
samples[-1].setdefault("document", chunk)
if samples[-1].get("document") and not samples[-1].get("domain"):
raise utils.GenerateException(
"Error: No domain provided for knowledge document in leaf node"
)
if "icl_query_3" in samples[-1]:
samples.append({})
if "icl_query_1" not in samples[-1]:
samples[-1]["icl_query_1"] = leaf_node[i]["instruction"]
samples[-1]["icl_response_1"] = leaf_node[i]["output"]
elif "icl_query_2" not in samples[-1]:
samples[-1]["icl_query_2"] = leaf_node[i]["instruction"]
samples[-1]["icl_response_2"] = leaf_node[i]["output"]
else:
samples[-1]["icl_query_3"] = leaf_node[i]["instruction"]
samples[-1]["icl_response_3"] = leaf_node[i]["output"]

# wrap back around to the beginning if the number of examples was not
# evenly divisble by 3
if "icl_query_2" not in samples[-1]:
samples[-1]["icl_query_2"] = leaf_node[0]["instruction"]
samples[-1]["icl_response_2"] = leaf_node[0]["output"]
if "icl_query_3" not in samples[-1]:
samples[-1]["icl_query_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][
"instruction"
]
samples[-1]["icl_response_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][
"output"
]

return samples


def _skill_leaf_node_to_samples(leaf_node):
samples = []

# pylint: disable=consider-using-enumerate
for i in range(len(leaf_node)):
samples[-1].setdefault("task_description", leaf_node[i]["task_description"])
for field in ["document", "domain"]:
if leaf_node[i].get(field):
samples[-1].setdefault(field, leaf_node[i][field])
if samples[-1].get("document") and not samples[-1].get("domain"):
raise utils.GenerateException(
"Error: No domain provided for knowledge document in leaf node"
)
samples.append({})
samples[-1]["task_description"] = leaf_node[i]["task_description"]
if leaf_node[i].get("input"):
samples[-1].setdefault("context", leaf_node[i]["input"])
if "question_3" in samples[-1]:
samples.append({})
if "question_1" not in samples[-1]:
samples[-1]["question_1"] = leaf_node[i]["instruction"]
samples[-1]["response_1"] = leaf_node[i]["output"]
elif "question_2" not in samples[-1]:
samples[-1]["question_2"] = leaf_node[i]["instruction"]
samples[-1]["response_2"] = leaf_node[i]["output"]
else:
samples[-1]["question_3"] = leaf_node[i]["instruction"]
samples[-1]["response_3"] = leaf_node[i]["output"]

# wrap back around to the beginning if the number of examples was not
# evenly divisble by 3
if "question_2" not in samples[-1]:
samples[-1]["question_2"] = leaf_node[0]["instruction"]
samples[-1]["response_2"] = leaf_node[0]["output"]
if "question_3" not in samples[-1]:
samples[-1]["question_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][
"instruction"
]
samples[-1]["response_3"] = leaf_node[1 if len(leaf_node) > 1 else 0]["output"]
samples[-1]["seed_context"] = leaf_node[i]["input"]
samples[-1]["seed_question"] = leaf_node[i]["instruction"]
samples[-1]["seed_response"] = leaf_node[i]["output"]

return samples


def leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count):
    """Convert a taxonomy leaf node into a list of pipeline samples.

    An empty leaf node yields an empty list. A leaf node whose first entry
    has a truthy ``document`` is handled as knowledge (the chunking
    parameters are passed through); anything else is handled as a skill.
    """
    if not leaf_node:
        return []

    is_knowledge = leaf_node[0].get("document")
    if not is_knowledge:
        return _skill_leaf_node_to_samples(leaf_node)

    return _knowledge_leaf_node_to_samples(
        leaf_node, server_ctx_size, chunk_word_count
    )

0 comments on commit 45ecc73

Please sign in to comment.