diff --git a/src/instructlab/sdg/configs/knowledge/simple_generate_qa.yaml b/src/instructlab/sdg/configs/knowledge/simple_generate_qa.yaml index 9ad6fa77..c63b4209 100644 --- a/src/instructlab/sdg/configs/knowledge/simple_generate_qa.yaml +++ b/src/instructlab/sdg/configs/knowledge/simple_generate_qa.yaml @@ -15,14 +15,14 @@ Here are the requirements: examples: | Here are some examples to help you understand the type of questions that are asked for this document: - {question_1} - {response_1} + {icl_query_1} + {icl_response_1} - {question_2} - {response_2} + {icl_query_2} + {icl_response_2} - {question_3} - {response_3} + {icl_query_3} + {icl_response_3} Here is the document: {document} diff --git a/src/instructlab/sdg/configs/skills/simple_generate_qa_freeform.yaml b/src/instructlab/sdg/configs/skills/simple_generate_qa_freeform.yaml index 2913d7df..d584ac33 100644 --- a/src/instructlab/sdg/configs/skills/simple_generate_qa_freeform.yaml +++ b/src/instructlab/sdg/configs/skills/simple_generate_qa_freeform.yaml @@ -15,16 +15,10 @@ Here are the requirements: examples: | The task is {task_description}. - Here are some examples to help you understand the type of questions that are asked for: + Here is an example to help you understand the type of questions that are asked for: - {question_1} - {response_1} - - {question_2} - {response_2} - - {question_3} - {response_3} + {seed_question} + {seed_response} generation: | Provide a single question and answer pair based on the examples. diff --git a/src/instructlab/sdg/configs/skills/simple_generate_qa_grounded.yaml b/src/instructlab/sdg/configs/skills/simple_generate_qa_grounded.yaml index fe48c99c..2ac41a82 100644 --- a/src/instructlab/sdg/configs/skills/simple_generate_qa_grounded.yaml +++ b/src/instructlab/sdg/configs/skills/simple_generate_qa_grounded.yaml @@ -15,23 +15,17 @@ Here are the requirements: examples: | The task is {task_description}. 
- Here is some context for the example questions: + Here is some context for the example question: - {context} + {seed_context} - Here are some examples to help you understand the type of questions that are asked for: + Here is an example to help you understand the type of questions that are asked for: - {question_1} - {response_1} - - {question_2} - {response_2} - - {question_3} - {response_3} + {seed_question} + {seed_response} generation: | - Provide a single question and answer pair based on the examples. + Provide a single question and answer pair based on the example. start_tags: [""] end_tags: [""] diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 2c812361..66a2987e 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -30,7 +30,7 @@ SynthSkillsFlow, ) from instructlab.sdg.pipeline import Pipeline -from instructlab.sdg.utils import chunking, models +from instructlab.sdg.utils import models from instructlab.sdg.utils.taxonomy import ( leaf_node_to_samples, read_taxonomy_leaf_nodes, @@ -270,7 +270,7 @@ def generate_data( generated_data = None for leaf_node in leaf_nodes.values(): - samples = leaf_node_to_samples(leaf_node) + samples = leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count) if not samples: raise utils.GenerateException("Error: No samples found in leaf node.") @@ -290,16 +290,6 @@ def generate_data( "Error: No SDG pipeline for this leaf node type: %s" % samples[0] ) - # TODO this is broken, just trying to get initial integration to run - # pylint: disable=consider-using-enumerate - if samples[0].get("document"): - for i in range(len(samples)): - samples[i]["document"] = chunking.chunk_document( - documents=samples[i]["document"], - server_ctx_size=server_ctx_size, - chunk_word_count=chunk_word_count, - )[0] - # TODO -- there is a parameter for how many samples to generate, but we ignore it so far logger.debug("Samples: %s" % samples) diff --git 
# Post-patch leaf-node sampling helpers of src/instructlab/sdg/utils/taxonomy.py.
# They rely on two module-level imports of that file:
#   from instructlab.sdg import utils
#   from instructlab.sdg.utils import chunking


def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count):
    """Build knowledge samples: one per (document chunk, group of 3 examples).

    The document and domain are read from the first seed example only, since
    they are the same for the whole leaf node.  The document is split with
    ``chunking.chunk_document`` and, for every chunk, the seed examples are
    packed three at a time into the ``icl_query_N`` / ``icl_response_N``
    slots consumed by the knowledge prompt template.

    Args:
        leaf_node: non-empty sequence of seed-example dicts carrying
            ``task_description``, ``instruction`` and ``output`` keys; the
            first one also carries ``document`` and ``domain``.
        server_ctx_size: forwarded unchanged to ``chunking.chunk_document``.
        chunk_word_count: forwarded unchanged to ``chunking.chunk_document``.

    Returns:
        A list of sample dicts, each with ``task_description``, ``domain``,
        ``document`` and all three ICL query/response pairs.  The list is
        empty when there is no document or chunking yields nothing (instead
        of a single empty, unusable sample).

    Raises:
        utils.GenerateException: a document chunk exists but the leaf node
            provides no domain.
    """
    first_example = leaf_node[0]
    document = first_example.get("document")
    if not document:
        return []

    chunks = chunking.chunk_document(
        documents=document,
        server_ctx_size=server_ctx_size,
        chunk_word_count=chunk_word_count,
    )
    domain = first_example.get("domain")

    example_count = len(leaf_node)
    # Wrap-around fillers used when the number of examples is not evenly
    # divisible by 3: slot 2 is filled from the first example, slot 3 from
    # the second one (or the first again when there is only one example).
    slot_2_filler = first_example
    slot_3_filler = leaf_node[1] if example_count > 1 else first_example

    samples = []
    for chunk in chunks:
        if chunk and not domain:
            raise utils.GenerateException(
                "Error: No domain provided for knowledge document in leaf node"
            )
        for start in range(0, example_count, 3):
            group = [
                leaf_node[idx]
                for idx in range(start, min(start + 3, example_count))
            ]
            if len(group) < 2:
                group.append(slot_2_filler)
            if len(group) < 3:
                group.append(slot_3_filler)

            # Every sample is created complete, so each one is guaranteed to
            # carry the chunk it was generated for (never a neighbouring
            # chunk, and never no document at all).
            sample = {
                "task_description": group[0]["task_description"],
                "domain": domain,
                "document": chunk,
            }
            for slot, example in enumerate(group, start=1):
                sample[f"icl_query_{slot}"] = example["instruction"]
                sample[f"icl_response_{slot}"] = example["output"]
            samples.append(sample)

    return samples


def _skill_leaf_node_to_samples(leaf_node):
    """Build skill samples: exactly one sample per seed example.

    Args:
        leaf_node: sequence of seed-example dicts carrying
            ``task_description``, ``instruction`` and ``output`` keys and an
            optional ``input`` key.

    Returns:
        A list of dicts with ``task_description``, ``seed_question`` and
        ``seed_response``; ``seed_context`` is added only when the example
        has a non-empty ``input``.
    """
    samples = []
    for example in leaf_node:
        sample = {"task_description": example["task_description"]}
        if example.get("input"):
            sample["seed_context"] = example["input"]
        sample["seed_question"] = example["instruction"]
        sample["seed_response"] = example["output"]
        samples.append(sample)
    return samples


def leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count):
    """Convert a taxonomy leaf node into seed samples for the SDG pipelines.

    A leaf node whose first example has a ``document`` is treated as a
    knowledge node; any other non-empty leaf node is treated as a skill node.

    Args:
        leaf_node: sequence of seed-example dicts (may be empty).
        server_ctx_size: passed to document chunking (knowledge nodes only).
        chunk_word_count: passed to document chunking (knowledge nodes only).

    Returns:
        A list of sample dicts; empty when ``leaf_node`` is empty.
    """
    if not leaf_node:
        return []
    if leaf_node[0].get("document"):
        return _knowledge_leaf_node_to_samples(
            leaf_node, server_ctx_size, chunk_word_count
        )
    return _skill_leaf_node_to_samples(leaf_node)