Merge pull request #57 from russellb/chunking

Fix dataset formatting for pipeline differences
instructlab · Jul 1, 2024 · 45ecc73 · 45ecc73
2 parents 1f71fb6 + e606811
commit 45ecc73
Show file tree

Hide file tree

Showing 5 changed files with 89 additions and 71 deletions.
diff --git a/src/instructlab/sdg/configs/knowledge/simple_generate_qa.yaml b/src/instructlab/sdg/configs/knowledge/simple_generate_qa.yaml
@@ -15,14 +15,14 @@ Here are the requirements:
 examples: |
   Here are some examples to help you understand the type of questions that are asked for this document:
 
-  {question_1}
-  {response_1}
+  {icl_query_1}
+  {icl_response_1}
 
-  {question_2}
-  {response_2}
+  {icl_query_2}
+  {icl_response_2}
 
-  {question_3}
-  {response_3}
+  {icl_query_3}
+  {icl_response_3}
 
   Here is the document:
   {document}

diff --git a/src/instructlab/sdg/configs/skills/simple_generate_qa_freeform.yaml b/src/instructlab/sdg/configs/skills/simple_generate_qa_freeform.yaml
@@ -15,16 +15,10 @@ Here are the requirements:
 examples: |
   The task is {task_description}.
 
-  Here are some examples to help you understand the type of questions that are asked for:
+  Here is an example to help you understand the type of questions that are asked for:
 
-  {question_1}
-  {response_1}
-
-  {question_2}
-  {response_2}
-
-  {question_3}
-  {response_3}
+  {seed_question}
+  {seed_response}
 
 generation: |
   Provide a single question and answer pair based on the examples.

diff --git a/src/instructlab/sdg/configs/skills/simple_generate_qa_grounded.yaml b/src/instructlab/sdg/configs/skills/simple_generate_qa_grounded.yaml
@@ -15,23 +15,17 @@ Here are the requirements:
 examples: |
   The task is {task_description}.
 
-  Here is some context for the example questions:
+  Here is some context for the example question:
 
-  {context}
+  {seed_context}
 
-  Here are some examples to help you understand the type of questions that are asked for:
+  Here is an example to help you understand the type of questions that are asked for:
 
-  {question_1}
-  {response_1}
-
-  {question_2}
-  {response_2}
-
-  {question_3}
-  {response_3}
+  {seed_question}
+  {seed_response}
 
 generation: |
-  Provide a single question and answer pair based on the examples.
+  Provide a single question and answer pair based on the example.
 
 start_tags: [""]
 end_tags: [""]
diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
@@ -30,7 +30,7 @@
     SynthSkillsFlow,
 )
 from instructlab.sdg.pipeline import Pipeline
-from instructlab.sdg.utils import chunking, models
+from instructlab.sdg.utils import models
 from instructlab.sdg.utils.taxonomy import (
     leaf_node_to_samples,
     read_taxonomy_leaf_nodes,
@@ -270,7 +270,7 @@ def generate_data(
 
     generated_data = None
     for leaf_node in leaf_nodes.values():
-        samples = leaf_node_to_samples(leaf_node)
+        samples = leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count)
 
         if not samples:
             raise utils.GenerateException("Error: No samples found in leaf node.")
@@ -290,16 +290,6 @@ def generate_data(
                 "Error: No SDG pipeline for this leaf node type: %s" % samples[0]
             )
 
-        # TODO this is broken, just trying to get initial integration to run
-        # pylint: disable=consider-using-enumerate
-        if samples[0].get("document"):
-            for i in range(len(samples)):
-                samples[i]["document"] = chunking.chunk_document(
-                    documents=samples[i]["document"],
-                    server_ctx_size=server_ctx_size,
-                    chunk_word_count=chunk_word_count,
-                )[0]
-
         # TODO -- there is a parameter for how many samples to generate, but we ignore it so far
 
         logger.debug("Samples: %s" % samples)

diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py
@@ -19,6 +19,7 @@
 
 # First Party
 from instructlab.sdg import utils
+from instructlab.sdg.utils import chunking
 
 logger = logging.getLogger(__name__)
 
@@ -415,42 +416,81 @@ def read_taxonomy_leaf_nodes(taxonomy, taxonomy_base, yaml_rules):
     return leaf_nodes
 
 
-def leaf_node_to_samples(leaf_node):
+def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count):
     samples = [{}]
 
+    # document is the same for the whole leaf node
+    chunks = (
+        chunking.chunk_document(
+            documents=leaf_node[0]["document"],
+            server_ctx_size=server_ctx_size,
+            chunk_word_count=chunk_word_count,
+        )
+        if leaf_node[0].get("document")
+        else []
+    )
+
+    # domain is the same for the whole leaf node
+    domain = leaf_node[0].get("domain")
+
+    for chunk in chunks:
+        # pylint: disable=consider-using-enumerate
+        for i in range(len(leaf_node)):
+            samples[-1].setdefault("task_description", leaf_node[i]["task_description"])
+            samples[-1].setdefault("domain", domain)
+            samples[-1].setdefault("document", chunk)
+            if samples[-1].get("document") and not samples[-1].get("domain"):
+                raise utils.GenerateException(
+                    "Error: No domain provided for knowledge document in leaf node"
+                )
+            if "icl_query_3" in samples[-1]:
+                samples.append({})
+            if "icl_query_1" not in samples[-1]:
+                samples[-1]["icl_query_1"] = leaf_node[i]["instruction"]
+                samples[-1]["icl_response_1"] = leaf_node[i]["output"]
+            elif "icl_query_2" not in samples[-1]:
+                samples[-1]["icl_query_2"] = leaf_node[i]["instruction"]
+                samples[-1]["icl_response_2"] = leaf_node[i]["output"]
+            else:
+                samples[-1]["icl_query_3"] = leaf_node[i]["instruction"]
+                samples[-1]["icl_response_3"] = leaf_node[i]["output"]
+
+        # wrap back around to the beginning if the number of examples was not
+        # evenly divisible by 3
+        if "icl_query_2" not in samples[-1]:
+            samples[-1]["icl_query_2"] = leaf_node[0]["instruction"]
+            samples[-1]["icl_response_2"] = leaf_node[0]["output"]
+        if "icl_query_3" not in samples[-1]:
+            samples[-1]["icl_query_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][
+                "instruction"
+            ]
+            samples[-1]["icl_response_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][
+                "output"
+            ]
+
+    return samples
+
+
+def _skill_leaf_node_to_samples(leaf_node):
+    samples = []
+
     # pylint: disable=consider-using-enumerate
     for i in range(len(leaf_node)):
-        samples[-1].setdefault("task_description", leaf_node[i]["task_description"])
-        for field in ["document", "domain"]:
-            if leaf_node[i].get(field):
-                samples[-1].setdefault(field, leaf_node[i][field])
-        if samples[-1].get("document") and not samples[-1].get("domain"):
-            raise utils.GenerateException(
-                "Error: No domain provided for knowledge document in leaf node"
-            )
+        samples.append({})
+        samples[-1]["task_description"] = leaf_node[i]["task_description"]
         if leaf_node[i].get("input"):
-            samples[-1].setdefault("context", leaf_node[i]["input"])
-        if "question_3" in samples[-1]:
-            samples.append({})
-        if "question_1" not in samples[-1]:
-            samples[-1]["question_1"] = leaf_node[i]["instruction"]
-            samples[-1]["response_1"] = leaf_node[i]["output"]
-        elif "question_2" not in samples[-1]:
-            samples[-1]["question_2"] = leaf_node[i]["instruction"]
-            samples[-1]["response_2"] = leaf_node[i]["output"]
-        else:
-            samples[-1]["question_3"] = leaf_node[i]["instruction"]
-            samples[-1]["response_3"] = leaf_node[i]["output"]
-
-    # wrap back around to the beginning if the number of examples was not
-    # evenly divisble by 3
-    if "question_2" not in samples[-1]:
-        samples[-1]["question_2"] = leaf_node[0]["instruction"]
-        samples[-1]["response_2"] = leaf_node[0]["output"]
-    if "question_3" not in samples[-1]:
-        samples[-1]["question_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][
-            "instruction"
-        ]
-        samples[-1]["response_3"] = leaf_node[1 if len(leaf_node) > 1 else 0]["output"]
+            samples[-1]["seed_context"] = leaf_node[i]["input"]
+        samples[-1]["seed_question"] = leaf_node[i]["instruction"]
+        samples[-1]["seed_response"] = leaf_node[i]["output"]
 
     return samples
+
+
+def leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count):
+    if not leaf_node:
+        return []
+    if leaf_node[0].get("document"):
+        return _knowledge_leaf_node_to_samples(
+            leaf_node, server_ctx_size, chunk_word_count
+        )
+    return _skill_leaf_node_to_samples(leaf_node)