utils: Update taxonomy reading code to handle knowledge v3

This is part of #160. The changes here originated from aakankshaduggal@5baf6df. There are two major changes here. - When parsing a `qna.yaml` file from a taxonomy tree, adjust for the new schema for knowledge. There is no attempt to maintain compatibility with prior versions of the schema (v1, v2). - Change how we translate the taxonomy data into the dataset sent into the pipeline as input. Instead of implementing a sliding window approach of 3 sample qna pairs at a time over all chunks of the document, we now create a row per seed_example (context and associated qna pairs) for each chunk of knowledge docs. Co-authored-by: abhi1092 <[email protected]> Co-authored-by: shiv <[email protected]> Co-authored-by: Aakanksha Duggal <[email protected]> Signed-off-by: Russell Bryant <[email protected]>
instructlab · Jul 22, 2024 · 94a7a5e · connorgilchrist99 · Jul 24, 2024 · markmc
1 parent 33abe1e
commit 94a7a5e
Showing 1 changed file with 47 additions and 48 deletions.
diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py
@@ -335,28 +335,42 @@ def _read_taxonomy_file(file_path: str, yaml_rules: Optional[str] = None):
 
         # get seed instruction data
         tax_path = "->".join(taxonomy_path.parent.parts)
-        task_description = contents.get("task_description")
+        task_description = contents.get("task_description", None)
         domain = contents.get("domain")
         documents = contents.get("document")
         if documents:
             documents = _get_documents(source=documents)
             logger.debug("Content from git repo fetched")
 
         for seed_example in contents.get("seed_examples"):
-            question = seed_example.get("question")
-            answer = seed_example.get("answer")
             context = seed_example.get("context", "")
-            seed_instruction_data.append(
-                {
-                    "instruction": question,
-                    "input": context,
-                    "output": answer,
-                    "taxonomy_path": tax_path,
-                    "task_description": task_description,
-                    "document": documents,
-                    "domain": domain,
-                }
-            )
+            if "questions_and_answers" in seed_example:
+                question_answer_list = seed_example.get("questions_and_answers")
+                seed_instruction_data.append(
+                    {
+                        "questions_and_answers": question_answer_list,
+                        "input": context,
+                        "taxonomy_path": tax_path,
+                        "document": documents,
+                        "domain": domain,
+                        "document_outline": contents.get("document_outline"),
+                    }
+                )
+            else:
+                question = seed_example.get("question")
+                answer = seed_example.get("answer")
+
+                seed_instruction_data.append(
+                    {
+                        "instruction": question,
+                        "input": context,
+                        "output": answer,
+                        "taxonomy_path": tax_path,
+                        "task_description": task_description,
+                        "document": documents,
+                        "domain": domain,
+                    }
+                )
     except Exception as e:
         errors += 1
         raise TaxonomyReadingException(f"Exception {e} raised in {file_path}") from e
@@ -418,8 +432,7 @@ def read_taxonomy_leaf_nodes(taxonomy, taxonomy_base, yaml_rules):
 
 
 def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count):
-    samples = [{}]
-
+    samples = []
     # document is the same for the whole leaf node
     chunks = (
         chunking.chunk_document(
@@ -436,38 +449,24 @@ def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count
 
     for chunk in chunks:
         # pylint: disable=consider-using-enumerate
-        for i in range(len(leaf_node)):
-            samples[-1].setdefault("task_description", leaf_node[i]["task_description"])
-            samples[-1].setdefault("domain", domain)
-            samples[-1].setdefault("document", chunk)
-            if samples[-1].get("document") and not samples[-1].get("domain"):
-                raise utils.GenerateException(
-                    "Error: No domain provided for knowledge document in leaf node"
-                )
-            if "icl_query_3" in samples[-1]:
-                samples.append({})
-            if "icl_query_1" not in samples[-1]:
-                samples[-1]["icl_query_1"] = leaf_node[i]["instruction"]
-                samples[-1]["icl_response_1"] = leaf_node[i]["output"]
-            elif "icl_query_2" not in samples[-1]:
-                samples[-1]["icl_query_2"] = leaf_node[i]["instruction"]
-                samples[-1]["icl_response_2"] = leaf_node[i]["output"]
-            else:
-                samples[-1]["icl_query_3"] = leaf_node[i]["instruction"]
-                samples[-1]["icl_response_3"] = leaf_node[i]["output"]
-
-        # wrap back around to the beginning if the number of examples was not
-        # evenly divisble by 3
-        if "icl_query_2" not in samples[-1]:
-            samples[-1]["icl_query_2"] = leaf_node[0]["instruction"]
-            samples[-1]["icl_response_2"] = leaf_node[0]["output"]
-        if "icl_query_3" not in samples[-1]:
-            samples[-1]["icl_query_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][
-                "instruction"
-            ]
-            samples[-1]["icl_response_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][
-                "output"
-            ]
+        for icl_ in leaf_node:
+            icl_query = {
+                f"icl_query_{idx+1}": val["question"]
+                for idx, val in enumerate(icl_["questions_and_answers"])
+            }
+            icl_resp = {
+                f"icl_response_{idx+1}": val["answer"]
+                for idx, val in enumerate(icl_["questions_and_answers"])
+            }
+            samples_row = {
+                "icl_document": icl_["input"],
+                "document": chunk,
+                "document_outline": icl_["document_outline"],
+                "domain": domain,
+            }
+            samples_row.update(icl_query)
+            samples_row.update(icl_resp)
+            samples.append(samples_row)
 
     return samples