-
Notifications
You must be signed in to change notification settings - Fork 41
Commit
This is part of #160. The changes here originated from aakankshaduggal@5baf6df.
There are two major changes here:
- When parsing a `qna.yaml` file from a taxonomy tree, adjust for the new schema for knowledge. There is no attempt to maintain compatibility with prior versions of the schema (v1, v2).
- Change how we translate the taxonomy data into the dataset sent into the pipeline as input. Instead of implementing a sliding-window approach of 3 sample qna pairs at a time over all chunks of the document, we now create a row per seed_example (context and associated qna pairs) for each chunk of knowledge docs.
Co-authored-by: abhi1092 <[email protected]>
Co-authored-by: shiv <[email protected]>
Co-authored-by: Aakanksha Duggal <[email protected]>
Signed-off-by: Russell Bryant <[email protected]>
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -335,28 +335,42 @@ def _read_taxonomy_file(file_path: str, yaml_rules: Optional[str] = None): | |
|
||
# get seed instruction data | ||
tax_path = "->".join(taxonomy_path.parent.parts) | ||
task_description = contents.get("task_description") | ||
task_description = contents.get("task_description", None) | ||
This comment has been minimized.
Sorry, something went wrong.
This comment has been minimized.
Sorry, something went wrong.
markmc
Contributor
|
||
domain = contents.get("domain") | ||
documents = contents.get("document") | ||
if documents: | ||
documents = _get_documents(source=documents) | ||
logger.debug("Content from git repo fetched") | ||
|
||
for seed_example in contents.get("seed_examples"): | ||
question = seed_example.get("question") | ||
answer = seed_example.get("answer") | ||
context = seed_example.get("context", "") | ||
seed_instruction_data.append( | ||
{ | ||
"instruction": question, | ||
"input": context, | ||
"output": answer, | ||
"taxonomy_path": tax_path, | ||
"task_description": task_description, | ||
"document": documents, | ||
"domain": domain, | ||
} | ||
) | ||
if "questions_and_answers" in seed_example: | ||
question_answer_list = seed_example.get("questions_and_answers") | ||
seed_instruction_data.append( | ||
{ | ||
"questions_and_answers": question_answer_list, | ||
"input": context, | ||
"taxonomy_path": tax_path, | ||
"document": documents, | ||
"domain": domain, | ||
"document_outline": contents.get("document_outline"), | ||
} | ||
) | ||
else: | ||
question = seed_example.get("question") | ||
answer = seed_example.get("answer") | ||
|
||
seed_instruction_data.append( | ||
{ | ||
"instruction": question, | ||
"input": context, | ||
"output": answer, | ||
"taxonomy_path": tax_path, | ||
"task_description": task_description, | ||
"document": documents, | ||
"domain": domain, | ||
} | ||
) | ||
except Exception as e: | ||
errors += 1 | ||
raise TaxonomyReadingException(f"Exception {e} raised in {file_path}") from e | ||
|
@@ -418,8 +432,7 @@ def read_taxonomy_leaf_nodes(taxonomy, taxonomy_base, yaml_rules): | |
|
||
|
||
def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count): | ||
samples = [{}] | ||
|
||
samples = [] | ||
# document is the same for the whole leaf node | ||
chunks = ( | ||
chunking.chunk_document( | ||
|
@@ -436,38 +449,24 @@ def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count | |
|
||
for chunk in chunks: | ||
# pylint: disable=consider-using-enumerate | ||
for i in range(len(leaf_node)): | ||
samples[-1].setdefault("task_description", leaf_node[i]["task_description"]) | ||
samples[-1].setdefault("domain", domain) | ||
samples[-1].setdefault("document", chunk) | ||
if samples[-1].get("document") and not samples[-1].get("domain"): | ||
raise utils.GenerateException( | ||
"Error: No domain provided for knowledge document in leaf node" | ||
) | ||
if "icl_query_3" in samples[-1]: | ||
samples.append({}) | ||
if "icl_query_1" not in samples[-1]: | ||
samples[-1]["icl_query_1"] = leaf_node[i]["instruction"] | ||
samples[-1]["icl_response_1"] = leaf_node[i]["output"] | ||
elif "icl_query_2" not in samples[-1]: | ||
samples[-1]["icl_query_2"] = leaf_node[i]["instruction"] | ||
samples[-1]["icl_response_2"] = leaf_node[i]["output"] | ||
else: | ||
samples[-1]["icl_query_3"] = leaf_node[i]["instruction"] | ||
samples[-1]["icl_response_3"] = leaf_node[i]["output"] | ||
|
||
# wrap back around to the beginning if the number of examples was not | ||
# evenly divisible by 3 | ||
if "icl_query_2" not in samples[-1]: | ||
samples[-1]["icl_query_2"] = leaf_node[0]["instruction"] | ||
samples[-1]["icl_response_2"] = leaf_node[0]["output"] | ||
if "icl_query_3" not in samples[-1]: | ||
samples[-1]["icl_query_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][ | ||
"instruction" | ||
] | ||
samples[-1]["icl_response_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][ | ||
"output" | ||
] | ||
for icl_ in leaf_node: | ||
icl_query = { | ||
f"icl_query_{idx+1}": val["question"] | ||
for idx, val in enumerate(icl_["questions_and_answers"]) | ||
} | ||
icl_resp = { | ||
f"icl_response_{idx+1}": val["answer"] | ||
for idx, val in enumerate(icl_["questions_and_answers"]) | ||
} | ||
samples_row = { | ||
"icl_document": icl_["input"], | ||
"document": chunk, | ||
"document_outline": icl_["document_outline"], | ||
"domain": domain, | ||
} | ||
samples_row.update(icl_query) | ||
samples_row.update(icl_resp) | ||
samples.append(samples_row) | ||
|
||
return samples | ||
|
||
|
Hi, I've been looking at the v3 schema and was under the impression that `task_description` shouldn't be used anymore. Is this correct?