Skip to content

Commit

Permalink
utils: Update taxonomy reading code to handle knowledge v3
Browse files Browse the repository at this point in the history
This is part of #160

The changes here originated from aakankshaduggal@5baf6df

There are two major changes here.

- When parsing a `qna.yaml` file from a taxonomy tree, adjust for the
  new schema for knowledge. There is no attempt to maintain
  compatibility with prior versions of the schema (v1, v2).

- Change how we translate the taxonomy data into the dataset sent into
  the pipeline as input. Instead of implementing a sliding window
  approach of 3 sample qna pairs at a time over all chunks of the
  document, we now create a row per seed_example (context and
  associated qna pairs) for each chunk of knowledge docs.

Co-authored-by: abhi1092 <[email protected]>
Co-authored-by: shiv <[email protected]>
Co-authored-by: Aakanksha Duggal <[email protected]>
Signed-off-by: Russell Bryant <[email protected]>
  • Loading branch information
4 people committed Jul 22, 2024
1 parent 33abe1e commit 94a7a5e
Showing 1 changed file with 47 additions and 48 deletions.
95 changes: 47 additions & 48 deletions src/instructlab/sdg/utils/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,28 +335,42 @@ def _read_taxonomy_file(file_path: str, yaml_rules: Optional[str] = None):

# get seed instruction data
tax_path = "->".join(taxonomy_path.parent.parts)
task_description = contents.get("task_description")
task_description = contents.get("task_description", None)

This comment has been minimized.

Copy link
@connorgilchrist99

connorgilchrist99 Jul 24, 2024

Hi, I've been looking at the v3 schema and was under the impression that task_description shouldn't be used anymore. Is this correct?

This comment has been minimized.

Copy link
@markmc

markmc Jul 24, 2024

Contributor

Great question!

Looks like task_description is used by the v3 skills schema still: https://github.com/instructlab/schema/blob/main/src/instructlab/schema/v3/compositional_skills.json

and you can see below that it's only used for skills instruction data

Pretty confusing though!

Thanks for checking

This comment has been minimized.

Copy link
@connorgilchrist99

connorgilchrist99 Jul 24, 2024

Ah my bad, thanks for pointing that out! Do you know when an example knowledge qna.yaml will be coming out?
Thanks again!

This comment has been minimized.

Copy link
@markmc

markmc Jul 24, 2024

Contributor

https://github.com/instructlab/instructlab/blob/main/scripts/test-data/e2e-qna-knowledge.yaml is the only example I know of

instructlab/taxonomy#1253 tracks the fact we need to update docs, add new examples, etc. - but I don't think anyone is working on that this week at least

This comment has been minimized.

Copy link
@connorgilchrist99

connorgilchrist99 Jul 25, 2024

OK, thanks so much for your help! It's much appreciated

This comment has been minimized.

Copy link
@connorgilchrist99

connorgilchrist99 Jul 25, 2024

Really sorry for reaching out again, but we're still having trouble getting our knowledge qna.yaml to generate synthetic data. I have applied the formatting from the example you kindly provided and am still getting this error. Do you have any idea why this could be?

Error:
Screenshot 2024-07-25 100302

qna.yaml file: https://github.com/nicklamb97/taxonomy/blob/main/knowledge/programming_languages/actian_4gl/Language_Reference_Guide/3._Statements/EXEC_4GL_Statements_for_3GL/clear_array/qna.yaml

Any advice would be amazing! Thanks in advance

domain = contents.get("domain")
documents = contents.get("document")
if documents:
documents = _get_documents(source=documents)
logger.debug("Content from git repo fetched")

for seed_example in contents.get("seed_examples"):
question = seed_example.get("question")
answer = seed_example.get("answer")
context = seed_example.get("context", "")
seed_instruction_data.append(
{
"instruction": question,
"input": context,
"output": answer,
"taxonomy_path": tax_path,
"task_description": task_description,
"document": documents,
"domain": domain,
}
)
if "questions_and_answers" in seed_example:
question_answer_list = seed_example.get("questions_and_answers")
seed_instruction_data.append(
{
"questions_and_answers": question_answer_list,
"input": context,
"taxonomy_path": tax_path,
"document": documents,
"domain": domain,
"document_outline": contents.get("document_outline"),
}
)
else:
question = seed_example.get("question")
answer = seed_example.get("answer")

seed_instruction_data.append(
{
"instruction": question,
"input": context,
"output": answer,
"taxonomy_path": tax_path,
"task_description": task_description,
"document": documents,
"domain": domain,
}
)
except Exception as e:
errors += 1
raise TaxonomyReadingException(f"Exception {e} raised in {file_path}") from e
Expand Down Expand Up @@ -418,8 +432,7 @@ def read_taxonomy_leaf_nodes(taxonomy, taxonomy_base, yaml_rules):


def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count):
samples = [{}]

samples = []
# document is the same for the whole leaf node
chunks = (
chunking.chunk_document(
Expand All @@ -436,38 +449,24 @@ def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count

for chunk in chunks:
# pylint: disable=consider-using-enumerate
for i in range(len(leaf_node)):
samples[-1].setdefault("task_description", leaf_node[i]["task_description"])
samples[-1].setdefault("domain", domain)
samples[-1].setdefault("document", chunk)
if samples[-1].get("document") and not samples[-1].get("domain"):
raise utils.GenerateException(
"Error: No domain provided for knowledge document in leaf node"
)
if "icl_query_3" in samples[-1]:
samples.append({})
if "icl_query_1" not in samples[-1]:
samples[-1]["icl_query_1"] = leaf_node[i]["instruction"]
samples[-1]["icl_response_1"] = leaf_node[i]["output"]
elif "icl_query_2" not in samples[-1]:
samples[-1]["icl_query_2"] = leaf_node[i]["instruction"]
samples[-1]["icl_response_2"] = leaf_node[i]["output"]
else:
samples[-1]["icl_query_3"] = leaf_node[i]["instruction"]
samples[-1]["icl_response_3"] = leaf_node[i]["output"]

# wrap back around to the beginning if the number of examples was not
# evenly divisible by 3
if "icl_query_2" not in samples[-1]:
samples[-1]["icl_query_2"] = leaf_node[0]["instruction"]
samples[-1]["icl_response_2"] = leaf_node[0]["output"]
if "icl_query_3" not in samples[-1]:
samples[-1]["icl_query_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][
"instruction"
]
samples[-1]["icl_response_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][
"output"
]
for icl_ in leaf_node:
icl_query = {
f"icl_query_{idx+1}": val["question"]
for idx, val in enumerate(icl_["questions_and_answers"])
}
icl_resp = {
f"icl_response_{idx+1}": val["answer"]
for idx, val in enumerate(icl_["questions_and_answers"])
}
samples_row = {
"icl_document": icl_["input"],
"document": chunk,
"document_outline": icl_["document_outline"],
"domain": domain,
}
samples_row.update(icl_query)
samples_row.update(icl_resp)
samples.append(samples_row)

return samples

Expand Down

0 comments on commit 94a7a5e

Please sign in to comment.