diff --git a/pyproject.toml b/pyproject.toml index 26632e85..433f9e8a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,6 +96,9 @@ disable_error_code = ["import-not-found", "import-untyped"] exclude = [ "^src/instructlab/sdg/generate_data\\.py$", "^src/instructlab/sdg/utils\\.py$", + "^src/instructlab/sdg/default_flows\\.py$", + "^src/instructlab/sdg/llmblock\\.py$", + "^src/instructlab/sdg/utilblocks\\.py$", ] # honor excludes by not following there through imports follow_imports = "silent" diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py new file mode 100644 index 00000000..01232e27 --- /dev/null +++ b/scripts/test_freeform_skills.py @@ -0,0 +1,59 @@ +# Third Party +from datasets import Dataset +from openai import OpenAI + +# First Party +from src.instructlab.sdg import SDG +from src.instructlab.sdg.default_flows import SynthSkillsFlow +from src.instructlab.sdg.pipeline import Pipeline + +# for vLLM endpoints, the api_key remains "EMPTY" +openai_api_key = "EMPTY" +openai_api_base = "Add model endpoint here" + + +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + +models = client.models.list() +teacher_model = models.data[0].id + +samples = [ + { + "seed_question": """Could you help me write a formal email to inquire about the progress of my internship application?""", + "task_description": "Writing formal emails", + "seed_response": """Subject: Inquiry Regarding the Status of My Internship Application + +Dear [Recipient's Name], + +I hope this email finds you well. I am writing to inquire about the current status of my internship application with [Company Name]. I submitted my application on [date of application] for the [Internship Title] position. + +I am very interested in the opportunity to learn and grow as an intern at [Company Name], and I am eager to contribute my skills and enthusiasm to your team. I understand that the internship selection process may take some time, and I appreciate your consideration of my application. + +If there are any additional steps I need to take or further information you require from me, please let me know. I am more than happy to provide any necessary documentation or complete additional tasks to facilitate the decision-making process. + +I am excited about the prospect of joining [Company Name] and contributing to the [specific project, team, or aspect of the company] based on my background in [mention relevant skills or experiences]. I am confident that this internship will provide me with valuable experience and growth opportunities. + +Thank you for your time and consideration. I look forward to hearing from you regarding the next steps in the internship application process. 
+ +Sincerely, + +[Your Full Name] + +[Your Contact Information]""", + } +] + + +ds = Dataset.from_list(samples) + +skills_flow = SynthSkillsFlow(client, teacher_model).get_flow() +skills_pipe = Pipeline(skills_flow) + +sdg = SDG([skills_pipe]) +gen_data = sdg.generate(ds) + +print(gen_data) +print(gen_data[0]) diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py new file mode 100644 index 00000000..e800d3b2 --- /dev/null +++ b/scripts/test_knowledge.py @@ -0,0 +1,47 @@ +# Third Party +from datasets import Dataset +from openai import OpenAI + +# First Party +from src.instructlab.sdg import SDG +from src.instructlab.sdg.default_flows import MMLUBenchFlow, SynthKnowledgeFlow +from src.instructlab.sdg.pipeline import Pipeline + +# Please don't add your vLLM endpoint key here; for vLLM endpoints, the api_key remains "EMPTY" +openai_api_key = "EMPTY" +openai_api_base = "Add model endpoint here" + +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + +models = client.models.list() +teacher_model = models.data[0].id + +samples = [ + { + "question_1": "What is the location of the tubal tonsils?", + "response_1": "The location of the tubal tonsils is the roof of the pharynx.", + "question_2": "How long does the adenoid grow?", + "task_description": "Teaching about human anatomy, specifically tonsils", + "response_2": "The adenoid grows until the age of 5, starts to shrink at the age of 7 and becomes small in adulthood.", + "question_3": "What is the immune system's first line of defense against ingested or inhaled foreign pathogens?", + "response_3": "The tonsils are the immune system's first line of defense.", + "document": "The **tonsils** are a set of lymphoid organs facing into the aerodigestive tract, which is known as Waldeyer's tonsillar ring and consists of the adenoid tonsil or pharyngeal tonsil, two tubal tonsils, two palatine tonsils, and the lingual tonsils. These organs play an important role in the immune system. When used unqualified, the term most commonly refers specifically to the palatine tonsils, which are two lymphoid organs situated at either side of the back of the human throat. 
The palatine tonsils and the adenoid tonsil are organs consisting of lymphoepithelial tissue located near the oropharynx and nasopharynx parts of the throat", + "domain": "textbook", + } +] + +ds = Dataset.from_list(samples) + +mmlu_flow = MMLUBenchFlow(client, teacher_model).get_flow() +knowledge_flow = SynthKnowledgeFlow(client, teacher_model).get_flow() +knowledge_pipe = Pipeline(knowledge_flow) +mmlu_pipe = Pipeline(mmlu_flow) + +sdg = SDG([mmlu_pipe, knowledge_pipe]) +mmlubench_data = sdg.generate(ds) + +print(mmlubench_data) +print(mmlubench_data[0]) diff --git a/src/instructlab/__init__.py b/src/instructlab/__init__.py index 8db66d3d..9f37e8d6 100644 --- a/src/instructlab/__init__.py +++ b/src/instructlab/__init__.py @@ -1 +1,2 @@ +# SPDX-License-Identifier: Apache-2.0 __path__ = __import__("pkgutil").extend_path(__path__, __name__) diff --git a/src/instructlab/sdg/__init__.py b/src/instructlab/sdg/__init__.py index e69de29b..51e54418 100644 --- a/src/instructlab/sdg/__init__.py +++ b/src/instructlab/sdg/__init__.py @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: Apache-2.0 +# Local +from .sdg import SDG diff --git a/src/instructlab/sdg/block.py b/src/instructlab/sdg/block.py new file mode 100644 index 00000000..09433f55 --- /dev/null +++ b/src/instructlab/sdg/block.py @@ -0,0 +1,48 @@ +# SPDX-License-Identifier: Apache-2.0 +# Standard +from abc import ABC +from collections import ChainMap +from typing import Any, Dict, Union + +# Third Party +import yaml + +# Local +from .logger_config import setup_logger + +logger = setup_logger(__name__) + + +class Block(ABC): + def __init__(self, block_name: str) -> None: + self.block_name = block_name + + @staticmethod + def _validate(prompt_template: str, input_dict: Dict[str, Any]) -> bool: + """ + Validate the input data for this block by checking that every placeholder + in the prompt template can be resolved from the input dictionary. + + :return: True if the input data is valid, False otherwise. + """ + + class Default(dict): + def __missing__(self, key: str) -> None: + raise KeyError(key) + + try: + prompt_template.format_map(ChainMap(input_dict, Default())) + return True + except KeyError as e: + logger.error("Missing key: {}".format(e)) + return False + + def _load_config(self, config_path: str) -> Union[Dict[str, Any], None]: + """ + Load the configuration file for this block. + + :param config_path: The path to the configuration file. + :return: The loaded configuration. + """ + with open(config_path, "r", encoding="utf-8") as config_file: + return yaml.safe_load(config_file) diff --git a/src/instructlab/sdg/configs/__init__.py b/src/instructlab/sdg/configs/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/instructlab/sdg/configs/knowledge/__init__.py b/src/instructlab/sdg/configs/knowledge/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/instructlab/sdg/configs/knowledge/evaluate_faithfulness.yaml b/src/instructlab/sdg/configs/knowledge/evaluate_faithfulness.yaml new file mode 100644 index 00000000..828bb31f --- /dev/null +++ b/src/instructlab/sdg/configs/knowledge/evaluate_faithfulness.yaml @@ -0,0 +1,69 @@ +system: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task. + +introduction: | + Determine if the provided information is corroborated by the given context. Respond with YES if the context substantiates the information, even partially. Answer NO if the context does not support the information. 
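+# Note: LLMBlock joins the system, introduction, principles, examples, and generation sections of this file into a single prompt; start_tags/end_tags tell the parser how to extract the output columns.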
+ +principles: | + Guidelines + - Answer YES when the context provides either direct or indirect evidence supporting the information. Indirect evidence may include contextual implications or inferred connections that reasonably support the information. + - Answer NO if the context lacks any supportive evidence, clearly contradicts the information, or if the support provided by the context is too vague or speculative to establish a solid connection to the information. + - Avoid using "partially" in your response. If the context provides any reasonable support (direct or indirect) for the information, consider it as a YES. + + Strictly answer in this format: + [Start of Context] + ... + [End of Context] + [Start of Response] + ... + [End of Response] + [Start of Explanation] + ... + [End of Explanation] + [Start of Answer] + ... + [End of Answer] + +examples: | + Example 1: + [Start of Context] + An apple pie is a fruit pie with apples as the main filling. It's often served with whipped cream, ice cream, custard, or cheddar cheese. Typically, it has a double crust, with pastry above and below the filling. The upper crust can be solid or latticed. + [End of Context] + [Start of Response] + Apple pie is generally double-crusted. + [End of Response] + [Start of Explanation] + The context directly supports the information by stating that an apple pie typically "has a double crust, with pastry above and below the filling," which matches the response. + [End of Explanation] + [Start of Answer] + YES + [End of Answer] + + Example 2: + [Start of Context] + An apple pie is a fruit pie with apples as the main filling. It's often served with whipped cream, ice cream, custard, or cheddar cheese. Typically, it has a double crust, with pastry above and below the filling. The upper crust can be solid or latticed. + [End of Context] + [Start of Response] + Apple pies taste bad. + [End of Response] + [Start of Explanation] + The context does not provide any information about the taste of apple pies. The statement "Apple pies taste bad" is a subjective opinion and is not supported or mentioned in the given context. + [End of Explanation] + [Start of Answer] + NO + [End of Answer] + +generation: | + Now, based on the above examples and guidelines, determine if the following information is supported by the context provided. Answer YES or NO. + * Return the explanation within the [Start of Explanation] and [End of Explanation] tags. + * Return the answer between [Start of Answer] and [End of Answer] tags. + + [Start of Context] + {document} + [End of Context] + [Start of Response] + {response} + [End of Response] + +start_tags: ["[Start of Explanation]", "[Start of Answer]"] +end_tags: ["[End of Explanation]", "[End of Answer]"] \ No newline at end of file diff --git a/src/instructlab/sdg/configs/knowledge/evaluate_question.yaml b/src/instructlab/sdg/configs/knowledge/evaluate_question.yaml new file mode 100644 index 00000000..3505e23c --- /dev/null +++ b/src/instructlab/sdg/configs/knowledge/evaluate_question.yaml @@ -0,0 +1,39 @@ +system: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task. + +introduction: | + Given the question below, verify whether it meets the requirements that follow, and based on them give a rating of 1 if it meets all of them or 0 otherwise. 
+ +principles: | + Here are the requirements: + + Non-Referential Clarity and Contextual Independence: Ensure that the question is self-explanatory and does not rely on specific, unprovided external content, such as particular documents, specific tables, or detailed datasets. The question should be structured to be understandable and clear without requiring direct access to or knowledge of these specific external sources. + + Subject-Aware Completeness: The question should be crafted to be answerable on its own, given a reasonable level of specialized knowledge in the relevant subject area. It is acceptable and encouraged for the question to require specialized understanding pertinent to the topic; however, it should not depend on unique, external information not provided in the question itself. This distinction allows for questions that necessitate a deep understanding of a subject while ensuring they are not tied to specific external content like a particular dataset or a line in a document. + + Please give your answer as a short explanation followed by a rating of either 0 or 1, as below. + + * Return a short explanation within the [Start of Explanation] and [End of Explanation] tags. + * Return the rating on a binary 0/1 scale between [Start of Rating] and [End of Rating] tags. + + [Start of Question] + ... + [End of Question] + + [Start of Explanation] + ... + [End of Explanation] + + [Start of Rating] + ... + [End of Rating] + +examples: "" + +generation: | + + [Start of Question] + {question} + [End of Question] + +start_tags: ["[Start of Explanation]", "[Start of Rating]"] +end_tags: ["[End of Explanation]", "[End of Rating]"] diff --git a/src/instructlab/sdg/configs/knowledge/evaluate_relevancy.yaml b/src/instructlab/sdg/configs/knowledge/evaluate_relevancy.yaml new file mode 100644 index 00000000..81f1d666 --- /dev/null +++ b/src/instructlab/sdg/configs/knowledge/evaluate_relevancy.yaml @@ -0,0 +1,85 @@ +system: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task. + +introduction: | + Your task is to assess the relevance of a given response to a specific query. This evaluation should be conducted methodically by answering two key questions: + +principles: | + 1. Subject Matter Relevance: Does the provided response accurately match the subject matter of the user's query? This question aims to determine if the response is directly related to the main topic or issue presented in the query. + 2. Focus and Perspective Addressing: Does the provided response effectively address the focus or perspective on the subject matter as outlined in the user's query? This question seeks to evaluate whether the response not only matches the subject matter but also aligns with the specific angle or concern raised by the user. + + For each question, assign a score of 1 point if the response meets the criteria, and 0 points if it does not. After evaluating each question, provide detailed feedback explaining your reasoning behind the scores awarded. + + Conclude your evaluation with a final score between the [Start of Score] and [End of Score] tags. The total score should represent the sum of points assigned for each question, with a maximum possible score of 2 points. + Only evaluate the response based on the above criteria, do not create new questions. + +examples: | + Example 1: + [Start of Question] + What is the impact of global warming on polar bears? 
+ [End of Question] + + [Start of Response] + Global warming leads to melting ice caps, reducing the habitat of polar bears and negatively impacting their hunting grounds. + [End of Response] + + [Start of Feedback] + - Subject Matter Relevance Score: 1 (The response is directly related to the impact of global warming on polar bears.) + - Alignment with Query's Focus Score: 1 (The response specifically addresses how global warming affects polar bears' habitat and hunting grounds.) + [End of Feedback] + + [Start of Score] + 2 + [End of Score] + + Example 2: + [Start of Question] + How does photosynthesis work? + [End of Question] + + [Start of Response] + Plants require sunlight and water to grow. + [End of Response] + + [Start of Feedback] + - Subject Matter Relevance Score: 0 (The response is related to plant growth, but does not specifically address the process of photosynthesis.) + - Alignment with Query's Focus Score: 0 (The response fails to detail the photosynthesis process, missing the specific focus of the query.) + [End of Feedback] + + [Start of Score] + 0 + [End of Score] + + + Example 3: + [Start of Question] + What are the benefits of electric vehicles? + [End of Question] + + [Start of Response] + Electric vehicles reduce dependency on fossil fuels and decrease greenhouse gas emissions. + [End of Response] + + [Start of Feedback] + - Subject Matter Relevance Score: 1 (The response matches the query's subject on the benefits of electric vehicles.) + - Alignment with Query's Focus Score: 1 (The response effectively addresses the environmental benefits of electric vehicles, aligning with the query's focus.) + [End of Feedback] + + [Start of Score] + 2 + [End of Score] + +generation: | + Begin your response by providing the feedback followed by the score. Be as objective as possible. + + [Start of Question] + {question} + [End of Question] + + [Start of Response] + {response} + [End of Response] + + * Return the feedback within the [Start of Feedback] and [End of Feedback] tags. + * Return the final score between [Start of Score] and [End of Score] tags. +start_tags: ["[Start of Feedback]", "[Start of Score]"] +end_tags: ["[End of Feedback]", "[End of Score]"] \ No newline at end of file diff --git a/src/instructlab/sdg/configs/knowledge/generate_questions_responses.yaml b/src/instructlab/sdg/configs/knowledge/generate_questions_responses.yaml new file mode 100644 index 00000000..b424f517 --- /dev/null +++ b/src/instructlab/sdg/configs/knowledge/generate_questions_responses.yaml @@ -0,0 +1,78 @@ +system: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task. + +introduction: Develop a series of educational question and answer pairs from a chapter in a {domain} textbook. + +principles: | + The questions should: + * Be self-contained, not requiring references to tables, figures, or specific sections in the text for understanding. + * Focus on teaching and reinforcing the key knowledge and concepts presented in the chapter. + * Avoid sections with minimal educational content like index pages or prefaces. In such cases, respond with [UNANSWERABLE]. + * Be directly relevant to the textbook's domain. For instance, in a science textbook, questions should revolve around scientific terms, definitions, and practical applications, while in a legal textbook, they should cover legal principles, case law, and precedents. + * Be formulated to allow for independent answers, avoiding direct references to specific theorems or text sections. 
For example, rather than asking 'Under what conditions is the fixed point of a function unique according to Theorem 3.1.5?', ask 'How does the Fixed Point Iteration method contribute to understanding function uniqueness?' + * Span a range of difficulty levels to accommodate a diverse student audience, from basic understanding to advanced comprehension. + * Include a variety of question types such as multiple-choice for basic recall, short answer for deeper understanding, and essay or problem-solving questions to test application and analysis skills. + * Align closely with the learning objectives of the textbook or the specific chapter, ensuring that the questions test the fundamental concepts and skills that the chapter aims to impart. + + Strictly follow this format for each question-answer pair you generate when responding: + + [Start of Question] + ... + [End of Question] + [Start of Response] + ... + [End of Response] + + Each question and answer pair should stand alone as a mini-lesson, encapsulating a key concept or idea from the chapter in a way that is accessible and informative without requiring the reader to refer back to the textbook. + +examples: | + Here are some examples of questions: + + [Start of Question] + Explain the process of photosynthesis in plants. Include in your answer the roles of chlorophyll, light, water, and carbon dioxide, and describe how oxygen and glucose are produced. + [End of Question] + [Start of Response] + Photosynthesis is the process by which plants, algae, and some bacteria use sunlight to synthesize food from carbon dioxide and water. Photosynthesis in plants primarily occurs in the leaves, specifically in the chloroplasts. Chlorophyll, the green pigment in chloroplasts, absorbs light energy, which is then used to convert carbon dioxide (from the air) and water (from the soil) into glucose, a simple sugar. This process also releases oxygen as a byproduct. Light energy splits water molecules, releasing electrons and hydrogen ions and forming oxygen. The light-dependent reactions convert light energy into chemical energy (ATP and NADPH), which is used in the light-independent reactions (Calvin cycle) to convert carbon dioxide into glucose. The overall result is the conversion of solar energy into chemical energy in the form of glucose, which plants use for growth and development. + [End of Response] + + [Start of Question] + In a study on the effects of temperature on enzyme activity, an enzyme exhibited its highest activity at 37°C. At both higher and lower temperatures, its activity decreased. Based on this information, which of the following best explains the enzyme's behavior? + Options: + a) Enzymes are temperature-sensitive and can denature at high temperatures, losing their functional shape, while at low temperatures, their reaction rates decrease due to reduced molecular motion. + b) Enzymes are more effective at higher temperatures as increased heat provides more energy for reactions, and lower temperatures cause enzymes to become more active due to enhanced molecular stability. + c) The enzyme's behavior is unrelated to temperature; instead, it is likely due to changes in pH levels, which affect enzyme activity. + d) All enzymes universally work best at exactly 37°C, as this is the standard temperature for all biochemical reactions in nature. 
+ [End of Question] + [Start of Response] + a) Enzymes are temperature-sensitive and can denature at high temperatures, losing their functional shape, while at low temperatures, their reaction rates decrease due to reduced molecular motion. + [End of Response] + + For this {domain} domain, here are some sample questions: + [Start of Question] + {question_1} + [End of Question] + [Start of Response] + {response_1} + [End of Response] + + [Start of Question] + {question_2} + [End of Question] + [Start of Response] + {response_2} + [End of Response] + + [Start of Question] + {question_3} + [End of Question] + [Start of Response] + {response_3} + [End of Response] + + Here is the document: + {document} + +generation: | + Now generate the question and answer pairs; remember to follow the principles mentioned above and use the same style and format as the examples. Return each question between [Start of Question] and [End of Question] tags and each answer between [Start of Response] and [End of Response] tags. + +start_tags: ["[Start of Question]", "[Start of Response]"] +end_tags: ["[End of Question]", "[End of Response]"] diff --git a/src/instructlab/sdg/configs/knowledge/mcq_generation.yaml b/src/instructlab/sdg/configs/knowledge/mcq_generation.yaml new file mode 100644 index 00000000..091001c5 --- /dev/null +++ b/src/instructlab/sdg/configs/knowledge/mcq_generation.yaml @@ -0,0 +1,82 @@ +system: You are a helpful assistant that is an expert at generating questions and answers based on the given guidelines. + +introduction: Create a series of multiple choice questions by following the given guidelines. + +principles: | + Guidelines for generation: + * Create multiple choice questions based on the data presented in the documents provided. + * Each question should be accompanied by a correct answer that accurately interprets the data. + * Ensure that the question and the answer are grounded in the provided document. + * Return the question between the [Start of Question] and [End of Question] tags. + * Return the answer within the [Start of Answer] and [End of Answer] tags. + + Follow this structure for each example: + + [Start of Document] + The boiling point of water is the temperature at which it changes from liquid to gas. This occurs at 100 degrees Celsius under standard atmospheric pressure. + [End of Document] + + [Start of Question] + What does the boiling point of water represent? + + A) Solidification + B) Evaporation + C) Condensation + D) Freezing + [End of Question] + + [Start of Answer] + B) Evaporation + [End of Answer] + +examples: | + + Example 1: + [Start of Document] + Photosynthesis is a process used by plants, algae, and certain bacteria to convert light energy into chemical energy. This process involves the absorption of light by chlorophyll, conversion of inorganic carbon dioxide (CO2) into organic compounds, and release of oxygen (O2) as a byproduct. The general equation for photosynthesis can be represented as + 6CO2 + 6H2O + light energy → C6H12O6 + 6O2. + [End of Document] + + [Start of Question] + What is the primary function of photosynthesis in plants? 
+ + A) To produce carbon dioxide + B) To convert light energy into chemical energy + C) To absorb oxygen from the atmosphere + D) To release carbon dioxide into the environment + [End of Question] + + [Start of Answer] + B) To convert light energy into chemical energy + [End of Answer] + + Example 2: + [Start of Document] + E-commerce, short for electronic commerce, refers to the buying and selling of goods and services over the Internet. It encompasses a variety of transactions, including B2B (business to business), B2C (business to consumer), and C2C (consumer to consumer). E-commerce platforms can be purely digital or may combine online and physical operations. + [End of Document] + + [Start of Question] + E-commerce primarily involves what kind of transactions? + + A) Digital + B) Local + C) Manual + D) Verbal + [End of Question] + + [Start of Answer] + A) Digital + [End of Answer] + +generation: | + Follow the guidelines and structure given above to create a series of multiple choice questions, along with correct answers, based on the provided document. + * Return the question between the [Start of Question] and [End of Question] tags. + * Return the answer within the [Start of Answer] and [End of Answer] tags. + + Here is the document: + [Start of Document] + {document} + [End of Document] + +start_tags: ["[Start of Question]", "[Start of Answer]"] +end_tags: ["[End of Question]", "[End of Answer]"] \ No newline at end of file diff --git a/src/instructlab/sdg/configs/skills/__init__.py b/src/instructlab/sdg/configs/skills/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/instructlab/sdg/configs/skills/contexts.yaml b/src/instructlab/sdg/configs/skills/contexts.yaml new file mode 100644 index 00000000..e6a40b4d --- /dev/null +++ b/src/instructlab/sdg/configs/skills/contexts.yaml @@ -0,0 +1,23 @@ +system: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task. + +introduction: You are asked to come up with a diverse context for - {task_description}. +principles: | + Please follow these guiding principles when generating responses: + * Use proper grammar and punctuation. + * Always generate safe and respectful content. Do not generate content that is harmful, abusive, or offensive. + * Always generate content that is factually accurate and relevant to the prompt. + * Strictly adhere to the prompt and generate responses in the same style and format as the example. + * Return the context between [Start of Context] and [End of Context] tags. +examples: | + To better assist you with this task, here is an example of a context: + [Start of Context] + {seed_context} + [End of Context] + +generation: | + Now generate a context paragraph; remember to follow the principles mentioned above and use the same style and format as the example above. Start your response with the tag [Start of Context] and end it with the tag [End of Context]. + +start_tags: ["[Start of Context]"] +end_tags: ["[End of Context]"] + +sys_prompt: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task. 
diff --git a/src/instructlab/sdg/configs/skills/evaluate_freeform_pair.yaml b/src/instructlab/sdg/configs/skills/evaluate_freeform_pair.yaml new file mode 100644 index 00000000..cd2921cb --- /dev/null +++ b/src/instructlab/sdg/configs/skills/evaluate_freeform_pair.yaml @@ -0,0 +1,45 @@ +system: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task. + +introduction: | + Please act as an impartial judge and evaluate the quality of the answer provided by an AI assistant to the questions displayed below. Evaluate whether or not the answer is a good example of how an AI Assistant should respond to the user's instruction. Please assign a score using the following 3-point scale. +principles: | + 1: It means the answer is incorrect, irrelevant, unsafe or provides incomplete and garbage information. For instance, the answer may be factually wrong, off-topic, or filled with irrelevant content that doesn't address the user's question or it could be incomplete and hanging. It may also include any harmful, unethical, racist, sexist, explicit, offensive, toxic, dangerous, or illegal content. + + 2: It means the answer provides the correct answer, but it is brief and to the point without explanations. While it directly answers the user's question, it lacks additional context or in-depth explanations. + + 3: It means the answer is a perfect answer from an AI Assistant. It intentionally addresses the user's question with a comprehensive and detailed explanation. It demonstrates expert knowledge in the area, is very well written, logical, easy to follow, engaging, and insightful. And the answer is safe and does not include any harmful content. + +examples: | + [Start of Question] + Take the role of the joker. Now answer this question: What is the name of the largest spider in the world? + [End of Question] + + [Start of Answer] + Ah, my friend, that would be the Goliath birdeater! With a leg span of up to 12 inches, it's the largest spider in the world. And the punchline? Despite its name, it rarely eats birds; it mostly dines on insects, worms, and the occasional small rodent. Quite the gentle giant with terrible branding, wouldn't you say? + [End of Answer] + + [Start of Evaluation] + The answer provided is correct and relevant. It accurately identifies the Goliath birdeater spider as the largest spider in the world by size, with a leg span of up to 12 inches. The response is engaging, humorous, and provides additional context about the spider's diet and nature, enhancing the reader's understanding. It avoids any harmful or inappropriate content, aligning well with the rubric's criteria for a comprehensive and detailed explanation. + [End of Evaluation] + + [Start of Score] + 3 + [End of Score] + +generation: | + Here's the question and the answer you need to evaluate: + [Start of Question] + {question} + [End of Question] + + [Start of Answer] + {answer} + [End of Answer] + + Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the answer on a scale of 1 to 3 as mentioned above. + * Return the evaluation between [Start of Evaluation] and [End of Evaluation] tags. + * Return the score between [Start of Score] and [End of Score] tags. 
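+# These tags are matched by LLMBlock._parse to extract the evaluation and score output columns.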
+ +start_tags: ["[Start of Evaluation]", "[Start of Score]"] +end_tags: ["[End of Evaluation]", "[End of Score]"] diff --git a/src/instructlab/sdg/configs/skills/evaluate_freeform_questions.yaml b/src/instructlab/sdg/configs/skills/evaluate_freeform_questions.yaml new file mode 100644 index 00000000..50bafcb8 --- /dev/null +++ b/src/instructlab/sdg/configs/skills/evaluate_freeform_questions.yaml @@ -0,0 +1,46 @@ +system: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task. + +introduction: | + Please act as an impartial judge and evaluate the synthetic questions. Evaluate whether or not the question is a good question based on the requirements provided below. Please assign a score using a binary 0/1 scale. + +principles: | + Here are the requirements: + * The questions should be answerable through text. They should not require any visual or audio output. + * The questions should be in English. + * The questions should be 1 to 2 sentences long and should be properly formatted. + * The question should not be offensive, abusive, or harmful. It should be safe and respectful. + * The question should be relevant to the task given - {task_description}. + + If the question meets the above requirements, please rate it 1. If not, please rate it 0. + +examples: | + Task Description: Understanding the benefits of renewable energy + + [Start of Question] + What are the long-term economic benefits of investing in solar energy infrastructure? + [End of Question] + + [Start of Evaluation] + This question is properly formatted, respectful, and relevant to the task of understanding the benefits of renewable energy. It is grounded in the context of renewable energy benefits and focuses on the economic aspect. + [End of Evaluation] + + [Start of Score] + 1 + [End of Score] + + +generation: | + Here's the question you need to evaluate: + + Task Description: {task_description} + + [Start of Question] + {question} + [End of Question] + + Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the question on a scale of 0 or 1 as mentioned above. Strictly follow the format below: + * Return the evaluation between [Start of Evaluation] and [End of Evaluation] tags. + * Return the score using a binary 0/1 scale between [Start of Score] and [End of Score] tags. + +start_tags: ["[Start of Evaluation]", "[Start of Score]"] +end_tags: ["[End of Evaluation]", "[End of Score]"] diff --git a/src/instructlab/sdg/configs/skills/evaluate_grounded_pair.yaml b/src/instructlab/sdg/configs/skills/evaluate_grounded_pair.yaml new file mode 100644 index 00000000..3f40a6fd --- /dev/null +++ b/src/instructlab/sdg/configs/skills/evaluate_grounded_pair.yaml @@ -0,0 +1,54 @@ +system: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task. + +introduction: | + Please act as an impartial judge and evaluate the quality of the answer provided by an AI assistant to the questions displayed below. Evaluate whether or not the answer is a good example of how an AI Assistant should respond to the user's instruction. Please assign a score using the following 3-point scale. +principles: | + 1: It means the answer is incorrect, irrelevant, unsafe or provides incomplete and garbage information. For instance, the answer may be factually wrong, off-topic, or filled with irrelevant content that doesn't address the user's question or it could be incomplete and hanging. 
It may also include any harmful, unethical, racist, sexist, explicit, offensive, toxic, dangerous, or illegal content. Also rate the answer 1 if it is not supported by the given context. + + 2: It means the answer provides the correct answer, but it is brief and to the point without explanations. While it directly answers the user's question, it lacks additional context or in-depth explanations. + + 3: It means the answer is a perfect answer from an AI Assistant. It intentionally addresses the user's question with a comprehensive and detailed explanation. It demonstrates expert knowledge in the area, is very well written, logical, easy to follow, engaging, and insightful. And the answer is safe and does not include any harmful content. + +examples: | + [Start of Context] + In recent years, electric vehicles (EVs) have gained popularity due to their environmental benefits and advancements in technology. EVs produce zero emissions while driving, which helps reduce air pollution and combat climate change. Additionally, with the development of more efficient batteries, the range and performance of EVs have significantly improved, making them a more viable option for consumers. + [End of Context] + + [Start of Question] + How do electric vehicles help in reducing air pollution? + [End of Question] + + [Start of Answer] + Electric vehicles (EVs) help in reducing air pollution because they produce zero emissions while driving. + [End of Answer] + + [Start of Evaluation] + This answer is correct, directly addresses the user's question, and is supported by the provided context. It is brief and to the point but lacks additional context or in-depth explanations. + [End of Evaluation] + + [Start of Score] + 2 + [End of Score] + +generation: | + Here's the context, question and the answer you need to evaluate: + + [Start of Context] + {context} + [End of Context] + + [Start of Question] + {question} + [End of Question] + + [Start of Answer] + {answer} + [End of Answer] + + Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the answer on a scale of 1 to 3 as mentioned above. + * Return the evaluation between [Start of Evaluation] and [End of Evaluation] tags. + * Return the score between [Start of Score] and [End of Score] tags. + + +start_tags: ["[Start of Evaluation]", "[Start of Score]"] +end_tags: ["[End of Evaluation]", "[End of Score]"] \ No newline at end of file diff --git a/src/instructlab/sdg/configs/skills/evaluate_grounded_questions.yaml b/src/instructlab/sdg/configs/skills/evaluate_grounded_questions.yaml new file mode 100644 index 00000000..70f6feb9 --- /dev/null +++ b/src/instructlab/sdg/configs/skills/evaluate_grounded_questions.yaml @@ -0,0 +1,51 @@ +system: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task. + +introduction: | + Please act as an impartial judge and evaluate the questions generated by an AI assistant, displayed below. Evaluate whether or not each question is a good question for an AI Assistant to answer, based on the requirements provided below. Please assign a score using a binary 0/1 scale. + +principles: | + Here are the requirements: + * A large language model should be able to complete the question. For example, do not ask the assistant to create any visual or audio output. For another example, do not ask the assistant to wake you up at 5pm or set a reminder because it cannot perform any action. + * The questions should be in English. 
+ * The questions should be 1 to 2 sentences long and should be properly formatted. + * The question should not be offensive, abusive, or harmful. It should be safe and respectful. + * The question should be relevant to the task given - {task_description}. + * Most importantly, all the questions should be grounded in the context provided and should be answerable solely based on the provided context. + + If the question meets the above requirements, please rate it 1. If not, please rate it 0. + +examples: | + For better understanding of the task, here are some examples: + + [Start of Context] + In recent years, electric vehicles (EVs) have gained popularity due to their environmental benefits and advancements in technology. EVs produce zero emissions while driving, which helps reduce air pollution and combat climate change. Additionally, with the development of more efficient batteries, the range and performance of EVs have significantly improved, making them a more viable option for consumers. + [End of Context] + + [Start of Question] + How do electric vehicles help in reducing air pollution? + [End of Question] + + [Start of Evaluation] + This question is properly formatted, respectful, and directly relevant to the task of understanding the benefits of electric vehicles. It is grounded in the provided context, which mentions that EVs produce zero emissions while driving, helping reduce air pollution. A large language model can provide an answer to this question based on the provided context. + [End of Evaluation] + + [Start of Score] + 1 + [End of Score] + +generation: | + Here's the context and question you need to evaluate: + + [Start of Context] + {context} + [End of Context] + [Start of Question] + {question} + [End of Question] + + Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the question on a scale of 0 or 1 as mentioned above. + * Return the evaluation between [Start of Evaluation] and [End of Evaluation] tags. + * Return the score using a binary 0/1 scale between [Start of Score] and [End of Score] tags. + +start_tags: ["[Start of Evaluation]", "[Start of Score]"] +end_tags: ["[End of Evaluation]", "[End of Score]"] \ No newline at end of file diff --git a/src/instructlab/sdg/configs/skills/freeform_questions.yaml b/src/instructlab/sdg/configs/skills/freeform_questions.yaml new file mode 100644 index 00000000..f3d1ed90 --- /dev/null +++ b/src/instructlab/sdg/configs/skills/freeform_questions.yaml @@ -0,0 +1,29 @@ +system: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task. + +introduction: | + You are asked to come up with a set of {num_samples} diverse questions - {task_description}. + +principles: | + Please follow these guiding principles when generating responses: + * Use proper grammar and punctuation. + * Always generate safe and respectful content. Do not generate content that is harmful, abusive, or offensive. + * Always generate content that is factually accurate and relevant to the prompt. + * The questions should be clear and human-like. + * The questions should be diverse and cover a wide range of topics. + * The questions should not be template-based or generic; they should be very diverse. + * Simply return the questions, do not return any answers or explanations. + * Strictly adhere to the prompt and generate responses in the same style and format as the example. 
+ * Return each question between [Start of Question] and [End of Question] tags. + +examples: | + To better assist you with this task, here is an example: + + [Start of Question] + {seed_question} + [End of Question] + +generation: | + Now generate {num_samples} such questions; remember to follow the principles mentioned above and use the same style and format as the example above. Return each question between [Start of Question] and [End of Question] tags. + +start_tags: ["[Start of Question]"] +end_tags: ["[End of Question]"] diff --git a/src/instructlab/sdg/configs/skills/freeform_responses.yaml b/src/instructlab/sdg/configs/skills/freeform_responses.yaml new file mode 100644 index 00000000..0b0eda38 --- /dev/null +++ b/src/instructlab/sdg/configs/skills/freeform_responses.yaml @@ -0,0 +1,33 @@ +system: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task. + +introduction: Your task is to faithfully follow the user's prompt and generate a response. + +principles: | + Please follow these guiding principles when generating responses: + * Use proper grammar and punctuation. + * Always generate safe and respectful content. Do not generate content that is harmful, abusive, or offensive. + * Always generate content that is factually accurate and relevant to the prompt. + * Strictly adhere to the prompt and generate responses in the same style and format as the example. + * Return the response between [Start of Response] and [End of Response] tags. + +examples: | + To better assist you with this task, here is an example: + [Start of Question] + {seed_question} + [End of Question] + + [Start of Response] + {seed_response} + [End of Response] + +generation: | + Now generate a response to the following prompt. + + [Start of Question] + {question} + [End of Question] + + Remember to use the same style and format as the example above. Return the response between [Start of Response] and [End of Response] tags. + +start_tags: ["[Start of Response]"] +end_tags: ["[End of Response]"] diff --git a/src/instructlab/sdg/configs/skills/grounded_questions.yaml b/src/instructlab/sdg/configs/skills/grounded_questions.yaml new file mode 100644 index 00000000..904523c9 --- /dev/null +++ b/src/instructlab/sdg/configs/skills/grounded_questions.yaml @@ -0,0 +1,38 @@ +system: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task. + +introduction: | + You are asked to come up with a set of {num_samples} diverse questions - {task_description}. + +principles: | + Please follow these guiding principles when generating responses: + * Use proper grammar and punctuation. + * Always generate safe and respectful content. Do not generate content that is harmful, abusive, or offensive. + * Always generate content that is factually accurate and relevant to the prompt. + * The questions should be clear and human-like. + * The questions should be diverse and cover a wide range of topics. + * The questions should not be template-based or generic; they should be very diverse. + * Simply return the questions, do not return any answers or explanations. + * Strictly adhere to the prompt and generate responses in the same style and format as the example. + * Most importantly, all the questions should be grounded in the context provided and should be answerable solely based on the provided context. + * The question should address the task described in the prompt. 
+ * Return each question between [Start of Question] and [End of Question] tags. + +examples: | + To better assist you with this task, here is an example: + + [Start of Context] + {seed_context} + [End of Context] + [Start of Question] + {seed_question} + [End of Question] + +generation: | + Now generate {num_samples} such questions; remember to follow the principles mentioned above and use the same style and format as the example above. Do not return any contexts or answers, only the questions. Return each question between [Start of Question] and [End of Question] tags. + + [Start of Context] + {context} + [End of Context] + +start_tags: ["[Start of Question]"] +end_tags: ["[End of Question]"] diff --git a/src/instructlab/sdg/configs/skills/grounded_responses.yaml b/src/instructlab/sdg/configs/skills/grounded_responses.yaml new file mode 100644 index 00000000..87429b9a --- /dev/null +++ b/src/instructlab/sdg/configs/skills/grounded_responses.yaml @@ -0,0 +1,40 @@ +system: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task. + +introduction: Your task is to faithfully follow the user's prompt and the given context, and generate a response. +principles: | + Please follow these guiding principles when generating responses: + * Use proper grammar and punctuation. + * Always generate safe and respectful content. Do not generate content that is harmful, abusive, or offensive. + * Always generate content that is factually accurate and relevant to the prompt. + * Strictly adhere to the prompt and generate responses in the same style and format as the example. + * Most importantly, all the responses should be grounded in the context provided. + * Return the response between [Start of Response] and [End of Response] tags. + +examples: | + To better assist you with this task, here is an example: + + [Start of Context] + {seed_context} + [End of Context] + + [Start of Question] + {seed_question} + [End of Question] + + [Start of Response] + {seed_response} + [End of Response] + +generation: | + Now generate a response to the following prompt. Remember to use the same style and format as the example above. Return the response between [Start of Response] and [End of Response] tags. 
+ + [Start of Context] + {context} + [End of Context] + [Start of Question] + {question} + [End of Question] + + +start_tags: ["[Start of Response]"] +end_tags: ["[End of Response]"] \ No newline at end of file diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py new file mode 100644 index 00000000..8fbe4f86 --- /dev/null +++ b/src/instructlab/sdg/default_flows.py @@ -0,0 +1,391 @@ +# SPDX-License-Identifier: Apache-2.0 +# Standard +from abc import ABC, abstractmethod +from importlib import resources +import operator +import os + +# Local +from .filterblock import FilterByValueBlock +from .iterblock import IterBlock +from .llmblock import LLMBlock + + +class Flow(ABC): + def __init__(self, client, model_id) -> None: + self.client = client + self.model_id = model_id + + @abstractmethod + def get_flow(self) -> list: + pass + + +class MMLUBenchFlow(Flow): + def get_flow(self) -> list: + sdg_base = resources.files(__package__) + return [ + { + "block_type": LLMBlock, + "block_config": { + "block_name": "gen_mmlu_knowledge", + "config_path": os.path.join( + sdg_base, "configs/knowledge/mcq_generation.yaml" + ), + "client": self.client, + "model_id": self.model_id, + "model_prompt": " [INST] {prompt} [/INST]", + "output_cols": ["mmlubench_question", "mmlubench_answer"], + "batch_kwargs": { + "num_procs": 8, + "batched": True, + }, + }, + "gen_kwargs": { + "temperature": 0, + "max_tokens": 2048, + }, + "drop_duplicates": ["mmlubench_question"], + }, + ] + + +class SynthKnowledgeFlow(Flow): + def get_flow(self) -> list: + sdg_base = resources.files(__package__) + return [ + { + "block_type": LLMBlock, + "block_config": { + "block_name": "gen_knowledge", + "config_path": os.path.join( + sdg_base, "configs/knowledge/generate_questions_responses.yaml" + ), + "client": self.client, + "model_id": self.model_id, + "model_prompt": " [INST] {prompt} [/INST]", + "output_cols": ["question", "response"], + "batch_kwargs": { + "num_procs": 8, + "batched": True, + }, + }, + "gen_kwargs": { + "max_tokens": 2048, + }, + "drop_duplicates": ["question"], + }, + { + "block_type": LLMBlock, + "block_config": { + "block_name": "eval_faithfulness_qa_pair", + "config_path": os.path.join( + sdg_base, "configs/knowledge/evaluate_faithfulness.yaml" + ), + "client": self.client, + "model_id": self.model_id, + "model_prompt": " [INST] {prompt} [/INST]", + "output_cols": ["explanation", "judgment"], + "batch_kwargs": { + "num_procs": 8, + "batched": True, + }, + }, + "gen_kwargs": { + "max_tokens": 2048, + }, + }, + { + "block_type": FilterByValueBlock, + "block_config": { + "block_name": "filter_faithfulness", + "filter_column": "judgment", + "filter_value": "YES", + "operation": operator.eq, + "batch_kwargs": { + "num_procs": 8, + }, + }, + "drop_columns": ["judgment", "explanation"], + }, + { + "block_type": LLMBlock, + "block_config": { + "block_name": "eval_relevancy_qa_pair", + "config_path": os.path.join( + sdg_base, "configs/knowledge/evaluate_relevancy.yaml" + ), + "client": self.client, + "model_id": self.model_id, + "model_prompt": " [INST] {prompt} [/INST]", + "output_cols": ["feedback", "score"], + "batch_kwargs": { + "num_procs": 8, + "batched": True, + }, + }, + "gen_kwargs": { + "max_tokens": 2048, + }, + }, + { + "block_type": FilterByValueBlock, + "block_config": { + "block_name": "filter_relevancy", + "filter_column": "score", + "filter_value": "2", + "operation": operator.eq, + "batch_kwargs": { + "num_procs": 8, + }, + }, + "drop_columns": ["feedback", "score"], + }, + 
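+            # Final gate: verify each generated question is self-contained and answerable without the source document (see configs/knowledge/evaluate_question.yaml)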
{ + "block_type": LLMBlock, + "block_config": { + "block_name": "eval_verify_question", + "config_path": os.path.join( + sdg_base, "configs/knowledge/evaluate_question.yaml" + ), + "client": self.client, + "model_id": self.model_id, + "model_prompt": " [INST] {prompt} [/INST]", + "output_cols": ["explanation", "rating"], + "batch_kwargs": { + "num_procs": 8, + "batched": True, + }, + }, + "gen_kwargs": { + "max_tokens": 2048, + }, + }, + { + "block_type": FilterByValueBlock, + "block_config": { + "block_name": "filter_verify_question", + "filter_column": "rating", + "filter_value": "1", + "operation": operator.eq, + "batch_kwargs": { + "num_procs": 8, + }, + }, + "drop_columns": ["explanation", "rating", "__index_level_0__"], + }, + ] + + +class SynthSkillsFlow(Flow): + def get_flow(self) -> list: + return [ + { + "block_type": LLMBlock, + "block_config": { + "block_name": "gen_questions", + "config_path": "src/instructlab/sdg/configs/skills/freeform_questions.yaml", + "client": self.client, + "model_id": self.model_id, + "model_prompt": " [INST] {prompt} [/INST]", + "output_cols": ["question"], + "batch_kwargs": { + "num_procs": 8, + "num_samples": 30, + "batched": True, + }, + }, + "drop_duplicates": ["question"], + }, + { + "block_type": LLMBlock, + "block_config": { + "block_name": "eval_questions", + "config_path": "src/instructlab/sdg/configs/skills/evaluate_freeform_questions.yaml", + "client": self.client, + "model_id": self.model_id, + "model_prompt": " [INST] {prompt} [/INST]", + "output_cols": ["evaluation", "score"], + "batch_kwargs": { + "num_procs": 8, + "batched": True, + }, + }, + }, + { + "block_type": FilterByValueBlock, + "block_config": { + "block_name": "filter_questions", + "filter_column": "score", + "filter_value": 1, + "operation": operator.eq, + "convert_dtype": int, + "batch_kwargs": { + "num_procs": 8, + }, + }, + "drop_columns": ["evaluation", "score", "num_samples"], + }, + { + "block_type": LLMBlock, + "block_config": { + "block_name": "gen_responses", + "config_path": "src/instructlab/sdg/configs/skills/freeform_responses.yaml", + "client": self.client, + "model_id": self.model_id, + "model_prompt": " [INST] {prompt} [/INST]", + "output_cols": ["answer"], + "batch_kwargs": { + "num_procs": 8, + "batched": True, + }, + }, + }, + { + "block_type": LLMBlock, + "block_config": { + "block_name": "evaluate_qa_pair", + "config_path": "src/instructlab/sdg/configs/skills/evaluate_freeform_pair.yaml", + "client": self.client, + "model_id": self.model_id, + "model_prompt": " [INST] {prompt} [/INST]", + "output_cols": ["evaluation", "score"], + "batch_kwargs": { + "num_procs": 8, + "batched": True, + }, + }, + }, + { + "block_type": FilterByValueBlock, + "block_config": { + "block_name": "filter_qa_pair", + "filter_column": "score", + "filter_value": 2, + "operation": operator.ge, + "convert_dtype": int, + "batch_kwargs": { + "num_procs": 8, + }, + }, + "drop_columns": ["evaluation", "score"], + }, + ] + + +class SynthGroundedSkillsFlow(Flow): + def get_flow(self) -> list: + return [ + { + "block_type": IterBlock, + "block_config": { + "block_name": "context_iter", + "num_iters": 10, + "block_type": LLMBlock, + "block_kwargs": { + "block_name": "gen_contexts", + "config_path": "src/instructlab/sdg/configs/skills/contexts.yaml", + "client": self.client, + "model_id": self.model_id, + "model_prompt": " [INST] {prompt} [/INST]", + "output_cols": ["context"], + "batch_kwargs": { + "num_procs": 8, + "batched": True, + }, + }, + "gen_kwargs": { + "temperature": 0.7, + 
"max_tokens": 2048, + }, + }, + }, + { + "block_type": LLMBlock, + "block_config": { + "block_name": "gen_grounded_questions", + "config_path": "src/instructlab/sdg/configs/skills/grounded_questions.yaml", + "client": self.client, + "model_id": self.model_id, + "model_prompt": " [INST] {prompt} [/INST]", + "output_cols": ["question"], + "batch_kwargs": { + "num_procs": 8, + "batched": True, + }, + }, + "drop_duplicates": ["question"], + }, + { + "block_type": LLMBlock, + "block_config": { + "block_name": "eval_grounded_questions", + "config_path": "src/instructlab/sdg/configs/skills/evaluate_grounded_questions.yaml", + "client": self.client, + "model_id": self.model_id, + "model_prompt": " [INST] {prompt} [/INST]", + "output_cols": ["evaluation", "score"], + "batch_kwargs": { + "num_procs": 8, + "batched": True, + }, + }, + }, + { + "block_type": FilterByValueBlock, + "block_config": { + "block_name": "filter_grounded_questions", + "filter_column": "score", + "filter_value": 1, + "operation": operator.eq, + "convert_dtype": int, + "batch_kwargs": { + "num_procs": 8, + }, + }, + "drop_columns": ["evaluation", "score", "num_samples"], + }, + { + "block_type": LLMBlock, + "block_config": { + "block_name": "gen_grounded_responses", + "config_path": "src/instructlab/sdg/configs/skills/grounded_responses.yaml", + "client": self.client, + "model_id": self.model_id, + "model_prompt": " [INST] {prompt} [/INST]", + "output_cols": ["answer"], + "batch_kwargs": { + "num_procs": 8, + "batched": True, + }, + }, + }, + { + "block_type": LLMBlock, + "block_config": { + "block_name": "evaluate_grounded_qa_pair", + "config_path": "src/instructlab/sdg/configs/skills/evaluate_grounded_pair.yaml", + "client": self.client, + "model_id": self.model_id, + "model_prompt": " [INST] {prompt} [/INST]", + "output_cols": ["evaluation", "score"], + "batch_kwargs": { + "num_procs": 8, + "batched": True, + }, + }, + }, + { + "block_type": FilterByValueBlock, + "block_config": { + "block_name": "filter_grounded_qa_pair", + "filter_column": "score", + "filter_value": 2, + "operation": operator.ge, + "convert_dtype": int, + "batch_kwargs": { + "num_procs": 8, + }, + }, + }, + ] diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py new file mode 100644 index 00000000..e6d4eb24 --- /dev/null +++ b/src/instructlab/sdg/filterblock.py @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: Apache-2.0 +# Third Party +from datasets import Dataset + +# Local +from .block import Block +from .logger_config import setup_logger + +logger = setup_logger(__name__) + + +class FilterByValueBlock(Block): + def __init__( + self, filter_column, filter_value, operation, convert_dtype=None, **batch_kwargs + ) -> None: + super().__init__(block_name=self.__class__.__name__) + self.value = filter_value + self.column_name = filter_column + self.operation = operation + self.convert_dtype = convert_dtype + self.num_procs = batch_kwargs.get("num_procs", 1) + + def generate(self, samples) -> Dataset: + if self.convert_dtype: + samples = samples.map( + lambda x: { + **x, + self.column_name: self.convert_dtype(x[self.column_name]), + }, + num_proc=self.num_procs, + ) + + return samples.filter( + lambda x: self.operation(x[self.column_name], self.value), + num_proc=self.num_procs, + ) diff --git a/src/instructlab/sdg/iterblock.py b/src/instructlab/sdg/iterblock.py new file mode 100644 index 00000000..21a20470 --- /dev/null +++ b/src/instructlab/sdg/iterblock.py @@ -0,0 +1,29 @@ +# Third Party +from datasets import Dataset + +# Local 
+from .block import Block
+from .logger_config import setup_logger
+
+logger = setup_logger(__name__)
+
+
+class IterBlock(Block):
+    def __init__(self, block_name, num_iters, block_type, block_kwargs, **kwargs):
+        super().__init__(block_name)
+        self.num_iters = num_iters
+        self.block = block_type(**block_kwargs)
+        self.gen_kwargs = kwargs.get("gen_kwargs", {})
+
+    def generate(self, samples, **gen_kwargs) -> Dataset:
+        generated_samples = []
+        num_iters = self.num_iters
+
+        for _ in range(num_iters):
+            batch_generated = self.block.generate(
+                samples, **{**self.gen_kwargs, **gen_kwargs}
+            )
+            generated_samples.extend(batch_generated)
+
+        return Dataset.from_list(generated_samples)
diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py
new file mode 100644
index 00000000..ad429b75
--- /dev/null
+++ b/src/instructlab/sdg/llmblock.py
@@ -0,0 +1,115 @@
+# SPDX-License-Identifier: Apache-2.0
+# Standard
+import re
+
+# Third Party
+from datasets import Dataset
+
+# Local
+from .block import Block
+from .logger_config import setup_logger
+
+logger = setup_logger(__name__)
+
+
+class LLMBlock(Block):
+    # pylint: disable=too-many-instance-attributes
+    def __init__(
+        self,
+        block_name,
+        config_path,
+        client,
+        model_id,
+        output_cols,
+        model_prompt="{prompt}",
+        **batch_kwargs,
+    ) -> None:
+        super().__init__(block_name)
+        self.block_config = self._load_config(config_path)
+        self.prompt_struct = (
+            """{system}\n{introduction}\n{principles}\n{examples}\n{generation}"""
+        )
+        self.prompt_template = self.prompt_struct.format(**self.block_config)
+        self.client = client
+        self.model = model_id
+        self.model_prompt = model_prompt
+        self.output_cols = output_cols
+        self.batch_params = batch_kwargs.get("batch_kwargs", {})
+        self.defaults = {
+            "model": self.model,
+            "temperature": 0,
+            "max_tokens": 12000,
+        }
+
+    def _parse(self, generated_string) -> dict:
+        matches = {}
+        for start_tag, end_tag, output_col in zip(
+            self.block_config["start_tags"],
+            self.block_config["end_tags"],
+            self.output_cols,
+        ):
+            if not start_tag and not end_tag:
+                matches[output_col] = (
+                    generated_string.strip() if generated_string else None
+                )
+            else:
+                pattern = re.escape(start_tag) + r"(.*?)" + re.escape(end_tag)
+                all_matches = re.findall(pattern, generated_string, re.DOTALL)
+                matches[output_col] = (
+                    [match.strip() for match in all_matches] if all_matches else None
+                )
+
+        return matches
+
+    def _generate(self, samples, **gen_kwargs) -> list:
+        prompts = [
+            self.model_prompt.format(
+                prompt=self.prompt_template.format(**sample).strip()
+            )
+            for sample in samples
+        ]
+        response = self.client.completions.create(
+            prompt=prompts, **{**self.defaults, **gen_kwargs}
+        )
+        return [choice.text.strip() for choice in response.choices]
+
+    def generate(self, samples, **gen_kwargs) -> Dataset:
+        """
+        Generate the output from the block. This method should first validate the input data,
+        then generate the output, and finally parse the generated output before returning it.
+
+        :return: The parsed output after generation.
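+
+        Note: when "batched" is set in batch_kwargs, all prompts go to the
+        endpoint in a single completions call; otherwise samples are submitted
+        one at a time. Parsed output columns are zipped, so a completion whose
+        tags match several times expands into several output rows.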
+ """ + num_samples = self.batch_params.get("num_samples", None) + batched = self.batch_params.get("batched", False) + + if (num_samples is not None) and ("num_samples" not in samples.column_names): + samples = samples.add_column("num_samples", [num_samples] * len(samples)) + + # validate the each sample + for sample in samples: + if not self._validate(self.prompt_template, sample): + return None + + # generate the output + outputs = [] + if batched: + outputs = self._generate(samples, **gen_kwargs) + else: + outputs = [self._generate([sample], **gen_kwargs) for sample in samples] + + new_data = [] + for sample, output in zip(samples, outputs): + parsed_outputs = self._parse(output) + # pylint: disable=consider-using-generator + max_length = max([len(value) for value in parsed_outputs.values()]) + for values in zip(*(lst[:max_length] for lst in parsed_outputs.values())): + new_data.append({**sample, **dict(zip(parsed_outputs.keys(), values))}) + + return Dataset.from_list(new_data) diff --git a/src/instructlab/sdg/logger_config.py b/src/instructlab/sdg/logger_config.py new file mode 100644 index 00000000..37c958ab --- /dev/null +++ b/src/instructlab/sdg/logger_config.py @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: Apache-2.0 +# Standard +import logging + +# Third Party +from rich.logging import RichHandler + + +def setup_logger(name): + # Set up the logger + logging.basicConfig( + level=logging.INFO, + format="%(message)s", + datefmt="[%X]", + handlers=[RichHandler()], + ) + logger = logging.getLogger(name) + return logger diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py new file mode 100644 index 00000000..0de65d1b --- /dev/null +++ b/src/instructlab/sdg/pipeline.py @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: Apache-2.0 +# Third Party +from datasets import Dataset + +# Local +from .iterblock import IterBlock +from .logger_config import setup_logger + +logger = setup_logger(__name__) + + +class Pipeline: + def __init__(self, chained_blocks: list) -> None: + """ + Initialize the Pipeline class with a configuration dictionary. + config_dict: the run config py or yaml loaded into a dictionary + """ + # pipeline config is the run configuration that consists of the pipeline steps + self.chained_blocks = chained_blocks + + def _drop_duplicates(self, dataset, cols): + """ + Drop duplicates from the dataset based on the columns provided. + """ + df = dataset.to_pandas() + df.drop_duplicates(subset=cols, inplace=True) + return Dataset.from_pandas(df) + + def generate(self, dataset) -> Dataset: + """ + Generate the dataset by running the pipeline steps. 
+        dataset: the input dataset
+        """
+        for block_prop in self.chained_blocks:
+            block_type = block_prop["block_type"]
+            block_config = block_prop["block_config"]
+            drop_columns = block_prop.get("drop_columns", None)
+            gen_kwargs = block_prop.get("gen_kwargs", {})
+            drop_duplicates_cols = block_prop.get("drop_duplicates", False)
+
+            if block_type == IterBlock:
+                block_kwargs = block_config.pop("block_kwargs")
+                block = block_type(**block_config, block_kwargs=block_kwargs)
+            else:
+                block = block_type(**block_config)
+
+            logger.info("Running block: %s", block_config["block_name"])
+            logger.info(dataset)
+
+            dataset = block.generate(dataset, **gen_kwargs)
+
+            if drop_columns:
+                dataset = dataset.remove_columns(drop_columns)
+
+            if drop_duplicates_cols:
+                dataset = self._drop_duplicates(dataset, cols=drop_duplicates_cols)
+
+        return dataset
diff --git a/src/instructlab/sdg/sdg.py b/src/instructlab/sdg/sdg.py
new file mode 100644
index 00000000..c3bce90f
--- /dev/null
+++ b/src/instructlab/sdg/sdg.py
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: Apache-2.0
+# Third Party
+from datasets import Dataset
+
+# Local
+from .pipeline import Pipeline
+
+
+class SDG:
+    def __init__(self, pipelines: list[Pipeline]) -> None:
+        self.pipelines = pipelines
+
+    def generate(self, dataset: Dataset):
+        """
+        Generate the dataset by running the chained pipeline steps.
+        dataset: the input dataset
+        """
+        for pipeline in self.pipelines:
+            dataset = pipeline.generate(dataset)
+        return dataset
diff --git a/src/instructlab/sdg/utilblocks.py b/src/instructlab/sdg/utilblocks.py
new file mode 100644
index 00000000..5f3c0407
--- /dev/null
+++ b/src/instructlab/sdg/utilblocks.py
@@ -0,0 +1,71 @@
+# SPDX-License-Identifier: Apache-2.0
+# Third Party
+from datasets import Dataset
+
+# Local
+from .block import Block
+from .logger_config import setup_logger
+
+logger = setup_logger(__name__)
+
+
+class SamplePopulatorBlock(Block):
+    def __init__(self, config_paths, column_name, **batch_kwargs) -> None:
+        super().__init__(block_name=self.__class__.__name__)
+        self.configs = {}
+        for config in config_paths:
+            config_key = config.split("/")[-1].split(".")[0]
+            self.configs[config_key] = self._load_config(config)
+        self.column_name = column_name
+        self.num_procs = batch_kwargs.get("num_procs", 8)
+
+    def _generate(self, sample) -> dict:
+        sample = {**sample, **self.configs[sample[self.column_name]]}
+        return sample
+
+    def generate(self, samples) -> Dataset:
+        samples = samples.map(self._generate, num_proc=self.num_procs)
+        return samples
+
+
+class SelectorBlock(Block):
+    def __init__(self, choice_map, choice_col, output_col, **batch_kwargs) -> None:
+        super().__init__(block_name=self.__class__.__name__)
+        self.choice_map = choice_map
+        self.choice_col = choice_col
+        self.output_col = output_col
+        self.num_procs = batch_kwargs.get("num_procs", 8)
+
+    def _generate(self, sample) -> dict:
+        sample[self.output_col] = sample[self.choice_map[sample[self.choice_col]]]
+        return sample
+
+    def generate(self, samples: Dataset) -> Dataset:
+        samples = samples.map(self._generate, num_proc=self.num_procs)
+        return samples
+
+
+class CombineColumnsBlock(Block):
+    def __init__(self, columns, output_col, separator="\n\n", **batch_kwargs) -> None:
+        super().__init__(block_name=self.__class__.__name__)
+        self.columns = columns
+        self.output_col = output_col
+        self.separator = separator
+        self.num_procs = batch_kwargs.get("num_procs", 8)
+
+    def _generate(self, sample) -> dict:
+        sample[self.output_col] = self.separator.join(
+            [sample[col] for col in self.columns]
+        )
+        return sample
+
+    def generate(self, samples: Dataset) -> Dataset:
+        samples = samples.map(self._generate, num_proc=self.num_procs)
+        return samples
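+
+
+# Illustrative usage (not part of this module's API):
+#
+#   combine = CombineColumnsBlock(columns=["context", "question"], output_col="prompt")
+#   samples = combine.generate(samples)
+#   # each row now has "prompt" = context + "\n\n" + question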