From b41d7af75a2c260da928a9266f99783eafcbbc2b Mon Sep 17 00:00:00 2001 From: Claudio Spiess Date: Wed, 4 Sep 2024 10:22:11 -0400 Subject: [PATCH] Begin merging optimizer --- examples/prompt_library/CoT.pdl | 77 +++++---- examples/prompt_library/ReAct.pdl | 52 +++--- examples/prompt_library/ReWoo.pdl | 5 +- .../prompt_library/demos/Verifier_json.pdl | 11 +- .../demos/evalplus/evalplus.pdl | 38 ++--- .../prompt_library/demos/evalplus/react.pdl | 158 ++++++++++++++++++ pdl/optimize/config_parser.py | 9 +- pdl/optimize/mbpp_thread.py | 139 ++++++++------- pdl/optimize/optimize.py | 50 +++++- pdl/optimize/pdl_optimizer.py | 34 ++-- pdl/optimize/util.py | 13 +- pdl/pdl_ast.py | 4 +- pdl/pdl_scheduler.py | 1 - 13 files changed, 415 insertions(+), 176 deletions(-) create mode 100644 examples/prompt_library/demos/evalplus/react.pdl diff --git a/examples/prompt_library/CoT.pdl b/examples/prompt_library/CoT.pdl index b6c27126..de7b5077 100644 --- a/examples/prompt_library/CoT.pdl +++ b/examples/prompt_library/CoT.pdl @@ -31,17 +31,17 @@ defs: model: str answer: str return: - - |- - Question: {{question}} - - Answer: Let's think step by step. - - model: "{{ model }}" - params: - decoding_method: "greedy" - stop_sequences: - - "The answer is" - include_stop_sequence: false - - "The answer is {{ answer }}." + - |- + Question: {{ question }} + Answer: Let's think step by step. + - model: "{{ model }}" + platform: bam + parameters: + decoding_method: "greedy" + stop_sequences: + - "The answer is" + include_stop_sequence: false + - "The answer is {{ answer }}." fewshot_cot: function: @@ -65,24 +65,39 @@ defs: examples: { list: { obj: { question: str, reasoning: str, answer: str } } } return: - - call: fewshot_cot - args: - examples: "{{ examples }}" - - |- - Question: {{question}} + - call: fewshot_cot + args: + examples: "{{ examples }}" + block_style: "question_cot" + - | + Question: {{ question }} + Answer: Let's think step by step. + - model: "{{ model }}" + platform: bam + parameters: + decoding_method: "greedy" + stop_sequences: + - "<|endoftext|>" + include_stop_sequence: false - Answer: Let's think step by step. - - model: "{{ model }}" - params: - decoding_method: "greedy" - stop_sequences: - - "The answer is" - include_stop_sequence: false - - "The answer is " - - def: answer - model: "{{ model }}" - params: - decoding_method: "greedy" - - "\n\nJSON Output: " - - data: - answer: "{{ answer|trim }}" \ No newline at end of file + chain_of_thought_claim: + function: + question: str + model: str + examples: + { list: { obj: { question: str, reasoning: str, answer: str } } } + return: + - call: fewshot_cot + args: + examples: "{{ examples }}" + block_style: "claim_cot" + - | + {{ question }} + Thought: Let's think step by step. 
+ - model: "{{ model }}" + platform: bam + parameters: + decoding_method: "greedy" + stop_sequences: + - "<|endoftext|>" + include_stop_sequence: false diff --git a/examples/prompt_library/ReAct.pdl b/examples/prompt_library/ReAct.pdl index c4428271..a889bf1f 100644 --- a/examples/prompt_library/ReAct.pdl +++ b/examples/prompt_library/ReAct.pdl @@ -93,7 +93,10 @@ defs: iterations: "{{ iterations+1 }}" - def: THOUGHT model: "{{ model }}" - params: + parameters: + time_limit: 60000 + random_seed: 42 + truncate_input_tokens: 8191 decoding_method: sample repetition_penalty: 1.1 temperature: "{{ temperature }}" @@ -102,7 +105,10 @@ defs: until: "{{ THOUGHT.endswith('Act:') or iterations>20 }}" - def: action_raw model: "{{ model }}" - params: + parameters: + time_limit: 60000 + random_seed: 42 + truncate_input_tokens: 8191 decoding_method: sample repetition_penalty: 1.1 temperature: "{{ temperature }}" @@ -113,7 +119,10 @@ defs: - "[" - def: SUBJECT model: "{{ model }}" - params: + parameters: + time_limit: 60000 + random_seed: 42 + truncate_input_tokens: 8191 decoding_method: sample repetition_penalty: 1.1 temperature: "{{ temperature }}" @@ -130,7 +139,12 @@ defs: subject: "{{ SUBJECT }}" - "\nTho:" - model: "{{ model }}" - params: + fallback: "Error calling model" + parameters: + time_limit: 60000 + random_seed: 42 + truncate_input_tokens: 8191 + repetition_penalty: 1.1 decoding_method: sample temperature: "{{ temperature }}" stop_sequences: ["\n", "Act:", "Obs:", "Tho:"] @@ -178,21 +192,21 @@ defs: - repeat: - def: THOUGHT model: "{{ model }}" - params: - DECODING_METHOD: "{{ decoding_method }}" - TEMPERATURE: "{{ temperature }}" - STOP_SEQUENCES: ["\n", "Act:", "Obs:", "Tho:"] - INCLUDE_STOP_SEQUENCE: true + parameters: + decoding_method: "{{ decoding_method }}" + temperature: "{{ temperature }}" + stop_sequences: ["\n", "Act:", "Obs:", "Tho:"] + include_stop_sequence: true until: "{{ THOUGHT.endswith('Act:') }}" - def: action model: "{{ model }}" parser: json spec: {name: str, arguments: obj} - params: - DECODING_METHOD: "{{ decoding_method }}" - TEMPERATURE: "{{ temperature }}" - STOP_SEQUENCES: ["\n", "<|endoftext|>"] - INCLUDE_STOP_SEQUENCE: false + parameters: + decoding_method: "{{ decoding_method }}" + temperature: "{{ temperature }}" + stop_sequences: ["\n", "<|endoftext|>"] + include_stop_sequence: false - if: "{{ action.name != 'Finish' }}" then: - "\nObs: " @@ -202,11 +216,11 @@ defs: args: arguments: "{{ action.arguments }}" - model: "{{ model }}" - params: - DECODING_METHOD: "{{ decoding_method }}" - TEMPERATURE: "{{ temperature }}" - STOP_SEQUENCES: ["\n", "Act:", "Obs:", "Tho:"] - INCLUDE_STOP_SEQUENCE: false + parameters: + decoding_method: "{{ decoding_method }}" + temperature: "{{ temperature }}" + stop_sequences: ["\n", "Act:", "Obs:", "Tho:"] + include_stop_sequence: false else: "Invalid action. Valid actions are {{ TOOL_INFO.signatures|join(', ') }} and Finish[]." 
until: "{{ action.name == 'Finish' }}" - show_result: false diff --git a/examples/prompt_library/ReWoo.pdl b/examples/prompt_library/ReWoo.pdl index bebd3b46..3d524a16 100644 --- a/examples/prompt_library/ReWoo.pdl +++ b/examples/prompt_library/ReWoo.pdl @@ -86,10 +86,11 @@ defs: {{ task }} - def: PLANS model: "{{ model }}" + platform: bam parser: # plan, step_name, tool, tool_input regex: 'Plan:\s*(?P(?:.|\n)*?)\s*(?P#E\d+)\s*=\s*(?P\w+)\s*\[(?P[^\]]+)\]' mode: findall - params: + parameters: decoding_method: greedy stop_sequences: - "<|endoftext|>" @@ -152,7 +153,7 @@ defs: Response: - def: SOLUTION model: "{{ model }}" - params: + parameters: decoding_method: greedy stop_sequences: - "<|endoftext|>" diff --git a/examples/prompt_library/demos/Verifier_json.pdl b/examples/prompt_library/demos/Verifier_json.pdl index 1ecd30b0..9731dedd 100644 --- a/examples/prompt_library/demos/Verifier_json.pdl +++ b/examples/prompt_library/demos/Verifier_json.pdl @@ -9,14 +9,14 @@ document: tools: "{{ default_tools }}" tool_names: ["Search"] - def: QUESTION + show_result: false read: message: "Please enter a question: " - def: PROPOSED call: react_json + show_result: true args: - context: - - role: system - content: "{{ granite_models.granite_7b_lab.system_prompt }}" + context: "{{ granite_models.granite_7b_lab.system_prompt }}" question: "{{ QUESTION }}" model: ibm/granite-7b-lab tools: "{{ filtered_tools }}" @@ -24,15 +24,16 @@ document: - "\n\n----- Verifying answer... -----\n\n" - def: VERIFIED call: react_json + show_result: true args: - context: [{"role": "system", "content": ""}] + context: "" question: |- Is this the right answer to this question? "{{ QUESTION }}" Proposed answer: {{ PROPOSED.answer.topic }} Please answer as True or False. - model: ibm/granite-34b-code-instruct + model: "{{ granite_models.granite_20b_code_instruct_v2.id }}" tools: "{{ filtered_tools }}" trajectories: [] - "\n\nThe answer '{{ PROPOSED.answer.topic }}' has been verified as '{{VERIFIED.answer.topic}}'.\n" diff --git a/examples/prompt_library/demos/evalplus/evalplus.pdl b/examples/prompt_library/demos/evalplus/evalplus.pdl index fd3a4b4d..5af64dfe 100644 --- a/examples/prompt_library/demos/evalplus/evalplus.pdl +++ b/examples/prompt_library/demos/evalplus/evalplus.pdl @@ -1,15 +1,15 @@ description: EvalPlus simple defs: - demonstrations: - data: - - answer: "def max_run_uppercase(test_str):\n cnt = 0\n res = 0\n for idx in range(0,\ - \ len(test_str)):\n if test_str.isupper():\n cnt += 1\n else:\n\ - \ res = cnt\n cnt = 0\n if test_str.isupper():\n\ - \ res = cnt\n return (res)" - question: Write a function to find maximum run of uppercase characters in the given - string. - - answer: "def remove_length(test_str, K):\n temp = test_str.split()\n res = \n res = ' '.join(res)\n return (res) " - question: Write a function to remove all the words with k length in the given string. + # demonstrations: + # data: + # - answer: "def max_run_uppercase(test_str):\n cnt = 0\n res = 0\n for idx in range(0,\ + # \ len(test_str)):\n if test_str.isupper():\n cnt += 1\n else:\n\ + # \ res = cnt\n cnt = 0\n if test_str.isupper():\n\ + # \ res = cnt\n return (res)" + # question: Write a function to find maximum run of uppercase characters in the given + # string. + # - answer: "def remove_length(test_str, K):\n temp = test_str.split()\n res = \n res = ' '.join(res)\n return (res) " + # question: Write a function to remove all the words with k length in the given string. 
model: ibm/granite-34b-code-instruct prompt_pattern: cot prompt: Write a python function to toggle bits of the number except the first and the last bit. @@ -23,12 +23,12 @@ document: {{ example.answer }} - | - Task: {{ prompt }} - Solution: -- model: "{{ model }}" - platform: bam - parameters: - decoding_method: "greedy" - stop_sequences: - - "<|endoftext|>" - include_stop_sequence: false \ No newline at end of file + Task: {{ prompt }} + Solution: +# - model: "{{ model }}" +# platform: bam +# parameters: +# decoding_method: "greedy" +# stop_sequences: +# - "<|endoftext|>" +# include_stop_sequence: false \ No newline at end of file diff --git a/examples/prompt_library/demos/evalplus/react.pdl b/examples/prompt_library/demos/evalplus/react.pdl new file mode 100644 index 00000000..a93df0f1 --- /dev/null +++ b/examples/prompt_library/demos/evalplus/react.pdl @@ -0,0 +1,158 @@ +description: PDL implementation of the ReAct agent for MBPP in MINT +defs: + model: "ibm/granite-34b-code-instruct" + prompt_pattern: react + # prompt: |- + # Write a python function to identify non-prime numbers. +# assert is_not_prime(2) == False +# assert is_not_prime(10) == True +# assert is_not_prime(35) == True +# assert is_not_prime(37) == False +document: + - | + Task: + Write a function to find the similar elements from the given two tuple lists. + assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5) + + Assistant: + The similar elements here are just elements that are in both lists. I should run a solution on the test case before proposing a solution. + + def similar_elements(test_tup1, test_tup2): + res = tuple(set(test_tup1) | set(test_tup2)) + return res + res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10)) + assert res == (4, 5), "Expected (4, 5) but got {}".format(res) + + + Observation: + --------------------------------------------------------------------------- + AssertionError Traceback (most recent call last) + Cell In[4], line 5 + 3 return res + 4 res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10)) + ----> 5 assert res == (4, 5), "Expected (4, 5) but got {}".format(res) + + AssertionError: Expected (4, 5) but got (3, 4, 5, 6, 7, 10) + + Assistant: + To find the similar elements, I should find the intersection of the two sets. I should use the & operator instead of the | operator. + + def similar_elements(test_tup1, test_tup2): + res = tuple(set(test_tup1) & set(test_tup2)) + return res + res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10)) + assert res == (4, 5), "Expected (4, 5) but got {}".format(res) + + + Observation: + [Executed Successfully with No Output] + + Assistant: + There is no more AssertionError. I can now submit the solution. 
+ + def similar_elements(test_tup1, test_tup2): + res = tuple(set(test_tup1) & set(test_tup2)) + return res + + - "\n" + - | + Task: + {{ prompt|trim }} + + Assistant: + + - defs: + temperature: 0.7 + decoding_method: "sample" + SOLUTION: "" + iterations: 0 + max_iterations: 5 + repeat: + - repeat: + - defs: + iterations: "{{ iterations+1 }}" + - def: THOUGHT + model: "{{ model }}" + parameters: + truncate_input_tokens: 8191 + repetition_penalty: 1.05 + decoding_method: "{{ decoding_method }}" + temperature: "{{ temperature }}" + stop_sequences: ["", "", "<|endoftext|>"] + include_stop_sequence: true + - if: "{{ THOUGHT is none }}" + then: + - def: THOUGHT + document: "" + until: "{{ THOUGHT.endswith('') or THOUGHT.endswith('') or iterations>max_iterations }}" + - if: "{{ THOUGHT.endswith('') }}" + then: + - "\n\nObservation {{ iterations}}:\n" + - lan: ipython + show_result: false + def: observation + code: | + {{ THOUGHT.split('')[-1].strip('') or THOUGHT }} + - "{{ (observation|trim|trim('\n')) if observation is not none else '' }}" + - "\n\nAssistant {{ iterations}}:\n" + - if: "{{ THOUGHT.endswith('') }}" + then: + - defs: + SOLUTION: "{{ THOUGHT.split('')[-1].strip('') or THOUGHT }}" + + until: "{{ SOLUTION != '' or iterations>max_iterations }}" + - if: "{{ iterations>max_iterations }}" + then: "[ Failed to find a solution within {{ max_iterations }} iterations. ]" + - "\nSolution:\n{{ SOLUTION|trim }}" + +# WITH FEEDBACK +# Task: +# Write a function to find the similar elements from the given two tuple lists. +# assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5) + +# Assistant: +# The similar elements here are just elements that are in both lists. I should run a solution on the test case before proposing a solution. +# +# def similar_elements(test_tup1, test_tup2): +# res = tuple(set(test_tup1) | set(test_tup2)) +# return res +# res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10)) +# assert res == (4, 5), "Expected (4, 5) but got {}".format(res) +# + +# Observation: +# --------------------------------------------------------------------------- +# AssertionError Traceback (most recent call last) +# Cell In[4], line 5 +# 3 return res +# 4 res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10)) +# ----> 5 assert res == (4, 5), "Expected (4, 5) but got {}".format(res) + +# AssertionError: Expected (4, 5) but got (3, 4, 5, 6, 7, 10) + +# Expert feedback: +# This is BAD. You should not use the | operator which is the union operator. + +# Assistant: +# To find the similar elements, I should find the intersection of the two sets. I should use the & operator instead of the | operator. +# +# def similar_elements(test_tup1, test_tup2): +# res = tuple(set(test_tup1) & set(test_tup2)) +# return res +# res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10)) +# assert res == (4, 5), "Expected (4, 5) but got {}".format(res) +# + +# Observation: +# [Executed Successfully with No Output] + +# Expert feedback: +# This is GOOD. You are doing great! You can try to submit the solution now. + +# Assistant: +# There is no more AssertionError. I can now submit the solution. 
+# +# def similar_elements(test_tup1, test_tup2): +# res = tuple(set(test_tup1) & set(test_tup2)) +# return res +# diff --git a/pdl/optimize/config_parser.py b/pdl/optimize/config_parser.py index a46449ea..a51a7782 100644 --- a/pdl/optimize/config_parser.py +++ b/pdl/optimize/config_parser.py @@ -1,13 +1,14 @@ import argparse from pathlib import Path from typing import Literal -from pydantic import BaseModel, Field + import yaml +from pydantic import BaseModel, Field class OptimizationConfig(BaseModel): benchmark: Literal[ - "gsm8k", "gsm8k-baseline", "gsm8k-bench", "fever", "evalplus" + "gsm8k", "gsm8k-baseline", "gsm8k-bench", "fever", "evalplus", ] = Field() num_candidates: int = Field(default=30) num_demonstrations: int = Field(default=5) @@ -51,6 +52,6 @@ def get_variable_names(self) -> list[str]: print(config.get_variable_names()) Path("opticonfig1.yml").write_text( yaml.dump(config.model_dump( - exclude_defaults=False, exclude_none=False, exclude_unset=False - )) + exclude_defaults=False, exclude_none=False, exclude_unset=False, + )), ) diff --git a/pdl/optimize/mbpp_thread.py b/pdl/optimize/mbpp_thread.py index bf06560c..05e7c4e1 100644 --- a/pdl/optimize/mbpp_thread.py +++ b/pdl/optimize/mbpp_thread.py @@ -1,21 +1,13 @@ -import json -from pathlib import Path +import ast from typing import Any -from pdl.optimize.util import PDLThread -from pdl.pdl_ast import ScopeType -from pdl.pdl_interpreter import ( - empty_scope, -) -from evalplus.data import get_mbpp_plus, write_jsonl from evalplus.evaluate import ( check_correctness, - get_mbpp_plus_hash, - get_groundtruth, - MBPP_OUTPUT_NOT_NONE_TASKS, ) -from collections import Counter, defaultdict -from concurrent.futures import ProcessPoolExecutor, as_completed + +from pdl.optimize.util import PDLThread +from pdl.pdl_ast import ScopeType +from pdl.pdl_interpreter import empty_scope class MBPPTrialThread(PDLThread): @@ -25,7 +17,7 @@ def __init__( **kwargs, ): super().__init__(*args, **kwargs) - self.answer_key = "code" + self.answer_key = "canonical_solution" def get_scope(self) -> ScopeType: scope = empty_scope @@ -68,59 +60,80 @@ def get_scope(self) -> ScopeType: scope["prompt"] = self.example["prompt"] scope["task_id"] = self.example["task_id"] - - # with ProcessPoolExecutor(max_workers=1) as executor: - # eval_results = defaultdict(list) # task_id -> - # future = executor.submit( - # check_correctness, - # dataset="mbpp", - # completion_id=0, - # problem=mbpp_plus[task_id], - # solution=mbpp_plus[task_id]["canonical_solution"], - # expected_output=expected_output[task_id], - # base_only=False, - # fast_check=True, - # identifier=task_id + " line(1 in x)", - # min_time_limit=1, - # gt_time_limit_factor=4.0, - # ) - # print(future.result()) - return scope def extract_answer(self, document: str) -> bool: - response = document.split("Solution:\n")[-1] - - # solution = { - # "task_id": f"Mbpp/{self.example['task_id']}", - # "completion": response, - # } - # with Path(f"Mbpp/{self.example['task_id']}").open("w") as f: - # json.dump(solution, f) - - return response + return document.split("Solution:\n")[-1] def answer_correct(self, document: str, answer: Any, truth: Any) -> bool: - task_id = f"Mbpp/{self.example['task_id']}" - mbpp_plus = get_mbpp_plus() - - dataset_hash = get_mbpp_plus_hash() - expected_output = get_groundtruth( - mbpp_plus, - dataset_hash, - MBPP_OUTPUT_NOT_NONE_TASKS, + if answer is None or not isinstance(answer, str): + return False + + try: + ast.parse(answer) + except Exception as e: + print(e) + return False + + 
task_id = self.example["task_id"] + + solution = ( + self.example["prompt"] + answer ) + result = check_correctness( - dataset="mbpp", - completion_id=self.index, - problem=mbpp_plus[task_id], - solution=answer, #mbpp_plus[task_id]["canonical_solution"], - expected_output=expected_output[task_id], - base_only=False, - fast_check=True, - identifier=task_id + " line(1 in x)", - min_time_limit=1, - gt_time_limit_factor=4.0, - ) - print(result) - return result["base"][0] == 'pass' and result["plus"][0] == 'pass' + dataset="mbpp", + completion_id=self.index, + problem=self.example, + solution=solution, + expected_output=self.example["expected_output"], + base_only=False, + fast_check=False, + identifier=task_id + " line(1 in x)", + min_time_limit=1, # 1 + gt_time_limit_factor=4.0, # 4.0 + ) + + def get_failed_tests(stat, details, inputs, expected): + if stat == "pass" or not details: + return [] + + return [ + { + "inputs": inputs[i], + "expected_outputs": expected[i], + } + for i in range(len(details)) + if not details[i] + ] + + # else => simply return the only and the last fail test + # return [inputs[len(details) - 1]] + + base_stat, base_details = result["base"] + get_failed_tests( + base_stat, + base_details, + self.example["base_input"], + self.example["expected_output"].get("base"), + ) + # if len(base_fail_tests) > 0: + # print(base_fail_tests) + + plus_stat, plus_details = result["plus"] + get_failed_tests( + plus_stat, + plus_details, + self.example["plus_input"], + self.example["expected_output"].get("plus"), + ) + # if len(plus_fail_tests) > 0: + # print(self.example["task_id"], plus_fail_tests) + return result["base"][0] == "pass" and result["plus"][0] == "pass" + # if not passing_both: + # print( + # "FAIL", + # self.example["task_id"], + # self.example["canonical_solution"], + # answer, + # ) diff --git a/pdl/optimize/optimize.py b/pdl/optimize/optimize.py index 44ce08a1..0f172ce2 100644 --- a/pdl/optimize/optimize.py +++ b/pdl/optimize/optimize.py @@ -2,8 +2,13 @@ from enum import Enum from pathlib import Path -from datasets import load_from_disk, Dataset, load_dataset import yaml +from datasets import load_dataset, load_from_disk +from evalplus.evaluate import ( + MBPP_OUTPUT_NOT_NONE_TASKS, + get_groundtruth, + get_mbpp_plus_hash, +) from pdl.optimize.config_parser import OptimizationConfig from pdl.optimize.mbpp_thread import MBPPTrialThread @@ -85,14 +90,49 @@ class SamplingMethods(Enum): config=config, ).run() elif config.benchmark == "evalplus": - # from evalplus.data import get_mbpp_plus - # mbpp_plus = get_mbpp_plus() + from copy import deepcopy - mbpp = load_dataset("google-research-datasets/mbpp", name="sanitized") + from datasets import concatenate_datasets + from evalplus.data import get_mbpp_plus, get_mbpp_plus_hash + + class SelectableList(list): + def select(self, iterable): + return [self[i] for i in iterable] + + class MBPPDataset(dict): + def __init__(self): + self.mbpp_plus = get_mbpp_plus() + self.dataset_hash = get_mbpp_plus_hash() + + expected_outputs = get_groundtruth( + deepcopy(self.mbpp_plus), + self.dataset_hash, + MBPP_OUTPUT_NOT_NONE_TASKS, + ) + + self.mbpp = load_dataset("google-research-datasets/mbpp", name="full") + self["train"] = concatenate_datasets( + self.mbpp.filter( + lambda x: f"Mbpp/{x['task_id']}" not in self.mbpp_plus, + ) + .rename_columns({"code": "canonical_solution", "text": "prompt"}) + .values(), + ) + self["test"] = SelectableList([v for k, v in self.mbpp_plus.items()]) + for i, x in enumerate(self["test"]): + 
self["test"][i]["expected_output"] = expected_outputs[x["task_id"]] + + def __getitem__(self, key): + return dict.__getitem__(self, key) + + def __setitem__(self, key, val): + dict.__setitem__(self, key, val) + + mbpp_dataset = MBPPDataset() PDLOptimizer( pdl_path=args.pdl_file, - dataset=mbpp, + dataset=mbpp_dataset, trial_thread=MBPPTrialThread, yield_output=args.yield_output, experiment_path=Path("experiments"), diff --git a/pdl/optimize/pdl_optimizer.py b/pdl/optimize/pdl_optimizer.py index 7a53a8e6..bbf94b4e 100644 --- a/pdl/optimize/pdl_optimizer.py +++ b/pdl/optimize/pdl_optimizer.py @@ -1,16 +1,14 @@ -from enum import Enum import json -import random import string import sys import time -from collections.abc import Callable +from enum import Enum from math import ceil, log2 from pathlib import Path -from typing import Type import yaml -from datasets import Dataset, DatasetDict +from datasets import DatasetDict +from duration_parser import parse as parse_duration from numpy.random import default_rng from rich.table import Table from tqdm.rich import tqdm @@ -18,7 +16,6 @@ from pdl.optimize.config_parser import OptimizationConfig from pdl.optimize.util import ( CandidateResult, - Models, PDLThread, TrialOutput, console, @@ -26,7 +23,6 @@ ) from pdl.pdl_ast import DataBlock, Program from pdl.pdl_dumper import dump_program -from duration_parser import parse as parse_duration rng = default_rng() @@ -54,7 +50,7 @@ def __init__( # ending_test_set_size: int, # max_candidates: int, # timeout: int, - trial_thread: Type[PDLThread], + trial_thread: type[PDLThread], # budget_growth: str, # test_set: str, # train_set: str, @@ -120,9 +116,7 @@ def parse_budget(self): self.time_budget = duration def load_pdl(self, path: Path) -> Program: - with ( - path.open(encoding="utf-8") as pdl, - ): + with (path.open(encoding="utf-8") as pdl,): return Program.model_validate(yaml.safe_load(pdl)) def parse_signature(self): @@ -147,9 +141,7 @@ def sample_random_index(self, items: list): def random_uuid(self, k: int = 8) -> str: alphabet = string.ascii_lowercase + string.digits - return "".join( - rng.choice(list(alphabet), size=k) - ) + return "".join(rng.choice(list(alphabet), size=k)) def sample_candidates(self, num_candidates: int, demo_indices: list | None = None): demo_name = self.config.demonstrations_variable_name @@ -157,7 +149,7 @@ def sample_candidates(self, num_candidates: int, demo_indices: list | None = Non for _ in range(num_candidates): if demo_indices is None: demo_indices = self.sample_random_indices( - self.dataset[self.train_set_name], size=self.num_demonstrations + self.dataset[self.train_set_name], size=self.num_demonstrations, ) variable_instance = { k: self.sample_random_index(v) for k, v in self.config.variables.items() @@ -290,7 +282,7 @@ def run(self): test_set_indices = list( range( min(len(self.dataset[self.test_set_name]), ending_test_set_size), - ) + ), ) start_time = time.time() @@ -463,14 +455,16 @@ def evaluate( console.log("Progressed on exception") console.log(result) elif isinstance(result, TrialOutput): - answer = ( + ( round(result.answer, 2) if isinstance(result.answer, float) else result.answer ) - console.log( - f"Answer: {answer} Ground truth: {result.groundtruth} Match: {result.correct}", - ) + if result.correct is not True: + console.log("FAIL", result.example["task_id"]) + # console.log( + # f"Answer: {answer} Ground truth: {result.groundtruth} Match: {result.correct}", + # ) results.append(result) diff --git a/pdl/optimize/util.py b/pdl/optimize/util.py index 
23d61175..d86a364a 100644 --- a/pdl/optimize/util.py +++ b/pdl/optimize/util.py @@ -7,8 +7,8 @@ from typing import Any import yaml -from rich.console import Console from datasets import Dataset +from rich.console import Console from pdl.optimize.bam_logprobs import ModelResponse, get_seq_logprobs from pdl.optimize.config_parser import OptimizationConfig @@ -99,16 +99,16 @@ def run( self.pdl_program, timeout=self.timeout, ) - console.log("result", result) + # console.log("result", result) self.scope = scope end_time = time.time() runtime = end_time - start_time - console.log(f"Runtime took seconds: {runtime:.2f}") + # console.log(f"Runtime took seconds: {runtime:.2f}") tries += 1 - if DEBUG: - console.log("DEBUG:", document) + # if DEBUG: + # console.log("DEBUG:", document) errored = contains_error(trace) if errored: @@ -162,6 +162,7 @@ def run( answer=answer, groundtruth=truth, runtime=runtime, + example=self.example, ) @@ -177,6 +178,7 @@ class TrialOutput: answer: str | None = None groundtruth: str | None = None runtime: int | None = None + example: Any = None def to_dict(self) -> dict: return { @@ -227,6 +229,7 @@ def execute_threads(max_threads: int, pdl_threads: list, timeout: int | None = N console.log("Running without parallelism") for job in pdl_threads: yield job.run() + return service = ThreadPoolExecutor(max_workers=max_threads) future_to_trial = {service.submit(thread.run): thread for thread in pdl_threads} diff --git a/pdl/pdl_ast.py b/pdl/pdl_ast.py index 7d88715c..25822453 100644 --- a/pdl/pdl_ast.py +++ b/pdl/pdl_ast.py @@ -113,7 +113,7 @@ class Block(BaseModel): """ # Fields for internal use result: Optional[Any] = None - location: Optional[LocationType] = None + location: Optional[LocationType] = Field(default=None, exclude=True) has_error: bool = False @@ -279,7 +279,7 @@ class CodeBlock(Block): """Execute a piece of code.""" kind: Literal[BlockKind.CODE] = BlockKind.CODE - lan: Literal["python"] + lan: Literal["python", "ipython"] """Programming language of the code. """ code: "BlocksType" diff --git a/pdl/pdl_scheduler.py b/pdl/pdl_scheduler.py index be7e9f72..7683e74a 100644 --- a/pdl/pdl_scheduler.py +++ b/pdl/pdl_scheduler.py @@ -103,7 +103,6 @@ def schedule( end_time = time.time() runtime = end_time - start_time if timeout and timeout < runtime: - # print("RUNTIME over 15", runtime) raise TimeoutError(f"Out of time. {runtime}") todo = todo_next todo_next = []
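
For reference, a minimal standalone sketch of the evalplus correctness check that the patched MBPPTrialThread.answer_correct relies on. It only rearranges calls that already appear in the diff (get_mbpp_plus, get_groundtruth, get_mbpp_plus_hash, check_correctness); the variable candidate_code, the choice of the first task id, and the simplified identifier are illustrative stand-ins rather than part of the patch, and the snippet assumes evalplus and its MBPP+ data are installed.

import ast
from copy import deepcopy

from evalplus.data import get_mbpp_plus
from evalplus.evaluate import (
    MBPP_OUTPUT_NOT_NONE_TASKS,
    check_correctness,
    get_groundtruth,
    get_mbpp_plus_hash,
)

# Load the MBPP+ problems and their ground-truth outputs, as optimize.py now does.
problems = get_mbpp_plus()
expected = get_groundtruth(
    deepcopy(problems), get_mbpp_plus_hash(), MBPP_OUTPUT_NOT_NONE_TASKS
)

# Illustrative stand-ins: take the first task and reuse its reference solution
# as if it were a model-generated answer.
task_id = next(iter(problems))
problem = problems[task_id]
candidate_code = problem["canonical_solution"]

# Cheap syntax gate before the slower sandboxed run, as in answer_correct.
ast.parse(candidate_code)

result = check_correctness(
    dataset="mbpp",
    completion_id=0,
    problem=problem,
    solution=problem["prompt"] + candidate_code,  # answer_correct prepends the task prompt
    expected_output=expected[task_id],
    base_only=False,
    fast_check=False,
    identifier=task_id,  # the patch appends a debug suffix to this identifier
    min_time_limit=1,
    gt_time_limit_factor=4.0,
)

# A trial counts as correct only if both the base and the plus test sets pass.
passed = result["base"][0] == "pass" and result["plus"][0] == "pass"
print(task_id, "pass" if passed else "fail")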