From b41d7af75a2c260da928a9266f99783eafcbbc2b Mon Sep 17 00:00:00 2001 From: Claudio Spiess Date: Wed, 4 Sep 2024 10:22:11 -0400 Subject: [PATCH] Begin merging optimizer --- examples/prompt_library/CoT.pdl | 77 +++++---- examples/prompt_library/ReAct.pdl | 52 +++--- examples/prompt_library/ReWoo.pdl | 5 +- .../prompt_library/demos/Verifier_json.pdl | 11 +- .../demos/evalplus/evalplus.pdl | 38 ++--- .../prompt_library/demos/evalplus/react.pdl | 158 ++++++++++++++++++ pdl/optimize/config_parser.py | 9 +- pdl/optimize/mbpp_thread.py | 139 ++++++++------- pdl/optimize/optimize.py | 50 +++++- pdl/optimize/pdl_optimizer.py | 34 ++-- pdl/optimize/util.py | 13 +- pdl/pdl_ast.py | 4 +- pdl/pdl_scheduler.py | 1 - 13 files changed, 415 insertions(+), 176 deletions(-) create mode 100644 examples/prompt_library/demos/evalplus/react.pdl diff --git a/examples/prompt_library/CoT.pdl b/examples/prompt_library/CoT.pdl index b6c27126..de7b5077 100644 --- a/examples/prompt_library/CoT.pdl +++ b/examples/prompt_library/CoT.pdl @@ -31,17 +31,17 @@ defs: model: str answer: str return: - - |- - Question: {{question}} - - Answer: Let's think step by step. - - model: "{{ model }}" - params: - decoding_method: "greedy" - stop_sequences: - - "The answer is" - include_stop_sequence: false - - "The answer is {{ answer }}." + - |- + Question: {{ question }} + Answer: Let's think step by step. + - model: "{{ model }}" + platform: bam + parameters: + decoding_method: "greedy" + stop_sequences: + - "The answer is" + include_stop_sequence: false + - "The answer is {{ answer }}." fewshot_cot: function: @@ -65,24 +65,39 @@ defs: examples: { list: { obj: { question: str, reasoning: str, answer: str } } } return: - - call: fewshot_cot - args: - examples: "{{ examples }}" - - |- - Question: {{question}} + - call: fewshot_cot + args: + examples: "{{ examples }}" + block_style: "question_cot" + - | + Question: {{ question }} + Answer: Let's think step by step. + - model: "{{ model }}" + platform: bam + parameters: + decoding_method: "greedy" + stop_sequences: + - "<|endoftext|>" + include_stop_sequence: false - Answer: Let's think step by step. - - model: "{{ model }}" - params: - decoding_method: "greedy" - stop_sequences: - - "The answer is" - include_stop_sequence: false - - "The answer is " - - def: answer - model: "{{ model }}" - params: - decoding_method: "greedy" - - "\n\nJSON Output: " - - data: - answer: "{{ answer|trim }}" \ No newline at end of file + chain_of_thought_claim: + function: + question: str + model: str + examples: + { list: { obj: { question: str, reasoning: str, answer: str } } } + return: + - call: fewshot_cot + args: + examples: "{{ examples }}" + block_style: "claim_cot" + - | + {{ question }} + Thought: Let's think step by step. 
+ - model: "{{ model }}" + platform: bam + parameters: + decoding_method: "greedy" + stop_sequences: + - "<|endoftext|>" + include_stop_sequence: false diff --git a/examples/prompt_library/ReAct.pdl b/examples/prompt_library/ReAct.pdl index c4428271..a889bf1f 100644 --- a/examples/prompt_library/ReAct.pdl +++ b/examples/prompt_library/ReAct.pdl @@ -93,7 +93,10 @@ defs: iterations: "{{ iterations+1 }}" - def: THOUGHT model: "{{ model }}" - params: + parameters: + time_limit: 60000 + random_seed: 42 + truncate_input_tokens: 8191 decoding_method: sample repetition_penalty: 1.1 temperature: "{{ temperature }}" @@ -102,7 +105,10 @@ defs: until: "{{ THOUGHT.endswith('Act:') or iterations>20 }}" - def: action_raw model: "{{ model }}" - params: + parameters: + time_limit: 60000 + random_seed: 42 + truncate_input_tokens: 8191 decoding_method: sample repetition_penalty: 1.1 temperature: "{{ temperature }}" @@ -113,7 +119,10 @@ defs: - "[" - def: SUBJECT model: "{{ model }}" - params: + parameters: + time_limit: 60000 + random_seed: 42 + truncate_input_tokens: 8191 decoding_method: sample repetition_penalty: 1.1 temperature: "{{ temperature }}" @@ -130,7 +139,12 @@ defs: subject: "{{ SUBJECT }}" - "\nTho:" - model: "{{ model }}" - params: + fallback: "Error calling model" + parameters: + time_limit: 60000 + random_seed: 42 + truncate_input_tokens: 8191 + repetition_penalty: 1.1 decoding_method: sample temperature: "{{ temperature }}" stop_sequences: ["\n", "Act:", "Obs:", "Tho:"] @@ -178,21 +192,21 @@ defs: - repeat: - def: THOUGHT model: "{{ model }}" - params: - DECODING_METHOD: "{{ decoding_method }}" - TEMPERATURE: "{{ temperature }}" - STOP_SEQUENCES: ["\n", "Act:", "Obs:", "Tho:"] - INCLUDE_STOP_SEQUENCE: true + parameters: + decoding_method: "{{ decoding_method }}" + temperature: "{{ temperature }}" + stop_sequences: ["\n", "Act:", "Obs:", "Tho:"] + include_stop_sequence: true until: "{{ THOUGHT.endswith('Act:') }}" - def: action model: "{{ model }}" parser: json spec: {name: str, arguments: obj} - params: - DECODING_METHOD: "{{ decoding_method }}" - TEMPERATURE: "{{ temperature }}" - STOP_SEQUENCES: ["\n", "<|endoftext|>"] - INCLUDE_STOP_SEQUENCE: false + parameters: + decoding_method: "{{ decoding_method }}" + temperature: "{{ temperature }}" + stop_sequences: ["\n", "<|endoftext|>"] + include_stop_sequence: false - if: "{{ action.name != 'Finish' }}" then: - "\nObs: " @@ -202,11 +216,11 @@ defs: args: arguments: "{{ action.arguments }}" - model: "{{ model }}" - params: - DECODING_METHOD: "{{ decoding_method }}" - TEMPERATURE: "{{ temperature }}" - STOP_SEQUENCES: ["\n", "Act:", "Obs:", "Tho:"] - INCLUDE_STOP_SEQUENCE: false + parameters: + decoding_method: "{{ decoding_method }}" + temperature: "{{ temperature }}" + stop_sequences: ["\n", "Act:", "Obs:", "Tho:"] + include_stop_sequence: false else: "Invalid action. Valid actions are {{ TOOL_INFO.signatures|join(', ') }} and Finish[]." 
until: "{{ action.name == 'Finish' }}" - show_result: false diff --git a/examples/prompt_library/ReWoo.pdl b/examples/prompt_library/ReWoo.pdl index bebd3b46..3d524a16 100644 --- a/examples/prompt_library/ReWoo.pdl +++ b/examples/prompt_library/ReWoo.pdl @@ -86,10 +86,11 @@ defs: {{ task }} - def: PLANS model: "{{ model }}" + platform: bam parser: # plan, step_name, tool, tool_input regex: 'Plan:\s*(?P(?:.|\n)*?)\s*(?P#E\d+)\s*=\s*(?P\w+)\s*\[(?P[^\]]+)\]' mode: findall - params: + parameters: decoding_method: greedy stop_sequences: - "<|endoftext|>" @@ -152,7 +153,7 @@ defs: Response: - def: SOLUTION model: "{{ model }}" - params: + parameters: decoding_method: greedy stop_sequences: - "<|endoftext|>" diff --git a/examples/prompt_library/demos/Verifier_json.pdl b/examples/prompt_library/demos/Verifier_json.pdl index 1ecd30b0..9731dedd 100644 --- a/examples/prompt_library/demos/Verifier_json.pdl +++ b/examples/prompt_library/demos/Verifier_json.pdl @@ -9,14 +9,14 @@ document: tools: "{{ default_tools }}" tool_names: ["Search"] - def: QUESTION + show_result: false read: message: "Please enter a question: " - def: PROPOSED call: react_json + show_result: true args: - context: - - role: system - content: "{{ granite_models.granite_7b_lab.system_prompt }}" + context: "{{ granite_models.granite_7b_lab.system_prompt }}" question: "{{ QUESTION }}" model: ibm/granite-7b-lab tools: "{{ filtered_tools }}" @@ -24,15 +24,16 @@ document: - "\n\n----- Verifying answer... -----\n\n" - def: VERIFIED call: react_json + show_result: true args: - context: [{"role": "system", "content": ""}] + context: "" question: |- Is this the right answer to this question? "{{ QUESTION }}" Proposed answer: {{ PROPOSED.answer.topic }} Please answer as True or False. - model: ibm/granite-34b-code-instruct + model: "{{ granite_models.granite_20b_code_instruct_v2.id }}" tools: "{{ filtered_tools }}" trajectories: [] - "\n\nThe answer '{{ PROPOSED.answer.topic }}' has been verified as '{{VERIFIED.answer.topic}}'.\n" diff --git a/examples/prompt_library/demos/evalplus/evalplus.pdl b/examples/prompt_library/demos/evalplus/evalplus.pdl index fd3a4b4d..5af64dfe 100644 --- a/examples/prompt_library/demos/evalplus/evalplus.pdl +++ b/examples/prompt_library/demos/evalplus/evalplus.pdl @@ -1,15 +1,15 @@ description: EvalPlus simple defs: - demonstrations: - data: - - answer: "def max_run_uppercase(test_str):\n cnt = 0\n res = 0\n for idx in range(0,\ - \ len(test_str)):\n if test_str.isupper():\n cnt += 1\n else:\n\ - \ res = cnt\n cnt = 0\n if test_str.isupper():\n\ - \ res = cnt\n return (res)" - question: Write a function to find maximum run of uppercase characters in the given - string. - - answer: "def remove_length(test_str, K):\n temp = test_str.split()\n res = \n res = ' '.join(res)\n return (res) " - question: Write a function to remove all the words with k length in the given string. + # demonstrations: + # data: + # - answer: "def max_run_uppercase(test_str):\n cnt = 0\n res = 0\n for idx in range(0,\ + # \ len(test_str)):\n if test_str.isupper():\n cnt += 1\n else:\n\ + # \ res = cnt\n cnt = 0\n if test_str.isupper():\n\ + # \ res = cnt\n return (res)" + # question: Write a function to find maximum run of uppercase characters in the given + # string. + # - answer: "def remove_length(test_str, K):\n temp = test_str.split()\n res = \n res = ' '.join(res)\n return (res) " + # question: Write a function to remove all the words with k length in the given string. 
model: ibm/granite-34b-code-instruct prompt_pattern: cot prompt: Write a python function to toggle bits of the number except the first and the last bit. @@ -23,12 +23,12 @@ document: {{ example.answer }} - | - Task: {{ prompt }} - Solution: -- model: "{{ model }}" - platform: bam - parameters: - decoding_method: "greedy" - stop_sequences: - - "<|endoftext|>" - include_stop_sequence: false \ No newline at end of file + Task: {{ prompt }} + Solution: +# - model: "{{ model }}" +# platform: bam +# parameters: +# decoding_method: "greedy" +# stop_sequences: +# - "<|endoftext|>" +# include_stop_sequence: false \ No newline at end of file diff --git a/examples/prompt_library/demos/evalplus/react.pdl b/examples/prompt_library/demos/evalplus/react.pdl new file mode 100644 index 00000000..a93df0f1 --- /dev/null +++ b/examples/prompt_library/demos/evalplus/react.pdl @@ -0,0 +1,158 @@ +description: PDL implementation of the ReAct agent for MBPP in MINT +defs: + model: "ibm/granite-34b-code-instruct" + prompt_pattern: react + # prompt: |- + # Write a python function to identify non-prime numbers. +# assert is_not_prime(2) == False +# assert is_not_prime(10) == True +# assert is_not_prime(35) == True +# assert is_not_prime(37) == False +document: + - | + Task: + Write a function to find the similar elements from the given two tuple lists. + assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5) + + Assistant: + The similar elements here are just elements that are in both lists. I should run a solution on the test case before proposing a solution. + + def similar_elements(test_tup1, test_tup2): + res = tuple(set(test_tup1) | set(test_tup2)) + return res + res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10)) + assert res == (4, 5), "Expected (4, 5) but got {}".format(res) + + + Observation: + --------------------------------------------------------------------------- + AssertionError Traceback (most recent call last) + Cell In[4], line 5 + 3 return res + 4 res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10)) + ----> 5 assert res == (4, 5), "Expected (4, 5) but got {}".format(res) + + AssertionError: Expected (4, 5) but got (3, 4, 5, 6, 7, 10) + + Assistant: + To find the similar elements, I should find the intersection of the two sets. I should use the & operator instead of the | operator. + + def similar_elements(test_tup1, test_tup2): + res = tuple(set(test_tup1) & set(test_tup2)) + return res + res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10)) + assert res == (4, 5), "Expected (4, 5) but got {}".format(res) + + + Observation: + [Executed Successfully with No Output] + + Assistant: + There is no more AssertionError. I can now submit the solution. 
+ + def similar_elements(test_tup1, test_tup2): + res = tuple(set(test_tup1) & set(test_tup2)) + return res + + - "\n" + - | + Task: + {{ prompt|trim }} + + Assistant: + + - defs: + temperature: 0.7 + decoding_method: "sample" + SOLUTION: "" + iterations: 0 + max_iterations: 5 + repeat: + - repeat: + - defs: + iterations: "{{ iterations+1 }}" + - def: THOUGHT + model: "{{ model }}" + parameters: + truncate_input_tokens: 8191 + repetition_penalty: 1.05 + decoding_method: "{{ decoding_method }}" + temperature: "{{ temperature }}" + stop_sequences: ["", "", "<|endoftext|>"] + include_stop_sequence: true + - if: "{{ THOUGHT is none }}" + then: + - def: THOUGHT + document: "" + until: "{{ THOUGHT.endswith('') or THOUGHT.endswith('') or iterations>max_iterations }}" + - if: "{{ THOUGHT.endswith('') }}" + then: + - "\n\nObservation {{ iterations}}:\n" + - lan: ipython + show_result: false + def: observation + code: | + {{ THOUGHT.split('')[-1].strip('') or THOUGHT }} + - "{{ (observation|trim|trim('\n')) if observation is not none else '' }}" + - "\n\nAssistant {{ iterations}}:\n" + - if: "{{ THOUGHT.endswith('') }}" + then: + - defs: + SOLUTION: "{{ THOUGHT.split('')[-1].strip('') or THOUGHT }}" + + until: "{{ SOLUTION != '' or iterations>max_iterations }}" + - if: "{{ iterations>max_iterations }}" + then: "[ Failed to find a solution within {{ max_iterations }} iterations. ]" + - "\nSolution:\n{{ SOLUTION|trim }}" + +# WITH FEEDBACK +# Task: +# Write a function to find the similar elements from the given two tuple lists. +# assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5) + +# Assistant: +# The similar elements here are just elements that are in both lists. I should run a solution on the test case before proposing a solution. +# +# def similar_elements(test_tup1, test_tup2): +# res = tuple(set(test_tup1) | set(test_tup2)) +# return res +# res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10)) +# assert res == (4, 5), "Expected (4, 5) but got {}".format(res) +# + +# Observation: +# --------------------------------------------------------------------------- +# AssertionError Traceback (most recent call last) +# Cell In[4], line 5 +# 3 return res +# 4 res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10)) +# ----> 5 assert res == (4, 5), "Expected (4, 5) but got {}".format(res) + +# AssertionError: Expected (4, 5) but got (3, 4, 5, 6, 7, 10) + +# Expert feedback: +# This is BAD. You should not use the | operator which is the union operator. + +# Assistant: +# To find the similar elements, I should find the intersection of the two sets. I should use the & operator instead of the | operator. +# +# def similar_elements(test_tup1, test_tup2): +# res = tuple(set(test_tup1) & set(test_tup2)) +# return res +# res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10)) +# assert res == (4, 5), "Expected (4, 5) but got {}".format(res) +# + +# Observation: +# [Executed Successfully with No Output] + +# Expert feedback: +# This is GOOD. You are doing great! You can try to submit the solution now. + +# Assistant: +# There is no more AssertionError. I can now submit the solution. 
+# +# def similar_elements(test_tup1, test_tup2): +# res = tuple(set(test_tup1) & set(test_tup2)) +# return res +# diff --git a/pdl/optimize/config_parser.py b/pdl/optimize/config_parser.py index a46449ea..a51a7782 100644 --- a/pdl/optimize/config_parser.py +++ b/pdl/optimize/config_parser.py @@ -1,13 +1,14 @@ import argparse from pathlib import Path from typing import Literal -from pydantic import BaseModel, Field + import yaml +from pydantic import BaseModel, Field class OptimizationConfig(BaseModel): benchmark: Literal[ - "gsm8k", "gsm8k-baseline", "gsm8k-bench", "fever", "evalplus" + "gsm8k", "gsm8k-baseline", "gsm8k-bench", "fever", "evalplus", ] = Field() num_candidates: int = Field(default=30) num_demonstrations: int = Field(default=5) @@ -51,6 +52,6 @@ def get_variable_names(self) -> list[str]: print(config.get_variable_names()) Path("opticonfig1.yml").write_text( yaml.dump(config.model_dump( - exclude_defaults=False, exclude_none=False, exclude_unset=False - )) + exclude_defaults=False, exclude_none=False, exclude_unset=False, + )), ) diff --git a/pdl/optimize/mbpp_thread.py b/pdl/optimize/mbpp_thread.py index bf06560c..05e7c4e1 100644 --- a/pdl/optimize/mbpp_thread.py +++ b/pdl/optimize/mbpp_thread.py @@ -1,21 +1,13 @@ -import json -from pathlib import Path +import ast from typing import Any -from pdl.optimize.util import PDLThread -from pdl.pdl_ast import ScopeType -from pdl.pdl_interpreter import ( - empty_scope, -) -from evalplus.data import get_mbpp_plus, write_jsonl from evalplus.evaluate import ( check_correctness, - get_mbpp_plus_hash, - get_groundtruth, - MBPP_OUTPUT_NOT_NONE_TASKS, ) -from collections import Counter, defaultdict -from concurrent.futures import ProcessPoolExecutor, as_completed + +from pdl.optimize.util import PDLThread +from pdl.pdl_ast import ScopeType +from pdl.pdl_interpreter import empty_scope class MBPPTrialThread(PDLThread): @@ -25,7 +17,7 @@ def __init__( **kwargs, ): super().__init__(*args, **kwargs) - self.answer_key = "code" + self.answer_key = "canonical_solution" def get_scope(self) -> ScopeType: scope = empty_scope @@ -68,59 +60,80 @@ def get_scope(self) -> ScopeType: scope["prompt"] = self.example["prompt"] scope["task_id"] = self.example["task_id"] - - # with ProcessPoolExecutor(max_workers=1) as executor: - # eval_results = defaultdict(list) # task_id -> - # future = executor.submit( - # check_correctness, - # dataset="mbpp", - # completion_id=0, - # problem=mbpp_plus[task_id], - # solution=mbpp_plus[task_id]["canonical_solution"], - # expected_output=expected_output[task_id], - # base_only=False, - # fast_check=True, - # identifier=task_id + " line(1 in x)", - # min_time_limit=1, - # gt_time_limit_factor=4.0, - # ) - # print(future.result()) - return scope def extract_answer(self, document: str) -> bool: - response = document.split("Solution:\n")[-1] - - # solution = { - # "task_id": f"Mbpp/{self.example['task_id']}", - # "completion": response, - # } - # with Path(f"Mbpp/{self.example['task_id']}").open("w") as f: - # json.dump(solution, f) - - return response + return document.split("Solution:\n")[-1] def answer_correct(self, document: str, answer: Any, truth: Any) -> bool: - task_id = f"Mbpp/{self.example['task_id']}" - mbpp_plus = get_mbpp_plus() - - dataset_hash = get_mbpp_plus_hash() - expected_output = get_groundtruth( - mbpp_plus, - dataset_hash, - MBPP_OUTPUT_NOT_NONE_TASKS, + if answer is None or not isinstance(answer, str): + return False + + try: + ast.parse(answer) + except Exception as e: + print(e) + return False + + 
task_id = self.example["task_id"] + + solution = ( + self.example["prompt"] + answer ) + result = check_correctness( - dataset="mbpp", - completion_id=self.index, - problem=mbpp_plus[task_id], - solution=answer, #mbpp_plus[task_id]["canonical_solution"], - expected_output=expected_output[task_id], - base_only=False, - fast_check=True, - identifier=task_id + " line(1 in x)", - min_time_limit=1, - gt_time_limit_factor=4.0, - ) - print(result) - return result["base"][0] == 'pass' and result["plus"][0] == 'pass' + dataset="mbpp", + completion_id=self.index, + problem=self.example, + solution=solution, + expected_output=self.example["expected_output"], + base_only=False, + fast_check=False, + identifier=task_id + " line(1 in x)", + min_time_limit=1, # 1 + gt_time_limit_factor=4.0, # 4.0 + ) + + def get_failed_tests(stat, details, inputs, expected): + if stat == "pass" or not details: + return [] + + return [ + { + "inputs": inputs[i], + "expected_outputs": expected[i], + } + for i in range(len(details)) + if not details[i] + ] + + # else => simply return the only and the last fail test + # return [inputs[len(details) - 1]] + + base_stat, base_details = result["base"] + get_failed_tests( + base_stat, + base_details, + self.example["base_input"], + self.example["expected_output"].get("base"), + ) + # if len(base_fail_tests) > 0: + # print(base_fail_tests) + + plus_stat, plus_details = result["plus"] + get_failed_tests( + plus_stat, + plus_details, + self.example["plus_input"], + self.example["expected_output"].get("plus"), + ) + # if len(plus_fail_tests) > 0: + # print(self.example["task_id"], plus_fail_tests) + return result["base"][0] == "pass" and result["plus"][0] == "pass" + # if not passing_both: + # print( + # "FAIL", + # self.example["task_id"], + # self.example["canonical_solution"], + # answer, + # ) diff --git a/pdl/optimize/optimize.py b/pdl/optimize/optimize.py index 44ce08a1..0f172ce2 100644 --- a/pdl/optimize/optimize.py +++ b/pdl/optimize/optimize.py @@ -2,8 +2,13 @@ from enum import Enum from pathlib import Path -from datasets import load_from_disk, Dataset, load_dataset import yaml +from datasets import load_dataset, load_from_disk +from evalplus.evaluate import ( + MBPP_OUTPUT_NOT_NONE_TASKS, + get_groundtruth, + get_mbpp_plus_hash, +) from pdl.optimize.config_parser import OptimizationConfig from pdl.optimize.mbpp_thread import MBPPTrialThread @@ -85,14 +90,49 @@ class SamplingMethods(Enum): config=config, ).run() elif config.benchmark == "evalplus": - # from evalplus.data import get_mbpp_plus - # mbpp_plus = get_mbpp_plus() + from copy import deepcopy - mbpp = load_dataset("google-research-datasets/mbpp", name="sanitized") + from datasets import concatenate_datasets + from evalplus.data import get_mbpp_plus, get_mbpp_plus_hash + + class SelectableList(list): + def select(self, iterable): + return [self[i] for i in iterable] + + class MBPPDataset(dict): + def __init__(self): + self.mbpp_plus = get_mbpp_plus() + self.dataset_hash = get_mbpp_plus_hash() + + expected_outputs = get_groundtruth( + deepcopy(self.mbpp_plus), + self.dataset_hash, + MBPP_OUTPUT_NOT_NONE_TASKS, + ) + + self.mbpp = load_dataset("google-research-datasets/mbpp", name="full") + self["train"] = concatenate_datasets( + self.mbpp.filter( + lambda x: f"Mbpp/{x['task_id']}" not in self.mbpp_plus, + ) + .rename_columns({"code": "canonical_solution", "text": "prompt"}) + .values(), + ) + self["test"] = SelectableList([v for k, v in self.mbpp_plus.items()]) + for i, x in enumerate(self["test"]): + 
self["test"][i]["expected_output"] = expected_outputs[x["task_id"]] + + def __getitem__(self, key): + return dict.__getitem__(self, key) + + def __setitem__(self, key, val): + dict.__setitem__(self, key, val) + + mbpp_dataset = MBPPDataset() PDLOptimizer( pdl_path=args.pdl_file, - dataset=mbpp, + dataset=mbpp_dataset, trial_thread=MBPPTrialThread, yield_output=args.yield_output, experiment_path=Path("experiments"), diff --git a/pdl/optimize/pdl_optimizer.py b/pdl/optimize/pdl_optimizer.py index 7a53a8e6..bbf94b4e 100644 --- a/pdl/optimize/pdl_optimizer.py +++ b/pdl/optimize/pdl_optimizer.py @@ -1,16 +1,14 @@ -from enum import Enum import json -import random import string import sys import time -from collections.abc import Callable +from enum import Enum from math import ceil, log2 from pathlib import Path -from typing import Type import yaml -from datasets import Dataset, DatasetDict +from datasets import DatasetDict +from duration_parser import parse as parse_duration from numpy.random import default_rng from rich.table import Table from tqdm.rich import tqdm @@ -18,7 +16,6 @@ from pdl.optimize.config_parser import OptimizationConfig from pdl.optimize.util import ( CandidateResult, - Models, PDLThread, TrialOutput, console, @@ -26,7 +23,6 @@ ) from pdl.pdl_ast import DataBlock, Program from pdl.pdl_dumper import dump_program -from duration_parser import parse as parse_duration rng = default_rng() @@ -54,7 +50,7 @@ def __init__( # ending_test_set_size: int, # max_candidates: int, # timeout: int, - trial_thread: Type[PDLThread], + trial_thread: type[PDLThread], # budget_growth: str, # test_set: str, # train_set: str, @@ -120,9 +116,7 @@ def parse_budget(self): self.time_budget = duration def load_pdl(self, path: Path) -> Program: - with ( - path.open(encoding="utf-8") as pdl, - ): + with (path.open(encoding="utf-8") as pdl,): return Program.model_validate(yaml.safe_load(pdl)) def parse_signature(self): @@ -147,9 +141,7 @@ def sample_random_index(self, items: list): def random_uuid(self, k: int = 8) -> str: alphabet = string.ascii_lowercase + string.digits - return "".join( - rng.choice(list(alphabet), size=k) - ) + return "".join(rng.choice(list(alphabet), size=k)) def sample_candidates(self, num_candidates: int, demo_indices: list | None = None): demo_name = self.config.demonstrations_variable_name @@ -157,7 +149,7 @@ def sample_candidates(self, num_candidates: int, demo_indices: list | None = Non for _ in range(num_candidates): if demo_indices is None: demo_indices = self.sample_random_indices( - self.dataset[self.train_set_name], size=self.num_demonstrations + self.dataset[self.train_set_name], size=self.num_demonstrations, ) variable_instance = { k: self.sample_random_index(v) for k, v in self.config.variables.items() @@ -290,7 +282,7 @@ def run(self): test_set_indices = list( range( min(len(self.dataset[self.test_set_name]), ending_test_set_size), - ) + ), ) start_time = time.time() @@ -463,14 +455,16 @@ def evaluate( console.log("Progressed on exception") console.log(result) elif isinstance(result, TrialOutput): - answer = ( + ( round(result.answer, 2) if isinstance(result.answer, float) else result.answer ) - console.log( - f"Answer: {answer} Ground truth: {result.groundtruth} Match: {result.correct}", - ) + if result.correct is not True: + console.log("FAIL", result.example["task_id"]) + # console.log( + # f"Answer: {answer} Ground truth: {result.groundtruth} Match: {result.correct}", + # ) results.append(result) diff --git a/pdl/optimize/util.py b/pdl/optimize/util.py index 
23d61175..d86a364a 100644 --- a/pdl/optimize/util.py +++ b/pdl/optimize/util.py @@ -7,8 +7,8 @@ from typing import Any import yaml -from rich.console import Console from datasets import Dataset +from rich.console import Console from pdl.optimize.bam_logprobs import ModelResponse, get_seq_logprobs from pdl.optimize.config_parser import OptimizationConfig @@ -99,16 +99,16 @@ def run( self.pdl_program, timeout=self.timeout, ) - console.log("result", result) + # console.log("result", result) self.scope = scope end_time = time.time() runtime = end_time - start_time - console.log(f"Runtime took seconds: {runtime:.2f}") + # console.log(f"Runtime took seconds: {runtime:.2f}") tries += 1 - if DEBUG: - console.log("DEBUG:", document) + # if DEBUG: + # console.log("DEBUG:", document) errored = contains_error(trace) if errored: @@ -162,6 +162,7 @@ def run( answer=answer, groundtruth=truth, runtime=runtime, + example=self.example, ) @@ -177,6 +178,7 @@ class TrialOutput: answer: str | None = None groundtruth: str | None = None runtime: int | None = None + example: Any = None def to_dict(self) -> dict: return { @@ -227,6 +229,7 @@ def execute_threads(max_threads: int, pdl_threads: list, timeout: int | None = N console.log("Running without parallelism") for job in pdl_threads: yield job.run() + return service = ThreadPoolExecutor(max_workers=max_threads) future_to_trial = {service.submit(thread.run): thread for thread in pdl_threads} diff --git a/pdl/pdl_ast.py b/pdl/pdl_ast.py index 7d88715c..25822453 100644 --- a/pdl/pdl_ast.py +++ b/pdl/pdl_ast.py @@ -113,7 +113,7 @@ class Block(BaseModel): """ # Fields for internal use result: Optional[Any] = None - location: Optional[LocationType] = None + location: Optional[LocationType] = Field(default=None, exclude=True) has_error: bool = False @@ -279,7 +279,7 @@ class CodeBlock(Block): """Execute a piece of code.""" kind: Literal[BlockKind.CODE] = BlockKind.CODE - lan: Literal["python"] + lan: Literal["python", "ipython"] """Programming language of the code. """ code: "BlocksType" diff --git a/pdl/pdl_scheduler.py b/pdl/pdl_scheduler.py index be7e9f72..7683e74a 100644 --- a/pdl/pdl_scheduler.py +++ b/pdl/pdl_scheduler.py @@ -103,7 +103,6 @@ def schedule( end_time = time.time() runtime = end_time - start_time if timeout and timeout < runtime: - # print("RUNTIME over 15", runtime) raise TimeoutError(f"Out of time. {runtime}") todo = todo_next todo_next = []
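
For reference, a minimal standalone sketch of the evalplus correctness check that the patched MBPPTrialThread.answer_correct relies on. It only rearranges calls that already appear in the diff (get_mbpp_plus, get_groundtruth, get_mbpp_plus_hash, check_correctness); the variable candidate_code, the choice of the first task id, and the simplified identifier are illustrative stand-ins rather than part of the patch, and the snippet assumes evalplus and its MBPP+ data are installed.

import ast
from copy import deepcopy

from evalplus.data import get_mbpp_plus
from evalplus.evaluate import (
    MBPP_OUTPUT_NOT_NONE_TASKS,
    check_correctness,
    get_groundtruth,
    get_mbpp_plus_hash,
)

# Load the MBPP+ problems and their ground-truth outputs, as optimize.py now does.
problems = get_mbpp_plus()
expected = get_groundtruth(
    deepcopy(problems), get_mbpp_plus_hash(), MBPP_OUTPUT_NOT_NONE_TASKS
)

# Illustrative stand-ins: take the first task and reuse its reference solution
# as if it were a model-generated answer.
task_id = next(iter(problems))
problem = problems[task_id]
candidate_code = problem["canonical_solution"]

# Cheap syntax gate before the slower sandboxed run, as in answer_correct.
ast.parse(candidate_code)

result = check_correctness(
    dataset="mbpp",
    completion_id=0,
    problem=problem,
    solution=problem["prompt"] + candidate_code,  # answer_correct prepends the task prompt
    expected_output=expected[task_id],
    base_only=False,
    fast_check=False,
    identifier=task_id,  # the patch appends a debug suffix to this identifier
    min_time_limit=1,
    gt_time_limit_factor=4.0,
)

# A trial counts as correct only if both the base and the plus test sets pass.
passed = result["base"][0] == "pass" and result["plus"][0] == "pass"
print(task_id, "pass" if passed else "fail")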