Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bench config #1126

Merged
merged 8 commits into from
Apr 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
33 changes: 23 additions & 10 deletions gpt_engineer/benchmark/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
The standard boilerplate for invoking the main function when the script is executed.
"""
import importlib
import os.path

from typing import Annotated, Optional

Expand All @@ -29,6 +30,7 @@
from langchain.globals import set_llm_cache

from gpt_engineer.applications.cli.main import load_env_if_needed
from gpt_engineer.benchmark.bench_config import BenchConfig
from gpt_engineer.benchmark.benchmarks.load import get_benchmark
from gpt_engineer.benchmark.run import print_results, run

Expand Down Expand Up @@ -69,12 +71,9 @@ def main(
help="python file that contains a function called 'default_config_agent'"
),
],
benchmarks: Annotated[
str, typer.Argument(help="benchmark name(s) separated by ','")
],
task_name: Annotated[
bench_config: Annotated[
Optional[str], typer.Argument(help="optional task name in benchmark")
] = None,
] = os.path.join(os.path.dirname(__file__), "default_bench_config.toml"),
verbose: Annotated[
bool, typer.Option(help="print results for each task", show_default=False)
] = False,
Expand All @@ -88,8 +87,8 @@ def main(
The file path to the Python module that contains a function called 'default_config_agent'.
benchmarks : str
A comma-separated string of benchmark names to run.
task_name : Optional[str], default=None
An optional task name to run within the benchmark.
bench_config : Optional[str], default=default_bench_config.toml
Configuration file for choosing which benchmark problems to run. See default config for more details.
verbose : bool, default=False
A flag to indicate whether to print results for each task.
Expand All @@ -99,13 +98,27 @@ def main(
"""
set_llm_cache(SQLiteCache(database_path=".langchain.db"))
load_env_if_needed()
config = BenchConfig.from_toml(bench_config)
print("using config file: " + bench_config)
benchmarks = list()
for specific_config_name in vars(config):
specific_config = getattr(config, specific_config_name)
if hasattr(specific_config, "active"):
if specific_config.active:
benchmarks.append(specific_config_name)

benchmarks = benchmarks.split(",")
for benchmark_name in benchmarks:
benchmark = get_benchmark(benchmark_name)
benchmark = get_benchmark(benchmark_name, config)
if len(benchmark.tasks) == 0:
print(
benchmark_name
+ " was skipped, since no tasks are specified. Increase the number of tasks in the config file at: "
+ bench_config
)
continue
agent = get_agent(path_to_agent)

results = run(agent, benchmark, task_name, verbose=verbose)
results = run(agent, benchmark, verbose=verbose)
print(
f"\n--- Results for agent {path_to_agent}, benchmark: {benchmark_name} ---"
)
Expand Down
56 changes: 56 additions & 0 deletions gpt_engineer/benchmark/bench_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from dataclasses import dataclass, field
from pathlib import Path

from gpt_engineer.core.project_config import read_config


@dataclass
class AppsConfig:
    """Settings for the `[apps]` table of a benchmark config file.

    The APPS loader walks the "test" and "train" dataset splits and keeps a
    problem when its position in the split satisfies
    ``<split>_start_index <= position < <split>_end_index`` (end exclusive).
    With the defaults below that is one test problem and no train problems.
    """

    # The benchmark runner only runs a benchmark whose `active` value is truthy.
    active: bool | None = True
    # First position (inclusive) taken from the "test" split.
    test_start_index: int | None = 0
    # Position at which selection from the "test" split stops (exclusive).
    test_end_index: int | None = 1
    # First position (inclusive) taken from the "train" split.
    train_start_index: int | None = 0
    # Position at which selection from the "train" split stops (exclusive).
    train_end_index: int | None = 0


@dataclass
class MbppConfig:
    """Settings for the `[mbpp]` table of a benchmark config file.

    The MBPP loader keeps the problems whose position in a dataset split is
    below the matching ``*_len`` value, i.e. the first ``test_len`` problems of
    the "test" split and the first ``train_len`` problems of the "train" split.
    """

    # The benchmark runner only runs a benchmark whose `active` value is truthy.
    active: bool | None = True
    # Number of leading problems taken from the "test" split.
    test_len: int | None = 1
    # Number of leading problems taken from the "train" split.
    train_len: int | None = 0


@dataclass
class GptmeConfig:
    """Settings for the `[gptme]` table of a benchmark config file.

    Carries only the on/off switch; no task-selection options are defined.
    """

    # The benchmark runner only runs a benchmark whose `active` value is truthy.
    active: bool | None = True


@dataclass
class GptengConfig:
    """Settings for the `[gpteng]` table of a benchmark config file.

    Carries only the on/off switch; no task-selection options are defined.
    """

    # The benchmark runner only runs a benchmark whose `active` value is truthy.
    active: bool | None = True


@dataclass
class BenchConfig:
    """Selects which benchmarks are run and how many problems each one loads.

    There is one attribute per benchmark. The attribute name is also the
    benchmark's name and the name of its table in the TOML config file
    (``[apps]``, ``[mbpp]``, ``[gptme]``, ``[gpteng]``).
    """

    apps: AppsConfig = field(default_factory=AppsConfig)
    mbpp: MbppConfig = field(default_factory=MbppConfig)
    gptme: GptmeConfig = field(default_factory=GptmeConfig)
    gpteng: GptengConfig = field(default_factory=GptengConfig)

    @classmethod
    def from_toml(cls, config_file: Path | str):
        """Build a BenchConfig from a TOML file.

        Parameters
        ----------
        config_file : Path | str
            Location of the benchmark configuration file.

        Returns
        -------
        BenchConfig
            The configuration described by the file; tables missing from the
            file keep their default values.

        Raises
        ------
        FileNotFoundError
            If ``config_file`` does not point to an existing file.
        """
        config_path = Path(config_file)
        # Review feedback on this method pointed out that a non-existent config
        # path was not handled here. Check it explicitly so the error names the
        # offending path instead of surfacing whatever the reader raises.
        if not config_path.is_file():
            raise FileNotFoundError(
                f"Benchmark config file not found (or not a file): {config_path}"
            )
        config_dict = read_config(config_path)
        return cls.from_dict(config_dict)

    @classmethod
    def from_dict(cls, config_dict: dict):
        """Build a BenchConfig from a mapping of table name to settings.

        Each table's entries are passed as keyword arguments to the matching
        section dataclass, so a key that is not a field of that dataclass
        raises ``TypeError``. A table that is absent falls back to the
        section's defaults.

        Parameters
        ----------
        config_dict : dict
            Mapping with optional "apps", "mbpp", "gptme" and "gpteng" entries.

        Returns
        -------
        BenchConfig
        """
        return cls(
            apps=AppsConfig(**config_dict.get("apps", {})),
            mbpp=MbppConfig(**config_dict.get("mbpp", {})),
            gptme=GptmeConfig(**config_dict.get("gptme", {})),
            gpteng=GptengConfig(**config_dict.get("gpteng", {})),
        )
32 changes: 17 additions & 15 deletions gpt_engineer/benchmark/benchmarks/apps/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@

from datasets import Dataset, DatasetDict, load_dataset, load_from_disk

from gpt_engineer.benchmark.bench_config import AppsConfig
from gpt_engineer.benchmark.benchmarks.apps.problem import Problem
from gpt_engineer.benchmark.benchmarks.apps.problems import PROBLEM_IDS
from gpt_engineer.benchmark.types import Assertable, Benchmark, Task
from gpt_engineer.core.default.disk_execution_env import DiskExecutionEnv
from gpt_engineer.core.files_dict import FilesDict
Expand Down Expand Up @@ -57,12 +57,12 @@ def _get_dataset() -> Union[Dataset, DatasetDict]:
print("Dataset not found locally, downloading...")

dataset = load_dataset("codeparrot/apps", trust_remote_code=True)
dataset.save_to_disk(DATASET_PATH)
dataset.save_to_disk(str(DATASET_PATH))

return dataset


def load_apps():
def load_apps(config: AppsConfig) -> Benchmark:
"""
Loads the APPS benchmark, which consists of a series coding problems.
Expand All @@ -73,17 +73,19 @@ def load_apps():
"""
dataset = _get_dataset()
tasks = []

problems = [
Problem(
id=problem["problem_id"],
question=problem["question"],
input_output=problem["input_output"],
starter_code=problem["starter_code"],
)
for problem in dataset["test"]
if problem["problem_id"] in PROBLEM_IDS
]
problems = list()
for dataset_type in ["test", "train"]:
problems += [
Problem(
id=problem["problem_id"],
question=problem["question"],
input_output=problem["input_output"],
starter_code=problem["starter_code"],
)
for index, problem in enumerate(dataset[dataset_type])
if (index < config.__getattribute__(dataset_type + "_end_index"))
and (index >= config.__getattribute__(dataset_type + "_start_index"))
]

for problem in problems:
prompt = Prompt(
Expand All @@ -110,6 +112,6 @@ def load_apps():
)

return Benchmark(
name="APPS",
name="apps",
tasks=tasks,
)
6 changes: 4 additions & 2 deletions gpt_engineer/benchmark/benchmarks/gpteng/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,13 @@

from pathlib import Path

from gpt_engineer.benchmark.bench_config import GptengConfig
from gpt_engineer.benchmark.benchmarks.gpteng.eval_tools import (
check_evaluation_component,
)
from gpt_engineer.benchmark.types import Assertable, Benchmark, Task
from gpt_engineer.core.chat_to_files import chat_to_files_dict
from gpt_engineer.core.prompt import Prompt

evaluations = [
{
Expand Down Expand Up @@ -192,7 +194,7 @@ def eval_to_task(case):
return Task(
name=case["name"],
initial_code=chat_to_files_dict(Path(case["code_blob"]).read_text()),
prompt=prompt,
prompt=Prompt(prompt),
command=None,
assertions={
f"{e['type']}_{i}": expect_to_assertion(e)
Expand All @@ -201,7 +203,7 @@ def eval_to_task(case):
)


def load_gpteng():
def load_gpteng(config: GptengConfig) -> Benchmark:
"""
Loads the GPT-Eng benchmark, which consists of a series of tasks for evaluation.
Expand Down
3 changes: 2 additions & 1 deletion gpt_engineer/benchmark/benchmarks/gptme/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,13 @@
load_gptme : function
Loads the GPT-Me benchmark, which consists of a series of tasks for evaluation.
"""
from gpt_engineer.benchmark.bench_config import GptmeConfig
from gpt_engineer.benchmark.types import Benchmark, Task
from gpt_engineer.core.files_dict import FilesDict
from gpt_engineer.core.prompt import Prompt


def load_gptme():
def load_gptme(config: GptmeConfig) -> Benchmark:
"""
Loads the GPT-Me benchmark, which consists of a series of tasks for evaluation.
Expand Down
7 changes: 5 additions & 2 deletions gpt_engineer/benchmark/benchmarks/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
get_benchmark : function
Retrieves a Benchmark object by name. Raises ValueError if the benchmark is unknown.
"""
from gpt_engineer.benchmark.bench_config import BenchConfig
from gpt_engineer.benchmark.benchmarks.apps.load import load_apps
from gpt_engineer.benchmark.benchmarks.gpteng.load import load_gpteng
from gpt_engineer.benchmark.benchmarks.gptme.load import load_gptme
Expand All @@ -23,14 +24,16 @@
}


def get_benchmark(name: str) -> Benchmark:
def get_benchmark(name: str, config: BenchConfig) -> Benchmark:
"""
Retrieves a Benchmark object by name. Raises ValueError if the benchmark is unknown.

Parameters
----------
name : str
The name of the benchmark to retrieve.
config : BenchConfig
Configuration object for the benchmarks.

Returns
-------
Expand All @@ -44,4 +47,4 @@ def get_benchmark(name: str) -> Benchmark:
"""
if name not in BENCHMARKS:
raise ValueError(f"Unknown benchmark {name}.")
return BENCHMARKS[name]()
return BENCHMARKS[name](config.__getattribute__(name))
35 changes: 18 additions & 17 deletions gpt_engineer/benchmark/benchmarks/mbpp/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@

from datasets import Dataset, DatasetDict, load_dataset, load_from_disk

from gpt_engineer.benchmark.bench_config import MbppConfig
from gpt_engineer.benchmark.benchmarks.mbpp.problem import Problem
from gpt_engineer.benchmark.benchmarks.mbpp.problems import PROBLEM_IDS
from gpt_engineer.benchmark.types import Assertable, Benchmark, Task
from gpt_engineer.core.default.disk_execution_env import DiskExecutionEnv
from gpt_engineer.core.files_dict import FilesDict
Expand Down Expand Up @@ -57,12 +57,12 @@ def _get_dataset() -> Union[Dataset, DatasetDict]:
print("Dataset not found locally, downloading...")

dataset = load_dataset("mbpp", "sanitized", trust_remote_code=True)
dataset.save_to_disk(DATASET_PATH)
dataset.save_to_disk(str(DATASET_PATH))

return dataset


def load_mbpp():
def load_mbpp(config: MbppConfig) -> Benchmark:
"""
Loads the MBPP benchmark, which consists of a series coding problems.
Expand All @@ -73,19 +73,20 @@ def load_mbpp():
"""
dataset = _get_dataset()
tasks = []

problems = [
Problem(
source_file=problem["source_file"],
task_id=problem["task_id"],
prompt=problem["prompt"],
code=problem["code"],
test_imports=problem["test_imports"],
test_list=problem["test_list"],
)
for problem in dataset["test"]
if problem["task_id"] in PROBLEM_IDS
]
problems = []
for dataset_type in ["test", "train"]:
problems += [
Problem(
source_file=problem["source_file"],
task_id=problem["task_id"],
prompt=problem["prompt"],
code=problem["code"],
test_imports=problem["test_imports"],
test_list=problem["test_list"],
)
for index, problem in enumerate(dataset[dataset_type])
if index < config.__getattribute__(dataset_type + "_len")
]

for problem in problems:
prompt = Prompt(
Expand All @@ -109,6 +110,6 @@ def load_mbpp():
)

return Benchmark(
name="MBPP",
name="mbpp",
tasks=tasks,
)
19 changes: 19 additions & 0 deletions gpt_engineer/benchmark/default_bench_config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# For apps, the maximal range is 0:5000 for both train and test
# Problems are picked by position within each dataset split:
# <split>_start_index <= position < <split>_end_index (end index is exclusive).
[apps]
# A benchmark is only run when its `active` value is true.
active = true
test_start_index = 0
test_end_index = 2
train_start_index = 0
train_end_index = 2

# For mbpp, the maximal range is 0:47
# The first `test_len` / `train_len` problems of each split are selected.
# A benchmark that ends up with no selected tasks is skipped by the runner.
[mbpp]
active = true
test_len = 2
train_len = 2

# gpteng and gptme expose only the on/off switch.
[gpteng]
active = true

[gptme]
active = true
5 changes: 1 addition & 4 deletions gpt_engineer/benchmark/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
"""
import time

from typing import List, Optional
from typing import List

from gpt_engineer.benchmark.types import Assertable, Benchmark, TaskResult
from gpt_engineer.core.base_agent import BaseAgent
Expand All @@ -24,7 +24,6 @@
def run(
agent: BaseAgent,
benchmark: Benchmark,
task_name: Optional[str] = None,
verbose=False,
) -> List[TaskResult]:
"""
Expand All @@ -36,8 +35,6 @@ def run(
The agent to use for running the benchmark tasks.
benchmark : Benchmark
The benchmark containing the tasks to run.
task_name : Optional[str], default=None
An optional name of a specific task to run within the benchmark.
verbose : bool, default=False
A flag to indicate whether to print verbose output during the benchmark.
Expand Down
5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,8 @@ section-order = [
combine-as-imports = true
split-on-trailing-comma = false
lines-between-types = 1

[tool.pytest.ini_options]
markers = [
"requires_key: marks tests as requiring access to a valid OPENAI_API_KEY (deselect with '-m \"not requires_key\"')",
]
Loading
Loading