
Commit

Merge pull request #1126 from gpt-engineer-org/bench_config
Bench config
viborc authored Apr 25, 2024
2 parents 31da734 + 3542d17 commit 7630560
Showing 12 changed files with 246 additions and 51 deletions.
Empty file.
33 changes: 23 additions & 10 deletions gpt_engineer/benchmark/__main__.py
@@ -20,6 +20,7 @@
The standard boilerplate for invoking the main function when the script is executed.
"""
import importlib
import os.path

from typing import Annotated, Optional

@@ -29,6 +30,7 @@
from langchain.globals import set_llm_cache

from gpt_engineer.applications.cli.main import load_env_if_needed
from gpt_engineer.benchmark.bench_config import BenchConfig
from gpt_engineer.benchmark.benchmarks.load import get_benchmark
from gpt_engineer.benchmark.run import print_results, run

@@ -69,12 +71,9 @@ def main(
help="python file that contains a function called 'default_config_agent'"
),
],
benchmarks: Annotated[
str, typer.Argument(help="benchmark name(s) separated by ','")
],
task_name: Annotated[
bench_config: Annotated[
Optional[str], typer.Argument(help="optional task name in benchmark")
] = None,
] = os.path.join(os.path.dirname(__file__), "default_bench_config.toml"),
verbose: Annotated[
bool, typer.Option(help="print results for each task", show_default=False)
] = False,
@@ -88,8 +87,8 @@ def main(
The file path to the Python module that contains a function called 'default_config_agent'.
benchmarks : str
A comma-separated string of benchmark names to run.
task_name : Optional[str], default=None
An optional task name to run within the benchmark.
bench_config : Optional[str], default=default_bench_config.toml
Configuration file for choosing which benchmark problems to run. See default config for more details.
verbose : bool, default=False
A flag to indicate whether to print results for each task.
@@ -99,13 +98,27 @@
"""
set_llm_cache(SQLiteCache(database_path=".langchain.db"))
load_env_if_needed()
config = BenchConfig.from_toml(bench_config)
print("using config file: " + bench_config)
benchmarks = list()
for specific_config_name in vars(config):
specific_config = getattr(config, specific_config_name)
if hasattr(specific_config, "active"):
if specific_config.active:
benchmarks.append(specific_config_name)

benchmarks = benchmarks.split(",")
for benchmark_name in benchmarks:
benchmark = get_benchmark(benchmark_name)
benchmark = get_benchmark(benchmark_name, config)
if len(benchmark.tasks) == 0:
print(
benchmark_name
+ " was skipped, since no tasks are specified. Increase the number of tasks in the config file at: "
+ bench_config
)
continue
agent = get_agent(path_to_agent)

results = run(agent, benchmark, task_name, verbose=verbose)
results = run(agent, benchmark, verbose=verbose)
print(
f"\n--- Results for agent {path_to_agent}, benchmark: {benchmark_name} ---"
)
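With this change, which benchmarks run is no longer taken from a comma-separated CLI argument; `main()` collects every sub-config whose `active` flag is set. A minimal sketch of that selection step, assuming the `gpt_engineer` package is importable (not part of the diff):

```python
# Sketch: derive the benchmark list from the config's `active` flags,
# mirroring the loop added to main() above. Uses the dataclass defaults
# rather than a TOML file.
from gpt_engineer.benchmark.bench_config import BenchConfig

config = BenchConfig()
active_benchmarks = [
    name
    for name, sub_config in vars(config).items()
    if getattr(sub_config, "active", False)
]
print(active_benchmarks)  # ['apps', 'mbpp', 'gptme', 'gpteng']
```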
56 changes: 56 additions & 0 deletions gpt_engineer/benchmark/bench_config.py
@@ -0,0 +1,56 @@
from dataclasses import dataclass, field
from pathlib import Path

from gpt_engineer.core.project_config import read_config


@dataclass
class AppsConfig:
active: bool | None = True
test_start_index: int | None = 0
test_end_index: int | None = 1
train_start_index: int | None = 0
train_end_index: int | None = 0


@dataclass
class MbppConfig:
active: bool | None = True
test_len: int | None = 1
train_len: int | None = 0


@dataclass
class GptmeConfig:
active: bool | None = True


@dataclass
class GptengConfig:
active: bool | None = True


@dataclass
class BenchConfig:
"""Configuration for the GPT Engineer CLI and gptengineer.app via `gpt-engineer.toml`."""

apps: AppsConfig = field(default_factory=AppsConfig)
mbpp: MbppConfig = field(default_factory=MbppConfig)
gptme: GptmeConfig = field(default_factory=GptmeConfig)
gpteng: GptengConfig = field(default_factory=GptengConfig)

@classmethod
def from_toml(cls, config_file: Path | str):
if isinstance(config_file, str):
config_file = Path(config_file)
config_dict = read_config(config_file)
return cls.from_dict(config_dict)

@classmethod
def from_dict(cls, config_dict: dict):
return cls(
apps=AppsConfig(**config_dict.get("apps", {})),
mbpp=MbppConfig(**config_dict.get("mbpp", {})),
gptme=GptmeConfig(**config_dict.get("gptme", {})),
gpteng=GptengConfig(**config_dict.get("gpteng", {})),
)
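A small usage sketch of the dataclasses above (not part of the diff): `from_dict` only overrides the keys that are present, so a partial dict, or equivalently a partial TOML file, falls back to the field defaults.

```python
# Sketch: unspecified sections/keys keep the dataclass defaults defined above.
from gpt_engineer.benchmark.bench_config import BenchConfig

config = BenchConfig.from_dict({"apps": {"active": False}, "mbpp": {"test_len": 5}})

assert config.apps.active is False
assert config.apps.test_end_index == 1  # AppsConfig default
assert config.mbpp.test_len == 5
assert config.gptme.active is True      # untouched section keeps its defaults
```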
32 changes: 17 additions & 15 deletions gpt_engineer/benchmark/benchmarks/apps/load.py
@@ -16,8 +16,8 @@

from datasets import Dataset, DatasetDict, load_dataset, load_from_disk

from gpt_engineer.benchmark.bench_config import AppsConfig
from gpt_engineer.benchmark.benchmarks.apps.problem import Problem
from gpt_engineer.benchmark.benchmarks.apps.problems import PROBLEM_IDS
from gpt_engineer.benchmark.types import Assertable, Benchmark, Task
from gpt_engineer.core.default.disk_execution_env import DiskExecutionEnv
from gpt_engineer.core.files_dict import FilesDict
@@ -57,12 +57,12 @@ def _get_dataset() -> Union[Dataset, DatasetDict]:
print("Dataset not found locally, downloading...")

dataset = load_dataset("codeparrot/apps", trust_remote_code=True)
dataset.save_to_disk(DATASET_PATH)
dataset.save_to_disk(str(DATASET_PATH))

return dataset


def load_apps():
def load_apps(config: AppsConfig) -> Benchmark:
"""
    Loads the APPS benchmark, which consists of a series of coding problems.
@@ -73,17 +73,19 @@ def load_apps():
"""
dataset = _get_dataset()
tasks = []

problems = [
Problem(
id=problem["problem_id"],
question=problem["question"],
input_output=problem["input_output"],
starter_code=problem["starter_code"],
)
for problem in dataset["test"]
if problem["problem_id"] in PROBLEM_IDS
]
problems = list()
for dataset_type in ["test", "train"]:
problems += [
Problem(
id=problem["problem_id"],
question=problem["question"],
input_output=problem["input_output"],
starter_code=problem["starter_code"],
)
for index, problem in enumerate(dataset[dataset_type])
if (index < config.__getattribute__(dataset_type + "_end_index"))
and (index >= config.__getattribute__(dataset_type + "_start_index"))
]

for problem in problems:
prompt = Prompt(
@@ -110,6 +112,6 @@ def load_apps():
)

return Benchmark(
name="APPS",
name="apps",
tasks=tasks,
)
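The APPS loader now slices both the `test` and `train` splits by index window instead of matching a hard-coded `PROBLEM_IDS` list; note that `config.__getattribute__(dataset_type + "_end_index")` is equivalent to the more common `getattr(config, dataset_type + "_end_index")`. The selection rule in isolation, using a plain list in place of the HuggingFace dataset (a sketch, not part of the diff):

```python
# Sketch: index-window selection as used in load_apps() above.
from gpt_engineer.benchmark.bench_config import AppsConfig

config = AppsConfig(test_start_index=0, test_end_index=2, train_end_index=0)
fake_test_split = ["problem_a", "problem_b", "problem_c"]

selected = [
    problem
    for index, problem in enumerate(fake_test_split)
    if config.test_start_index <= index < config.test_end_index
]
print(selected)  # ['problem_a', 'problem_b']; a 0:0 train window selects nothing
```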
6 changes: 4 additions & 2 deletions gpt_engineer/benchmark/benchmarks/gpteng/load.py
@@ -19,11 +19,13 @@

from pathlib import Path

from gpt_engineer.benchmark.bench_config import GptengConfig
from gpt_engineer.benchmark.benchmarks.gpteng.eval_tools import (
check_evaluation_component,
)
from gpt_engineer.benchmark.types import Assertable, Benchmark, Task
from gpt_engineer.core.chat_to_files import chat_to_files_dict
from gpt_engineer.core.prompt import Prompt

evaluations = [
{
@@ -192,7 +194,7 @@ def eval_to_task(case):
return Task(
name=case["name"],
initial_code=chat_to_files_dict(Path(case["code_blob"]).read_text()),
prompt=prompt,
prompt=Prompt(prompt),
command=None,
assertions={
f"{e['type']}_{i}": expect_to_assertion(e)
@@ -201,7 +203,7 @@ def eval_to_task(case):
)


def load_gpteng():
def load_gpteng(config: GptengConfig) -> Benchmark:
"""
Loads the GPT-Eng benchmark, which consists of a series of tasks for evaluation.
3 changes: 2 additions & 1 deletion gpt_engineer/benchmark/benchmarks/gptme/load.py
@@ -10,12 +10,13 @@
load_gptme : function
Loads the GPT-Me benchmark, which consists of a series of tasks for evaluation.
"""
from gpt_engineer.benchmark.bench_config import GptmeConfig
from gpt_engineer.benchmark.types import Benchmark, Task
from gpt_engineer.core.files_dict import FilesDict
from gpt_engineer.core.prompt import Prompt


def load_gptme():
def load_gptme(config: GptmeConfig) -> Benchmark:
"""
Loads the GPT-Me benchmark, which consists of a series of tasks for evaluation.
7 changes: 5 additions & 2 deletions gpt_engineer/benchmark/benchmarks/load.py
@@ -9,6 +9,7 @@
get_benchmark : function
Retrieves a Benchmark object by name. Raises ValueError if the benchmark is unknown.
"""
from gpt_engineer.benchmark.bench_config import BenchConfig
from gpt_engineer.benchmark.benchmarks.apps.load import load_apps
from gpt_engineer.benchmark.benchmarks.gpteng.load import load_gpteng
from gpt_engineer.benchmark.benchmarks.gptme.load import load_gptme
@@ -23,14 +24,16 @@
}


def get_benchmark(name: str) -> Benchmark:
def get_benchmark(name: str, config: BenchConfig) -> Benchmark:
"""
Retrieves a Benchmark object by name. Raises ValueError if the benchmark is unknown.
Parameters
----------
name : str
The name of the benchmark to retrieve.
config : BenchConfig
Configuration object for the benchmarks.
Returns
-------
@@ -44,4 +47,4 @@ def get_benchmark(name: str) -> Benchmark:
"""
if name not in BENCHMARKS:
raise ValueError(f"Unknown benchmark {name}.")
return BENCHMARKS[name]()
return BENCHMARKS[name](config.__getattribute__(name))
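`get_benchmark` now looks up the loader by name and hands it only the matching sub-config. With the default `BenchConfig`, the call below is what `get_benchmark("mbpp", config)` resolves to (a sketch, not part of the diff; it downloads the MBPP dataset on first run):

```python
# Sketch: the dispatch performed by get_benchmark() for the "mbpp" benchmark.
from gpt_engineer.benchmark.bench_config import BenchConfig
from gpt_engineer.benchmark.benchmarks.mbpp.load import load_mbpp

config = BenchConfig()                       # MbppConfig defaults: test_len=1, train_len=0
benchmark = load_mbpp(config.mbpp)           # equivalent to get_benchmark("mbpp", config)
print(benchmark.name, len(benchmark.tasks))  # "mbpp 1" under the defaults
```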
35 changes: 18 additions & 17 deletions gpt_engineer/benchmark/benchmarks/mbpp/load.py
@@ -16,8 +16,8 @@

from datasets import Dataset, DatasetDict, load_dataset, load_from_disk

from gpt_engineer.benchmark.bench_config import MbppConfig
from gpt_engineer.benchmark.benchmarks.mbpp.problem import Problem
from gpt_engineer.benchmark.benchmarks.mbpp.problems import PROBLEM_IDS
from gpt_engineer.benchmark.types import Assertable, Benchmark, Task
from gpt_engineer.core.default.disk_execution_env import DiskExecutionEnv
from gpt_engineer.core.files_dict import FilesDict
@@ -57,12 +57,12 @@ def _get_dataset() -> Union[Dataset, DatasetDict]:
print("Dataset not found locally, downloading...")

dataset = load_dataset("mbpp", "sanitized", trust_remote_code=True)
dataset.save_to_disk(DATASET_PATH)
dataset.save_to_disk(str(DATASET_PATH))

return dataset


def load_mbpp():
def load_mbpp(config: MbppConfig) -> Benchmark:
"""
    Loads the MBPP benchmark, which consists of a series of coding problems.
@@ -73,19 +73,20 @@ def load_mbpp():
"""
dataset = _get_dataset()
tasks = []

problems = [
Problem(
source_file=problem["source_file"],
task_id=problem["task_id"],
prompt=problem["prompt"],
code=problem["code"],
test_imports=problem["test_imports"],
test_list=problem["test_list"],
)
for problem in dataset["test"]
if problem["task_id"] in PROBLEM_IDS
]
problems = []
for dataset_type in ["test", "train"]:
problems += [
Problem(
source_file=problem["source_file"],
task_id=problem["task_id"],
prompt=problem["prompt"],
code=problem["code"],
test_imports=problem["test_imports"],
test_list=problem["test_list"],
)
for index, problem in enumerate(dataset[dataset_type])
if index < config.__getattribute__(dataset_type + "_len")
]

for problem in problems:
prompt = Prompt(
@@ -109,6 +110,6 @@ def load_mbpp():
)

return Benchmark(
name="MBPP",
name="mbpp",
tasks=tasks,
)
19 changes: 19 additions & 0 deletions gpt_engineer/benchmark/default_bench_config.toml
@@ -0,0 +1,19 @@
# For apps, the maximal range is 0:5000 for both train and test
[apps]
active = true
test_start_index = 0
test_end_index = 2
train_start_index = 0
train_end_index = 2

# For mbpp, the maximal range is 0:47
[mbpp]
active = true
test_len = 2
train_len = 2

[gpteng]
active = true

[gptme]
active = true
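A quick way to see what this default file selects (a sketch, not part of the diff; assumes it is run from the repository root and that the APPS and MBPP datasets can be downloaded):

```python
# Sketch: problem counts selected by default_bench_config.toml.
from gpt_engineer.benchmark.bench_config import BenchConfig
from gpt_engineer.benchmark.benchmarks.load import get_benchmark

config = BenchConfig.from_toml("gpt_engineer/benchmark/default_bench_config.toml")
for name in ["apps", "mbpp"]:
    benchmark = get_benchmark(name, config)
    print(name, len(benchmark.tasks))  # apps: 4 (2 test + 2 train), mbpp: 4 (2 + 2)
```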
5 changes: 1 addition & 4 deletions gpt_engineer/benchmark/run.py
@@ -14,7 +14,7 @@
"""
import time

from typing import List, Optional
from typing import List

from gpt_engineer.benchmark.types import Assertable, Benchmark, TaskResult
from gpt_engineer.core.base_agent import BaseAgent
@@ -24,7 +24,6 @@
def run(
agent: BaseAgent,
benchmark: Benchmark,
task_name: Optional[str] = None,
verbose=False,
) -> List[TaskResult]:
"""
@@ -36,8 +35,6 @@ def run(
The agent to use for running the benchmark tasks.
benchmark : Benchmark
The benchmark containing the tasks to run.
task_name : Optional[str], default=None
An optional name of a specific task to run within the benchmark.
verbose : bool, default=False
A flag to indicate whether to print verbose output during the benchmark.
5 changes: 5 additions & 0 deletions pyproject.toml
@@ -96,3 +96,8 @@ section-order = [
combine-as-imports = true
split-on-trailing-comma = false
lines-between-types = 1

[tool.pytest.ini_options]
markers = [
"requires_key: marks tests as requiring access to a valid OPENAI_API_KEY (deselect with '-m \"not requires_key\"')",
]

