Skip to content

Commit

Permalink
test: run evals as tests, refactor evals, added pytest-xdist for para…
Browse files Browse the repository at this point in the history
…llel testing
  • Loading branch information
ErikBjare committed Nov 27, 2023
1 parent 75e79bd commit 14ca2df
Show file tree
Hide file tree
Showing 6 changed files with 140 additions and 89 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ test:
@# if SLOW is not set, pass `-m "not slow"` to skip slow tests
poetry run pytest ${SRCDIRS} -v --log-level INFO --durations=5 \
--cov=gptme --cov-report=xml --cov-report=term-missing --cov-report=html \
-n auto \
$(if $(SLOW),, -m "not slow") \
$(if $(PROFILE), --profile-svg) \
$(if $(HAS_PLAYWRIGHT), --cov-config=scripts/.coveragerc-playwright)
Expand Down
78 changes: 78 additions & 0 deletions eval/evals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from main import ExecTest

# Eval definitions. Each entry is a dict with the keys used below:
#   name   - identifier of the eval (also the key in `tests_map`)
#   files  - mapping of file name to file content (presumably the initial
#            workspace handed to the agent; confirm against main.py)
#   run    - shell command string (presumably executed after the agent has
#            acted, with its output exposed on `ctx`; confirm against main.py)
#   prompt - the instruction text for the agent
#   expect - mapping of a human-readable check description to a predicate
#            that takes the result context `ctx` and returns a bool.
#            The predicates below read `ctx.stdout`, `ctx.files` and
#            `ctx.exit_code`.
tests: list["ExecTest"] = [
    # Rewrite an existing script so that it prints a different greeting.
    {
        "name": "hello",
        "files": {"hello.py": "print('Hello, world!')"},
        "run": "python hello.py",
        "prompt": "Change the code in hello.py to print 'Hello, human!'",
        "expect": {
            # Exact match, including the trailing newline.
            "correct output": lambda ctx: ctx.stdout == "Hello, human!\n",
            # Surrounding whitespace in the file is ignored by the strip().
            "correct file": lambda ctx: ctx.files["hello.py"].strip()
            == "print('Hello, human!')",
        },
    },
    # Same files, command and checks as "hello"; only the prompt wording
    # differs ("Patch" instead of "Change").
    # NOTE(review): presumably meant to steer the agent towards a patch-style
    # edit; confirm.
    {
        "name": "hello-patch",
        "files": {"hello.py": "print('Hello, world!')"},
        "run": "python hello.py",
        "prompt": "Patch the code in hello.py to print 'Hello, human!'",
        "expect": {
            "correct output": lambda ctx: ctx.stdout == "Hello, human!\n",
            "correct file": lambda ctx: ctx.files["hello.py"].strip()
            == "print('Hello, human!')",
        },
    },
    # The run command pipes 'Erik' into the script's stdin, so the check looks
    # for the greeting with that name anywhere in stdout.
    {
        "name": "hello-ask",
        "files": {"hello.py": "print('Hello, world!')"},
        "run": "echo 'Erik' | python hello.py",
        # TODO: work around the "don't try to execute it" part by improving gptme such that it just gives EOF to stdin in non-interactive mode
        "prompt": "modify hello.py to ask the user for their name and print 'Hello, <name>!'. don't try to execute it",
        "expect": {
            "correct output": lambda ctx: "Hello, Erik!" in ctx.stdout,
        },
    },
    # Starts from no files; the agent has to create prime.py itself.
    {
        "name": "prime100",
        "files": {},
        "run": "python prime.py",
        "prompt": "write a script prime.py that computes and prints the 100th prime number",
        "expect": {
            # 541 is the 100th prime. split() requires it to appear as a
            # whitespace-separated token, not as part of a longer number.
            "correct output": lambda ctx: "541" in ctx.stdout.split(),
        },
    },
    # The run command is `git status`; the checks inspect its exit code and
    # output, plus the presence of main.py in the resulting files.
    {
        "name": "init-git",
        "files": {},
        "run": "git status",
        "prompt": "initialize a git repository, write a main.py file, and commit it",
        "expect": {
            "clean exit": lambda ctx: ctx.exit_code == 0,
            "clean working tree": lambda ctx: "nothing to commit, working tree clean"
            in ctx.stdout,
            "main.py exists": lambda ctx: "main.py" in ctx.files,
            "we have a commit": lambda ctx: "No commits yet" not in ctx.stdout,
        },
    },
    # Fails, gets stuck on interactive stuff
    # {
    #     "name": "init-vue-ts-tailwind",
    #     "files": {},
    #     "run": "cat package.json",
    #     "prompt": "initialize a vue project with typescript and tailwind, make a page that says 'Hello, world!'. don't try to execute it or do anything interactive",
    #     "expect": {
    #         "package.json exists": lambda ctx: "package.json" in ctx.files,
    #         "vue installed": lambda ctx: '"vue":' in ctx.files["package.json"],
    #         "tailwind installed": lambda ctx: '"tailwindcss":'
    #         in ctx.files["package.json"],
    #         "typescript installed": lambda ctx: '"typescript":'
    #         in ctx.files["package.json"],
    #     },
    # },
]

# Lookup of eval definition by name. If two evals ever share a name, the one
# that appears later in `tests` replaces the earlier one here.
tests_map = {test["name"]: test for test in tests}
93 changes: 5 additions & 88 deletions eval/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,9 @@
from gptme.message import Message
from gptme.prompts import get_prompt

logger = logging.getLogger(__name__)


def hook_chdir(name, *args):
if "chdir" in name:
logger.warning(f"chdir {args[0]}")

from evals import tests, tests_map

debug = False
if debug:
# add audit hook to see how pwd is changed
sys.addaudithook(hook_chdir)
logger = logging.getLogger(__name__)

Files = Dict[str, str | bytes]

Expand Down Expand Up @@ -79,7 +70,7 @@ class CaseResult(TypedDict):
duration: float


class TestResult(TypedDict):
class ExecResult(TypedDict):
name: str
results: list[CaseResult]
timings: dict[str, float]
Expand All @@ -93,81 +84,6 @@ class ExecTest(TypedDict):
expect: dict[str, Callable[[ResultContext], bool]]


tests: list[ExecTest] = [
{
"name": "hello",
"files": {"hello.py": "print('Hello, world!')"},
"run": "python hello.py",
"prompt": "Change the code in hello.py to print 'Hello, human!'",
"expect": {
"correct output": lambda ctx: ctx.stdout == "Hello, human!\n",
"correct file": lambda ctx: ctx.files["hello.py"].strip()
== "print('Hello, human!')",
},
},
{
"name": "hello-patch",
"files": {"hello.py": "print('Hello, world!')"},
"run": "python hello.py",
"prompt": "Patch the code in hello.py to print 'Hello, human!'",
"expect": {
"correct output": lambda ctx: ctx.stdout == "Hello, human!\n",
"correct file": lambda ctx: ctx.files["hello.py"].strip()
== "print('Hello, human!')",
},
},
{
"name": "hello-ask",
"files": {"hello.py": "print('Hello, world!')"},
"run": "echo 'Erik' | python hello.py",
# TODO: work around the "don't try to execute it" part by improving gptme such that it just gives EOF to stdin in non-interactive mode
"prompt": "modify hello.py to ask the user for their name and print 'Hello, <name>!'. don't try to execute it",
"expect": {
"correct output": lambda ctx: "Hello, Erik!" in ctx.stdout,
},
},
{
"name": "prime100",
"files": {},
"run": "python prime.py",
"prompt": "write a script prime.py that computes and prints the 100th prime number",
"expect": {
"correct output": lambda ctx: "541" in ctx.stdout.split(),
},
},
{
"name": "init-git",
"files": {},
"run": "git status",
"prompt": "initialize a git repository, write a main.py file, and commit it",
"expect": {
"clean exit": lambda ctx: ctx.exit_code == 0,
"clean working tree": lambda ctx: "nothing to commit, working tree clean"
in ctx.stdout,
"main.py exists": lambda ctx: "main.py" in ctx.files,
"we have a commit": lambda ctx: "No commits yet" not in ctx.stdout,
},
},
# Fails, gets stuck on interactive stuff
# {
# "name": "init-vue-ts-tailwind",
# "files": {},
# "run": "cat package.json",
# "prompt": "initialize a vue project with typescript and tailwind, make a page that says 'Hello, world!'. don't try to execute it or do anything interactive",
# "expect": {
# "package.json exists": lambda ctx: "package.json" in ctx.files,
# "vue installed": lambda ctx: '"vue":' in ctx.files["package.json"],
# "tailwind installed": lambda ctx: '"tailwindcss":'
# in ctx.files["package.json"],
# "typescript installed": lambda ctx: '"typescript":'
# in ctx.files["package.json"],
# },
# },
]

tests_map = {test["name"]: test for test in tests}


class FileStore:
def __init__(self):
self.working_dir = Path(tempfile.mkdtemp(prefix="gptme-evals-"))
Expand Down Expand Up @@ -201,6 +117,7 @@ def download(self) -> Files:


class Agent:
@abstractmethod
def act(self, files: Files | None, prompt: str) -> Files:
"""
Carries out the prompt and returns artifacts in the form of `Files`.
Expand Down Expand Up @@ -281,7 +198,7 @@ def run(self, command) -> tuple[str, str, int]:
return stdout_full, stderr_full, p.returncode


def execute(test: ExecTest) -> TestResult:
def execute(test: ExecTest) -> ExecResult:
"""
Executes the code.
"""
Expand Down
20 changes: 20 additions & 0 deletions eval/test_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import pytest
from evals import tests

from main import execute


@pytest.mark.slow
def test_eval(test):
    """
    Run a single eval and require every one of its cases to pass.

    This test will be run for each eval in the tests list.
    See pytest_generate_tests() below.

    Args:
        test: one eval definition from ``tests``; it is passed unchanged
            to ``execute``.
    """
    result = execute(test)
    # Collect the failing cases explicitly: asserting on ``all(<generator>)``
    # only reports "assert False", which hides which case failed.
    failed = [case for case in result["results"] if not case["passed"]]
    assert not failed, f"eval {test['name']!r} had failing cases: {failed}"


def pytest_generate_tests(metafunc):
    """
    Pytest hook that generates tests from the tests list.

    Any test function requesting a ``test`` argument is parametrized with
    every entry of ``tests``, using each entry's ``name`` as the test id.
    """
    if "test" not in metafunc.fixturenames:
        return
    eval_ids = [spec["name"] for spec in tests]
    metafunc.parametrize("test", tests, ids=eval_ids)
36 changes: 35 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ flask = {version = "^2.3", optional=true}
[tool.poetry.group.dev.dependencies]
pytest = "^7.2"
pytest-cov = "*"
pytest-xdist = "^3.5.0"
pytest-profiling = "^1.7.0"
pytest-dotenv = "^0.5.2"
mypy = "*"
Expand Down

0 comments on commit 14ca2df

Please sign in to comment.