-
-
Notifications
You must be signed in to change notification settings - Fork 213
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
test: run evals as tests, refactor evals, added python-xdist for parallel testing
- Loading branch information
Showing
6 changed files
with
140 additions
and
89 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
from typing import TYPE_CHECKING | ||
|
||
if TYPE_CHECKING: | ||
from main import ExecTest | ||
|
||
def _prints_hello_human(ctx):
    """Return True when the run's stdout is exactly the expected greeting line."""
    return ctx.stdout == "Hello, human!\n"


def _hello_file_is_updated(ctx):
    """Return True when hello.py (ignoring surrounding whitespace) is the expected one-liner."""
    return ctx.files["hello.py"].strip() == "print('Hello, human!')"


# Each eval is a dict with:
#   name   - identifier of the eval (also used as the key in tests_map)
#   files  - mapping of file name -> initial file content
#   run    - shell command whose result is exposed to the checks as ``ctx``
#   prompt - instruction given for the eval
#   expect - mapping of check description -> predicate taking ``ctx``
tests: list["ExecTest"] = [
    {
        "name": "hello",
        "files": {"hello.py": "print('Hello, world!')"},
        "run": "python hello.py",
        "prompt": "Change the code in hello.py to print 'Hello, human!'",
        "expect": {
            "correct output": _prints_hello_human,
            "correct file": _hello_file_is_updated,
        },
    },
    {
        "name": "hello-patch",
        "files": {"hello.py": "print('Hello, world!')"},
        "run": "python hello.py",
        "prompt": "Patch the code in hello.py to print 'Hello, human!'",
        "expect": {
            "correct output": _prints_hello_human,
            "correct file": _hello_file_is_updated,
        },
    },
    {
        "name": "hello-ask",
        "files": {"hello.py": "print('Hello, world!')"},
        "run": "echo 'Erik' | python hello.py",
        # TODO: work around the "don't try to execute it" part by improving gptme such that it just gives EOF to stdin in non-interactive mode
        "prompt": "modify hello.py to ask the user for their name and print 'Hello, <name>!'. don't try to execute it",
        "expect": {
            "correct output": lambda ctx: "Hello, Erik!" in ctx.stdout,
        },
    },
    {
        "name": "prime100",
        "files": {},
        "run": "python prime.py",
        "prompt": "write a script prime.py that computes and prints the 100th prime number",
        "expect": {
            # Whitespace-split so that "541" must appear as its own token.
            "correct output": lambda ctx: "541" in ctx.stdout.split(),
        },
    },
    {
        "name": "init-git",
        "files": {},
        "run": "git status",
        "prompt": "initialize a git repository, write a main.py file, and commit it",
        "expect": {
            "clean exit": lambda ctx: ctx.exit_code == 0,
            "clean working tree": lambda ctx: (
                "nothing to commit, working tree clean" in ctx.stdout
            ),
            "main.py exists": lambda ctx: "main.py" in ctx.files,
            "we have a commit": lambda ctx: "No commits yet" not in ctx.stdout,
        },
    },
    # Fails, gets stuck on interactive stuff
    # {
    #     "name": "init-vue-ts-tailwind",
    #     "files": {},
    #     "run": "cat package.json",
    #     "prompt": "initialize a vue project with typescript and tailwind, make a page that says 'Hello, world!'. don't try to execute it or do anything interactive",
    #     "expect": {
    #         "package.json exists": lambda ctx: "package.json" in ctx.files,
    #         "vue installed": lambda ctx: '"vue":' in ctx.files["package.json"],
    #         "tailwind installed": lambda ctx: '"tailwindcss":'
    #         in ctx.files["package.json"],
    #         "typescript installed": lambda ctx: '"typescript":'
    #         in ctx.files["package.json"],
    #     },
    # },
]

# Lookup table from eval name to its definition.
tests_map = {entry["name"]: entry for entry in tests}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
import pytest | ||
from evals import tests | ||
|
||
from main import execute | ||
|
||
|
||
@pytest.mark.slow
def test_eval(test):
    """
    Run one eval and require every one of its expectation cases to pass.

    This test will be run for each eval in the tests list.
    See pytest_generate_tests() below.
    """
    result = execute(test)
    # Gather the failing cases first so the assertion message shows which
    # expectations were not met, rather than an opaque all(<generator>) failure.
    failed = [case for case in result["results"] if not case["passed"]]
    assert not failed, f"eval {test['name']!r} had failing cases: {failed}"
|
||
|
||
# Hook to generate tests from the tests list | ||
def pytest_generate_tests(metafunc):
    """Parametrize the ``test`` argument with every entry of the tests list.

    Only test functions that request an argument named ``test`` are affected;
    each generated case uses the eval's "name" value as its id.
    """
    if "test" not in metafunc.fixturenames:
        return
    eval_ids = [entry["name"] for entry in tests]
    metafunc.parametrize("test", tests, ids=eval_ids)
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters