test: run evals as tests, refactor evals, added pytest-xdist for parallel testing
ErikBjare · Nov 27, 2023 · 14ca2df · 14ca2df
1 parent 75e79bd
commit 14ca2df
Show file tree

Hide file tree

Showing 6 changed files with 140 additions and 89 deletions.
diff --git a/Makefile b/Makefile
@@ -21,6 +21,7 @@ test:
 	@# if SLOW is not set, pass `-m "not slow"` to skip slow tests
 	poetry run pytest ${SRCDIRS} -v --log-level INFO --durations=5 \
 		--cov=gptme --cov-report=xml --cov-report=term-missing --cov-report=html \
+		-n auto \
 		$(if $(SLOW),, -m "not slow") \
 		$(if $(PROFILE), --profile-svg) \
 		$(if $(HAS_PLAYWRIGHT), --cov-config=scripts/.coveragerc-playwright)

diff --git a/eval/evals.py b/eval/evals.py
@@ -0,0 +1,78 @@
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from main import ExecTest
+
+tests: list["ExecTest"] = [
+    {
+        "name": "hello",
+        "files": {"hello.py": "print('Hello, world!')"},
+        "run": "python hello.py",
+        "prompt": "Change the code in hello.py to print 'Hello, human!'",
+        "expect": {
+            "correct output": lambda ctx: ctx.stdout == "Hello, human!\n",
+            "correct file": lambda ctx: ctx.files["hello.py"].strip()
+            == "print('Hello, human!')",
+        },
+    },
+    {
+        "name": "hello-patch",
+        "files": {"hello.py": "print('Hello, world!')"},
+        "run": "python hello.py",
+        "prompt": "Patch the code in hello.py to print 'Hello, human!'",
+        "expect": {
+            "correct output": lambda ctx: ctx.stdout == "Hello, human!\n",
+            "correct file": lambda ctx: ctx.files["hello.py"].strip()
+            == "print('Hello, human!')",
+        },
+    },
+    {
+        "name": "hello-ask",
+        "files": {"hello.py": "print('Hello, world!')"},
+        "run": "echo 'Erik' | python hello.py",
+        # TODO: drop the "don't try to execute it" instruction from the prompt once gptme just gives EOF to stdin in non-interactive mode
+        "prompt": "modify hello.py to ask the user for their name and print 'Hello, <name>!'. don't try to execute it",
+        "expect": {
+            "correct output": lambda ctx: "Hello, Erik!" in ctx.stdout,
+        },
+    },
+    {
+        "name": "prime100",
+        "files": {},
+        "run": "python prime.py",
+        "prompt": "write a script prime.py that computes and prints the 100th prime number",
+        "expect": {
+            "correct output": lambda ctx: "541" in ctx.stdout.split(),
+        },
+    },
+    {
+        "name": "init-git",
+        "files": {},
+        "run": "git status",
+        "prompt": "initialize a git repository, write a main.py file, and commit it",
+        "expect": {
+            "clean exit": lambda ctx: ctx.exit_code == 0,
+            "clean working tree": lambda ctx: "nothing to commit, working tree clean"
+            in ctx.stdout,
+            "main.py exists": lambda ctx: "main.py" in ctx.files,
+            "we have a commit": lambda ctx: "No commits yet" not in ctx.stdout,
+        },
+    },
+    # Disabled: this eval currently fails because the run gets stuck on interactive steps
+    # {
+    #     "name": "init-vue-ts-tailwind",
+    #     "files": {},
+    #     "run": "cat package.json",
+    #     "prompt": "initialize a vue project with typescript and tailwind, make a page that says 'Hello, world!'. don't try to execute it or do anything interactive",
+    #     "expect": {
+    #         "package.json exists": lambda ctx: "package.json" in ctx.files,
+    #         "vue installed": lambda ctx: '"vue":' in ctx.files["package.json"],
+    #         "tailwind installed": lambda ctx: '"tailwindcss":'
+    #         in ctx.files["package.json"],
+    #         "typescript installed": lambda ctx: '"typescript":'
+    #         in ctx.files["package.json"],
+    #     },
+    # },
+]
+
+tests_map = {test["name"]: test for test in tests}
diff --git a/eval/main.py b/eval/main.py
@@ -21,18 +21,9 @@
 from gptme.message import Message
 from gptme.prompts import get_prompt
 
-logger = logging.getLogger(__name__)
-
-
-def hook_chdir(name, *args):
-    if "chdir" in name:
-        logger.warning(f"chdir {args[0]}")
-
+from evals import tests, tests_map
 
-debug = False
-if debug:
-    # add audit hook to see how pwd is changed
-    sys.addaudithook(hook_chdir)
+logger = logging.getLogger(__name__)
 
 Files = Dict[str, str | bytes]
 
@@ -79,7 +70,7 @@ class CaseResult(TypedDict):
     duration: float
 
 
-class TestResult(TypedDict):
+class ExecResult(TypedDict):
     name: str
     results: list[CaseResult]
     timings: dict[str, float]
@@ -93,81 +84,6 @@ class ExecTest(TypedDict):
     expect: dict[str, Callable[[ResultContext], bool]]
 
 
-tests: list[ExecTest] = [
-    {
-        "name": "hello",
-        "files": {"hello.py": "print('Hello, world!')"},
-        "run": "python hello.py",
-        "prompt": "Change the code in hello.py to print 'Hello, human!'",
-        "expect": {
-            "correct output": lambda ctx: ctx.stdout == "Hello, human!\n",
-            "correct file": lambda ctx: ctx.files["hello.py"].strip()
-            == "print('Hello, human!')",
-        },
-    },
-    {
-        "name": "hello-patch",
-        "files": {"hello.py": "print('Hello, world!')"},
-        "run": "python hello.py",
-        "prompt": "Patch the code in hello.py to print 'Hello, human!'",
-        "expect": {
-            "correct output": lambda ctx: ctx.stdout == "Hello, human!\n",
-            "correct file": lambda ctx: ctx.files["hello.py"].strip()
-            == "print('Hello, human!')",
-        },
-    },
-    {
-        "name": "hello-ask",
-        "files": {"hello.py": "print('Hello, world!')"},
-        "run": "echo 'Erik' | python hello.py",
-        # TODO: work around the "don't try to execute it" part by improving gptme such that it just gives EOF to stdin in non-interactive mode
-        "prompt": "modify hello.py to ask the user for their name and print 'Hello, <name>!'. don't try to execute it",
-        "expect": {
-            "correct output": lambda ctx: "Hello, Erik!" in ctx.stdout,
-        },
-    },
-    {
-        "name": "prime100",
-        "files": {},
-        "run": "python prime.py",
-        "prompt": "write a script prime.py that computes and prints the 100th prime number",
-        "expect": {
-            "correct output": lambda ctx: "541" in ctx.stdout.split(),
-        },
-    },
-    {
-        "name": "init-git",
-        "files": {},
-        "run": "git status",
-        "prompt": "initialize a git repository, write a main.py file, and commit it",
-        "expect": {
-            "clean exit": lambda ctx: ctx.exit_code == 0,
-            "clean working tree": lambda ctx: "nothing to commit, working tree clean"
-            in ctx.stdout,
-            "main.py exists": lambda ctx: "main.py" in ctx.files,
-            "we have a commit": lambda ctx: "No commits yet" not in ctx.stdout,
-        },
-    },
-    # Fails, gets stuck on interactive stuff
-    # {
-    #     "name": "init-vue-ts-tailwind",
-    #     "files": {},
-    #     "run": "cat package.json",
-    #     "prompt": "initialize a vue project with typescript and tailwind, make a page that says 'Hello, world!'. don't try to execute it or do anything interactive",
-    #     "expect": {
-    #         "package.json exists": lambda ctx: "package.json" in ctx.files,
-    #         "vue installed": lambda ctx: '"vue":' in ctx.files["package.json"],
-    #         "tailwind installed": lambda ctx: '"tailwindcss":'
-    #         in ctx.files["package.json"],
-    #         "typescript installed": lambda ctx: '"typescript":'
-    #         in ctx.files["package.json"],
-    #     },
-    # },
-]
-
-tests_map = {test["name"]: test for test in tests}
-
-
 class FileStore:
     def __init__(self):
         self.working_dir = Path(tempfile.mkdtemp(prefix="gptme-evals-"))
@@ -201,6 +117,7 @@ def download(self) -> Files:
 
 
 class Agent:
+    @abstractmethod
     def act(self, files: Files | None, prompt: str) -> Files:
         """
         Carries out the prompt and returns artifacts in the form of `Files`.
@@ -281,7 +198,7 @@ def run(self, command) -> tuple[str, str, int]:
         return stdout_full, stderr_full, p.returncode
 
 
-def execute(test: ExecTest) -> TestResult:
+def execute(test: ExecTest) -> ExecResult:
     """
     Executes the code.
     """

diff --git a/eval/test_eval.py b/eval/test_eval.py
@@ -0,0 +1,20 @@
+import pytest
+from evals import tests
+
+from main import execute
+
+
+@pytest.mark.slow
+def test_eval(test):
+    """
+    Runs one eval per parametrized case (see pytest_generate_tests() below).
+    Fails on any failed case, and on an empty result list (all([]) is True).
+    """
+    cases = execute(test)["results"]
+    assert cases and all(case["passed"] for case in cases)
+
+
+def pytest_generate_tests(metafunc):
+    """Pytest hook: give each test that takes a `test` argument one case per eval."""
+    if "test" in metafunc.fixturenames:
+        metafunc.parametrize("test", tests, ids=[spec["name"] for spec in tests])
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -48,6 +48,7 @@ flask = {version = "^2.3", optional=true}
 [tool.poetry.group.dev.dependencies]
 pytest = "^7.2"
 pytest-cov = "*"
+pytest-xdist = "^3.5.0"
 pytest-profiling = "^1.7.0"
 pytest-dotenv = "^0.5.2"
 mypy = "*"