Add CodeActSWEAgent to remove browsing & github + improvements on age…

…ntskills (#2105) * update swe_bench prompt; use minimal prompt for codeact; * upgrade agentskills and update testcases * update infer prompt * fix cwd * add icl for swebench * also log in_context_example to run infer * remove extra print * change prompt to abs path * update error message to include current file info * change cwd for jupyter if needed * update edit error message * update prompt * improve git get patch * update hint string * default to 50 turns * revert changes from codeact agent and create new CodeActSWEAgent * revert changes to codeact * revert instructions for run infer * revert instructions for run infer * update README * update max iter * add codeact swe agent * fix issue for CodeActSWEAgent * allow specifying max iter in cmdline script * stop printing * Update agenthub/codeact_swe_agent/README.md Co-authored-by: Yufan Song <[email protected]> * Fix prompt regression in jupyter plugin --------- Co-authored-by: Yufan Song <[email protected]> Co-authored-by: Boxuan Li <[email protected]>
All-Hands-AI · May 30, 2024 · 01ef902 · 01ef902
1 parent b1ec8e5
commit 01ef902
Show file tree

Hide file tree

Showing 14 changed files with 1,022 additions and 76 deletions.
diff --git a/agenthub/__init__.py b/agenthub/__init__.py
@@ -12,6 +12,7 @@
     SWE_agent,
     browsing_agent,
     codeact_agent,
+    codeact_swe_agent,
     delegator_agent,
     dummy_agent,
     monologue_agent,
@@ -21,6 +22,7 @@
 __all__ = [
     'monologue_agent',
     'codeact_agent',
+    'codeact_swe_agent',
     'planner_agent',
     'SWE_agent',
     'delegator_agent',

diff --git a/agenthub/codeact_agent/codeact_agent.py b/agenthub/codeact_agent/codeact_agent.py
@@ -105,6 +105,18 @@ def truncate_observation(observation: str, max_chars: int = 10_000) -> str:
     )
 
 
+# FIXME: We can tweak these two settings to create MicroAgents specialized toward different areas
+def get_system_message() -> str:
+    if ENABLE_GITHUB:
+        return f'{SYSTEM_PREFIX}\n{GITHUB_MESSAGE}\n\n{COMMAND_DOCS}\n\n{SYSTEM_SUFFIX}'
+    else:
+        return f'{SYSTEM_PREFIX}\n\n{COMMAND_DOCS}\n\n{SYSTEM_SUFFIX}'
+
+
+def get_in_context_example() -> str:
+    return EXAMPLES
+
+
 class CodeActAgent(Agent):
     VERSION = '1.5'
     """
@@ -152,11 +164,8 @@ class CodeActAgent(Agent):
     ]
     jupyter_kernel_init_code: str = 'from agentskills import *'
 
-    system_message: str = (
-        f'{SYSTEM_PREFIX}\n{GITHUB_MESSAGE}\n\n{COMMAND_DOCS}\n\n{SYSTEM_SUFFIX}'
-        if ENABLE_GITHUB
-        else f'{SYSTEM_PREFIX}\n\n{COMMAND_DOCS}\n\n{SYSTEM_SUFFIX}'
-    )
+    system_message: str = get_system_message()
+    in_context_example: str = f"Here is an example of how you can interact with the environment for task solving:\n{get_in_context_example()}\n\nNOW, LET'S START!"
 
     def __init__(
         self,
@@ -194,10 +203,7 @@ def step(self, state: State) -> Action:
         """
         messages: list[dict[str, str]] = [
             {'role': 'system', 'content': self.system_message},
-            {
-                'role': 'user',
-                'content': f"Here is an example of how you can interact with the environment for task solving:\n{EXAMPLES}\n\nNOW, LET'S START!\n",
-            },
+            {'role': 'user', 'content': self.in_context_example},
         ]
 
         for prev_action, obs in state.history:

diff --git a/agenthub/codeact_agent/prompt.py b/agenthub/codeact_agent/prompt.py
@@ -8,17 +8,23 @@
     "Please note that THE `edit_file` FUNCTION REQUIRES PROPER INDENTATION. If the assistant would like to add the line '        print(x)', it must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run."
 )
 
-SYSTEM_PREFIX = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+# ======= SYSTEM MESSAGE =======
+MINIMAL_SYSTEM_PREFIX = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
 The assistant can interact with an interactive Python (Jupyter Notebook) environment and receive the corresponding output when needed. The code should be enclosed using "<execute_ipython>" tag, for example:
 <execute_ipython>
 print("Hello World!")
 </execute_ipython>
 The assistant can execute bash commands on behalf of the user by wrapping them with <execute_bash> and </execute_bash>.
 For example, you can list the files in the current directory by <execute_bash> ls </execute_bash>.
-The assistant can browse the Internet with commands on behalf of the user by wrapping them with <execute_browse> and </execute_browse>.
+"""
+
+BROWSING_PREFIX = """The assistant can browse the Internet with commands on behalf of the user by wrapping them with <execute_browse> and </execute_browse>.
 For example, you can browse a given URL by <execute_browse> goto("<URL>") </execute_browse>.
 The assistant should attempt fewer things at a time instead of putting too much commands OR code in one "execute" block.
-The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: <execute_ipython> %pip install [package needed] </execute_ipython> and should always import packages and define variables before starting to use them."""
+"""
+PIP_INSTALL_PREFIX = """The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: <execute_ipython> %pip install [package needed] </execute_ipython> and should always import packages and define variables before starting to use them."""
+
+SYSTEM_PREFIX = MINIMAL_SYSTEM_PREFIX + BROWSING_PREFIX + PIP_INSTALL_PREFIX
 
 GITHUB_MESSAGE = """To do any activities on GitHub, the assistant should use the token in the $GITHUB_TOKEN environment variable.
 For instance, to push a local branch `my_branch` to the github repo `owner/repo`, the assistant can use the following four commands:
@@ -30,6 +36,8 @@
 IMPORTANT: Whenever possible, execute the code for the user using <execute_ipython> or <execute_bash> or <execute_browse> instead of providing it.
 """
 
+
+# ======= EXAMPLE MESSAGE =======
 EXAMPLES = """
 --- START OF EXAMPLE ---
 

diff --git a/agenthub/codeact_swe_agent/README.md b/agenthub/codeact_swe_agent/README.md
@@ -0,0 +1,7 @@
+# CodeAct (SWE Edit Specialized)
+
+This agent is an adaptation of the original [SWE Agent](https://swe-agent.com/) based on CodeAct using the `agentskills` library of OpenDevin.
+
+Its intended use is **solving GitHub issues**.
+
+It removes the web-browsing and GitHub capabilities of the original CodeAct agent to avoid confusing the agent.
diff --git a/agenthub/codeact_swe_agent/__init__.py b/agenthub/codeact_swe_agent/__init__.py
@@ -0,0 +1,5 @@
+from opendevin.controller.agent import Agent
+
+from .codeact_swe_agent import CodeActSWEAgent
+
+Agent.register('CodeActSWEAgent', CodeActSWEAgent)
diff --git a/agenthub/codeact_swe_agent/codeact_swe_agent.py b/agenthub/codeact_swe_agent/codeact_swe_agent.py
@@ -0,0 +1,246 @@
+import re
+
+from agenthub.codeact_swe_agent.prompt import (
+    COMMAND_DOCS,
+    MINIMAL_SYSTEM_PREFIX,
+    SWE_EXAMPLE,
+    SYSTEM_SUFFIX,
+)
+from opendevin.controller.agent import Agent
+from opendevin.controller.state.state import State
+from opendevin.events.action import (
+    Action,
+    AgentFinishAction,
+    BrowseInteractiveAction,
+    CmdRunAction,
+    IPythonRunCellAction,
+    MessageAction,
+)
+from opendevin.events.observation import (
+    BrowserOutputObservation,
+    CmdOutputObservation,
+    IPythonRunCellObservation,
+)
+from opendevin.llm.llm import LLM
+from opendevin.runtime.plugins import (
+    AgentSkillsRequirement,
+    JupyterRequirement,
+    PluginRequirement,
+)
+
+
+def parse_response(response) -> str:
+    action = response.choices[0].message.content
+    for lang in ['bash', 'ipython', 'browse']:
+        if f'<execute_{lang}>' in action and f'</execute_{lang}>' not in action:
+            action += f'</execute_{lang}>'
+    return action
+
+
+def action_to_str(action: Action) -> str:
+    if isinstance(action, CmdRunAction):
+        return f'{action.thought}\n<execute_bash>\n{action.command}\n</execute_bash>'
+    elif isinstance(action, IPythonRunCellAction):
+        return f'{action.thought}\n<execute_ipython>\n{action.code}\n</execute_ipython>'
+    elif isinstance(action, BrowseInteractiveAction):
+        return f'{action.thought}\n<execute_browse>\n{action.browser_actions}\n</execute_browse>'
+    elif isinstance(action, MessageAction):
+        return action.content
+    return ''
+
+
+def get_action_message(action: Action) -> dict[str, str] | None:
+    if (
+        isinstance(action, BrowseInteractiveAction)
+        or isinstance(action, CmdRunAction)
+        or isinstance(action, IPythonRunCellAction)
+        or isinstance(action, MessageAction)
+    ):
+        return {
+            'role': 'user' if action.source == 'user' else 'assistant',
+            'content': action_to_str(action),
+        }
+    return None
+
+
+def get_observation_message(obs) -> dict[str, str] | None:
+    if isinstance(obs, CmdOutputObservation):
+        content = 'OBSERVATION:\n' + truncate_observation(obs.content)
+        content += (
+            f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]]'
+        )
+        return {'role': 'user', 'content': content}
+    elif isinstance(obs, IPythonRunCellObservation):
+        content = 'OBSERVATION:\n' + obs.content
+        # replace base64 images with a placeholder
+        splitted = content.split('\n')
+        for i, line in enumerate(splitted):
+            if '![image](data:image/png;base64,' in line:
+                splitted[i] = (
+                    '![image](data:image/png;base64, ...) already displayed to user'
+                )
+        content = '\n'.join(splitted)
+        content = truncate_observation(content)
+        return {'role': 'user', 'content': content}
+    elif isinstance(obs, BrowserOutputObservation):
+        content = 'OBSERVATION:\n' + truncate_observation(obs.content)
+        return {'role': 'user', 'content': content}
+    return None
+
+
+def truncate_observation(observation: str, max_chars: int = 10_000) -> str:
+    """
+    Truncate the middle of the observation if it is too long.
+    """
+    if len(observation) <= max_chars:
+        return observation
+    half = max_chars // 2
+    return (
+        observation[:half]
+        + '\n[... Observation truncated due to length ...]\n'
+        + observation[-half:]
+    )
+
+
+def get_system_message() -> str:
+    return f'{MINIMAL_SYSTEM_PREFIX}\n\n{COMMAND_DOCS}\n\n{SYSTEM_SUFFIX}'
+
+
+def get_in_context_example() -> str:
+    return SWE_EXAMPLE
+
+
+class CodeActSWEAgent(Agent):
+    VERSION = '1.5'
+    """
+    This agent is an adaptation of the original [SWE Agent](https://swe-agent.com/) based on CodeAct 1.5 using the `agentskills` library of OpenDevin.
+
+    It is intended use is **solving Github issues**.
+
+    It removes web-browsing and Github capability from the original CodeAct agent to avoid confusion to the agent.
+    """
+
+    sandbox_plugins: list[PluginRequirement] = [
+        # NOTE: AgentSkillsRequirement need to go before JupyterRequirement, since
+        # AgentSkillsRequirement provides a lot of Python functions
+        # and it need to be initialized before Jupyter for Jupyter to use those functions.
+        AgentSkillsRequirement(),
+        JupyterRequirement(),
+    ]
+    jupyter_kernel_init_code: str = 'from agentskills import *'
+
+    system_message: str = get_system_message()
+    in_context_example: str = f"Here is an example of how you can interact with the environment for task solving:\n{get_in_context_example()}\n\nNOW, LET'S START!"
+
+    def __init__(
+        self,
+        llm: LLM,
+    ) -> None:
+        """
+        Initializes a new instance of the CodeActAgent class.
+
+        Parameters:
+        - llm (LLM): The llm to be used by this agent
+        """
+        super().__init__(llm)
+        self.reset()
+
+    def reset(self) -> None:
+        """
+        Resets the CodeAct Agent.
+        """
+        super().reset()
+
+    def step(self, state: State) -> Action:
+        """
+        Performs one step using the CodeAct Agent.
+        This includes gathering info on previous steps and prompting the model to make a command to execute.
+
+        Parameters:
+        - state (State): used to get updated info and background commands
+
+        Returns:
+        - CmdRunAction(command) - bash command to run
+        - IPythonRunCellAction(code) - IPython code to run
+        - BrowseInteractiveAction(browsergym_command) - BrowserGym commands to run
+        - MessageAction(content) - Message action to run (e.g. ask for clarification)
+        - AgentFinishAction() - end the interaction
+        """
+        messages: list[dict[str, str]] = [
+            {'role': 'system', 'content': self.system_message},
+            {'role': 'user', 'content': self.in_context_example},
+        ]
+
+        for prev_action, obs in state.history:
+            action_message = get_action_message(prev_action)
+            if action_message:
+                messages.append(action_message)
+
+            obs_message = get_observation_message(obs)
+            if obs_message:
+                messages.append(obs_message)
+
+        latest_user_message = [m for m in messages if m['role'] == 'user'][-1]
+        if latest_user_message:
+            if latest_user_message['content'].strip() == '/exit':
+                return AgentFinishAction()
+            latest_user_message['content'] += (
+                f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task.'
+            )
+
+        response = self.llm.do_completion(
+            messages=messages,
+            stop=[
+                '</execute_ipython>',
+                '</execute_bash>',
+                '</execute_browse>',
+            ],
+            temperature=0.0,
+        )
+
+        action_str: str = parse_response(response)
+        state.num_of_chars += sum(
+            len(message['content']) for message in messages
+        ) + len(action_str)
+
+        if finish_command := re.search(r'<finish>.*</finish>', action_str, re.DOTALL):
+            thought = action_str.replace(finish_command.group(0), '').strip()
+            return AgentFinishAction(thought=thought)
+        if bash_command := re.search(
+            r'<execute_bash>(.*?)</execute_bash>', action_str, re.DOTALL
+        ):
+            # remove the command from the action string to get thought
+            thought = action_str.replace(bash_command.group(0), '').strip()
+            # a command was found
+            command_group = bash_command.group(1).strip()
+
+            if command_group.strip() == 'exit':
+                return AgentFinishAction()
+            return CmdRunAction(command=command_group, thought=thought)
+        elif python_code := re.search(
+            r'<execute_ipython>(.*?)</execute_ipython>', action_str, re.DOTALL
+        ):
+            # a code block was found
+            code_group = python_code.group(1).strip()
+            thought = action_str.replace(python_code.group(0), '').strip()
+            return IPythonRunCellAction(
+                code=code_group,
+                thought=thought,
+                kernel_init_code=self.jupyter_kernel_init_code,
+            )
+        elif browse_command := re.search(
+            r'<execute_browse>(.*)</execute_browse>', action_str, re.DOTALL
+        ):
+            # BrowserGym actions was found
+            browse_actions = browse_command.group(1).strip()
+            thought = action_str.replace(browse_command.group(0), '').strip()
+            return BrowseInteractiveAction(
+                browser_actions=browse_actions, thought=thought
+            )
+        else:
+            # We assume the LLM is GOOD enough that when it returns pure natural language
+            # it want to talk to the user
+            return MessageAction(content=action_str, wait_for_response=True)
+
+    def search_memory(self, query: str) -> list[str]:
+        raise NotImplementedError('Implement this abstract method')