Multimodal code execution

2024-10-23 19:32:15 -07:00 · 2024-10-23 19:32:15 -07:00 · a5b6948c9c
parent 521242c36d
commit a5b6948c9c
2 changed files with 80 additions and 8 deletions
--- a/software/main.py
+++ b/software/main.py
@@ -258,7 +258,7 @@ def run(

        ### START LIVEKIT WORKER
        if server == "livekit":
-            time.sleep(7)
+            time.sleep(5)
            # These are needed to communicate with the worker's entrypoint
            os.environ['INTERPRETER_SERVER_HOST'] = light_server_host
            os.environ['INTERPRETER_SERVER_PORT'] = str(light_server_port)
@@ -273,7 +273,7 @@ def run(
                    room="my-room",
            )).to_jwt())

-            meet_url = f'https://meet.livekit.io/custom?liveKitUrl={url.replace("http", "ws")}&token={token}\n\n'
+            # meet_url = f'http://localhost:3000/custom?liveKitUrl={url.replace("http", "ws")}&token={token}\n\n'
            print("\n")
            print("For debugging, you can join a video call with your assistant. Click the link below, then send a chat message that says {CONTEXT_MODE_OFF}, then begin speaking:")
            print(meet_url)
--- a/software/source/server/livekit/multimodal.py
+++ b/software/source/server/livekit/multimodal.py
@@ -11,38 +11,110 @@ from livekit.agents.multimodal import MultimodalAgent
 from livekit.plugins import openai
 from dotenv import load_dotenv
 import os
+import time
+from typing import Annotated
+from livekit.agents import llm
+
+# Set the environment variable
+os.environ['INTERPRETER_TERMINAL_INPUT_PATIENCE'] = '200000'
+
+instructions = """
+You are Open Interpreter, a world-class programmer that can complete any goal by executing code.
+For advanced requests, start by writing a plan.
+When you execute code, it will be executed **on the user's machine** in a stateful Jupyter notebook. The user has given you **full permission** to execute any code necessary to complete the task. Execute the code. You CAN run code on the users machine, using the tool you have access to.
+You can access the internet. Run **any code** to achieve the goal, and if at first you don't succeed, try again and again.
+You can install new packages.
+If you modify or create a file, YOU MUST THEN OPEN IT to display it to the user.
+Be concise. Do NOT send the user a markdown version of your code — just execute the code instantly. Execute the code!
+
+You are capable of **any** task.
+
+You MUST remember to pass into the execute_code function a correct JSON input like {"code": "print('hello world')"} and NOT a raw string or something else.
+"""

 load_dotenv()

 async def entrypoint(ctx: JobContext):
+    
+    from interpreter import interpreter
+
+    def execute_code(code):
+        print("--- code ---")
+        print(code)
+        print("---")
+        #time.sleep(2)
+        # Check if the code contains any file deletion commands
+        if any(keyword in code.lower() for keyword in ['os.remove', 'os.unlink', 'shutil.rmtree', 'delete file', 'rm -']):
+            print("Warning: File deletion commands detected. Execution aborted for safety.")
+            return "Execution aborted: File deletion commands are not allowed."
+        print("--- output ---")
+        output = ""
+        for chunk in interpreter.computer.run("python", code):
+            if "content" in chunk and type(chunk["content"]) == str:
+                output += "\n" + chunk["content"]
+                print(chunk["content"])
+        print("---")
+
+        output = output.strip()
+        
+        if output == "":
+            output = "No output was produced by running this code."
+        return output
+
+
+    # first define a class that inherits from llm.FunctionContext
+    class AssistantFnc(llm.FunctionContext):
+        # the llm.ai_callable decorator marks this function as a tool available to the LLM
+        # by default, it'll use the docstring as the function's description
+        @llm.ai_callable()
+        async def execute(
+            self,
+            # by using the Annotated type, arg description and type are available to the LLM
+            code: Annotated[
+                str, llm.TypeInfo(description="The Python code to execute")
+            ],
+        ):
+            """Executes Python and returns the output"""
+            return execute_code(code)
+
+    fnc_ctx = AssistantFnc()
+
    await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY)

    participant = await ctx.wait_for_participant()

    openai_api_key = os.getenv("OPENAI_API_KEY")
    model = openai.realtime.RealtimeModel(
-        instructions="You are a helpful assistant and you love open-source software",
+        instructions=instructions,
        voice="shimmer",
-        temperature=0.8,
+        temperature=0.6,
        modalities=["audio", "text"],
        api_key=openai_api_key,
        base_url="wss://api.openai.com/v1",
    )
-    assistant = MultimodalAgent(model=model)
+    model._fnc_ctx = fnc_ctx
+    assistant = MultimodalAgent(model=model, fnc_ctx=fnc_ctx)
+
    assistant.start(ctx.room)

-    session = model.sessions[0]
+    # Create a session with the function context
+    session = model.session(
+        chat_ctx=llm.ChatContext(),
+        fnc_ctx=fnc_ctx,
+    )
+
+    # Initial message to start the interaction
    session.conversation.item.create(
      llm.ChatMessage(
        role="user",
-        content="Please begin the interaction with the user in a manner consistent with your instructions.",
+        content="Hello!",
      )
    )
    session.response.create()

 def main(livekit_url):
    # Workers have to be run as CLIs right now.
-    # So we need to simualte running "[this file] dev"
+    # So we need to simulate running "[this file] dev"

    # Modify sys.argv to set the path to this file as the first argument
    # and 'dev' as the second argument