Multimodal code execution

2024-10-23 19:32:15 -07:00 · 2024-10-23 19:32:15 -07:00 · a5b6948c9c
parent 521242c36d
commit a5b6948c9c
2 changed files with 80 additions and 8 deletions
--- a/software/main.py
+++ b/software/main.py
@@ -258,7 +258,7 @@ def run(
        ### START LIVEKIT WORKER
        if server == "livekit":
-            time.sleep(7)
+            time.sleep(5)
            # These are needed to communicate with the worker's entrypoint
            os.environ['INTERPRETER_SERVER_HOST'] = light_server_host
            os.environ['INTERPRETER_SERVER_PORT'] = str(light_server_port)
@@ -273,7 +273,7 @@ def run(
                    room="my-room",
            )).to_jwt())
-            meet_url = f'https://meet.livekit.io/custom?liveKitUrl={url.replace("http", "ws")}&token={token}\n\n'
+            # meet_url = f'http://localhost:3000/custom?liveKitUrl={url.replace("http", "ws")}&token={token}\n\n'
            print("\n")
            print("For debugging, you can join a video call with your assistant. Click the link below, then send a chat message that says {CONTEXT_MODE_OFF}, then begin speaking:")
            print(meet_url)
--- a/software/source/server/livekit/multimodal.py
+++ b/software/source/server/livekit/multimodal.py
@@ -11,38 +11,110 @@ from livekit.agents.multimodal import MultimodalAgent
 from livekit.plugins import openai
 from dotenv import load_dotenv
 import os
 import time
 from typing import Annotated
 from livekit.agents import llm
 # Set the environment variable at module import time, i.e. before
 # entrypoint() imports `interpreter`.
 # NOTE(review): presumably this controls how long the interpreter waits on
 # terminal input; the meaning/units of '200000' are not visible here — confirm.
 os.environ['INTERPRETER_TERMINAL_INPUT_PATIENCE'] = '200000'
 # System prompt handed to the realtime model via `instructions=instructions`
 # in entrypoint(). This is runtime text sent to the model, so any wording
 # change is a behavior change.
 instructions = """
 You are Open Interpreter, a world-class programmer that can complete any goal by executing code.
 For advanced requests, start by writing a plan.
 When you execute code, it will be executed **on the user's machine** in a stateful Jupyter notebook. The user has given you **full permission** to execute any code necessary to complete the task. Execute the code. You CAN run code on the users machine, using the tool you have access to.
 You can access the internet. Run **any code** to achieve the goal, and if at first you don't succeed, try again and again.
 You can install new packages.
 If you modify or create a file, YOU MUST THEN OPEN IT to display it to the user.
 Be concise. Do NOT send the user a markdown version of your code — just execute the code instantly. Execute the code!
 You are capable of **any** task.
 You MUST remember to pass into the execute_code function a correct JSON input like {"code": "print('hello world')"} and NOT a raw string or something else.
 """
 # Load variables from a .env file (python-dotenv) before entrypoint() reads
 # OPENAI_API_KEY with os.getenv.
 load_dotenv()
 async def entrypoint(ctx: JobContext):
    from interpreter import interpreter
    def execute_code(code):
        """Run ``code`` as Python through the interpreter and return its text output.

        The code is echoed to stdout, screened for file-deletion keywords, and
        then executed with ``interpreter.computer.run("python", code)``
        (``interpreter`` comes from the enclosing scope). Every chunk whose
        ``"content"`` value is a string is printed as it arrives and collected.

        Args:
            code: Python source to execute.

        Returns:
            The collected string chunks joined by newlines with surrounding
            whitespace stripped; a fixed "no output" message when nothing was
            collected; or a fixed "aborted" message when a deletion keyword
            was found (in which case nothing is executed).
        """
        print("--- code ---")
        print(code)
        print("---")

        # Best-effort guard only: a plain, case-insensitive substring match on
        # a fixed keyword list. It is not a sandbox and can be bypassed.
        blocked_keywords = ('os.remove', 'os.unlink', 'shutil.rmtree', 'delete file', 'rm -')
        lowered_code = code.lower()  # lower-case once instead of once per keyword
        if any(keyword in lowered_code for keyword in blocked_keywords):
            print("Warning: File deletion commands detected. Execution aborted for safety.")
            return "Execution aborted: File deletion commands are not allowed."

        print("--- output ---")
        pieces = []
        for chunk in interpreter.computer.run("python", code):
            # Keep textual content only; chunks without a string "content"
            # are ignored.
            if "content" in chunk and isinstance(chunk["content"], str):
                pieces.append(chunk["content"])
                print(chunk["content"])
        print("---")

        # Joining once avoids repeated string concatenation; strip() yields the
        # same result as the previous "\n"-prefixed accumulation.
        output = "\n".join(pieces).strip()
        if not output:
            output = "No output was produced by running this code."
        return output
    # first define a class that inherits from llm.FunctionContext
    class AssistantFnc(llm.FunctionContext):
        # Function context exposing a single tool, `execute`, to the LLM. An
        # instance is created right after this class and passed as `fnc_ctx`
        # to both the MultimodalAgent and the model session.
        # the llm.ai_callable decorator marks this function as a tool available to the LLM
        # by default, it'll use the docstring as the function's description
        @llm.ai_callable()
        async def execute(
            self,
            # by using the Annotated type, arg description and type are available to the LLM
            code: Annotated[
                str, llm.TypeInfo(description="The Python code to execute")
            ],
        ):
            # NOTE: the docstring below is runtime text (it is the tool
            # description shown to the LLM, per the comment above), so edit it
            # deliberately.
            """Executes Python and returns the output"""
            # Delegates to the enclosing execute_code() helper and returns its
            # string result to the LLM.
            # NOTE(review): execute_code is a regular synchronous function, so
            # this coroutine does not await while the code runs — confirm that
            # blocking here is acceptable.
            return execute_code(code)
    fnc_ctx = AssistantFnc()
    await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY)
    participant = await ctx.wait_for_participant()
    openai_api_key = os.getenv("OPENAI_API_KEY")
    model = openai.realtime.RealtimeModel(
-        instructions="You are a helpful assistant and you love open-source software",
+        instructions=instructions,
        voice="shimmer",
-        temperature=0.8,
+        temperature=0.6,
        modalities=["audio", "text"],
        api_key=openai_api_key,
        base_url="wss://api.openai.com/v1",
    )
-    assistant = MultimodalAgent(model=model)
+    model._fnc_ctx = fnc_ctx
    assistant = MultimodalAgent(model=model, fnc_ctx=fnc_ctx)
    assistant.start(ctx.room)
-    session = model.sessions[0]
+    # Create a session with the function context
    session = model.session(
        chat_ctx=llm.ChatContext(),
        fnc_ctx=fnc_ctx,
    )
    # Initial message to start the interaction
    session.conversation.item.create(
      llm.ChatMessage(
        role="user",
-        content="Please begin the interaction with the user in a manner consistent with your instructions.",
+        content="Hello!",
      )
    )
    session.response.create()
 def main(livekit_url):
    # Workers have to be run as CLIs right now.
-    # So we need to simualte running "[this file] dev"
+    # So we need to simulate running "[this file] dev"
    # Modify sys.argv to set the path to this file as the first argument
    # and 'dev' as the second argument