update docs and remove comments

2024-06-19 15:15:58 -07:00 · 2024-06-19 15:15:58 -07:00 · 564255adee
parent d162ee69a3
commit 564255adee
8 changed files with 19 additions and 64 deletions
--- a/README.md
+++ b/README.md
@ -127,7 +127,9 @@ If you want to run local speech-to-text using Whisper, you must install Rust. Fo

 ## Customizations

-To customize the behavior of the system, edit the [system message, model, skills library path,](https://docs.openinterpreter.com/settings/all-settings) etc. in `i.py`. This file sets up an interpreter, and is powered by Open Interpreter.
+To customize the behavior of the system, edit the [system message, model, skills library path,](https://docs.openinterpreter.com/settings/all-settings) etc. in the `profiles` directory under the `server` directory. Each profile file sets up an interpreter, and is powered by Open Interpreter.
+
+To specify the text-to-speech service used by the 01 client (`base_device.py`), set `interpreter.tts` in a profile to one of "openai" (OpenAI), "elevenlabs" (ElevenLabs), or "coqui" (Coqui, local).

 ## Ubuntu Dependencies

--- a/software/source/clients/base_device.py
+++ b/software/source/clients/base_device.py
@ -91,7 +91,6 @@ class Device:
        self.server_url = ""
        self.ctrl_pressed = False
        self.tts_service = ""
-        self.playback_latency = None

    def fetch_image_from_camera(self, camera_index=CAMERA_DEVICE_INDEX):
        """Captures an image from the specified camera device and saves it to a temporary file. Adds the image to the captured_images list."""
@ -165,10 +164,6 @@ class Device:
        while True:
            try:
                audio = await self.audiosegments.get()
-                if self.playback_latency and isinstance(audio, bytes):
-                    elapsed_time = time.time() - self.playback_latency
-                    print(f"Time from request to playback: {elapsed_time} seconds")
-                    self.playback_latency = None

                if self.tts_service == "elevenlabs":
                    mpv_process.stdin.write(audio)  # type: ignore
@ -224,7 +219,6 @@ class Device:
        stream.stop_stream()
        stream.close()
        print("Recording stopped.")
-        self.playback_latency = time.time()

        duration = wav_file.getnframes() / RATE
        if duration < 0.3:
--- a/software/source/server/async_interpreter.py
+++ b/software/source/server/async_interpreter.py
@ -22,11 +22,6 @@ import os

 class AsyncInterpreter:
    def __init__(self, interpreter):
-        self.stt_latency = None
-        self.tts_latency = None
-        self.interpreter_latency = None
-        self.time_from_first_yield_to_first_put = None
-
        self.interpreter = interpreter

        # STT
@ -128,9 +123,7 @@ class AsyncInterpreter:

                    # Experimental: The AI voice sounds better with replacements like these, but it should happen at the TTS layer
                    # content = content.replace(". ", ". ... ").replace(", ", ", ... ").replace("!", "! ... ").replace("?", "? ... ")
-                    print("yielding ", content)
-                    if self.time_from_first_yield_to_first_put is None:
-                        self.time_from_first_yield_to_first_put = time.time()
+                    # print("yielding ", content)

                    yield content

@ -162,9 +155,6 @@ class AsyncInterpreter:
                        )

        # Send a completion signal
-        end_interpreter = time.time()
-        self.interpreter_latency = end_interpreter - start_interpreter
-        print("INTERPRETER LATENCY", self.interpreter_latency)
        # self.add_to_output_queue_sync({"role": "server","type": "completion", "content": "DONE"})

    async def run(self):
@ -179,11 +169,7 @@ class AsyncInterpreter:
        while not self._input_queue.empty():
            input_queue.append(self._input_queue.get())

-        start_stt = time.time()
        message = self.stt.text()
-        end_stt = time.time()
-        self.stt_latency = end_stt - start_stt
-        print("STT LATENCY", self.stt_latency)

        print(message)

@ -210,23 +196,11 @@ class AsyncInterpreter:
                        "end": True,
                    }
                )
-                end_tts = time.time()
-                self.tts_latency = end_tts - self.tts.stream_start_time
-                print("TTS LATENCY", self.tts_latency)
                self.tts.stop()
                break

    async def _on_tts_chunk_async(self, chunk):
-        print("adding chunk to queue")
-        if (
-            self.time_from_first_yield_to_first_put is not None
-            and self.time_from_first_yield_to_first_put != 0
-        ):
-            print(
-                "time from first yield to first put is ",
-                time.time() - self.time_from_first_yield_to_first_put,
-            )
-            self.time_from_first_yield_to_first_put = 0
+        # print("adding chunk to queue")
        await self._add_to_queue(self._output_queue, chunk)

    def on_tts_chunk(self, chunk):
@ -234,8 +208,5 @@ class AsyncInterpreter:
        asyncio.run(self._on_tts_chunk_async(chunk))

    async def output(self):
-        print("outputting chunks")
+        # print("outputting chunks")
        return await self._output_queue.get()
-
-    def shutdown(self):
-        self.stt.shutdown()
--- a/software/source/server/async_server.py
+++ b/software/source/server/async_server.py
@ -1,9 +1,13 @@
-# TODO: import from the profiles directory the interpreter that should be served!!
+# import the interpreter to be served from the profiles directory

-from .profiles.fast import interpreter as base_interpreter
+# add other profiles to the directory to define other interpreter instances and import them here
+# {.profiles.fast: optimizes for STT/TTS latency with the fastest models }
+# {.profiles.local: uses local models and local STT/TTS }
+# {.profiles.default: uses default interpreter settings with optimized TTS latency }

+# from .profiles.fast import interpreter as base_interpreter
 # from .profiles.local import interpreter as base_interpreter
-# from .profiles.default import interpreter as base_interpreter
+from .profiles.default import interpreter as base_interpreter

 import asyncio
 import traceback
--- a/software/source/server/conftest.py
+++ b/software/source/server/conftest.py
@ -1,3 +1,5 @@
+# tests currently hang after completion
+
 """
 import pytest
 import signal
--- a/software/source/server/profiles/default.py
+++ b/software/source/server/profiles/default.py
@ -3,9 +3,9 @@ from interpreter import interpreter
 # This is an Open Interpreter compatible profile.
 # Visit https://01.openinterpreter.com/profile for all options.

-# 01 suports OpenAI, ElevenLabs, and Coqui (Local) TTS providers
+# 01 supports OpenAI, ElevenLabs, and Coqui (Local) TTS providers
 # {OpenAI: "openai", ElevenLabs: "elevenlabs", Coqui: "coqui"}
-interpreter.tts = "openai"
+interpreter.tts = "elevenlabs"

 # Connect your 01 to a language model
 interpreter.llm.model = "gpt-4-turbo"
--- a/software/source/server/profiles/fast.py
+++ b/software/source/server/profiles/fast.py
@ -3,7 +3,7 @@ from interpreter import interpreter
 # This is an Open Interpreter compatible profile.
 # Visit https://01.openinterpreter.com/profile for all options.

-# 01 suports OpenAI, ElevenLabs, and Coqui (Local) TTS providers
+# 01 supports OpenAI, ElevenLabs, and Coqui (Local) TTS providers
 # {OpenAI: "openai", ElevenLabs: "elevenlabs", Coqui: "coqui"}
 interpreter.tts = "elevenlabs"

@ -16,27 +16,9 @@ interpreter.llm.context_window = 2048
 interpreter.llm.max_tokens = 4096
 interpreter.llm.temperature = 0.8

-# interpreter.llm.api_key = os.environ["GROQ_API_KEY"]
-
 interpreter.computer.import_computer_api = False

 interpreter.auto_run = True
 interpreter.system_message = (
    "You are a helpful assistant that can answer questions and help with tasks."
 )
-
-# TODO: include other options in comments in the profiles for tts
-# direct people to the profiles directory to make changes to the interpreter profile
-# this should be made explicit on the docs
-
-"""
-    llm_service: str = "litellm",
-    model: str = "gpt-4",
-    llm_supports_vision: bool = False,
-    llm_supports_functions: bool = False,
-    context_window: int = 2048,
-    max_tokens: int = 4096,
-    temperature: float = 0.8,
-    tts_service: str = "elevenlabs",
-    stt_service: str = "openai",
-"""
--- a/software/source/server/profiles/local.py
+++ b/software/source/server/profiles/local.py
@ -1,6 +1,6 @@
 from interpreter import interpreter

-# 01 suports OpenAI, ElevenLabs, and Coqui (Local) TTS providers
+# 01 supports OpenAI, ElevenLabs, and Coqui (Local) TTS providers
 # {OpenAI: "openai", ElevenLabs: "elevenlabs", Coqui: "coqui"}
 interpreter.tts = "coqui"