Merge remote-tracking branch 'upstream/main' into u/shivenmian/teach
commit 86a399d218
@@ -167,4 +167,4 @@ cython_debug/
# ignore the aifs index files
_.aifs
01OS/output_audio.wav
01OS/output_audio.wav
@@ -36,6 +36,17 @@ STT_RUNNER=client # If server, audio will be sent over websocket.
# Will expose the server publically and display that URL.
SERVER_EXPOSE_PUBLICALLY=False

# Image capture settings
CAMERA_ENABLED=True

# Camera device selection (Typically 0 for built-in, 1 for USB)
CAMERA_DEVICE_INDEX=0

# Camera warmup time
# This is a workaround for some cameras that don't immediately
# return a properly exposed picture when they are first turned on
CAMERA_WARMUP_SECONDS=0.4

# Debug level
# LOG_LEVEL=DEBUG
LOG_LEVEL="INFO"
@@ -1,6 +1,7 @@
from dotenv import load_dotenv
load_dotenv() # take environment variables from .env.

import os
import asyncio
import threading
import os
@@ -21,7 +22,10 @@ import time
import wave
import tempfile
from datetime import datetime
import cv2
import base64
from interpreter import interpreter # Just for code execution. Maybe we should let people do from interpreter.computer import run?
# In the future, I guess kernel watching code should be elsewhere? Somewhere server / client agnostic?
from ..server.utils.kernel import put_kernel_messages_into_queue
from ..server.utils.get_system_info import get_system_info
from ..server.stt.stt import stt_wav
@@ -30,6 +34,11 @@ from ..server.utils.logs import setup_logging
from ..server.utils.logs import logger
setup_logging()


from ..utils.accumulator import Accumulator

accumulator = Accumulator()

# Configuration for Audio Recording
CHUNK = 1024 # Record in chunks of 1024 samples
FORMAT = pyaudio.paInt16 # 16 bits per sample
@@ -38,25 +47,99 @@ RATE = 44100 # Sample rate
RECORDING = False # Flag to control recording state
SPACEBAR_PRESSED = False # Flag to track spacebar press state

# Camera configuration
CAMERA_ENABLED = bool(os.getenv('CAMERA_ENABLED', False))
CAMERA_DEVICE_INDEX = int(os.getenv('CAMERA_DEVICE_INDEX', 0))
CAMERA_WARMUP_SECONDS = float(os.getenv('CAMERA_WARMUP_SECONDS', 0))

# Specify OS
current_platform = get_system_info()

# Initialize PyAudio
p = pyaudio.PyAudio()

import asyncio

send_queue = queue.Queue()

class Device:
    def __init__(self):
        self.pressed_keys = set()
        self.captured_images = []
        self.audiosegments = []

    def fetch_image_from_camera(self, camera_index=CAMERA_DEVICE_INDEX):
        """Captures an image from the specified camera device and saves it to a temporary file. Adds the image to the captured_images list."""
        image_path = None

        cap = cv2.VideoCapture(camera_index)
        ret, frame = cap.read() # Capture a single frame to initialize the camera

        if CAMERA_WARMUP_SECONDS > 0:
            # Allow camera to warm up, then snap a picture again
            # This is a workaround for some cameras that don't return a properly exposed
            # picture immediately when they are first turned on
            time.sleep(CAMERA_WARMUP_SECONDS)
            ret, frame = cap.read()

        if ret:
            temp_dir = tempfile.gettempdir()
            image_path = os.path.join(temp_dir, f"01_photo_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.png")
            self.captured_images.append(image_path)
            cv2.imwrite(image_path, frame)
            logger.info(f"Camera image captured to {image_path}")
            logger.info(f"You now have {len(self.captured_images)} images which will be sent along with your next audio message.")
        else:
            logger.error(f"Error: Couldn't capture an image from camera ({camera_index})")

        cap.release()

        return image_path


    def encode_image_to_base64(self, image_path):
        """Encodes an image file to a base64 string."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def add_image_to_send_queue(self, image_path):
        """Encodes an image and adds an LMC message to the send queue with the image data."""
        base64_image = self.encode_image_to_base64(image_path)
        image_message = {
            "role": "user",
            "type": "image",
            "format": "base64.png",
            "content": base64_image
        }
        send_queue.put(image_message)
        # Delete the image file from the file system after sending it
        os.remove(image_path)

    def queue_all_captured_images(self):
        """Queues all captured images to be sent."""
        for image_path in self.captured_images:
            self.add_image_to_send_queue(image_path)
        self.captured_images.clear() # Clear the list after sending


    async def play_audiosegments(self):
        """Plays them sequentially."""
        while True:
            try:
                for audio in self.audiosegments:
                    play(audio)
                    self.audiosegments.remove(audio)
                await asyncio.sleep(0.1)
            except asyncio.exceptions.CancelledError:
                # This happens once at the start?
                pass
            except:
                logger.info(traceback.format_exc())


    def record_audio(self):

        if os.getenv('STT_RUNNER') == "server":
            # STT will happen on the server. we're sending audio.
            send_queue.put({"role": "user", "type": "audio", "format": "audio/wav", "start": True})
            send_queue.put({"role": "user", "type": "audio", "format": "bytes.wav", "start": True})
        elif os.getenv('STT_RUNNER') == "client":
            # STT will happen here, on the client. we're sending text.
            send_queue.put({"role": "user", "type": "message", "start": True})
@@ -92,9 +175,11 @@ class Device:
                send_queue.put({"role": "user", "type": "message", "content": "stop"})
                send_queue.put({"role": "user", "type": "message", "end": True})
            else:
                send_queue.put({"role": "user", "type": "audio", "format": "audio/wav", "content": ""})
                send_queue.put({"role": "user", "type": "audio", "format": "audio/wav", "end": True})
                send_queue.put({"role": "user", "type": "audio", "format": "bytes.wav", "content": ""})
                send_queue.put({"role": "user", "type": "audio", "format": "bytes.wav", "end": True})
        else:
            self.queue_all_captured_images()

            if os.getenv('STT_RUNNER') == "client":
                # Run stt then send text
                text = stt_wav(wav_path)
@@ -105,9 +190,9 @@ class Device:
                with open(wav_path, 'rb') as audio_file:
                    byte_data = audio_file.read(CHUNK)
                    while byte_data:
                        send_queue.put({"role": "user", "type": "audio", "format": "audio/wav", "content": str(byte_data)})
                        send_queue.put(byte_data)
                        byte_data = audio_file.read(CHUNK)
                send_queue.put({"role": "user", "type": "audio", "format": "audio/wav", "end": True})
                send_queue.put({"role": "user", "type": "audio", "format": "bytes.wav", "end": True})

        if os.path.exists(wav_path):
            os.remove(wav_path)
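For reference, a minimal illustrative sketch (not part of the commit) of the audio-streaming sequence the hunk above produces when STT runs on the server: a "bytes.wav" start flag, the raw WAV bytes in chunks, then an end flag. The helper name and the chunk_size default below are assumptions made for the sketch.

def stream_wav_to_server(wav_path, send_queue, chunk_size=1024):
    # Announce an incoming audio message in bytes.wav format
    send_queue.put({"role": "user", "type": "audio", "format": "bytes.wav", "start": True})
    # Stream the raw bytes; these become binary websocket frames in message_sender()
    with open(wav_path, "rb") as audio_file:
        byte_data = audio_file.read(chunk_size)
        while byte_data:
            send_queue.put(byte_data)
            byte_data = audio_file.read(chunk_size)
    # Close the message so the server side can reassemble it
    send_queue.put({"role": "user", "type": "audio", "format": "bytes.wav", "end": True})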
@@ -125,86 +210,82 @@ class Device:
        RECORDING = False

    def on_press(self, key):
        """Detect spacebar press, ESC key press, and Ctrl+C combination."""
        """Detect spacebar press and Ctrl+C combination."""
        self.pressed_keys.add(key) # Add the pressed key to the set

        if keyboard.Key.esc in self.pressed_keys:
            logger.info("Exiting...")
            os._exit(0)
        elif keyboard.Key.space in self.pressed_keys:
        if keyboard.Key.space in self.pressed_keys:
            self.toggle_recording(True)
        elif {keyboard.Key.ctrl, keyboard.KeyCode.from_char('c')} <= self.pressed_keys:
            logger.info("Ctrl+C pressed. Exiting...")
            os._exit(0)

    def on_release(self, key):
        """Detect spacebar release and ESC key press."""
        """Detect spacebar release and 'c' key press for camera, and handle key release."""
        self.pressed_keys.discard(key) # Remove the released key from the key press tracking set

        if key == keyboard.Key.space:
            self.toggle_recording(False)
        elif key == keyboard.Key.esc or (key == keyboard.Key.ctrl and keyboard.Key.c):
            logger.info("Exiting...")
            os._exit(0)
        elif CAMERA_ENABLED and key == keyboard.KeyCode.from_char('c'):
            self.fetch_image_from_camera()


    async def message_sender(self, websocket):
        while True:
            message = await asyncio.get_event_loop().run_in_executor(None, send_queue.get)
            await websocket.send(json.dumps(message))
            if isinstance(message, bytes):
                await websocket.send(message)
            else:
                await websocket.send(json.dumps(message))
            send_queue.task_done()
            await asyncio.sleep(0.01)

    async def websocket_communication(self, WS_URL):
        while True:
            try:
                async with websockets.connect(WS_URL) as websocket:
                    logger.info("Press the spacebar to start/stop recording. Press ESC to exit.")
                    if CAMERA_ENABLED:
                        logger.info("Press the spacebar to start/stop recording. Press 'c' to capture an image from the camera. Press CTRL-C to exit.")
                    else:
                        logger.info("Press the spacebar to start/stop recording. Press CTRL-C to exit.")

                    asyncio.create_task(self.message_sender(websocket))

                    initial_message = {"role": None, "type": None, "format": None, "content": None}
                    message_so_far = initial_message

                    while True:
                        message = await websocket.recv()
                        await asyncio.sleep(0.01)
                        chunk = await websocket.recv()

                        logger.debug(f"Got this message from the server: {type(message)} {message}")
                        logger.debug(f"Got this message from the server: {type(chunk)} {chunk}")

                        if type(message) == str:
                            message = json.loads(message)
                        if type(chunk) == str:
                            chunk = json.loads(chunk)

                        if message.get("end"):
                            logger.debug(f"Complete message from the server: {message_so_far}")
                            logger.info("\n")
                            message_so_far = initial_message
                        message = accumulator.accumulate(chunk)
                        if message == None:
                            # Will be None until we have a full message ready
                            continue

                        if "content" in message:
                            print(message['content'], end="", flush=True)
                            if any(message_so_far[key] != message[key] for key in message_so_far if key != "content"):
                                message_so_far = message
                            else:
                                message_so_far["content"] += message["content"]
                        # At this point, we have our message

                        if message["type"] == "audio" and "content" in message:
                            audio_bytes = bytes(ast.literal_eval(message["content"]))
                        if message["type"] == "audio" and message["format"].startswith("bytes"):

                            # Convert bytes to audio file
                            audio_file = io.BytesIO(audio_bytes)
                            audio = AudioSegment.from_mp3(audio_file)
                            # Format will be bytes.wav or bytes.opus
                            audio_bytes = io.BytesIO(message["content"])
                            audio = AudioSegment.from_file(audio_bytes, codec=message["format"].split(".")[1])

                            # Play the audio
                            play(audio)

                            await asyncio.sleep(1)
                            self.audiosegments.append(audio)

                        # Run the code if that's the client's job
                        if os.getenv('CODE_RUNNER') == "client":
                            if message["type"] == "code" and "end" in message:
                                language = message_so_far["format"]
                                code = message_so_far["content"]
                                language = message["format"]
                                code = message["content"]
                                result = interpreter.computer.run(language, code)
                                send_queue.put(result)


            except:
                # traceback.print_exc()
                logger.debug(traceback.format_exc())
                logger.info(f"Connecting to `{WS_URL}`...")
                await asyncio.sleep(2)
@@ -221,6 +302,7 @@ class Device:
        if os.getenv('CODE_RUNNER') == "client":
            asyncio.create_task(put_kernel_messages_into_queue(send_queue))

        asyncio.create_task(self.play_audiosegments())

        # If Raspberry Pi, add the button listener, otherwise use the spacebar
        if current_platform.startswith("raspberry-pi"):
@@ -1,8 +1,6 @@
DEVICE=$(uname -n)
if [[ "$DEVICE" == "rpi" ]]; then
    cd 01OS
    python -m 01OS.clients.rpi.device
else
    cd 01OS
    python -m 01OS.clients.macos.device
fi
@@ -11,7 +11,7 @@ from fastapi import FastAPI
from fastapi.responses import PlainTextResponse
from starlette.websockets import WebSocket, WebSocketDisconnect
from .stt.stt import stt_bytes
from .tts.tts import tts
from .tts.tts import stream_tts
from pathlib import Path
import asyncio
import urllib.parse
@@ -19,11 +19,13 @@ from .utils.kernel import put_kernel_messages_into_queue
from .i import configure_interpreter
from interpreter import interpreter
import ngrok
from ..utils.accumulator import Accumulator

from .utils.logs import setup_logging
from .utils.logs import logger
setup_logging()

accumulator = Accumulator()

app = FastAPI()
@@ -102,59 +104,94 @@ async def websocket_endpoint(websocket: WebSocket):
    except WebSocketDisconnect:
        pass
    except Exception as e:
        traceback.print_exc()
        logger.debug(traceback.format_exc())
        logger.info(f"Connection lost. Error: {e}")

async def receive_messages(websocket: WebSocket):
    while True:
        data = await websocket.receive_json()
        if data["role"] == "computer":
            from_computer.put(data) # To be handled by interpreter.computer.run
        elif data["role"] == "user":
            await from_user.put(data)
        else:
            raise("Unknown role:", data)
        try:
            try:
                data = await websocket.receive()
            except Exception as e:
                print(str(e))
                return
            if 'text' in data:
                try:
                    data = json.loads(data['text'])
                    if data["role"] == "computer":
                        from_computer.put(data) # To be handled by interpreter.computer.run
                    elif data["role"] == "user":
                        await from_user.put(data)
                    else:
                        raise("Unknown role:", data)
                except json.JSONDecodeError:
                    pass # data is not JSON, leave it as is
            elif 'bytes' in data:
                data = data['bytes'] # binary data
                await from_user.put(data)
        except WebSocketDisconnect as e:
            if e.code == 1000:
                logger.info("Websocket connection closed normally.")
                return
            else:
                raise


async def send_messages(websocket: WebSocket):
    while True:
        message = await to_device.get()
        logger.debug(f"Sending to the device: {type(message)} {message}")
        await websocket.send_json(message)

        try:
            if isinstance(message, dict):
                await websocket.send_json(message)
            elif isinstance(message, bytes):
                await websocket.send_bytes(message)
            else:
                raise TypeError("Message must be a dict or bytes")
        except:
            # Make sure to put the message back in the queue if you failed to send it
            await to_device.put(message)
            raise

async def listener():
    audio_bytes = bytearray()

    while True:
        while True:
            if not from_user.empty():
                message = await from_user.get()
                chunk = await from_user.get()
                break
            elif not from_computer.empty():
                message = from_computer.get()
                chunk = from_computer.get()
                break
            await asyncio.sleep(1)

        if type(message) == str:
            message = json.loads(message)


        # Hold the audio in a buffer. If it's ready (we got end flag, stt it)
        if message["type"] == "audio":
            if "content" in message:
                audio_bytes.extend(bytes(ast.literal_eval(message["content"])))
            if "end" in message:
                content = stt_bytes(audio_bytes, message["format"])
                if content == None: # If it was nothing / silence
                    continue
                audio_bytes = bytearray()
                message = {"role": "user", "type": "message", "content": content}
            else:
                continue

        # Ignore flags, we only needed them for audio ^
        if "content" not in message or message["content"] == None:
        message = accumulator.accumulate(chunk)
        if message == None:
            # Will be None until we have a full message ready
            continue

        # print(str(message)[:1000])

        # At this point, we have our message

        if message["type"] == "audio" and message["format"].startswith("bytes"):

            if not message["content"]: # If it was nothing / silence
                continue

            # Convert bytes to audio file
            # Format will be bytes.wav or bytes.opus
            mime_type = "audio/" + message["format"].split(".")[1]
            text = stt_bytes(message["content"], mime_type)
            message = {"role": "user", "type": "message", "content": text}

        # At this point, we have only text messages

        # Custom stop message will halt us
        if message["content"].lower().strip(".,!") == "stop":
        if message["content"].lower().strip(".,! ") == "stop":
            continue

        # Load, append, and save conversation history
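For context on the receive_messages rewrite above: Starlette's WebSocket.receive() yields an event dict carrying either a 'text' or a 'bytes' key, which is why the handler now branches on those keys instead of calling receive_json(). A hedged sketch of that dispatch, with a synchronous queue API standing in for the real asyncio queues, and the helper name being illustrative:

import json

def route_frame(data, from_user, from_computer):
    # data is the dict returned by websocket.receive(), e.g.
    # {"type": "websocket.receive", "text": "{...}"} or {"type": "websocket.receive", "bytes": b"..."}
    if "text" in data:
        try:
            message = json.loads(data["text"])
        except json.JSONDecodeError:
            return  # not JSON; ignored in this sketch
        if message["role"] == "computer":
            from_computer.put(message)  # handled by interpreter.computer.run
        else:
            from_user.put(message)      # user text / LMC flags
    elif "bytes" in data:
        from_user.put(data["bytes"])    # raw audio bytes for the accumulator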
@@ -175,19 +212,31 @@ async def listener():
            # Yield to the event loop, so you actually send it out
            await asyncio.sleep(0.01)

            # Speak full sentences out loud
            if chunk["role"] == "assistant" and "content" in chunk:
                accumulated_text += chunk["content"]
                sentences = split_into_sentences(accumulated_text)
                if is_full_sentence(sentences[-1]):
                    for sentence in sentences:
                        await stream_or_play_tts(sentence)
                    accumulated_text = ""
                else:
                    for sentence in sentences[:-1]:
                        await stream_or_play_tts(sentence)
                    accumulated_text = sentences[-1]

            if os.getenv('TTS_RUNNER') == "server":
                # Speak full sentences out loud
                if chunk["role"] == "assistant" and "content" in chunk:
                    accumulated_text += chunk["content"]
                    sentences = split_into_sentences(accumulated_text)

                    # If we're going to speak, say we're going to stop sending text.
                    # This should be fixed probably, we should be able to do both in parallel, or only one.
                    if any(is_full_sentence(sentence) for sentence in sentences):
                        await to_device.put({"role": "assistant", "type": "message", "end": True})

                    if is_full_sentence(sentences[-1]):
                        for sentence in sentences:
                            await stream_tts_to_device(sentence)
                        accumulated_text = ""
                    else:
                        for sentence in sentences[:-1]:
                            await stream_tts_to_device(sentence)
                        accumulated_text = sentences[-1]

                    # If we're going to speak, say we're going to stop sending text.
                    # This should be fixed probably, we should be able to do both in parallel, or only one.
                    if any(is_full_sentence(sentence) for sentence in sentences):
                        await to_device.put({"role": "assistant", "type": "message", "start": True})

            # If we have a new message, save our progress and go back to the top
            if not from_user.empty():
@@ -217,19 +266,12 @@ async def listener():
                break
        else:
            with open(conversation_history_path, 'w') as file:
                json.dump(interpreter.messages, file, indent=4)
            json.dump(interpreter.messages, file, indent=4)

async def stream_tts_to_device(sentence):
    for chunk in stream_tts(sentence):
        await to_device.put(chunk)


async def stream_or_play_tts(sentence):

    if os.getenv('TTS_RUNNER') == "server":
        tts(sentence, play_audio=True)
    else:
        await to_device.put({"role": "assistant", "type": "audio", "format": "audio/mp3", "start": True})
        audio_bytes = tts(sentence, play_audio=False)
        await to_device.put({"role": "assistant", "type": "audio", "format": "audio/mp3", "content": str(audio_bytes)})
        await to_device.put({"role": "assistant", "type": "audio", "format": "audio/mp3", "end": True})

async def setup_ngrok(ngrok_auth_token, parsed_url):
    # Set up Ngrok
    logger.info("Setting up Ngrok")
@@ -1,10 +0,0 @@
# Generated by Cargo
# will have compiled files and executables
debug/
target/

# These are backup files generated by rustfmt
**/*.rs.bk

# MSVC Windows builds of rustc generate these, which store debugging information
*.pdb
File diff suppressed because it is too large
@@ -1,14 +0,0 @@
[package]
name = "whisper-rust"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
anyhow = "1.0.79"
clap = { version = "4.4.18", features = ["derive"] }
cpal = "0.15.2"
hound = "3.5.1"
whisper-rs = "0.10.0"
whisper-rs-sys = "0.8.0"
@@ -1,7 +0,0 @@
# Setup

To rebuild the `whisper-rust` executable, do the following:

1. Install [Rust](https://www.rust-lang.org/tools/install), cmake, and Python dependencies `pip install -r requirements.txt`.
2. Go to **core/stt** and run `cargo build --release`.
3. Move the `whisper-rust` executable from target/release to this directory.
@@ -1,34 +0,0 @@
mod transcribe;

use clap::Parser;
use std::path::PathBuf;
use transcribe::transcribe;

#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    /// This is the model for Whisper STT
    #[arg(short, long, value_parser, required = true)]
    model_path: PathBuf,

    /// This is the wav audio file that will be converted from speech to text
    #[arg(short, long, value_parser, required = true)]
    file_path: Option<PathBuf>,
}

fn main() {

    let args = Args::parse();

    let file_path = match args.file_path {
        Some(fp) => fp,
        None => panic!("No file path provided")
    };

    let result = transcribe(&args.model_path, &file_path);

    match result {
        Ok(transcription) => print!("{}", transcription),
        Err(e) => panic!("Error: {}", e),
    }
}
@@ -1,64 +0,0 @@
use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters};
use std::path::PathBuf;


/// Transcribes the given audio file using the whisper-rs library.
///
/// # Arguments
/// * `model_path` - Path to Whisper model file
/// * `file_path` - A string slice that holds the path to the audio file to be transcribed.
///
/// # Returns
///
/// A Result containing a String with the transcription if successful, or an error message if not.
pub fn transcribe(model_path: &PathBuf, file_path: &PathBuf) -> Result<String, String> {

    let model_path_str = model_path.to_str().expect("Not valid model path");
    // Load a context and model
    let ctx = WhisperContext::new_with_params(
        model_path_str, // Replace with the actual path to the model
        WhisperContextParameters::default(),
    )
    .map_err(|_| "failed to load model")?;

    // Create a state
    let mut state = ctx.create_state().map_err(|_| "failed to create state")?;

    // Create a params object
    // Note that currently the only implemented strategy is Greedy, BeamSearch is a WIP
    let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 1 });

    // Edit parameters as needed
    params.set_n_threads(1); // Set the number of threads to use
    params.set_translate(true); // Enable translation
    params.set_language(Some("en")); // Set the language to translate to English
    // Disable printing to stdout
    params.set_print_special(false);
    params.set_print_progress(false);
    params.set_print_realtime(false);
    params.set_print_timestamps(false);

    // Load the audio file
    let audio_data = std::fs::read(file_path)
        .map_err(|e| format!("failed to read audio file: {}", e))?
        .chunks_exact(2)
        .map(|chunk| i16::from_ne_bytes([chunk[0], chunk[1]]))
        .collect::<Vec<i16>>();

    // Convert the audio data to the required format (16KHz mono i16 samples)
    let audio_data = whisper_rs::convert_integer_to_float_audio(&audio_data);

    // Run the model
    state.full(params, &audio_data[..]).map_err(|_| "failed to run model")?;

    // Fetch the results
    let num_segments = state.full_n_segments().map_err(|_| "failed to get number of segments")?;
    let mut transcription = String::new();
    for i in 0..num_segments {
        let segment = state.full_get_segment_text(i).map_err(|_| "failed to get segment")?;
        transcription.push_str(&segment);
        transcription.push('\n');
    }

    Ok(transcription)
}
Binary file not shown.
@@ -12,27 +12,28 @@ import os
import subprocess
import tempfile
from pydub import AudioSegment
from pydub.playback import play
import simpleaudio as sa

client = OpenAI()

def tts(text, play_audio):
chunk_size = 1024

def stream_tts(text):
    """
    A generator that streams tts as LMC messages.
    """
    if os.getenv('ALL_LOCAL') == 'False':
        response = client.audio.speech.create(
            model="tts-1",
            voice="alloy",
            input=text,
            response_format="mp3"
            response_format="opus"
        )
        with tempfile.NamedTemporaryFile(suffix=".mp3") as temp_file:
        with tempfile.NamedTemporaryFile(suffix=".opus") as temp_file:
            response.stream_to_file(temp_file.name)

            if play_audio:
                audio = AudioSegment.from_mp3(temp_file.name)
                play_audiosegment(audio)

            return temp_file.read()
            audio_bytes = temp_file.read()
            file_type = "bytes.opus"

    else:
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
            output_file = temp_file.name
@@ -43,13 +44,19 @@ def tts(text, play_audio):
                '--output_file', output_file
            ], input=text, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

            if play_audio:
                audio = AudioSegment.from_wav(temp_file.name)
                play_audiosegment(audio)
            return temp_file.read()
            audio_bytes = temp_file.read()
            file_type = "bytes.wav"

    # Stream the audio
    yield {"role": "assistant", "type": "audio", "format": file_type, "start": True}
    for i in range(0, len(audio_bytes), chunk_size):
        chunk = audio_bytes[i:i+chunk_size]
        yield chunk
    yield {"role": "assistant", "type": "audio", "format": file_type, "end": True}

def play_audiosegment(audio):
    """
    UNUSED
    the default makes some pops. this fixes that
    """
@@ -73,3 +80,6 @@ def play_audiosegment(audio):
    # Wait for the playback to finish
    play_obj.wait_done()

    # Delete the wav file
    os.remove("output_audio.wav")
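A small usage sketch (not part of the commit) of the new stream_tts() generator above: dict chunks carry the start/end flags, while plain bytes chunks carry the encoded audio, so a consumer can filter on type. The function name below is illustrative.

def collect_tts_audio(text):
    # Drain the generator, keeping only the raw audio bytes between the flags
    audio = b""
    for chunk in stream_tts(text):
        if isinstance(chunk, bytes):
            audio += chunk
    return audio

On the server, stream_tts_to_device() does the equivalent but forwards every chunk, flags included, straight to the device queue.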
@@ -0,0 +1,44 @@
class Accumulator:
    def __init__(self):
        self.template = {"role": None, "type": None, "format": None, "content": None}
        self.message = self.template

    def accumulate(self, chunk):
        #print(str(chunk)[:100])
        if type(chunk) == dict:

            if "format" in chunk and chunk["format"] == "active_line":
                # We don't do anything with these
                return None

            if "start" in chunk:
                self.message = chunk
                self.message.pop("start")
                return None

            if "content" in chunk:

                # Display
                print(chunk['content'], end="", flush=True)

                if any(self.message[key] != chunk[key] for key in self.message if key != "content"):
                    self.message = chunk
                if "content" not in self.message:
                    self.message["content"] = chunk["content"]
                else:
                    self.message["content"] += chunk["content"]
                return None

            if "end" in chunk:
                # We will proceed
                message = self.message
                self.message = self.template
                return message

        if type(chunk) == bytes:
            if "content" not in self.message or type(self.message["content"]) != bytes:
                self.message["content"] = b""
            self.message["content"] += chunk
            return None
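A minimal usage sketch (not part of the commit) of the new Accumulator: accumulate() returns None while a message is still streaming and returns the assembled LMC message once an end chunk arrives. The example chunks below are illustrative.

acc = Accumulator()
chunks = [
    {"role": "assistant", "type": "message", "start": True},
    {"role": "assistant", "type": "message", "content": "Hello "},
    {"role": "assistant", "type": "message", "content": "world."},
    {"role": "assistant", "type": "message", "end": True},
]
for chunk in chunks:
    message = acc.accumulate(chunk)
    if message is not None:
        # -> {'role': 'assistant', 'type': 'message', 'content': 'Hello world.'}
        print(message)

Raw bytes chunks are appended to the pending message's content the same way, which is how the streamed bytes.wav audio is reassembled on both ends.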
@@ -122,7 +122,7 @@ def on_press(key):
        toggle_recording(True)

def on_release(key):
    """Detect spacebar release and ESC key press."""
    """Detect spacebar release and CTRL-C key press."""
    if key == keyboard.Key.space:
        toggle_recording(False)
    elif key == keyboard.Key.esc:
@@ -1890,6 +1890,31 @@ typing-extensions = ">=4.7,<5"
[package.extras]
datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"]

[[package]]
name = "opencv-python"
version = "4.9.0.80"
description = "Wrapper package for OpenCV python bindings."
optional = false
python-versions = ">=3.6"
files = [
    {file = "opencv-python-4.9.0.80.tar.gz", hash = "sha256:1a9f0e6267de3a1a1db0c54213d022c7c8b5b9ca4b580e80bdc58516c922c9e1"},
    {file = "opencv_python-4.9.0.80-cp37-abi3-macosx_10_16_x86_64.whl", hash = "sha256:7e5f7aa4486651a6ebfa8ed4b594b65bd2d2f41beeb4241a3e4b1b85acbbbadb"},
    {file = "opencv_python-4.9.0.80-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:71dfb9555ccccdd77305fc3dcca5897fbf0cf28b297c51ee55e079c065d812a3"},
    {file = "opencv_python-4.9.0.80-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b34a52e9da36dda8c151c6394aed602e4b17fa041df0b9f5b93ae10b0fcca2a"},
    {file = "opencv_python-4.9.0.80-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4088cab82b66a3b37ffc452976b14a3c599269c247895ae9ceb4066d8188a57"},
    {file = "opencv_python-4.9.0.80-cp37-abi3-win32.whl", hash = "sha256:dcf000c36dd1651118a2462257e3a9e76db789a78432e1f303c7bac54f63ef6c"},
    {file = "opencv_python-4.9.0.80-cp37-abi3-win_amd64.whl", hash = "sha256:3f16f08e02b2a2da44259c7cc712e779eff1dd8b55fdb0323e8cab09548086c0"},
]

[package.dependencies]
numpy = [
    {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
    {version = ">=1.23.5", markers = "python_version >= \"3.11\""},
    {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
    {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
    {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
]

[[package]]
name = "packaging"
version = "23.2"
@@ -3514,4 +3539,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
[metadata]
lock-version = "2.0"
python-versions = ">=3.9,<3.12"
content-hash = "12ccff8a2521e7eb88eee82cfd3de409fea8e1658406d6148a42f9347ca7b2a7"
content-hash = "5c8d587b405e97c0dca454078950157106f9aea687cbecce5b7ae7effd2aeece"
@@ -4,7 +4,7 @@ packages = [
    {include = "01OS"},
]
include = [".env.example", "start.py", "start.sh"]
version = "0.0.2"
version = "0.0.3"
description = "The open-source language model computer"
authors = ["Killian <killian@openinterpreter.com>"]
license = "AGPL"
@@ -25,6 +25,7 @@ pydub = "^0.25.1"
ngrok = "^1.0.0"
open-interpreter = "^0.2.0"
simpleaudio = "^1.0.4"
opencv-python = "^4.9.0.80"

[build-system]
requires = ["poetry-core"]
@@ -19,5 +19,8 @@ def main():
    command = [os.path.join(dir_path, 'start.sh')] + args

    # Start start.sh with the command line arguments
    subprocess.run(command, check=True)
    try:
        subprocess.run(command, check=True)
    except KeyboardInterrupt:
        print("Exiting...")
@@ -54,6 +54,16 @@ if [[ "$@" == *"--expose"* ]]; then
    export SERVER_EXPOSE_PUBLICALLY="True"
fi

# Check if "--clear-local" is passed as an argument
if [[ "$@" == *"--clear-local"* ]]; then
    # If "--clear-local" is passed, clear the contents of the folders in script_dir/01OS/server/{tts and stt}/local_service
    echo "Clearing local services..."
    rm -rf "$SCRIPT_DIR/01OS/server/tts/local_service"/*
    rm -rf "$SCRIPT_DIR/01OS/server/stt/local_service"/*
    echo "Exiting after clearing local services..."
    exit 0
fi

### SETUP

if [[ "$ALL_LOCAL" == "True" ]]; then
@@ -124,7 +134,7 @@ fi

start_client() {
    echo "Starting client..."
    bash 01OS/clients/start.sh &
    bash $SCRIPT_DIR/01OS/clients/start.sh &
    CLIENT_PID=$!
    echo "client started as process $CLIENT_PID"
}