Merge remote-tracking branch 'upstream/main' into u/shivenmian/teach
commit 86a399d218
@@ -167,4 +167,4 @@ cython_debug/
# ignore the aifs index files
_.aifs
01OS/output_audio.wav
01OS/output_audio.wav
@@ -36,6 +36,17 @@ STT_RUNNER=client # If server, audio will be sent over websocket.
# Will expose the server publically and display that URL.
SERVER_EXPOSE_PUBLICALLY=False

# Image capture settings
CAMERA_ENABLED=True

# Camera device selection (Typically 0 for built-in, 1 for USB)
CAMERA_DEVICE_INDEX=0

# Camera warmup time
# This is a workaround for some cameras that don't immediately
# return a properly exposed picture when they are first turned on
CAMERA_WARMUP_SECONDS=0.4

# Debug level
# LOG_LEVEL=DEBUG
LOG_LEVEL="INFO"
@@ -1,6 +1,7 @@
from dotenv import load_dotenv
load_dotenv() # take environment variables from .env.

import os
import asyncio
import threading
import os
@@ -21,7 +22,10 @@ import time
import wave
import tempfile
from datetime import datetime
import cv2
import base64
from interpreter import interpreter # Just for code execution. Maybe we should let people do from interpreter.computer import run?
# In the future, I guess kernel watching code should be elsewhere? Somewhere server / client agnostic?
from ..server.utils.kernel import put_kernel_messages_into_queue
from ..server.utils.get_system_info import get_system_info
from ..server.stt.stt import stt_wav
@@ -30,6 +34,11 @@ from ..server.utils.logs import setup_logging
from ..server.utils.logs import logger
setup_logging()


from ..utils.accumulator import Accumulator

accumulator = Accumulator()

# Configuration for Audio Recording
CHUNK = 1024 # Record in chunks of 1024 samples
FORMAT = pyaudio.paInt16 # 16 bits per sample
@@ -38,25 +47,99 @@ RATE = 44100 # Sample rate
RECORDING = False # Flag to control recording state
SPACEBAR_PRESSED = False # Flag to track spacebar press state

# Camera configuration
CAMERA_ENABLED = bool(os.getenv('CAMERA_ENABLED', False))
CAMERA_DEVICE_INDEX = int(os.getenv('CAMERA_DEVICE_INDEX', 0))
CAMERA_WARMUP_SECONDS = float(os.getenv('CAMERA_WARMUP_SECONDS', 0))

# Specify OS
current_platform = get_system_info()

# Initialize PyAudio
p = pyaudio.PyAudio()

import asyncio

send_queue = queue.Queue()

class Device:
    def __init__(self):
        self.pressed_keys = set()
        self.captured_images = []
        self.audiosegments = []

    def fetch_image_from_camera(self, camera_index=CAMERA_DEVICE_INDEX):
        """Captures an image from the specified camera device and saves it to a temporary file. Adds the image to the captured_images list."""
        image_path = None

        cap = cv2.VideoCapture(camera_index)
        ret, frame = cap.read() # Capture a single frame to initialize the camera

        if CAMERA_WARMUP_SECONDS > 0:
            # Allow camera to warm up, then snap a picture again
            # This is a workaround for some cameras that don't return a properly exposed
            # picture immediately when they are first turned on
            time.sleep(CAMERA_WARMUP_SECONDS)
            ret, frame = cap.read()

        if ret:
            temp_dir = tempfile.gettempdir()
            image_path = os.path.join(temp_dir, f"01_photo_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.png")
            self.captured_images.append(image_path)
            cv2.imwrite(image_path, frame)
            logger.info(f"Camera image captured to {image_path}")
            logger.info(f"You now have {len(self.captured_images)} images which will be sent along with your next audio message.")
        else:
            logger.error(f"Error: Couldn't capture an image from camera ({camera_index})")

        cap.release()

        return image_path


    def encode_image_to_base64(self, image_path):
        """Encodes an image file to a base64 string."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def add_image_to_send_queue(self, image_path):
        """Encodes an image and adds an LMC message to the send queue with the image data."""
        base64_image = self.encode_image_to_base64(image_path)
        image_message = {
            "role": "user",
            "type": "image",
            "format": "base64.png",
            "content": base64_image
        }
        send_queue.put(image_message)
        # Delete the image file from the file system after sending it
        os.remove(image_path)

    def queue_all_captured_images(self):
        """Queues all captured images to be sent."""
        for image_path in self.captured_images:
            self.add_image_to_send_queue(image_path)
        self.captured_images.clear() # Clear the list after sending


    async def play_audiosegments(self):
        """Plays them sequentially."""
        while True:
            try:
                for audio in self.audiosegments:
                    play(audio)
                    self.audiosegments.remove(audio)
                await asyncio.sleep(0.1)
            except asyncio.exceptions.CancelledError:
                # This happens once at the start?
                pass
            except:
                logger.info(traceback.format_exc())


    def record_audio(self):

        if os.getenv('STT_RUNNER') == "server":
            # STT will happen on the server. we're sending audio.
            send_queue.put({"role": "user", "type": "audio", "format": "audio/wav", "start": True})
            send_queue.put({"role": "user", "type": "audio", "format": "bytes.wav", "start": True})
        elif os.getenv('STT_RUNNER') == "client":
            # STT will happen here, on the client. we're sending text.
            send_queue.put({"role": "user", "type": "message", "start": True})
@@ -92,9 +175,11 @@ class Device:
                send_queue.put({"role": "user", "type": "message", "content": "stop"})
                send_queue.put({"role": "user", "type": "message", "end": True})
            else:
                send_queue.put({"role": "user", "type": "audio", "format": "audio/wav", "content": ""})
                send_queue.put({"role": "user", "type": "audio", "format": "audio/wav", "end": True})
                send_queue.put({"role": "user", "type": "audio", "format": "bytes.wav", "content": ""})
                send_queue.put({"role": "user", "type": "audio", "format": "bytes.wav", "end": True})
        else:
            self.queue_all_captured_images()

            if os.getenv('STT_RUNNER') == "client":
                # Run stt then send text
                text = stt_wav(wav_path)
@@ -105,9 +190,9 @@ class Device:
                with open(wav_path, 'rb') as audio_file:
                    byte_data = audio_file.read(CHUNK)
                    while byte_data:
                        send_queue.put({"role": "user", "type": "audio", "format": "audio/wav", "content": str(byte_data)})
                        send_queue.put(byte_data)
                        byte_data = audio_file.read(CHUNK)
                send_queue.put({"role": "user", "type": "audio", "format": "audio/wav", "end": True})
                send_queue.put({"role": "user", "type": "audio", "format": "bytes.wav", "end": True})

        if os.path.exists(wav_path):
            os.remove(wav_path)
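For reference, a minimal illustrative sketch (not part of the commit) of the audio-streaming sequence the hunk above produces when STT runs on the server: a "bytes.wav" start flag, the raw WAV bytes in chunks, then an end flag. The helper name and the chunk_size default below are assumptions made for the sketch.

def stream_wav_to_server(wav_path, send_queue, chunk_size=1024):
    # Announce an incoming audio message in bytes.wav format
    send_queue.put({"role": "user", "type": "audio", "format": "bytes.wav", "start": True})
    # Stream the raw bytes; these become binary websocket frames in message_sender()
    with open(wav_path, "rb") as audio_file:
        byte_data = audio_file.read(chunk_size)
        while byte_data:
            send_queue.put(byte_data)
            byte_data = audio_file.read(chunk_size)
    # Close the message so the server side can reassemble it
    send_queue.put({"role": "user", "type": "audio", "format": "bytes.wav", "end": True})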
@@ -125,86 +210,82 @@ class Device:
        RECORDING = False

    def on_press(self, key):
        """Detect spacebar press, ESC key press, and Ctrl+C combination."""
        """Detect spacebar press and Ctrl+C combination."""
        self.pressed_keys.add(key) # Add the pressed key to the set

        if keyboard.Key.esc in self.pressed_keys:
            logger.info("Exiting...")
            os._exit(0)
        elif keyboard.Key.space in self.pressed_keys:
        if keyboard.Key.space in self.pressed_keys:
            self.toggle_recording(True)
        elif {keyboard.Key.ctrl, keyboard.KeyCode.from_char('c')} <= self.pressed_keys:
            logger.info("Ctrl+C pressed. Exiting...")
            os._exit(0)

    def on_release(self, key):
        """Detect spacebar release and ESC key press."""
        """Detect spacebar release and 'c' key press for camera, and handle key release."""
        self.pressed_keys.discard(key) # Remove the released key from the key press tracking set

        if key == keyboard.Key.space:
            self.toggle_recording(False)
        elif key == keyboard.Key.esc or (key == keyboard.Key.ctrl and keyboard.Key.c):
            logger.info("Exiting...")
            os._exit(0)
        elif CAMERA_ENABLED and key == keyboard.KeyCode.from_char('c'):
            self.fetch_image_from_camera()


    async def message_sender(self, websocket):
        while True:
            message = await asyncio.get_event_loop().run_in_executor(None, send_queue.get)
            await websocket.send(json.dumps(message))
            if isinstance(message, bytes):
                await websocket.send(message)
            else:
                await websocket.send(json.dumps(message))
            send_queue.task_done()
            await asyncio.sleep(0.01)

    async def websocket_communication(self, WS_URL):
        while True:
            try:
                async with websockets.connect(WS_URL) as websocket:
                    logger.info("Press the spacebar to start/stop recording. Press ESC to exit.")
                    if CAMERA_ENABLED:
                        logger.info("Press the spacebar to start/stop recording. Press 'c' to capture an image from the camera. Press CTRL-C to exit.")
                    else:
                        logger.info("Press the spacebar to start/stop recording. Press CTRL-C to exit.")

                    asyncio.create_task(self.message_sender(websocket))

                    initial_message = {"role": None, "type": None, "format": None, "content": None}
                    message_so_far = initial_message

                    while True:
                        message = await websocket.recv()
                        await asyncio.sleep(0.01)
                        chunk = await websocket.recv()

                        logger.debug(f"Got this message from the server: {type(message)} {message}")
                        logger.debug(f"Got this message from the server: {type(chunk)} {chunk}")

                        if type(message) == str:
                            message = json.loads(message)
                        if type(chunk) == str:
                            chunk = json.loads(chunk)

                        if message.get("end"):
                            logger.debug(f"Complete message from the server: {message_so_far}")
                            logger.info("\n")
                            message_so_far = initial_message
                        message = accumulator.accumulate(chunk)
                        if message == None:
                            # Will be None until we have a full message ready
                            continue

                        if "content" in message:
                            print(message['content'], end="", flush=True)
                            if any(message_so_far[key] != message[key] for key in message_so_far if key != "content"):
                                message_so_far = message
                            else:
                                message_so_far["content"] += message["content"]
                        # At this point, we have our message

                        if message["type"] == "audio" and "content" in message:
                            audio_bytes = bytes(ast.literal_eval(message["content"]))
                        if message["type"] == "audio" and message["format"].startswith("bytes"):

                            # Convert bytes to audio file
                            audio_file = io.BytesIO(audio_bytes)
                            audio = AudioSegment.from_mp3(audio_file)
                            # Format will be bytes.wav or bytes.opus
                            audio_bytes = io.BytesIO(message["content"])
                            audio = AudioSegment.from_file(audio_bytes, codec=message["format"].split(".")[1])

                            # Play the audio
                            play(audio)

                            await asyncio.sleep(1)
                            self.audiosegments.append(audio)

                        # Run the code if that's the client's job
                        if os.getenv('CODE_RUNNER') == "client":
                            if message["type"] == "code" and "end" in message:
                                language = message_so_far["format"]
                                code = message_so_far["content"]
                                language = message["format"]
                                code = message["content"]
                                result = interpreter.computer.run(language, code)
                                send_queue.put(result)


            except:
                # traceback.print_exc()
                logger.debug(traceback.format_exc())
                logger.info(f"Connecting to `{WS_URL}`...")
                await asyncio.sleep(2)
@@ -221,6 +302,7 @@ class Device:
        if os.getenv('CODE_RUNNER') == "client":
            asyncio.create_task(put_kernel_messages_into_queue(send_queue))

        asyncio.create_task(self.play_audiosegments())

        # If Raspberry Pi, add the button listener, otherwise use the spacebar
        if current_platform.startswith("raspberry-pi"):
@@ -1,8 +1,6 @@
DEVICE=$(uname -n)
if [[ "$DEVICE" == "rpi" ]]; then
    cd 01OS
    python -m 01OS.clients.rpi.device
else
    cd 01OS
    python -m 01OS.clients.macos.device
fi
@@ -11,7 +11,7 @@ from fastapi import FastAPI
from fastapi.responses import PlainTextResponse
from starlette.websockets import WebSocket, WebSocketDisconnect
from .stt.stt import stt_bytes
from .tts.tts import tts
from .tts.tts import stream_tts
from pathlib import Path
import asyncio
import urllib.parse
@@ -19,11 +19,13 @@ from .utils.kernel import put_kernel_messages_into_queue
from .i import configure_interpreter
from interpreter import interpreter
import ngrok
from ..utils.accumulator import Accumulator

from .utils.logs import setup_logging
from .utils.logs import logger
setup_logging()

accumulator = Accumulator()

app = FastAPI()
@@ -102,59 +104,94 @@ async def websocket_endpoint(websocket: WebSocket):
    except WebSocketDisconnect:
        pass
    except Exception as e:
        traceback.print_exc()
        logger.debug(traceback.format_exc())
        logger.info(f"Connection lost. Error: {e}")

async def receive_messages(websocket: WebSocket):
    while True:
        data = await websocket.receive_json()
        if data["role"] == "computer":
            from_computer.put(data) # To be handled by interpreter.computer.run
        elif data["role"] == "user":
            await from_user.put(data)
        else:
            raise("Unknown role:", data)
        try:
            try:
                data = await websocket.receive()
            except Exception as e:
                print(str(e))
                return
            if 'text' in data:
                try:
                    data = json.loads(data['text'])
                    if data["role"] == "computer":
                        from_computer.put(data) # To be handled by interpreter.computer.run
                    elif data["role"] == "user":
                        await from_user.put(data)
                    else:
                        raise("Unknown role:", data)
                except json.JSONDecodeError:
                    pass # data is not JSON, leave it as is
            elif 'bytes' in data:
                data = data['bytes'] # binary data
                await from_user.put(data)
        except WebSocketDisconnect as e:
            if e.code == 1000:
                logger.info("Websocket connection closed normally.")
                return
            else:
                raise


async def send_messages(websocket: WebSocket):
    while True:
        message = await to_device.get()
        logger.debug(f"Sending to the device: {type(message)} {message}")
        await websocket.send_json(message)

        try:
            if isinstance(message, dict):
                await websocket.send_json(message)
            elif isinstance(message, bytes):
                await websocket.send_bytes(message)
            else:
                raise TypeError("Message must be a dict or bytes")
        except:
            # Make sure to put the message back in the queue if you failed to send it
            await to_device.put(message)
            raise

async def listener():
    audio_bytes = bytearray()

    while True:
        while True:
            if not from_user.empty():
                message = await from_user.get()
                chunk = await from_user.get()
                break
            elif not from_computer.empty():
                message = from_computer.get()
                chunk = from_computer.get()
                break
            await asyncio.sleep(1)

        if type(message) == str:
            message = json.loads(message)


        # Hold the audio in a buffer. If it's ready (we got end flag, stt it)
        if message["type"] == "audio":
            if "content" in message:
                audio_bytes.extend(bytes(ast.literal_eval(message["content"])))
            if "end" in message:
                content = stt_bytes(audio_bytes, message["format"])
                if content == None: # If it was nothing / silence
                    continue
                audio_bytes = bytearray()
                message = {"role": "user", "type": "message", "content": content}
            else:
                continue

        # Ignore flags, we only needed them for audio ^
        if "content" not in message or message["content"] == None:
        message = accumulator.accumulate(chunk)
        if message == None:
            # Will be None until we have a full message ready
            continue

        # print(str(message)[:1000])

        # At this point, we have our message

        if message["type"] == "audio" and message["format"].startswith("bytes"):

            if not message["content"]: # If it was nothing / silence
                continue

            # Convert bytes to audio file
            # Format will be bytes.wav or bytes.opus
            mime_type = "audio/" + message["format"].split(".")[1]
            text = stt_bytes(message["content"], mime_type)
            message = {"role": "user", "type": "message", "content": text}

        # At this point, we have only text messages

        # Custom stop message will halt us
        if message["content"].lower().strip(".,!") == "stop":
        if message["content"].lower().strip(".,! ") == "stop":
            continue

        # Load, append, and save conversation history
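For context on the receive_messages rewrite above: Starlette's WebSocket.receive() yields an event dict carrying either a 'text' or a 'bytes' key, which is why the handler now branches on those keys instead of calling receive_json(). A hedged sketch of that dispatch, with a synchronous queue API standing in for the real asyncio queues, and the helper name being illustrative:

import json

def route_frame(data, from_user, from_computer):
    # data is the dict returned by websocket.receive(), e.g.
    # {"type": "websocket.receive", "text": "{...}"} or {"type": "websocket.receive", "bytes": b"..."}
    if "text" in data:
        try:
            message = json.loads(data["text"])
        except json.JSONDecodeError:
            return  # not JSON; ignored in this sketch
        if message["role"] == "computer":
            from_computer.put(message)  # handled by interpreter.computer.run
        else:
            from_user.put(message)      # user text / LMC flags
    elif "bytes" in data:
        from_user.put(data["bytes"])    # raw audio bytes for the accumulator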
@@ -175,19 +212,31 @@ async def listener():
            # Yield to the event loop, so you actually send it out
            await asyncio.sleep(0.01)

            # Speak full sentences out loud
            if chunk["role"] == "assistant" and "content" in chunk:
                accumulated_text += chunk["content"]
                sentences = split_into_sentences(accumulated_text)
                if is_full_sentence(sentences[-1]):
                    for sentence in sentences:
                        await stream_or_play_tts(sentence)
                    accumulated_text = ""
                else:
                    for sentence in sentences[:-1]:
                        await stream_or_play_tts(sentence)
                    accumulated_text = sentences[-1]

            if os.getenv('TTS_RUNNER') == "server":
                # Speak full sentences out loud
                if chunk["role"] == "assistant" and "content" in chunk:
                    accumulated_text += chunk["content"]
                    sentences = split_into_sentences(accumulated_text)

                    # If we're going to speak, say we're going to stop sending text.
                    # This should be fixed probably, we should be able to do both in parallel, or only one.
                    if any(is_full_sentence(sentence) for sentence in sentences):
                        await to_device.put({"role": "assistant", "type": "message", "end": True})

                    if is_full_sentence(sentences[-1]):
                        for sentence in sentences:
                            await stream_tts_to_device(sentence)
                        accumulated_text = ""
                    else:
                        for sentence in sentences[:-1]:
                            await stream_tts_to_device(sentence)
                        accumulated_text = sentences[-1]

                    # If we're going to speak, say we're going to stop sending text.
                    # This should be fixed probably, we should be able to do both in parallel, or only one.
                    if any(is_full_sentence(sentence) for sentence in sentences):
                        await to_device.put({"role": "assistant", "type": "message", "start": True})

            # If we have a new message, save our progress and go back to the top
            if not from_user.empty():
@@ -217,19 +266,12 @@ async def listener():
                break
        else:
            with open(conversation_history_path, 'w') as file:
                json.dump(interpreter.messages, file, indent=4)
            json.dump(interpreter.messages, file, indent=4)

async def stream_tts_to_device(sentence):
    for chunk in stream_tts(sentence):
        await to_device.put(chunk)


async def stream_or_play_tts(sentence):

    if os.getenv('TTS_RUNNER') == "server":
        tts(sentence, play_audio=True)
    else:
        await to_device.put({"role": "assistant", "type": "audio", "format": "audio/mp3", "start": True})
        audio_bytes = tts(sentence, play_audio=False)
        await to_device.put({"role": "assistant", "type": "audio", "format": "audio/mp3", "content": str(audio_bytes)})
        await to_device.put({"role": "assistant", "type": "audio", "format": "audio/mp3", "end": True})

async def setup_ngrok(ngrok_auth_token, parsed_url):
    # Set up Ngrok
    logger.info("Setting up Ngrok")
@@ -1,10 +0,0 @@
# Generated by Cargo
# will have compiled files and executables
debug/
target/

# These are backup files generated by rustfmt
**/*.rs.bk

# MSVC Windows builds of rustc generate these, which store debugging information
*.pdb
File diff suppressed because it is too large
@@ -1,14 +0,0 @@
[package]
name = "whisper-rust"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
anyhow = "1.0.79"
clap = { version = "4.4.18", features = ["derive"] }
cpal = "0.15.2"
hound = "3.5.1"
whisper-rs = "0.10.0"
whisper-rs-sys = "0.8.0"
@@ -1,7 +0,0 @@
# Setup

To rebuild the `whisper-rust` executable, do the following:

1. Install [Rust](https://www.rust-lang.org/tools/install), cmake, and Python dependencies `pip install -r requirements.txt`.
2. Go to **core/stt** and run `cargo build --release`.
3. Move the `whisper-rust` executable from target/release to this directory.
@@ -1,34 +0,0 @@
mod transcribe;

use clap::Parser;
use std::path::PathBuf;
use transcribe::transcribe;

#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    /// This is the model for Whisper STT
    #[arg(short, long, value_parser, required = true)]
    model_path: PathBuf,

    /// This is the wav audio file that will be converted from speech to text
    #[arg(short, long, value_parser, required = true)]
    file_path: Option<PathBuf>,
}

fn main() {

    let args = Args::parse();

    let file_path = match args.file_path {
        Some(fp) => fp,
        None => panic!("No file path provided")
    };

    let result = transcribe(&args.model_path, &file_path);

    match result {
        Ok(transcription) => print!("{}", transcription),
        Err(e) => panic!("Error: {}", e),
    }
}
@@ -1,64 +0,0 @@
use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters};
use std::path::PathBuf;


/// Transcribes the given audio file using the whisper-rs library.
///
/// # Arguments
/// * `model_path` - Path to Whisper model file
/// * `file_path` - A string slice that holds the path to the audio file to be transcribed.
///
/// # Returns
///
/// A Result containing a String with the transcription if successful, or an error message if not.
pub fn transcribe(model_path: &PathBuf, file_path: &PathBuf) -> Result<String, String> {

    let model_path_str = model_path.to_str().expect("Not valid model path");
    // Load a context and model
    let ctx = WhisperContext::new_with_params(
        model_path_str, // Replace with the actual path to the model
        WhisperContextParameters::default(),
    )
    .map_err(|_| "failed to load model")?;

    // Create a state
    let mut state = ctx.create_state().map_err(|_| "failed to create state")?;

    // Create a params object
    // Note that currently the only implemented strategy is Greedy, BeamSearch is a WIP
    let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 1 });

    // Edit parameters as needed
    params.set_n_threads(1); // Set the number of threads to use
    params.set_translate(true); // Enable translation
    params.set_language(Some("en")); // Set the language to translate to English
    // Disable printing to stdout
    params.set_print_special(false);
    params.set_print_progress(false);
    params.set_print_realtime(false);
    params.set_print_timestamps(false);

    // Load the audio file
    let audio_data = std::fs::read(file_path)
        .map_err(|e| format!("failed to read audio file: {}", e))?
        .chunks_exact(2)
        .map(|chunk| i16::from_ne_bytes([chunk[0], chunk[1]]))
        .collect::<Vec<i16>>();

    // Convert the audio data to the required format (16KHz mono i16 samples)
    let audio_data = whisper_rs::convert_integer_to_float_audio(&audio_data);

    // Run the model
    state.full(params, &audio_data[..]).map_err(|_| "failed to run model")?;

    // Fetch the results
    let num_segments = state.full_n_segments().map_err(|_| "failed to get number of segments")?;
    let mut transcription = String::new();
    for i in 0..num_segments {
        let segment = state.full_get_segment_text(i).map_err(|_| "failed to get segment")?;
        transcription.push_str(&segment);
        transcription.push('\n');
    }

    Ok(transcription)
}
Binary file not shown.
@@ -12,27 +12,28 @@ import os
import subprocess
import tempfile
from pydub import AudioSegment
from pydub.playback import play
import simpleaudio as sa

client = OpenAI()

def tts(text, play_audio):
chunk_size = 1024

def stream_tts(text):
    """
    A generator that streams tts as LMC messages.
    """
    if os.getenv('ALL_LOCAL') == 'False':
        response = client.audio.speech.create(
            model="tts-1",
            voice="alloy",
            input=text,
            response_format="mp3"
            response_format="opus"
        )
        with tempfile.NamedTemporaryFile(suffix=".mp3") as temp_file:
        with tempfile.NamedTemporaryFile(suffix=".opus") as temp_file:
            response.stream_to_file(temp_file.name)

            if play_audio:
                audio = AudioSegment.from_mp3(temp_file.name)
                play_audiosegment(audio)

            return temp_file.read()
            audio_bytes = temp_file.read()
            file_type = "bytes.opus"

    else:
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
            output_file = temp_file.name
@@ -43,13 +44,19 @@ def tts(text, play_audio):
                '--output_file', output_file
            ], input=text, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

            if play_audio:
                audio = AudioSegment.from_wav(temp_file.name)
                play_audiosegment(audio)
            return temp_file.read()
            audio_bytes = temp_file.read()
            file_type = "bytes.wav"

    # Stream the audio
    yield {"role": "assistant", "type": "audio", "format": file_type, "start": True}
    for i in range(0, len(audio_bytes), chunk_size):
        chunk = audio_bytes[i:i+chunk_size]
        yield chunk
    yield {"role": "assistant", "type": "audio", "format": file_type, "end": True}

def play_audiosegment(audio):
    """
    UNUSED
    the default makes some pops. this fixes that
    """
@@ -73,3 +80,6 @@ def play_audiosegment(audio):
    # Wait for the playback to finish
    play_obj.wait_done()

    # Delete the wav file
    os.remove("output_audio.wav")
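A small usage sketch (not part of the commit) of the new stream_tts() generator above: dict chunks carry the start/end flags, while plain bytes chunks carry the encoded audio, so a consumer can filter on type. The function name below is illustrative.

def collect_tts_audio(text):
    # Drain the generator, keeping only the raw audio bytes between the flags
    audio = b""
    for chunk in stream_tts(text):
        if isinstance(chunk, bytes):
            audio += chunk
    return audio

On the server, stream_tts_to_device() does the equivalent but forwards every chunk, flags included, straight to the device queue.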
@@ -0,0 +1,44 @@
class Accumulator:
    def __init__(self):
        self.template = {"role": None, "type": None, "format": None, "content": None}
        self.message = self.template

    def accumulate(self, chunk):
        #print(str(chunk)[:100])
        if type(chunk) == dict:

            if "format" in chunk and chunk["format"] == "active_line":
                # We don't do anything with these
                return None

            if "start" in chunk:
                self.message = chunk
                self.message.pop("start")
                return None

            if "content" in chunk:

                # Display
                print(chunk['content'], end="", flush=True)

                if any(self.message[key] != chunk[key] for key in self.message if key != "content"):
                    self.message = chunk
                if "content" not in self.message:
                    self.message["content"] = chunk["content"]
                else:
                    self.message["content"] += chunk["content"]
                return None

            if "end" in chunk:
                # We will proceed
                message = self.message
                self.message = self.template
                return message

        if type(chunk) == bytes:
            if "content" not in self.message or type(self.message["content"]) != bytes:
                self.message["content"] = b""
            self.message["content"] += chunk
            return None
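A minimal usage sketch (not part of the commit) of the new Accumulator: accumulate() returns None while a message is still streaming and returns the assembled LMC message once an end chunk arrives. The example chunks below are illustrative.

acc = Accumulator()
chunks = [
    {"role": "assistant", "type": "message", "start": True},
    {"role": "assistant", "type": "message", "content": "Hello "},
    {"role": "assistant", "type": "message", "content": "world."},
    {"role": "assistant", "type": "message", "end": True},
]
for chunk in chunks:
    message = acc.accumulate(chunk)
    if message is not None:
        # -> {'role': 'assistant', 'type': 'message', 'content': 'Hello world.'}
        print(message)

Raw bytes chunks are appended to the pending message's content the same way, which is how the streamed bytes.wav audio is reassembled on both ends.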
@@ -122,7 +122,7 @@ def on_press(key):
        toggle_recording(True)

def on_release(key):
    """Detect spacebar release and ESC key press."""
    """Detect spacebar release and CTRL-C key press."""
    if key == keyboard.Key.space:
        toggle_recording(False)
    elif key == keyboard.Key.esc:
@@ -1890,6 +1890,31 @@ typing-extensions = ">=4.7,<5"
[package.extras]
datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"]

[[package]]
name = "opencv-python"
version = "4.9.0.80"
description = "Wrapper package for OpenCV python bindings."
optional = false
python-versions = ">=3.6"
files = [
    {file = "opencv-python-4.9.0.80.tar.gz", hash = "sha256:1a9f0e6267de3a1a1db0c54213d022c7c8b5b9ca4b580e80bdc58516c922c9e1"},
    {file = "opencv_python-4.9.0.80-cp37-abi3-macosx_10_16_x86_64.whl", hash = "sha256:7e5f7aa4486651a6ebfa8ed4b594b65bd2d2f41beeb4241a3e4b1b85acbbbadb"},
    {file = "opencv_python-4.9.0.80-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:71dfb9555ccccdd77305fc3dcca5897fbf0cf28b297c51ee55e079c065d812a3"},
    {file = "opencv_python-4.9.0.80-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b34a52e9da36dda8c151c6394aed602e4b17fa041df0b9f5b93ae10b0fcca2a"},
    {file = "opencv_python-4.9.0.80-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4088cab82b66a3b37ffc452976b14a3c599269c247895ae9ceb4066d8188a57"},
    {file = "opencv_python-4.9.0.80-cp37-abi3-win32.whl", hash = "sha256:dcf000c36dd1651118a2462257e3a9e76db789a78432e1f303c7bac54f63ef6c"},
    {file = "opencv_python-4.9.0.80-cp37-abi3-win_amd64.whl", hash = "sha256:3f16f08e02b2a2da44259c7cc712e779eff1dd8b55fdb0323e8cab09548086c0"},
]

[package.dependencies]
numpy = [
    {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
    {version = ">=1.23.5", markers = "python_version >= \"3.11\""},
    {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
    {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
    {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
]

[[package]]
name = "packaging"
version = "23.2"
@@ -3514,4 +3539,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
[metadata]
lock-version = "2.0"
python-versions = ">=3.9,<3.12"
content-hash = "12ccff8a2521e7eb88eee82cfd3de409fea8e1658406d6148a42f9347ca7b2a7"
content-hash = "5c8d587b405e97c0dca454078950157106f9aea687cbecce5b7ae7effd2aeece"
@@ -4,7 +4,7 @@ packages = [
    {include = "01OS"},
]
include = [".env.example", "start.py", "start.sh"]
version = "0.0.2"
version = "0.0.3"
description = "The open-source language model computer"
authors = ["Killian <killian@openinterpreter.com>"]
license = "AGPL"
@@ -25,6 +25,7 @@ pydub = "^0.25.1"
ngrok = "^1.0.0"
open-interpreter = "^0.2.0"
simpleaudio = "^1.0.4"
opencv-python = "^4.9.0.80"

[build-system]
requires = ["poetry-core"]
@@ -19,5 +19,8 @@ def main():
    command = [os.path.join(dir_path, 'start.sh')] + args

    # Start start.sh with the command line arguments
    subprocess.run(command, check=True)
    try:
        subprocess.run(command, check=True)
    except KeyboardInterrupt:
        print("Exiting...")
@@ -54,6 +54,16 @@ if [[ "$@" == *"--expose"* ]]; then
    export SERVER_EXPOSE_PUBLICALLY="True"
fi

# Check if "--clear-local" is passed as an argument
if [[ "$@" == *"--clear-local"* ]]; then
    # If "--clear-local" is passed, clear the contents of the folders in script_dir/01OS/server/{tts and stt}/local_service
    echo "Clearing local services..."
    rm -rf "$SCRIPT_DIR/01OS/server/tts/local_service"/*
    rm -rf "$SCRIPT_DIR/01OS/server/stt/local_service"/*
    echo "Exiting after clearing local services..."
    exit 0
fi

### SETUP

if [[ "$ALL_LOCAL" == "True" ]]; then
@@ -124,7 +134,7 @@ fi

start_client() {
    echo "Starting client..."
    bash 01OS/clients/start.sh &
    bash $SCRIPT_DIR/01OS/clients/start.sh &
    CLIENT_PID=$!
    echo "client started as process $CLIENT_PID"
}