add mobile flag

2024-05-02 12:48:32 -04:00 · 2024-05-02 12:48:32 -04:00 · 3dea99470a
parent 2e0ab15e5b
commit 3dea99470a
5 changed files with 79 additions and 73 deletions
--- a/software/source/clients/ios/react-native/src/screens/Main.tsx
+++ b/software/source/clients/ios/react-native/src/screens/Main.tsx
@ -182,7 +182,7 @@ const Main: React.FC<MainProps> = ({ route }) => {
        try {
          const message = JSON.parse(e.data);

-          if (message.content && typeof message.content === "string") {
+          if (message.content && message.type === "audio") {
            console.log("✅✅✅✅✅✅✅✅✅✅✅✅✅✅✅✅✅✅✅ Audio message");

            const buffer = message.content;
--- a/software/source/server/server.py
+++ b/software/source/server/server.py
@ -39,6 +39,8 @@ print("")

 setup_logging()

+accumulator_global = Accumulator()
+
 app = FastAPI()

 app_dir = user_data_dir("01")
@ -196,26 +198,11 @@ async def send_messages(websocket: WebSocket):

        try:
            if isinstance(message, dict):
-                # print(f"Sending to the device: {type(message)} {str(message)[:100]}")
+                print(f"Sending to the device: {type(message)} {str(message)[:100]}")
                await websocket.send_json(message)
            elif isinstance(message, bytes):
-                message = base64.b64encode(message)
-                # print(f"Sending to the device: {type(message)} {str(message)[:100]}")
+                print(f"Sending to the device: {type(message)} {str(message)[:100]}")
                await websocket.send_bytes(message)
-
-                """
-                str_bytes = str(message)
-                json_bytes = {
-                    "role": "assistant",
-                    "type": "audio",
-                    "format": "message",
-                    "content": str_bytes,
-                }
-                print(
-                    f"Sending to the device: {type(json_bytes)} {str(json_bytes)[:100]}"
-                )
-                await websocket.send_json(json_bytes)
-                """
            else:
                raise TypeError("Message must be a dict or bytes")
        except:
@ -224,10 +211,11 @@ async def send_messages(websocket: WebSocket):
            raise


-async def listener():
+async def listener(mobile: bool):
    while True:
        try:
-            accumulator = Accumulator()
+            if mobile:
+                accumulator_mobile = Accumulator()

            while True:
                if not from_user.empty():
@ -238,7 +226,11 @@ async def listener():
                    break
                await asyncio.sleep(1)

-            message = accumulator.accumulate(chunk)
+            if mobile:
+                message = accumulator_mobile.accumulate(chunk, mobile)
+            else:
+                message = accumulator_global.accumulate(chunk, mobile)
+
            if message == None:
                # Will be None until we have a full message ready
                continue
@ -305,8 +297,9 @@ async def listener():
                logger.debug("Got chunk:", chunk)

                # Send it to the user
-                # await to_device.put(chunk)
-                # Yield to the event loop, so you actually send it out
+                await to_device.put(chunk)
+
+                # Yield to the event loop, so you actually send it out
                await asyncio.sleep(0.01)

                if os.getenv("TTS_RUNNER") == "server":
@ -328,11 +321,11 @@ async def listener():

                        if is_full_sentence(sentences[-1]):
                            for sentence in sentences:
-                                await stream_tts_to_device(sentence)
+                                await stream_tts_to_device(sentence, mobile)
                            accumulated_text = ""
                        else:
                            for sentence in sentences[:-1]:
-                                await stream_tts_to_device(sentence)
+                                await stream_tts_to_device(sentence, mobile)
                            accumulated_text = sentences[-1]

                        # If we're going to speak, say we're going to stop sending text.
@ -376,7 +369,7 @@ async def listener():
            traceback.print_exc()


-async def stream_tts_to_device(sentence):
+async def stream_tts_to_device(sentence, mobile: bool):
    force_task_completion_responses = [
        "the task is done",
        "the task is impossible",
@ -385,49 +378,44 @@ async def stream_tts_to_device(sentence):
    if sentence.lower().strip().strip(".!?").strip() in force_task_completion_responses:
        return

-    for chunk in stream_tts(sentence):
+    for chunk in stream_tts(sentence, mobile):
        await to_device.put(chunk)


-def stream_tts(sentence):
-    audio_file = tts(sentence)
+def stream_tts(sentence, mobile: bool):
+    audio_file = tts(sentence, mobile)

-    with open(audio_file, "rb") as f:
-        audio_bytes = f.read()
-    desktop_path = os.path.join(os.path.expanduser("~"), "Desktop")
-    desktop_audio_file = os.path.join(
-        desktop_path, f"{datetime.datetime.now()}" + os.path.basename(audio_file)
-    )
-    shutil.copy(audio_file, desktop_audio_file)
-    print(f"Audio file saved to Desktop: {desktop_audio_file}")
-    # storage_client = storage.Client(project="react-native-421323")
-    # bucket = storage_client.bucket("01-audio")
-    # blob = bucket.blob(f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav")
-    # generation_match_precondition = 0
-
-    # blob.upload_from_filename(
-    #     audio_file, if_generation_match=generation_match_precondition
-    # )
-    # print(
-    #     f"Audio file {audio_file} uploaded to {datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav"
-    # )
-
-    file_type = "audio/wav"
    # Read the entire WAV file
    with open(audio_file, "rb") as f:
        audio_bytes = f.read()

-    os.remove(audio_file)
+    if mobile:
+        file_type = "audio/wav"

-    # Stream the audio as a single message
-    yield {
-        "role": "assistant",
-        "type": "audio",
-        "format": file_type,
-        "content": base64.b64encode(audio_bytes).decode("utf-8"),
-        "start": True,
-        "end": True,
-    }
+        os.remove(audio_file)
+
+        # stream the audio as a single sentence
+        yield {
+            "role": "assistant",
+            "type": "audio",
+            "format": file_type,
+            "content": base64.b64encode(audio_bytes).decode("utf-8"),
+            "start": True,
+            "end": True,
+        }
+
+    else:
+        # stream the audio in fixed-size chunks
+        os.remove(audio_file)
+
+        file_type = "bytes.raw"
+        chunk_size = 1024
+
+        yield {"role": "assistant", "type": "audio", "format": file_type, "start": True}
+        for i in range(0, len(audio_bytes), chunk_size):
+            chunk = audio_bytes[i : i + chunk_size]
+            yield chunk
+        yield {"role": "assistant", "type": "audio", "format": file_type, "end": True}


 from uvicorn import Config, Server
@ -464,6 +452,7 @@ async def main(
    temperature,
    tts_service,
    stt_service,
+    mobile,
 ):
    global HOST
    global PORT
@ -515,7 +504,7 @@ async def main(
    interpreter.llm.completions = llm

    # Start listening
-    asyncio.create_task(listener())
+    asyncio.create_task(listener(mobile))

    # Start watching the kernel if it's your job to do that
    if True:  # in the future, code can run on device. for now, just server.
--- a/software/source/server/services/tts/openai/tts.py
+++ b/software/source/server/services/tts/openai/tts.py
@ -25,7 +25,7 @@ class Tts:
    def __init__(self, config):
        pass

-    def tts(self, text):
+    def tts(self, text, mobile):
        response = client.audio.speech.create(
            model="tts-1",
            voice=os.getenv("OPENAI_VOICE_NAME", "alloy"),
@ -36,9 +36,15 @@ class Tts:
            response.stream_to_file(temp_file.name)

            # TODO: hack to format audio correctly for device
-            outfile = tempfile.gettempdir() + "/" + "output.wav"
-            ffmpeg.input(temp_file.name).output(
-                outfile, f="wav", ar="16000", ac="1", loglevel="panic"
-            ).run()
+            if mobile:
+                outfile = tempfile.gettempdir() + "/" + "output.wav"
+                ffmpeg.input(temp_file.name).output(
+                    outfile, f="wav", ar="16000", ac="1", loglevel="panic"
+                ).run()
+            else:
+                outfile = tempfile.gettempdir() + "/" + "raw.dat"
+                ffmpeg.input(temp_file.name).output(
+                    outfile, f="s16le", ar="16000", ac="1", loglevel="panic"
+                ).run()

            return outfile
--- a/software/source/utils/accumulator.py
+++ b/software/source/utils/accumulator.py
@ -3,7 +3,7 @@ class Accumulator:
        self.template = {"role": None, "type": None, "format": None, "content": None}
        self.message = self.template

-    def accumulate(self, chunk):
+    def accumulate(self, chunk, mobile):
        # print(str(chunk)[:100])
        if type(chunk) == dict:
            if "format" in chunk and chunk["format"] == "active_line":
@ -44,6 +44,10 @@ class Accumulator:
            if "content" not in self.message or type(self.message["content"]) != bytes:
                self.message["content"] = b""
            self.message["content"] += chunk
-            self.message["type"] = "audio"
-            self.message["format"] = "bytes.wav"
-            return self.message
+
+            if mobile:
+                self.message["type"] = "audio"
+                self.message["format"] = "bytes.wav"
+                return self.message
+            else:
+                return None
--- a/software/start.py
+++ b/software/start.py
@ -72,13 +72,16 @@ def run(
        False, "--local", help="Use recommended local services for LLM, STT, and TTS"
    ),
    qr: bool = typer.Option(False, "--qr", help="Print the QR code for the server URL"),
+    mobile: bool = typer.Option(
+        False, "--mobile", help="Toggle server to support mobile app"
+    ),
 ):
    _run(
-        server=server,
+        server=server or mobile,
        server_host=server_host,
        server_port=server_port,
        tunnel_service=tunnel_service,
-        expose=expose,
+        expose=expose or mobile,
        client=client,
        server_url=server_url,
        client_type=client_type,
@ -92,7 +95,8 @@ def run(
        tts_service=tts_service,
        stt_service=stt_service,
        local=local,
-        qr=qr,
+        qr=qr or mobile,
+        mobile=mobile,
    )


@ -116,6 +120,7 @@ def _run(
    stt_service: str = "openai",
    local: bool = False,
    qr: bool = False,
+    mobile: bool = False,
 ):
    if local:
        tts_service = "piper"
@ -136,6 +141,7 @@ def _run(
    signal.signal(signal.SIGINT, handle_exit)

    if server:
+        print(f"Starting server with mobile = {mobile}")
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        server_thread = threading.Thread(
@ -153,6 +159,7 @@ def _run(
                    temperature,
                    tts_service,
                    stt_service,
+                    mobile,
                ),
            ),
        )