commit 387bc00a20
@@ -70,6 +70,7 @@ void hexdump(const void *mem, uint32_t len, uint8_t cols = 16) {
 }
 
 void InitI2SSpeakerOrMic(int mode) {
+  Serial.printf("InitI2sSpeakerOrMic %d\n", mode);
   esp_err_t err = ESP_OK;
 
   i2s_driver_uninstall(SPEAKER_I2S_NUMBER);
@@ -136,17 +137,19 @@ void webSocketEvent(WStype_t type, uint8_t * payload, size_t length) {
       break;
     case WStype_TEXT:
       Serial.printf("[WSc] get text: %s\n", payload);
-      if ((char)payload[0] == 's'){
-        Serial.println("start");
-        speaker_offset = 0;
-        InitI2SSpeakerOrMic(MODE_SPK);
+      {
+        std::string str(payload, payload + length);
+        bool isAudio = str.find("\"audio\"") != std::string::npos;
+        if (isAudio && str.find("\"start\"") != std::string::npos) {
+          Serial.println("start playback");
+          speaker_offset = 0;
+          InitI2SSpeakerOrMic(MODE_SPK);
+        } else if (isAudio && str.find("\"end\"") != std::string::npos) {
+          Serial.println("end playback");
+          // speaker_play(speakerdata0, speaker_offset);
+          // speaker_offset = 0;
+        }
       }
-      if ((char)payload[0] == 'e'){
-        Serial.println("end");
-        // speaker_play(speakerdata0, speaker_offset);
-        // speaker_offset = 0;
-      }
 
       // send message to server
       // webSocket.sendTXT("message here");
       break;
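The handler above no longer keys playback off single-character markers; it substring-matches JSON-style flags ("audio" plus "start"/"end") in the incoming text frame. A minimal Python sketch of those control messages and the equivalent matching logic, built only from what the hunks show (the helper names are illustrative, not from the repo):

import json

# Control frames sent around the raw audio stream (same fields as the
# sendTXT() calls in the loop() hunk further down).
start_flag = {"role": "user", "type": "audio", "format": "bytes.raw", "start": True}
end_flag = {"role": "user", "type": "audio", "format": "bytes.raw", "end": True}

# The device does not parse JSON; it only checks for substrings, roughly:
def is_audio_start(frame: str) -> bool:
    return '"audio"' in frame and '"start"' in frame

def is_audio_end(frame: str) -> bool:
    return '"audio"' in frame and '"end"' in frame

print(is_audio_start(json.dumps(start_flag)))  # True
print(is_audio_end(json.dumps(end_flag)))      # True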
@@ -180,12 +183,12 @@ void websocket_setup() {
     Serial.println("connecting to WiFi");
   }
   Serial.println("connected to WiFi");
-  webSocket.begin(COMPUTER_IP, 9001, "/");
+  webSocket.begin(COMPUTER_IP, 8000, "/");
   webSocket.onEvent(webSocketEvent);
   // webSocket.setAuthorization("user", "Password");
   webSocket.setReconnectInterval(5000);
 
 }
 
 void setup() {
   M5.begin(true, false, true);
   M5.dis.drawpix(0, CRGB(128, 128, 0));
@@ -208,17 +211,19 @@ void loop() {
   button.loop();
   if (button.justPressed()) {
     Serial.println("Recording...");
-    webSocket.sendTXT("s");
+    webSocket.sendTXT("{\"role\": \"user\", \"type\": \"audio\", \"format\": \"bytes.raw\", \"start\": true}");
     InitI2SSpeakerOrMic(MODE_MIC);
     recording = true;
+    data_offset = 0;
+    Serial.println("Recording ready.");
   } else if (button.justReleased()) {
     Serial.println("Stopped recording.");
-    webSocket.sendTXT("e");
+    webSocket.sendTXT("{\"role\": \"user\", \"type\": \"audio\", \"format\": \"bytes.raw\", \"end\": true}");
     flush_microphone();
     recording = false;
-  }
-
-  if (recording) {
+    data_offset = 0;
+  } else if (recording) {
+    Serial.printf("Reading chunk at %d...\n", data_offset);
     size_t bytes_read;
     i2s_read(
       SPEAKER_I2S_NUMBER,
@@ -226,13 +231,13 @@ void loop() {
       DATA_SIZE, &bytes_read, (100 / portTICK_RATE_MS)
     );
     data_offset += bytes_read;
+    Serial.printf("Read %d bytes in chunk.\n", bytes_read);
 
-    if (data_offset > 1024*10) {
+    if (data_offset > 1024*9) {
       flush_microphone();
     }
   }
 
   M5.update();
   webSocket.loop();
 
 }
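Taken together, the loop() hunks stream the microphone as raw PCM: chunks are read with i2s_read(), accumulated via data_offset, and flushed roughly every 9 KB, with the JSON start/end flags bracketing the stream. The server-side ffmpeg calls later in this commit treat that stream as signed 16-bit little-endian, 16 kHz, mono; a small stand-alone sketch (file names illustrative) that wraps such bytes in a WAV container with only the standard library:

import wave

def raw_to_wav(raw_bytes: bytes, wav_path: str) -> None:
    # Assumes the same raw format the ffmpeg calls below expect: s16le, 16 kHz, mono.
    with wave.open(wav_path, "wb") as wav_file:
        wav_file.setnchannels(1)      # mono
        wav_file.setsampwidth(2)      # 16-bit samples
        wav_file.setframerate(16000)  # 16 kHz
        wav_file.writeframes(raw_bytes)

# Example: raw_to_wav(open("mic_dump.dat", "rb").read(), "mic_dump.wav")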
@@ -43,6 +43,18 @@ def configure_interpreter(interpreter: OpenInterpreter):
 
 Remember: You can run Python code. Be very concise. Ensure that you actually run code every time! THIS IS IMPORTANT. You NEED to write code. **Help the user by being very concise in your answers.** Do not break down tasks excessively, just into simple, few minute steps. Don't assume the user lives their life in a certain way— pick very general tasks if you're breaking a task down.
 
+ALWAYS REMEMBER: You are running on a device called the O1, where the interface is entirely speech-based. Keep your responses succinct in light of this!
+
+IF YOU NEED TO THINK ABOUT A PROBLEM: (such as "Here's the plan:"), WRITE IT IN THE COMMENTS of the code block!
+For example:
+> User: What is 432/7?
+> Assistant: Let me use Python to calculate that.
+> Assistant Python function call:
+> # Here's the plan:
+> # 1. Divide the numbers
+> # 2. Round it to 3 digits.
+> print(round(432/7, 3))
+> Assistant: 432 / 7 is 61.714.
 
 Use the following functions (assume they're imported) to complete your goals whenever possible:
 {{
 import sys
@@ -70,7 +82,9 @@ print(output)
 
     """.strip()
 
-    interpreter.custom_instructions = system_message
+    # interpreter.custom_instructions = system_message
+    interpreter.system_message = system_message
+    interpreter.llm.supports_functions = True
 
     ### LLM SETTINGS
 
@@ -201,7 +201,7 @@ async def listener():
 
         accumulated_text = ""
 
-        for chunk in interpreter.chat(messages, stream=True, display=False):
+        for chunk in interpreter.chat(messages, stream=True, display=True):
 
             logger.debug("Got chunk:", chunk)
 
@@ -212,7 +212,7 @@ async def listener():
 
             if os.getenv('TTS_RUNNER') == "server":
                 # Speak full sentences out loud
-                if chunk["role"] == "assistant" and "content" in chunk:
+                if chunk["role"] == "assistant" and "content" in chunk and chunk["type"] == "message":
                     accumulated_text += chunk["content"]
                     sentences = split_into_sentences(accumulated_text)
 
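The added chunk["type"] == "message" guard keeps code and console output out of the TTS path, so only assistant prose is accumulated into sentences. A sketch with hypothetical chunks showing the effect:

chunks = [
    {"role": "assistant", "type": "message", "content": "Sure, "},
    {"role": "assistant", "type": "code", "format": "python", "content": "print(1 + 1)"},
    {"role": "assistant", "type": "message", "content": "the answer is 2."},
]

spoken = "".join(
    c["content"]
    for c in chunks
    if c["role"] == "assistant" and "content" in c and c["type"] == "message"
)
print(spoken)  # "Sure, the answer is 2." -- the code chunk is never spoken aloud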
@@ -241,7 +241,7 @@ async def listener():
                 # Check if it's just an end flag. We ignore those.
                 temp_message = await from_user.get()
 
-                if temp_message == {'role': 'user', 'type': 'message', 'end': True}:
+                if type(temp_message) is dict and temp_message.get("role") == "user" and temp_message.get("end"):
                     # Yup. False alarm.
                     continue
                 else:
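The exact-dict comparison only matched a message with exactly those three keys; the rewritten check treats any user message carrying an end flag as the ignorable end marker. A small illustration with a hypothetical message:

temp_message = {"role": "user", "type": "message", "content": "", "end": True}

# Old check: an extra key such as "content" makes the equality fail.
print(temp_message == {'role': 'user', 'type': 'message', 'end': True})  # False

# New check: any dict-shaped user message with an end flag is treated as the marker.
print(type(temp_message) is dict
      and temp_message.get("role") == "user"
      and bool(temp_message.get("end")))  # True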
@@ -251,8 +251,9 @@ async def listener():
                 with open(conversation_history_path, 'w') as file:
                     json.dump(interpreter.messages, file, indent=4)
 
-                logger.info("New user message recieved. Breaking.")
-                break
+                # TODO: is triggering seemingly randomly
+                #logger.info("New user message recieved. Breaking.")
+                #break
 
             # Also check if there's any new computer messages
             if not from_computer.empty():
@@ -25,6 +25,8 @@ def convert_mime_type_to_format(mime_type: str) -> str:
         return "wav"
     if mime_type == "audio/webm":
         return "webm"
+    if mime_type == "audio/raw":
+        return "dat"
 
     return mime_type
 
@@ -43,7 +45,16 @@ def export_audio_to_wav_ffmpeg(audio: bytearray, mime_type: str) -> str:
 
         # Export to wav
         output_path = os.path.join(temp_dir, f"output_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav")
-        ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k').run()
+        print(mime_type, input_path, output_path)
+        if mime_type == "audio/raw":
+            ffmpeg.input(
+                input_path,
+                f='s16le',
+                ar='16000',
+                ac=1,
+            ).output(output_path).run()
+        else:
+            ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k').run()
 
         try:
             yield output_path
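The new branch has to tell ffmpeg the input format explicitly, since headerless raw PCM cannot be probed. Roughly the command line the ffmpeg-python expression composes (paths are placeholders, not the real temp-file names):

import ffmpeg  # ffmpeg-python

args = (
    ffmpeg
    .input("input.dat", f="s16le", ar="16000", ac=1)
    .output("output.wav")
    .compile()
)
print(" ".join(args))
# Something like: ffmpeg -f s16le -ar 16000 -ac 1 -i input.dat output.wav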
@@ -6,6 +6,7 @@ from pydub import AudioSegment
 from dotenv import load_dotenv
 load_dotenv() # take environment variables from .env.
 
+import ffmpeg
 import tempfile
 from openai import OpenAI
 import os
@@ -28,11 +29,17 @@ def stream_tts(text):
             input=text,
             response_format="opus"
         )
-        with tempfile.NamedTemporaryFile(suffix=".opus") as temp_file:
+        with tempfile.NamedTemporaryFile(suffix=".opus", delete=False) as temp_file:
             response.stream_to_file(temp_file.name)
 
-        audio_bytes = temp_file.read()
-        file_type = "bytes.opus"
+        # TODO: hack to format audio correctly for device
+        outfile = tempfile.gettempdir() + "/" + "raw.dat"
+        ffmpeg.input(temp_file.name).output(outfile, f="s16le", ar="16000", ac="1").run()
+        with open(outfile, "rb") as f:
+            audio_bytes = f.read()
+        file_type = "bytes.raw"
+        print(outfile, len(audio_bytes))
+        os.remove(outfile)
 
     else:
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
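With these changes, stream_tts() writes the OpenAI opus output to a temp file, converts it with ffmpeg to the raw format the device plays (s16le, 16 kHz, mono), and ships the bytes as "bytes.raw". Since the module already imports pydub's AudioSegment, an equivalent conversion could be sketched without the second temp file (hypothetical helper, not part of the commit):

from pydub import AudioSegment

def opus_file_to_raw_bytes(opus_path: str) -> bytes:
    # Decode with pydub (ffmpeg under the hood), then downmix/resample to the
    # raw format the device expects: signed 16-bit little-endian, 16 kHz, mono.
    segment = AudioSegment.from_file(opus_path)
    segment = segment.set_frame_rate(16000).set_channels(1).set_sample_width(2)
    return segment.raw_data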