Merge pull request #19 from shivenmian/u/shivenmian/local
feat: added local TTS using Piper
This commit is contained in:
commit
9384f68c66
|
@ -1,5 +1,5 @@
|
||||||
ggml-*.bin
|
ggml-*.bin
|
||||||
|
OS/01/local_tts/*
|
||||||
# Byte-compiled / optimized / DLL files
|
# Byte-compiled / optimized / DLL files
|
||||||
__pycache__/
|
__pycache__/
|
||||||
*.py[cod]
|
*.py[cod]
|
||||||
|
|
|
@ -6,6 +6,11 @@ export ALL_LOCAL=False
|
||||||
# export WHISPER_MODEL_PATH=...
|
# export WHISPER_MODEL_PATH=...
|
||||||
# export OPENAI_API_KEY=sk-...
|
# export OPENAI_API_KEY=sk-...
|
||||||
|
|
||||||
|
# For TTS, we use the en_US-lessac-medium voice model by default
|
||||||
|
# Please change the voice URL and voice name if you wish to use another voice
|
||||||
|
export PIPER_VOICE_URL="https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/"
|
||||||
|
export PIPER_VOICE_NAME="en_US-lessac-medium.onnx"
|
||||||
|
|
||||||
# If SERVER_START, this is where we'll serve the server.
|
# If SERVER_START, this is where we'll serve the server.
|
||||||
# If DEVICE_START, this is where the device expects the server to be.
|
# If DEVICE_START, this is where the device expects the server to be.
|
||||||
export SERVER_URL=ws://localhost:8000/
|
export SERVER_URL=ws://localhost:8000/
|
||||||
|
@ -26,6 +31,46 @@ export LOG_LEVEL="INFO"
|
||||||
|
|
||||||
### SETUP
|
### SETUP
|
||||||
|
|
||||||
|
# if using local models, install the models / executables
if [[ "$ALL_LOCAL" == "True" ]]; then

    # Download and unpack the Piper TTS executable and the configured voice
    # model into ./local_tts/piper (relative to the current directory).
    # Returns non-zero as soon as a step fails, so a failed download or
    # extraction cannot cascade into the steps that depend on it.
    # OS, ARCH and the PIPER_* variables are intentionally left global.
    install_piper() {
        OS=$(uname -s)
        ARCH=$(uname -m)
        if [ "$OS" = "Darwin" ]; then
            OS="macos"
            if [ "$ARCH" = "arm64" ]; then
                ARCH="aarch64"
            elif [ "$ARCH" = "x86_64" ]; then
                ARCH="x64"
            else
                echo "Piper: unsupported architecture"
                return 1
            fi
        elif [ "$OS" = "Linux" ]; then
            # NOTE(review): assumes Piper's Linux release assets are named
            # piper_linux_<uname -m>.tar.gz -- confirm against the release page.
            OS="linux"
        fi

        PIPER_ASSETNAME="piper_${OS}_${ARCH}.tar.gz"
        PIPER_URL="https://github.com/rhasspy/piper/releases/latest/download/"

        # -p: re-running the script must not fail because the directory exists.
        mkdir -p local_tts || return 1
        cd local_tts || return 1
        # -f: fail on HTTP errors instead of saving the error page as the tarball.
        curl -fOL "${PIPER_URL}${PIPER_ASSETNAME}" || return 1
        tar -xvzf "$PIPER_ASSETNAME" || return 1
        cd piper || return 1

        if [ "$OS" = "macos" ]; then
            if [ "$ARCH" = "x64" ]; then
                # NOTE(review): Rosetta is requested on x64 only, as in the
                # original script; a failure here is not treated as fatal.
                softwareupdate --install-rosetta --agree-to-license
            fi

            PIPER_PHONEMIZE_ASSETNAME="piper-phonemize_${OS}_${ARCH}.tar.gz"
            PIPER_PHONEMIZE_URL="https://github.com/rhasspy/piper-phonemize/releases/latest/download/"

            curl -fOL "${PIPER_PHONEMIZE_URL}${PIPER_PHONEMIZE_ASSETNAME}" || return 1
            tar -xvzf "$PIPER_PHONEMIZE_ASSETNAME" || return 1

            # Point the piper binary at the dylibs unpacked from piper-phonemize
            # by replacing its @rpath references with absolute paths.
            PIPER_DIR=$(pwd)
            install_name_tool -change @rpath/libespeak-ng.1.dylib "${PIPER_DIR}/piper-phonemize/lib/libespeak-ng.1.dylib" "${PIPER_DIR}/piper"
            install_name_tool -change @rpath/libonnxruntime.1.14.1.dylib "${PIPER_DIR}/piper-phonemize/lib/libonnxruntime.1.14.1.dylib" "${PIPER_DIR}/piper"
            install_name_tool -change @rpath/libpiper_phonemize.1.dylib "${PIPER_DIR}/piper-phonemize/lib/libpiper_phonemize.1.dylib" "${PIPER_DIR}/piper"
        fi

        # The voice model and its JSON config are needed on every platform,
        # so they are fetched outside the macOS-only branch.
        curl -fOL "${PIPER_VOICE_URL}${PIPER_VOICE_NAME}" || return 1
        curl -fOL "${PIPER_VOICE_URL}${PIPER_VOICE_NAME}.json" || return 1
    }

    # Remember where we started so the working directory is restored even when
    # the install aborts part-way (a blind `cd ../..` would overshoot then).
    PIPER_START_DIR=$(pwd)
    install_piper || echo "Piper: local TTS setup failed" >&2
    cd "$PIPER_START_DIR"
    unset -f install_piper
fi
|
||||||
|
|
||||||
# (for dev, reset the ports we were using)
|
# (for dev, reset the ports we were using)
|
||||||
|
|
||||||
SERVER_PORT=$(echo $SERVER_URL | grep -oE "[0-9]+")
|
SERVER_PORT=$(echo $SERVER_URL | grep -oE "[0-9]+")
|
||||||
|
|
43
OS/01/tts.py
43
OS/01/tts.py
|
@ -7,20 +7,37 @@ from openai import OpenAI
|
||||||
from pydub import AudioSegment
|
from pydub import AudioSegment
|
||||||
from pydub.playback import play
|
from pydub.playback import play
|
||||||
from playsound import playsound
|
from playsound import playsound
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
|
||||||
client = OpenAI()
|
client = OpenAI()
|
||||||
|
|
||||||
def tts(text, play_audio):
    """Synthesize *text* to speech and return the resulting audio file bytes.

    When the ``ALL_LOCAL`` environment variable is exactly ``'True'`` (the
    same test the start script uses before installing Piper), the Piper
    executable under ``local_tts/piper`` next to this file is run and its
    output is written to a temporary ``.wav`` file. Otherwise the OpenAI
    speech API is called and the response is stored as a temporary ``.mp3``.

    Args:
        text: The text to speak.
        play_audio: If truthy, play the generated file with ``playsound``
            before returning.

    Returns:
        The raw bytes of the generated audio file.

    Raises:
        RuntimeError: Local mode is selected but ``PIPER_VOICE_NAME`` is unset.
        subprocess.CalledProcessError: The Piper executable exited non-zero.
    """
    if os.getenv('ALL_LOCAL') != 'True':
        response = client.audio.speech.create(
            model="tts-1",
            voice="alloy",
            input=text,
            response_format="mp3"
        )
        with tempfile.NamedTemporaryFile(suffix=".mp3") as temp_file:
            response.stream_to_file(temp_file.name)

            if play_audio:
                playsound(temp_file.name)

            return temp_file.read()

    voice_name = os.getenv('PIPER_VOICE_NAME')
    if not voice_name:
        # Fail with a clear message instead of a TypeError from os.path.join.
        raise RuntimeError(
            "PIPER_VOICE_NAME must be set to use local TTS (ALL_LOCAL=True)"
        )

    piper_dir = os.path.join(os.path.dirname(__file__), 'local_tts', 'piper')

    # Only reserve a unique path here; the handle is closed before Piper runs
    # so the other process can write to it freely.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        output_file = temp_file.name

    try:
        # check=True: surface a Piper failure instead of returning empty audio.
        subprocess.run([
            os.path.join(piper_dir, 'piper'),
            '--model', os.path.join(piper_dir, voice_name),
            '--output_file', output_file
        ], input=text, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
            text=True, check=True)

        if play_audio:
            playsound(output_file)

        with open(output_file, 'rb') as audio_file:
            return audio_file.read()
    finally:
        # delete=False means nobody else cleans this file up.
        os.remove(output_file)
|
||||||
|
|
Loading…
Reference in New Issue