109 lines
		
	
	
		
			3.3 KiB
		
	
	
	
		
			Python
		
	
	
	
			
		
		
	
	
			109 lines
		
	
	
		
			3.3 KiB
		
	
	
	
		
			Python
		
	
	
	
| class Stt:
 | |
|     def __init__(self, config):
 | |
|         pass
 | |
| 
 | |
|     def stt(self, audio_file_path):
 | |
|         return stt(audio_file_path)
 | |
| 
 | |
| 
 | |
| 
 | |
| from datetime import datetime
 | |
| import os
 | |
| import contextlib
 | |
| import tempfile
 | |
| import ffmpeg
 | |
| import subprocess
 | |
| import openai
 | |
| from openai import OpenAI
 | |
| 
 | |
| 
 | |
| client = OpenAI()
 | |
| 
 | |
| def convert_mime_type_to_format(mime_type: str) -> str:
 | |
|     if mime_type == "audio/x-wav" or mime_type == "audio/wav":
 | |
|         return "wav"
 | |
|     if mime_type == "audio/webm":
 | |
|         return "webm"
 | |
|     if mime_type == "audio/raw":
 | |
|         return "dat"
 | |
| 
 | |
|     return mime_type
 | |
| 
 | |
| @contextlib.contextmanager
 | |
| def export_audio_to_wav_ffmpeg(audio: bytearray, mime_type: str) -> str:
 | |
|     temp_dir = tempfile.gettempdir()
 | |
| 
 | |
|     # Create a temporary file with the appropriate extension
 | |
|     input_ext = convert_mime_type_to_format(mime_type)
 | |
|     input_path = os.path.join(temp_dir, f"input_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.{input_ext}")
 | |
|     with open(input_path, 'wb') as f:
 | |
|         f.write(audio)
 | |
| 
 | |
|     # Check if the input file exists
 | |
|     assert os.path.exists(input_path), f"Input file does not exist: {input_path}"
 | |
| 
 | |
|     # Export to wav
 | |
|     output_path = os.path.join(temp_dir, f"output_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav")
 | |
|     if mime_type == "audio/raw":
 | |
|         ffmpeg.input(
 | |
|             input_path,
 | |
|             f='s16le',
 | |
|             ar='16000',
 | |
|             ac=1,
 | |
|         ).output(output_path, loglevel='panic').run()
 | |
|     else:
 | |
|         ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k', loglevel='panic').run()
 | |
| 
 | |
|     try:
 | |
|         yield output_path
 | |
|     finally:
 | |
|         os.remove(input_path)
 | |
|         os.remove(output_path)
 | |
| 
 | |
| def run_command(command):
 | |
|     result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
 | |
|     return result.stdout, result.stderr
 | |
| 
 | |
| def get_transcription_file(wav_file_path: str):
 | |
|     local_path = os.path.join(os.path.dirname(__file__), 'local_service')
 | |
|     whisper_rust_path = os.path.join(os.path.dirname(__file__), 'whisper-rust', 'target', 'release')
 | |
|     model_name = os.getenv('WHISPER_MODEL_NAME', 'ggml-tiny.en.bin')
 | |
| 
 | |
|     output, error = run_command([
 | |
|         os.path.join(whisper_rust_path, 'whisper-rust'),
 | |
|         '--model-path', os.path.join(local_path, model_name),
 | |
|         '--file-path', wav_file_path
 | |
|     ])
 | |
| 
 | |
|     return output
 | |
| 
 | |
| def get_transcription_bytes(audio_bytes: bytearray, mime_type):
 | |
|     with export_audio_to_wav_ffmpeg(audio_bytes, mime_type) as wav_file_path:
 | |
|         return get_transcription_file(wav_file_path)
 | |
| 
 | |
| def stt_bytes(audio_bytes: bytearray, mime_type="audio/wav"):
 | |
|     with export_audio_to_wav_ffmpeg(audio_bytes, mime_type) as wav_file_path:
 | |
|         return stt_wav(wav_file_path)
 | |
| 
 | |
| def stt_wav(wav_file_path: str):
 | |
| 
 | |
|     audio_file = open(wav_file_path, "rb")
 | |
|     try:
 | |
|         transcript = client.audio.transcriptions.create(
 | |
|             model="whisper-1", 
 | |
|             file=audio_file,
 | |
|             response_format="text"
 | |
|         )
 | |
|     except openai.BadRequestError as e:
 | |
|         print(f"openai.BadRequestError: {e}")
 | |
|         return None
 | |
| 
 | |
|     return transcript
 | |
| 
 | |
| def stt(input_data, mime_type="audio/wav"):
 | |
|     if isinstance(input_data, str):
 | |
|         return stt_wav(input_data)
 | |
|     elif isinstance(input_data, bytearray):
 | |
|         return stt_bytes(input_data, mime_type)
 | |
|     else:
 | |
|         raise ValueError("Input data should be either a path to a wav file (str) or audio bytes (bytearray)") |