import torch
import torchaudio
from transformers import pipeline
import logging
import tempfile
import os

logger = logging.getLogger(__name__)

# Global STT pipeline, loaded lazily on first use
stt_pipeline = None


def load_stt_model():
    """Load the free Whisper model for speech-to-text."""
    global stt_pipeline
    try:
        logger.info("Loading Whisper STT model...")
        stt_pipeline = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-small",  # Free, openly available checkpoint
            device="cpu",  # Use CPU to avoid GPU requirements
        )
        logger.info("✓ Whisper STT model loaded successfully")
    except Exception as e:
        logger.error(f"✗ Failed to load Whisper model: {e}")
        stt_pipeline = None
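

# Output shape sketch (assumption for illustration: a local file "clip.wav").
# transformers ASR pipelines return a dict carrying the transcription under
# the "text" key, which is what speech_to_text() below relies on:
#
#     result = stt_pipeline("clip.wav")
#     result["text"]  # e.g. "hello world"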


async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
    """
    Convert audio bytes to text using the free Whisper model.

    Args:
        audio_bytes: Raw audio file bytes
        filename: Name of the audio file

    Returns:
        Transcribed text
    """
    global stt_pipeline
    try:
        # Lazily load the model on first call
        if stt_pipeline is None:
            load_stt_model()
        if stt_pipeline is None:
            raise Exception("STT model failed to load")

        logger.info("Converting audio to text using Whisper")

        # Save audio bytes to a temporary file the pipeline can read
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
            temp_audio.write(audio_bytes)
            temp_audio_path = temp_audio.name

        try:
            # Transcribe using Whisper
            result = stt_pipeline(temp_audio_path)
            transcribed_text = result.get("text", "").strip()
            if not transcribed_text:
                transcribed_text = "Sorry, I couldn't understand the audio."
            logger.info(f"✓ STT successful: '{transcribed_text}'")
            return transcribed_text
        finally:
            # Clean up the temporary file
            if os.path.exists(temp_audio_path):
                os.unlink(temp_audio_path)
    except Exception as e:
        logger.error(f"✗ STT failed: {e}")
        raise Exception(f"Speech-to-text conversion failed: {e}")