malek-messaoudii
Refactor audio processing to utilize free models and enhance logging; update TTS and STT services for improved functionality
95cb26e
| import torch | |
| import torchaudio | |
| from transformers import pipeline | |
| import logging | |
| import tempfile | |
| import os | |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Global STT pipeline. Starts as None, is assigned by load_stt_model(), and is
# read by speech_to_text(), which triggers loading while it is still None.
stt_pipeline = None
def load_stt_model():
    """Build the Whisper speech-recognition pipeline and store it globally.

    On success the module-level ``stt_pipeline`` holds the new pipeline.
    On any failure the error is logged and ``stt_pipeline`` is set to
    ``None``; nothing is raised to the caller.
    """
    global stt_pipeline

    # Keyword options forwarded to the pipeline factory.
    whisper_options = {
        "model": "openai/whisper-small",  # Free model
        "device": "cpu",  # Use CPU to avoid GPU requirements
    }

    try:
        logger.info("Loading Whisper STT model...")
        stt_pipeline = pipeline("automatic-speech-recognition", **whisper_options)
        logger.info("β Whisper STT model loaded successfully")
    except Exception as load_error:
        # Log first, then reset the global so callers can detect the failure
        # by checking for None.
        logger.error(f"β Failed to load Whisper model: {load_error!s}")
        stt_pipeline = None
async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
    """
    Convert audio bytes to text using free Whisper model.

    The bytes are written to a temporary ``.wav``-suffixed file, the file path
    is passed to the module-level ``stt_pipeline`` (loaded on first use via
    ``load_stt_model``), and the temporary file is removed afterwards whether
    or not transcription succeeded.

    NOTE(review): the pipeline call is made synchronously and this coroutine
    contains no ``await``, so it does not yield to the event loop while a
    transcription is running.

    Args:
        audio_bytes: Raw audio file bytes. Must not be empty.
        filename: Name of the audio file. Kept for interface compatibility;
            it is not used by this function.

    Returns:
        Transcribed text, or a fixed apology sentence when the model
        produced no text.

    Raises:
        Exception: If the audio is empty, the model cannot be loaded, or
            transcription fails. The message always starts with
            "Speech-to-text conversion failed:".
    """
    # Reject empty input up front: there is nothing to transcribe, and this
    # avoids loading the model only to fail on an empty file.
    if not audio_bytes:
        raise Exception("Speech-to-text conversion failed: audio data is empty")

    try:
        # Lazy-load on first use; load_stt_model() leaves the global as None
        # when loading fails, hence the second check.
        if stt_pipeline is None:
            load_stt_model()
        if stt_pipeline is None:
            raise Exception("STT model failed to load")

        logger.info("Converting audio to text using Whisper")

        # delete=False keeps the file on disk after it is closed so it can be
        # handed to the pipeline by path; removal is done manually below.
        temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        temp_audio_path = temp_audio.name
        try:
            # Write inside the try so a failed write still reaches the cleanup.
            with temp_audio:
                temp_audio.write(audio_bytes)

            # Transcribe using Whisper
            result = stt_pipeline(temp_audio_path)
            transcribed_text = result.get("text", "").strip()
            if not transcribed_text:
                transcribed_text = "Sorry, I couldn't understand the audio."

            logger.info("β STT successful: '%s'", transcribed_text)
            return transcribed_text
        finally:
            # Clean up the temporary file. A cleanup problem is only logged so
            # it cannot replace the transcription result or the real error.
            try:
                os.unlink(temp_audio_path)
            except FileNotFoundError:
                pass
            except OSError as cleanup_error:
                logger.warning(
                    "Could not remove temporary audio file %s: %s",
                    temp_audio_path,
                    cleanup_error,
                )
    except Exception as e:
        logger.error("β STT failed: %s", e)
        raise Exception(f"Speech-to-text conversion failed: {e}") from e