import torch
import torchaudio
from transformers import pipeline
import logging
import tempfile
import os

logger = logging.getLogger(__name__)

# Global STT pipeline, loaded lazily on first use
stt_pipeline = None


def load_stt_model():
    """Load the free Whisper model for speech-to-text."""
    global stt_pipeline
    try:
        logger.info("Loading Whisper STT model...")
        stt_pipeline = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-small",  # Free, openly available checkpoint
            device="cpu",  # Use CPU to avoid GPU requirements
        )
        logger.info("✓ Whisper STT model loaded successfully")
    except Exception as e:
        logger.error(f"✗ Failed to load Whisper model: {e}")
        stt_pipeline = None
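

# Output shape sketch (assumption for illustration: a local file "clip.wav").
# transformers ASR pipelines return a dict carrying the transcription under
# the "text" key, which is what speech_to_text() below relies on:
#
#     result = stt_pipeline("clip.wav")
#     result["text"]  # e.g. "hello world"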


async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
    """
    Convert audio bytes to text using the free Whisper model.

    Args:
        audio_bytes: Raw audio file bytes
        filename: Name of the audio file

    Returns:
        Transcribed text
    """
    global stt_pipeline
    try:
        # Lazily load the model on first call
        if stt_pipeline is None:
            load_stt_model()
        if stt_pipeline is None:
            raise Exception("STT model failed to load")

        logger.info("Converting audio to text using Whisper")

        # Save audio bytes to a temporary file the pipeline can read
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
            temp_audio.write(audio_bytes)
            temp_audio_path = temp_audio.name

        try:
            # Transcribe using Whisper
            result = stt_pipeline(temp_audio_path)
            transcribed_text = result.get("text", "").strip()
            if not transcribed_text:
                transcribed_text = "Sorry, I couldn't understand the audio."
            logger.info(f"✓ STT successful: '{transcribed_text}'")
            return transcribed_text
        finally:
            # Clean up the temporary file
            if os.path.exists(temp_audio_path):
                os.unlink(temp_audio_path)
    except Exception as e:
        logger.error(f"✗ STT failed: {e}")
        raise Exception(f"Speech-to-text conversion failed: {e}")