Spaces:

AitBAD
/

kab-tts-tanti

Sleeping

App Files Files Community

kab-tts-tanti / app.py

AitBAD

Update app.py

2d2220b verified 3 months ago

raw

history blame contribute delete

5.9 kB

	# app.py - Fully standalone Kabyle TTS with working downloads

	import gradio as gr
	from transformers import VitsModel, AutoTokenizer
	import torch
	import scipy.io.wavfile as wavfile
	import numpy as np
	import os
	from datetime import datetime
	from pydub import AudioSegment
	import pdfplumber

	# One-time startup work: load the Kabyle MMS checkpoint and its tokenizer,
	# placing the model on the GPU when torch reports one and on the CPU otherwise.
	print("Loading facebook/mms-tts-kab...")

	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"

	model = VitsModel.from_pretrained("facebook/mms-tts-kab").to(device)
	tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kab")

	# The rate the model generates audio at; used when WAV files are written.
	sampling_rate = model.config.sampling_rate
	print(f"Running on device: {device}")

	# Generated audio files are written into this folder.
	os.makedirs("outputs", exist_ok=True)

	def convert_wav_to_mp3(wav_path, mp3_path):
	    """Transcode the WAV file at ``wav_path`` into an MP3 at ``mp3_path``.

	    Returns ``mp3_path`` when the export succeeds. Any exception raised while
	    loading or exporting is printed and ``None`` is returned instead, so the
	    conversion is best-effort and never propagates an error to the caller.
	    """
	    try:
	        AudioSegment.from_wav(wav_path).export(mp3_path, format="mp3")
	    except Exception as err:
	        print(f"Failed to convert WAV to MP3: {err}")
	        return None
	    return mp3_path

	def synthesize(text, speed):
	    """Synthesize ``text`` with the loaded model and save the result as a WAV.

	    Args:
	        text: Text to speak; surrounding whitespace is stripped before
	            tokenizing.
	        speed: Speed multiplier. It is applied by scaling the sample rate
	            written to the WAV file (the samples themselves are unchanged,
	            so pitch shifts together with tempo).

	    Returns:
	        Path of the WAV file written under ``outputs``.

	    Raises:
	        ValueError: If ``text`` is ``None`` or blank, or ``speed`` is missing
	            or not positive.
	    """
	    import uuid  # local import: only needed to build a collision-free filename

	    if text is None or not text.strip():
	        raise ValueError("Text is empty!")
	    if speed is None or speed <= 0:
	        # A zero or negative rate would produce an unusable WAV header.
	        raise ValueError("Speed must be a positive number.")

	    # Tokenize and move the tensors to the device the model lives on.
	    inputs = tokenizer(text.strip(), return_tensors="pt").to(device)

	    # Inference only, so gradient tracking is switched off.
	    with torch.no_grad():
	        waveform = model(**inputs).waveform.cpu().numpy().squeeze()

	    # round() instead of int(): float error in the product must not
	    # truncate the rate down by one.
	    adjusted_rate = max(1, round(sampling_rate * speed))

	    # The timestamp only has one-second resolution, so two requests in the
	    # same second would overwrite each other; the random suffix keeps every
	    # output file distinct.
	    os.makedirs("outputs", exist_ok=True)
	    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
	    unique_suffix = uuid.uuid4().hex[:8]
	    wav_filepath = os.path.join("outputs", f"kabyle_{timestamp}_{unique_suffix}.wav")
	    wavfile.write(wav_filepath, adjusted_rate, np.asarray(waveform, dtype=np.float32))

	    return wav_filepath

	def _read_uploaded_text(path):
	    """Return the text content of the ``.txt`` or ``.pdf`` file at ``path``."""
	    extension = os.path.splitext(path)[1].lower()
	    if extension not in (".txt", ".pdf"):
	        # Checked outside the try block so the message is not re-wrapped
	        # as a read failure.
	        raise ValueError("Unsupported file type. Please upload a .txt or .pdf file.")

	    try:
	        if extension == ".txt":
	            with open(path, "r", encoding="utf-8") as handle:
	                return handle.read()
	        with pdfplumber.open(path) as pdf:
	            # extract_text() may return None for a page without extractable
	            # text (behaviour depends on the pdfplumber version); join()
	            # would fail on that, so such pages contribute an empty string.
	            return " ".join(page.extract_text() or "" for page in pdf.pages)
	    except Exception as exc:
	        raise ValueError(f"Could not read file: {exc}") from exc


	def process_input(input_type, typed_text, uploaded_file, speed):
	    """Turn the selected input into speech and return the UI updates.

	    Args:
	        input_type: The selected radio label; "📝 Type Text" uses
	            ``typed_text``, any other value uses ``uploaded_file``.
	        typed_text: Text entered by the user (may be ``None`` or empty).
	        uploaded_file: The uploaded file, either a path string or an object
	            whose ``name`` attribute holds the path; ``None`` if nothing
	            was uploaded.
	        speed: Speed multiplier forwarded to ``synthesize``.

	    Returns:
	        A 3-tuple ``(audio player, WAV download, MP3 download)`` of Gradio
	        components. The MP3 download is hidden when the conversion failed.

	    Raises:
	        ValueError: Missing upload, unsupported or unreadable file, or
	            empty input text.
	        RuntimeError: Audio generation failed.
	    """
	    if input_type == "📝 Type Text":
	        final_text = typed_text or ""
	    else:
	        if uploaded_file is None:
	            raise ValueError("Please upload a file.")
	        # Accept both a plain path string and an object exposing ``.name``.
	        file_path = getattr(uploaded_file, "name", uploaded_file)
	        final_text = _read_uploaded_text(file_path)

	    final_text = (final_text or "").strip()
	    if not final_text:
	        raise ValueError("Input text is empty.")

	    # Cap the input length. Nothing is appended after the cut: any marker
	    # added here would be passed to the model and spoken in the audio.
	    max_chars = 1000
	    if len(final_text) > max_chars:
	        final_text = final_text[:max_chars]

	    try:
	        wav_path = synthesize(final_text, speed)
	        # Swap only the extension; str.replace would touch every ".wav"
	        # occurrence in the path.
	        mp3_target = os.path.splitext(wav_path)[0] + ".mp3"
	        mp3_path = convert_wav_to_mp3(wav_path, mp3_target)
	    except Exception as exc:
	        raise RuntimeError(f"Synthesis failed: {exc}") from exc

	    # convert_wav_to_mp3 returns None on failure; offering a download for a
	    # file that was never written would break the UI, so hide it instead.
	    if mp3_path is None:
	        mp3_output = gr.File(value=None, visible=False)
	    else:
	        mp3_output = gr.File(value=mp3_path, visible=True)

	    return (
	        gr.Audio(value=wav_path, label="Generated Speech", autoplay=False),
	        gr.File(value=wav_path, visible=True),
	        mp3_output,
	    )


	with gr.Blocks(title="🗣️ Kabyle TTS") as demo:
	    # Page header.
	    gr.Markdown("# 🎵 Kabyle Text-to-Speech")
	    gr.Markdown("Convert text to speech using Meta's MMS-TTS model for Kabyle.")

	    # NOTE(review): the row/column nesting below is reconstructed from the
	    # statement order; confirm it matches the intended layout.
	    with gr.Row():
	        # Input side: source selector, text box or file upload, speed, trigger.
	        with gr.Column():
	            input_type = gr.Radio(
	                choices=["📝 Type Text", "📎 Upload File"],
	                value="📝 Type Text",
	                label="Input Method",
	            )

	            typed_text = gr.Textbox(
	                label="Enter Text",
	                placeholder="Example: Azul fell-ay! Kaci tazmamt.",
	                lines=6,
	                visible=True,
	            )
	            uploaded_file = gr.File(
	                label="Upload .txt or .pdf",
	                file_types=['.txt', '.pdf'],
	                visible=False,
	            )

	            speed = gr.Slider(
	                minimum=0.5,
	                maximum=2.0,
	                value=1.0,
	                step=0.1,
	                label="Speech Speed",
	            )

	            btn = gr.Button("🔊 Generate Speech", variant="primary")

	        # Output side: audio player plus the two download slots.
	        with gr.Column():
	            output_audio = gr.Audio(label="Generated Speech", autoplay=False)

	            gr.Markdown("### 💾 Download Audio")

	            # Download components start hidden; process_input reveals them.
	            file_wav = gr.File(label="WAV Download", file_types=['.wav'], visible=False)
	            file_mp3 = gr.File(label="MP3 Download", file_types=['.mp3'], visible=False)

	    def toggle_inputs(choice):
	        """Show the text box or the file upload, depending on ``choice``."""
	        show_textbox = choice == "📝 Type Text"
	        show_upload = choice == "📎 Upload File"
	        return gr.update(visible=show_textbox), gr.update(visible=show_upload)

	    def _hide_downloads():
	        """Hide both download components."""
	        return gr.update(visible=False), gr.update(visible=False)

	    # Switch between the two input widgets when the radio selection changes.
	    input_type.change(fn=toggle_inputs, inputs=input_type, outputs=[typed_text, uploaded_file])

	    # Main event: generate the audio and fill the player and download slots.
	    btn.click(
	        fn=process_input,
	        inputs=[input_type, typed_text, uploaded_file, speed],
	        outputs=[output_audio, file_wav, file_mp3],
	        queue=True,
	    )

	    # Editing either input hides the download components again.
	    typed_text.change(fn=_hide_downloads, inputs=[], outputs=[file_wav, file_mp3])
	    uploaded_file.change(fn=_hide_downloads, inputs=[], outputs=[file_wav, file_mp3])

	    # Footer crediting the model.
	    gr.HTML("""
	<hr style="margin:20px 0; border-top:1px solid #ddd;">
	<p style="text-align:center; color:#666;">
	Powered by
	<a href="https://huggingface.co/facebook/mms-tts-kab" target="_blank">facebook/mms-tts-kab</a>
	</p>
	""")


	# Start the Gradio server only when this file is executed as a script.
	if __name__ == "__main__":
	    launch_options = {"server_port": 7860, "debug": True}
	    demo.launch(**launch_options)