# kab-tts-tanti / app.py
# AitBAD's picture
# Update app.py
# 2d2220b verified
# app.py - Fully standalone Kabyle TTS with working downloads
import gradio as gr
from transformers import VitsModel, AutoTokenizer
import torch
import scipy.io.wavfile as wavfile
import numpy as np
import os
from datetime import datetime
from pydub import AudioSegment
import pdfplumber
# Start-up: runs once at import time so every request reuses the same model.
# Decide where inference runs before the weights are loaded.
device = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading facebook/mms-tts-kab...")
model = VitsModel.from_pretrained("facebook/mms-tts-kab").to(device)
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kab")

# Native output rate of the checkpoint; synthesize() scales it for speed.
sampling_rate = model.config.sampling_rate
print(f"Running on device: {device}")

# Directory that receives every generated audio file.
os.makedirs("outputs", exist_ok=True)
def convert_wav_to_mp3(wav_path, mp3_path):
    """Transcode the WAV file at *wav_path* into an MP3 at *mp3_path*.

    The conversion is best-effort: any failure is reported on stdout and
    signalled by the return value instead of an exception.

    Returns:
        *mp3_path* when the export succeeded, otherwise ``None``.
    """
    try:
        AudioSegment.from_wav(wav_path).export(mp3_path, format="mp3")
    except Exception as err:
        print(f"Failed to convert WAV to MP3: {err}")
        return None
    return mp3_path
def synthesize(text, speed):
    """Synthesize speech for *text* and write it to a WAV file.

    Uses the module-level ``tokenizer``, ``model``, ``device`` and
    ``sampling_rate``. Speed is applied by scaling the sample rate written to
    the WAV file; the samples themselves are not resampled, so pitch shifts
    together with speed.

    Args:
        text: Text to speak. Surrounding whitespace is ignored.
        speed: Positive playback-rate multiplier (1.0 keeps the model's rate).

    Returns:
        Path of the WAV file written inside the ``outputs`` directory.

    Raises:
        ValueError: If *text* is ``None``/blank or *speed* is not positive.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        raise ValueError("Text is empty!")
    # A zero or negative multiplier would produce an unusable sample rate.
    if speed is None or speed <= 0:
        raise ValueError("Speed must be a positive number.")

    # Tokenize
    inputs = tokenizer(cleaned, return_tensors="pt").to(device)

    # Generate waveform
    with torch.no_grad():
        waveform = model(**inputs).waveform.cpu().numpy().squeeze()

    # Adjust rate for speed
    adjusted_rate = int(sampling_rate * speed)

    # Save WAV. Microseconds in the name keep requests that land in the same
    # second from overwriting each other's file.
    os.makedirs("outputs", exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    wav_filepath = os.path.join("outputs", f"kabyle_{timestamp}.wav")
    wavfile.write(wav_filepath, adjusted_rate, np.float32(waveform))
    return wav_filepath
def _read_uploaded_text(uploaded_file):
    """Return the text contained in an uploaded ``.txt`` or ``.pdf`` file."""
    if uploaded_file is None:
        raise ValueError("Please upload a file.")

    # Accept both a temp-file style object exposing `.name` and a plain path.
    path = getattr(uploaded_file, "name", uploaded_file)
    extension = os.path.splitext(path)[1].lower()
    if extension not in (".txt", ".pdf"):
        raise ValueError("Unsupported file type. Please upload a .txt or .pdf file.")

    try:
        if extension == ".txt":
            with open(path, "r", encoding="utf-8") as f:
                return f.read()
        with pdfplumber.open(path) as pdf:
            # extract_text() may yield None for a page without a text layer.
            return " ".join(page.extract_text() or "" for page in pdf.pages)
    except Exception as e:
        raise ValueError(f"Could not read file: {e}") from e


def process_input(input_type, typed_text, uploaded_file, speed):
    """Turn the UI inputs into generated audio and download components.

    Args:
        input_type: Selected input method. A value ending in ``"Type Text"``
            selects *typed_text*; any other value selects *uploaded_file*.
        typed_text: Text entered by the user (may be ``None`` or blank).
        uploaded_file: Uploaded ``.txt``/``.pdf`` file, given either as a path
            string or as an object exposing the path in ``.name``.
        speed: Speed multiplier forwarded to ``synthesize``.

    Returns:
        A 3-tuple ``(gr.Audio, gr.File, gr.File)`` for the player, the WAV
        download and the MP3 download. The MP3 component is hidden when the
        MP3 conversion failed.

    Raises:
        ValueError: If no usable text could be obtained from the inputs.
        RuntimeError: If synthesis or building the outputs fails.
    """
    # Choose source. Only the ASCII tail of the label is matched so the check
    # does not depend on how the emoji prefix of the choice is encoded.
    if str(input_type or "").endswith("Type Text"):
        final_text = typed_text or ""
    else:
        final_text = _read_uploaded_text(uploaded_file)

    final_text = final_text.strip()
    if not final_text:
        raise ValueError("Input text is empty.")

    # Truncate long texts. No marker is appended: everything in final_text is
    # sent to the synthesizer and would be spoken.
    max_chars = 1000
    if len(final_text) > max_chars:
        print(f"Input truncated to {max_chars} characters.")
        final_text = final_text[:max_chars]

    # Generate audio
    try:
        wav_path = synthesize(final_text, speed)

        # Convert WAV to MP3; mp3_path is None when the conversion failed.
        mp3_target = os.path.splitext(wav_path)[0] + ".mp3"
        mp3_path = convert_wav_to_mp3(wav_path, mp3_target)

        # Return component instances that update the outputs in place.
        return (
            gr.Audio(value=wav_path, label="Generated Speech", autoplay=False),
            gr.File(value=wav_path, visible=True),
            gr.File(value=mp3_path, visible=mp3_path is not None),
        )
    except Exception as e:
        raise RuntimeError(f"Synthesis failed: {str(e)}") from e
# Values the input-method Radio passes to its callbacks. They are kept exactly
# as they were so code outside this block that compares against them keeps
# working; only the labels shown to the user carry the repaired emoji.
_TYPE_TEXT_VALUE = "πŸ“ Type Text"
_UPLOAD_FILE_VALUE = "πŸ“Ž Upload File"

with gr.Blocks(title="🗣️ Kabyle TTS") as demo:
    gr.Markdown("# 🎵 Kabyle Text-to-Speech")
    gr.Markdown("Convert text to speech using Meta's MMS-TTS model for Kabyle.")

    with gr.Row():
        # Left column: input controls.
        with gr.Column():
            input_type = gr.Radio(
                # (label shown in the UI, value sent to the callbacks)
                [
                    ("📝 Type Text", _TYPE_TEXT_VALUE),
                    ("📎 Upload File", _UPLOAD_FILE_VALUE),
                ],
                value=_TYPE_TEXT_VALUE,
                label="Input Method",
            )
            typed_text = gr.Textbox(
                label="Enter Text",
                placeholder="Example: Azul fell-ay! Kaci tazmamt.",
                lines=6,
                visible=True,
            )
            uploaded_file = gr.File(
                label="Upload .txt or .pdf",
                file_types=['.txt', '.pdf'],
                visible=False,
            )
            speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speech Speed")
            btn = gr.Button("🔊 Generate Speech", variant="primary")

        # Right column: player and downloads.
        with gr.Column():
            output_audio = gr.Audio(label="Generated Speech", autoplay=False)
            gr.Markdown("### 💾 Download Audio")
            # These are the download components, initially hidden
            file_wav = gr.File(label="WAV Download", file_types=['.wav'], visible=False)
            file_mp3 = gr.File(label="MP3 Download", file_types=['.mp3'], visible=False)

    def toggle_inputs(choice):
        """Show the text box or the file upload, matching the selected method."""
        return (
            gr.update(visible=choice == _TYPE_TEXT_VALUE),
            gr.update(visible=choice == _UPLOAD_FILE_VALUE),
        )

    def _hide_downloads():
        """Hide both download components."""
        return gr.update(visible=False), gr.update(visible=False)

    # Toggle visibility of text input vs file input
    input_type.change(toggle_inputs, input_type, [typed_text, uploaded_file])

    # Main event
    btn.click(
        fn=process_input,
        inputs=[input_type, typed_text, uploaded_file, speed],
        outputs=[output_audio, file_wav, file_mp3],
        queue=True,
    )

    # Hide download components when input changes, so files generated for a
    # previous input are not offered next to the edited one.
    typed_text.change(_hide_downloads, [], [file_wav, file_mp3])
    uploaded_file.change(_hide_downloads, [], [file_wav, file_mp3])

    gr.HTML("""
<hr style="margin:20px 0; border-top:1px solid #ddd;">
<p style="text-align:center; color:#666;">
Powered by
<a href="https://huggingface.co/facebook/mms-tts-kab" target="_blank">facebook/mms-tts-kab</a>
</p>
""")
# Entry point: start the Gradio server only when executed as a script.
if __name__ == "__main__":
    launch_options = {"server_port": 7860, "debug": True}
    demo.launch(**launch_options)