# app.py - Fully standalone Kabyle TTS with working downloads
"""Gradio app that turns typed or uploaded Kabyle text into speech.

The MMS-TTS Kabyle model is loaded once at import time. Each request
writes a WAV file (and, when conversion succeeds, an MP3 copy) into the
``outputs`` folder and exposes both as downloads in the UI.
"""
import os
from datetime import datetime

import gradio as gr
import numpy as np
import pdfplumber
import scipy.io.wavfile as wavfile
import torch
from pydub import AudioSegment
from transformers import VitsModel, AutoTokenizer

MODEL_ID = "facebook/mms-tts-kab"
OUTPUT_DIR = "outputs"
MAX_TEXT_CHARS = 1000
TYPE_TEXT = "📝 Type Text"
UPLOAD_FILE = "📎 Upload File"

# Load model once at startup
print("Loading facebook/mms-tts-kab...")
model = VitsModel.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
sampling_rate = model.config.sampling_rate
print(f"Running on device: {device}")

# Create outputs folder
os.makedirs(OUTPUT_DIR, exist_ok=True)


def convert_wav_to_mp3(wav_path, mp3_path):
    """Convert a WAV file to MP3.

    Args:
        wav_path: Path of the existing WAV file.
        mp3_path: Path where the MP3 file should be written.

    Returns:
        ``mp3_path`` on success, or ``None`` if loading or exporting
        failed (the failure is printed, not raised, so that the WAV
        result can still be delivered).
    """
    try:
        audio = AudioSegment.from_wav(wav_path)
        audio.export(mp3_path, format="mp3")
        return mp3_path
    except Exception as e:
        print(f"Failed to convert WAV to MP3: {e}")
        return None


def synthesize(text, speed):
    """Synthesize ``text`` and write it to a new WAV file.

    Speed is applied by scaling the sample rate stored in the WAV header;
    the samples themselves are not resampled, so a non-1.0 speed also
    shifts the perceived pitch.

    Args:
        text: Text to speak. Surrounding whitespace is ignored.
        speed: Multiplier applied to the model's sampling rate.

    Returns:
        Path of the written WAV file inside ``OUTPUT_DIR``.

    Raises:
        ValueError: If ``text`` is empty/whitespace or ``speed`` yields a
            non-positive sample rate.
    """
    if not text or not text.strip():
        raise ValueError("Text is empty!")

    adjusted_rate = int(sampling_rate * speed)
    if adjusted_rate <= 0:
        raise ValueError("Speed must be positive.")

    # Tokenize
    inputs = tokenizer(text.strip(), return_tensors="pt").to(device)

    # Generate waveform
    with torch.no_grad():
        waveform = model(**inputs).waveform.cpu().numpy().squeeze()

    # Microsecond resolution keeps two requests arriving within the same
    # second from overwriting each other's output file.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    wav_filepath = os.path.join(OUTPUT_DIR, f"kabyle_{timestamp}.wav")
    wavfile.write(wav_filepath, adjusted_rate, np.float32(waveform))
    return wav_filepath


def _read_uploaded_text(uploaded_file):
    """Return the text content of an uploaded ``.txt`` or ``.pdf`` file.

    ``uploaded_file`` may be an object exposing a ``name`` attribute that
    holds the file path, or the path string itself.

    Raises:
        ValueError: If the extension is unsupported or the file cannot be
            read or parsed.
    """
    path = getattr(uploaded_file, "name", uploaded_file)
    file_extension = os.path.splitext(path)[1].lower()

    if file_extension not in (".txt", ".pdf"):
        raise ValueError("Unsupported file type. Please upload a .txt or .pdf file.")

    try:
        if file_extension == ".txt":
            with open(path, "r", encoding="utf-8") as f:
                return f.read()
        with pdfplumber.open(path) as pdf:
            # extract_text() may yield None for a page without extractable
            # text (e.g. a scanned image); treat such pages as empty
            # instead of failing the whole document.
            return " ".join(page.extract_text() or "" for page in pdf.pages)
    except Exception as e:
        raise ValueError(f"Could not read file: {e}") from e


def process_input(input_type, typed_text, uploaded_file, speed):
    """Gradio callback: resolve the input text and generate the audio files.

    Args:
        input_type: Selected radio choice; ``TYPE_TEXT`` uses
            ``typed_text``, anything else uses ``uploaded_file``.
        typed_text: Contents of the text box.
        uploaded_file: Value of the upload component, or ``None``.
        speed: Speech speed multiplier forwarded to ``synthesize``.

    Returns:
        A 3-tuple of component updates for (audio player, WAV download,
        MP3 download). The MP3 download is hidden when conversion failed.

    Raises:
        ValueError: If no usable text could be obtained from the input.
        RuntimeError: If synthesis or output preparation fails.
    """
    # Choose source
    if input_type == TYPE_TEXT:
        final_text = typed_text or ""
    else:
        if uploaded_file is None:
            raise ValueError("Please upload a file.")
        final_text = _read_uploaded_text(uploaded_file)

    # Strip first so leading whitespace cannot consume the length budget.
    final_text = final_text.strip()
    if not final_text:
        raise ValueError("Input text is empty.")

    # Truncate long texts. No marker is appended: anything added here
    # would be passed to the synthesizer along with the real text.
    if len(final_text) > MAX_TEXT_CHARS:
        print(f"Input truncated from {len(final_text)} to {MAX_TEXT_CHARS} characters.")
        final_text = final_text[:MAX_TEXT_CHARS]

    # Generate audio
    try:
        wav_path = synthesize(final_text, speed)

        # Convert WAV to MP3; a None result means the conversion failed.
        mp3_path = convert_wav_to_mp3(wav_path, os.path.splitext(wav_path)[0] + ".mp3")
        if mp3_path is None:
            mp3_output = gr.File(value=None, visible=False)
        else:
            mp3_output = gr.File(value=mp3_path, visible=True)

        # Component instances returned from a callback act as updates.
        return (
            gr.Audio(value=wav_path, label="Generated Speech", autoplay=False),
            gr.File(value=wav_path, visible=True),
            mp3_output,
        )
    except Exception as e:
        raise RuntimeError(f"Synthesis failed: {str(e)}") from e


with gr.Blocks(title="🗣️ Kabyle TTS") as demo:
    gr.Markdown("# 🎵 Kabyle Text-to-Speech")
    gr.Markdown("Convert text to speech using Meta's MMS-TTS model for Kabyle.")

    with gr.Row():
        with gr.Column():
            input_type = gr.Radio(
                [TYPE_TEXT, UPLOAD_FILE],
                value=TYPE_TEXT,
                label="Input Method",
            )
            typed_text = gr.Textbox(
                label="Enter Text",
                placeholder="Example: Azul fell-ay! \nKaci tazmamt.",
                lines=6,
                visible=True,
            )
            uploaded_file = gr.File(
                label="Upload .txt or .pdf",
                file_types=['.txt', '.pdf'],
                visible=False,
            )
            speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speech Speed")
            btn = gr.Button("🔊 Generate Speech", variant="primary")

        with gr.Column():
            output_audio = gr.Audio(label="Generated Speech", autoplay=False)
            gr.Markdown("### 💾 Download Audio")
            # These are the download components, initially hidden
            file_wav = gr.File(label="WAV Download", file_types=['.wav'], visible=False)
            file_mp3 = gr.File(label="MP3 Download", file_types=['.mp3'], visible=False)

    def toggle_inputs(choice):
        """Show the text box or the upload widget to match the radio choice."""
        return (
            gr.update(visible=choice == TYPE_TEXT),
            gr.update(visible=choice == UPLOAD_FILE),
        )

    def _hide_downloads():
        """Hide both download widgets; used whenever the input changes."""
        return gr.update(visible=False), gr.update(visible=False)

    # Toggle visibility of text input vs file input
    input_type.change(toggle_inputs, input_type, [typed_text, uploaded_file])

    # Main event
    btn.click(
        fn=process_input,
        inputs=[input_type, typed_text, uploaded_file, speed],
        outputs=[output_audio, file_wav, file_mp3],
        queue=True,
    )

    # Hide download components when input changes
    typed_text.change(_hide_downloads, [], [file_wav, file_mp3])
    uploaded_file.change(_hide_downloads, [], [file_wav, file_mp3])

    gr.HTML("\nPowered by facebook/mms-tts-kab\n")

# Launch app
if __name__ == "__main__":
    demo.launch(server_port=7860, debug=True)