Spaces:
Sleeping
Sleeping
| # app.py - Fully standalone Kabyle TTS with working downloads | |
| import gradio as gr | |
| from transformers import VitsModel, AutoTokenizer | |
| import torch | |
| import scipy.io.wavfile as wavfile | |
| import numpy as np | |
| import os | |
| from datetime import datetime | |
| from pydub import AudioSegment | |
| import pdfplumber | |
# One-time startup work: fetch the MMS Kabyle checkpoint, place it on the best
# available device, and make sure the directory for generated audio exists.
model_id = "facebook/mms-tts-kab"
print("Loading facebook/mms-tts-kab...")

# Prefer the GPU when torch reports one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model = VitsModel.from_pretrained(model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Output sample rate is read from the model config and reused when writing WAVs.
sampling_rate = model.config.sampling_rate
print(f"Running on device: {device}")

# Generated audio files are written into this folder.
os.makedirs("outputs", exist_ok=True)
def convert_wav_to_mp3(wav_path, mp3_path):
    """Transcode the WAV file at ``wav_path`` into an MP3 written to ``mp3_path``.

    The conversion is best-effort: any exception raised while loading or
    exporting is printed and swallowed.

    Returns:
        ``mp3_path`` when the export succeeded, otherwise ``None``.
    """
    try:
        AudioSegment.from_wav(wav_path).export(mp3_path, format="mp3")
    except Exception as err:
        print(f"Failed to convert WAV to MP3: {err}")
        return None
    return mp3_path
def synthesize(text, speed):
    """Synthesize speech for ``text`` and save it as a WAV file under ``outputs/``.

    Speed is applied by scaling the sample rate written to the WAV file; the
    generated samples themselves are not resampled.

    Args:
        text: The text to speak. ``None``, empty, or whitespace-only is rejected.
        speed: Positive multiplier applied to the model's sampling rate.

    Returns:
        Path of the WAV file that was written.

    Raises:
        ValueError: If ``text`` is empty/``None`` or ``speed`` is not positive.
    """
    # Validate before touching the model so bad input fails fast and clearly.
    if text is None or not text.strip():
        raise ValueError("Text is empty!")
    if speed <= 0:
        raise ValueError("Speed must be positive.")

    # Tokenize
    inputs = tokenizer(text.strip(), return_tensors="pt").to(device)

    # Generate waveform (inference only, so gradients are disabled)
    with torch.no_grad():
        waveform = model(**inputs).waveform.cpu().numpy().squeeze()

    # Adjust rate for speed
    adjusted_rate = int(sampling_rate * speed)

    # Save WAV. Microseconds are part of the name so two requests arriving
    # within the same second do not overwrite each other's output.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    wav_filepath = os.path.join("outputs", f"kabyle_{timestamp}.wav")
    wavfile.write(wav_filepath, adjusted_rate, np.asarray(waveform, dtype=np.float32))
    return wav_filepath
def process_input(input_type, typed_text, uploaded_file, speed):
    """Turn the selected input (typed text or uploaded file) into speech.

    Args:
        input_type: The selected radio choice; the "Type Text" choice reads
            ``typed_text``, anything else reads ``uploaded_file``.
        typed_text: Text from the textbox (may be ``None`` or empty).
        uploaded_file: The uploaded ``.txt``/``.pdf``; either a path string or
            an object exposing the path as ``.name``.
        speed: Speech speed multiplier forwarded to ``synthesize``.

    Returns:
        A tuple ``(gr.Audio, gr.File, gr.File)`` for the player, the WAV
        download and the MP3 download. The MP3 component is hidden when the
        MP3 conversion failed.

    Raises:
        ValueError: Missing upload, unsupported extension, unreadable file, or
            empty text.
        RuntimeError: If synthesis or building the outputs fails.
    """
    # Choose source
    if input_type == "π Type Text":
        final_text = typed_text or ""
    else:
        if uploaded_file is None:
            raise ValueError("Please upload a file.")

        # NOTE(review): the upload is assumed to be either a plain path string
        # or an object with a `.name` path, depending on Gradio version - confirm.
        file_path = getattr(uploaded_file, "name", uploaded_file)
        file_extension = os.path.splitext(file_path)[1].lower()

        # Reject unsupported types up front so this message is not re-wrapped
        # as a "Could not read file" error by the handler below.
        if file_extension not in (".txt", ".pdf"):
            raise ValueError("Unsupported file type. Please upload a .txt or .pdf file.")

        try:
            if file_extension == ".txt":
                with open(file_path, "r", encoding="utf-8") as f:
                    final_text = f.read()
            else:
                with pdfplumber.open(file_path) as pdf:
                    # extract_text() may yield nothing for a page without a
                    # text layer; skip those pages instead of joining None.
                    page_texts = (page.extract_text() for page in pdf.pages)
                    final_text = " ".join(t for t in page_texts if t)
        except Exception as e:
            raise ValueError(f"Could not read file: {e}") from e

    if not final_text.strip():
        raise ValueError("Input text is empty.")

    # Truncate long texts. No marker is appended: everything in final_text is
    # sent to the TTS model, so a "[truncated]" suffix would end up in the audio.
    max_chars = 1000
    if len(final_text) > max_chars:
        final_text = final_text[:max_chars]

    # Generate audio
    try:
        wav_path = synthesize(final_text, speed)

        # Convert WAV to MP3; convert_wav_to_mp3 returns None when it fails.
        mp3_target = os.path.splitext(wav_path)[0] + ".mp3"
        mp3_path = convert_wav_to_mp3(wav_path, mp3_target)

        # Return components acting as updates for the three outputs. A failed
        # MP3 conversion only hides the MP3 download; the WAV is still served.
        return (
            gr.Audio(value=wav_path, label="Generated Speech", autoplay=False),
            gr.File(value=wav_path, visible=True),
            gr.File(value=mp3_path, visible=mp3_path is not None),
        )
    except Exception as e:
        raise RuntimeError(f"Synthesis failed: {str(e)}") from e
# Interface definition: input controls in the left column, the audio player and
# download links in the right column, followed by the event wiring.
with gr.Blocks(title="π£οΈ Kabyle TTS") as demo:
    gr.Markdown("# π΅ Kabyle Text-to-Speech")
    gr.Markdown("Convert text to speech using Meta's MMS-TTS model for Kabyle.")

    with gr.Row():
        # Left column: choose the input source, provide the text, set the speed.
        with gr.Column():
            input_type = gr.Radio(
                ["π Type Text", "π Upload File"],
                value="π Type Text",
                label="Input Method",
            )
            typed_text = gr.Textbox(
                label="Enter Text",
                placeholder="Example: Azul fell-ay! Kaci tazmamt.",
                lines=6,
                visible=True,
            )
            uploaded_file = gr.File(
                label="Upload .txt or .pdf",
                file_types=['.txt', '.pdf'],
                visible=False,
            )
            speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speech Speed")
            btn = gr.Button("π Generate Speech", variant="primary")

        # Right column: playback plus the download components, which start
        # hidden and are shown by the generate handler's outputs.
        with gr.Column():
            output_audio = gr.Audio(label="Generated Speech", autoplay=False)
            gr.Markdown("### πΎ Download Audio")
            file_wav = gr.File(label="WAV Download", file_types=['.wav'], visible=False)
            file_mp3 = gr.File(label="MP3 Download", file_types=['.mp3'], visible=False)

    def toggle_inputs(choice):
        """Show the textbox or the file upload depending on the selected method."""
        show_text = choice == "π Type Text"
        show_file = choice == "π Upload File"
        return gr.update(visible=show_text), gr.update(visible=show_file)

    def _hide_downloads():
        """Hide both download components (used whenever the input changes)."""
        return gr.update(visible=False), gr.update(visible=False)

    # Swap the visible input widget when the radio selection changes.
    input_type.change(fn=toggle_inputs, inputs=input_type, outputs=[typed_text, uploaded_file])

    # Main event: generate speech and fill the player and download components.
    btn.click(
        fn=process_input,
        inputs=[input_type, typed_text, uploaded_file, speed],
        outputs=[output_audio, file_wav, file_mp3],
        queue=True,
    )

    # Any edit to the input hides the download components again.
    typed_text.change(fn=_hide_downloads, inputs=[], outputs=[file_wav, file_mp3])
    uploaded_file.change(fn=_hide_downloads, inputs=[], outputs=[file_wav, file_mp3])

    gr.HTML("""
<hr style="margin:20px 0; border-top:1px solid #ddd;">
<p style="text-align:center; color:#666;">
Powered by
<a href="https://huggingface.co/facebook/mms-tts-kab" target="_blank">facebook/mms-tts-kab</a>
</p>
""")

# Start the server only when the file is executed directly as a script.
if __name__ == "__main__":
    demo.launch(server_port=7860, debug=True)