import json
import os
import shutil
import subprocess
import sys
import tempfile
import warnings
from pathlib import Path

import gradio as gr
from huggingface_hub import snapshot_download

warnings.filterwarnings('ignore')

# Inference runs out-of-process; the UI documents CPU generations taking up to
# ~15 minutes, so the subprocess timeout must comfortably exceed that.
INFERENCE_TIMEOUT_S = 1200  # 20 minutes


# Setup GLM-TTS environment
def setup_glm_tts():
    """Clone the GLM-TTS repository (first run only) and put it on sys.path.

    Returns:
        Path: Local checkout directory of GLM-TTS.
    """
    glm_tts_dir = Path("./GLM-TTS")
    if not glm_tts_dir.exists():
        print("📥 Cloning GLM-TTS repository...")
        subprocess.run(
            ["git", "clone", "https://github.com/zai-org/GLM-TTS.git"],
            check=True,
        )
        print("✅ GLM-TTS repository cloned")
        # Install the repo's own requirements. check=False: a partially
        # failing pip install should not abort app startup (best effort).
        glm_requirements = glm_tts_dir / "requirements.txt"
        if glm_requirements.exists():
            print("📦 Installing GLM-TTS requirements...")
            subprocess.run(
                [sys.executable, "-m", "pip", "install", "-q", "-r", str(glm_requirements)],
                check=False,
            )
            print("✅ GLM-TTS requirements installed")
    # Make the checkout importable by this process.
    if str(glm_tts_dir) not in sys.path:
        sys.path.insert(0, str(glm_tts_dir))
    return glm_tts_dir


print("🔧 Setting up GLM-TTS environment...")
GLM_TTS_DIR = setup_glm_tts()


# Download models
def download_models():
    """Download GLM-TTS model weights from the HuggingFace Hub (first run only).

    Returns:
        Path: Local checkpoint directory (./ckpt).
    """
    model_dir = Path("./ckpt")
    if not model_dir.exists():
        print("📥 Downloading GLM-TTS models from HuggingFace (~8.9GB)...")
        print("⏳ This will take several minutes on first run...")
        # NOTE: `local_dir_use_symlinks` and `resume_download` are deprecated
        # in recent huggingface_hub releases; downloads resume by default and
        # local_dir copies real files, so neither kwarg is needed.
        snapshot_download(
            repo_id="zai-org/GLM-TTS",
            local_dir=str(model_dir),
        )
        print("✅ Models downloaded successfully!")
    else:
        print("✅ Models already downloaded")
    return model_dir


MODEL_DIR = download_models()


# Create example directory structure
def setup_examples():
    """Create the directories used for request files and prompt audio.

    Returns:
        tuple[Path, Path]: (examples dir, prompt-audio dir).
    """
    example_dir = GLM_TTS_DIR / "examples"
    example_dir.mkdir(exist_ok=True)
    prompt_dir = example_dir / "prompt"
    prompt_dir.mkdir(exist_ok=True)
    return example_dir, prompt_dir


EXAMPLE_DIR, PROMPT_DIR = setup_examples()


def _write_request_jsonl(text, ref_audio_path):
    """Write a one-line JSONL request file for glmtts_inference.py.

    Args:
        text: Text to synthesize.
        ref_audio_path: Optional path to a reference clip for voice cloning.

    Returns:
        Path: The temporary .jsonl file; the caller is responsible for
        deleting it.
    """
    data_entry = {
        "text": text,
        "prompt_text": "",
        "prompt_audio": "",
    }
    # Handle reference audio if provided
    if ref_audio_path and os.path.exists(ref_audio_path):
        # Copy the clip into the prompt directory so the inference script can
        # resolve it by path from its own working directory.
        ref_dest = PROMPT_DIR / f"ref_{os.path.basename(ref_audio_path)}"
        shutil.copy2(ref_audio_path, ref_dest)
        data_entry["prompt_audio"] = str(ref_dest)
        print(f"✓ Using reference audio: {ref_dest}")

    # delete=False: the file must outlive this function so the subprocess can
    # read it; generate_speech_glmtts() removes it in its finally block.
    with tempfile.NamedTemporaryFile(
        mode='w', suffix='.jsonl', dir=EXAMPLE_DIR, delete=False
    ) as temp_jsonl:
        json.dump(data_entry, temp_jsonl)
        temp_jsonl.write('\n')
    return Path(temp_jsonl.name)


def generate_speech_glmtts(text, ref_audio_path=None, speed=1.0):
    """Generate speech by shelling out to the GLM-TTS inference script.

    Args:
        text: Text to synthesize.
        ref_audio_path: Optional reference audio for voice cloning.
        speed: Speech speed (not directly supported, for future use).

    Returns:
        tuple: (audio_path or None, status_message)
    """
    jsonl_path = None  # guard: finally must not NameError if setup fails
    try:
        jsonl_path = _write_request_jsonl(text, ref_audio_path)
        data_name = jsonl_path.stem

        print(f"🎙️ Synthesizing: '{text[:100]}...'")
        print(f"📝 Using data file: {data_name}")

        # Run GLM-TTS inference
        cmd = [
            sys.executable,
            "glmtts_inference.py",
            f"--data={data_name}",
            "--exp_name=_gradio_demo",
            "--use_cache",
        ]
        print("🔄 Running inference...")
        print(f"Command: {' '.join(cmd)}")

        # The script expects to be executed from the repo root.
        result = subprocess.run(
            cmd,
            cwd=str(GLM_TTS_DIR),
            capture_output=True,
            text=True,
            timeout=INFERENCE_TIMEOUT_S,
        )

        if result.returncode != 0:
            error_msg = f"Inference failed:\n{result.stderr}\n{result.stdout}"
            print(f"❌ {error_msg}")
            return None, f"❌ Error: {error_msg}"

        print("✅ Inference completed")
        print(f"Output:\n{result.stdout}")

        # The inference script writes wavs into outputs/<data><exp_name>/.
        output_dir = GLM_TTS_DIR / "outputs" / f"{data_name}_gradio_demo"
        if not output_dir.exists():
            return None, f"❌ Output directory not found: {output_dir}"

        wav_files = list(output_dir.glob("*.wav"))
        if not wav_files:
            return None, f"❌ No audio files generated in {output_dir}"

        # Return the first wav file found
        output_audio = str(wav_files[0])
        print(f"✅ Audio generated: {output_audio}")
        return output_audio, "✅ Success! Audio generated successfully."

    except subprocess.TimeoutExpired:
        return None, "❌ Error: Inference timeout (>20 minutes). CPU inference is very slow."
    except Exception as e:
        import traceback
        error_msg = f"❌ Error: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return None, error_msg
    finally:
        # Best-effort cleanup of the temporary request file.
        if jsonl_path is not None:
            try:
                os.unlink(jsonl_path)
            except OSError:
                pass


# Gradio interface
def generate_speech(text, ref_audio, speed):
    """Gradio entry point: validate input, then delegate to GLM-TTS.

    Returns:
        tuple: (audio_path or None, status_message)
    """
    if not text or not text.strip():
        return None, "⚠️ Please enter text to synthesize"
    return generate_speech_glmtts(
        text=text,
        ref_audio_path=ref_audio,
        speed=speed,
    )


# Create Gradio Interface
with gr.Blocks(
    title="GLM-TTS Voice Cloning",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {max-width: 1200px !important}
    .status-box {font-family: monospace; font-size: 11px; max-height: 400px; overflow-y: auto;}
    """
) as demo:
    gr.Markdown("""
    # 🎙️ GLM-TTS: Zero-Shot Voice Cloning & Text-to-Speech

    **State-of-the-art voice cloning** with just 3-10 seconds of audio!

    ### ⚡ Features:
    - 🎯 **Zero-shot cloning** - Clone any voice without training
    - 🌏 **Bilingual** - Excellent Chinese & good English support
    - 🎭 **Emotion control** - Natural & expressive speech
    - ⚡ **High quality** - Best-in-class among open-source (CER: 0.89)

    ### ⚠️ Important Notes:
    - **CPU inference is VERY slow** (5-15 minutes per generation)
    - **First generation takes longer** as models initialize
    - For production use, **GPU is strongly recommended**
    """)

    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="📝 Text to Synthesize",
                placeholder="Enter text here (Chinese or English)...\n\nExample: Hello! This is GLM-TTS voice cloning.",
                lines=6,
                value="Hello! This is GLM-TTS, a powerful text-to-speech system with zero-shot voice cloning."
            )

            with gr.Accordion("🎵 Voice Cloning (Optional)", open=True):
                ref_audio_input = gr.Audio(
                    label="Reference Audio (3-10 seconds)",
                    type="filepath",
                    sources=["upload", "microphone"]
                )
                gr.Markdown("""
                *Upload audio of the voice you want to clone.*
                - **Leave empty for default voice**
                - **3-10 seconds recommended**
                - **Clear audio with minimal noise**
                """)

            with gr.Accordion("⚙️ Settings", open=False):
                speed_slider = gr.Slider(
                    label="Speech Speed (not yet implemented)",
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    interactive=False,
                    info="Speed control coming soon"
                )

            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

            gr.Markdown("""
            ### 💡 Tips:
            - First generation will take **10-15 minutes** on CPU
            - Subsequent generations: **5-10 minutes**
            - Be patient - the quality is worth the wait!
            """)

        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="🔊 Generated Speech",
                type="filepath"
            )

            status_output = gr.Textbox(
                label="📊 Status / Logs",
                lines=12,
                interactive=False,
                elem_classes=["status-box"]
            )

    # Examples
    gr.Markdown("### 📚 Try These Examples")
    gr.Examples(
        examples=[
            ["Hello! Welcome to GLM-TTS.", None, 1.0],
            ["欢迎使用GLM-TTS语音合成系统!", None, 1.0],
            ["Artificial intelligence is transforming our world.", None, 1.0],
            ["人工智能正在改变世界。", None, 1.0],
            ["This is a test of zero-shot voice cloning technology.", None, 1.0],
            ["There's a new tool that allows you to clone his voice with just about 10 or 12, maybe 13 or 15 seconds of audio! And it's actually a really good clone! Check this out!", None, 1.0],
        ],
        inputs=[text_input, ref_audio_input, speed_slider],
        outputs=[audio_output, status_output],
        fn=generate_speech,
        cache_examples=False,
    )

    gr.Markdown("""
    ---
    ### 🎯 How It Works:
    1. **Enter text** in Chinese or English (or mixed)
    2. **Optional**: Upload 3-10s of reference audio to clone that voice
    3. **Click Generate** and wait (be patient on CPU!)
    4. **Download** your generated audio

    ### 💡 Best Practices:
    - **Reference audio**: Clear, minimal background noise
    - **Length**: 3-10 seconds is optimal for cloning
    - **Languages**: Chinese is strongest, English is well-supported
    - **Mixed text**: Can handle Chinese-English in same sentence

    ### 🔗 Resources:
    - [GitHub Repository](https://github.com/zai-org/GLM-TTS)
    - [Model on HuggingFace](https://huggingface.co/zai-org/GLM-TTS)
    - [Official Demo](https://audio.z.ai)

    ### 📊 Performance Benchmarks:
    | Metric | GLM-TTS | GLM-TTS-RL |
    |--------|---------|------------|
    | CER ↓ | 1.03 | **0.89** (best open-source) |
    | SIM ↑ | 76.1 | 76.4 |

    ### 📄 Citation:
    ```bibtex
    @misc{glmtts2025,
      title={GLM-TTS: Controllable & Emotion-Expressive Zero-shot TTS},
      author={CogAudio Group, Zhipu AI},
      year={2025}
    }
    ```
    """)

    # Connect button
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, ref_audio_input, speed_slider],
        outputs=[audio_output, status_output],
        api_name="generate"
    )


# Launch
if __name__ == "__main__":
    print("🚀 Starting Gradio interface...")
    print(f"📁 GLM-TTS directory: {GLM_TTS_DIR}")
    print(f"📁 Model directory: {MODEL_DIR}")
    print(f"📁 Example directory: {EXAMPLE_DIR}")

    demo.queue(max_size=5)  # Limit queue size for CPU
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
        ssr_mode=False  # Disable SSR to prevent asyncio event loop cleanup errors
    )