# Hugging Face Spaces page header (scrape artifact) — Space status: Running
| import gradio as gr | |
| import os | |
| import sys | |
| import subprocess | |
| import json | |
| import tempfile | |
| from pathlib import Path | |
| from huggingface_hub import snapshot_download | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
# Setup GLM-TTS environment
def setup_glm_tts():
    """Clone the GLM-TTS repository, install its requirements, and put it on sys.path.

    Returns:
        Path: The local GLM-TTS checkout directory (./GLM-TTS).

    Raises:
        subprocess.CalledProcessError: If the git clone fails (check=True).
    """
    glm_tts_dir = Path("./GLM-TTS")

    if not glm_tts_dir.exists():
        print("π₯ Cloning GLM-TTS repository...")
        subprocess.run(
            ["git", "clone", "https://github.com/zai-org/GLM-TTS.git"],
            check=True
        )
        print("β GLM-TTS repository cloned")

        # Install additional requirements (only on first clone)
        glm_requirements = glm_tts_dir / "requirements.txt"
        if glm_requirements.exists():
            print("π¦ Installing GLM-TTS requirements...")
            result = subprocess.run(
                [sys.executable, "-m", "pip", "install", "-q", "-r", str(glm_requirements)],
                check=False
            )
            # Fix: the original printed the success message unconditionally even
            # when pip failed; report success only when pip actually succeeded.
            if result.returncode == 0:
                print("β GLM-TTS requirements installed")
            else:
                print(f"pip install exited with code {result.returncode}; continuing anyway")

    # Add to Python path so GLM-TTS modules can be imported
    if str(glm_tts_dir) not in sys.path:
        sys.path.insert(0, str(glm_tts_dir))

    return glm_tts_dir

print("π§ Setting up GLM-TTS environment...")
GLM_TTS_DIR = setup_glm_tts()
# Download models
def download_models():
    """Download the GLM-TTS model weights from the Hugging Face Hub.

    Skips the (~8.9 GB) download when ./ckpt already exists.

    Returns:
        Path: The local checkpoint directory (./ckpt).
    """
    model_dir = Path("./ckpt")

    if not model_dir.exists():
        print("π₯ Downloading GLM-TTS models from HuggingFace (~8.9GB)...")
        print("β³ This will take several minutes on first run...")
        # NOTE: `local_dir_use_symlinks` and `resume_download` were removed here;
        # both are deprecated no-ops in recent huggingface_hub — downloads into
        # local_dir are real files and interrupted downloads resume automatically.
        snapshot_download(
            repo_id="zai-org/GLM-TTS",
            local_dir=str(model_dir),
        )
        print("β Models downloaded successfully!")
    else:
        print("β Models already downloaded")

    return model_dir

MODEL_DIR = download_models()
# Create example directory structure
def setup_examples():
    """Create the examples/ and examples/prompt/ directories under the GLM-TTS checkout.

    Returns:
        tuple[Path, Path]: (example_dir, prompt_dir).
    """
    example_dir = GLM_TTS_DIR / "examples"
    # parents=True makes this robust even if intermediate directories are missing
    # (plain exist_ok=True raises FileNotFoundError when the parent is absent).
    example_dir.mkdir(parents=True, exist_ok=True)
    prompt_dir = example_dir / "prompt"
    prompt_dir.mkdir(parents=True, exist_ok=True)
    return example_dir, prompt_dir

EXAMPLE_DIR, PROMPT_DIR = setup_examples()
def generate_speech_glmtts(text, ref_audio_path=None, speed=1.0):
    """
    Generate speech using the GLM-TTS inference script.

    Writes a one-line JSONL request file into EXAMPLE_DIR, shells out to
    glmtts_inference.py inside the GLM-TTS checkout, and returns the first
    .wav found in the run's output directory.

    Args:
        text: Text to synthesize.
        ref_audio_path: Optional reference audio for voice cloning.
        speed: Speech speed (not directly supported, for future use).

    Returns:
        tuple: (audio_path, status_message); audio_path is None on failure.
    """
    # Fix: define before the try so the finally-cleanup can never hit an
    # unbound name when NamedTemporaryFile itself raises.
    temp_jsonl = None
    try:
        # Create a temporary JSONL file with the input
        temp_jsonl = tempfile.NamedTemporaryFile(
            mode='w',
            suffix='.jsonl',
            dir=EXAMPLE_DIR,
            delete=False
        )

        # Prepare data entry
        data_entry = {
            "text": text,
            "prompt_text": "",
            "prompt_audio": ""
        }

        # Handle reference audio if provided
        if ref_audio_path and os.path.exists(ref_audio_path):
            # Copy reference audio to prompt directory
            import shutil
            ref_filename = f"ref_{os.path.basename(ref_audio_path)}"
            ref_dest = PROMPT_DIR / ref_filename
            shutil.copy2(ref_audio_path, ref_dest)
            data_entry["prompt_audio"] = str(ref_dest)
            print(f"β Using reference audio: {ref_dest}")

        # Write to JSONL (one JSON object per line)
        json.dump(data_entry, temp_jsonl)
        temp_jsonl.write('\n')
        temp_jsonl.close()

        # The inference script locates the request file by its stem name
        jsonl_path = Path(temp_jsonl.name)
        data_name = jsonl_path.stem

        print(f"ποΈ Synthesizing: '{text[:100]}...'")
        print(f"π Using data file: {data_name}")

        # Run GLM-TTS inference
        inference_script = "glmtts_inference.py"
        cmd = [
            sys.executable,
            str(inference_script),
            f"--data={data_name}",
            "--exp_name=_gradio_demo",
            "--use_cache"
        ]

        print("π Running inference...")  # fix: f-string had no placeholders
        print(f"Command: {' '.join(cmd)}")

        # Change to GLM-TTS directory for execution
        result = subprocess.run(
            cmd,
            cwd=str(GLM_TTS_DIR),
            capture_output=True,
            text=True,
            timeout=300  # 5 minute timeout
        )

        if result.returncode != 0:
            error_msg = f"Inference failed:\n{result.stderr}\n{result.stdout}"
            print(f"β {error_msg}")
            return None, f"β Error: {error_msg}"

        print("β Inference completed")
        print(f"Output:\n{result.stdout}")

        # Find the generated audio file: outputs/<data_name><exp_name>/
        output_dir = GLM_TTS_DIR / "outputs" / f"{data_name}_gradio_demo"
        if not output_dir.exists():
            return None, f"β Output directory not found: {output_dir}"

        # Look for .wav files in output directory
        wav_files = list(output_dir.glob("*.wav"))
        if not wav_files:
            return None, f"β No audio files generated in {output_dir}"

        # Return the first wav file found
        output_audio = str(wav_files[0])
        print(f"β Audio generated: {output_audio}")
        return output_audio, "β Success! Audio generated successfully."

    except subprocess.TimeoutExpired:
        return None, "β Error: Inference timeout (>5 minutes). CPU inference is very slow."
    except Exception as e:
        import traceback
        error_msg = f"β Error: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return None, error_msg
    finally:
        # Cleanup temp file. Fix: guard the None case and catch only OS errors
        # instead of a bare except that would also swallow KeyboardInterrupt.
        try:
            if temp_jsonl is not None and os.path.exists(temp_jsonl.name):
                os.unlink(temp_jsonl.name)
        except OSError:
            pass
# Gradio interface
def generate_speech(text, ref_audio, speed):
    """Gradio callback: reject empty input, then delegate to GLM-TTS.

    Returns a (audio_path, status_message) tuple for the two output widgets.
    """
    # Guard clause: nothing to synthesize
    if not text or not text.strip():
        return None, "β οΈ Please enter text to synthesize"
    # Call GLM-TTS and pass its (audio, message) result straight through
    return generate_speech_glmtts(
        text=text,
        ref_audio_path=ref_audio,
        speed=speed,
    )
# Create Gradio Interface
# Layout: header markdown, then a two-column row (inputs left, outputs right),
# followed by clickable examples and a documentation footer. The generate
# button is wired to generate_speech at the bottom of the context.
with gr.Blocks(
    title="GLM-TTS Voice Cloning",
    theme=gr.themes.Soft(),
    css="""
.gradio-container {max-width: 1200px !important}
.status-box {font-family: monospace; font-size: 11px; max-height: 400px; overflow-y: auto;}
"""
) as demo:
    # Page header / feature overview
    gr.Markdown("""
# ποΈ GLM-TTS: Zero-Shot Voice Cloning & Text-to-Speech
**State-of-the-art voice cloning** with just 3-10 seconds of audio!
### β‘ Features:
- π― **Zero-shot cloning** - Clone any voice without training
- π **Bilingual** - Excellent Chinese & good English support
- π **Emotion control** - Natural & expressive speech
- β‘ **High quality** - Best-in-class among open-source (CER: 0.89)
### β οΈ Important Notes:
- **CPU inference is VERY slow** (5-15 minutes per generation)
- **First generation takes longer** as models initialize
- For production use, **GPU is strongly recommended**
""")
    with gr.Row():
        # Left column: all user inputs
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="π Text to Synthesize",
                placeholder="Enter text here (Chinese or English)...\n\nExample: Hello! This is GLM-TTS voice cloning.",
                lines=6,
                value="Hello! This is GLM-TTS, a powerful text-to-speech system with zero-shot voice cloning."
            )
            with gr.Accordion("π΅ Voice Cloning (Optional)", open=True):
                # filepath type hands the callback a path on disk, not raw samples
                ref_audio_input = gr.Audio(
                    label="Reference Audio (3-10 seconds)",
                    type="filepath",
                    sources=["upload", "microphone"]
                )
                gr.Markdown("""
*Upload audio of the voice you want to clone.*
- **Leave empty for default voice**
- **3-10 seconds recommended**
- **Clear audio with minimal noise**
""")
            with gr.Accordion("βοΈ Settings", open=False):
                # Placeholder control: the speed value is accepted by the
                # backend signature but not used by inference yet.
                speed_slider = gr.Slider(
                    label="Speech Speed (not yet implemented)",
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    interactive=False,
                    info="Speed control coming soon"
                )
            generate_btn = gr.Button("π΅ Generate Speech", variant="primary", size="lg")
            gr.Markdown("""
### π‘ Tips:
- First generation will take **10-15 minutes** on CPU
- Subsequent generations: **5-10 minutes**
- Be patient - the quality is worth the wait!
""")
        # Right column: generated audio and status log
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="π Generated Speech",
                type="filepath"
            )
            status_output = gr.Textbox(
                label="π Status / Logs",
                lines=12,
                interactive=False,
                elem_classes=["status-box"]
            )
    # Examples
    gr.Markdown("### π Try These Examples")
    gr.Examples(
        examples=[
            ["Hello! Welcome to GLM-TTS.", None, 1.0],
            ["ζ¬’θΏδ½Ώη¨GLM-TTSθ―ι³εζη³»η»οΌ", None, 1.0],
            ["Artificial intelligence is transforming our world.", None, 1.0],
            ["δΊΊε·₯ζΊθ½ζ£ε¨ζΉεδΈηγ", None, 1.0],
            ["This is a test of zero-shot voice cloning technology.", None, 1.0],
            ["There's a new tool that allows you to clone his voice with just about 10 or 12, maybe 13 or 15 seconds of audio! And it's actually a really good clone! Check this out!", None, 1.0],
        ],
        inputs=[text_input, ref_audio_input, speed_slider],
        outputs=[audio_output, status_output],
        fn=generate_speech,
        # CPU inference is far too slow to pre-compute example outputs
        cache_examples=False,
    )
    # Footer: usage guide, links, benchmarks, citation
    gr.Markdown("""
---
### π― How It Works:
1. **Enter text** in Chinese or English (or mixed)
2. **Optional**: Upload 3-10s of reference audio to clone that voice
3. **Click Generate** and wait (be patient on CPU!)
4. **Download** your generated audio
### π‘ Best Practices:
- **Reference audio**: Clear, minimal background noise
- **Length**: 3-10 seconds is optimal for cloning
- **Languages**: Chinese is strongest, English is well-supported
- **Mixed text**: Can handle Chinese-English in same sentence
### π Resources:
- [GitHub Repository](https://github.com/zai-org/GLM-TTS)
- [Model on HuggingFace](https://huggingface.co/zai-org/GLM-TTS)
- [Official Demo](https://audio.z.ai)
### π Performance Benchmarks:
| Metric | GLM-TTS | GLM-TTS-RL |
|--------|---------|------------|
| CER β | 1.03 | **0.89** (best open-source) |
| SIM β | 76.1 | 76.4 |
### π Citation:
```bibtex
@misc{glmtts2025,
title={GLM-TTS: Controllable & Emotion-Expressive Zero-shot TTS},
author={CogAudio Group, Zhipu AI},
year={2025}
}
```
""")
    # Connect button (event listeners must be registered inside the Blocks context)
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, ref_audio_input, speed_slider],
        outputs=[audio_output, status_output],
        api_name="generate"
    )
# Launch
if __name__ == "__main__":
    # Startup diagnostics: show the directories resolved during module import
    print("π Starting Gradio interface...")
    print(f"π GLM-TTS directory: {GLM_TTS_DIR}")
    print(f"π Model directory: {MODEL_DIR}")
    print(f"π Example directory: {EXAMPLE_DIR}")
    demo.queue(max_size=5)  # Limit queue size for CPU
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces
        server_port=7860,
        share=False,
        show_error=True,
        ssr_mode=False  # Disable SSR to prevent asyncio event loop cleanup errors
    )