# GLM-TTS Hugging Face Space: app.py
import gradio as gr
import os
import sys
import subprocess
import json
import tempfile
from pathlib import Path
from huggingface_hub import snapshot_download
import warnings
warnings.filterwarnings('ignore')
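# Overall flow of this Space: clone the GLM-TTS repo at startup, download the
# model checkpoints from the Hugging Face Hub, then shell out to the repo's
# glmtts_inference.py for each request and serve the results through Gradio.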
# Setup GLM-TTS environment
def setup_glm_tts():
    """Download and set up the GLM-TTS repository."""
    glm_tts_dir = Path("./GLM-TTS")

    if not glm_tts_dir.exists():
        print("📥 Cloning GLM-TTS repository...")
        subprocess.run(
            ["git", "clone", "https://github.com/zai-org/GLM-TTS.git"],
            check=True
        )
        print("✅ GLM-TTS repository cloned")

    # Install additional requirements
    glm_requirements = glm_tts_dir / "requirements.txt"
    if glm_requirements.exists():
        print("📦 Installing GLM-TTS requirements...")
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "-q", "-r", str(glm_requirements)],
            check=False
        )
        print("✅ GLM-TTS requirements installed")

    # Add the repository to the Python path so its modules are importable
    if str(glm_tts_dir) not in sys.path:
        sys.path.insert(0, str(glm_tts_dir))

    return glm_tts_dir
print("πŸ”§ Setting up GLM-TTS environment...")
GLM_TTS_DIR = setup_glm_tts()
# Download models
def download_models():
    """Download GLM-TTS models from HuggingFace"""
    model_dir = Path("./ckpt")

    if not model_dir.exists():
        print("📥 Downloading GLM-TTS models from HuggingFace (~8.9GB)...")
        print("⏳ This will take several minutes on first run...")
        snapshot_download(
            repo_id="zai-org/GLM-TTS",
            local_dir=str(model_dir),
            local_dir_use_symlinks=False,
            resume_download=True
        )
        print("✅ Models downloaded successfully!")
    else:
        print("✅ Models already downloaded")

    return model_dir
MODEL_DIR = download_models()
# Create example directory structure
def setup_examples():
    """Set up the example directories used by the inference script."""
    example_dir = GLM_TTS_DIR / "examples"
    example_dir.mkdir(exist_ok=True)

    prompt_dir = example_dir / "prompt"
    prompt_dir.mkdir(exist_ok=True)

    return example_dir, prompt_dir
EXAMPLE_DIR, PROMPT_DIR = setup_examples()
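# EXAMPLE_DIR holds the per-request JSONL job files read by the inference script;
# PROMPT_DIR holds copies of uploaded reference clips used for voice cloning.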
def generate_speech_glmtts(text, ref_audio_path=None, speed=1.0):
    """
    Generate speech by invoking the GLM-TTS inference script.

    Args:
        text: Text to synthesize.
        ref_audio_path: Optional reference audio for voice cloning.
        speed: Speech speed (not directly supported yet; reserved for future use).

    Returns:
        tuple: (audio_path, status_message)
    """
    try:
        # Create a temporary JSONL file with the input
        temp_jsonl = tempfile.NamedTemporaryFile(
            mode='w',
            suffix='.jsonl',
            dir=EXAMPLE_DIR,
            delete=False
        )

        # Prepare the data entry
        data_entry = {
            "text": text,
            "prompt_text": "",
            "prompt_audio": ""
        }

        # Handle reference audio if provided
        if ref_audio_path and os.path.exists(ref_audio_path):
            # Copy the reference audio into the prompt directory
            import shutil
            ref_filename = f"ref_{os.path.basename(ref_audio_path)}"
            ref_dest = PROMPT_DIR / ref_filename
            shutil.copy2(ref_audio_path, ref_dest)
            data_entry["prompt_audio"] = str(ref_dest)
            print(f"✓ Using reference audio: {ref_dest}")

        # Write the entry to the JSONL file
        json.dump(data_entry, temp_jsonl)
        temp_jsonl.write('\n')
        temp_jsonl.close()

        jsonl_path = Path(temp_jsonl.name)
        data_name = jsonl_path.stem

        print(f"🎙️ Synthesizing: '{text[:100]}...'")
        print(f"📝 Using data file: {data_name}")
        # Run GLM-TTS inference
        inference_script = "glmtts_inference.py"
        cmd = [
            sys.executable,
            inference_script,
            f"--data={data_name}",
            "--exp_name=_gradio_demo",
            "--use_cache"
        ]
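        # Assumption about the glmtts_inference.py CLI, mirrored from how this app
        # uses it: --data is the JSONL stem resolved under the repo's examples/
        # directory, and --exp_name is appended to that stem when naming the
        # outputs/ subdirectory that is searched for WAV files below.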
print(f"πŸ”„ Running inference...")
print(f"Command: {' '.join(cmd)}")
# Change to GLM-TTS directory for execution
result = subprocess.run(
cmd,
cwd=str(GLM_TTS_DIR),
capture_output=True,
text=True,
timeout=300 # 5 minute timeout
)
if result.returncode != 0:
error_msg = f"Inference failed:\n{result.stderr}\n{result.stdout}"
print(f"❌ {error_msg}")
return None, f"❌ Error: {error_msg}"
print("βœ… Inference completed")
print(f"Output:\n{result.stdout}")
        # Find the generated audio file
        output_dir = GLM_TTS_DIR / "outputs" / f"{data_name}_gradio_demo"
        if not output_dir.exists():
            return None, f"❌ Output directory not found: {output_dir}"

        # Look for .wav files in the output directory
        wav_files = list(output_dir.glob("*.wav"))
        if not wav_files:
            return None, f"❌ No audio files generated in {output_dir}"

        # Return the first wav file found
        output_audio = str(wav_files[0])
        print(f"✅ Audio generated: {output_audio}")
        return output_audio, "✅ Success! Audio generated successfully."
    except subprocess.TimeoutExpired:
        return None, "❌ Error: Inference timed out (>5 minutes). CPU inference is very slow."
    except Exception as e:
        import traceback
        error_msg = f"❌ Error: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return None, error_msg
    finally:
        # Clean up the temporary JSONL file
        try:
            if os.path.exists(temp_jsonl.name):
                os.unlink(temp_jsonl.name)
        except Exception:
            pass
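# Minimal local-debugging sketch (timings and output paths depend on the
# environment; a GPU is strongly recommended):
#   audio_path, status = generate_speech_glmtts("Hello world!", ref_audio_path=None)
#   print(status, audio_path)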
# Gradio interface
def generate_speech(text, ref_audio, speed):
    """Gradio interface function"""
    if not text or len(text.strip()) == 0:
        return None, "⚠️ Please enter text to synthesize"

    # Call GLM-TTS
    audio_path, message = generate_speech_glmtts(
        text=text,
        ref_audio_path=ref_audio,
        speed=speed
    )
    return audio_path, message
# Create Gradio Interface
with gr.Blocks(
    title="GLM-TTS Voice Cloning",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {max-width: 1200px !important}
    .status-box {font-family: monospace; font-size: 11px; max-height: 400px; overflow-y: auto;}
    """
) as demo:
    gr.Markdown("""
    # 🎙️ GLM-TTS: Zero-Shot Voice Cloning & Text-to-Speech

    **State-of-the-art voice cloning** with just 3-10 seconds of audio!

    ### ⚡ Features:
    - 🎯 **Zero-shot cloning** - Clone any voice without training
    - 🌏 **Bilingual** - Excellent Chinese & good English support
    - 🎭 **Emotion control** - Natural & expressive speech
    - ⚡ **High quality** - Best-in-class among open-source (CER: 0.89)

    ### ⚠️ Important Notes:
    - **CPU inference is VERY slow** (5-15 minutes per generation)
    - **First generation takes longer** as models initialize
    - For production use, **a GPU is strongly recommended**
    """)
    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="📝 Text to Synthesize",
                placeholder="Enter text here (Chinese or English)...\n\nExample: Hello! This is GLM-TTS voice cloning.",
                lines=6,
                value="Hello! This is GLM-TTS, a powerful text-to-speech system with zero-shot voice cloning."
            )

            with gr.Accordion("🎵 Voice Cloning (Optional)", open=True):
                ref_audio_input = gr.Audio(
                    label="Reference Audio (3-10 seconds)",
                    type="filepath",
                    sources=["upload", "microphone"]
                )
                gr.Markdown("""
                *Upload audio of the voice you want to clone.*
                - **Leave empty for the default voice**
                - **3-10 seconds recommended**
                - **Clear audio with minimal noise**
                """)

            with gr.Accordion("⚙️ Settings", open=False):
                speed_slider = gr.Slider(
                    label="Speech Speed (not yet implemented)",
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    interactive=False,
                    info="Speed control coming soon"
                )

            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

            gr.Markdown("""
            ### 💡 Tips:
            - The first generation can take **10-15 minutes** on CPU
            - Subsequent generations: **5-10 minutes**
            - Be patient - the quality is worth the wait!
            """)
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="🔊 Generated Speech",
                type="filepath"
            )
            status_output = gr.Textbox(
                label="📊 Status / Logs",
                lines=12,
                interactive=False,
                elem_classes=["status-box"]
            )
    # Examples
    gr.Markdown("### 📚 Try These Examples")
    gr.Examples(
        examples=[
            ["Hello! Welcome to GLM-TTS.", None, 1.0],
            ["欢迎使用GLM-TTS语音合成系统!", None, 1.0],
            ["Artificial intelligence is transforming our world.", None, 1.0],
            ["人工智能正在改变世界。", None, 1.0],
            ["This is a test of zero-shot voice cloning technology.", None, 1.0],
            ["There's a new tool that allows you to clone his voice with just about 10 or 12, maybe 13 or 15 seconds of audio! And it's actually a really good clone! Check this out!", None, 1.0],
        ],
        inputs=[text_input, ref_audio_input, speed_slider],
        outputs=[audio_output, status_output],
        fn=generate_speech,
        cache_examples=False,
    )
    gr.Markdown("""
    ---
    ### 🎯 How It Works:
    1. **Enter text** in Chinese or English (or mixed)
    2. **Optional**: upload 3-10s of reference audio to clone that voice
    3. **Click Generate** and wait (be patient on CPU!)
    4. **Download** your generated audio

    ### 💡 Best Practices:
    - **Reference audio**: clear, with minimal background noise
    - **Length**: 3-10 seconds is optimal for cloning
    - **Languages**: Chinese is strongest; English is well supported
    - **Mixed text**: Chinese and English can be combined in the same sentence

    ### 🔗 Resources:
    - [GitHub Repository](https://github.com/zai-org/GLM-TTS)
    - [Model on HuggingFace](https://huggingface.co/zai-org/GLM-TTS)
    - [Official Demo](https://audio.z.ai)

    ### 📊 Performance Benchmarks:
    | Metric | GLM-TTS | GLM-TTS-RL |
    |--------|---------|------------|
    | CER ↓  | 1.03    | **0.89** (best open-source) |
    | SIM ↑  | 76.1    | 76.4 |

    ### 📄 Citation:
    ```bibtex
    @misc{glmtts2025,
      title={GLM-TTS: Controllable & Emotion-Expressive Zero-shot TTS},
      author={CogAudio Group, Zhipu AI},
      year={2025}
    }
    ```
    """)
    # Connect the button to the generation function
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, ref_audio_input, speed_slider],
        outputs=[audio_output, status_output],
        api_name="generate"
    )
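    # Hedged sketch: calling this endpoint programmatically with gradio_client.
    # The Space id below is a placeholder; adjust it to wherever this app is deployed.
    #
    #   from gradio_client import Client
    #   client = Client("<user>/glm-tts")
    #   audio_path, status = client.predict(
    #       "Hello from the API!",   # text
    #       None,                    # reference audio filepath (or None)
    #       1.0,                     # speed (currently unused)
    #       api_name="/generate",
    #   )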
# Launch
if __name__ == "__main__":
    print("🚀 Starting Gradio interface...")
    print(f"📁 GLM-TTS directory: {GLM_TTS_DIR}")
    print(f"📁 Model directory: {MODEL_DIR}")
    print(f"📁 Example directory: {EXAMPLE_DIR}")

    demo.queue(max_size=5)  # Limit queue size for CPU
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
        ssr_mode=False  # Disable SSR to prevent asyncio event loop cleanup errors
    )