# GLM-TTS Hugging Face Space: app.py
import gradio as gr
import os
import sys
import subprocess
import json
import tempfile
from pathlib import Path
from huggingface_hub import snapshot_download
import warnings
warnings.filterwarnings('ignore')
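# Overall flow of this Space: clone the GLM-TTS repo at startup, download the
# model checkpoints from the Hugging Face Hub, then shell out to the repo's
# glmtts_inference.py for each request and serve the results through Gradio.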
# Setup GLM-TTS environment
def setup_glm_tts():
    """Download and set up the GLM-TTS repository."""
    glm_tts_dir = Path("./GLM-TTS")

    if not glm_tts_dir.exists():
        print("📥 Cloning GLM-TTS repository...")
        subprocess.run(
            ["git", "clone", "https://github.com/zai-org/GLM-TTS.git"],
            check=True
        )
        print("✅ GLM-TTS repository cloned")

    # Install additional requirements
    glm_requirements = glm_tts_dir / "requirements.txt"
    if glm_requirements.exists():
        print("📦 Installing GLM-TTS requirements...")
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "-q", "-r", str(glm_requirements)],
            check=False
        )
        print("✅ GLM-TTS requirements installed")

    # Add the repository to the Python path so its modules are importable
    if str(glm_tts_dir) not in sys.path:
        sys.path.insert(0, str(glm_tts_dir))

    return glm_tts_dir
print("πŸ”§ Setting up GLM-TTS environment...")
GLM_TTS_DIR = setup_glm_tts()
# Download models
def download_models():
    """Download GLM-TTS models from HuggingFace"""
    model_dir = Path("./ckpt")

    if not model_dir.exists():
        print("📥 Downloading GLM-TTS models from HuggingFace (~8.9GB)...")
        print("⏳ This will take several minutes on first run...")
        snapshot_download(
            repo_id="zai-org/GLM-TTS",
            local_dir=str(model_dir),
            local_dir_use_symlinks=False,
            resume_download=True
        )
        print("✅ Models downloaded successfully!")
    else:
        print("✅ Models already downloaded")

    return model_dir
MODEL_DIR = download_models()
# Create example directory structure
def setup_examples():
    """Set up the example directories used by the inference script."""
    example_dir = GLM_TTS_DIR / "examples"
    example_dir.mkdir(exist_ok=True)

    prompt_dir = example_dir / "prompt"
    prompt_dir.mkdir(exist_ok=True)

    return example_dir, prompt_dir
EXAMPLE_DIR, PROMPT_DIR = setup_examples()
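# EXAMPLE_DIR holds the per-request JSONL job files read by the inference script;
# PROMPT_DIR holds copies of uploaded reference clips used for voice cloning.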
def generate_speech_glmtts(text, ref_audio_path=None, speed=1.0):
    """
    Generate speech by invoking the GLM-TTS inference script.

    Args:
        text: Text to synthesize.
        ref_audio_path: Optional reference audio for voice cloning.
        speed: Speech speed (not directly supported yet; reserved for future use).

    Returns:
        tuple: (audio_path, status_message)
    """
    try:
        # Create a temporary JSONL file with the input
        temp_jsonl = tempfile.NamedTemporaryFile(
            mode='w',
            suffix='.jsonl',
            dir=EXAMPLE_DIR,
            delete=False
        )

        # Prepare the data entry
        data_entry = {
            "text": text,
            "prompt_text": "",
            "prompt_audio": ""
        }

        # Handle reference audio if provided
        if ref_audio_path and os.path.exists(ref_audio_path):
            # Copy the reference audio into the prompt directory
            import shutil
            ref_filename = f"ref_{os.path.basename(ref_audio_path)}"
            ref_dest = PROMPT_DIR / ref_filename
            shutil.copy2(ref_audio_path, ref_dest)
            data_entry["prompt_audio"] = str(ref_dest)
            print(f"✓ Using reference audio: {ref_dest}")

        # Write the entry to the JSONL file
        json.dump(data_entry, temp_jsonl)
        temp_jsonl.write('\n')
        temp_jsonl.close()

        jsonl_path = Path(temp_jsonl.name)
        data_name = jsonl_path.stem

        print(f"🎙️ Synthesizing: '{text[:100]}...'")
        print(f"📝 Using data file: {data_name}")
        # Run GLM-TTS inference
        inference_script = "glmtts_inference.py"
        cmd = [
            sys.executable,
            inference_script,
            f"--data={data_name}",
            "--exp_name=_gradio_demo",
            "--use_cache"
        ]
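        # Assumption about the glmtts_inference.py CLI, mirrored from how this app
        # uses it: --data is the JSONL stem resolved under the repo's examples/
        # directory, and --exp_name is appended to that stem when naming the
        # outputs/ subdirectory that is searched for WAV files below.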
print(f"πŸ”„ Running inference...")
print(f"Command: {' '.join(cmd)}")
# Change to GLM-TTS directory for execution
result = subprocess.run(
cmd,
cwd=str(GLM_TTS_DIR),
capture_output=True,
text=True,
timeout=300 # 5 minute timeout
)
if result.returncode != 0:
error_msg = f"Inference failed:\n{result.stderr}\n{result.stdout}"
print(f"❌ {error_msg}")
return None, f"❌ Error: {error_msg}"
print("βœ… Inference completed")
print(f"Output:\n{result.stdout}")
        # Find the generated audio file
        output_dir = GLM_TTS_DIR / "outputs" / f"{data_name}_gradio_demo"
        if not output_dir.exists():
            return None, f"❌ Output directory not found: {output_dir}"

        # Look for .wav files in the output directory
        wav_files = list(output_dir.glob("*.wav"))
        if not wav_files:
            return None, f"❌ No audio files generated in {output_dir}"

        # Return the first wav file found
        output_audio = str(wav_files[0])
        print(f"✅ Audio generated: {output_audio}")
        return output_audio, "✅ Success! Audio generated successfully."
    except subprocess.TimeoutExpired:
        return None, "❌ Error: Inference timed out (>5 minutes). CPU inference is very slow."
    except Exception as e:
        import traceback
        error_msg = f"❌ Error: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return None, error_msg
    finally:
        # Clean up the temporary JSONL file
        try:
            if os.path.exists(temp_jsonl.name):
                os.unlink(temp_jsonl.name)
        except Exception:
            pass
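# Minimal local-debugging sketch (timings and output paths depend on the
# environment; a GPU is strongly recommended):
#   audio_path, status = generate_speech_glmtts("Hello world!", ref_audio_path=None)
#   print(status, audio_path)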
# Gradio interface
def generate_speech(text, ref_audio, speed):
    """Gradio interface function"""
    if not text or len(text.strip()) == 0:
        return None, "⚠️ Please enter text to synthesize"

    # Call GLM-TTS
    audio_path, message = generate_speech_glmtts(
        text=text,
        ref_audio_path=ref_audio,
        speed=speed
    )
    return audio_path, message
# Create Gradio Interface
with gr.Blocks(
    title="GLM-TTS Voice Cloning",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {max-width: 1200px !important}
    .status-box {font-family: monospace; font-size: 11px; max-height: 400px; overflow-y: auto;}
    """
) as demo:
    gr.Markdown("""
    # 🎙️ GLM-TTS: Zero-Shot Voice Cloning & Text-to-Speech

    **State-of-the-art voice cloning** with just 3-10 seconds of audio!

    ### ⚡ Features:
    - 🎯 **Zero-shot cloning** - Clone any voice without training
    - 🌏 **Bilingual** - Excellent Chinese & good English support
    - 🎭 **Emotion control** - Natural & expressive speech
    - ⚡ **High quality** - Best-in-class among open-source (CER: 0.89)

    ### ⚠️ Important Notes:
    - **CPU inference is VERY slow** (5-15 minutes per generation)
    - **First generation takes longer** as models initialize
    - For production use, **a GPU is strongly recommended**
    """)
    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="📝 Text to Synthesize",
                placeholder="Enter text here (Chinese or English)...\n\nExample: Hello! This is GLM-TTS voice cloning.",
                lines=6,
                value="Hello! This is GLM-TTS, a powerful text-to-speech system with zero-shot voice cloning."
            )

            with gr.Accordion("🎵 Voice Cloning (Optional)", open=True):
                ref_audio_input = gr.Audio(
                    label="Reference Audio (3-10 seconds)",
                    type="filepath",
                    sources=["upload", "microphone"]
                )
                gr.Markdown("""
                *Upload audio of the voice you want to clone.*
                - **Leave empty for the default voice**
                - **3-10 seconds recommended**
                - **Clear audio with minimal noise**
                """)

            with gr.Accordion("⚙️ Settings", open=False):
                speed_slider = gr.Slider(
                    label="Speech Speed (not yet implemented)",
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    interactive=False,
                    info="Speed control coming soon"
                )

            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

            gr.Markdown("""
            ### 💡 Tips:
            - The first generation can take **10-15 minutes** on CPU
            - Subsequent generations: **5-10 minutes**
            - Be patient - the quality is worth the wait!
            """)
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="🔊 Generated Speech",
                type="filepath"
            )
            status_output = gr.Textbox(
                label="📊 Status / Logs",
                lines=12,
                interactive=False,
                elem_classes=["status-box"]
            )
    # Examples
    gr.Markdown("### 📚 Try These Examples")
    gr.Examples(
        examples=[
            ["Hello! Welcome to GLM-TTS.", None, 1.0],
            ["欢迎使用GLM-TTS语音合成系统!", None, 1.0],
            ["Artificial intelligence is transforming our world.", None, 1.0],
            ["人工智能正在改变世界。", None, 1.0],
            ["This is a test of zero-shot voice cloning technology.", None, 1.0],
            ["There's a new tool that allows you to clone his voice with just about 10 or 12, maybe 13 or 15 seconds of audio! And it's actually a really good clone! Check this out!", None, 1.0],
        ],
        inputs=[text_input, ref_audio_input, speed_slider],
        outputs=[audio_output, status_output],
        fn=generate_speech,
        cache_examples=False,
    )
    gr.Markdown("""
    ---
    ### 🎯 How It Works:
    1. **Enter text** in Chinese or English (or mixed)
    2. **Optional**: upload 3-10s of reference audio to clone that voice
    3. **Click Generate** and wait (be patient on CPU!)
    4. **Download** your generated audio

    ### 💡 Best Practices:
    - **Reference audio**: clear, with minimal background noise
    - **Length**: 3-10 seconds is optimal for cloning
    - **Languages**: Chinese is strongest; English is well supported
    - **Mixed text**: Chinese and English can be combined in the same sentence

    ### 🔗 Resources:
    - [GitHub Repository](https://github.com/zai-org/GLM-TTS)
    - [Model on HuggingFace](https://huggingface.co/zai-org/GLM-TTS)
    - [Official Demo](https://audio.z.ai)

    ### 📊 Performance Benchmarks:
    | Metric | GLM-TTS | GLM-TTS-RL |
    |--------|---------|------------|
    | CER ↓  | 1.03    | **0.89** (best open-source) |
    | SIM ↑  | 76.1    | 76.4 |

    ### 📄 Citation:
    ```bibtex
    @misc{glmtts2025,
      title={GLM-TTS: Controllable & Emotion-Expressive Zero-shot TTS},
      author={CogAudio Group, Zhipu AI},
      year={2025}
    }
    ```
    """)
    # Connect the button to the generation function
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, ref_audio_input, speed_slider],
        outputs=[audio_output, status_output],
        api_name="generate"
    )
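    # Hedged sketch: calling this endpoint programmatically with gradio_client.
    # The Space id below is a placeholder; adjust it to wherever this app is deployed.
    #
    #   from gradio_client import Client
    #   client = Client("<user>/glm-tts")
    #   audio_path, status = client.predict(
    #       "Hello from the API!",   # text
    #       None,                    # reference audio filepath (or None)
    #       1.0,                     # speed (currently unused)
    #       api_name="/generate",
    #   )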
# Launch
if __name__ == "__main__":
    print("🚀 Starting Gradio interface...")
    print(f"📁 GLM-TTS directory: {GLM_TTS_DIR}")
    print(f"📁 Model directory: {MODEL_DIR}")
    print(f"📁 Example directory: {EXAMPLE_DIR}")

    demo.queue(max_size=5)  # Limit queue size for CPU
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
        ssr_mode=False  # Disable SSR to prevent asyncio event loop cleanup errors
    )