import json
import os
import shutil
import subprocess
import sys
import tempfile
import warnings
from pathlib import Path

import gradio as gr
from huggingface_hub import snapshot_download

warnings.filterwarnings('ignore')

# Inference runs out-of-process; the UI documents CPU generations taking up to
# ~15 minutes, so the subprocess timeout must comfortably exceed that.
INFERENCE_TIMEOUT_S = 1200  # 20 minutes


# Setup GLM-TTS environment
def setup_glm_tts():
    """Clone the GLM-TTS repository (first run only) and put it on sys.path.

    Returns:
        Path: Local checkout directory of GLM-TTS.
    """
    glm_tts_dir = Path("./GLM-TTS")
    if not glm_tts_dir.exists():
        print("📥 Cloning GLM-TTS repository...")
        subprocess.run(
            ["git", "clone", "https://github.com/zai-org/GLM-TTS.git"],
            check=True,
        )
        print("✅ GLM-TTS repository cloned")
        # Install the repo's own requirements. check=False: a partially
        # failing pip install should not abort app startup (best effort).
        glm_requirements = glm_tts_dir / "requirements.txt"
        if glm_requirements.exists():
            print("📦 Installing GLM-TTS requirements...")
            subprocess.run(
                [sys.executable, "-m", "pip", "install", "-q", "-r", str(glm_requirements)],
                check=False,
            )
            print("✅ GLM-TTS requirements installed")
    # Make the checkout importable by this process.
    if str(glm_tts_dir) not in sys.path:
        sys.path.insert(0, str(glm_tts_dir))
    return glm_tts_dir


print("🔧 Setting up GLM-TTS environment...")
GLM_TTS_DIR = setup_glm_tts()


# Download models
def download_models():
    """Download GLM-TTS model weights from the HuggingFace Hub (first run only).

    Returns:
        Path: Local checkpoint directory (./ckpt).
    """
    model_dir = Path("./ckpt")
    if not model_dir.exists():
        print("📥 Downloading GLM-TTS models from HuggingFace (~8.9GB)...")
        print("⏳ This will take several minutes on first run...")
        # NOTE: `local_dir_use_symlinks` and `resume_download` are deprecated
        # in recent huggingface_hub releases; downloads resume by default and
        # local_dir copies real files, so neither kwarg is needed.
        snapshot_download(
            repo_id="zai-org/GLM-TTS",
            local_dir=str(model_dir),
        )
        print("✅ Models downloaded successfully!")
    else:
        print("✅ Models already downloaded")
    return model_dir


MODEL_DIR = download_models()


# Create example directory structure
def setup_examples():
    """Create the directories used for request files and prompt audio.

    Returns:
        tuple[Path, Path]: (examples dir, prompt-audio dir).
    """
    example_dir = GLM_TTS_DIR / "examples"
    example_dir.mkdir(exist_ok=True)
    prompt_dir = example_dir / "prompt"
    prompt_dir.mkdir(exist_ok=True)
    return example_dir, prompt_dir


EXAMPLE_DIR, PROMPT_DIR = setup_examples()


def _write_request_jsonl(text, ref_audio_path):
    """Write a one-line JSONL request file for glmtts_inference.py.

    Args:
        text: Text to synthesize.
        ref_audio_path: Optional path to a reference clip for voice cloning.

    Returns:
        Path: The temporary .jsonl file; the caller is responsible for
        deleting it.
    """
    data_entry = {
        "text": text,
        "prompt_text": "",
        "prompt_audio": "",
    }
    # Handle reference audio if provided
    if ref_audio_path and os.path.exists(ref_audio_path):
        # Copy the clip into the prompt directory so the inference script can
        # resolve it by path from its own working directory.
        ref_dest = PROMPT_DIR / f"ref_{os.path.basename(ref_audio_path)}"
        shutil.copy2(ref_audio_path, ref_dest)
        data_entry["prompt_audio"] = str(ref_dest)
        print(f"✓ Using reference audio: {ref_dest}")

    # delete=False: the file must outlive this function so the subprocess can
    # read it; generate_speech_glmtts() removes it in its finally block.
    with tempfile.NamedTemporaryFile(
        mode='w', suffix='.jsonl', dir=EXAMPLE_DIR, delete=False
    ) as temp_jsonl:
        json.dump(data_entry, temp_jsonl)
        temp_jsonl.write('\n')
    return Path(temp_jsonl.name)


def generate_speech_glmtts(text, ref_audio_path=None, speed=1.0):
    """Generate speech by shelling out to the GLM-TTS inference script.

    Args:
        text: Text to synthesize.
        ref_audio_path: Optional reference audio for voice cloning.
        speed: Speech speed (not directly supported, for future use).

    Returns:
        tuple: (audio_path or None, status_message)
    """
    jsonl_path = None  # guard: finally must not NameError if setup fails
    try:
        jsonl_path = _write_request_jsonl(text, ref_audio_path)
        data_name = jsonl_path.stem

        print(f"🎙️ Synthesizing: '{text[:100]}...'")
        print(f"📝 Using data file: {data_name}")

        # Run GLM-TTS inference
        cmd = [
            sys.executable,
            "glmtts_inference.py",
            f"--data={data_name}",
            "--exp_name=_gradio_demo",
            "--use_cache",
        ]
        print("🔄 Running inference...")
        print(f"Command: {' '.join(cmd)}")

        # The script expects to be executed from the repo root.
        result = subprocess.run(
            cmd,
            cwd=str(GLM_TTS_DIR),
            capture_output=True,
            text=True,
            timeout=INFERENCE_TIMEOUT_S,
        )

        if result.returncode != 0:
            error_msg = f"Inference failed:\n{result.stderr}\n{result.stdout}"
            print(f"❌ {error_msg}")
            return None, f"❌ Error: {error_msg}"

        print("✅ Inference completed")
        print(f"Output:\n{result.stdout}")

        # The inference script writes wavs into outputs/<data><exp_name>/.
        output_dir = GLM_TTS_DIR / "outputs" / f"{data_name}_gradio_demo"
        if not output_dir.exists():
            return None, f"❌ Output directory not found: {output_dir}"

        wav_files = list(output_dir.glob("*.wav"))
        if not wav_files:
            return None, f"❌ No audio files generated in {output_dir}"

        # Return the first wav file found
        output_audio = str(wav_files[0])
        print(f"✅ Audio generated: {output_audio}")
        return output_audio, "✅ Success! Audio generated successfully."

    except subprocess.TimeoutExpired:
        return None, "❌ Error: Inference timeout (>20 minutes). CPU inference is very slow."
    except Exception as e:
        import traceback
        error_msg = f"❌ Error: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return None, error_msg
    finally:
        # Best-effort cleanup of the temporary request file.
        if jsonl_path is not None:
            try:
                os.unlink(jsonl_path)
            except OSError:
                pass


# Gradio interface
def generate_speech(text, ref_audio, speed):
    """Gradio entry point: validate input, then delegate to GLM-TTS.

    Returns:
        tuple: (audio_path or None, status_message)
    """
    if not text or not text.strip():
        return None, "⚠️ Please enter text to synthesize"
    return generate_speech_glmtts(
        text=text,
        ref_audio_path=ref_audio,
        speed=speed,
    )


# Create Gradio Interface
with gr.Blocks(
    title="GLM-TTS Voice Cloning",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {max-width: 1200px !important}
    .status-box {font-family: monospace; font-size: 11px; max-height: 400px; overflow-y: auto;}
    """
) as demo:
    gr.Markdown("""
    # 🎙️ GLM-TTS: Zero-Shot Voice Cloning & Text-to-Speech

    **State-of-the-art voice cloning** with just 3-10 seconds of audio!

    ### ⚡ Features:
    - 🎯 **Zero-shot cloning** - Clone any voice without training
    - 🌏 **Bilingual** - Excellent Chinese & good English support
    - 🎭 **Emotion control** - Natural & expressive speech
    - ⚡ **High quality** - Best-in-class among open-source (CER: 0.89)

    ### ⚠️ Important Notes:
    - **CPU inference is VERY slow** (5-15 minutes per generation)
    - **First generation takes longer** as models initialize
    - For production use, **GPU is strongly recommended**
    """)

    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="📝 Text to Synthesize",
                placeholder="Enter text here (Chinese or English)...\n\nExample: Hello! This is GLM-TTS voice cloning.",
                lines=6,
                value="Hello! This is GLM-TTS, a powerful text-to-speech system with zero-shot voice cloning."
            )

            with gr.Accordion("🎵 Voice Cloning (Optional)", open=True):
                ref_audio_input = gr.Audio(
                    label="Reference Audio (3-10 seconds)",
                    type="filepath",
                    sources=["upload", "microphone"]
                )
                gr.Markdown("""
                *Upload audio of the voice you want to clone.*
                - **Leave empty for default voice**
                - **3-10 seconds recommended**
                - **Clear audio with minimal noise**
                """)

            with gr.Accordion("⚙️ Settings", open=False):
                speed_slider = gr.Slider(
                    label="Speech Speed (not yet implemented)",
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    interactive=False,
                    info="Speed control coming soon"
                )

            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

            gr.Markdown("""
            ### 💡 Tips:
            - First generation will take **10-15 minutes** on CPU
            - Subsequent generations: **5-10 minutes**
            - Be patient - the quality is worth the wait!
            """)

        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="🔊 Generated Speech",
                type="filepath"
            )

            status_output = gr.Textbox(
                label="📊 Status / Logs",
                lines=12,
                interactive=False,
                elem_classes=["status-box"]
            )

    # Examples
    gr.Markdown("### 📚 Try These Examples")
    gr.Examples(
        examples=[
            ["Hello! Welcome to GLM-TTS.", None, 1.0],
            ["欢迎使用GLM-TTS语音合成系统!", None, 1.0],
            ["Artificial intelligence is transforming our world.", None, 1.0],
            ["人工智能正在改变世界。", None, 1.0],
            ["This is a test of zero-shot voice cloning technology.", None, 1.0],
            ["There's a new tool that allows you to clone his voice with just about 10 or 12, maybe 13 or 15 seconds of audio! And it's actually a really good clone! Check this out!", None, 1.0],
        ],
        inputs=[text_input, ref_audio_input, speed_slider],
        outputs=[audio_output, status_output],
        fn=generate_speech,
        cache_examples=False,
    )

    gr.Markdown("""
    ---
    ### 🎯 How It Works:
    1. **Enter text** in Chinese or English (or mixed)
    2. **Optional**: Upload 3-10s of reference audio to clone that voice
    3. **Click Generate** and wait (be patient on CPU!)
    4. **Download** your generated audio

    ### 💡 Best Practices:
    - **Reference audio**: Clear, minimal background noise
    - **Length**: 3-10 seconds is optimal for cloning
    - **Languages**: Chinese is strongest, English is well-supported
    - **Mixed text**: Can handle Chinese-English in same sentence

    ### 🔗 Resources:
    - [GitHub Repository](https://github.com/zai-org/GLM-TTS)
    - [Model on HuggingFace](https://huggingface.co/zai-org/GLM-TTS)
    - [Official Demo](https://audio.z.ai)

    ### 📊 Performance Benchmarks:
    | Metric | GLM-TTS | GLM-TTS-RL |
    |--------|---------|------------|
    | CER ↓ | 1.03 | **0.89** (best open-source) |
    | SIM ↑ | 76.1 | 76.4 |

    ### 📄 Citation:
    ```bibtex
    @misc{glmtts2025,
      title={GLM-TTS: Controllable & Emotion-Expressive Zero-shot TTS},
      author={CogAudio Group, Zhipu AI},
      year={2025}
    }
    ```
    """)

    # Connect button
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, ref_audio_input, speed_slider],
        outputs=[audio_output, status_output],
        api_name="generate"
    )


# Launch
if __name__ == "__main__":
    print("🚀 Starting Gradio interface...")
    print(f"📁 GLM-TTS directory: {GLM_TTS_DIR}")
    print(f"📁 Model directory: {MODEL_DIR}")
    print(f"📁 Example directory: {EXAMPLE_DIR}")

    demo.queue(max_size=5)  # Limit queue size for CPU
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
        ssr_mode=False  # Disable SSR to prevent asyncio event loop cleanup errors
    )