video-ffmpeg

Sleeping

File size: 10,360 Bytes

e5b621e
085d5e6
1b78077
e5b621e
1b78077
fef9da1
c5cfcb5
e5b621e
377308b
 
8320d85
 
500f777
e5b621e
500f777
 
c5cfcb5
 
 
 
 
 
 
 
 
 
 
 
1bd9ab8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7644c1e
 
1bd9ab8
 
 
 
 
 
c5cfcb5
1ce3011
377308b
77f3dab
377308b
8320d85
77f3dab
8320d85
 
377308b
1ce3011
 
377308b
 
 
 
 
7644c1e
377308b
 
 
 
 
 
 
8320d85
1ce3011
8320d85
1ce3011
377308b
1ce3011
 
 
8320d85
c5cfcb5
7644c1e
ea1c088
 
c9a86ba
2a9840f
7644c1e
ad4cab5
c5cfcb5
7644c1e
 
1ce3011
7644c1e
 
 
1ce3011
7644c1e
 
 
1ce3011
 
 
fd9d93c
8320d85
1bd9ab8
7644c1e
d24cfba
8320d85
7644c1e
 
1ce3011
7644c1e
77f3dab
7644c1e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ce3011
 
7644c1e
1ce3011
 
7644c1e
 
 
 
 
 
 
 
 
 
 
 
 
312fd62
c5cfcb5
7644c1e
1ce3011
 
312fd62
c5cfcb5
312fd62
 
1ce3011
312fd62
7644c1e
 
 
 
312fd62
7644c1e
c5cfcb5
 
312fd62
 
 
c5cfcb5
 
 
 
 
 
 
312fd62
c5cfcb5
1ce3011
7644c1e
1ce3011
7644c1e
 
1ce3011
bbb5565
7644c1e
 
ad4cab5
c5cfcb5
ad4cab5
c5cfcb5
 
ad4cab5
 
1ce3011
7644c1e
1ce3011
7644c1e
 
1ce3011
7644c1e
 
 
 
c5cfcb5
ad4cab5
7644c1e
 
36bfe7e
 
e5b621e
c5cfcb5
36bfe7e
c5cfcb5
 
7644c1e
 
b6a8e09
c5cfcb5
7644c1e
1ce3011
7644c1e
c5cfcb5
1ce3011
c5cfcb5
 
 
 
 
 
0b567d9
c5cfcb5
 
0b567d9
 
c5cfcb5
 
 
8320d85
c5cfcb5
1ce3011
 
 
c5cfcb5
 
0b567d9
 
 
c5cfcb5

import gradio as gr
import tempfile
from pathlib import Path
import uuid
import subprocess
import shutil
import os

# Duration of the fade-in/fade-out applied to each individual word, in seconds (e.g. 0.2)
WORD_FADE_DURATION = 0.2
# Escape character used when escaping word text for the FFmpeg drawtext filter
FFMPEG_ESCAPE_CHAR = "\\"

# Allowed file formats (used as upload filters in the UI; the audio list is
# also checked when the uploaded audio file is copied)
allowed_medias = [".png", ".jpg", ".jpeg", ".bmp", ".gif", ".tiff"]
allowed_audios = [".mp3", ".wav", ".m4a", ".ogg"]

def get_font_path():
    """Return the first existing font file from a fixed list of Linux font paths.

    The candidates are checked in order (DejaVu Sans Bold, Liberation Sans
    Bold, FreeSans Bold). Returns None when none of them exists, in which
    case the caller leaves the font choice to FFmpeg.
    """
    candidates = (
        "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
        "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf",
        "/usr/share/fonts/truetype/freefont/FreeSansBold.ttf",
    )
    existing = (candidate for candidate in candidates if os.path.exists(candidate))
    return next(existing, None)

def save_temp_audio(audio_file_path):
    """Copy an uploaded audio file into a newly created temporary directory.

    Args:
        audio_file_path: Path of the uploaded file; a falsy value means
            "no audio".

    Returns:
        A ``(directory, file)`` tuple of ``Path`` objects, where ``directory``
        is the temporary directory the caller should delete later and ``file``
        is the copy inside it. Returns ``(None, None)`` when no path was given
        or the copy failed.
    """
    if not audio_file_path:
        return None, None

    source = Path(audio_file_path)

    # Keep the original suffix when it is an allowed audio format,
    # otherwise store the copy under an ".mp3" name.
    suffix = source.suffix if source.suffix.lower() in allowed_audios else ".mp3"

    work_dir = Path(tempfile.mkdtemp())
    target = work_dir / f"input{suffix}"

    try:
        shutil.copyfile(source, target)
    except Exception as e:
        print(f"Fehler beim Kopieren der Audiodatei: {e}")
        # Do not leave the empty temporary directory behind.
        if work_dir.exists():
            shutil.rmtree(work_dir)
        return None, None

    return work_dir, target


def create_timed_drawtext(word, start_time, duration, font_option, font_size, y_pos):
    """Build an FFmpeg ``drawtext`` filter that shows one word with a soft fade-in/out.

    Args:
        word: The text to render. Characters that are special to FFmpeg
            (backslash, ``%``, ``:`` and ``'``) are escaped.
        start_time: Time in seconds (relative to the clip start) at which the
            word starts to appear.
        duration: How long the word is shown, in seconds, including both fades.
        font_option: Pre-formatted ``:fontfile=...`` option, or an empty string.
        font_size: Font size passed to ``fontsize``.
        y_pos: Vertical position as a fraction of the free height
            (used as ``y=(h-text_h)*y_pos``).

    Returns:
        The complete ``drawtext=...`` filter string.
    """
    esc = FFMPEG_ESCAPE_CHAR

    # The text value sits inside single quotes, so the filtergraph parser
    # copies it verbatim; the option parser and drawtext's text expansion
    # then each consume one level of escaping:
    #   backslash -> 4x escape char (one literal backslash after both levels)
    #   %         -> escaped so it is not read as the start of a %{...} expansion
    #   :         -> escaped so it is not read as an option separator
    #   '         -> escaped for the option parser, then the surrounding quote
    #                is closed, a literal quote emitted and the quote reopened
    escape_table = str.maketrans({
        esc: esc * 4,
        "%": esc * 2 + "%",
        ":": esc + ":",
        "'": f"{esc}'{esc}''",
    })
    escaped_word = word.translate(escape_table)

    end_time = start_time + duration

    # Never let fade-in and fade-out overlap: for short words each fade gets
    # at most half of the display time.
    fade = min(WORD_FADE_DURATION, duration / 2)

    if fade > 0:
        fade_in_end = start_time + fade
        fade_out_start = end_time - fade
        # Opacity over time t: 0 before the word, linear ramp up, fully
        # opaque, linear ramp down, 0 afterwards.
        alpha_expression = (
            f"if(lt(t,{start_time}), 0, "
            f"if(lt(t,{fade_in_end}), (t-{start_time})/{fade}, "
            f"if(lt(t,{fade_out_start}), 1, "
            f"if(lt(t,{end_time}), ({end_time}-t)/{fade}, 0))))"
        )
    else:
        # No room for a fade: plain on/off, avoiding a division by zero
        # inside the expression.
        alpha_expression = f"if(lt(t,{start_time}), 0, if(lt(t,{end_time}), 1, 0))"

    return (
        f"drawtext=text='{escaped_word}'{font_option}:fontcolor=white:fontsize={font_size}:borderw=2:bordercolor=black:"
        f"x=(w-text_w)/2:y=(h-text_h)*{y_pos}:"
        f"alpha='{alpha_expression}'"
    )


def _upload_to_path(upload):
    """Return the filesystem path of an upload given as a path or as a file-like object."""
    # Uploads may arrive either as plain path strings or as temporary-file
    # wrappers that expose their path via ``.name``; accept both.
    if isinstance(upload, (str, os.PathLike)):
        return Path(upload)
    return Path(upload.name)


def _run_ffmpeg(cmd):
    """Run an FFmpeg command; return None on success, otherwise the error text."""
    try:
        subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        return e.stderr if e.stderr is not None else str(e)
    except OSError as e:
        # The process could not be started at all (e.g. ffmpeg is not installed).
        return str(e)
    return None


def _build_clip_filter(word_segment, duration_per_word, duration_clip, fade_duration, font_option, font_size, y_pos):
    """Build the ``-vf`` filter chain (scale/pad, word overlays, image fade) for one clip."""
    base_filters = (
        "scale=w=1280:h=720:force_original_aspect_ratio=decrease,"
        "pad=1280:720:(ow-iw)/2:(oh-ih)/2:color=black,"
        "fps=25,format=yuv420p"
    )

    # Words are shown one after another; start times are relative to the clip start.
    drawtext_filters = []
    word_start_time = 0.0
    for word in word_segment:
        drawtext_filters.append(
            create_timed_drawtext(word, word_start_time, duration_per_word, font_option, font_size, y_pos)
        )
        word_start_time += duration_per_word

    # The fade-out must not start before the clip does.
    fade_out_start = duration_clip - fade_duration
    if fade_out_start < 0:
        fade_out_start = 0
    fade_img_filter = f"fade=t=in:st=0:d={fade_duration},fade=t=out:st={fade_out_start}:d={fade_duration}"

    return ",".join([base_filters, *drawtext_filters, fade_img_filter])


def generate_slideshow_with_audio(images, input_text, duration_per_word, duration_per_image, fade_duration, font_size, y_pos, audio_file):
    """Render a slideshow video from images with timed word overlays and optional audio.

    The words of ``input_text`` are distributed evenly over the images (the
    first images get one extra word when the count does not divide evenly).
    Each image becomes one clip whose length is the larger of
    ``duration_per_image`` and the time needed to show its words. The clips
    are concatenated and, if audio was supplied and could be copied, muxed
    with the audio track (cut to the shorter stream).

    Args:
        images: Uploaded images, each a path string or an object with a
            ``.name`` path attribute.
        input_text: Text whose whitespace-separated words are overlaid.
        duration_per_word: Display time per word in seconds.
        duration_per_image: Minimum clip length per image in seconds.
        fade_duration: Length of the image fade-in/out in seconds.
        font_size: Font size for the word overlay.
        y_pos: Vertical text position as a fraction (0 = top, 1 = bottom).
        audio_file: Optional uploaded audio, as a path string or an object
            with a ``.name`` path attribute.

    Returns:
        ``(video_path, status_message)``; ``video_path`` is None on failure.
        On failure all temporary directories are removed; on success only the
        directory holding the returned video is kept.
    """
    if not images:
        return None, "❌ Keine Bilder ausgewählt"

    temp_dir = Path(tempfile.mkdtemp())
    audio_temp_dir = None
    keep_temp_dir = False  # becomes True once the returned video lives in temp_dir

    try:
        words = input_text.split() if input_text else []

        # Even distribution of words over the images.
        base_words_per_clip, remainder = divmod(len(words), len(images))

        font_path = get_font_path()
        font_option = f":fontfile='{font_path}'" if font_path else ""

        temp_audio_file = None
        if audio_file:
            audio_temp_dir, temp_audio_file = save_temp_audio(str(_upload_to_path(audio_file)))

        # --- 1. Render one clip per image with its share of the words ---
        clips_with_text = []
        current_word_index = 0
        for i, image in enumerate(images):
            img_path = _upload_to_path(image)
            clip_path = temp_dir / f"clip_with_text_{i}.mp4"

            words_on_this_clip = base_words_per_clip + (1 if i < remainder else 0)
            word_segment = words[current_word_index : current_word_index + words_on_this_clip]
            current_word_index += len(word_segment)

            # The clip must be long enough for both the image and its words.
            duration_clip = max(duration_per_image, len(word_segment) * duration_per_word)

            vf_filters_clip = _build_clip_filter(
                word_segment, duration_per_word, duration_clip, fade_duration, font_option, font_size, y_pos
            )

            error = _run_ffmpeg([
                "ffmpeg", "-y", "-loop", "1", "-i", str(img_path),
                "-t", str(duration_clip),
                "-vf", vf_filters_clip,
                str(clip_path),
            ])
            if error is not None:
                return None, f"❌ FFmpeg Fehler bei Bild {i+1}:\n{error}"
            clips_with_text.append(clip_path)

        # --- 2. Concatenate the clips ---
        filelist_path = temp_dir / "filelist.txt"
        with open(filelist_path, "w", encoding="utf-8") as f:
            f.writelines(f"file '{clip}'\n" for clip in clips_with_text)

        output_video = temp_dir / f"slideshow_{uuid.uuid4().hex}.mp4"
        error = _run_ffmpeg([
            "ffmpeg", "-y", "-f", "concat", "-safe", "0",
            "-i", str(filelist_path),
            "-c:v", "libx264", "-pix_fmt", "yuv420p",
            str(output_video),
        ])
        if error is not None:
            return None, f"❌ FFmpeg Fehler beim Zusammenfügen:\n{error}"

        if not temp_audio_file:
            keep_temp_dir = True
            return str(output_video), "✅ Video erstellt (ohne Audio)"

        # --- 3. Add the audio track ---
        final_output = temp_dir / f"final_{uuid.uuid4().hex}.mp4"
        error = _run_ffmpeg([
            "ffmpeg", "-y", "-i", str(output_video), "-i", str(temp_audio_file),
            "-c:v", "copy", "-c:a", "aac", "-shortest",
            str(final_output),
        ])
        if error is not None:
            return None, f"❌ FFmpeg Fehler beim Hinzufügen von Audio:\n{error}"

        keep_temp_dir = True
        return str(final_output), "✅ Video mit Audio erstellt!"
    finally:
        # The audio copy is never needed after this function; the working
        # directory is only kept when it contains the returned video.
        if audio_temp_dir:
            shutil.rmtree(audio_temp_dir, ignore_errors=True)
        if not keep_temp_dir:
            shutil.rmtree(temp_dir, ignore_errors=True)

# Gradio UI: builds the page layout and wires the button to the generator.
# Components appear on the page in the order they are created below.
with gr.Blocks() as demo:
    gr.Markdown("# Slideshow Generator")

    # Row 1: image upload and the text whose words are overlaid
    with gr.Row():
        img_input = gr.Files(label="Bilder", file_types=allowed_medias)
        # The label explains that the words are distributed evenly across all images
        text_input = gr.Textbox(label="Text (Wörter werden gleichmäßig auf alle Bilder verteilt)", lines=5, placeholder="Jedes Wort wird für 'Dauer pro Wort' angezeigt.")
    
    # Row 2: timing and text appearance settings
    with gr.Row():
        duration_image_input = gr.Number(value=3, label="Mindest-Dauer pro BILD (s)")
        duration_word_input = gr.Number(value=1.0, label="Dauer pro WORT (s) [bestimmt Geschwindigkeit der Text-Anzeige]")
        fade_input = gr.Number(value=0.5, label="Bild-Fade Dauer (s)")
        font_size_input = gr.Number(value=80, label="Schriftgröße (px)")
        ypos_input = gr.Slider(0.0, 1.0, value=0.9, label="Y-Position (0=Oben, 1=Unten)")
    
    audio_input = gr.File(label="Audio (optional)", file_types=allowed_audios)
    btn = gr.Button("Erstellen", variant="primary")
    
    out_video = gr.Video(label="Ergebnis")
    status = gr.Textbox(label="Status")

    # The inputs list must follow the parameter order of generate_slideshow_with_audio:
    # (images, input_text, duration_per_word, duration_per_image, fade_duration, font_size, y_pos, audio_file)
    # Note that the word duration comes before the image duration, unlike the on-screen order.
    btn.click(
        fn=generate_slideshow_with_audio,
        inputs=[
            img_input, 
            text_input, 
            duration_word_input, 
            duration_image_input, 
            fade_input, 
            font_size_input, 
            ypos_input, 
            audio_input
        ],
        outputs=[out_video, status]
    )

# Starts the app as soon as this module is executed.
demo.launch()