video-ffmpeg

Running

App Files Files Community

Tim13ekd committed 6 days ago

Commit

fef9da1

verified ·

1 Parent(s): 500f777

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -8

app.py CHANGED Viewed

@@ -6,6 +6,7 @@ import subprocess
 import requests
 import base64
 import math
 # Erlaubte Dateiformate
 allowed_medias = [".png", ".jpg", ".jpeg", ".bmp", ".gif", ".tiff"]
@@ -13,14 +14,28 @@ allowed_audios = [".mp3", ".wav", ".m4a", ".ogg"]
 API_URL = "https://text.pollinations.ai/openai"
 def convert_to_wav(audio_path):
     wav_path = Path(audio_path).with_suffix(".wav")
     cmd = ["ffmpeg", "-y", "-i", str(audio_path), "-ar", "16000", "-ac", "1", str(wav_path)]
-    subprocess.run(cmd, check=True)
     return wav_path
-def transcribe_audio(audio_path):
-    wav_file = convert_to_wav(audio_path)
     with open(wav_file, "rb") as f:
         audio_data = base64.b64encode(f.read()).decode()
     payload = {
@@ -49,7 +64,7 @@ def generate_slideshow_with_audio(images, audio_file, duration_per_image=3, y_po
     # Transkription, falls Audio vorhanden
     if audio_file:
-        transcript = transcribe_audio(audio_file.name)
         words = transcript.split()
         total_words = len(words)
         segments_per_image = math.ceil(total_words / len(images))
@@ -58,9 +73,12 @@ def generate_slideshow_with_audio(images, audio_file, duration_per_image=3, y_po
             start = i * segments_per_image
             end = min((i + 1) * segments_per_image, total_words)
             texts.append(" ".join(words[start:end]))
     else:
         texts = [""] * len(images)
     for i, img_path in enumerate(images):
         clip_path = Path(temp_dir) / f"clip_{i}.mp4"
         text = texts[i] if i < len(texts) else ""
@@ -72,7 +90,7 @@ def generate_slideshow_with_audio(images, audio_file, duration_per_image=3, y_po
         )
         if text:
-            # Escape problematische Zeichen
             safe_text = text.replace(":", "\\:").replace("'", "\\'").replace(",", "\\,")
             drawtext_filter = (
                 f",drawtext=text='{safe_text}':fontcolor=white:fontsize={font_size}:borderw=2:"
@@ -120,13 +138,13 @@ def generate_slideshow_with_audio(images, audio_file, duration_per_image=3, y_po
         return None, f"❌ FFmpeg Concat Fehler:\n{e.stderr}"
     # Audio hinzufügen, falls vorhanden
-    if audio_file:
         final_output = Path(temp_dir) / f"slideshow_audio_{uuid.uuid4().hex}.mp4"
         cmd_audio = [
             "ffmpeg",
             "-y",
             "-i", str(output_file),
-            "-i", audio_file.name,
             "-c:v", "copy",
             "-c:a", "aac",
             "-shortest",
@@ -163,4 +181,4 @@ with gr.Blocks() as demo:
         outputs=[out_video, status]
     )
-demo.launch()

 import requests
 import base64
 import math
+import shutil
 # Erlaubte Dateiformate
 allowed_medias = [".png", ".jpg", ".jpeg", ".bmp", ".gif", ".tiff"]
 API_URL = "https://text.pollinations.ai/openai"
def save_temp_audio(audio_file):
    """Copy an uploaded audio file object into a fresh temporary directory.

    The copy is named ``input<ext>``, where ``<ext>`` is the suffix of
    ``audio_file.name`` when its lower-cased form is listed in
    ``allowed_audios``; otherwise ``.mp3`` is used.

    Args:
        audio_file: A readable, seekable binary file object that exposes a
            ``name`` attribute.

    Returns:
        ``Path`` of the newly written copy.
    """
    suffix = Path(audio_file.name).suffix
    # Fall back to .mp3 when the suffix is missing or not an allowed format.
    suffix = suffix if suffix.lower() in allowed_audios else ".mp3"
    target = Path(tempfile.mkdtemp()) / ("input" + suffix)
    # Rewind so the whole file is copied even if it was read before.
    audio_file.seek(0)
    with target.open("wb") as sink:
        shutil.copyfileobj(audio_file, sink)
    return target
 def convert_to_wav(audio_path):
     """Convert an audio file to a 16 kHz mono WAV file using FFmpeg.

     The output is written next to the input with a ``.wav`` suffix. If the
     input already has a ``.wav`` suffix, the output is named
     ``<stem>_16k.wav`` instead, so that input and output are never the same
     file.

     Args:
         audio_path: Path (``str`` or ``Path``) of the audio file to convert.

     Returns:
         ``Path`` of the converted WAV file.

     Raises:
         subprocess.CalledProcessError: If FFmpeg exits with a non-zero
             status; its captured output is available on the exception.
     """
     source = Path(audio_path)
     wav_path = source.with_suffix(".wav")
     if source.suffix.lower() == ".wav":
         # FFmpeg cannot edit a file in place: it aborts when the output path
         # equals the input path, so pick a distinct name for WAV inputs.
         wav_path = source.with_name(f"{source.stem}_16k.wav")
     cmd = ["ffmpeg", "-y", "-i", str(audio_path), "-ar", "16000", "-ac", "1", str(wav_path)]
     # Capture output as text so a failure carries FFmpeg's stderr.
     subprocess.run(cmd, check=True, capture_output=True, text=True)
     return wav_path
+def transcribe_audio(audio_file):
+    temp_audio = save_temp_audio(audio_file)
+    wav_file = convert_to_wav(temp_audio)
     with open(wav_file, "rb") as f:
         audio_data = base64.b64encode(f.read()).decode()
     payload = {
     # Transkription, falls Audio vorhanden
     if audio_file:
+        transcript = transcribe_audio(audio_file)
         words = transcript.split()
         total_words = len(words)
         segments_per_image = math.ceil(total_words / len(images))
             start = i * segments_per_image
             end = min((i + 1) * segments_per_image, total_words)
             texts.append(" ".join(words[start:end]))
+        temp_audio_file = save_temp_audio(audio_file)
     else:
         texts = [""] * len(images)
+        temp_audio_file = None
+    # Einzelne Clips erstellen
     for i, img_path in enumerate(images):
         clip_path = Path(temp_dir) / f"clip_{i}.mp4"
         text = texts[i] if i < len(texts) else ""
         )
         if text:
+            # Escape problematischer Zeichen für FFmpeg
             safe_text = text.replace(":", "\\:").replace("'", "\\'").replace(",", "\\,")
             drawtext_filter = (
                 f",drawtext=text='{safe_text}':fontcolor=white:fontsize={font_size}:borderw=2:"
         return None, f"❌ FFmpeg Concat Fehler:\n{e.stderr}"
     # Audio hinzufügen, falls vorhanden
+    if temp_audio_file:
         final_output = Path(temp_dir) / f"slideshow_audio_{uuid.uuid4().hex}.mp4"
         cmd_audio = [
             "ffmpeg",
             "-y",
             "-i", str(output_file),
+            "-i", str(temp_audio_file),
             "-c:v", "copy",
             "-c:a", "aac",
             "-shortest",
         outputs=[out_video, status]
     )
+demo.launch()