Update app.py
app.py CHANGED
@@ -10,8 +10,15 @@ import math
 allowed_medias = [".png", ".jpg", ".jpeg", ".bmp", ".gif", ".tiff"]
 API_URL = "https://text.pollinations.ai/openai"
 
+def convert_to_wav(audio_path):
+    wav_path = Path(audio_path).with_suffix(".wav")
+    cmd = ["ffmpeg", "-y", "-i", str(audio_path), "-ar", "16000", "-ac", "1", str(wav_path)]
+    subprocess.run(cmd, check=True)
+    return wav_path
+
 def transcribe_audio(audio_path):
-    with open(audio_path, "rb") as f:
+    wav_file = convert_to_wav(audio_path)
+    with open(wav_file, "rb") as f:
         audio_data = base64.b64encode(f.read()).decode()
     payload = {
         "model": "openai-audio",
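The new `convert_to_wav` helper shells out to ffmpeg to normalize any uploaded audio to 16 kHz mono WAV before it is base64-encoded for transcription. It relies on `pathlib.Path` and `subprocess`, which the hunk context (only `import math` is visible) does not show. A minimal, self-contained sketch of the same conversion step, assuming ffmpeg is available on PATH:

```python
import subprocess
from pathlib import Path

def convert_to_wav(audio_path):
    # Write a 16 kHz mono WAV next to the input file (same stem, .wav suffix).
    wav_path = Path(audio_path).with_suffix(".wav")
    cmd = ["ffmpeg", "-y", "-i", str(audio_path), "-ar", "16000", "-ac", "1", str(wav_path)]
    # check=True raises CalledProcessError if ffmpeg exits non-zero,
    # so a failed conversion surfaces instead of silently producing no file.
    subprocess.run(cmd, check=True)
    return wav_path
```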
@@ -26,7 +33,6 @@ def transcribe_audio(audio_path):
     response = requests.post(API_URL, json=payload)
     response.raise_for_status()
     result = response.json()
-    # API liefert Text in choices[0].message.content
     text = result['choices'][0]['message']['content']
     return text
 
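The endpoint is called like an OpenAI-style chat completion, and the transcript is read from `choices[0]['message']['content']`. A hedged sketch of a slightly more defensive version of that read, assuming the same response shape the diff relies on (the error handling is illustrative and not part of this change):

```python
def extract_transcript(result):
    # Assumes the OpenAI-style shape used in app.py:
    # {"choices": [{"message": {"content": "..."}}]}
    try:
        return result["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError) as exc:
        raise ValueError(f"Unexpected transcription response: {result!r}") from exc
```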
@@ -41,7 +47,6 @@ def generate_slideshow_with_audio(images, audio_file, duration_per_image=3, y_po
     # Transkription, falls Audio vorhanden
     if audio_file:
         transcript = transcribe_audio(audio_file.name)
-        # Einfach in gleiche Länge wie Bilder aufteilen
         words = transcript.split()
         total_words = len(words)
         segments_per_image = math.ceil(total_words / len(images))
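With the explanatory comment removed, the subtitle logic is only visible through `segments_per_image = math.ceil(total_words / len(images))`: the transcript is split into roughly equal word chunks, one per image. A small worked sketch of that chunking; the slicing itself is not shown in the hunk, so the caption loop here is an assumption about how the segments are consumed:

```python
import math

transcript = "ein kurzer Beispieltext mit genau zehn einzelnen Woertern als Transkript"
images = ["a.png", "b.png", "c.png"]

words = transcript.split()
total_words = len(words)                                   # 10
segments_per_image = math.ceil(total_words / len(images))  # ceil(10 / 3) = 4

# One caption per image: 4 + 4 + 2 words.
captions = [
    " ".join(words[i * segments_per_image:(i + 1) * segments_per_image])
    for i in range(len(images))
]
print(captions)
```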
@@ -136,7 +141,7 @@ with gr.Blocks() as demo:
     gr.Markdown("# Slideshow mit Audio & automatischen Untertiteln")
 
     img_input = gr.Files(label="Bilder auswählen (mehrere)", file_types=allowed_medias)
-    audio_input = gr.File(label="Audio hinzufügen (
+    audio_input = gr.File(label="Audio hinzufügen (MP3, WAV, M4A, OGG ... optional)")
     duration_input = gr.Number(value=3, label="Dauer pro Bild in Sekunden", precision=1)
     fade_input = gr.Number(value=0.7, label="Fade Dauer in Sekunden", precision=1)
     ypos_input = gr.Slider(minimum=0.0, maximum=0.9, step=0.01, value=0.5, label="Y-Position für alle Texte (0=oben, 0.5=mitte, 0.9=unten)")
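This hunk only changes the label of the audio upload component; the surrounding inputs are unchanged. For context, a hedged sketch of how such components are typically wired up in a `gr.Blocks` app. The button, output, and click wiring are outside this hunk, so `generate_btn` and `video_output` are hypothetical names and the stub function stands in for the real one in app.py:

```python
import gradio as gr

allowed_medias = [".png", ".jpg", ".jpeg", ".bmp", ".gif", ".tiff"]

def generate_slideshow_with_audio(images, audio_file, duration_per_image=3):
    # Stub standing in for the real implementation in app.py.
    ...

with gr.Blocks() as demo:
    img_input = gr.Files(label="Bilder auswählen (mehrere)", file_types=allowed_medias)
    audio_input = gr.File(label="Audio hinzufügen (MP3, WAV, M4A, OGG ... optional)")
    duration_input = gr.Number(value=3, label="Dauer pro Bild in Sekunden", precision=1)

    # Hypothetical wiring, not shown in the diff:
    video_output = gr.Video(label="Ergebnis")
    generate_btn = gr.Button("Slideshow erstellen")
    generate_btn.click(
        fn=generate_slideshow_with_audio,
        inputs=[img_input, audio_input, duration_input],
        outputs=video_output,
    )
```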