Update app.py
app.py CHANGED
@@ -10,8 +10,15 @@ import math
 allowed_medias = [".png", ".jpg", ".jpeg", ".bmp", ".gif", ".tiff"]
 API_URL = "https://text.pollinations.ai/openai"
 
+def convert_to_wav(audio_path):
+    wav_path = Path(audio_path).with_suffix(".wav")
+    cmd = ["ffmpeg", "-y", "-i", str(audio_path), "-ar", "16000", "-ac", "1", str(wav_path)]
+    subprocess.run(cmd, check=True)
+    return wav_path
+
 def transcribe_audio(audio_path):
-    with open(audio_path, "rb") as f:
+    wav_file = convert_to_wav(audio_path)
+    with open(wav_file, "rb") as f:
         audio_data = base64.b64encode(f.read()).decode()
     payload = {
         "model": "openai-audio",
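The new `convert_to_wav` helper shells out to ffmpeg to normalize any uploaded audio to 16 kHz mono WAV before it is base64-encoded for transcription. It relies on `pathlib.Path` and `subprocess`, which the hunk context (only `import math` is visible) does not show. A minimal, self-contained sketch of the same conversion step, assuming ffmpeg is available on PATH:

```python
import subprocess
from pathlib import Path

def convert_to_wav(audio_path):
    # Write a 16 kHz mono WAV next to the input file (same stem, .wav suffix).
    wav_path = Path(audio_path).with_suffix(".wav")
    cmd = ["ffmpeg", "-y", "-i", str(audio_path), "-ar", "16000", "-ac", "1", str(wav_path)]
    # check=True raises CalledProcessError if ffmpeg exits non-zero,
    # so a failed conversion surfaces instead of silently producing no file.
    subprocess.run(cmd, check=True)
    return wav_path
```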
@@ -26,7 +33,6 @@ def transcribe_audio(audio_path):
     response = requests.post(API_URL, json=payload)
     response.raise_for_status()
     result = response.json()
-    # API liefert Text in choices[0].message.content
     text = result['choices'][0]['message']['content']
     return text
 
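The endpoint is called like an OpenAI-style chat completion, and the transcript is read from `choices[0]['message']['content']`. A hedged sketch of a slightly more defensive version of that read, assuming the same response shape the diff relies on (the error handling is illustrative and not part of this change):

```python
def extract_transcript(result):
    # Assumes the OpenAI-style shape used in app.py:
    # {"choices": [{"message": {"content": "..."}}]}
    try:
        return result["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError) as exc:
        raise ValueError(f"Unexpected transcription response: {result!r}") from exc
```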
@@ -41,7 +47,6 @@ def generate_slideshow_with_audio(images, audio_file, duration_per_image=3, y_po
     # Transkription, falls Audio vorhanden
     if audio_file:
         transcript = transcribe_audio(audio_file.name)
-        # Einfach in gleiche Länge wie Bilder aufteilen
         words = transcript.split()
         total_words = len(words)
         segments_per_image = math.ceil(total_words / len(images))
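With the explanatory comment removed, the subtitle logic is only visible through `segments_per_image = math.ceil(total_words / len(images))`: the transcript is split into roughly equal word chunks, one per image. A small worked sketch of that chunking; the slicing itself is not shown in the hunk, so the caption loop here is an assumption about how the segments are consumed:

```python
import math

transcript = "ein kurzer Beispieltext mit genau zehn einzelnen Woertern als Transkript"
images = ["a.png", "b.png", "c.png"]

words = transcript.split()
total_words = len(words)                                   # 10
segments_per_image = math.ceil(total_words / len(images))  # ceil(10 / 3) = 4

# One caption per image: 4 + 4 + 2 words.
captions = [
    " ".join(words[i * segments_per_image:(i + 1) * segments_per_image])
    for i in range(len(images))
]
print(captions)
```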
@@ -136,7 +141,7 @@ with gr.Blocks() as demo:
     gr.Markdown("# Slideshow mit Audio & automatischen Untertiteln")
 
     img_input = gr.Files(label="Bilder auswählen (mehrere)", file_types=allowed_medias)
-    audio_input = gr.File(label="Audio hinzufügen (
+    audio_input = gr.File(label="Audio hinzufügen (MP3, WAV, M4A, OGG ... optional)")
     duration_input = gr.Number(value=3, label="Dauer pro Bild in Sekunden", precision=1)
     fade_input = gr.Number(value=0.7, label="Fade Dauer in Sekunden", precision=1)
     ypos_input = gr.Slider(minimum=0.0, maximum=0.9, step=0.01, value=0.5, label="Y-Position für alle Texte (0=oben, 0.5=mitte, 0.9=unten)")
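This hunk only changes the label of the audio upload component; the surrounding inputs are unchanged. For context, a hedged sketch of how such components are typically wired up in a `gr.Blocks` app. The button, output, and click wiring are outside this hunk, so `generate_btn` and `video_output` are hypothetical names and the stub function stands in for the real one in app.py:

```python
import gradio as gr

allowed_medias = [".png", ".jpg", ".jpeg", ".bmp", ".gif", ".tiff"]

def generate_slideshow_with_audio(images, audio_file, duration_per_image=3):
    # Stub standing in for the real implementation in app.py.
    ...

with gr.Blocks() as demo:
    img_input = gr.Files(label="Bilder auswählen (mehrere)", file_types=allowed_medias)
    audio_input = gr.File(label="Audio hinzufügen (MP3, WAV, M4A, OGG ... optional)")
    duration_input = gr.Number(value=3, label="Dauer pro Bild in Sekunden", precision=1)

    # Hypothetical wiring, not shown in the diff:
    video_output = gr.Video(label="Ergebnis")
    generate_btn = gr.Button("Slideshow erstellen")
    generate_btn.click(
        fn=generate_slideshow_with_audio,
        inputs=[img_input, audio_input, duration_input],
        outputs=video_output,
    )
```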