Tim13ekd committed
Commit 6b5b2bf · verified · 1 Parent(s): c9a86ba

Update app.py

Files changed (1):
  1. app.py +9 -4
app.py CHANGED
@@ -10,8 +10,15 @@ import math
 allowed_medias = [".png", ".jpg", ".jpeg", ".bmp", ".gif", ".tiff"]
 API_URL = "https://text.pollinations.ai/openai"
 
+def convert_to_wav(audio_path):
+    wav_path = Path(audio_path).with_suffix(".wav")
+    cmd = ["ffmpeg", "-y", "-i", str(audio_path), "-ar", "16000", "-ac", "1", str(wav_path)]
+    subprocess.run(cmd, check=True)
+    return wav_path
+
 def transcribe_audio(audio_path):
-    with open(audio_path, "rb") as f:
+    wav_file = convert_to_wav(audio_path)
+    with open(wav_file, "rb") as f:
         audio_data = base64.b64encode(f.read()).decode()
     payload = {
         "model": "openai-audio",
@@ -26,7 +33,6 @@ def transcribe_audio(audio_path):
     response = requests.post(API_URL, json=payload)
     response.raise_for_status()
     result = response.json()
-    # API returns the text in choices[0].message.content
     text = result['choices'][0]['message']['content']
     return text
 
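
The flat indexing into the JSON assumes the endpoint mirrors the OpenAI chat-completions response shape. A hypothetical defensive variant of the two parsing lines (the RuntimeError wrapper is an illustration, not part of this commit):

result = response.json()
try:
    text = result["choices"][0]["message"]["content"]
except (KeyError, IndexError, TypeError) as e:
    # Surfaces error payloads instead of a bare KeyError
    raise RuntimeError(f"Unexpected transcription response: {result}") from e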
@@ -41,7 +47,6 @@ def generate_slideshow_with_audio(images, audio_file, duration_per_image=3, y_po
     # Transcription, if an audio file is present
     if audio_file:
         transcript = transcribe_audio(audio_file.name)
-        # Simply split into equal parts, one per image
         words = transcript.split()
         total_words = len(words)
         segments_per_image = math.ceil(total_words / len(images))
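
As a worked example of the split: 95 transcript words over 4 images gives math.ceil(95 / 4) = 24 words per image, so the slices cover words 0-23, 24-47, 48-71, and the last image gets the remaining 23. A hypothetical continuation showing how such slices could be built (the captions list is illustrative; the hunk does not show how segments_per_image is consumed downstream):

# Illustrative only -- one caption chunk per image
captions = [
    " ".join(words[i * segments_per_image:(i + 1) * segments_per_image])
    for i in range(len(images))
]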
@@ -136,7 +141,7 @@ with gr.Blocks() as demo:
     gr.Markdown("# Slideshow with audio & automatic subtitles")
 
     img_input = gr.Files(label="Select images (multiple)", file_types=allowed_medias)
-    audio_input = gr.File(label="Add audio (optional, WAV)", file_types=[".wav"])
+    audio_input = gr.File(label="Add audio (MP3, WAV, M4A, OGG ... optional)")
     duration_input = gr.Number(value=3, label="Duration per image in seconds", precision=1)
     fade_input = gr.Number(value=0.7, label="Fade duration in seconds", precision=1)
     ypos_input = gr.Slider(minimum=0.0, maximum=0.9, step=0.01, value=0.5, label="Y position for all texts (0=top, 0.5=middle, 0.9=bottom)")
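
Dropping file_types=[".wav"] lets Gradio accept any audio container, with convert_to_wav normalizing it before transcription. A hypothetical wiring of these inputs inside the same gr.Blocks() context (the button, output component, and argument order below are assumptions; the function signature is truncated in the hunk header and the click handler sits outside this diff):

# Hypothetical -- component names and input order are not shown in the diff
generate_btn = gr.Button("Create slideshow")
video_output = gr.Video(label="Result")
generate_btn.click(
    generate_slideshow_with_audio,
    inputs=[img_input, audio_input, duration_input, fade_input, ypos_input],
    outputs=video_output,
)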
 