Aid3445 committed on
Commit
674ae41
·
verified ·
1 Parent(s): 666c2bc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +135 -38
app.py CHANGED
@@ -7,6 +7,8 @@ import re
7
  import time
8
  from concurrent.futures import ThreadPoolExecutor, as_completed
9
  import gc
 
 
10
 
11
  # Fix for OpenMP duplicate library error
12
  os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
@@ -34,29 +36,94 @@ class KittenTTSGradio:
34
  if not self.model_loaded:
35
  self.load_model()
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  def load_model(self):
38
- """Load the TTS model"""
39
  if self.model_loaded:
40
  return
41
 
42
  try:
43
  print("Loading KittenTTS model...")
44
- # Try the mini model first
45
- self.model = KittenTTS("KittenML/kitten-tts-mini-0.1")
46
- self.model_loaded = True
47
- print("Model loaded successfully!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  except Exception as e:
49
- print(f"Failed to load mini model: {e}")
50
- # Try the nano model as fallback
51
- try:
52
- print("Trying nano model as fallback...")
53
- self.model = KittenTTS("KittenML/kitten-tts-nano-0.2")
54
- self.model_loaded = True
55
- print("Nano model loaded successfully!")
56
- except Exception as e2:
57
- print(f"Failed to load nano model: {e2}")
58
- self.model_loaded = False
59
- raise Exception("Failed to load any KittenTTS model")
60
 
61
  def split_into_sentences(self, text):
62
  """Split text into sentences"""
@@ -79,6 +146,19 @@ class KittenTTSGradio:
79
 
80
  return processed_sentences
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  def clean_text_for_model(self, text):
83
  """Clean text for the TTS model"""
84
  if not text:
@@ -138,7 +218,7 @@ class KittenTTSGradio:
138
  audio = self.safe_generate_audio(cleaned_sentence, voice=voice, speed=speed)
139
  return audio
140
 
141
- def convert_text_to_speech(self, text, voice, speed, use_multithreading, progress=gr.Progress()):
142
  """Main conversion function for Gradio"""
143
  # Ensure model is loaded
144
  try:
@@ -150,25 +230,31 @@ class KittenTTSGradio:
150
  raise gr.Error("Please enter some text to convert.")
151
 
152
  try:
153
- # Split into sentences
154
  sentences = self.split_into_sentences(text)
155
 
156
  if not sentences:
157
  raise gr.Error("No valid sentences found in the text.")
158
 
 
 
 
 
159
  total_sentences = len(sentences)
160
- progress(0, desc=f"Processing {total_sentences} sentences...")
161
 
162
- # Process sentences
 
 
 
163
  audio_chunks = []
164
 
165
- if use_multithreading and total_sentences > 1:
166
  # Multithreaded processing
167
  with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
168
- # Submit all sentences
169
  futures = {
170
- executor.submit(self.process_single_sentence, sentence, voice, speed): i
171
- for i, sentence in enumerate(sentences)
172
  }
173
 
174
  # Collect results in order
@@ -181,10 +267,10 @@ class KittenTTSGradio:
181
  audio = future.result()
182
  results[idx] = audio
183
  completed += 1
184
- progress(completed / total_sentences,
185
- desc=f"Processed {completed}/{total_sentences} sentences")
186
  except Exception as e:
187
- print(f"Error processing sentence: {e}")
188
  continue
189
 
190
  # Sort by index
@@ -192,14 +278,14 @@ class KittenTTSGradio:
192
  audio_chunks.append(results[i])
193
  else:
194
  # Sequential processing
195
- for i, sentence in enumerate(sentences):
196
  try:
197
- audio = self.process_single_sentence(sentence, voice, speed)
198
  audio_chunks.append(audio)
199
- progress((i + 1) / total_sentences,
200
- desc=f"Processed {i + 1}/{total_sentences} sentences")
201
  except Exception as e:
202
- print(f"Error processing sentence: {e}")
203
  continue
204
 
205
  if not audio_chunks:
@@ -224,7 +310,8 @@ class KittenTTSGradio:
224
  gc.collect()
225
 
226
  processing_method = "multithreading" if use_multithreading else "sequential"
227
- status_message = f" Successfully converted {total_sentences} sentences using {processing_method} processing!"
 
228
 
229
  return output_file.name, status_message
230
 
@@ -247,10 +334,11 @@ def create_interface():
247
  **Features:**
248
  - 8 different voice options (male and female)
249
  - Adjustable speech speed
250
- - Sentence-by-sentence processing for better quality
 
251
  - Multithreading support for faster processing
252
 
253
- **Note:** The model will load on first use (~170MB download).
254
  """)
255
 
256
  with gr.Row():
@@ -320,10 +408,19 @@ def create_interface():
320
  info="Adjust the speed of speech (1.0 = normal)"
321
  )
322
 
 
 
 
 
 
 
 
 
 
323
  multithread_checkbox = gr.Checkbox(
324
  value=True,
325
  label=f"Enable Multithreading ({app.max_workers} workers)",
326
- info="Process multiple sentences in parallel for faster conversion"
327
  )
328
 
329
  convert_btn = gr.Button(
@@ -358,7 +455,7 @@ def create_interface():
358
  # Connect the conversion function
359
  convert_btn.click(
360
  fn=app.convert_text_to_speech,
361
- inputs=[text_input, voice_dropdown, speed_slider, multithread_checkbox],
362
  outputs=[audio_output, status_output]
363
  )
364
 
@@ -370,7 +467,7 @@ def create_interface():
370
  - Processing time depends on text length, chunk size, and multithreading setting
371
  - Each voice has different characteristics - try them out!
372
  - The model runs entirely on CPU - no GPU required
373
- - First conversion will take longer as the model loads
374
 
375
  ### 🎭 Available Voices:
376
  - **expr-voice-2-m/f**: Expressive male/female voices
 
7
  import time
8
  from concurrent.futures import ThreadPoolExecutor, as_completed
9
  import gc
10
+ from huggingface_hub import hf_hub_download
11
+ import json
12
 
13
  # Fix for OpenMP duplicate library error
14
  os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
 
36
  if not self.model_loaded:
37
  self.load_model()
38
 
39
+ def download_and_load_model(self, repo_id):
40
+ """Download model files and load them"""
41
+ try:
42
+ print(f"Downloading model files from {repo_id}...")
43
+
44
+ # Download config file
45
+ config_path = hf_hub_download(
46
+ repo_id=repo_id,
47
+ filename="config.json"
48
+ )
49
+
50
+ # Read config to get file names
51
+ with open(config_path, 'r') as f:
52
+ config = json.load(f)
53
+
54
+ # Download model file - try different possible names
55
+ model_filename = config.get("model_file", None)
56
+ if not model_filename:
57
+ # Try common names
58
+ possible_names = ["kitten_tts_mini_v0_1.onnx", "kitten_tts_nano_v0_2.onnx", "kitten_tts_nano_v0_1.onnx"]
59
+ for name in possible_names:
60
+ try:
61
+ model_path = hf_hub_download(repo_id=repo_id, filename=name)
62
+ model_filename = name
63
+ break
64
+ except:
65
+ continue
66
+ else:
67
+ model_path = hf_hub_download(repo_id=repo_id, filename=model_filename)
68
+
69
+ # Download voices file
70
+ voices_filename = config.get("voices", "voices.npz")
71
+ voices_path = hf_hub_download(
72
+ repo_id=repo_id,
73
+ filename=voices_filename
74
+ )
75
+
76
+ print(f"Model files downloaded successfully")
77
+
78
+ # Now try to load with KittenTTS using the repo_id
79
+ # The library should use the cached files
80
+ self.model = KittenTTS(repo_id)
81
+ return True
82
+
83
+ except Exception as e:
84
+ print(f"Failed to download and load {repo_id}: {e}")
85
+ return False
86
+
87
  def load_model(self):
88
+ """Load the TTS model with multiple fallback options"""
89
  if self.model_loaded:
90
  return
91
 
92
  try:
93
  print("Loading KittenTTS model...")
94
+
95
+ # Try different loading strategies
96
+ strategies = [
97
+ ("KittenML/kitten-tts-mini-0.1", "mini"),
98
+ ("KittenML/kitten-tts-nano-0.2", "nano v0.2"),
99
+ ("KittenML/kitten-tts-nano-0.1", "nano v0.1"),
100
+ ]
101
+
102
+ for repo_id, name in strategies:
103
+ print(f"Trying to load {name} model...")
104
+
105
+ # First try direct loading (in case files are cached)
106
+ try:
107
+ self.model = KittenTTS(repo_id)
108
+ self.model_loaded = True
109
+ print(f"Successfully loaded {name} model!")
110
+ return
111
+ except Exception as e:
112
+ print(f"Direct loading failed: {e}")
113
+
114
+ # Try downloading and loading
115
+ if self.download_and_load_model(repo_id):
116
+ self.model_loaded = True
117
+ print(f"Successfully loaded {name} model after download!")
118
+ return
119
+
120
+ # If all strategies failed
121
+ raise Exception("Failed to load any KittenTTS model. Please check your internet connection.")
122
+
123
  except Exception as e:
124
+ print(f"Error loading model: {e}")
125
+ self.model_loaded = False
126
+ raise e
 
 
 
 
 
 
 
 
127
 
128
  def split_into_sentences(self, text):
129
  """Split text into sentences"""
 
146
 
147
  return processed_sentences
148
 
149
+ def group_sentences_into_chunks(self, sentences, chunk_size):
150
+ """Group sentences into chunks of specified size"""
151
+ if chunk_size <= 0:
152
+ chunk_size = 1
153
+
154
+ chunks = []
155
+ for i in range(0, len(sentences), chunk_size):
156
+ # Join sentences in this chunk with a space
157
+ chunk = ' '.join(sentences[i:i + chunk_size])
158
+ chunks.append(chunk)
159
+
160
+ return chunks
161
+
162
  def clean_text_for_model(self, text):
163
  """Clean text for the TTS model"""
164
  if not text:
 
218
  audio = self.safe_generate_audio(cleaned_sentence, voice=voice, speed=speed)
219
  return audio
220
 
221
+ def convert_text_to_speech(self, text, voice, speed, chunk_size, use_multithreading, progress=gr.Progress()):
222
  """Main conversion function for Gradio"""
223
  # Ensure model is loaded
224
  try:
 
230
  raise gr.Error("Please enter some text to convert.")
231
 
232
  try:
233
+ # Split into sentences first
234
  sentences = self.split_into_sentences(text)
235
 
236
  if not sentences:
237
  raise gr.Error("No valid sentences found in the text.")
238
 
239
+ # Group sentences into chunks based on chunk_size
240
+ chunks = self.group_sentences_into_chunks(sentences, chunk_size)
241
+
242
+ total_chunks = len(chunks)
243
  total_sentences = len(sentences)
 
244
 
245
+ chunk_label = "chunk" if chunk_size == 1 else f"chunk ({chunk_size} sentences each)"
246
+ progress(0, desc=f"Processing {total_sentences} sentences in {total_chunks} {chunk_label}s...")
247
+
248
+ # Process chunks
249
  audio_chunks = []
250
 
251
+ if use_multithreading and total_chunks > 1:
252
  # Multithreaded processing
253
  with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
254
+ # Submit all chunks
255
  futures = {
256
+ executor.submit(self.process_single_sentence, chunk, voice, speed): i
257
+ for i, chunk in enumerate(chunks)
258
  }
259
 
260
  # Collect results in order
 
267
  audio = future.result()
268
  results[idx] = audio
269
  completed += 1
270
+ progress(completed / total_chunks,
271
+ desc=f"Processed {completed}/{total_chunks} {chunk_label}s")
272
  except Exception as e:
273
+ print(f"Error processing chunk: {e}")
274
  continue
275
 
276
  # Sort by index
 
278
  audio_chunks.append(results[i])
279
  else:
280
  # Sequential processing
281
+ for i, chunk in enumerate(chunks):
282
  try:
283
+ audio = self.process_single_sentence(chunk, voice, speed)
284
  audio_chunks.append(audio)
285
+ progress((i + 1) / total_chunks,
286
+ desc=f"Processed {i + 1}/{total_chunks} {chunk_label}s")
287
  except Exception as e:
288
+ print(f"Error processing chunk: {e}")
289
  continue
290
 
291
  if not audio_chunks:
 
310
  gc.collect()
311
 
312
  processing_method = "multithreading" if use_multithreading else "sequential"
313
+ chunk_description = f"{chunk_size} sentence(s) per chunk" if chunk_size > 1 else "sentence-by-sentence"
314
+ status_message = f"✅ Successfully converted {total_sentences} sentences ({total_chunks} chunks) using {processing_method} processing with {chunk_description}!"
315
 
316
  return output_file.name, status_message
317
 
 
334
  **Features:**
335
  - 8 different voice options (male and female)
336
  - Adjustable speech speed
337
+ - Adjustable chunk size for processing
338
+ - Sentence-by-sentence or multi-sentence processing
339
  - Multithreading support for faster processing
340
 
341
+ **Note:** The model will download on first use (~170MB for mini model, ~25MB for nano).
342
  """)
343
 
344
  with gr.Row():
 
408
  info="Adjust the speed of speech (1.0 = normal)"
409
  )
410
 
411
+ chunk_size_slider = gr.Slider(
412
+ minimum=1,
413
+ maximum=10,
414
+ value=1,
415
+ step=1,
416
+ label="Sentences per Chunk",
417
+ info="Group sentences together (1 = best quality, higher = faster processing)"
418
+ )
419
+
420
  multithread_checkbox = gr.Checkbox(
421
  value=True,
422
  label=f"Enable Multithreading ({app.max_workers} workers)",
423
+ info="Process multiple chunks in parallel"
424
  )
425
 
426
  convert_btn = gr.Button(
 
455
  # Connect the conversion function
456
  convert_btn.click(
457
  fn=app.convert_text_to_speech,
458
+ inputs=[text_input, voice_dropdown, speed_slider, chunk_size_slider, multithread_checkbox],
459
  outputs=[audio_output, status_output]
460
  )
461
 
 
467
  - Processing time depends on text length, chunk size, and multithreading setting
468
  - Each voice has different characteristics - try them out!
469
  - The model runs entirely on CPU - no GPU required
470
+ - First conversion will take longer as the model downloads and loads
471
 
472
  ### 🎭 Available Voices:
473
  - **expr-voice-2-m/f**: Expressive male/female voices