Spaces:

Aid3445
/

Good.KTTS

Paused

App Files Community

Aid3445 committed on Sep 8

Commit

674ae41

verified ·

1 Parent(s): 666c2bc

Update app.py

Browse files

Files changed (1) hide show

app.py +135 -38

app.py CHANGED Viewed

@@ -7,6 +7,8 @@ import re
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import gc
 # Fix for OpenMP duplicate library error
 os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
@@ -34,29 +36,94 @@ class KittenTTSGradio:
         if not self.model_loaded:
             self.load_model()
     def load_model(self):
-        """Load the TTS model"""
         if self.model_loaded:
             return
         try:
             print("Loading KittenTTS model...")
-            # Try the mini model first
-            self.model = KittenTTS("KittenML/kitten-tts-mini-0.1")
-            self.model_loaded = True
-            print("Model loaded successfully!")
         except Exception as e:
-            print(f"Failed to load mini model: {e}")
-            # Try the nano model as fallback
-            try:
-                print("Trying nano model as fallback...")
-                self.model = KittenTTS("KittenML/kitten-tts-nano-0.2")
-                self.model_loaded = True
-                print("Nano model loaded successfully!")
-            except Exception as e2:
-                print(f"Failed to load nano model: {e2}")
-                self.model_loaded = False
-                raise Exception("Failed to load any KittenTTS model")
     def split_into_sentences(self, text):
         """Split text into sentences"""
@@ -79,6 +146,19 @@ class KittenTTSGradio:
         return processed_sentences
     def clean_text_for_model(self, text):
         """Clean text for the TTS model"""
         if not text:
@@ -138,7 +218,7 @@ class KittenTTSGradio:
         audio = self.safe_generate_audio(cleaned_sentence, voice=voice, speed=speed)
         return audio
-    def convert_text_to_speech(self, text, voice, speed, use_multithreading, progress=gr.Progress()):
         """Main conversion function for Gradio"""
         # Ensure model is loaded
         try:
@@ -150,25 +230,31 @@ class KittenTTSGradio:
             raise gr.Error("Please enter some text to convert.")
         try:
-            # Split into sentences
             sentences = self.split_into_sentences(text)
             if not sentences:
                 raise gr.Error("No valid sentences found in the text.")
             total_sentences = len(sentences)
-            progress(0, desc=f"Processing {total_sentences} sentences...")
-            # Process sentences
             audio_chunks = []
-            if use_multithreading and total_sentences > 1:
                 # Multithreaded processing
                 with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
-                    # Submit all sentences
                     futures = {
-                        executor.submit(self.process_single_sentence, sentence, voice, speed): i
-                        for i, sentence in enumerate(sentences)
                     }
                     # Collect results in order
@@ -181,10 +267,10 @@ class KittenTTSGradio:
                             audio = future.result()
                             results[idx] = audio
                             completed += 1
-                            progress(completed / total_sentences,
-                                   desc=f"Processed {completed}/{total_sentences} sentences")
                         except Exception as e:
-                            print(f"Error processing sentence: {e}")
                             continue
                     # Sort by index
@@ -192,14 +278,14 @@ class KittenTTSGradio:
                         audio_chunks.append(results[i])
             else:
                 # Sequential processing
-                for i, sentence in enumerate(sentences):
                     try:
-                        audio = self.process_single_sentence(sentence, voice, speed)
                         audio_chunks.append(audio)
-                        progress((i + 1) / total_sentences,
-                               desc=f"Processed {i + 1}/{total_sentences} sentences")
                     except Exception as e:
-                        print(f"Error processing sentence: {e}")
                         continue
             if not audio_chunks:
@@ -224,7 +310,8 @@ class KittenTTSGradio:
             gc.collect()
             processing_method = "multithreading" if use_multithreading else "sequential"
-            status_message = f"✅ Successfully converted {total_sentences} sentences using {processing_method} processing!"
             return output_file.name, status_message
@@ -247,10 +334,11 @@ def create_interface():
         **Features:**
         - 8 different voice options (male and female)
         - Adjustable speech speed
-        - Sentence-by-sentence processing for better quality
         - Multithreading support for faster processing
-        **Note:** The model will load on first use (~170MB download).
         """)
         with gr.Row():
@@ -320,10 +408,19 @@ def create_interface():
                     info="Adjust the speed of speech (1.0 = normal)"
                 )
                 multithread_checkbox = gr.Checkbox(
                     value=True,
                     label=f"Enable Multithreading ({app.max_workers} workers)",
-                    info="Process multiple sentences in parallel for faster conversion"
                 )
                 convert_btn = gr.Button(
@@ -358,7 +455,7 @@ def create_interface():
         # Connect the conversion function
         convert_btn.click(
             fn=app.convert_text_to_speech,
-            inputs=[text_input, voice_dropdown, speed_slider, multithread_checkbox],
             outputs=[audio_output, status_output]
         )
@@ -370,7 +467,7 @@ def create_interface():
         - Processing time depends on text length, chunk size, and multithreading setting
         - Each voice has different characteristics - try them out!
         - The model runs entirely on CPU - no GPU required
-        - First conversion will take longer as the model loads
         ### 🎭 Available Voices:
         - **expr-voice-2-m/f**: Expressive male/female voices

 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import gc
+from huggingface_hub import hf_hub_download
+import json
 # Fix for OpenMP duplicate library error
 os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
         if not self.model_loaded:
             self.load_model()
+    def download_and_load_model(self, repo_id):
+        """Download model files and load them"""
+        try:
+            print(f"Downloading model files from {repo_id}...")
+            # Download config file
+            config_path = hf_hub_download(
+                repo_id=repo_id,
+                filename="config.json"
+            )
+            # Read config to get file names
+            with open(config_path, 'r') as f:
+                config = json.load(f)
+            # Download model file - try different possible names
+            model_filename = config.get("model_file", None)
+            if not model_filename:
+                # Try common names
+                possible_names = ["kitten_tts_mini_v0_1.onnx", "kitten_tts_nano_v0_2.onnx", "kitten_tts_nano_v0_1.onnx"]
+                for name in possible_names:
+                    try:
+                        model_path = hf_hub_download(repo_id=repo_id, filename=name)
+                        model_filename = name
+                        break
+                    except:
+                        continue
+            else:
+                model_path = hf_hub_download(repo_id=repo_id, filename=model_filename)
+            # Download voices file
+            voices_filename = config.get("voices", "voices.npz")
+            voices_path = hf_hub_download(
+                repo_id=repo_id,
+                filename=voices_filename
+            )
+            print(f"Model files downloaded successfully")
+            # Now try to load with KittenTTS using the repo_id
+            # The library should use the cached files
+            self.model = KittenTTS(repo_id)
+            return True
+        except Exception as e:
+            print(f"Failed to download and load {repo_id}: {e}")
+            return False
     def load_model(self):
         """Load a KittenTTS model, trying several repositories in order.

         For each candidate repository the model is first constructed directly
         with ``KittenTTS(repo_id)``; if that raises,
         ``self.download_and_load_model(repo_id)`` is tried before moving on to
         the next candidate. Does nothing when ``self.model_loaded`` is
         already true.

         Raises:
             RuntimeError: if no candidate could be loaded. ``self.model_loaded``
                 is set to False first, and the last direct-load error (if any)
                 is attached as the exception's cause.
         """
         if self.model_loaded:
             return

         print("Loading KittenTTS model...")
         # (repo id, human-readable name) pairs in order of preference.
         candidates = [
             ("KittenML/kitten-tts-mini-0.1", "mini"),
             ("KittenML/kitten-tts-nano-0.2", "nano v0.2"),
             ("KittenML/kitten-tts-nano-0.1", "nano v0.1"),
         ]
         last_error = None
         for repo_id, name in candidates:
             print(f"Trying to load {name} model...")
             # First try direct loading (in case files are cached).
             try:
                 self.model = KittenTTS(repo_id)
             except Exception as e:
                 print(f"Direct loading failed: {e}")
                 last_error = e
             else:
                 self.model_loaded = True
                 print(f"Successfully loaded {name} model!")
                 return
             # Direct loading failed: explicitly download the files and retry.
             if self.download_and_load_model(repo_id):
                 self.model_loaded = True
                 print(f"Successfully loaded {name} model after download!")
                 return

         # Every candidate failed.
         self.model_loaded = False
         failure = RuntimeError(
             "Failed to load any KittenTTS model. Please check your internet connection."
         )
         print(f"Error loading model: {failure}")
         # Chain the underlying error so the traceback shows why loading failed.
         raise failure from last_error
     def split_into_sentences(self, text):
         """Split text into sentences"""
         return processed_sentences
+    def group_sentences_into_chunks(self, sentences, chunk_size):
+        """Group sentences into chunks of specified size"""
+        if chunk_size <= 0:
+            chunk_size = 1
+        chunks = []
+        for i in range(0, len(sentences), chunk_size):
+            # Join sentences in this chunk with a space
+            chunk = ' '.join(sentences[i:i + chunk_size])
+            chunks.append(chunk)
+        return chunks
     def clean_text_for_model(self, text):
         """Clean text for the TTS model"""
         if not text:
         audio = self.safe_generate_audio(cleaned_sentence, voice=voice, speed=speed)
         return audio
+    def convert_text_to_speech(self, text, voice, speed, chunk_size, use_multithreading, progress=gr.Progress()):
         """Main conversion function for Gradio"""
         # Ensure model is loaded
         try:
             raise gr.Error("Please enter some text to convert.")
         try:
+            # Split into sentences first
             sentences = self.split_into_sentences(text)
             if not sentences:
                 raise gr.Error("No valid sentences found in the text.")
+            # Group sentences into chunks based on chunk_size
+            chunks = self.group_sentences_into_chunks(sentences, chunk_size)
+            total_chunks = len(chunks)
             total_sentences = len(sentences)
+            chunk_label = "chunk" if chunk_size == 1 else f"chunk ({chunk_size} sentences each)"
+            progress(0, desc=f"Processing {total_sentences} sentences in {total_chunks} {chunk_label}s...")
+            # Process chunks
             audio_chunks = []
+            if use_multithreading and total_chunks > 1:
                 # Multithreaded processing
                 with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+                    # Submit all chunks
                     futures = {
+                        executor.submit(self.process_single_sentence, chunk, voice, speed): i
+                        for i, chunk in enumerate(chunks)
                     }
                     # Collect results in order
                             audio = future.result()
                             results[idx] = audio
                             completed += 1
+                            progress(completed / total_chunks,
+                                   desc=f"Processed {completed}/{total_chunks} {chunk_label}s")
                         except Exception as e:
+                            print(f"Error processing chunk: {e}")
                             continue
                     # Sort by index
                         audio_chunks.append(results[i])
             else:
                 # Sequential processing
+                for i, chunk in enumerate(chunks):
                     try:
+                        audio = self.process_single_sentence(chunk, voice, speed)
                         audio_chunks.append(audio)
+                        progress((i + 1) / total_chunks,
+                               desc=f"Processed {i + 1}/{total_chunks} {chunk_label}s")
                     except Exception as e:
+                        print(f"Error processing chunk: {e}")
                         continue
             if not audio_chunks:
             gc.collect()
             processing_method = "multithreading" if use_multithreading else "sequential"
+            chunk_description = f"{chunk_size} sentence(s) per chunk" if chunk_size > 1 else "sentence-by-sentence"
+            status_message = f"✅ Successfully converted {total_sentences} sentences ({total_chunks} chunks) using {processing_method} processing with {chunk_description}!"
             return output_file.name, status_message
         **Features:**
         - 8 different voice options (male and female)
         - Adjustable speech speed
+        - Adjustable chunk size for processing
+        - Sentence-by-sentence or multi-sentence processing
         - Multithreading support for faster processing
+        **Note:** The model will download on first use (~170MB for mini model, ~25MB for nano).
         """)
         with gr.Row():
                     info="Adjust the speed of speech (1.0 = normal)"
                 )
+                chunk_size_slider = gr.Slider(
+                    minimum=1,
+                    maximum=10,
+                    value=1,
+                    step=1,
+                    label="Sentences per Chunk",
+                    info="Group sentences together (1 = best quality, higher = faster processing)"
+                )
                 multithread_checkbox = gr.Checkbox(
                     value=True,
                     label=f"Enable Multithreading ({app.max_workers} workers)",
+                    info="Process multiple chunks in parallel"
                 )
                 convert_btn = gr.Button(
         # Connect the conversion function
         convert_btn.click(
             fn=app.convert_text_to_speech,
+            inputs=[text_input, voice_dropdown, speed_slider, chunk_size_slider, multithread_checkbox],
             outputs=[audio_output, status_output]
         )
         - Processing time depends on text length, chunk size, and multithreading setting
         - Each voice has different characteristics - try them out!
         - The model runs entirely on CPU - no GPU required
+        - First conversion will take longer as the model downloads and loads
         ### 🎭 Available Voices:
         - **expr-voice-2-m/f**: Expressive male/female voices