Nymbo committed on
Commit
e84a565
·
verified ·
1 Parent(s): fc3c2b8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +153 -161
app.py CHANGED
@@ -1,161 +1,153 @@
1
- import gradio as gr
2
- import os
3
- import io
4
- import wave
5
- import numpy as np
6
- import soundfile as sf
7
- from huggingface_hub import snapshot_download
8
- from helper import load_text_to_speech, load_voice_style
9
-
10
- _SUPERTONIC_STATE = {"initialized": False, "tts": None, "assets_dir": None}
11
-
12
def _init_supertonic() -> None:
    """Load the Supertonic engine once and cache it in ``_SUPERTONIC_STATE``.

    Model files are expected in an ``assets`` directory next to this file and
    are fetched from the ``Supertone/supertonic`` Hub repository when the
    ``onnx`` model directory is missing.  Calls made after a successful
    initialization return immediately.
    """
    if _SUPERTONIC_STATE["initialized"]:
        return

    print("Initializing Supertonic...")
    assets_dir = os.path.join(os.path.dirname(__file__), "assets")
    onnx_dir = os.path.join(assets_dir, "onnx")

    # Gate the download on the model directory itself rather than on the bare
    # ``assets`` folder: an interrupted or partial download can leave
    # ``assets`` behind without ``onnx``, and checking only ``assets`` would
    # then skip the download forever and fail on every load attempt.
    if not os.path.isdir(onnx_dir):
        print(f"Downloading Supertonic models to {assets_dir}...")
        snapshot_download(repo_id="Supertone/supertonic", local_dir=assets_dir)

    tts = load_text_to_speech(onnx_dir, use_gpu=False)

    # Flip the flag only after loading succeeded so a failed attempt is retried.
    _SUPERTONIC_STATE.update({"initialized": True, "tts": tts, "assets_dir": assets_dir})
    print("Supertonic initialized.")
28
-
29
def get_supertonic_voices():
    """Return the sorted names of the available Supertonic voice styles.

    A voice style is any ``*.json`` file inside ``assets/voice_styles`` next
    to this file; its name is the file name without the trailing ``.json``.
    When the ``assets`` directory does not exist yet, ``_init_supertonic()``
    is called first so the assets get downloaded.  An empty list is returned
    when there is no ``voice_styles`` directory.
    """
    assets_dir = os.path.join(os.path.dirname(__file__), "assets")
    if not os.path.exists(assets_dir):
        # Nothing on disk yet: initializing downloads the assets and records
        # where they were placed.
        _init_supertonic()
        assets_dir = _SUPERTONIC_STATE["assets_dir"]

    voice_styles_dir = os.path.join(assets_dir, "voice_styles")
    if not os.path.isdir(voice_styles_dir):
        return []

    suffix = ".json"
    # Strip only the trailing extension.  str.replace() would also delete a
    # ".json" occurring inside the name, producing a voice name that no longer
    # maps back to its file.
    return sorted(
        entry[: -len(suffix)]
        for entry in os.listdir(voice_styles_dir)
        if entry.endswith(suffix)
    )
46
-
47
- def _audio_np_to_int16(audio_np: np.ndarray) -> np.ndarray:
48
- audio_clipped = np.clip(audio_np, -1.0, 1.0)
49
- return (audio_clipped * 32767.0).astype(np.int16)
50
-
51
- def _wav_bytes_from_int16(audio_int16: np.ndarray, sample_rate: int) -> bytes:
52
- buffer = io.BytesIO()
53
- with wave.open(buffer, "wb") as wf:
54
- wf.setnchannels(1)
55
- wf.setsampwidth(2)
56
- wf.setframerate(sample_rate)
57
- wf.writeframes(audio_int16.tobytes())
58
- return buffer.getvalue()
59
-
60
def supertonic_tts(text: str, speed: float, voice: str, steps: int):
    """Stream synthesized speech for ``text`` as a sequence of WAV byte chunks.

    This is a generator: it initializes the engine on first use, loads the
    requested voice style and yields one WAV-encoded ``bytes`` object per
    audio chunk produced by ``tts.stream``.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        speed: Passed through to ``tts.stream`` as the speed argument.
        voice: Name of a voice style, i.e. a ``<voice>.json`` file in the
            ``voice_styles`` asset directory.
        steps: Passed through to ``tts.stream`` as the step count.

    Yields:
        bytes: a mono 16-bit WAV file for each generated audio chunk.

    Raises:
        gr.Error: if the text is empty, the voice style does not exist, or
            synthesis fails.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter text to synthesize.")

    _init_supertonic()
    tts = _SUPERTONIC_STATE["tts"]
    assets_dir = _SUPERTONIC_STATE["assets_dir"]

    # The voice name comes straight from the client (the dropdown accepts
    # custom values), so refuse anything containing a path component: without
    # this check a value such as "../../x" would escape the voice_styles
    # directory and load an arbitrary .json file.
    voice_name = str(voice)
    has_path_component = (
        os.path.basename(voice_name) != voice_name
        or "/" in voice_name
        or "\\" in voice_name
    )
    if has_path_component:
        raise gr.Error(f"Voice style {voice} not found.")

    voice_path = os.path.join(assets_dir, "voice_styles", f"{voice_name}.json")
    if not os.path.isfile(voice_path):
        raise gr.Error(f"Voice style {voice} not found.")

    style = load_voice_style([voice_path])

    try:
        sr = tts.sample_rate
        for audio_chunk in tts.stream(text, style, steps, speed):
            audio_int16 = _audio_np_to_int16(audio_chunk)
            yield _wav_bytes_from_int16(audio_int16, sr)
    except Exception as e:
        # Surface the failure in the UI while keeping the original traceback
        # attached as the cause.
        raise gr.Error(f"Error during speech generation: {str(e)}") from e
82
-
83
with gr.Blocks(theme='Nymbo/Nymbo_Theme') as demo:
    gr.HTML("<h1 style='text-align: center;'>Supertonic-Hub</h1><p style='text-align: center;'>Powered by Supertone/supertonic</p>")

    # Populate the voice list while the UI is being built.  Listing voices may
    # have to initialize Supertonic (and download its assets), which can fail;
    # the interface must still come up in that case, so fall back to an empty
    # list -- but report the failure instead of hiding it.
    try:
        available_voices = get_supertonic_voices()
    except Exception as exc:
        print(f"Could not list Supertonic voices at startup: {exc}")
        available_voices = []

    default_voice = available_voices[0] if available_voices else None

    with gr.Row(variant='panel'):
        speed_slider = gr.Slider(
            minimum=0.5,
            maximum=2.0,
            value=1.0,
            step=0.1,
            label='Speed'
        )
        steps_slider = gr.Slider(
            minimum=1,
            maximum=50,
            value=5,
            step=1,
            label='Steps (Quality vs Speed)'
        )
        # Custom values stay allowed so a voice name can be entered even when
        # the list could not be populated above.
        voice_dropdown = gr.Dropdown(
            choices=available_voices,
            label='Voice',
            value=default_voice,
            allow_custom_value=True
        )

    text_input = gr.Textbox(
        label="Input Text",
        placeholder="Enter the text you want to convert to speech here...",
        lines=5,
        value="This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."
    )

    generate_btn = gr.Button(
        "Generate Speech",
        variant="primary",
    )

    audio_output = gr.Audio(
        label="Generated Speech",
        streaming=True,
        autoplay=True
    )

    def update_voices():
        """Re-read the voice list and return a refreshed dropdown selecting the first voice."""
        voices = get_supertonic_voices()
        return gr.Dropdown(choices=voices, value=voices[0] if voices else None)

    # Lets the user reload the voice list if it was not available at startup.
    refresh_btn = gr.Button("Refresh Voices (Downloads Model if needed)")
    refresh_btn.click(fn=update_voices, outputs=voice_dropdown)

    # Argument order must match the supertonic_tts(text, speed, voice, steps) signature.
    generate_inputs = [text_input, speed_slider, voice_dropdown, steps_slider]

    generate_btn.click(
        fn=supertonic_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech"
    )

    # Submitting the textbox runs the same handler as the button.
    text_input.submit(
        fn=supertonic_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech_enter"
    )

if __name__ == "__main__":
    demo.queue().launch()
 
1
+ import gradio as gr
2
+ import os
3
+ import io
4
+ import wave
5
+ import numpy as np
6
+ import soundfile as sf
7
+ from huggingface_hub import snapshot_download
8
+ from helper import load_text_to_speech, load_voice_style
9
+
10
+ _SUPERTONIC_STATE = {"initialized": False, "tts": None, "assets_dir": None}
11
+
12
def _init_supertonic() -> None:
    """Load the Supertonic engine once and cache it in ``_SUPERTONIC_STATE``.

    Model files are expected in an ``assets`` directory next to this file and
    are fetched from the ``Supertone/supertonic`` Hub repository when the
    ``onnx`` model directory is missing.  Calls made after a successful
    initialization return immediately.
    """
    if _SUPERTONIC_STATE["initialized"]:
        return

    print("Initializing Supertonic...")
    assets_dir = os.path.join(os.path.dirname(__file__), "assets")
    onnx_dir = os.path.join(assets_dir, "onnx")

    # Gate the download on the model directory itself rather than on the bare
    # ``assets`` folder: an interrupted or partial download can leave
    # ``assets`` behind without ``onnx``, and checking only ``assets`` would
    # then skip the download forever and fail on every load attempt.
    if not os.path.isdir(onnx_dir):
        print(f"Downloading Supertonic models to {assets_dir}...")
        snapshot_download(repo_id="Supertone/supertonic", local_dir=assets_dir)

    tts = load_text_to_speech(onnx_dir, use_gpu=False)

    # Flip the flag only after loading succeeded so a failed attempt is retried.
    _SUPERTONIC_STATE.update({"initialized": True, "tts": tts, "assets_dir": assets_dir})
    print("Supertonic initialized.")
28
+
29
def get_supertonic_voices():
    """Return the sorted names of the available Supertonic voice styles.

    A voice style is any ``*.json`` file inside ``assets/voice_styles`` next
    to this file; its name is the file name without the trailing ``.json``.
    When the ``assets`` directory does not exist yet, ``_init_supertonic()``
    is called first so the assets get downloaded.  An empty list is returned
    when there is no ``voice_styles`` directory.
    """
    assets_dir = os.path.join(os.path.dirname(__file__), "assets")
    if not os.path.exists(assets_dir):
        # Nothing on disk yet: initializing downloads the assets and records
        # where they were placed.
        _init_supertonic()
        assets_dir = _SUPERTONIC_STATE["assets_dir"]

    voice_styles_dir = os.path.join(assets_dir, "voice_styles")
    if not os.path.isdir(voice_styles_dir):
        return []

    suffix = ".json"
    # Strip only the trailing extension.  str.replace() would also delete a
    # ".json" occurring inside the name, producing a voice name that no longer
    # maps back to its file.
    return sorted(
        entry[: -len(suffix)]
        for entry in os.listdir(voice_styles_dir)
        if entry.endswith(suffix)
    )
46
+
47
+ def _audio_np_to_int16(audio_np: np.ndarray) -> np.ndarray:
48
+ audio_clipped = np.clip(audio_np, -1.0, 1.0)
49
+ return (audio_clipped * 32767.0).astype(np.int16)
50
+
51
+ def _wav_bytes_from_int16(audio_int16: np.ndarray, sample_rate: int) -> bytes:
52
+ buffer = io.BytesIO()
53
+ with wave.open(buffer, "wb") as wf:
54
+ wf.setnchannels(1)
55
+ wf.setsampwidth(2)
56
+ wf.setframerate(sample_rate)
57
+ wf.writeframes(audio_int16.tobytes())
58
+ return buffer.getvalue()
59
+
60
def supertonic_tts(text: str, speed: float, voice: str, steps: int):
    """Stream synthesized speech for ``text`` as a sequence of WAV byte chunks.

    This is a generator: it initializes the engine on first use, loads the
    requested voice style and yields one WAV-encoded ``bytes`` object per
    audio chunk produced by ``tts.stream``.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        speed: Passed through to ``tts.stream`` as the speed argument.
        voice: Name of a voice style, i.e. a ``<voice>.json`` file in the
            ``voice_styles`` asset directory.
        steps: Passed through to ``tts.stream`` as the step count.

    Yields:
        bytes: a mono 16-bit WAV file for each generated audio chunk.

    Raises:
        gr.Error: if the text is empty, the voice style does not exist, or
            synthesis fails.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter text to synthesize.")

    _init_supertonic()
    tts = _SUPERTONIC_STATE["tts"]
    assets_dir = _SUPERTONIC_STATE["assets_dir"]

    # The voice name comes straight from the client (the dropdown accepts
    # custom values), so refuse anything containing a path component: without
    # this check a value such as "../../x" would escape the voice_styles
    # directory and load an arbitrary .json file.
    voice_name = str(voice)
    has_path_component = (
        os.path.basename(voice_name) != voice_name
        or "/" in voice_name
        or "\\" in voice_name
    )
    if has_path_component:
        raise gr.Error(f"Voice style {voice} not found.")

    voice_path = os.path.join(assets_dir, "voice_styles", f"{voice_name}.json")
    if not os.path.isfile(voice_path):
        raise gr.Error(f"Voice style {voice} not found.")

    style = load_voice_style([voice_path])

    try:
        sr = tts.sample_rate
        for audio_chunk in tts.stream(text, style, steps, speed):
            audio_int16 = _audio_np_to_int16(audio_chunk)
            yield _wav_bytes_from_int16(audio_int16, sr)
    except Exception as e:
        # Surface the failure in the UI while keeping the original traceback
        # attached as the cause.
        raise gr.Error(f"Error during speech generation: {str(e)}") from e
82
+
83
with gr.Blocks(theme='Nymbo/Nymbo_Theme') as demo:
    gr.HTML("<h1 style='text-align: center;'>Supertonic-Hub</h1><p style='text-align: center;'>Powered by Supertone/supertonic</p>")

    # Populate the voice list while the UI is being built.  Listing voices may
    # have to initialize Supertonic (and download its assets), which can fail;
    # the interface must still come up in that case, so fall back to an empty
    # list -- but report the failure instead of hiding it.
    try:
        available_voices = get_supertonic_voices()
    except Exception as exc:
        print(f"Could not list Supertonic voices at startup: {exc}")
        available_voices = []

    default_voice = available_voices[0] if available_voices else None

    with gr.Row(variant='panel'):
        speed_slider = gr.Slider(
            minimum=0.5,
            maximum=2.0,
            value=1.0,
            step=0.1,
            label='Speed'
        )
        steps_slider = gr.Slider(
            minimum=1,
            maximum=50,
            value=5,
            step=1,
            label='Steps (Quality vs Speed)'
        )
        # Custom values stay allowed so a voice name can be entered even when
        # the list could not be populated above.
        voice_dropdown = gr.Dropdown(
            choices=available_voices,
            label='Voice',
            value=default_voice,
            allow_custom_value=True
        )

    text_input = gr.Textbox(
        label="Input Text",
        placeholder="Enter the text you want to convert to speech here...",
        lines=5,
        value="This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."
    )

    generate_btn = gr.Button(
        "Generate Speech",
        variant="primary",
    )

    audio_output = gr.Audio(
        label="Generated Speech",
        streaming=True,
        autoplay=True
    )

    # Argument order must match the supertonic_tts(text, speed, voice, steps) signature.
    generate_inputs = [text_input, speed_slider, voice_dropdown, steps_slider]

    generate_btn.click(
        fn=supertonic_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech"
    )

    # Submitting the textbox runs the same handler as the button.
    text_input.submit(
        fn=supertonic_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech_enter"
    )

if __name__ == "__main__":
    demo.queue().launch()