SohomToom committed on
Commit
a50e211
·
verified ·
1 Parent(s): c13df21

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -158
app.py CHANGED
@@ -118,174 +118,94 @@ SPEAKER_METADATA = {
118
 
119
 
120
 
 
 
 
 
 
 
 
121
 
122
- # Voice model
123
- VOICE_MODEL = "tts_models/en/vctk/vits"
124
-
125
-
126
- def clean_text(text):
127
- # Remove hyperlinks
128
- return re.sub(r'http[s]?://\S+', '', text)
129
-
130
- def extract_paragraphs_from_docx(docx_file):
131
- document = Document(docx_file.name)
132
- paragraphs = [p.text.strip() for p in document.paragraphs if p.text.strip()]
133
- return [clean_text(p) for p in paragraphs]
134
-
135
- def list_speaker_choices():
136
- return [f"{sid} | {meta['gender']} | {meta['accent']}" for sid, meta in SPEAKER_METADATA.items()]
137
 
138
  def get_speaker_id_from_label(label):
 
139
  return label.split('|')[0].strip()
140
 
141
- # Bark Voice List (Textual Prompts)
142
- bark_voice_choices = [
143
- "young female voice",
144
- "middle-aged male voice with British accent",
145
- "calm narrator",
146
- "excited teenager",
147
- "elderly male voice",
148
- "child with American accent"
149
- ]
150
-
151
- # Function to generate audio using Coqui TTS (with metadata)
152
- def generate_sample_audio(sample_text, speaker_label, model_choice):
153
- if len(sample_text) > 500:
154
- raise gr.Error("Sample text exceeds 500 characters.")
155
-
156
  speaker_id = get_speaker_id_from_label(speaker_label)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
- if model_choice == "Coqui":
159
- model = TTS("tts_models/multilingual/multi-dataset/your_model")
160
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
161
- model.tts_to_file(text=sample_text, speaker="p"+speaker_id, file_path=tmp_wav.name)
162
- return tmp_wav.name
163
- elif model_choice == "Bark":
164
- voice_prompt = speaker_label # Bark's speaker prompt could be a descriptive voice label
165
- audio = generate_audio(sample_text, speaker_prompt=voice_prompt) # Bark's method for audio generation
166
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
167
- audio.export(tmp_wav.name, format="wav")
168
- return tmp_wav.name
169
- else:
170
- model = TTS("tts_models/en/vctk/vits")
171
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
172
- model.tts_to_file(text=sample_text, speaker="p"+speaker_id, file_path=tmp_wav.name)
173
- return tmp_wav.name
174
-
175
- # Function to generate full audio from DOCX using selected TTS model
176
- def generate_audio(docx_file, speaker_label, model_choice):
177
- speaker_id = get_speaker_id_from_label(speaker_label)
178
 
179
- if model_choice == "Coqui":
180
- model = TTS("tts_models/multilingual/multi-dataset/your_model")
181
- paragraphs = extract_paragraphs_from_docx(docx_file)
182
- combined_audio = AudioSegment.empty()
183
- temp_files = []
184
-
185
- try:
186
- for idx, para in enumerate(paragraphs):
187
- tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
188
- model.tts_to_file(text=para, speaker="p"+speaker_id, file_path=tmp.name)
189
- audio_chunk = AudioSegment.from_wav(tmp.name)
190
- combined_audio += audio_chunk
191
- temp_files.append(tmp.name)
192
- tmp.close()
193
-
194
- except Exception as e:
195
- print("Generation interrupted. Saving partial output.", e)
196
-
197
- output_dir = tempfile.mkdtemp()
198
- final_output_path = os.path.join(output_dir, "final_output.wav")
199
- combined_audio.export(final_output_path, format="wav")
200
-
201
- zip_path = os.path.join(output_dir, "output.zip")
202
- with zipfile.ZipFile(zip_path, 'w') as zipf:
203
- zipf.write(final_output_path, arcname="final_output.wav")
204
-
205
- for f in temp_files:
206
- os.remove(f)
207
-
208
- return zip_path
209
 
210
- elif model_choice == "Bark":
211
- paragraphs = extract_paragraphs_from_docx(docx_file)
212
- combined_audio = AudioSegment.empty()
213
-
214
- try:
215
- for para in paragraphs:
216
- audio = generate_audio(para, speaker_prompt=speaker_label) # Bark
217
- combined_audio += audio # Append audio to final output
218
- except Exception as e:
219
- print("Generation interrupted. Saving partial output.", e)
220
-
221
- output_dir = tempfile.mkdtemp()
222
- final_output_path = os.path.join(output_dir, "final_output.wav")
223
- combined_audio.export(final_output_path, format="wav")
224
-
225
- zip_path = os.path.join(output_dir, "output.zip")
226
- with zipfile.ZipFile(zip_path, 'w') as zipf:
227
- zipf.write(final_output_path, arcname="final_output.wav")
228
-
229
- return zip_path
230
-
231
- else: # VCTK
232
- paragraphs = extract_paragraphs_from_docx(docx_file)
233
- combined_audio = AudioSegment.empty()
234
- temp_files = []
235
-
236
- try:
237
- for idx, para in enumerate(paragraphs):
238
- tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
239
- model = TTS("tts_models/en/vctk/vits")
240
- model.tts_to_file(text=para, speaker="p"+speaker_id, file_path=tmp.name)
241
- audio_chunk = AudioSegment.from_wav(tmp.name)
242
- combined_audio += audio_chunk
243
- temp_files.append(tmp.name)
244
- tmp.close()
245
-
246
- except Exception as e:
247
- print("Generation interrupted. Saving partial output.", e)
248
-
249
- output_dir = tempfile.mkdtemp()
250
- final_output_path = os.path.join(output_dir, "final_output.wav")
251
- combined_audio.export(final_output_path, format="wav")
252
-
253
- zip_path = os.path.join(output_dir, "output.zip")
254
- with zipfile.ZipFile(zip_path, 'w') as zipf:
255
- zipf.write(final_output_path, arcname="final_output.wav")
256
-
257
- for f in temp_files:
258
- os.remove(f)
259
-
260
- return zip_path
261
-
262
-
263
- # --- UI ---
264
- speaker_choices = list_speaker_choices()
265
-
266
- with gr.Blocks() as demo:
267
- gr.Markdown("## 📄 TTS Voice Generator with Paragraph-Wise Processing")
268
-
269
- with gr.Row():
270
- model_selector = gr.Dropdown(label="Select TTS Engine", choices=["Coqui", "Bark", "VCTK"], value="VCTK")
271
- speaker_dropdown = gr.Dropdown(label="Select Voice", choices=speaker_choices)
272
-
273
- with gr.Row():
274
- sample_textbox = gr.Textbox(label="Enter Sample Text (Max 500 characters)", max_lines=5)
275
- sample_button = gr.Button("Generate Sample")
276
- clear_button = gr.Button("Clear Sample")
277
-
278
- sample_audio = gr.Audio(label="Sample Output", type="filepath")
279
-
280
- sample_button.click(fn=generate_sample_audio, inputs=[sample_textbox, speaker_dropdown, model_selector], outputs=[sample_audio])
281
- clear_button.click(fn=lambda: None, inputs=[], outputs=[sample_audio])
282
-
283
- with gr.Row():
284
- docx_input = gr.File(label="Upload DOCX File", file_types=[".docx"])
285
- generate_button = gr.Button("Generate Full Audio")
286
- download_output = gr.File(label="Download Output Zip")
287
 
288
- generate_button.click(fn=generate_audio, inputs=[docx_input, speaker_dropdown, model_selector], outputs=[download_output])
289
 
290
  if __name__ == "__main__":
291
  demo.launch()
 
118
 
119
 
120
 
121
# Predefined descriptive voice prompts offered when the Bark engine is selected.
BARK_PROMPTS = ["Shy girl", "Old man", "Excited child", "Angry woman"]
128
 
129
def list_speaker_choices(metadata):
    """Build the dropdown labels shown for VCTK/Coqui speakers.

    Each entry of *metadata* maps a speaker ID to a dict with at least
    'gender' and 'accent' keys; the label format is
    "Speaker <id> | <gender> | <accent>".
    """
    labels = []
    for speaker_id, info in metadata.items():
        labels.append(f"Speaker {speaker_id} | {info['gender']} | {info['accent']}")
    return labels
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
def get_speaker_id_from_label(label):
    """Extract the bare speaker ID from a voice-dropdown label.

    Labels produced by list_speaker_choices() look like
    "Speaker <id> | <gender> | <accent>".  Callers prepend "p" to the
    returned value to form a VCTK speaker name, so the "Speaker " prefix
    must be removed — previously this returned e.g. "Speaker 225",
    producing the invalid speaker "pSpeaker 225".  Plain "<id> | ..."
    labels (the old format) are still accepted.
    """
    head = label.split('|')[0].strip()
    if head.startswith("Speaker "):
        head = head[len("Speaker "):]
    return head.strip()
136
 
137
# Model name per engine.  The VCTK entry is inlined because the module-level
# VOICE_MODEL constant was removed in this commit; referencing it here would
# raise NameError on the default engine.
_ENGINE_MODELS = {
    "bark": "bark_model_path",    # TODO: replace with the actual Bark model path
    "coqui": "coqui_model_path",  # TODO: replace with the actual Coqui model path
    "vctk": "tts_models/en/vctk/vits",
}


def generate_audio(sample_text, speaker_label, engine):
    """Synthesize *sample_text* to a WAV file with the selected TTS engine.

    Parameters:
        sample_text: Text to synthesize (at most 500 characters).
        speaker_label: Dropdown label ("Speaker <id> | <gender> | <accent>");
            only the ID portion is used.
        engine: One of "bark", "coqui", "vctk".

    Returns:
        Path to a temporary .wav file containing the generated audio.

    Raises:
        gr.Error: If the text exceeds 500 characters or the engine name is
            unknown.  (Previously an unknown engine left ``model`` as None
            and crashed with AttributeError.)
    """
    # Re-instate the length guard the old implementation had; the UI-side
    # limit alone cannot be relied on.
    if len(sample_text) > 500:
        raise gr.Error("Sample text exceeds 500 characters.")

    speaker_id = get_speaker_id_from_label(speaker_label)

    model_name = _ENGINE_MODELS.get(engine)
    if model_name is None:
        raise gr.Error(f"Unknown TTS engine: {engine!r}")
    model = TTS(model_name)

    # delete=False so the file survives after the handler returns and Gradio
    # can still serve it to the browser.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
        model.tts_to_file(text=sample_text, speaker="p" + speaker_id, file_path=tmp_wav.name)
        return tmp_wav.name
154
+
155
# --- UI Components ---
with gr.Blocks() as demo:
    gr.Markdown("## 📄 TTS Voice Generator with Multiple Engines")

    # Engine dropdown.  "vctk" is the default engine, so the speaker dropdown
    # below must start visible — previously both selectors started hidden,
    # leaving the default engine with no way to pick a voice until the user
    # toggled the engine away and back.
    engine_dropdown = gr.Dropdown(
        label="Select TTS Engine",
        choices=["bark", "coqui", "vctk"],
        value="vctk",
    )

    # Choices are supplied at construction time rather than assigned to
    # `.choices` after the fact, so the rendered components always carry them.
    speaker_dropdown = gr.Dropdown(
        label="Select Speaker",
        choices=list_speaker_choices(SPEAKER_METADATA),
        visible=True,  # matches the default "vctk" engine
    )
    prompt_dropdown = gr.Dropdown(
        label="Select Prompt",
        choices=BARK_PROMPTS,
        visible=False,  # only shown for the Bark engine
    )

    # Sample text input and audio preview.
    # NOTE(review): confirm `max_chars` is a supported gr.Textbox kwarg in the
    # pinned Gradio version; some versions only accept `max_lines`.
    sample_textbox = gr.Textbox(label="Enter Sample Text (Max 500 characters)", max_lines=5, max_chars=500)
    sample_audio = gr.Audio(label="Sample Output Audio", type="filepath")

    def update_dropdowns(engine):
        """Show the prompt selector for Bark, the speaker selector otherwise.

        Returns gr.update objects directly instead of mutating the
        components' attributes in place, keeping the handler stateless.
        """
        is_bark = engine == "bark"
        return gr.update(visible=not is_bark), gr.update(visible=is_bark)

    engine_dropdown.change(update_dropdowns, inputs=engine_dropdown, outputs=[speaker_dropdown, prompt_dropdown])

    # NOTE(review): the click handler always passes speaker_dropdown, so the
    # Bark prompt selected in prompt_dropdown is never forwarded to
    # generate_audio — confirm intended behavior for the Bark engine.
    generate_button = gr.Button("Generate Audio")
    generate_button.click(
        fn=generate_audio,
        inputs=[sample_textbox, speaker_dropdown, engine_dropdown],
        outputs=[sample_audio],
    )

    def clear_sample():
        """Reset the sample text and remove the previewed audio."""
        return "", None

    clear_button = gr.Button("Clear")
    clear_button.click(fn=clear_sample, inputs=[], outputs=[sample_textbox, sample_audio])


if __name__ == "__main__":
    demo.launch()