SohomToom committed
Commit f2fdc48 · verified · 1 Parent(s): 1900d87

Update app.py

Files changed (1): app.py (+118 -39)
app.py CHANGED
@@ -14,6 +14,11 @@ from pydub import AudioSegment
 
 final_audio = AudioSegment.empty()
 
+from pydub import AudioSegment
+from bark import generate_audio  # Importing Bark
+
+
+
 
 # Voice model
 VOICE_MODEL = "tts_models/en/vctk/vits"
@@ -111,6 +116,19 @@ SPEAKER_METADATA = {
     273: { "age": 18, "gender": "F", "accent": "English"}
 }
 
+
+
+
+# Voice model
+VOICE_MODEL = "tts_models/en/vctk/vits"
+
+# Embedded metadata (from your file)
+SPEAKER_METADATA = {
+    300: { "age": 23, "gender": "F", "accent": "American"},
+    271: { "age": 19, "gender": "M", "accent": "Scottish"},
+    # More entries as before
+}
+
 def clean_text(text):
     # Remove hyperlinks
     return re.sub(r'http[s]?://\S+', '', text)
@@ -126,40 +144,58 @@ def list_speaker_choices():
 def get_speaker_id_from_label(label):
     return label.split('|')[0].strip()
 
-def generate_sample_audio(sample_text, speaker_label):
+# Bark Voice List (Textual Prompts)
+bark_voice_choices = [
+    "young female voice",
+    "middle-aged male voice with British accent",
+    "calm narrator",
+    "excited teenager",
+    "elderly male voice",
+    "child with American accent"
+]
+
+# Function to generate audio using Coqui TTS (with metadata)
+def generate_sample_audio(sample_text, speaker_label, model_choice):
     if len(sample_text) > 500:
         raise gr.Error("Sample text exceeds 500 characters.")
+
     speaker_id = get_speaker_id_from_label(speaker_label)
-    model = TTS("tts_models/en/vctk/vits")
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
-        model.tts_to_file(text=sample_text, speaker="p"+speaker_id, file_path=tmp_wav.name)
-        return tmp_wav.name
+
+    if model_choice == "Coqui":
+        model = TTS("tts_models/multilingual/multi-dataset/your_model")
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
+            model.tts_to_file(text=sample_text, speaker="p"+speaker_id, file_path=tmp_wav.name)
+            return tmp_wav.name
+    elif model_choice == "Bark":
+        voice_prompt = speaker_label  # Bark's speaker prompt could be a descriptive voice label
+        audio = generate_audio(sample_text, speaker_prompt=voice_prompt)  # Bark's method for audio generation
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
+            audio.export(tmp_wav.name, format="wav")
+            return tmp_wav.name
+    else:
+        model = TTS("tts_models/en/vctk/vits")
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
+            model.tts_to_file(text=sample_text, speaker="p"+speaker_id, file_path=tmp_wav.name)
+            return tmp_wav.name
 
-def generate_audio(docx_file, speaker_label):
+# Function to generate full audio from DOCX using selected TTS model
+def generate_audio(docx_file, speaker_label, model_choice):
     speaker_id = get_speaker_id_from_label(speaker_label)
 
-    if engine_choice == "Bark":
-        from bark import generate_audio
-        from bark.generation import preload_models
-        preload_models()
-        audio_array = generate_audio(sample_text)
-        tmp_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
-        AudioSegment(audio_array.tobytes(), frame_rate=24000, sample_width=2, channels=1).export(tmp_path, format="wav")
-        return tmp_path
-    else:
-        model = TTS("tts_models/en/vctk/vits")
+    if model_choice == "Coqui":
+        model = TTS("tts_models/multilingual/multi-dataset/your_model")
         paragraphs = extract_paragraphs_from_docx(docx_file)
         combined_audio = AudioSegment.empty()
         temp_files = []
 
         try:
             for idx, para in enumerate(paragraphs):
-                tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
-                model.tts_to_file(text=para, speaker="p"+speaker_id, file_path=tmp.name)
-                audio_chunk = AudioSegment.from_wav(tmp.name)
-                combined_audio += audio_chunk
-                temp_files.append(tmp.name)
-                tmp.close()
+                tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+                model.tts_to_file(text=para, speaker="p"+speaker_id, file_path=tmp.name)
+                audio_chunk = AudioSegment.from_wav(tmp.name)
+                combined_audio += audio_chunk
+                temp_files.append(tmp.name)
+                tmp.close()
 
         except Exception as e:
             print("Generation interrupted. Saving partial output.", e)
@@ -167,14 +203,68 @@ def generate_audio(docx_file, speaker_label):
         output_dir = tempfile.mkdtemp()
         final_output_path = os.path.join(output_dir, "final_output.wav")
         combined_audio.export(final_output_path, format="wav")
+
         zip_path = os.path.join(output_dir, "output.zip")
         with zipfile.ZipFile(zip_path, 'w') as zipf:
-            zipf.write(final_output_path, arcname="final_output.wav")
+            zipf.write(final_output_path, arcname="final_output.wav")
 
         for f in temp_files:
-            os.remove(f)
+            os.remove(f)
+
+        return zip_path
+
+    elif model_choice == "Bark":
+        paragraphs = extract_paragraphs_from_docx(docx_file)
+        combined_audio = AudioSegment.empty()
+
+        try:
+            for para in paragraphs:
+                audio = generate_audio(para, speaker_prompt=speaker_label)  # Bark
+                combined_audio += audio  # Append audio to final output
+        except Exception as e:
+            print("Generation interrupted. Saving partial output.", e)
+
+        output_dir = tempfile.mkdtemp()
+        final_output_path = os.path.join(output_dir, "final_output.wav")
+        combined_audio.export(final_output_path, format="wav")
+
+        zip_path = os.path.join(output_dir, "output.zip")
+        with zipfile.ZipFile(zip_path, 'w') as zipf:
+            zipf.write(final_output_path, arcname="final_output.wav")
+
+        return zip_path
+
+    else:  # VCTK
+        paragraphs = extract_paragraphs_from_docx(docx_file)
+        combined_audio = AudioSegment.empty()
+        temp_files = []
+
+        try:
+            for idx, para in enumerate(paragraphs):
+                tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+                model = TTS("tts_models/en/vctk/vits")
+                model.tts_to_file(text=para, speaker="p"+speaker_id, file_path=tmp.name)
+                audio_chunk = AudioSegment.from_wav(tmp.name)
+                combined_audio += audio_chunk
+                temp_files.append(tmp.name)
+                tmp.close()
+
+        except Exception as e:
+            print("Generation interrupted. Saving partial output.", e)
+
+        output_dir = tempfile.mkdtemp()
+        final_output_path = os.path.join(output_dir, "final_output.wav")
+        combined_audio.export(final_output_path, format="wav")
+
+        zip_path = os.path.join(output_dir, "output.zip")
+        with zipfile.ZipFile(zip_path, 'w') as zipf:
+            zipf.write(final_output_path, arcname="final_output.wav")
+
+        for f in temp_files:
+            os.remove(f)
+
+        return zip_path
 
-    return zip_path
 
 # --- UI ---
 speaker_choices = list_speaker_choices()
@@ -183,24 +273,17 @@ with gr.Blocks() as demo:
     gr.Markdown("## 📄 TTS Voice Generator with Paragraph-Wise Processing")
 
     with gr.Row():
+        model_selector = gr.Dropdown(label="Select TTS Engine", choices=["Coqui", "Bark", "VCTK"], value="VCTK")
         speaker_dropdown = gr.Dropdown(label="Select Voice", choices=speaker_choices)
 
-
     with gr.Row():
         sample_textbox = gr.Textbox(label="Enter Sample Text (Max 500 characters)", max_lines=5)
         sample_button = gr.Button("Generate Sample")
         clear_button = gr.Button("Clear Sample")
 
-    tts_engine_dropdown = gr.Dropdown(label="TTS Engine", choices=["Coqui (XTTS)", "Bark"], value="Coqui (XTTS)")
-
-
     sample_audio = gr.Audio(label="Sample Output", type="filepath")
 
-    sample_button.click(
-        fn=generate_sample_audio,
-        inputs=[sample_textbox, speaker_dropdown, tts_engine_dropdown],
-        outputs=[sample_audio]
-    )
+    sample_button.click(fn=generate_sample_audio, inputs=[sample_textbox, speaker_dropdown, model_selector], outputs=[sample_audio])
     clear_button.click(fn=lambda: None, inputs=[], outputs=[sample_audio])
 
     with gr.Row():
@@ -208,11 +291,7 @@ with gr.Blocks() as demo:
         generate_button = gr.Button("Generate Full Audio")
         download_output = gr.File(label="Download Output Zip")
 
-    generate_button.click(
-        fn=generate_audio,
-        inputs=[docx_input, speaker_dropdown, tts_engine_dropdown],
-        outputs=[download_output]
-    )
+    generate_button.click(fn=generate_audio, inputs=[docx_input, speaker_dropdown, model_selector], outputs=[download_output])
 
 if __name__ == "__main__":
     demo.launch()
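For context on the speaker handling shared by the Coqui and VCTK branches: get_speaker_id_from_label() keeps only the text before the first '|' in the dropdown label, and the code prepends "p" to form a VCTK speaker name. A minimal illustration, assuming a label layout of "<id> | <gender> | <age> | <accent>" (the exact format produced by list_speaker_choices() is not shown in this diff):

# Hedged example of the label parsing used above; the label layout is an assumption.
label = "273 | F | 18 | English"
speaker_id = label.split('|')[0].strip()   # -> "273"
vctk_speaker = "p" + speaker_id            # -> "p273", passed as speaker= to tts_to_file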
 
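One practical note on the new Bark branches: the removed code converted Bark's raw NumPy output into a pydub AudioSegment before exporting, whereas the new code calls .export() on, and +=-appends, the value returned by generate_audio directly. Below is a minimal sketch of that conversion, assuming generate_audio returns a floating-point NumPy array at 24 kHz (the frame rate the removed code used) and that a Bark history_prompt preset such as "v2/en_speaker_6" stands in for the free-text voice prompts above; both of these are assumptions, not part of this commit.

import numpy as np
from pydub import AudioSegment
from bark import generate_audio             # same import as the new code
from bark.generation import preload_models  # as in the removed code

preload_models()

def bark_to_segment(text, history_prompt="v2/en_speaker_6"):  # preset name is an assumption
    # generate_audio returns float samples; scale to 16-bit PCM so that
    # AudioSegment(sample_width=2) interprets the raw bytes correctly.
    audio_array = generate_audio(text, history_prompt=history_prompt)
    pcm = (np.clip(audio_array, -1.0, 1.0) * 32767).astype(np.int16)
    return AudioSegment(pcm.tobytes(), frame_rate=24000, sample_width=2, channels=1)

An AudioSegment built this way supports the .export(..., format="wav") call and the combined_audio += audio concatenation that the new Bark branches rely on; a distinctly named helper like this also avoids the collision between the module-level from bark import generate_audio and the def generate_audio(...) defined later in the file, which shadows it.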