SohomToom committed
Commit 9a8ac22 · verified · 1 Parent(s): b3721bb

Update app.py

Files changed (1)
  1. app.py +69 -84
app.py CHANGED
@@ -14,11 +14,6 @@ from pydub import AudioSegment
 
 final_audio = AudioSegment.empty()
 
-from pydub import AudioSegment
-from bark import generate_audio # Importing Bark
-
-
-
 
 # Voice model
 VOICE_MODEL = "tts_models/en/vctk/vits"
@@ -116,98 +111,88 @@ SPEAKER_METADATA = {
     273: { "age": 18, "gender": "F", "accent": "English"}
 }
 
+def clean_text(text):
+    # Remove hyperlinks
+    return re.sub(r'http[s]?://\S+', '', text)
 
+def extract_paragraphs_from_docx(docx_file):
+    document = Document(docx_file.name)
+    paragraphs = [p.text.strip() for p in document.paragraphs if p.text.strip()]
+    return [clean_text(p) for p in paragraphs]
 
-# Bark prompts (example)
-BARK_PROMPTS = [
-    "Shy girl",
-    "Old man",
-    "Excited child",
-    "Angry woman"
-]
-
-def list_speaker_choices(metadata):
-    """Helper function to list speakers from metadata (for VCTK and Coqui)"""
-    return [f"Speaker {sid} | {meta['gender']} | {meta['accent']}" for sid, meta in SPEAKER_METADATA.items()]
+def list_speaker_choices():
+    return [f"{sid} | {meta['gender']} | {meta['accent']}" for sid, meta in SPEAKER_METADATA.items()]
 
 def get_speaker_id_from_label(label):
-    """Extract speaker ID from label string"""
     return label.split('|')[0].strip()
 
-def generate_audio(sample_text, speaker_label, engine):
-    """Generate audio based on engine choice"""
+def generate_sample_audio(sample_text, speaker_label):
+    if len(sample_text) > 500:
+        raise gr.Error("Sample text exceeds 500 characters.")
     speaker_id = get_speaker_id_from_label(speaker_label)
-    model = None
-
-    # Engine selection logic
-    if engine == "bark":
-        model = TTS("bark_model_path") # Replace with actual path for Bark model
-    elif engine == "coqui":
-        model = TTS("tts_models/multilingual/multi-dataset/xtts_v2") # Replace with actual path for Coqui model
-    elif engine == "vctk":
-        model = TTS(VOICE_MODEL) # Replace with actual path for VCTK model
-
-    # Temporary file creation for output audio
+    model = TTS("tts_models/en/vctk/vits")
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
         model.tts_to_file(text=sample_text, speaker="p"+speaker_id, file_path=tmp_wav.name)
         return tmp_wav.name
 
-# --- UI Components ---
+def generate_audio(docx_file, speaker_label):
+    speaker_id = get_speaker_id_from_label(speaker_label)
+    model = TTS("tts_models/en/vctk/vits")
+
+    paragraphs = extract_paragraphs_from_docx(docx_file)
+    combined_audio = AudioSegment.empty()
+    temp_files = []
+
+    try:
+        for idx, para in enumerate(paragraphs):
+            tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+            model.tts_to_file(text=para, speaker="p"+speaker_id, file_path=tmp.name)
+            audio_chunk = AudioSegment.from_wav(tmp.name)
+            combined_audio += audio_chunk
+            temp_files.append(tmp.name)
+            tmp.close()
+
+    except Exception as e:
+        print("Generation interrupted. Saving partial output.", e)
+
+    output_dir = tempfile.mkdtemp()
+    final_output_path = os.path.join(output_dir, "final_output.wav")
+    combined_audio.export(final_output_path, format="wav")
+
+    zip_path = os.path.join(output_dir, "output.zip")
+    with zipfile.ZipFile(zip_path, 'w') as zipf:
+        zipf.write(final_output_path, arcname="final_output.wav")
+
+    for f in temp_files:
+        os.remove(f)
+
+    return zip_path
+
+# --- UI ---
+speaker_choices = list_speaker_choices()
+
 with gr.Blocks() as demo:
-    gr.Markdown("## 📄 TTS Voice Generator with Multiple Engines")
-
-    # Engine dropdown
-    engine_dropdown = gr.Dropdown(
-        label="Select TTS Engine",
-        choices=["bark", "coqui", "vctk"],
-        value="vctk"
-    )
-
-    # Speaker/Prompt dropdown (dynamic)
-    speaker_dropdown = gr.Dropdown(label="Select Speaker", visible=False)
-    prompt_dropdown = gr.Dropdown(label="Select Prompt", visible=False)
-
-    # Sample text box
-    sample_textbox = gr.Textbox(label="Enter Sample Text (Max 500 characters)", max_lines=5)
-    sample_audio = gr.Audio(label="Sample Output Audio", type="filepath")
-
-    # Define metadata choices for speakers (Coqui and VCTK)
-    speaker_choices_vctk_coqui = list_speaker_choices(SPEAKER_METADATA)
-    speaker_dropdown.choices = speaker_choices_vctk_coqui # Use metadata for VCTK/Coqui speakers
-
-    # Define Bark prompts (choose from predefined prompts)
-    prompt_dropdown.choices = BARK_PROMPTS
-
-    # Dynamically update dropdown visibility based on engine selection
-    def update_dropdowns(engine):
-        if engine == "bark":
-            speaker_dropdown.visible = False
-            prompt_dropdown.visible = True
-        elif engine == "coqui" or engine == "vctk":
-            speaker_dropdown.visible = True
-            prompt_dropdown.visible = False
-        return gr.update(visible=speaker_dropdown.visible), gr.update(visible=prompt_dropdown.visible)
-
-    # Trigger dropdown visibility changes
-    engine_dropdown.change(update_dropdowns, inputs=engine_dropdown, outputs=[speaker_dropdown, prompt_dropdown])
-
-    # Button to generate audio from sample text
-    generate_button = gr.Button("Generate Audio")
-    generate_button.click(
-        fn=generate_audio,
-        inputs=[sample_textbox, speaker_dropdown, engine_dropdown],
-        outputs=[sample_audio]
-    )
-
-    # Button to clear the sample text and audio
-    def clear_sample():
-        return "", None
-
-    clear_button = gr.Button("Clear")
-    clear_button.click(fn=clear_sample, inputs=[], outputs=[sample_textbox, sample_audio])
+    gr.Markdown("## 📄 TTS Voice Generator with Paragraph-Wise Processing")
 
+    with gr.Row():
+        speaker_dropdown = gr.Dropdown(label="Select Voice", choices=speaker_choices)
 
-if __name__ == "__main__":
-    demo.launch()
+    with gr.Row():
+        sample_textbox = gr.Textbox(label="Enter Sample Text (Max 500 characters)", max_lines=5)
+        sample_button = gr.Button("Generate Sample")
+        clear_button = gr.Button("Clear Sample")
+
+    sample_audio = gr.Audio(label="Sample Output", type="filepath")
+
+    sample_button.click(fn=generate_sample_audio, inputs=[sample_textbox, speaker_dropdown], outputs=[sample_audio])
+    clear_button.click(fn=lambda: None, inputs=[], outputs=[sample_audio])
 
+    with gr.Row():
+        docx_input = gr.File(label="Upload DOCX File", file_types=[".docx"])
+        generate_button = gr.Button("Generate Full Audio")
+        download_output = gr.File(label="Download Output Zip")
 
+    generate_button.click(fn=generate_audio, inputs=[docx_input, speaker_dropdown], outputs=[download_output])
+
+if __name__ == "__main__":
+    demo.launch()
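
The added functions rely on several names that are defined outside the changed hunks (re, os, zipfile, tempfile, gr, TTS, Document, AudioSegment). The import block at the top of app.py is not part of this diff, so the sketch below is only an illustration of what those imports would plausibly look like, assuming Coqui TTS, python-docx, pydub, and Gradio.

# Illustrative sketch only: the real import block sits outside the diff context,
# so the exact names and ordering here are assumptions, not the committed code.
import os
import re
import tempfile
import zipfile

import gradio as gr
from docx import Document        # python-docx, used by extract_paragraphs_from_docx
from pydub import AudioSegment   # paragraph-wise concatenation in generate_audio
from TTS.api import TTS          # Coqui TTS; loads "tts_models/en/vctk/vits"

One design consequence worth noting: because the except branch in generate_audio only logs the error and execution continues to the export and zip steps, a failure partway through a document still produces a partial final_output.wav rather than no output at all.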