SohomToom's picture
Update app.py
40ede2a verified
raw
history blame
8.77 kB
import os
os.environ["NUMBA_DISABLE_CACHE"] = "1"
import gradio as gr
from docx import Document
from TTS.api import TTS
import tempfile
# Speaker metadata is embedded directly in the script instead of being read
# from a separate file. Each row is (speaker_id, age, gender, accent); the row
# order below is the insertion order of SPEAKER_METADATA.
_SPEAKER_ROWS = (
    (300, 23, "F", "American"),
    (271, 19, "M", "Scottish"),
    (287, 23, "M", "English"),
    (262, 23, "F", "Scottish"),
    (284, 20, "M", "Scottish"),
    (297, 20, "F", "American"),
    (227, 38, "M", "English"),
    (246, 22, "M", "Scottish"),
    (225, 23, "F", "English"),
    (259, 23, "M", "English"),
    (252, 22, "M", "Scottish"),
    (231, 23, "F", "English"),
    (266, 22, "F", "Irish"),
    (241, 21, "M", "Scottish"),
    (312, 19, "F", "Canadian"),
    (329, 23, "F", "American"),
    (232, 23, "M", "English"),
    (305, 19, "F", "American"),
    (311, 21, "M", "American"),
    (301, 23, "F", "American"),
    (304, 22, "M", "NorthernIrish"),
    (310, 21, "F", "American"),
    (260, 21, "M", "Scottish"),
    (315, 18, "M", "American"),
    (374, 28, "M", "Australian"),
    (364, 23, "M", "Irish"),
    (269, 20, "F", "English"),
    (345, 22, "M", "American"),
    (326, 26, "M", "Australian"),
    (343, 27, "F", "Canadian"),
    (230, 22, "F", "English"),
    (376, 22, "M", "Indian"),
    (240, 21, "F", "English"),
    (298, 19, "M", "Irish"),
    (272, 23, "M", "Scottish"),
    (248, 23, "F", "Indian"),
    (264, 23, "F", "Scottish"),
    (250, 22, "F", "English"),
    (292, 23, "M", "NorthernIrish"),
    (237, 22, "M", "Scottish"),
    (363, 22, "M", "Canadian"),
    (313, 24, "F", "Irish"),
    (285, 21, "M", "Scottish"),
    (268, 23, "F", "English"),
    (302, 20, "M", "Canadian"),
    (261, 26, "F", "NorthernIrish"),
    (336, 18, "F", "SouthAfrican"),
    (288, 22, "F", "Irish"),
    (226, 22, "M", "English"),
    (277, 23, "F", "English"),
    (360, 19, "M", "American"),
    (257, 24, "F", "English"),
    (254, 21, "M", "English"),
    (339, 21, "F", "American"),
    (323, 19, "F", "SouthAfrican"),
    (255, 19, "M", "Scottish"),
    (249, 22, "F", "Scottish"),
    (293, 22, "F", "NorthernIrish"),
    (244, 22, "F", "English"),
    (245, 25, "M", "Irish"),
    (361, 19, "F", "American"),
    (314, 26, "F", "SouthAfrican"),
    (308, 18, "F", "American"),
    (229, 23, "F", "English"),
    (341, 26, "F", "American"),
    (275, 23, "M", "Scottish"),
    (263, 22, "M", "Scottish"),
    (253, 22, "F", "Welsh"),
    (299, 25, "F", "American"),
    (316, 20, "M", "Canadian"),
    (282, 23, "F", "English"),
    (362, 29, "F", "American"),
    (294, 33, "F", "American"),
    (274, 22, "M", "English"),
    (279, 23, "M", "English"),
    (281, 29, "M", "Scottish"),
    (286, 23, "M", "English"),
    (258, 22, "M", "English"),
    (247, 22, "M", "Scottish"),
    (351, 21, "F", "NorthernIrish"),
    (283, 24, "F", "Irish"),
    (334, 18, "M", "American"),
    (333, 19, "F", "American"),
    (295, 23, "F", "Irish"),
    (330, 26, "F", "American"),
    (335, 25, "F", "NewZealand"),
    (228, 22, "F", "English"),
    (267, 23, "F", "English"),
    (273, 18, "F", "English"),
)

# Maps speaker_id -> {"audio_id", "age", "gender", "accent"}.
# "audio_id" is 1 for every speaker.
SPEAKER_METADATA = {
    speaker_id: {"audio_id": 1, "age": age, "gender": gender, "accent": accent}
    for speaker_id, age, gender, accent in _SPEAKER_ROWS
}
# Load the TTS model once at import time (CPU only).
# The speaker ids in SPEAKER_METADATA (225-376) are VCTK speakers, so the
# multi-speaker VCTK VITS model is loaded here. The single-speaker LJSpeech
# model rejects any `speaker` argument, which makes speaker selection
# impossible.
tts = TTS(model_name="tts_models/en/vctk/vits", gpu=False)
# Extract speakers from metadata
def extract_speakers(voice_selection):
    """Build the speaker dropdown entries for the selected voice.

    Args:
        voice_selection: Voice name chosen in the UI. Only "english" has
            speakers.

    Returns:
        A list of ``(speaker id as str, descriptive label)`` tuples in
        SPEAKER_METADATA order, or an empty list for any other voice.
    """
    if voice_selection != "english":
        return []
    return [
        (
            str(sid),
            f"p{sid} ({info['gender']}, {info['accent']}, {info['age']} yrs)",
        )
        for sid, info in SPEAKER_METADATA.items()
    ]
# Update the speaker dropdown based on selected voice
def update_speaker_dropdown(voice_selection):
    """Return a component update that refreshes the speaker dropdown.

    Args:
        voice_selection: Voice name currently selected in the voice dropdown.

    Returns:
        A Gradio update carrying the speaker choices for that voice. The
        dropdown is shown with the first speaker preselected when choices
        exist, and hidden with no value otherwise.
    """
    speaker_choices = extract_speakers(voice_selection)
    default = speaker_choices[0][0] if speaker_choices else None
    # gr.update() replaces gr.Dropdown.update(): the per-component update
    # classmethods were removed in Gradio 4, whereas gr.update() is available
    # in both the 3.x and 4.x series.
    return gr.update(
        choices=speaker_choices,
        visible=bool(speaker_choices),
        value=default,
    )
# Generate speech from text
def generate_audio(voice_selection, speaker_selection, text_input):
    """Synthesize ``text_input`` into a WAV file and return its path.

    Args:
        voice_selection: Voice chosen in the UI. Not used for synthesis; it is
            accepted because the click handler passes it.
        speaker_selection: Numeric speaker id from the speaker dropdown
            (e.g. "300"). May be None or empty when nothing was selected.
        text_input: Text to synthesize.

    Returns:
        Path of the generated ``.wav`` file.

    Raises:
        ValueError: If the loaded model is multi-speaker and no speaker was
            selected, or the speaker id is not numeric.
    """
    speaker = None
    # Single-speaker models raise when given a speaker, so only resolve one
    # when the loaded model actually supports it.
    if getattr(tts, "is_multi_speaker", False):
        if not speaker_selection:
            raise ValueError("Select a speaker before generating audio.")
        # The model identifies speakers by name ("p<id>"), not by integer.
        speaker = f"p{int(speaker_selection)}"

    # mkstemp creates the file atomically; the deprecated mktemp only returns
    # a name and is open to a race. TTS reopens the path itself, so the
    # descriptor is closed right away.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    # Keyword arguments are required here: the second positional parameter of
    # tts_to_file is `speaker`, not the output path.
    tts.tts_to_file(text=text_input, speaker=speaker, file_path=wav_path)
    return wav_path
# Gradio interface: voice and speaker pickers, a text box, and the audio
# player that receives the synthesized file.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            voice_dropdown = gr.Dropdown(
                choices=["english", "other"],
                label="Select Voice",
                value="english",
            )
            # Starts hidden and empty; update_speaker_dropdown fills it in.
            speaker_dropdown = gr.Dropdown(
                label="Select Speaker",
                visible=False,
            )
            text_input = gr.Textbox(label="Enter text")
            audio_output = gr.Audio(label="Generated Audio")

    voice_dropdown.change(
        fn=update_speaker_dropdown,
        inputs=voice_dropdown,
        outputs=speaker_dropdown,
    )
    # The change event does not fire for the initial "english" value, so the
    # same update also runs on page load. Without it the speaker dropdown stays
    # hidden and no speaker can be chosen until the voice is toggled.
    demo.load(
        fn=update_speaker_dropdown,
        inputs=voice_dropdown,
        outputs=speaker_dropdown,
    )
    generate_button = gr.Button("Generate Audio")
    generate_button.click(
        generate_audio,
        inputs=[voice_dropdown, speaker_dropdown, text_input],
        outputs=audio_output,
    )

# Only start the server when run as a script, so importing this module (for
# tooling or reload mode) does not launch the app.
if __name__ == "__main__":
    demo.launch()