"""Gradio front-end for Coqui TTS with a per-speaker voice picker."""

import os

# Set before the TTS import below so the variable is already in the
# environment when that import chain runs.
# NOTE(review): presumably consumed by numba (pulled in via the audio
# stack) -- confirm the variable name against the numba version in use.
os.environ["NUMBA_DISABLE_CACHE"] = "1"

import gradio as gr
from docx import Document  # not used in this part of the file; kept as-is
from TTS.api import TTS
import tempfile

# Speaker metadata embedded directly in the script.
# Keys are numeric speaker ids; the UI and the synthesizer refer to them as
# "p<ID>".
# NOTE(review): ids, ages and accents look like the VCTK corpus speaker
# table -- confirm.
SPEAKER_METADATA = {
    300: {"audio_id": 1, "age": 23, "gender": "F", "accent": "American"},
    271: {"audio_id": 1, "age": 19, "gender": "M", "accent": "Scottish"},
    287: {"audio_id": 1, "age": 23, "gender": "M", "accent": "English"},
    262: {"audio_id": 1, "age": 23, "gender": "F", "accent": "Scottish"},
    284: {"audio_id": 1, "age": 20, "gender": "M", "accent": "Scottish"},
    297: {"audio_id": 1, "age": 20, "gender": "F", "accent": "American"},
    227: {"audio_id": 1, "age": 38, "gender": "M", "accent": "English"},
    246: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Scottish"},
    225: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
    259: {"audio_id": 1, "age": 23, "gender": "M", "accent": "English"},
    252: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Scottish"},
    231: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
    266: {"audio_id": 1, "age": 22, "gender": "F", "accent": "Irish"},
    241: {"audio_id": 1, "age": 21, "gender": "M", "accent": "Scottish"},
    312: {"audio_id": 1, "age": 19, "gender": "F", "accent": "Canadian"},
    329: {"audio_id": 1, "age": 23, "gender": "F", "accent": "American"},
    232: {"audio_id": 1, "age": 23, "gender": "M", "accent": "English"},
    305: {"audio_id": 1, "age": 19, "gender": "F", "accent": "American"},
    311: {"audio_id": 1, "age": 21, "gender": "M", "accent": "American"},
    301: {"audio_id": 1, "age": 23, "gender": "F", "accent": "American"},
    304: {"audio_id": 1, "age": 22, "gender": "M", "accent": "NorthernIrish"},
    310: {"audio_id": 1, "age": 21, "gender": "F", "accent": "American"},
    260: {"audio_id": 1, "age": 21, "gender": "M", "accent": "Scottish"},
    315: {"audio_id": 1, "age": 18, "gender": "M", "accent": "American"},
    374: {"audio_id": 1, "age": 28, "gender": "M", "accent": "Australian"},
    364: {"audio_id": 1, "age": 23, "gender": "M", "accent": "Irish"},
    269: {"audio_id": 1, "age": 20, "gender": "F", "accent": "English"},
    345: {"audio_id": 1, "age": 22, "gender": "M", "accent": "American"},
    326: {"audio_id": 1, "age": 26, "gender": "M", "accent": "Australian"},
    343: {"audio_id": 1, "age": 27, "gender": "F", "accent": "Canadian"},
    230: {"audio_id": 1, "age": 22, "gender": "F", "accent": "English"},
    376: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Indian"},
    240: {"audio_id": 1, "age": 21, "gender": "F", "accent": "English"},
    298: {"audio_id": 1, "age": 19, "gender": "M", "accent": "Irish"},
    272: {"audio_id": 1, "age": 23, "gender": "M", "accent": "Scottish"},
    248: {"audio_id": 1, "age": 23, "gender": "F", "accent": "Indian"},
    264: {"audio_id": 1, "age": 23, "gender": "F", "accent": "Scottish"},
    250: {"audio_id": 1, "age": 22, "gender": "F", "accent": "English"},
    292: {"audio_id": 1, "age": 23, "gender": "M", "accent": "NorthernIrish"},
    237: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Scottish"},
    363: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Canadian"},
    313: {"audio_id": 1, "age": 24, "gender": "F", "accent": "Irish"},
    285: {"audio_id": 1, "age": 21, "gender": "M", "accent": "Scottish"},
    268: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
    302: {"audio_id": 1, "age": 20, "gender": "M", "accent": "Canadian"},
    261: {"audio_id": 1, "age": 26, "gender": "F", "accent": "NorthernIrish"},
    336: {"audio_id": 1, "age": 18, "gender": "F", "accent": "SouthAfrican"},
    288: {"audio_id": 1, "age": 22, "gender": "F", "accent": "Irish"},
    226: {"audio_id": 1, "age": 22, "gender": "M", "accent": "English"},
    277: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
    360: {"audio_id": 1, "age": 19, "gender": "M", "accent": "American"},
    257: {"audio_id": 1, "age": 24, "gender": "F", "accent": "English"},
    254: {"audio_id": 1, "age": 21, "gender": "M", "accent": "English"},
    339: {"audio_id": 1, "age": 21, "gender": "F", "accent": "American"},
    323: {"audio_id": 1, "age": 19, "gender": "F", "accent": "SouthAfrican"},
    255: {"audio_id": 1, "age": 19, "gender": "M", "accent": "Scottish"},
    249: {"audio_id": 1, "age": 22, "gender": "F", "accent": "Scottish"},
    293: {"audio_id": 1, "age": 22, "gender": "F", "accent": "NorthernIrish"},
    244: {"audio_id": 1, "age": 22, "gender": "F", "accent": "English"},
    245: {"audio_id": 1, "age": 25, "gender": "M", "accent": "Irish"},
    361: {"audio_id": 1, "age": 19, "gender": "F", "accent": "American"},
    314: {"audio_id": 1, "age": 26, "gender": "F", "accent": "SouthAfrican"},
    308: {"audio_id": 1, "age": 18, "gender": "F", "accent": "American"},
    229: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
    341: {"audio_id": 1, "age": 26, "gender": "F", "accent": "American"},
    275: {"audio_id": 1, "age": 23, "gender": "M", "accent": "Scottish"},
    263: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Scottish"},
    253: {"audio_id": 1, "age": 22, "gender": "F", "accent": "Welsh"},
    299: {"audio_id": 1, "age": 25, "gender": "F", "accent": "American"},
    316: {"audio_id": 1, "age": 20, "gender": "M", "accent": "Canadian"},
    282: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
    362: {"audio_id": 1, "age": 29, "gender": "F", "accent": "American"},
    294: {"audio_id": 1, "age": 33, "gender": "F", "accent": "American"},
    274: {"audio_id": 1, "age": 22, "gender": "M", "accent": "English"},
    279: {"audio_id": 1, "age": 23, "gender": "M", "accent": "English"},
    281: {"audio_id": 1, "age": 29, "gender": "M", "accent": "Scottish"},
    286: {"audio_id": 1, "age": 23, "gender": "M", "accent": "English"},
    258: {"audio_id": 1, "age": 22, "gender": "M", "accent": "English"},
    247: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Scottish"},
    351: {"audio_id": 1, "age": 21, "gender": "F", "accent": "NorthernIrish"},
    283: {"audio_id": 1, "age": 24, "gender": "F", "accent": "Irish"},
    334: {"audio_id": 1, "age": 18, "gender": "M", "accent": "American"},
    333: {"audio_id": 1, "age": 19, "gender": "F", "accent": "American"},
    295: {"audio_id": 1, "age": 23, "gender": "F", "accent": "Irish"},
    330: {"audio_id": 1, "age": 26, "gender": "F", "accent": "American"},
    335: {"audio_id": 1, "age": 25, "gender": "F", "accent": "NewZealand"},
    228: {"audio_id": 1, "age": 22, "gender": "F", "accent": "English"},
    267: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
    273: {"audio_id": 1, "age": 18, "gender": "F", "accent": "English"},
}

# Voice that is pre-selected when the UI first renders.
DEFAULT_VOICE = "english"

# NOTE(review): this appears to be a single-speaker model, while
# SPEAKER_METADATA describes many speakers. With this model the speaker
# choice cannot affect the audio; a multi-speaker model whose speaker names
# are "p<ID>" (e.g. "tts_models/en/vctk/vits") is probably what was
# intended -- confirm before switching, since it changes what is downloaded.
MODEL_NAME = "tts_models/en/ljspeech/tacotron2-DDC"

# Load the TTS model once at start-up (CPU only).
tts = TTS(model_name=MODEL_NAME, gpu=False)


def extract_speakers(voice_selection):
    """Build the speaker dropdown choices for a voice.

    Args:
        voice_selection: Value of the voice dropdown. Only ``"english"``
            has speakers; any other value yields an empty list.

    Returns:
        A list of ``(label, value)`` tuples in ``SPEAKER_METADATA`` order.
        ``label`` is the human-readable text shown in the dropdown and
        ``value`` is the speaker id as a string. Gradio (>= 4) interprets
        tuple choices as ``(label, value)``, so the id is what the click
        handler receives.
    """
    if voice_selection != "english":
        return []
    return [
        (
            f"p{speaker_id} ({metadata['gender']}, {metadata['accent']}, "
            f"{metadata['age']} yrs)",
            str(speaker_id),
        )
        for speaker_id, metadata in SPEAKER_METADATA.items()
    ]


def update_speaker_dropdown(voice_selection):
    """Refresh the speaker dropdown after the voice selection changes.

    Args:
        voice_selection: Newly selected value of the voice dropdown.

    Returns:
        A ``gr.update`` payload that replaces the choices, hides the
        dropdown when there are none, and selects the first speaker's value
        (or ``None`` when the list is empty).
    """
    speaker_choices = extract_speakers(voice_selection)
    # Index 1 is the value half of the (label, value) tuple.
    default = speaker_choices[0][1] if speaker_choices else None
    # gr.update is used instead of gr.Dropdown.update, which newer Gradio
    # releases no longer provide.
    return gr.update(
        choices=speaker_choices,
        visible=bool(speaker_choices),
        value=default,
    )


def _speaker_name(speaker_selection):
    """Translate a dropdown selection into the model's "p<ID>" speaker name.

    Accepts a bare id (``"300"`` or ``300``), a prefixed id (``"p300"``) or
    a full label (``"p300 (F, American, 23 yrs)"``).

    Raises:
        gr.Error: If the selection does not name a speaker present in
            ``SPEAKER_METADATA``.
    """
    parts = str(speaker_selection).split()
    token = parts[0].lstrip("pP") if parts else ""
    try:
        speaker_id = int(token)
    except ValueError:
        raise gr.Error(f"Unknown speaker: {speaker_selection!r}") from None
    if speaker_id not in SPEAKER_METADATA:
        raise gr.Error(f"Unknown speaker: {speaker_selection!r}")
    return f"p{speaker_id}"


def generate_audio(voice_selection, speaker_selection, text_input):
    """Synthesize ``text_input`` to a WAV file and return its path.

    Args:
        voice_selection: Value of the voice dropdown. Currently unused; it
            is kept so the signature matches the click handler's inputs.
        speaker_selection: Value of the speaker dropdown (speaker id as a
            string), or ``None`` when nothing is selected.
        text_input: Text to speak.

    Returns:
        Path of a newly created temporary ``.wav`` file. The file is not
        deleted by this function.

    Raises:
        gr.Error: If the text is empty, or if the loaded model is
            multi-speaker and no valid speaker is selected.
    """
    if not text_input or not text_input.strip():
        raise gr.Error("Please enter some text to synthesize.")

    synth_kwargs = {}
    # Coqui rejects a `speaker` argument for single-speaker models, so the
    # selection is forwarded only when the loaded model supports it.
    if getattr(tts, "is_multi_speaker", False):
        if speaker_selection is None:
            raise gr.Error("Please select a speaker.")
        synth_kwargs["speaker"] = _speaker_name(speaker_selection)

    # NamedTemporaryFile creates the file atomically; tempfile.mktemp only
    # returns a name and is deprecated because of the resulting race.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        output_path = handle.name

    # Keyword arguments are required here: the second positional parameter
    # of tts_to_file is `speaker`, not the output path.
    tts.tts_to_file(text=text_input, file_path=output_path, **synth_kwargs)
    return output_path


# Gradio interface
_initial_speakers = extract_speakers(DEFAULT_VOICE)

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            voice_dropdown = gr.Dropdown(
                choices=["english", "other"],
                label="Select Voice",
                value=DEFAULT_VOICE,
            )
            # Populated for the default voice up front: the change event
            # below only fires when the user picks a different voice, so
            # without this the dropdown would start hidden and empty.
            speaker_dropdown = gr.Dropdown(
                choices=_initial_speakers,
                value=_initial_speakers[0][1] if _initial_speakers else None,
                label="Select Speaker",
                visible=bool(_initial_speakers),
            )
            text_input = gr.Textbox(label="Enter text")
            audio_output = gr.Audio(label="Generated Audio")
            generate_button = gr.Button("Generate Audio")

    voice_dropdown.change(
        fn=update_speaker_dropdown,
        inputs=voice_dropdown,
        outputs=speaker_dropdown,
    )
    generate_button.click(
        generate_audio,
        inputs=[voice_dropdown, speaker_dropdown, text_input],
        outputs=audio_output,
    )

if __name__ == "__main__":
    demo.launch()