# Page header captured along with this script: "Spaces: Sleeping" (not part of the program).
import os

# Set before the TTS import below so the setting is already in the environment
# when that package (and anything it pulls in) is loaded.
# NOTE(review): presumably works around numba cache-directory problems on a
# read-only or sandboxed host - confirm it is still needed.
os.environ["NUMBA_DISABLE_CACHE"] = "1"

import gradio as gr
from docx import Document
from TTS.api import TTS
import tempfile
# Speaker metadata embedded directly in the script so no external file is
# needed at runtime.  Keys are integer speaker IDs; each value records the
# speaker's audio_id, age, gender ("F"/"M") and accent.
SPEAKER_METADATA = {
    300: {"audio_id": 1, "age": 23, "gender": "F", "accent": "American"},
    271: {"audio_id": 1, "age": 19, "gender": "M", "accent": "Scottish"},
    287: {"audio_id": 1, "age": 23, "gender": "M", "accent": "English"},
    262: {"audio_id": 1, "age": 23, "gender": "F", "accent": "Scottish"},
    284: {"audio_id": 1, "age": 20, "gender": "M", "accent": "Scottish"},
    297: {"audio_id": 1, "age": 20, "gender": "F", "accent": "American"},
    227: {"audio_id": 1, "age": 38, "gender": "M", "accent": "English"},
    246: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Scottish"},
    225: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
    259: {"audio_id": 1, "age": 23, "gender": "M", "accent": "English"},
    252: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Scottish"},
    231: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
    266: {"audio_id": 1, "age": 22, "gender": "F", "accent": "Irish"},
    241: {"audio_id": 1, "age": 21, "gender": "M", "accent": "Scottish"},
    312: {"audio_id": 1, "age": 19, "gender": "F", "accent": "Canadian"},
    329: {"audio_id": 1, "age": 23, "gender": "F", "accent": "American"},
    232: {"audio_id": 1, "age": 23, "gender": "M", "accent": "English"},
    305: {"audio_id": 1, "age": 19, "gender": "F", "accent": "American"},
    311: {"audio_id": 1, "age": 21, "gender": "M", "accent": "American"},
    301: {"audio_id": 1, "age": 23, "gender": "F", "accent": "American"},
    304: {"audio_id": 1, "age": 22, "gender": "M", "accent": "NorthernIrish"},
    310: {"audio_id": 1, "age": 21, "gender": "F", "accent": "American"},
    260: {"audio_id": 1, "age": 21, "gender": "M", "accent": "Scottish"},
    315: {"audio_id": 1, "age": 18, "gender": "M", "accent": "American"},
    374: {"audio_id": 1, "age": 28, "gender": "M", "accent": "Australian"},
    364: {"audio_id": 1, "age": 23, "gender": "M", "accent": "Irish"},
    269: {"audio_id": 1, "age": 20, "gender": "F", "accent": "English"},
    345: {"audio_id": 1, "age": 22, "gender": "M", "accent": "American"},
    326: {"audio_id": 1, "age": 26, "gender": "M", "accent": "Australian"},
    343: {"audio_id": 1, "age": 27, "gender": "F", "accent": "Canadian"},
    230: {"audio_id": 1, "age": 22, "gender": "F", "accent": "English"},
    376: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Indian"},
    240: {"audio_id": 1, "age": 21, "gender": "F", "accent": "English"},
    298: {"audio_id": 1, "age": 19, "gender": "M", "accent": "Irish"},
    272: {"audio_id": 1, "age": 23, "gender": "M", "accent": "Scottish"},
    248: {"audio_id": 1, "age": 23, "gender": "F", "accent": "Indian"},
    264: {"audio_id": 1, "age": 23, "gender": "F", "accent": "Scottish"},
    250: {"audio_id": 1, "age": 22, "gender": "F", "accent": "English"},
    292: {"audio_id": 1, "age": 23, "gender": "M", "accent": "NorthernIrish"},
    237: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Scottish"},
    363: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Canadian"},
    313: {"audio_id": 1, "age": 24, "gender": "F", "accent": "Irish"},
    285: {"audio_id": 1, "age": 21, "gender": "M", "accent": "Scottish"},
    268: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
    302: {"audio_id": 1, "age": 20, "gender": "M", "accent": "Canadian"},
    261: {"audio_id": 1, "age": 26, "gender": "F", "accent": "NorthernIrish"},
    336: {"audio_id": 1, "age": 18, "gender": "F", "accent": "SouthAfrican"},
    288: {"audio_id": 1, "age": 22, "gender": "F", "accent": "Irish"},
    226: {"audio_id": 1, "age": 22, "gender": "M", "accent": "English"},
    277: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
    360: {"audio_id": 1, "age": 19, "gender": "M", "accent": "American"},
    257: {"audio_id": 1, "age": 24, "gender": "F", "accent": "English"},
    254: {"audio_id": 1, "age": 21, "gender": "M", "accent": "English"},
    339: {"audio_id": 1, "age": 21, "gender": "F", "accent": "American"},
    323: {"audio_id": 1, "age": 19, "gender": "F", "accent": "SouthAfrican"},
    255: {"audio_id": 1, "age": 19, "gender": "M", "accent": "Scottish"},
    249: {"audio_id": 1, "age": 22, "gender": "F", "accent": "Scottish"},
    293: {"audio_id": 1, "age": 22, "gender": "F", "accent": "NorthernIrish"},
    244: {"audio_id": 1, "age": 22, "gender": "F", "accent": "English"},
    245: {"audio_id": 1, "age": 25, "gender": "M", "accent": "Irish"},
    361: {"audio_id": 1, "age": 19, "gender": "F", "accent": "American"},
    314: {"audio_id": 1, "age": 26, "gender": "F", "accent": "SouthAfrican"},
    308: {"audio_id": 1, "age": 18, "gender": "F", "accent": "American"},
    229: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
    341: {"audio_id": 1, "age": 26, "gender": "F", "accent": "American"},
    275: {"audio_id": 1, "age": 23, "gender": "M", "accent": "Scottish"},
    263: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Scottish"},
    253: {"audio_id": 1, "age": 22, "gender": "F", "accent": "Welsh"},
    299: {"audio_id": 1, "age": 25, "gender": "F", "accent": "American"},
    316: {"audio_id": 1, "age": 20, "gender": "M", "accent": "Canadian"},
    282: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
    362: {"audio_id": 1, "age": 29, "gender": "F", "accent": "American"},
    294: {"audio_id": 1, "age": 33, "gender": "F", "accent": "American"},
    274: {"audio_id": 1, "age": 22, "gender": "M", "accent": "English"},
    279: {"audio_id": 1, "age": 23, "gender": "M", "accent": "English"},
    281: {"audio_id": 1, "age": 29, "gender": "M", "accent": "Scottish"},
    286: {"audio_id": 1, "age": 23, "gender": "M", "accent": "English"},
    258: {"audio_id": 1, "age": 22, "gender": "M", "accent": "English"},
    247: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Scottish"},
    351: {"audio_id": 1, "age": 21, "gender": "F", "accent": "NorthernIrish"},
    283: {"audio_id": 1, "age": 24, "gender": "F", "accent": "Irish"},
    334: {"audio_id": 1, "age": 18, "gender": "M", "accent": "American"},
    333: {"audio_id": 1, "age": 19, "gender": "F", "accent": "American"},
    295: {"audio_id": 1, "age": 23, "gender": "F", "accent": "Irish"},
    330: {"audio_id": 1, "age": 26, "gender": "F", "accent": "American"},
    335: {"audio_id": 1, "age": 25, "gender": "F", "accent": "NewZealand"},
    228: {"audio_id": 1, "age": 22, "gender": "F", "accent": "English"},
    267: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
    273: {"audio_id": 1, "age": 18, "gender": "F", "accent": "English"},
}
# Load the TTS model once at import time (CPU only) so every request reuses it.
# NOTE(review): the model name points at an "ljspeech" model, while
# SPEAKER_METADATA lists many speakers with different accents - confirm this is
# the intended model; if it only has one voice, the speaker picker has no effect.
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", gpu=False)
# Extract speakers from metadata
def extract_speakers(voice_selection, metadata=None):
    """Return the selectable speakers for ``voice_selection``.

    Args:
        voice_selection: Name of the chosen voice. Only ``"english"`` has
            speakers; any other value yields an empty list.
        metadata: Optional mapping of speaker ID to a dict with ``"gender"``,
            ``"accent"`` and ``"age"`` keys. Defaults to the module-level
            ``SPEAKER_METADATA`` table.

    Returns:
        A list of ``(value, label)`` tuples in the mapping's iteration order,
        where ``value`` is the speaker ID as a string (e.g. ``"300"``) and
        ``label`` is a human-readable description such as
        ``"p300 (F, American, 23 yrs)"``.
    """
    # The voice check does not depend on the speaker, so decide once up front
    # instead of re-testing it for every metadata entry.
    if voice_selection != "english":
        return []

    if metadata is None:
        metadata = SPEAKER_METADATA

    return [
        (
            str(speaker_id),
            f"p{speaker_id} ({info['gender']}, {info['accent']}, {info['age']} yrs)",
        )
        for speaker_id, info in metadata.items()
    ]
# Update the speaker dropdown based on selected voice
def update_speaker_dropdown(voice_selection):
    """Build the update that refreshes the speaker dropdown for a voice.

    Args:
        voice_selection: The value currently chosen in the voice dropdown.

    Returns:
        A Gradio update carrying the new choices, the default value (the first
        speaker's ID, or ``None`` when there are no speakers) and a visibility
        flag that hides the dropdown when the voice has no speakers.
    """
    speaker_choices = extract_speakers(voice_selection)
    default = speaker_choices[0][0] if speaker_choices else None

    # extract_speakers yields (value, label); Gradio's tuple choices are
    # (label, value), so swap them here.  Otherwise the dropdown would show the
    # bare ID and hand the long label to generate_audio.
    dropdown_choices = [(label, value) for value, label in speaker_choices]

    # gr.update is used instead of gr.Dropdown.update, which newer Gradio
    # releases no longer provide.
    return gr.update(
        choices=dropdown_choices,
        visible=bool(speaker_choices),
        value=default,
    )
# Generate speech from text
def generate_audio(voice_selection, speaker_selection, text_input):
    """Synthesize ``text_input`` into a WAV file and return the file's path.

    Args:
        voice_selection: Selected voice name. Currently unused; kept so the
            signature matches the inputs wired to the button.
        speaker_selection: Speaker ID chosen in the dropdown, as a string such
            as ``"300"``. Only consulted when the loaded model reports that it
            is multi-speaker.
        text_input: The text to speak.

    Returns:
        Path of a newly created temporary ``.wav`` file containing the audio.
        The file is not deleted here; the caller (Gradio) reads it afterwards.

    Raises:
        gr.Error: If the text is empty, or if a multi-speaker model is loaded
            and no valid speaker was selected.
    """
    if not text_input or not text_input.strip():
        raise gr.Error("Please enter some text to synthesize.")

    synth_kwargs = {}
    # Only pass a speaker when the model actually supports several; a
    # single-speaker model rejects the argument.
    if getattr(tts, "is_multi_speaker", False):
        try:
            speaker_id = int(speaker_selection)
        except (TypeError, ValueError) as err:
            raise gr.Error("Please select a speaker.") from err
        # The speaker must be given by name, not as an int.
        # NOTE(review): assumes names follow the "p<ID>" pattern used in the
        # dropdown labels - confirm against tts.speakers.
        synth_kwargs["speaker"] = f"p{speaker_id}"

    # Reserve a unique path atomically (tempfile.mktemp is deprecated and
    # race-prone); delete=False keeps the file after the handle closes.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
        wav_path = wav_file.name

    # Keyword arguments are required: the second positional parameter of
    # tts_to_file is the speaker, not the output path.
    tts.tts_to_file(text=text_input, file_path=wav_path, **synth_kwargs)
    return wav_path
# Gradio interface
# NOTE(review): the original indentation was lost, so the Row/Column nesting
# below is a reconstruction; component creation order is unchanged.
with gr.Blocks() as demo:
    # The voice dropdown starts on "english", and its change event does not
    # fire for the initial value, so populate the speaker dropdown up front.
    # extract_speakers yields (value, label); Gradio expects (label, value).
    initial_speakers = extract_speakers("english")

    with gr.Row():
        with gr.Column():
            voice_dropdown = gr.Dropdown(
                choices=["english", "other"],
                label="Select Voice",
                value="english",
            )
            speaker_dropdown = gr.Dropdown(
                choices=[(label, value) for value, label in initial_speakers],
                value=initial_speakers[0][0] if initial_speakers else None,
                label="Select Speaker",
                visible=bool(initial_speakers),
            )
            text_input = gr.Textbox(label="Enter text")
            audio_output = gr.Audio(label="Generated Audio")

    # Refresh (or hide) the speaker list whenever a different voice is picked.
    voice_dropdown.change(
        fn=update_speaker_dropdown,
        inputs=voice_dropdown,
        outputs=speaker_dropdown,
    )

    generate_button = gr.Button("Generate Audio")
    generate_button.click(
        generate_audio,
        inputs=[voice_dropdown, speaker_dropdown, text_input],
        outputs=audio_output,
    )

# Start the server only when run as a script, so importing this module (for
# tests or tooling) does not launch the app.
if __name__ == "__main__":
    demo.launch()