File size: 8,772 Bytes
a3e2313
bebc496
 
 
68f40ec
 
bebc496
b9bf9b2
40ede2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152fe30
b9bf9b2
40ede2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bebc496
40ede2a
 
 
 
 
 
bebc496
40ede2a
 
bebc496
40ede2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a34b148
40ede2a
 
 
b9bf9b2
40ede2a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import os
os.environ["NUMBA_DISABLE_CACHE"] = "1"

import gradio as gr
from docx import Document
from TTS.api import TTS
import tempfile

# Speaker metadata embedded directly in the script (no external file is read).
# Keys are integer speaker IDs; each value records that speaker's "audio_id",
# "age" (in years), "gender" ("F" or "M") and "accent".
# NOTE(review): the IDs look like VCTK corpus speaker numbers -- confirm the source.
SPEAKER_METADATA = {
    300: {"audio_id": 1, "age": 23, "gender": "F", "accent": "American"},
    271: {"audio_id": 1, "age": 19, "gender": "M", "accent": "Scottish"},
    287: {"audio_id": 1, "age": 23, "gender": "M", "accent": "English"},
    262: {"audio_id": 1, "age": 23, "gender": "F", "accent": "Scottish"},
    284: {"audio_id": 1, "age": 20, "gender": "M", "accent": "Scottish"},
    297: {"audio_id": 1, "age": 20, "gender": "F", "accent": "American"},
    227: {"audio_id": 1, "age": 38, "gender": "M", "accent": "English"},
    246: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Scottish"},
    225: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
    259: {"audio_id": 1, "age": 23, "gender": "M", "accent": "English"},
    252: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Scottish"},
    231: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
    266: {"audio_id": 1, "age": 22, "gender": "F", "accent": "Irish"},
    241: {"audio_id": 1, "age": 21, "gender": "M", "accent": "Scottish"},
    312: {"audio_id": 1, "age": 19, "gender": "F", "accent": "Canadian"},
    329: {"audio_id": 1, "age": 23, "gender": "F", "accent": "American"},
    232: {"audio_id": 1, "age": 23, "gender": "M", "accent": "English"},
    305: {"audio_id": 1, "age": 19, "gender": "F", "accent": "American"},
    311: {"audio_id": 1, "age": 21, "gender": "M", "accent": "American"},
    301: {"audio_id": 1, "age": 23, "gender": "F", "accent": "American"},
    304: {"audio_id": 1, "age": 22, "gender": "M", "accent": "NorthernIrish"},
    310: {"audio_id": 1, "age": 21, "gender": "F", "accent": "American"},
    260: {"audio_id": 1, "age": 21, "gender": "M", "accent": "Scottish"},
    315: {"audio_id": 1, "age": 18, "gender": "M", "accent": "American"},
    374: {"audio_id": 1, "age": 28, "gender": "M", "accent": "Australian"},
    364: {"audio_id": 1, "age": 23, "gender": "M", "accent": "Irish"},
    269: {"audio_id": 1, "age": 20, "gender": "F", "accent": "English"},
    345: {"audio_id": 1, "age": 22, "gender": "M", "accent": "American"},
    326: {"audio_id": 1, "age": 26, "gender": "M", "accent": "Australian"},
    343: {"audio_id": 1, "age": 27, "gender": "F", "accent": "Canadian"},
    230: {"audio_id": 1, "age": 22, "gender": "F", "accent": "English"},
    376: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Indian"},
    240: {"audio_id": 1, "age": 21, "gender": "F", "accent": "English"},
    298: {"audio_id": 1, "age": 19, "gender": "M", "accent": "Irish"},
    272: {"audio_id": 1, "age": 23, "gender": "M", "accent": "Scottish"},
    248: {"audio_id": 1, "age": 23, "gender": "F", "accent": "Indian"},
    264: {"audio_id": 1, "age": 23, "gender": "F", "accent": "Scottish"},
    250: {"audio_id": 1, "age": 22, "gender": "F", "accent": "English"},
    292: {"audio_id": 1, "age": 23, "gender": "M", "accent": "NorthernIrish"},
    237: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Scottish"},
    363: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Canadian"},
    313: {"audio_id": 1, "age": 24, "gender": "F", "accent": "Irish"},
    285: {"audio_id": 1, "age": 21, "gender": "M", "accent": "Scottish"},
    268: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
    302: {"audio_id": 1, "age": 20, "gender": "M", "accent": "Canadian"},
    261: {"audio_id": 1, "age": 26, "gender": "F", "accent": "NorthernIrish"},
    336: {"audio_id": 1, "age": 18, "gender": "F", "accent": "SouthAfrican"},
    288: {"audio_id": 1, "age": 22, "gender": "F", "accent": "Irish"},
    226: {"audio_id": 1, "age": 22, "gender": "M", "accent": "English"},
    277: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
    360: {"audio_id": 1, "age": 19, "gender": "M", "accent": "American"},
    257: {"audio_id": 1, "age": 24, "gender": "F", "accent": "English"},
    254: {"audio_id": 1, "age": 21, "gender": "M", "accent": "English"},
    339: {"audio_id": 1, "age": 21, "gender": "F", "accent": "American"},
    323: {"audio_id": 1, "age": 19, "gender": "F", "accent": "SouthAfrican"},
    255: {"audio_id": 1, "age": 19, "gender": "M", "accent": "Scottish"},
    249: {"audio_id": 1, "age": 22, "gender": "F", "accent": "Scottish"},
    293: {"audio_id": 1, "age": 22, "gender": "F", "accent": "NorthernIrish"},
    244: {"audio_id": 1, "age": 22, "gender": "F", "accent": "English"},
    245: {"audio_id": 1, "age": 25, "gender": "M", "accent": "Irish"},
    361: {"audio_id": 1, "age": 19, "gender": "F", "accent": "American"},
    314: {"audio_id": 1, "age": 26, "gender": "F", "accent": "SouthAfrican"},
    308: {"audio_id": 1, "age": 18, "gender": "F", "accent": "American"},
    229: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
    341: {"audio_id": 1, "age": 26, "gender": "F", "accent": "American"},
    275: {"audio_id": 1, "age": 23, "gender": "M", "accent": "Scottish"},
    263: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Scottish"},
    253: {"audio_id": 1, "age": 22, "gender": "F", "accent": "Welsh"},
    299: {"audio_id": 1, "age": 25, "gender": "F", "accent": "American"},
    316: {"audio_id": 1, "age": 20, "gender": "M", "accent": "Canadian"},
    282: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
    362: {"audio_id": 1, "age": 29, "gender": "F", "accent": "American"},
    294: {"audio_id": 1, "age": 33, "gender": "F", "accent": "American"},
    274: {"audio_id": 1, "age": 22, "gender": "M", "accent": "English"},
    279: {"audio_id": 1, "age": 23, "gender": "M", "accent": "English"},
    281: {"audio_id": 1, "age": 29, "gender": "M", "accent": "Scottish"},
    286: {"audio_id": 1, "age": 23, "gender": "M", "accent": "English"},
    258: {"audio_id": 1, "age": 22, "gender": "M", "accent": "English"},
    247: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Scottish"},
    351: {"audio_id": 1, "age": 21, "gender": "F", "accent": "NorthernIrish"},
    283: {"audio_id": 1, "age": 24, "gender": "F", "accent": "Irish"},
    334: {"audio_id": 1, "age": 18, "gender": "M", "accent": "American"},
    333: {"audio_id": 1, "age": 19, "gender": "F", "accent": "American"},
    295: {"audio_id": 1, "age": 23, "gender": "F", "accent": "Irish"},
    330: {"audio_id": 1, "age": 26, "gender": "F", "accent": "American"},
    335: {"audio_id": 1, "age": 25, "gender": "F", "accent": "NewZealand"},
    228: {"audio_id": 1, "age": 22, "gender": "F", "accent": "English"},
    267: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
    273: {"audio_id": 1, "age": 18, "gender": "F", "accent": "English"}
}

# Load the TTS model once at import time; generate_audio() reuses this single
# instance for every request. gpu=False asks for CPU inference.
# NOTE(review): the LJSpeech Tacotron2-DDC model is presumably single-speaker,
# so the speaker chosen in the UI may have no effect (or be rejected) -- confirm
# whether a multi-speaker model was intended here.
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", gpu=False)

# Extract speakers from metadata
def extract_speakers(voice_selection):
    """Build the speaker dropdown entries for the chosen voice.

    Only the "english" voice has speakers; any other selection yields an
    empty list. Each entry is a ``(speaker id as string, label)`` tuple in
    SPEAKER_METADATA order, where the label reads like
    ``"p300 (F, American, 23 yrs)"``.
    """
    if voice_selection != "english":
        return []
    return [
        (
            str(sid),
            f"p{sid} ({info['gender']}, {info['accent']}, {info['age']} yrs)",
        )
        for sid, info in SPEAKER_METADATA.items()
    ]

# Update the speaker dropdown based on selected voice
def update_speaker_dropdown(voice_selection):
    """Refresh the speaker dropdown after the voice selection changes.

    Args:
        voice_selection: Current value of the voice dropdown (e.g. "english").

    Returns:
        A Gradio update for the speaker dropdown: its choices are replaced
        with the speakers available for the voice, the first entry's id is
        preselected, and the dropdown is hidden when there are no speakers.
    """
    speaker_choices = extract_speakers(voice_selection)
    # gr.update() is the version-independent way to patch a component; the
    # per-component gr.Dropdown.update() classmethod was removed in Gradio 4.
    return gr.update(
        choices=speaker_choices,
        visible=bool(speaker_choices),
        value=speaker_choices[0][0] if speaker_choices else None,
    )

# Generate speech from text (text-to-speech)
def generate_audio(voice_selection, speaker_selection, text_input):
    """Synthesize ``text_input`` to a WAV file and return the file's path.

    Args:
        voice_selection: Value of the voice dropdown. Currently unused; kept
            so the click handler's input list does not have to change.
        speaker_selection: Value of the speaker dropdown. May be a bare id
            ("300"), a label that starts with one ("p300 (F, ...)"), or None
            when no speaker has been picked.
        text_input: Text to speak.

    Returns:
        Path of the generated ``.wav`` file, or None when there is no text
        to synthesize.

    Raises:
        gr.Error: If the loaded model is multi-speaker and no speaker id can
            be read from ``speaker_selection``.
    """
    import re  # local import: only this function needs it

    if not text_input or not text_input.strip():
        return None

    synth_kwargs = {"text": text_input}
    # Coqui TTS rejects a `speaker` argument for single-speaker models and
    # requires one for multi-speaker models, so only pass it when applicable.
    if getattr(tts, "is_multi_speaker", False):
        id_match = re.search(r"\d+", str(speaker_selection or ""))
        if id_match is None:
            raise gr.Error("Please select a speaker before generating audio.")
        # Speakers are addressed by name rather than by integer; "p<id>"
        # mirrors the dropdown labels.
        # NOTE(review): confirm the names against tts.speakers for the model.
        synth_kwargs["speaker"] = f"p{id_match.group()}"

    # mkstemp creates the file atomically, unlike the deprecated and
    # race-prone mktemp; close our handle so the TTS library can write to it.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    # Keyword arguments are required here: the second positional parameter of
    # tts_to_file is `speaker`, not the output path.
    tts.tts_to_file(file_path=wav_path, **synth_kwargs)
    return wav_path

# Gradio interface
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            voice_dropdown = gr.Dropdown(
                choices=["english", "other"],
                label="Select Voice",
                value="english",
            )
            # Starts hidden and empty; update_speaker_dropdown fills it in.
            speaker_dropdown = gr.Dropdown(
                label="Select Speaker",
                visible=False,
            )
            text_input = gr.Textbox(label="Enter text")
            audio_output = gr.Audio(label="Generated Audio")

    # Re-populate the speaker list whenever the user picks a different voice.
    voice_dropdown.change(
        fn=update_speaker_dropdown,
        inputs=voice_dropdown,
        outputs=speaker_dropdown,
    )
    # The change event only fires on user edits, so without this hook the
    # speaker list would stay hidden for the preselected "english" voice
    # until the user switched the voice away and back again.
    demo.load(
        fn=update_speaker_dropdown,
        inputs=voice_dropdown,
        outputs=speaker_dropdown,
    )

    generate_button = gr.Button("Generate Audio")
    generate_button.click(
        fn=generate_audio,
        inputs=[voice_dropdown, speaker_dropdown, text_input],
        outputs=audio_output,
    )

demo.launch()