File size: 8,863 Bytes
9e3182d
 
 
bebc496
9e3182d
68f40ec
bebc496
b9d2659
c849c89
9e3182d
 
 
 
6e7a5e3
9e3182d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c16247c
9e3182d
 
 
 
 
bebc496
9e3182d
 
 
 
c849c89
9e3182d
 
 
 
c849c89
9e3182d
 
 
bebc496
9e3182d
 
c849c89
9e3182d
 
c849c89
b9d2659
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e3182d
 
 
 
c849c89
9e3182d
 
 
 
 
 
c849c89
9e3182d
 
c849c89
 
b9d2659
 
 
 
b9bf9b2
9e3182d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import os
os.environ["NUMBA_DISABLE_CACHE"] = "1"

import gradio as gr
from docx import Document
from TTS.api import TTS
import tempfile
import zipfile

# Model name handed to TTS(model_name=...) by load_tts_model(); it also serves
# as the key under which the loaded model is stored in MODEL_CACHE.
VOICE_MODEL = "tts_models/en/vctk/vits"

# Speaker table embedded directly in the module. Each row is
# (speaker id, age, gender, accent). Row order is preserved in the resulting
# dict, which is the order the speaker dropdown lists its entries in.
_SPEAKER_ROWS = (
    (300, 23, "F", "American"),
    (271, 19, "M", "Scottish"),
    (287, 23, "M", "English"),
    (262, 23, "F", "Scottish"),
    (284, 20, "M", "Scottish"),
    (297, 20, "F", "American"),
    (227, 38, "M", "English"),
    (246, 22, "M", "Scottish"),
    (225, 23, "F", "English"),
    (259, 23, "M", "English"),
    (252, 22, "M", "Scottish"),
    (231, 23, "F", "English"),
    (266, 22, "F", "Irish"),
    (241, 21, "M", "Scottish"),
    (312, 19, "F", "Canadian"),
    (329, 23, "F", "American"),
    (232, 23, "M", "English"),
    (305, 19, "F", "American"),
    (311, 21, "M", "American"),
    (301, 23, "F", "American"),
    (304, 22, "M", "NorthernIrish"),
    (310, 21, "F", "American"),
    (260, 21, "M", "Scottish"),
    (315, 18, "M", "American"),
    (374, 28, "M", "Australian"),
    (364, 23, "M", "Irish"),
    (269, 20, "F", "English"),
    (345, 22, "M", "American"),
    (326, 26, "M", "Australian"),
    (343, 27, "F", "Canadian"),
    (230, 22, "F", "English"),
    (376, 22, "M", "Indian"),
    (240, 21, "F", "English"),
    (298, 19, "M", "Irish"),
    (272, 23, "M", "Scottish"),
    (248, 23, "F", "Indian"),
    (264, 23, "F", "Scottish"),
    (250, 22, "F", "English"),
    (292, 23, "M", "NorthernIrish"),
    (237, 22, "M", "Scottish"),
    (363, 22, "M", "Canadian"),
    (313, 24, "F", "Irish"),
    (285, 21, "M", "Scottish"),
    (268, 23, "F", "English"),
    (302, 20, "M", "Canadian"),
    (261, 26, "F", "NorthernIrish"),
    (336, 18, "F", "SouthAfrican"),
    (288, 22, "F", "Irish"),
    (226, 22, "M", "English"),
    (277, 23, "F", "English"),
    (360, 19, "M", "American"),
    (257, 24, "F", "English"),
    (254, 21, "M", "English"),
    (339, 21, "F", "American"),
    (323, 19, "F", "SouthAfrican"),
    (255, 19, "M", "Scottish"),
    (249, 22, "F", "Scottish"),
    (293, 22, "F", "NorthernIrish"),
    (244, 22, "F", "English"),
    (245, 25, "M", "Irish"),
    (361, 19, "F", "American"),
    (314, 26, "F", "SouthAfrican"),
    (308, 18, "F", "American"),
    (229, 23, "F", "English"),
    (341, 26, "F", "American"),
    (275, 23, "M", "Scottish"),
    (263, 22, "M", "Scottish"),
    (253, 22, "F", "Welsh"),
    (299, 25, "F", "American"),
    (316, 20, "M", "Canadian"),
    (282, 23, "F", "English"),
    (362, 29, "F", "American"),
    (294, 33, "F", "American"),
    (274, 22, "M", "English"),
    (279, 23, "M", "English"),
    (281, 29, "M", "Scottish"),
    (286, 23, "M", "English"),
    (258, 22, "M", "English"),
    (247, 22, "M", "Scottish"),
    (351, 21, "F", "NorthernIrish"),
    (283, 24, "F", "Irish"),
    (334, 18, "M", "American"),
    (333, 19, "F", "American"),
    (295, 23, "F", "Irish"),
    (330, 26, "F", "American"),
    (335, 25, "F", "NewZealand"),
    (228, 22, "F", "English"),
    (267, 23, "F", "English"),
    (273, 18, "F", "English"),
)

# Maps the integer speaker id to its {"age", "gender", "accent"} record.
SPEAKER_METADATA = {
    speaker_id: {"age": age, "gender": gender, "accent": accent}
    for speaker_id, age, gender, accent in _SPEAKER_ROWS
}



def get_speaker_dropdown_choices():
    """Return one ``(label, speaker)`` pair per entry of SPEAKER_METADATA.

    ``label`` is the human-readable text, e.g. ``"p225 - F, English"``;
    ``speaker`` is the id prefixed with ``p``, e.g. ``"p225"``. Pairs come
    back in the iteration order of SPEAKER_METADATA.
    """
    return [
        (f"p{sid} - {info['gender']}, {info['accent']}", f"p{sid}")
        for sid, info in SPEAKER_METADATA.items()
    ]

# Loaded TTS models, keyed by model name, so each model is built only once.
MODEL_CACHE = {}


def load_tts_model():
    """Return the TTS instance for VOICE_MODEL, constructing it on first use.

    The instance is created with ``progress_bar=False`` and ``gpu=False`` and
    stored in MODEL_CACHE; later calls return the stored object.
    """
    if VOICE_MODEL in MODEL_CACHE:
        return MODEL_CACHE[VOICE_MODEL]

    model = TTS(model_name=VOICE_MODEL, progress_bar=False, gpu=False)
    MODEL_CACHE[VOICE_MODEL] = model
    return model

def docx_to_wav(doc_file, selected_desc):
    """Synthesize all non-empty paragraphs of a .docx into one WAV file.

    Args:
        doc_file: The uploaded document, given either as a filesystem path
            or as an object exposing that path through a ``.name`` attribute.
        selected_desc: Speaker label exactly as produced by
            get_speaker_dropdown_choices() (e.g. ``"p225 - F, English"``).

    Returns:
        Path of the generated WAV. The file is created with ``delete=False``,
        so it is not removed automatically.

    Raises:
        ValueError: If no document was supplied, the label matches no known
            speaker, or the document has no non-empty paragraphs.
    """
    if doc_file is None:
        raise ValueError("No document uploaded")

    speaker_id = next(
        (sid for desc, sid in get_speaker_dropdown_choices() if desc == selected_desc),
        None,
    )
    if not speaker_id:
        raise ValueError("Invalid speaker selection")

    # Accept both a plain path and a file-like wrapper that carries it in .name.
    doc_path = getattr(doc_file, "name", doc_file)
    document = Document(doc_path)
    full_text = "\n".join(para.text for para in document.paragraphs if para.text.strip())
    if not full_text:
        # Same failure mode as docx_to_zipped_wavs instead of synthesizing "".
        raise ValueError("No non-empty paragraphs found in the document.")

    tts = load_tts_model()

    # Reserve a unique output path; the handle is closed before the TTS writes to it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
        wav_path = tmp_wav.name

    try:
        tts.tts_to_file(text=full_text, file_path=wav_path, speaker=speaker_id)
    except Exception:
        # Do not leave an orphaned temp file behind when synthesis fails.
        if os.path.exists(wav_path):
            os.remove(wav_path)
        raise
    return wav_path


def docx_to_zipped_wavs(doc_file, selected_desc):
    """Synthesize each non-empty paragraph of a .docx to its own WAV and zip them.

    Args:
        doc_file: The uploaded document, given either as a filesystem path
            or as an object exposing that path through a ``.name`` attribute.
        selected_desc: Speaker label exactly as produced by
            get_speaker_dropdown_choices() (e.g. ``"p225 - F, English"``).

    Returns:
        Path of a ZIP archive containing ``chunk_NN.wav`` files, one per
        paragraph in document order. The archive is created with
        ``delete=False``, so it is not removed automatically.

    Raises:
        ValueError: If no document was supplied, the label matches no known
            speaker, or the document has no non-empty paragraphs.
    """
    if doc_file is None:
        raise ValueError("No document uploaded")

    speaker_id = next(
        (sid for desc, sid in get_speaker_dropdown_choices() if desc == selected_desc),
        None,
    )
    if not speaker_id:
        raise ValueError("Invalid speaker selection")

    # Accept both a plain path and a file-like wrapper that carries it in .name.
    doc_path = getattr(doc_file, "name", doc_file)
    document = Document(doc_path)
    paragraphs = [text for text in (p.text.strip() for p in document.paragraphs) if text]

    if not paragraphs:
        raise ValueError("No non-empty paragraphs found in the document.")

    tts = load_tts_model()

    # At least two digits (chunk_01), widened when needed so that file names
    # still sort in paragraph order for documents with 100+ paragraphs.
    index_width = max(2, len(str(len(paragraphs))))

    with tempfile.TemporaryDirectory() as temp_dir:
        wav_paths = []
        for i, para in enumerate(paragraphs, start=1):
            wav_path = os.path.join(temp_dir, f"chunk_{i:0{index_width}d}.wav")
            tts.tts_to_file(text=para, file_path=wav_path, speaker=speaker_id)
            wav_paths.append(wav_path)

        # Write the archive straight into the persistent temp file. The `with`
        # closes its handle, and no intermediate copy of the zip is needed.
        with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as final_zip:
            with zipfile.ZipFile(final_zip, "w") as zipf:
                for wav in wav_paths:
                    zipf.write(wav, os.path.basename(wav))

    return final_zip.name


# Gradio UI: upload a document, pick a speaker, download the generated audio.
with gr.Blocks() as interface:
    gr.Markdown("# 🎤 English Voice Generator from DOCX")
    gr.Markdown("Upload a `.docx` file and select a speaker to generate a WAV voiceover.")

    doc_input = gr.File(label="Upload .docx File", type="filepath")

    # Only the human-readable labels are shown; the handlers map a label back
    # to its speaker id.
    speaker_labels = [label for label, _ in get_speaker_dropdown_choices()]
    speaker_dropdown = gr.Dropdown(choices=speaker_labels, label="Select Speaker", value=None)

    generate_btn = gr.Button("Generate WAV")
    # NOTE(review): this audio component is not connected to any event below.
    output_audio = gr.Audio(label="Generated Audio", type="filepath")
    zip_download = gr.File(label="Download ZIP of Audio Files")

    generate_btn.click(
        fn=docx_to_zipped_wavs,
        inputs=[doc_input, speaker_dropdown],
        outputs=zip_download,
    )

# Launch the Gradio app only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    interface.launch()