SohomToom committed on
Commit
9e3182d
·
verified ·
1 Parent(s): c849c89

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -82
app.py CHANGED
@@ -1,98 +1,160 @@
 
 
 
1
  import gradio as gr
 
2
  from TTS.api import TTS
3
  import tempfile
4
- import docx
5
-
6
- # Voice models dictionary with metadata on whether they support multi-speaker
7
- VOICE_MODELS = {
8
- "LJSpeech (Standard Female)": {
9
- "model_name": "tts_models/en/ljspeech/vits",
10
- "multi_speaker": False
11
- },
12
- "VCTK (Multi-speaker English)": {
13
- "model_name": "tts_models/en/vctk/vits",
14
- "multi_speaker": True
15
- }
16
- }
17
-
18
- # Embedded short speaker metadata (from your CSV)
19
- SPEAKER_METADATA = {
20
- "225": {"age": 23, "gender": "F", "accent": "English"},
21
- "226": {"age": 22, "gender": "M", "accent": "English"},
22
- "227": {"age": 38, "gender": "M", "accent": "English"},
23
- "228": {"age": 22, "gender": "F", "accent": "English"},
24
- "229": {"age": 23, "gender": "F", "accent": "English"},
25
- "230": {"age": 22, "gender": "F", "accent": "English"},
26
- "231": {"age": 23, "gender": "F", "accent": "English"},
27
- "232": {"age": 23, "gender": "M", "accent": "English"},
28
- "233": {"age": 23, "gender": "F", "accent": "English"},
29
- "234": {"age": 22, "gender": "F", "accent": "Scottish"}
30
- # Add more as needed
31
- }
32
-
33
- # Pre-format speaker dropdown choices
34
- SPEAKER_CHOICES = [
35
- (sid, f"p{sid} ({data['gender']}, {data['accent']}, {data['age']} yrs)")
36
- for sid, data in SPEAKER_METADATA.items()
37
- ]
38
-
39
- # Model cache
40
- MODEL_CACHE = {}
41
-
42
- def load_tts_model(model_key):
43
- if model_key in MODEL_CACHE:
44
- return MODEL_CACHE[model_key]
45
- model_info = VOICE_MODELS[model_key]
46
- tts = TTS(model_name=model_info["model_name"], gpu=False)
47
- MODEL_CACHE[model_key] = tts
48
- return tts
49
 
50
- def extract_text_from_docx(file):
51
- doc = docx.Document(file)
52
- return "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
53
-
54
- def generate_audio(voice_key, speaker_id, docx_file):
55
- text = extract_text_from_docx(docx_file)
56
- tts = load_tts_model(voice_key)
57
- kwargs = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
- if VOICE_MODELS[voice_key]["multi_speaker"]:
60
- kwargs["speaker"] = speaker_id
 
 
61
 
62
- output_path = tempfile.mktemp(suffix=".wav")
63
- tts.tts_to_file(text=text, file_path=output_path, **kwargs)
64
- return output_path
 
65
 
66
- def update_speaker_visibility(voice_key):
67
- visible = VOICE_MODELS[voice_key]["multi_speaker"]
68
- return gr.update(visible=visible)
69
 
70
- with gr.Blocks() as demo:
71
- gr.Markdown("## DOCX to Speech with Speaker Selection")
72
 
73
- with gr.Row():
74
- voice_dropdown = gr.Dropdown(
75
- choices=list(VOICE_MODELS.keys()),
76
- value="LJSpeech (Standard Female)",
77
- label="Select Voice"
78
- )
79
 
80
- speaker_dropdown = gr.Dropdown(
81
- choices=SPEAKER_CHOICES,
82
- label="Select Speaker",
83
- visible=False
84
- )
85
 
86
- docx_input = gr.File(label="Upload .docx File", file_types=[".docx"])
87
- generate_btn = gr.Button("Generate Audio")
88
- audio_output = gr.Audio(label="Output Audio")
 
 
 
89
 
90
- voice_dropdown.change(fn=update_speaker_visibility, inputs=voice_dropdown, outputs=speaker_dropdown)
 
91
 
92
  generate_btn.click(
93
- fn=generate_audio,
94
- inputs=[voice_dropdown, speaker_dropdown, docx_input],
95
- outputs=audio_output
96
  )
97
 
98
- demo.launch()
 
 
1
+ import os
2
+ os.environ["NUMBA_DISABLE_CACHE"] = "1"
3
+
4
  import gradio as gr
5
+ from docx import Document
6
  from TTS.api import TTS
7
  import tempfile
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
+ # Voice model
10
+ VOICE_MODEL = "tts_models/en/vctk/vits"
11
+
12
+ # Embedded metadata (from your file)
13
+ SPEAKER_METADATA = {
14
+ 300: { "age": 23, "gender": "F", "accent": "American"},
15
+ 271: { "age": 19, "gender": "M", "accent": "Scottish"},
16
+ 287: { "age": 23, "gender": "M", "accent": "English"},
17
+ 262: { "age": 23, "gender": "F", "accent": "Scottish"},
18
+ 284: { "age": 20, "gender": "M", "accent": "Scottish"},
19
+ 297: { "age": 20, "gender": "F", "accent": "American"},
20
+ 227: { "age": 38, "gender": "M", "accent": "English"},
21
+ 246: { "age": 22, "gender": "M", "accent": "Scottish"},
22
+ 225: { "age": 23, "gender": "F", "accent": "English"},
23
+ 259: { "age": 23, "gender": "M", "accent": "English"},
24
+ 252: { "age": 22, "gender": "M", "accent": "Scottish"},
25
+ 231: { "age": 23, "gender": "F", "accent": "English"},
26
+ 266: { "age": 22, "gender": "F", "accent": "Irish"},
27
+ 241: { "age": 21, "gender": "M", "accent": "Scottish"},
28
+ 312: { "age": 19, "gender": "F", "accent": "Canadian"},
29
+ 329: { "age": 23, "gender": "F", "accent": "American"},
30
+ 232: { "age": 23, "gender": "M", "accent": "English"},
31
+ 305: { "age": 19, "gender": "F", "accent": "American"},
32
+ 311: { "age": 21, "gender": "M", "accent": "American"},
33
+ 301: { "age": 23, "gender": "F", "accent": "American"},
34
+ 304: { "age": 22, "gender": "M", "accent": "NorthernIrish"},
35
+ 310: { "age": 21, "gender": "F", "accent": "American"},
36
+ 260: { "age": 21, "gender": "M", "accent": "Scottish"},
37
+ 315: { "age": 18, "gender": "M", "accent": "American"},
38
+ 374: { "age": 28, "gender": "M", "accent": "Australian"},
39
+ 364: { "age": 23, "gender": "M", "accent": "Irish"},
40
+ 269: { "age": 20, "gender": "F", "accent": "English"},
41
+ 345: { "age": 22, "gender": "M", "accent": "American"},
42
+ 326: { "age": 26, "gender": "M", "accent": "Australian"},
43
+ 343: { "age": 27, "gender": "F", "accent": "Canadian"},
44
+ 230: { "age": 22, "gender": "F", "accent": "English"},
45
+ 376: { "age": 22, "gender": "M", "accent": "Indian"},
46
+ 240: { "age": 21, "gender": "F", "accent": "English"},
47
+ 298: { "age": 19, "gender": "M", "accent": "Irish"},
48
+ 272: { "age": 23, "gender": "M", "accent": "Scottish"},
49
+ 248: { "age": 23, "gender": "F", "accent": "Indian"},
50
+ 264: { "age": 23, "gender": "F", "accent": "Scottish"},
51
+ 250: { "age": 22, "gender": "F", "accent": "English"},
52
+ 292: { "age": 23, "gender": "M", "accent": "NorthernIrish"},
53
+ 237: { "age": 22, "gender": "M", "accent": "Scottish"},
54
+ 363: { "age": 22, "gender": "M", "accent": "Canadian"},
55
+ 313: { "age": 24, "gender": "F", "accent": "Irish"},
56
+ 285: { "age": 21, "gender": "M", "accent": "Scottish"},
57
+ 268: { "age": 23, "gender": "F", "accent": "English"},
58
+ 302: { "age": 20, "gender": "M", "accent": "Canadian"},
59
+ 261: { "age": 26, "gender": "F", "accent": "NorthernIrish"},
60
+ 336: { "age": 18, "gender": "F", "accent": "SouthAfrican"},
61
+ 288: { "age": 22, "gender": "F", "accent": "Irish"},
62
+ 226: { "age": 22, "gender": "M", "accent": "English"},
63
+ 277: { "age": 23, "gender": "F", "accent": "English"},
64
+ 360: { "age": 19, "gender": "M", "accent": "American"},
65
+ 257: { "age": 24, "gender": "F", "accent": "English"},
66
+ 254: { "age": 21, "gender": "M", "accent": "English"},
67
+ 339: { "age": 21, "gender": "F", "accent": "American"},
68
+ 323: { "age": 19, "gender": "F", "accent": "SouthAfrican"},
69
+ 255: { "age": 19, "gender": "M", "accent": "Scottish"},
70
+ 249: { "age": 22, "gender": "F", "accent": "Scottish"},
71
+ 293: { "age": 22, "gender": "F", "accent": "NorthernIrish"},
72
+ 244: { "age": 22, "gender": "F", "accent": "English"},
73
+ 245: { "age": 25, "gender": "M", "accent": "Irish"},
74
+ 361: { "age": 19, "gender": "F", "accent": "American"},
75
+ 314: { "age": 26, "gender": "F", "accent": "SouthAfrican"},
76
+ 308: { "age": 18, "gender": "F", "accent": "American"},
77
+ 229: { "age": 23, "gender": "F", "accent": "English"},
78
+ 341: { "age": 26, "gender": "F", "accent": "American"},
79
+ 275: { "age": 23, "gender": "M", "accent": "Scottish"},
80
+ 263: { "age": 22, "gender": "M", "accent": "Scottish"},
81
+ 253: { "age": 22, "gender": "F", "accent": "Welsh"},
82
+ 299: { "age": 25, "gender": "F", "accent": "American"},
83
+ 316: { "age": 20, "gender": "M", "accent": "Canadian"},
84
+ 282: { "age": 23, "gender": "F", "accent": "English"},
85
+ 362: { "age": 29, "gender": "F", "accent": "American"},
86
+ 294: { "age": 33, "gender": "F", "accent": "American"},
87
+ 274: { "age": 22, "gender": "M", "accent": "English"},
88
+ 279: { "age": 23, "gender": "M", "accent": "English"},
89
+ 281: { "age": 29, "gender": "M", "accent": "Scottish"},
90
+ 286: { "age": 23, "gender": "M", "accent": "English"},
91
+ 258: { "age": 22, "gender": "M", "accent": "English"},
92
+ 247: { "age": 22, "gender": "M", "accent": "Scottish"},
93
+ 351: { "age": 21, "gender": "F", "accent": "NorthernIrish"},
94
+ 283: { "age": 24, "gender": "F", "accent": "Irish"},
95
+ 334: { "age": 18, "gender": "M", "accent": "American"},
96
+ 333: { "age": 19, "gender": "F", "accent": "American"},
97
+ 295: { "age": 23, "gender": "F", "accent": "Irish"},
98
+ 330: { "age": 26, "gender": "F", "accent": "American"},
99
+ 335: { "age": 25, "gender": "F", "accent": "NewZealand"},
100
+ 228: { "age": 22, "gender": "F", "accent": "English"},
101
+ 267: { "age": 23, "gender": "F", "accent": "English"},
102
+ 273: { "age": 18, "gender": "F", "accent": "English"}
103
+ }
104
+
105
+
106
+
107
+ # Return dropdown list like: "p225 - F, English"
108
def get_speaker_dropdown_choices(metadata=None):
    """Build ``(description, speaker_name)`` pairs for the speaker dropdown.

    Args:
        metadata: Mapping of speaker id to a dict holding at least the
            ``"gender"`` and ``"accent"`` keys. Defaults to the module-level
            ``SPEAKER_METADATA``.

    Returns:
        A list of ``(description, speaker_name)`` tuples in the mapping's
        iteration order, e.g. ``("p225 - F, English", "p225")``.
    """
    if metadata is None:
        metadata = SPEAKER_METADATA
    # The metadata entries use the singular key "accent"; looking up
    # "accents" raised KeyError on the very first speaker.
    return [
        (f"p{speaker_id} - {meta['gender']}, {meta['accent']}", f"p{speaker_id}")
        for speaker_id, meta in metadata.items()
    ]
114
+
115
+ # Cache TTS model
116
+ MODEL_CACHE = {}
117
 
118
def load_tts_model():
    """Return the TTS instance for ``VOICE_MODEL``, constructing it on first use.

    The instance is stored in ``MODEL_CACHE`` under the model name, so later
    calls reuse the same object instead of constructing it again.
    """
    if VOICE_MODEL in MODEL_CACHE:
        return MODEL_CACHE[VOICE_MODEL]

    # First request: build the model with the progress bar and GPU turned off.
    model = TTS(model_name=VOICE_MODEL, progress_bar=False, gpu=False)
    MODEL_CACHE[VOICE_MODEL] = model
    return model
122
 
123
def docx_to_wav(doc_file, selected_desc):
    """Synthesize the text of a .docx file into a WAV file with the chosen speaker.

    Args:
        doc_file: The uploaded document, given either as a filesystem path or
            as an object that exposes the path through a ``.name`` attribute.
        selected_desc: A speaker description exactly as produced by
            ``get_speaker_dropdown_choices()``.

    Returns:
        Path of the generated ``.wav`` file. The file is created with
        ``delete=False`` so it still exists after this function returns.

    Raises:
        ValueError: If no document was supplied, the speaker description is
            not one of the known choices, or the document has no non-blank
            paragraphs.
    """
    if doc_file is None:
        raise ValueError("No document uploaded")

    speaker_id = next(
        (sid for desc, sid in get_speaker_dropdown_choices() if desc == selected_desc),
        None,
    )
    if not speaker_id:
        raise ValueError("Invalid speaker selection")

    # Accept a plain path as well as an object carrying the path in ``.name``.
    doc_path = getattr(doc_file, "name", doc_file)
    document = Document(doc_path)
    full_text = "\n".join(
        para.text for para in document.paragraphs if para.text.strip()
    )
    if not full_text:
        raise ValueError("Document contains no text to synthesize")

    # Load the model only after the inputs are known to be usable.
    tts = load_tts_model()

    # Reserve a unique output path; the handle is closed before the TTS
    # engine writes to that path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
        wav_path = tmp_wav.name

    try:
        tts.tts_to_file(text=full_text, file_path=wav_path, speaker=speaker_id)
    except Exception:
        # Do not leave an orphaned temp file behind when synthesis fails.
        if os.path.exists(wav_path):
            os.remove(wav_path)
        raise
    return wav_path
 
 
 
 
137
 
138
+ # Gradio UI
139
with gr.Blocks() as interface:
    gr.Markdown("# 🎤 English Voice Generator from DOCX")
    gr.Markdown("Upload a `.docx` file and select a speaker to generate a WAV voiceover.")

    # Restrict uploads to .docx so unsupported files are rejected at upload
    # time instead of failing later while the document is parsed.
    doc_input = gr.File(
        label="Upload .docx File",
        type="filepath",
        file_types=[".docx"],
    )
    # Only the human-readable descriptions are shown; docx_to_wav maps the
    # selected description back to its speaker name.
    speaker_dropdown = gr.Dropdown(
        choices=[desc for desc, _ in get_speaker_dropdown_choices()],
        label="Select Speaker",
        value=None
    )

    generate_btn = gr.Button("Generate WAV")
    output_audio = gr.Audio(label="Generated Audio", type="filepath")

    # Clicking the button runs the conversion and feeds the returned WAV path
    # to the audio component.
    generate_btn.click(
        fn=docx_to_wav,
        inputs=[doc_input, speaker_dropdown],
        outputs=output_audio
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()