SohomToom committed on
Commit
c849c89
·
verified ·
1 Parent(s): 40ede2a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -145
app.py CHANGED
@@ -1,159 +1,98 @@
1
- import os
2
- os.environ["NUMBA_DISABLE_CACHE"] = "1"
3
-
4
  import gradio as gr
5
- from docx import Document
6
  from TTS.api import TTS
7
  import tempfile
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
- # Embedding the metadata directly into the script
10
  SPEAKER_METADATA = {
11
- 300: {"audio_id": 1, "age": 23, "gender": "F", "accent": "American"},
12
- 271: {"audio_id": 1, "age": 19, "gender": "M", "accent": "Scottish"},
13
- 287: {"audio_id": 1, "age": 23, "gender": "M", "accent": "English"},
14
- 262: {"audio_id": 1, "age": 23, "gender": "F", "accent": "Scottish"},
15
- 284: {"audio_id": 1, "age": 20, "gender": "M", "accent": "Scottish"},
16
- 297: {"audio_id": 1, "age": 20, "gender": "F", "accent": "American"},
17
- 227: {"audio_id": 1, "age": 38, "gender": "M", "accent": "English"},
18
- 246: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Scottish"},
19
- 225: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
20
- 259: {"audio_id": 1, "age": 23, "gender": "M", "accent": "English"},
21
- 252: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Scottish"},
22
- 231: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
23
- 266: {"audio_id": 1, "age": 22, "gender": "F", "accent": "Irish"},
24
- 241: {"audio_id": 1, "age": 21, "gender": "M", "accent": "Scottish"},
25
- 312: {"audio_id": 1, "age": 19, "gender": "F", "accent": "Canadian"},
26
- 329: {"audio_id": 1, "age": 23, "gender": "F", "accent": "American"},
27
- 232: {"audio_id": 1, "age": 23, "gender": "M", "accent": "English"},
28
- 305: {"audio_id": 1, "age": 19, "gender": "F", "accent": "American"},
29
- 311: {"audio_id": 1, "age": 21, "gender": "M", "accent": "American"},
30
- 301: {"audio_id": 1, "age": 23, "gender": "F", "accent": "American"},
31
- 304: {"audio_id": 1, "age": 22, "gender": "M", "accent": "NorthernIrish"},
32
- 310: {"audio_id": 1, "age": 21, "gender": "F", "accent": "American"},
33
- 260: {"audio_id": 1, "age": 21, "gender": "M", "accent": "Scottish"},
34
- 315: {"audio_id": 1, "age": 18, "gender": "M", "accent": "American"},
35
- 374: {"audio_id": 1, "age": 28, "gender": "M", "accent": "Australian"},
36
- 364: {"audio_id": 1, "age": 23, "gender": "M", "accent": "Irish"},
37
- 269: {"audio_id": 1, "age": 20, "gender": "F", "accent": "English"},
38
- 345: {"audio_id": 1, "age": 22, "gender": "M", "accent": "American"},
39
- 326: {"audio_id": 1, "age": 26, "gender": "M", "accent": "Australian"},
40
- 343: {"audio_id": 1, "age": 27, "gender": "F", "accent": "Canadian"},
41
- 230: {"audio_id": 1, "age": 22, "gender": "F", "accent": "English"},
42
- 376: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Indian"},
43
- 240: {"audio_id": 1, "age": 21, "gender": "F", "accent": "English"},
44
- 298: {"audio_id": 1, "age": 19, "gender": "M", "accent": "Irish"},
45
- 272: {"audio_id": 1, "age": 23, "gender": "M", "accent": "Scottish"},
46
- 248: {"audio_id": 1, "age": 23, "gender": "F", "accent": "Indian"},
47
- 264: {"audio_id": 1, "age": 23, "gender": "F", "accent": "Scottish"},
48
- 250: {"audio_id": 1, "age": 22, "gender": "F", "accent": "English"},
49
- 292: {"audio_id": 1, "age": 23, "gender": "M", "accent": "NorthernIrish"},
50
- 237: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Scottish"},
51
- 363: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Canadian"},
52
- 313: {"audio_id": 1, "age": 24, "gender": "F", "accent": "Irish"},
53
- 285: {"audio_id": 1, "age": 21, "gender": "M", "accent": "Scottish"},
54
- 268: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
55
- 302: {"audio_id": 1, "age": 20, "gender": "M", "accent": "Canadian"},
56
- 261: {"audio_id": 1, "age": 26, "gender": "F", "accent": "NorthernIrish"},
57
- 336: {"audio_id": 1, "age": 18, "gender": "F", "accent": "SouthAfrican"},
58
- 288: {"audio_id": 1, "age": 22, "gender": "F", "accent": "Irish"},
59
- 226: {"audio_id": 1, "age": 22, "gender": "M", "accent": "English"},
60
- 277: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
61
- 360: {"audio_id": 1, "age": 19, "gender": "M", "accent": "American"},
62
- 257: {"audio_id": 1, "age": 24, "gender": "F", "accent": "English"},
63
- 254: {"audio_id": 1, "age": 21, "gender": "M", "accent": "English"},
64
- 339: {"audio_id": 1, "age": 21, "gender": "F", "accent": "American"},
65
- 323: {"audio_id": 1, "age": 19, "gender": "F", "accent": "SouthAfrican"},
66
- 255: {"audio_id": 1, "age": 19, "gender": "M", "accent": "Scottish"},
67
- 249: {"audio_id": 1, "age": 22, "gender": "F", "accent": "Scottish"},
68
- 293: {"audio_id": 1, "age": 22, "gender": "F", "accent": "NorthernIrish"},
69
- 244: {"audio_id": 1, "age": 22, "gender": "F", "accent": "English"},
70
- 245: {"audio_id": 1, "age": 25, "gender": "M", "accent": "Irish"},
71
- 361: {"audio_id": 1, "age": 19, "gender": "F", "accent": "American"},
72
- 314: {"audio_id": 1, "age": 26, "gender": "F", "accent": "SouthAfrican"},
73
- 308: {"audio_id": 1, "age": 18, "gender": "F", "accent": "American"},
74
- 229: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
75
- 341: {"audio_id": 1, "age": 26, "gender": "F", "accent": "American"},
76
- 275: {"audio_id": 1, "age": 23, "gender": "M", "accent": "Scottish"},
77
- 263: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Scottish"},
78
- 253: {"audio_id": 1, "age": 22, "gender": "F", "accent": "Welsh"},
79
- 299: {"audio_id": 1, "age": 25, "gender": "F", "accent": "American"},
80
- 316: {"audio_id": 1, "age": 20, "gender": "M", "accent": "Canadian"},
81
- 282: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
82
- 362: {"audio_id": 1, "age": 29, "gender": "F", "accent": "American"},
83
- 294: {"audio_id": 1, "age": 33, "gender": "F", "accent": "American"},
84
- 274: {"audio_id": 1, "age": 22, "gender": "M", "accent": "English"},
85
- 279: {"audio_id": 1, "age": 23, "gender": "M", "accent": "English"},
86
- 281: {"audio_id": 1, "age": 29, "gender": "M", "accent": "Scottish"},
87
- 286: {"audio_id": 1, "age": 23, "gender": "M", "accent": "English"},
88
- 258: {"audio_id": 1, "age": 22, "gender": "M", "accent": "English"},
89
- 247: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Scottish"},
90
- 351: {"audio_id": 1, "age": 21, "gender": "F", "accent": "NorthernIrish"},
91
- 283: {"audio_id": 1, "age": 24, "gender": "F", "accent": "Irish"},
92
- 334: {"audio_id": 1, "age": 18, "gender": "M", "accent": "American"},
93
- 333: {"audio_id": 1, "age": 19, "gender": "F", "accent": "American"},
94
- 295: {"audio_id": 1, "age": 23, "gender": "F", "accent": "Irish"},
95
- 330: {"audio_id": 1, "age": 26, "gender": "F", "accent": "American"},
96
- 335: {"audio_id": 1, "age": 25, "gender": "F", "accent": "NewZealand"},
97
- 228: {"audio_id": 1, "age": 22, "gender": "F", "accent": "English"},
98
- 267: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
99
- 273: {"audio_id": 1, "age": 18, "gender": "F", "accent": "English"}
100
  }
101
 
102
- # Load the TTS model
103
- tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", gpu=False)
104
-
105
- # Extract speakers from metadata
106
- def extract_speakers(voice_selection):
107
- speaker_choices = []
108
- for speaker_id, metadata in SPEAKER_METADATA.items():
109
- if voice_selection == "english":
110
- speaker_choices.append((
111
- str(speaker_id),
112
- f"p{speaker_id} ({metadata['gender']}, {metadata['accent']}, {metadata['age']} yrs)"
113
- ))
114
- return speaker_choices
115
-
116
- # Update the speaker dropdown based on selected voice
117
- def update_speaker_dropdown(voice_selection):
118
- speaker_choices = extract_speakers(voice_selection)
119
- visible = bool(speaker_choices)
120
- default = speaker_choices[0][0] if speaker_choices else None
121
- return gr.Dropdown.update(
122
- choices=speaker_choices,
123
- visible=visible,
124
- value=default
125
- )
 
126
 
127
- # Generate speech to text
128
- def generate_audio(voice_selection, speaker_selection, text_input):
129
- speaker_id = int(speaker_selection)
130
- temp_file = tempfile.mktemp(suffix=".wav")
131
- tts.tts_to_file(text_input, temp_file, speaker=speaker_id)
132
- return temp_file
 
 
 
 
133
 
134
- # Gradio interface
135
  with gr.Blocks() as demo:
 
 
136
  with gr.Row():
137
- with gr.Column():
138
- voice_dropdown = gr.Dropdown(
139
- choices=["english", "other"],
140
- label="Select Voice",
141
- value="english"
142
- )
143
- speaker_dropdown = gr.Dropdown(
144
- label="Select Speaker",
145
- visible=False
146
- )
147
- text_input = gr.Textbox(label="Enter text")
148
- audio_output = gr.Audio(label="Generated Audio")
149
-
150
- voice_dropdown.change(
151
- fn=update_speaker_dropdown,
152
- inputs=voice_dropdown,
153
- outputs=speaker_dropdown
 
 
 
 
 
154
  )
155
-
156
- generate_button = gr.Button("Generate Audio")
157
- generate_button.click(generate_audio, inputs=[voice_dropdown, speaker_dropdown, text_input], outputs=audio_output)
158
 
159
  demo.launch()
 
 
 
 
1
  import gradio as gr
 
2
  from TTS.api import TTS
3
  import tempfile
4
+ import docx
5
+
6
+ # Voice models dictionary with metadata on whether they support multi-speaker
7
+ VOICE_MODELS = {
8
+ "LJSpeech (Standard Female)": {
9
+ "model_name": "tts_models/en/ljspeech/vits",
10
+ "multi_speaker": False
11
+ },
12
+ "VCTK (Multi-speaker English)": {
13
+ "model_name": "tts_models/en/vctk/vits",
14
+ "multi_speaker": True
15
+ }
16
+ }
17
 
18
+ # Embedded short speaker metadata (from your CSV)
19
  SPEAKER_METADATA = {
20
+ "225": {"age": 23, "gender": "F", "accent": "English"},
21
+ "226": {"age": 22, "gender": "M", "accent": "English"},
22
+ "227": {"age": 38, "gender": "M", "accent": "English"},
23
+ "228": {"age": 22, "gender": "F", "accent": "English"},
24
+ "229": {"age": 23, "gender": "F", "accent": "English"},
25
+ "230": {"age": 22, "gender": "F", "accent": "English"},
26
+ "231": {"age": 23, "gender": "F", "accent": "English"},
27
+ "232": {"age": 23, "gender": "M", "accent": "English"},
28
+ "233": {"age": 23, "gender": "F", "accent": "English"},
29
+ "234": {"age": 22, "gender": "F", "accent": "Scottish"}
30
+ # Add more as needed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  }
32
 
33
+ # Pre-format speaker dropdown choices
34
+ SPEAKER_CHOICES = [
35
+ (sid, f"p{sid} ({data['gender']}, {data['accent']}, {data['age']} yrs)")
36
+ for sid, data in SPEAKER_METADATA.items()
37
+ ]
38
+
39
+ # Model cache
40
+ MODEL_CACHE = {}
41
+
42
+ def load_tts_model(model_key):
43
+ if model_key in MODEL_CACHE:
44
+ return MODEL_CACHE[model_key]
45
+ model_info = VOICE_MODELS[model_key]
46
+ tts = TTS(model_name=model_info["model_name"], gpu=False)
47
+ MODEL_CACHE[model_key] = tts
48
+ return tts
49
+
50
+ def extract_text_from_docx(file):
51
+ doc = docx.Document(file)
52
+ return "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
53
+
54
+ def generate_audio(voice_key, speaker_id, docx_file):
55
+ text = extract_text_from_docx(docx_file)
56
+ tts = load_tts_model(voice_key)
57
+ kwargs = {}
58
 
59
+ if VOICE_MODELS[voice_key]["multi_speaker"]:
60
+ kwargs["speaker"] = speaker_id
61
+
62
+ output_path = tempfile.mktemp(suffix=".wav")
63
+ tts.tts_to_file(text=text, file_path=output_path, **kwargs)
64
+ return output_path
65
+
66
+ def update_speaker_visibility(voice_key):
67
+ visible = VOICE_MODELS[voice_key]["multi_speaker"]
68
+ return gr.update(visible=visible)
69
 
 
70
  with gr.Blocks() as demo:
71
+ gr.Markdown("## DOCX to Speech with Speaker Selection")
72
+
73
  with gr.Row():
74
+ voice_dropdown = gr.Dropdown(
75
+ choices=list(VOICE_MODELS.keys()),
76
+ value="LJSpeech (Standard Female)",
77
+ label="Select Voice"
78
+ )
79
+
80
+ speaker_dropdown = gr.Dropdown(
81
+ choices=SPEAKER_CHOICES,
82
+ label="Select Speaker",
83
+ visible=False
84
+ )
85
+
86
+ docx_input = gr.File(label="Upload .docx File", file_types=[".docx"])
87
+ generate_btn = gr.Button("Generate Audio")
88
+ audio_output = gr.Audio(label="Output Audio")
89
+
90
+ voice_dropdown.change(fn=update_speaker_visibility, inputs=voice_dropdown, outputs=speaker_dropdown)
91
+
92
+ generate_btn.click(
93
+ fn=generate_audio,
94
+ inputs=[voice_dropdown, speaker_dropdown, docx_input],
95
+ outputs=audio_output
96
  )
 
 
 
97
 
98
  demo.launch()