SohomToom committed on
Commit
40ede2a
·
verified ·
1 Parent(s): eb45da8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -104
app.py CHANGED
@@ -5,115 +5,155 @@ import gradio as gr
5
  from docx import Document
6
  from TTS.api import TTS
7
  import tempfile
8
- import csv
9
- from collections import defaultdict
10
 
11
- # Model dictionary
12
- VOICE_MODELS = {
13
- "Jenny (Expressive Female)": {
14
- "model_name": "tts_models/en/jenny/jenny",
15
- "multi_speaker": False
16
- },
17
- "LJSpeech (Standard Female)": {
18
- "model_name": "tts_models/en/ljspeech/vits",
19
- "multi_speaker": False
20
- },
21
- "VCTK (Multiple Speakers)": {
22
- "model_name": "tts_models/en/vctk/vits",
23
- "multi_speaker": True
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  }
26
 
27
- # Cache
28
- MODEL_CACHE = {}
29
- SPEAKER_DROPDOWN_MAP = {} # Maps label -> ID
30
-
31
- def load_tts_model(model_key):
32
- if model_key in MODEL_CACHE:
33
- return MODEL_CACHE[model_key]
34
- info = VOICE_MODELS[model_key]
35
- tts = TTS(model_name=info["model_name"], progress_bar=False, gpu=False)
36
- MODEL_CACHE[model_key] = tts
37
- return tts
38
-
39
- def extract_speakers(model_key, metadata_path="metadata.csv"):
40
- global SPEAKER_DROPDOWN_MAP
41
- info = VOICE_MODELS[model_key]
42
- if not info["multi_speaker"]:
43
- return []
44
-
45
- tts = load_tts_model(model_key)
46
- available_speakers = set(getattr(tts, "speakers", []))
47
-
48
- speaker_audio_map = defaultdict(list)
49
- with open(metadata_path, newline='') as csvfile:
50
- reader = csv.reader(csvfile)
51
- next(reader)
52
- for row in reader:
53
- if len(row) >= 2:
54
- audio_id, speaker_id = row[1], row[0]
55
- if speaker_id in available_speakers:
56
- speaker_audio_map[speaker_id].append(audio_id)
57
-
58
- SPEAKER_DROPDOWN_MAP.clear()
59
- dropdown_choices = []
60
- for speaker_id, audio_ids in speaker_audio_map.items():
61
- label = f"{speaker_id} ({len(audio_ids)} samples)"
62
- SPEAKER_DROPDOWN_MAP[label] = speaker_id
63
- dropdown_choices.append(label)
64
-
65
- return dropdown_choices
66
-
67
- def docx_to_wav(doc_file, selected_voice, speaker_label=None):
68
- info = VOICE_MODELS[selected_voice]
69
- tts = load_tts_model(selected_voice)
70
-
71
- # Extract text
72
- document = Document(doc_file.name)
73
- full_text = "\n".join([para.text for para in document.paragraphs if para.text.strip()])
74
-
75
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
76
- wav_path = tmp_wav.name
77
-
78
- kwargs = {}
79
- if info["multi_speaker"] and speaker_label:
80
- speaker_id = SPEAKER_DROPDOWN_MAP.get(speaker_label)
81
- if not speaker_id:
82
- raise ValueError("Speaker ID not found.")
83
- kwargs["speaker"] = speaker_id
84
-
85
- tts.tts_to_file(text=full_text, file_path=wav_path, **kwargs)
86
- return wav_path
87
-
88
- def show_load_button(voice_selection):
89
- is_multi = VOICE_MODELS[voice_selection]["multi_speaker"]
90
- return gr.update(visible=is_multi)
91
-
92
- def load_speakers_ui(voice_selection):
93
- speakers = extract_speakers(voice_selection)
94
- return gr.update(choices=speakers, visible=True, value=speakers[0] if speakers else None)
95
 
96
- with gr.Blocks() as interface:
97
- gr.Markdown("# 🗣️ DOCX to Realistic Voiceover")
 
 
 
 
98
 
 
 
99
  with gr.Row():
100
- docx_input = gr.File(label="Upload .docx File", type="filepath")
101
- voice_dropdown = gr.Dropdown(choices=list(VOICE_MODELS.keys()), value="Jenny (Expressive Female)", label="Voice")
102
- load_button = gr.Button("🔄 Load Speakers", visible=False)
103
- speaker_dropdown = gr.Dropdown(label="Speaker", visible=False)
104
-
105
- generate_button = gr.Button("🎙️ Generate Speech")
106
- audio_output = gr.Audio(label="🔊 Output WAV", type="filepath")
107
-
108
- # Event bindings
109
- voice_dropdown.change(fn=show_load_button, inputs=voice_dropdown, outputs=load_button)
110
- load_button.click(fn=load_speakers_ui, inputs=voice_dropdown, outputs=speaker_dropdown)
111
-
112
- generate_button.click(
113
- fn=docx_to_wav,
114
- inputs=[docx_input, voice_dropdown, speaker_dropdown],
115
- outputs=audio_output
 
116
  )
 
 
 
117
 
118
- if __name__ == "__main__":
119
- interface.launch()
 
5
  from docx import Document
6
  from TTS.api import TTS
7
  import tempfile
 
 
8
 
9
+ # Speaker metadata embedded directly in the script, keyed by numeric speaker ID
10
+ SPEAKER_METADATA = {
11
+ 300: {"audio_id": 1, "age": 23, "gender": "F", "accent": "American"},
12
+ 271: {"audio_id": 1, "age": 19, "gender": "M", "accent": "Scottish"},
13
+ 287: {"audio_id": 1, "age": 23, "gender": "M", "accent": "English"},
14
+ 262: {"audio_id": 1, "age": 23, "gender": "F", "accent": "Scottish"},
15
+ 284: {"audio_id": 1, "age": 20, "gender": "M", "accent": "Scottish"},
16
+ 297: {"audio_id": 1, "age": 20, "gender": "F", "accent": "American"},
17
+ 227: {"audio_id": 1, "age": 38, "gender": "M", "accent": "English"},
18
+ 246: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Scottish"},
19
+ 225: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
20
+ 259: {"audio_id": 1, "age": 23, "gender": "M", "accent": "English"},
21
+ 252: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Scottish"},
22
+ 231: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
23
+ 266: {"audio_id": 1, "age": 22, "gender": "F", "accent": "Irish"},
24
+ 241: {"audio_id": 1, "age": 21, "gender": "M", "accent": "Scottish"},
25
+ 312: {"audio_id": 1, "age": 19, "gender": "F", "accent": "Canadian"},
26
+ 329: {"audio_id": 1, "age": 23, "gender": "F", "accent": "American"},
27
+ 232: {"audio_id": 1, "age": 23, "gender": "M", "accent": "English"},
28
+ 305: {"audio_id": 1, "age": 19, "gender": "F", "accent": "American"},
29
+ 311: {"audio_id": 1, "age": 21, "gender": "M", "accent": "American"},
30
+ 301: {"audio_id": 1, "age": 23, "gender": "F", "accent": "American"},
31
+ 304: {"audio_id": 1, "age": 22, "gender": "M", "accent": "NorthernIrish"},
32
+ 310: {"audio_id": 1, "age": 21, "gender": "F", "accent": "American"},
33
+ 260: {"audio_id": 1, "age": 21, "gender": "M", "accent": "Scottish"},
34
+ 315: {"audio_id": 1, "age": 18, "gender": "M", "accent": "American"},
35
+ 374: {"audio_id": 1, "age": 28, "gender": "M", "accent": "Australian"},
36
+ 364: {"audio_id": 1, "age": 23, "gender": "M", "accent": "Irish"},
37
+ 269: {"audio_id": 1, "age": 20, "gender": "F", "accent": "English"},
38
+ 345: {"audio_id": 1, "age": 22, "gender": "M", "accent": "American"},
39
+ 326: {"audio_id": 1, "age": 26, "gender": "M", "accent": "Australian"},
40
+ 343: {"audio_id": 1, "age": 27, "gender": "F", "accent": "Canadian"},
41
+ 230: {"audio_id": 1, "age": 22, "gender": "F", "accent": "English"},
42
+ 376: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Indian"},
43
+ 240: {"audio_id": 1, "age": 21, "gender": "F", "accent": "English"},
44
+ 298: {"audio_id": 1, "age": 19, "gender": "M", "accent": "Irish"},
45
+ 272: {"audio_id": 1, "age": 23, "gender": "M", "accent": "Scottish"},
46
+ 248: {"audio_id": 1, "age": 23, "gender": "F", "accent": "Indian"},
47
+ 264: {"audio_id": 1, "age": 23, "gender": "F", "accent": "Scottish"},
48
+ 250: {"audio_id": 1, "age": 22, "gender": "F", "accent": "English"},
49
+ 292: {"audio_id": 1, "age": 23, "gender": "M", "accent": "NorthernIrish"},
50
+ 237: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Scottish"},
51
+ 363: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Canadian"},
52
+ 313: {"audio_id": 1, "age": 24, "gender": "F", "accent": "Irish"},
53
+ 285: {"audio_id": 1, "age": 21, "gender": "M", "accent": "Scottish"},
54
+ 268: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
55
+ 302: {"audio_id": 1, "age": 20, "gender": "M", "accent": "Canadian"},
56
+ 261: {"audio_id": 1, "age": 26, "gender": "F", "accent": "NorthernIrish"},
57
+ 336: {"audio_id": 1, "age": 18, "gender": "F", "accent": "SouthAfrican"},
58
+ 288: {"audio_id": 1, "age": 22, "gender": "F", "accent": "Irish"},
59
+ 226: {"audio_id": 1, "age": 22, "gender": "M", "accent": "English"},
60
+ 277: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
61
+ 360: {"audio_id": 1, "age": 19, "gender": "M", "accent": "American"},
62
+ 257: {"audio_id": 1, "age": 24, "gender": "F", "accent": "English"},
63
+ 254: {"audio_id": 1, "age": 21, "gender": "M", "accent": "English"},
64
+ 339: {"audio_id": 1, "age": 21, "gender": "F", "accent": "American"},
65
+ 323: {"audio_id": 1, "age": 19, "gender": "F", "accent": "SouthAfrican"},
66
+ 255: {"audio_id": 1, "age": 19, "gender": "M", "accent": "Scottish"},
67
+ 249: {"audio_id": 1, "age": 22, "gender": "F", "accent": "Scottish"},
68
+ 293: {"audio_id": 1, "age": 22, "gender": "F", "accent": "NorthernIrish"},
69
+ 244: {"audio_id": 1, "age": 22, "gender": "F", "accent": "English"},
70
+ 245: {"audio_id": 1, "age": 25, "gender": "M", "accent": "Irish"},
71
+ 361: {"audio_id": 1, "age": 19, "gender": "F", "accent": "American"},
72
+ 314: {"audio_id": 1, "age": 26, "gender": "F", "accent": "SouthAfrican"},
73
+ 308: {"audio_id": 1, "age": 18, "gender": "F", "accent": "American"},
74
+ 229: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
75
+ 341: {"audio_id": 1, "age": 26, "gender": "F", "accent": "American"},
76
+ 275: {"audio_id": 1, "age": 23, "gender": "M", "accent": "Scottish"},
77
+ 263: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Scottish"},
78
+ 253: {"audio_id": 1, "age": 22, "gender": "F", "accent": "Welsh"},
79
+ 299: {"audio_id": 1, "age": 25, "gender": "F", "accent": "American"},
80
+ 316: {"audio_id": 1, "age": 20, "gender": "M", "accent": "Canadian"},
81
+ 282: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
82
+ 362: {"audio_id": 1, "age": 29, "gender": "F", "accent": "American"},
83
+ 294: {"audio_id": 1, "age": 33, "gender": "F", "accent": "American"},
84
+ 274: {"audio_id": 1, "age": 22, "gender": "M", "accent": "English"},
85
+ 279: {"audio_id": 1, "age": 23, "gender": "M", "accent": "English"},
86
+ 281: {"audio_id": 1, "age": 29, "gender": "M", "accent": "Scottish"},
87
+ 286: {"audio_id": 1, "age": 23, "gender": "M", "accent": "English"},
88
+ 258: {"audio_id": 1, "age": 22, "gender": "M", "accent": "English"},
89
+ 247: {"audio_id": 1, "age": 22, "gender": "M", "accent": "Scottish"},
90
+ 351: {"audio_id": 1, "age": 21, "gender": "F", "accent": "NorthernIrish"},
91
+ 283: {"audio_id": 1, "age": 24, "gender": "F", "accent": "Irish"},
92
+ 334: {"audio_id": 1, "age": 18, "gender": "M", "accent": "American"},
93
+ 333: {"audio_id": 1, "age": 19, "gender": "F", "accent": "American"},
94
+ 295: {"audio_id": 1, "age": 23, "gender": "F", "accent": "Irish"},
95
+ 330: {"audio_id": 1, "age": 26, "gender": "F", "accent": "American"},
96
+ 335: {"audio_id": 1, "age": 25, "gender": "F", "accent": "NewZealand"},
97
+ 228: {"audio_id": 1, "age": 22, "gender": "F", "accent": "English"},
98
+ 267: {"audio_id": 1, "age": 23, "gender": "F", "accent": "English"},
99
+ 273: {"audio_id": 1, "age": 18, "gender": "F", "accent": "English"}
100
  }
101
 
102
+ # Load the TTS model once at import time (CPU only); every request reuses it
103
+ tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", gpu=False)
104
+
105
+ # Extract speakers from metadata
106
def extract_speakers(voice_selection):
    """Build the speaker options offered for a voice.

    Args:
        voice_selection: Voice name chosen in the UI. Only ``"english"``
            has speakers; any other value yields no options.

    Returns:
        A list of ``(speaker_id, description)`` string pairs, one per entry
        of ``SPEAKER_METADATA`` in table order, or an empty list.
    """
    # Guard first: no voice other than "english" has speakers attached.
    if voice_selection != "english":
        return []

    return [
        (
            str(sid),
            f"p{sid} ({info['gender']}, {info['accent']}, {info['age']} yrs)",
        )
        for sid, info in SPEAKER_METADATA.items()
    ]
115
+
116
+ # Update the speaker dropdown based on selected voice
117
def update_speaker_dropdown(voice_selection):
    """Refresh the speaker dropdown after the voice selection changes.

    Args:
        voice_selection: Voice name currently selected in the voice dropdown.

    Returns:
        A Gradio update for the speaker dropdown carrying its choices,
        whether it is visible (only when the voice has speakers), and the
        preselected value (the first speaker's ID string, or ``None`` when
        there are no speakers).
    """
    speaker_choices = extract_speakers(voice_selection)

    # extract_speakers yields (speaker_id, description) pairs, but Gradio
    # reads tuple choices as (displayed name, value). Swap them so users see
    # the description while callbacks receive the numeric ID string.
    dropdown_choices = [(label, sid) for sid, label in speaker_choices]
    default = speaker_choices[0][0] if speaker_choices else None

    # gr.update() is the component-agnostic form; the per-component
    # gr.Dropdown.update() classmethod no longer exists in Gradio 4.
    return gr.update(
        choices=dropdown_choices,
        visible=bool(speaker_choices),
        value=default,
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
+ # Generate speech from text
128
def generate_audio(voice_selection, speaker_selection, text_input):
    """Synthesize ``text_input`` to a WAV file and return the file's path.

    Args:
        voice_selection: Voice chosen in the UI. Accepted so the signature
            matches the click handler's inputs; it is not used here.
        speaker_selection: Speaker ID from the speaker dropdown, as a string
            of digits.
        text_input: Text to speak.

    Returns:
        Path of the generated ``.wav`` file. The file is not deleted here.

    Raises:
        gr.Error: If no speaker is selected or the selection is not numeric.
    """
    try:
        speaker_id = int(speaker_selection)
    except (TypeError, ValueError) as err:
        # None (nothing selected) or a non-numeric value: report it to the
        # user instead of failing with a bare conversion error.
        raise gr.Error("Please select a speaker before generating audio.") from err

    # Reserve the output path by actually creating the file; tempfile.mktemp()
    # only returns a name and is deprecated because of the resulting race.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
        wav_path = tmp_wav.name

    # The output path must be passed by keyword: the second positional
    # parameter of tts_to_file is `speaker`, which would clash with speaker=.
    # NOTE(review): the model loaded at module level is an LJSpeech
    # checkpoint, which looks single-speaker, and Coqui speaker names are
    # normally strings -- confirm an int `speaker` is what the model expects.
    tts.tts_to_file(text=text_input, file_path=wav_path, speaker=speaker_id)
    return wav_path
133
 
134
+ # Gradio interface
135
# Gradio UI: pick a voice and speaker, enter text, and play the generated audio.
# NOTE(review): the nesting below is reconstructed; the source view had lost
# its indentation, so confirm the layout matches what was intended.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            voice_dropdown = gr.Dropdown(
                choices=["english", "other"],
                label="Select Voice",
                value="english",
            )
            # Starts hidden and empty; update_speaker_dropdown fills it in.
            speaker_dropdown = gr.Dropdown(
                label="Select Speaker",
                visible=False,
            )
            text_input = gr.Textbox(label="Enter text")
            audio_output = gr.Audio(label="Generated Audio")

    # Rebuild the speaker list whenever the voice changes.
    voice_dropdown.change(
        fn=update_speaker_dropdown,
        inputs=voice_dropdown,
        outputs=speaker_dropdown,
    )
    # The change event does not fire for the initial "english" value, so
    # also populate the speaker list once when the page loads.
    demo.load(
        fn=update_speaker_dropdown,
        inputs=voice_dropdown,
        outputs=speaker_dropdown,
    )

    generate_button = gr.Button("Generate Audio")
    generate_button.click(
        generate_audio,
        inputs=[voice_dropdown, speaker_dropdown, text_input],
        outputs=audio_output,
    )

# Only start the server when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()