SohomToom committed on
Commit
a34b148
·
verified ·
1 Parent(s): 4ed0036

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -39
app.py CHANGED
@@ -5,7 +5,10 @@ import gradio as gr
5
  from docx import Document
6
  from TTS.api import TTS
7
  import tempfile
 
 
8
 
 
9
  VOICE_MODELS = {
10
  "Jenny (Expressive Female)": {
11
  "model_name": "tts_models/en/jenny/jenny",
@@ -21,7 +24,9 @@ VOICE_MODELS = {
21
  }
22
  }
23
 
 
24
  MODEL_CACHE = {}
 
25
 
26
  def load_tts_model(model_key):
27
  if model_key in MODEL_CACHE:
@@ -31,17 +36,39 @@ def load_tts_model(model_key):
31
  MODEL_CACHE[model_key] = tts
32
  return tts
33
 
34
- def extract_speakers(model_key):
 
35
  info = VOICE_MODELS[model_key]
36
- if info["multi_speaker"]:
37
- tts = load_tts_model(model_key)
38
- return getattr(tts, "speakers", [])
39
- return []
40
-
41
- def docx_to_wav(doc_file, selected_voice, selected_speaker=None):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  info = VOICE_MODELS[selected_voice]
43
  tts = load_tts_model(selected_voice)
44
 
 
45
  document = Document(doc_file.name)
46
  full_text = "\n".join([para.text for para in document.paragraphs if para.text.strip()])
47
 
@@ -49,52 +76,44 @@ def docx_to_wav(doc_file, selected_voice, selected_speaker=None):
49
  wav_path = tmp_wav.name
50
 
51
  kwargs = {}
52
- if info["multi_speaker"]:
53
- kwargs["speaker"] = selected_speaker
 
 
 
54
 
55
  tts.tts_to_file(text=full_text, file_path=wav_path, **kwargs)
56
  return wav_path
57
 
58
  def show_load_button(voice_selection):
59
- info = VOICE_MODELS[voice_selection]
60
- if info["multi_speaker"]:
61
- return (
62
- gr.update(visible=True), # Show "Load Speakers"
63
- gr.update(visible=False), # Hide speaker dropdown until loaded
64
- gr.update(interactive=False) # Disable generate button
65
- )
66
- else:
67
- return (
68
- gr.update(visible=False), # Hide "Load Speakers"
69
- gr.update(visible=False), # Hide speaker dropdown
70
- gr.update(interactive=True) # Enable generate button for single speaker
71
- )
72
-
73
- def load_and_show_speakers(voice_selection):
74
- speakers = extract_speakers(voice_selection)
75
- return (
76
- gr.update(choices=speakers, visible=True, value=speakers[0]),
77
- gr.update(interactive=True) # Now enable the generate button
78
- )
79
 
 
 
 
80
 
81
  with gr.Blocks() as interface:
82
- gr.Markdown("# 🎀 Realistic Voiceover from DOCX\nUpload a `.docx` file, select a voice, and generate lifelike speech!")
83
 
84
  with gr.Row():
85
  docx_input = gr.File(label="Upload .docx File", type="filepath")
86
  voice_dropdown = gr.Dropdown(choices=list(VOICE_MODELS.keys()), value="Jenny (Expressive Female)", label="Voice")
87
- load_speakers_btn = gr.Button("🔍 Load Speakers", visible=False)
88
- speaker_dropdown = gr.Dropdown(choices=[], label="Speaker", visible=False)
89
-
90
- generate_button = gr.Button("🎧 Generate Speech", interactive=True)
91
- audio_output = gr.Audio(label="🔊 Download WAV", type="filepath")
92
 
93
- # Interactions
94
- voice_dropdown.change(fn=show_load_button, inputs=voice_dropdown, outputs=[load_speakers_btn, speaker_dropdown, generate_button])
95
- load_speakers_btn.click(fn=load_and_show_speakers, inputs=voice_dropdown, outputs=[speaker_dropdown, generate_button])
96
 
97
- generate_button.click(fn=docx_to_wav, inputs=[docx_input, voice_dropdown, speaker_dropdown], outputs=audio_output)
 
 
 
 
98
 
99
  if __name__ == "__main__":
100
  interface.launch()
 
5
  from docx import Document
6
  from TTS.api import TTS
7
  import tempfile
8
+ import csv
9
+ from collections import defaultdict
10
 
11
+ # Model dictionary
12
  VOICE_MODELS = {
13
  "Jenny (Expressive Female)": {
14
  "model_name": "tts_models/en/jenny/jenny",
 
24
  }
25
  }
26
 
27
+ # Cache
28
  MODEL_CACHE = {}
29
+ SPEAKER_DROPDOWN_MAP = {} # Maps label -> ID
30
 
31
  def load_tts_model(model_key):
32
  if model_key in MODEL_CACHE:
 
36
  MODEL_CACHE[model_key] = tts
37
  return tts
38
 
39
+ def extract_speakers(model_key, metadata_path="metadata.csv"):
40
+ global SPEAKER_DROPDOWN_MAP
41
  info = VOICE_MODELS[model_key]
42
+ if not info["multi_speaker"]:
43
+ return []
44
+
45
+ tts = load_tts_model(model_key)
46
+ available_speakers = set(getattr(tts, "speakers", []))
47
+
48
+ speaker_audio_map = defaultdict(list)
49
+ with open(metadata_path, newline='') as csvfile:
50
+ reader = csv.reader(csvfile)
51
+ next(reader)
52
+ for row in reader:
53
+ if len(row) >= 2:
54
+ audio_id, speaker_id = row[0], row[1]
55
+ if speaker_id in available_speakers:
56
+ speaker_audio_map[speaker_id].append(audio_id)
57
+
58
+ SPEAKER_DROPDOWN_MAP.clear()
59
+ dropdown_choices = []
60
+ for speaker_id, audio_ids in speaker_audio_map.items():
61
+ label = f"{speaker_id} ({len(audio_ids)} samples)"
62
+ SPEAKER_DROPDOWN_MAP[label] = speaker_id
63
+ dropdown_choices.append(label)
64
+
65
+ return dropdown_choices
66
+
67
+ def docx_to_wav(doc_file, selected_voice, speaker_label=None):
68
  info = VOICE_MODELS[selected_voice]
69
  tts = load_tts_model(selected_voice)
70
 
71
+ # Extract text
72
  document = Document(doc_file.name)
73
  full_text = "\n".join([para.text for para in document.paragraphs if para.text.strip()])
74
 
 
76
  wav_path = tmp_wav.name
77
 
78
  kwargs = {}
79
+ if info["multi_speaker"] and speaker_label:
80
+ speaker_id = SPEAKER_DROPDOWN_MAP.get(speaker_label)
81
+ if not speaker_id:
82
+ raise ValueError("Speaker ID not found.")
83
+ kwargs["speaker"] = speaker_id
84
 
85
  tts.tts_to_file(text=full_text, file_path=wav_path, **kwargs)
86
  return wav_path
87
 
88
  def show_load_button(voice_selection):
89
+ is_multi = VOICE_MODELS[voice_selection]["multi_speaker"]
90
+ return gr.update(visible=is_multi)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
+ def load_speakers_ui(voice_selection):
93
+ speakers = extract_speakers(voice_selection)
94
+ return gr.update(choices=speakers, visible=True, value=speakers[0] if speakers else None)
95
 
96
  with gr.Blocks() as interface:
97
+ gr.Markdown("# 🗣️ DOCX to Realistic Voiceover")
98
 
99
  with gr.Row():
100
  docx_input = gr.File(label="Upload .docx File", type="filepath")
101
  voice_dropdown = gr.Dropdown(choices=list(VOICE_MODELS.keys()), value="Jenny (Expressive Female)", label="Voice")
102
+ load_button = gr.Button("🔄 Load Speakers", visible=False)
103
+ speaker_dropdown = gr.Dropdown(label="Speaker", visible=False)
104
+
105
+ generate_button = gr.Button("🎙️ Generate Speech")
106
+ audio_output = gr.Audio(label="🔊 Output WAV", type="filepath")
107
 
108
+ # Event bindings
109
+ voice_dropdown.change(fn=show_load_button, inputs=voice_dropdown, outputs=load_button)
110
+ load_button.click(fn=load_speakers_ui, inputs=voice_dropdown, outputs=speaker_dropdown)
111
 
112
+ generate_button.click(
113
+ fn=docx_to_wav,
114
+ inputs=[docx_input, voice_dropdown, speaker_dropdown],
115
+ outputs=audio_output
116
+ )
117
 
118
  if __name__ == "__main__":
119
  interface.launch()