SohomToom committed on
Commit
3db8382
·
verified ·
1 Parent(s): bf698fd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -27
app.py CHANGED
@@ -6,7 +6,6 @@ from docx import Document
6
  from TTS.api import TTS
7
  import tempfile
8
 
9
- # Available TTS models with descriptions
10
  VOICE_MODELS = {
11
  "Jenny (Expressive Female)": {
12
  "model_name": "tts_models/en/jenny/jenny",
@@ -22,7 +21,6 @@ VOICE_MODELS = {
22
  }
23
  }
24
 
25
- # Cache to avoid reloading models
26
  MODEL_CACHE = {}
27
 
28
  def load_tts_model(model_key):
@@ -33,35 +31,23 @@ def load_tts_model(model_key):
33
  MODEL_CACHE[model_key] = tts
34
  return tts
35
 
36
- # def extract_speakers(model_key):
37
- # info = VOICE_MODELS[model_key]
38
- # if info["multi_speaker"]:
39
- # tts = load_tts_model(model_key)
40
- # return list(tts.speakers)
41
- # return []
42
-
43
  def extract_speakers(model_key):
44
  info = VOICE_MODELS[model_key]
45
  if info["multi_speaker"]:
46
  if info["model_name"] == "tts_models/en/vctk/vits":
47
- # Common VCTK speakers
48
  return ["p225", "p226", "p227", "p228", "p229", "p230", "p231", "p232", "p233", "p234"]
49
  else:
50
  tts = load_tts_model(model_key)
51
  return getattr(tts, "speakers", [])
52
  return []
53
 
54
-
55
-
56
  def docx_to_wav(doc_file, selected_voice, selected_speaker=None):
57
  info = VOICE_MODELS[selected_voice]
58
  tts = load_tts_model(selected_voice)
59
 
60
- # Extract text from docx
61
  document = Document(doc_file.name)
62
  full_text = "\n".join([para.text for para in document.paragraphs if para.text.strip()])
63
 
64
- # Save to WAV
65
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
66
  wav_path = tmp_wav.name
67
 
@@ -70,31 +56,33 @@ def docx_to_wav(doc_file, selected_voice, selected_speaker=None):
70
  kwargs["speaker"] = selected_speaker
71
 
72
  tts.tts_to_file(text=full_text, file_path=wav_path, **kwargs)
73
-
74
  return wav_path
75
 
76
- def update_speaker_dropdown(voice_selection):
 
 
 
 
77
  speakers = extract_speakers(voice_selection)
78
- return gr.Dropdown.update(choices=speakers, visible=bool(speakers), value=speakers[0] if speakers else None)
79
 
80
  with gr.Blocks() as interface:
81
- gr.Markdown("# Realistic Voiceover from DOCX\nUpload a .docx and choose a voice to generate a WAV audio.")
82
 
83
  with gr.Row():
84
  docx_input = gr.File(label="Upload .docx File", type="filepath")
85
  voice_dropdown = gr.Dropdown(choices=list(VOICE_MODELS.keys()), value="Jenny (Expressive Female)", label="Voice")
 
86
  speaker_dropdown = gr.Dropdown(choices=[], label="Speaker", visible=False)
 
 
 
87
 
88
- generate_button = gr.Button("Generate Speech")
89
- audio_output = gr.Audio(label="Download WAV", type="filepath")
90
-
91
- voice_dropdown.change(fn=update_speaker_dropdown, inputs=voice_dropdown, outputs=speaker_dropdown)
92
 
93
- generate_button.click(
94
- fn=docx_to_wav,
95
- inputs=[docx_input, voice_dropdown, speaker_dropdown],
96
- outputs=audio_output
97
- )
98
 
99
  if __name__ == "__main__":
100
  interface.launch()
 
6
  from TTS.api import TTS
7
  import tempfile
8
 
 
9
  VOICE_MODELS = {
10
  "Jenny (Expressive Female)": {
11
  "model_name": "tts_models/en/jenny/jenny",
 
21
  }
22
  }
23
 
 
24
  MODEL_CACHE = {}
25
 
26
  def load_tts_model(model_key):
 
31
  MODEL_CACHE[model_key] = tts
32
  return tts
33
 
 
 
 
 
 
 
 
34
  def extract_speakers(model_key):
35
  info = VOICE_MODELS[model_key]
36
  if info["multi_speaker"]:
37
  if info["model_name"] == "tts_models/en/vctk/vits":
 
38
  return ["p225", "p226", "p227", "p228", "p229", "p230", "p231", "p232", "p233", "p234"]
39
  else:
40
  tts = load_tts_model(model_key)
41
  return getattr(tts, "speakers", [])
42
  return []
43
 
 
 
44
  def docx_to_wav(doc_file, selected_voice, selected_speaker=None):
45
  info = VOICE_MODELS[selected_voice]
46
  tts = load_tts_model(selected_voice)
47
 
 
48
  document = Document(doc_file.name)
49
  full_text = "\n".join([para.text for para in document.paragraphs if para.text.strip()])
50
 
 
51
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
52
  wav_path = tmp_wav.name
53
 
 
56
  kwargs["speaker"] = selected_speaker
57
 
58
  tts.tts_to_file(text=full_text, file_path=wav_path, **kwargs)
 
59
  return wav_path
60
 
61
def show_load_button(voice_selection):
    """Reset the speaker controls after the selected voice changes.

    Args:
        voice_selection: A key of ``VOICE_MODELS``.

    Returns:
        A tuple of three component updates, in this order: the
        "load speakers" button, the speaker dropdown, and the generate button.
    """
    needs_speaker = bool(VOICE_MODELS[voice_selection]["multi_speaker"])
    return (
        # Only multi-speaker voices need the explicit "load speakers" step.
        gr.update(visible=needs_speaker),
        # Hide the dropdown and drop any speaker chosen for the previous voice.
        gr.update(visible=False, value=None),
        # Keep the generate button on screen. Hiding it here made it
        # unreachable, because no other handler set it visible again.
        # Multi-speaker voices stay disabled until their speakers are loaded.
        gr.update(visible=True, interactive=not needs_speaker),
    )
64
+
65
def load_and_show_speakers(voice_selection):
    """Populate the speaker dropdown for the selected voice and enable generation.

    Args:
        voice_selection: A key of ``VOICE_MODELS``.

    Returns:
        A tuple of two component updates, in this order: the speaker dropdown
        and the generate button.
    """
    speakers = list(extract_speakers(voice_selection) or [])
    if speakers:
        dropdown_update = gr.update(choices=speakers, visible=True, value=speakers[0])
    else:
        # Nothing to choose from: keep the dropdown hidden instead of failing
        # on speakers[0].
        dropdown_update = gr.update(choices=[], visible=False, value=None)
    # Set visible as well as interactive, so the button is usable even if an
    # earlier voice change hid it.
    return dropdown_update, gr.update(visible=True, interactive=True)
68
 
69
  with gr.Blocks() as interface:
70
+ gr.Markdown("# 🎀 Realistic Voiceover from DOCX\nUpload a `.docx` file, select a voice, and generate lifelike speech!")
71
 
72
  with gr.Row():
73
  docx_input = gr.File(label="Upload .docx File", type="filepath")
74
  voice_dropdown = gr.Dropdown(choices=list(VOICE_MODELS.keys()), value="Jenny (Expressive Female)", label="Voice")
75
+ load_speakers_btn = gr.Button("πŸ” Load Speakers", visible=False)
76
  speaker_dropdown = gr.Dropdown(choices=[], label="Speaker", visible=False)
77
+
78
+ generate_button = gr.Button("🎧 Generate Speech", interactive=True)
79
+ audio_output = gr.Audio(label="πŸ”Š Download WAV", type="filepath")
80
 
81
+ # Interactions
82
+ voice_dropdown.change(fn=show_load_button, inputs=voice_dropdown, outputs=[load_speakers_btn, speaker_dropdown, generate_button])
83
+ load_speakers_btn.click(fn=load_and_show_speakers, inputs=voice_dropdown, outputs=[speaker_dropdown, generate_button])
 
84
 
85
+ generate_button.click(fn=docx_to_wav, inputs=[docx_input, voice_dropdown, speaker_dropdown], outputs=audio_output)
 
 
 
 
86
 
87
  if __name__ == "__main__":
88
  interface.launch()