SohomToom committed on
Commit
bebc496
·
verified ·
1 Parent(s): 14d8745

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -48
app.py CHANGED
@@ -1,65 +1,87 @@
1
  import os
2
- import tempfile
3
- import zipfile
 
4
  from docx import Document
5
  from TTS.api import TTS
6
- from pydub import AudioSegment
7
- import gradio as gr
8
 
9
- # Available TTS models with voice descriptions
10
  VOICE_MODELS = {
11
- "Jenny (Expressive Female)": "tts_models/en/jenny/jenny",
12
- "LJSpeech (Standard Female)": "tts_models/en/ljspeech/vits",
13
- "VCTK (Multiple Speakers)": "tts_models/en/vctk/vits"
 
 
 
 
 
 
 
 
 
14
  }
15
 
16
- # Function to update speaker choices based on the selected model
17
- def update_speaker_choices(selected_voice):
18
- if selected_voice == "VCTK (Multiple Speakers)":
19
- return ["Speaker 1", "Speaker 2", "Speaker 3"] # Modify with actual speaker names or indices
20
- return ["Default Speaker"]
 
 
 
 
 
21
 
22
- def docx_to_wav_zip(doc_file, selected_voice, speaker_name):
23
- # Load the selected TTS model
24
- tts = TTS(model_name=VOICE_MODELS[selected_voice], progress_bar=False, gpu=False)
 
 
 
25
 
26
- # Extract text from .docx
 
 
 
 
27
  document = Document(doc_file.name)
28
  full_text = "\n".join([para.text for para in document.paragraphs if para.text.strip()])
29
 
30
- # Generate temporary paths
31
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
32
  wav_path = tmp_wav.name
33
- zip_path = wav_path.replace(".wav", ".zip")
34
-
35
- # Get speaker index (this part assumes speaker names are like 'Speaker 1', 'Speaker 2', etc.)
36
- speaker_idx = int(speaker_name.split()[-1]) - 1 if speaker_name.startswith("Speaker") else 0
37
-
38
- # Generate speech with the selected speaker index
39
- tts.tts_to_file(text=full_text, speaker_idx=speaker_idx, file_path=wav_path)
40
-
41
- # Convert wav to mp3 and zip the result
42
- sound = AudioSegment.from_wav(wav_path)
43
- sound.export(wav_path, format="wav") # keeping the wav format
44
-
45
- # Zip the files
46
- with zipfile.ZipFile(zip_path, 'w') as zipf:
47
- zipf.write(wav_path, os.path.basename(wav_path))
48
-
49
- return zip_path
50
-
51
- # Gradio interface
52
- interface = gr.Interface(
53
- fn=docx_to_wav_zip,
54
- inputs=[
55
- gr.File(label="Upload .docx File"),
56
- gr.Dropdown(choices=list(VOICE_MODELS.keys()), label="Choose Voice", value="Jenny (Expressive Female)"),
57
- gr.Dropdown(choices=update_speaker_choices("VCTK (Multiple Speakers)"), label="Choose Speaker", value="Speaker 1") # Example
58
- ],
59
- outputs=gr.File(label="Download Zip File"),
60
- title="Realistic Voiceover from DOCX (Multiple Voices)",
61
- description="Upload a .docx file, choose a realistic voice, and pick a speaker to generate a voiceover in WAV format."
62
- )
 
63
 
64
  if __name__ == "__main__":
65
  interface.launch()
 
1
  import os
2
+ os.environ["NUMBA_DISABLE_CACHE"] = "1"
3
+
4
+ import gradio as gr
5
  from docx import Document
6
  from TTS.api import TTS
7
+ import tempfile
 
8
 
9
+ # Available TTS models with descriptions
10
  VOICE_MODELS = {
11
+ "Jenny (Expressive Female)": {
12
+ "model_name": "tts_models/en/jenny/jenny",
13
+ "multi_speaker": False
14
+ },
15
+ "LJSpeech (Standard Female)": {
16
+ "model_name": "tts_models/en/ljspeech/vits",
17
+ "multi_speaker": False
18
+ },
19
+ "VCTK (Multiple Speakers)": {
20
+ "model_name": "tts_models/en/vctk/vits",
21
+ "multi_speaker": True
22
+ }
23
  }
24
 
25
+ # Cache to avoid reloading models
26
+ MODEL_CACHE = {}
27
+
28
+ def load_tts_model(model_key):
29
+ if model_key in MODEL_CACHE:
30
+ return MODEL_CACHE[model_key]
31
+ info = VOICE_MODELS[model_key]
32
+ tts = TTS(model_name=info["model_name"], progress_bar=False, gpu=False)
33
+ MODEL_CACHE[model_key] = tts
34
+ return tts
35
 
36
+ def extract_speakers(model_key):
37
+ info = VOICE_MODELS[model_key]
38
+ if info["multi_speaker"]:
39
+ tts = load_tts_model(model_key)
40
+ return list(tts.speakers)
41
+ return []
42
 
43
+ def docx_to_wav(doc_file, selected_voice, selected_speaker=None):
44
+ info = VOICE_MODELS[selected_voice]
45
+ tts = load_tts_model(selected_voice)
46
+
47
+ # Extract text from docx
48
  document = Document(doc_file.name)
49
  full_text = "\n".join([para.text for para in document.paragraphs if para.text.strip()])
50
 
51
+ # Save to WAV
52
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
53
  wav_path = tmp_wav.name
54
+
55
+ kwargs = {}
56
+ if info["multi_speaker"]:
57
+ kwargs["speaker"] = selected_speaker
58
+
59
+ tts.tts_to_file(text=full_text, file_path=wav_path, **kwargs)
60
+
61
+ return wav_path
62
+
63
+ def update_speaker_dropdown(voice_selection):
64
+ speakers = extract_speakers(voice_selection)
65
+ return gr.Dropdown.update(choices=speakers, visible=bool(speakers), value=speakers[0] if speakers else None)
66
+
67
+ with gr.Blocks() as interface:
68
+ gr.Markdown("# Realistic Voiceover from DOCX\nUpload a .docx and choose a voice to generate a WAV audio.")
69
+
70
+ with gr.Row():
71
+ docx_input = gr.File(label="Upload .docx File", type="file")
72
+ voice_dropdown = gr.Dropdown(choices=list(VOICE_MODELS.keys()), value="Jenny (Expressive Female)", label="Voice")
73
+ speaker_dropdown = gr.Dropdown(choices=[], label="Speaker", visible=False)
74
+
75
+ generate_button = gr.Button("Generate Speech")
76
+ audio_output = gr.Audio(label="Download WAV", type="filepath")
77
+
78
+ voice_dropdown.change(fn=update_speaker_dropdown, inputs=voice_dropdown, outputs=speaker_dropdown)
79
+
80
+ generate_button.click(
81
+ fn=docx_to_wav,
82
+ inputs=[docx_input, voice_dropdown, speaker_dropdown],
83
+ outputs=audio_output
84
+ )
85
 
86
  if __name__ == "__main__":
87
  interface.launch()