Spaces:
Sleeping
Sleeping
File size: 3,735 Bytes
a3e2313 bebc496 68f40ec bebc496 b9bf9b2 152fe30 bebc496 152fe30 b9bf9b2 bebc496 68f40ec bebc496 bf698fd ec3daa0 bebc496 e929cde bebc496 a3e2313 e929cde a3e2313 68f40ec bebc496 3db8382 3e61782 3db8382 bebc496 3e61782 bebc496 3db8382 bebc496 74f2f02 bebc496 3db8382 bebc496 3db8382 bebc496 3db8382 bebc496 3db8382 b9bf9b2 a3e2313 68f40ec |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
import os
os.environ["NUMBA_DISABLE_CACHE"] = "1"
import gradio as gr
from docx import Document
from TTS.api import TTS
import tempfile
# Registry of selectable voices. Each UI label maps to the Coqui TTS model id
# to load ("model_name") and a flag telling the UI/handlers whether a speaker
# must also be chosen ("multi_speaker").
VOICE_MODELS = {
    label: {"model_name": model_name, "multi_speaker": multi_speaker}
    for label, model_name, multi_speaker in (
        ("Jenny (Expressive Female)", "tts_models/en/jenny/jenny", False),
        ("LJSpeech (Standard Female)", "tts_models/en/ljspeech/vits", False),
        ("VCTK (Multiple Speakers)", "tts_models/en/vctk/vits", True),
    )
}

# Loaded TTS instances keyed by the same labels as VOICE_MODELS, so each model
# is constructed at most once per process.
MODEL_CACHE = dict()
def load_tts_model(model_key):
    """Return the TTS instance for ``model_key``, constructing it on first use.

    Args:
        model_key: A key of ``VOICE_MODELS`` (the label shown in the UI).

    Returns:
        The ``TTS`` object stored in ``MODEL_CACHE`` for that key.

    Raises:
        KeyError: If ``model_key`` is not present in ``VOICE_MODELS``.
    """
    try:
        return MODEL_CACHE[model_key]
    except KeyError:
        pass  # Not loaded yet; fall through and build it.

    model_name = VOICE_MODELS[model_key]["model_name"]
    instance = TTS(model_name=model_name, progress_bar=False, gpu=False)
    MODEL_CACHE[model_key] = instance
    return instance
def extract_speakers(model_key):
    """Return the list of speaker ids offered for the given voice.

    Args:
        model_key: A key of ``VOICE_MODELS``.

    Returns:
        A list of speaker id strings. Always a list (never ``None``); empty
        for single-speaker voices or when the model exposes no speakers.

    Raises:
        KeyError: If ``model_key`` is not present in ``VOICE_MODELS``.
    """
    info = VOICE_MODELS[model_key]
    if not info["multi_speaker"]:
        return []

    if info["model_name"] == "tts_models/en/vctk/vits":
        # Fixed shortlist for VCTK; returning it directly avoids loading the
        # model just to populate the dropdown.
        return ["p225", "p226", "p227", "p228", "p229", "p230", "p231", "p232", "p233", "p234"]

    tts = load_tts_model(model_key)
    # The attribute may be missing or set to None; normalise both to an empty
    # list so callers can safely index/iterate the result.
    return list(getattr(tts, "speakers", None) or [])
def docx_to_wav(doc_file, selected_voice, selected_speaker=None):
    """Synthesize the text of an uploaded .docx file into a WAV file.

    Args:
        doc_file: The uploaded document, either as a filesystem path
            (``str`` / ``os.PathLike``) or as an object exposing the path
            through a ``.name`` attribute.
        selected_voice: A key of ``VOICE_MODELS``.
        selected_speaker: Speaker id; only used (and then required) when the
            selected voice is flagged ``multi_speaker``.

    Returns:
        Path of the generated ``.wav`` file. The file is created with
        ``delete=False`` and is not removed by this function on success.

    Raises:
        gr.Error: If no file was uploaded, a multi-speaker voice has no
            speaker selected, or the document has no non-blank paragraphs.
        KeyError: If ``selected_voice`` is not present in ``VOICE_MODELS``.
    """
    if doc_file is None:
        raise gr.Error("Please upload a .docx file first.")

    # Accept both a plain path and a file-like wrapper carrying `.name`, so
    # the handler works regardless of how the File component delivers uploads.
    if isinstance(doc_file, (str, os.PathLike)):
        doc_path = os.fspath(doc_file)
    else:
        doc_path = doc_file.name

    info = VOICE_MODELS[selected_voice]
    kwargs = {}
    if info["multi_speaker"]:
        if not selected_speaker:
            raise gr.Error("Please load and select a speaker for this voice.")
        kwargs["speaker"] = selected_speaker

    # Validate the document before paying for a (possibly slow) model load.
    document = Document(doc_path)
    full_text = "\n".join(
        para.text for para in document.paragraphs if para.text.strip()
    )
    if not full_text:
        raise gr.Error("The document contains no readable text.")

    tts = load_tts_model(selected_voice)

    # Only the unique name is needed here; the synthesizer writes the file.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
        wav_path = tmp_wav.name

    try:
        tts.tts_to_file(text=full_text, file_path=wav_path, **kwargs)
    except Exception:
        # Do not leave an orphaned temp file behind when synthesis fails.
        if os.path.exists(wav_path):
            os.remove(wav_path)
        raise
    return wav_path
def show_load_button(voice_selection):
    """Compute UI updates after the voice dropdown changes.

    Args:
        voice_selection: A key of ``VOICE_MODELS``.

    Returns:
        A 3-tuple of ``gr.update`` payloads, in order, for: the
        "Load Speakers" button, the speaker dropdown, and the generate button.
    """
    needs_speaker = bool(VOICE_MODELS[voice_selection]["multi_speaker"])

    # "Load Speakers" is only offered for multi-speaker voices.
    load_button_update = gr.update(visible=needs_speaker)
    # The speaker dropdown is always hidden here; it is revealed on load.
    speaker_dropdown_update = gr.update(visible=False)
    # Generation is enabled straight away only for single-speaker voices.
    generate_button_update = gr.update(interactive=not needs_speaker)

    return (load_button_update, speaker_dropdown_update, generate_button_update)
def load_and_show_speakers(voice_selection):
    """Populate the speaker dropdown for the selected voice.

    Args:
        voice_selection: A key of ``VOICE_MODELS``.

    Returns:
        A 2-tuple of ``gr.update`` payloads, in order, for: the speaker
        dropdown and the generate button.
    """
    # Normalise to a list so an empty or None result cannot crash the indexing
    # below.
    speakers = list(extract_speakers(voice_selection) or [])
    has_speakers = bool(speakers)
    needs_speaker = bool(VOICE_MODELS[voice_selection]["multi_speaker"])

    return (
        gr.update(
            choices=speakers,
            visible=has_speakers,
            value=speakers[0] if has_speakers else None,
        ),
        # Enable generation once a speaker is available, or when the voice
        # does not need one at all.
        gr.update(interactive=has_speakers or not needs_speaker),
    )
# Build the Gradio UI: components are declared inside the Blocks context and
# the event handlers are wired to them at the end of the same context.
# NOTE(review): the original indentation of this section was lost; the nesting
# below (only the upload and voice dropdown inside the Row) is presumed —
# confirm against the intended layout.
with gr.Blocks() as interface:
    gr.Markdown("# π€ Realistic Voiceover from DOCX\nUpload a `.docx` file, select a voice, and generate lifelike speech!")
    with gr.Row():
        # NOTE(review): type="filepath" presumably hands the handler a path
        # rather than a file object — verify docx_to_wav accepts that form.
        docx_input = gr.File(label="Upload .docx File", type="filepath")
        voice_dropdown = gr.Dropdown(choices=list(VOICE_MODELS.keys()), value="Jenny (Expressive Female)", label="Voice")
    # Both speaker controls start hidden; the default voice is single-speaker.
    load_speakers_btn = gr.Button("π Load Speakers", visible=False)
    speaker_dropdown = gr.Dropdown(choices=[], label="Speaker", visible=False)
    generate_button = gr.Button("π§ Generate Speech", interactive=True)
    audio_output = gr.Audio(label="π Download WAV", type="filepath")
    # Interactions
    # Changing the voice toggles the speaker controls and the generate button.
    voice_dropdown.change(fn=show_load_button, inputs=voice_dropdown, outputs=[load_speakers_btn, speaker_dropdown, generate_button])
    # Clicking "Load Speakers" fills the speaker dropdown and updates the generate button.
    load_speakers_btn.click(fn=load_and_show_speakers, inputs=voice_dropdown, outputs=[speaker_dropdown, generate_button])
    # Generating passes the upload, voice and speaker to docx_to_wav; its
    # returned path feeds the audio component.
    generate_button.click(fn=docx_to_wav, inputs=[docx_input, voice_dropdown, speaker_dropdown], outputs=audio_output)

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    interface.launch()
|