Spaces:
Sleeping
Sleeping
File size: 3,286 Bytes
a3e2313 bebc496 68f40ec bebc496 b9bf9b2 152fe30 bebc496 152fe30 b9bf9b2 bebc496 68f40ec bebc496 bf698fd ec3daa0 bebc496 e929cde bebc496 a3e2313 e929cde a3e2313 68f40ec bebc496 3db8382 bebc496 3db8382 bebc496 3db8382 bebc496 74f2f02 bebc496 3db8382 bebc496 3db8382 bebc496 3db8382 bebc496 3db8382 b9bf9b2 a3e2313 68f40ec |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
import os

# Set before the TTS import below so the variable is already present in the
# process environment when that package (and whatever it imports) is loaded.
# NOTE(review): presumably meant to stop numba from writing its on-disk cache
# in the hosted environment — confirm numba actually honours this name.
os.environ["NUMBA_DISABLE_CACHE"] = "1"

import gradio as gr
from docx import Document
from TTS.api import TTS
import tempfile
# Catalogue of selectable voices, keyed by the label shown in the voice
# dropdown.  Each entry carries the model identifier handed to TTS() and a
# flag telling whether a speaker must be passed along when synthesising.
VOICE_MODELS = {
    "Jenny (Expressive Female)": dict(
        model_name="tts_models/en/jenny/jenny",
        multi_speaker=False,
    ),
    "LJSpeech (Standard Female)": dict(
        model_name="tts_models/en/ljspeech/vits",
        multi_speaker=False,
    ),
    "VCTK (Multiple Speakers)": dict(
        model_name="tts_models/en/vctk/vits",
        multi_speaker=True,
    ),
}

# Already-constructed TTS objects, keyed by the same labels as VOICE_MODELS.
MODEL_CACHE = {}
def load_tts_model(model_key):
    """Return the TTS object for a voice label, constructing it on first use.

    Args:
        model_key: A key of ``VOICE_MODELS``.

    Returns:
        The ``TTS`` instance for that voice; the same object is returned on
        every later call with the same key.

    Raises:
        KeyError: If ``model_key`` is neither cached nor in ``VOICE_MODELS``.
    """
    try:
        return MODEL_CACHE[model_key]
    except KeyError:
        pass  # Not built yet; fall through and construct it.

    model_name = VOICE_MODELS[model_key]["model_name"]
    model = TTS(model_name=model_name, progress_bar=False, gpu=False)
    MODEL_CACHE[model_key] = model
    return model
def extract_speakers(model_key):
    """List the speaker ids that can be offered for a voice.

    Args:
        model_key: A key of ``VOICE_MODELS``.

    Returns:
        A list of speaker ids. Empty for voices not flagged ``multi_speaker``
        and for models that report no speakers. Always a list, never ``None``,
        so callers can safely test it for emptiness or index into it.

    Raises:
        KeyError: If ``model_key`` is not in ``VOICE_MODELS``.
    """
    info = VOICE_MODELS[model_key]
    if not info["multi_speaker"]:
        return []

    if info["model_name"] == "tts_models/en/vctk/vits":
        # Fixed shortlist for VCTK; returned without loading the model.
        return ["p225", "p226", "p227", "p228", "p229", "p230", "p231", "p232", "p233", "p234"]

    tts = load_tts_model(model_key)
    # The attribute may be absent *or* present-but-None; normalise both to an
    # empty list instead of letting None escape to the caller.
    speakers = getattr(tts, "speakers", None)
    return list(speakers) if speakers else []
def docx_to_wav(doc_file, selected_voice, selected_speaker=None):
    """Synthesise the text of a .docx file into a temporary WAV file.

    Args:
        doc_file: The uploaded document, either a filesystem path
            (``str`` / ``os.PathLike``) or an object exposing the path as
            ``.name``.
        selected_voice: A key of ``VOICE_MODELS``.
        selected_speaker: Speaker id; only used (and then required) when the
            selected voice is flagged ``multi_speaker``.

    Returns:
        Path of the generated ``.wav`` file. The file is created with
        ``delete=False`` so it outlives this call.

    Raises:
        gr.Error: If no file was uploaded, the document has no non-blank
            paragraphs, or a multi-speaker voice is used without a speaker.
        KeyError: If ``selected_voice`` is not in ``VOICE_MODELS``.
    """
    if doc_file is None:
        raise gr.Error("Please upload a .docx file first.")

    info = VOICE_MODELS[selected_voice]

    # Accept both a plain path and a file wrapper carrying the path in `.name`.
    if isinstance(doc_file, (str, os.PathLike)):
        doc_path = os.fspath(doc_file)
    else:
        doc_path = doc_file.name

    document = Document(doc_path)
    full_text = "\n".join(
        para.text for para in document.paragraphs if para.text.strip()
    )
    if not full_text:
        raise gr.Error("The uploaded document contains no readable text.")

    kwargs = {}
    if info["multi_speaker"]:
        if not selected_speaker:
            raise gr.Error("Please load and select a speaker for this voice.")
        kwargs["speaker"] = selected_speaker

    # Load the model only after the inputs are validated, so a bad request
    # does not pay for constructing it.
    tts = load_tts_model(selected_voice)

    # Reserve a unique output path; the handle is closed before synthesis so
    # tts_to_file can open the path itself.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
        wav_path = tmp_wav.name

    try:
        tts.tts_to_file(text=full_text, file_path=wav_path, **kwargs)
    except Exception:
        # Synthesis failed: remove the reserved file rather than orphan it,
        # then let the original error propagate.
        try:
            os.remove(wav_path)
        except OSError:
            pass  # Best-effort cleanup; the synthesis error matters more.
        raise
    return wav_path
def show_load_button(voice_selection):
    """Compute UI updates for a change of the selected voice.

    Args:
        voice_selection: A key of ``VOICE_MODELS``.

    Returns:
        A 3-tuple of ``gr.update`` values, in the order the change handler
        lists its outputs:

        1. "Load Speakers" button: visible only for multi-speaker voices.
        2. Speaker dropdown: hidden until speakers are loaded.
        3. Generate button: stays visible; enabled immediately for
           single-speaker voices, disabled for multi-speaker voices until a
           speaker has been loaded.

    Raises:
        KeyError: If ``voice_selection`` is not in ``VOICE_MODELS``.
    """
    needs_speaker = VOICE_MODELS[voice_selection]["multi_speaker"]
    return (
        gr.update(visible=needs_speaker),
        gr.update(visible=False),
        # Previously this hid the button on every voice change and nothing
        # made it visible again, so generation became impossible after
        # switching voices. Keep it visible and only gate interactivity.
        gr.update(visible=True, interactive=not needs_speaker),
    )
def load_and_show_speakers(voice_selection):
    """Populate and reveal the speaker dropdown, and enable generation.

    Args:
        voice_selection: A key of ``VOICE_MODELS``.

    Returns:
        A 2-tuple of ``gr.update`` values: first for the speaker dropdown
        (choices filled in, shown, first speaker preselected when there is
        one), second for the Generate button (shown and enabled).
    """
    # Tolerate an empty or missing speaker list instead of failing on [0].
    speakers = extract_speakers(voice_selection) or []
    default_speaker = speakers[0] if speakers else None
    return (
        gr.update(choices=speakers, visible=True, value=default_speaker),
        # Set visibility explicitly as well: the voice-change handler may have
        # hidden the button, and enabling a hidden button has no visible effect.
        gr.update(visible=True, interactive=True),
    )
# UI layout and event wiring.
# NOTE(review): the leading "π…" characters in several labels below look like
# mis-decoded emoji — confirm the file's encoding before changing them.
with gr.Blocks() as interface:
    gr.Markdown("# π€ Realistic Voiceover from DOCX\nUpload a `.docx` file, select a voice, and generate lifelike speech!")

    # Document upload and voice choice sit side by side.
    with gr.Row():
        docx_input = gr.File(
            label="Upload .docx File",
            type="filepath",
        )
        voice_dropdown = gr.Dropdown(
            choices=list(VOICE_MODELS),
            value="Jenny (Expressive Female)",
            label="Voice",
        )

    # Speaker controls start hidden; the voice-change handler reveals the
    # load button for multi-speaker voices.
    load_speakers_btn = gr.Button("π Load Speakers", visible=False)
    speaker_dropdown = gr.Dropdown(
        choices=[],
        label="Speaker",
        visible=False,
    )

    generate_button = gr.Button("π§ Generate Speech", interactive=True)
    audio_output = gr.Audio(label="π Download WAV", type="filepath")

    # Interactions
    voice_dropdown.change(
        fn=show_load_button,
        inputs=voice_dropdown,
        outputs=[load_speakers_btn, speaker_dropdown, generate_button],
    )
    load_speakers_btn.click(
        fn=load_and_show_speakers,
        inputs=voice_dropdown,
        outputs=[speaker_dropdown, generate_button],
    )
    generate_button.click(
        fn=docx_to_wav,
        inputs=[docx_input, voice_dropdown, speaker_dropdown],
        outputs=audio_output,
    )

if __name__ == "__main__":
    interface.launch()
|