Spaces:
Sleeping
Sleeping
File size: 3,158 Bytes
a3e2313 bebc496 68f40ec bebc496 b9bf9b2 bebc496 152fe30 bebc496 152fe30 b9bf9b2 bebc496 68f40ec ec3daa0 bebc496 ec3daa0 bebc496 e929cde ec3daa0 bebc496 a3e2313 e929cde a3e2313 bebc496 68f40ec bebc496 74f2f02 bebc496 b9bf9b2 a3e2313 68f40ec |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
import os
os.environ["NUMBA_DISABLE_CACHE"] = "1"
import gradio as gr
from docx import Document
from TTS.api import TTS
import tempfile
# Voices offered in the UI, keyed by the label shown to the user.
# Each entry carries:
#   "model_name"    - identifier handed to ``TTS(model_name=...)``
#   "multi_speaker" - whether a ``speaker`` argument is passed at synthesis time
VOICE_MODELS = {
    "Jenny (Expressive Female)": {
        "model_name": "tts_models/en/jenny/jenny",
        "multi_speaker": False,
    },
    "LJSpeech (Standard Female)": {
        "model_name": "tts_models/en/ljspeech/vits",
        "multi_speaker": False,
    },
    "VCTK (Multiple Speakers)": {
        "model_name": "tts_models/en/vctk/vits",
        "multi_speaker": True,
    },
}

# Already-constructed TTS objects, keyed by the same labels as VOICE_MODELS,
# so that each model is only built once per process.
MODEL_CACHE = {}
def load_tts_model(model_key):
    """Return the TTS object for *model_key*, building it on first request.

    Args:
        model_key: A key of ``VOICE_MODELS`` (the label shown in the UI).

    Returns:
        The ``TTS`` instance stored in ``MODEL_CACHE`` for that key.

    Raises:
        KeyError: If *model_key* is not a key of ``VOICE_MODELS``.
    """
    if model_key not in MODEL_CACHE:
        model_name = VOICE_MODELS[model_key]["model_name"]
        # Constructed with the progress bar off and on CPU (gpu=False).
        MODEL_CACHE[model_key] = TTS(
            model_name=model_name,
            progress_bar=False,
            gpu=False,
        )
    return MODEL_CACHE[model_key]
def extract_speakers(model_key):
    """Return the speaker ids that can be selected for the voice *model_key*.

    Args:
        model_key: A key of ``VOICE_MODELS`` (the label shown in the UI).

    Returns:
        A list of speaker ids. The list is empty when the voice has no
        selectable speakers; ``None`` is never returned.

    Raises:
        KeyError: If *model_key* is not a key of ``VOICE_MODELS``.
    """
    info = VOICE_MODELS[model_key]

    if not info["multi_speaker"]:
        # Voices flagged single-speaker never get a ``speaker`` argument at
        # synthesis time, so there is nothing to list and no reason to pay
        # for loading the model just to discover that.
        return []

    if info["model_name"] == "tts_models/en/vctk/vits":
        # Fixed shortlist of VCTK speaker ids, returned without loading the
        # model.
        # NOTE(review): presumably a deliberate subset of the model's
        # speakers - confirm these ids match what the model accepts.
        return ["p225", "p226", "p227", "p228", "p229", "p230", "p231", "p232", "p233", "p234"]

    # Any other multi-speaker voice: ask the loaded model. The attribute may
    # be absent or None, so normalise both cases to an empty list.
    tts = load_tts_model(model_key)
    return list(getattr(tts, "speakers", None) or [])
def docx_to_wav(doc_file, selected_voice, selected_speaker=None):
    """Synthesize the text of an uploaded .docx file into a WAV file.

    Args:
        doc_file: The upload handed over by the ``gr.File`` input. Either a
            filesystem path string (what ``type="filepath"`` delivers) or an
            object exposing the path as ``.name``.
        selected_voice: A key of ``VOICE_MODELS``.
        selected_speaker: Speaker id; only forwarded to the model when the
            selected voice is flagged ``multi_speaker``.

    Returns:
        Path of the generated WAV file. The file is created with
        ``delete=False``, so this function does not remove it on success.

    Raises:
        gr.Error: If no file was uploaded or the document has no non-blank
            paragraph text.
        KeyError: If *selected_voice* is not a key of ``VOICE_MODELS``.
    """
    if doc_file is None:
        raise gr.Error("Please upload a .docx file first.")

    info = VOICE_MODELS[selected_voice]

    # A "filepath" upload arrives as a plain string, which has no ``.name``;
    # file-like uploads carry the path in ``.name``. Accept both.
    docx_path = doc_file if isinstance(doc_file, str) else doc_file.name

    # Extract text from docx, skipping blank paragraphs.
    document = Document(docx_path)
    full_text = "\n".join(
        para.text for para in document.paragraphs if para.text.strip()
    )
    if not full_text:
        raise gr.Error("The document contains no text to synthesize.")

    tts = load_tts_model(selected_voice)

    kwargs = {}
    if info["multi_speaker"]:
        kwargs["speaker"] = selected_speaker

    # Reserve a unique output path, then close the handle before the model
    # writes to it so the file is not held open twice.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
        wav_path = tmp_wav.name

    try:
        tts.tts_to_file(text=full_text, file_path=wav_path, **kwargs)
    except Exception:
        # Do not leave an empty/partial temp file behind when synthesis fails.
        if os.path.exists(wav_path):
            os.unlink(wav_path)
        raise
    return wav_path
def update_speaker_dropdown(voice_selection):
    """Build the update for the speaker dropdown after the voice changes.

    Args:
        voice_selection: The newly selected key of ``VOICE_MODELS``.

    Returns:
        A Gradio update that replaces the dropdown's choices, shows it only
        when there is at least one speaker, and preselects the first speaker
        (or clears the value when there are none).
    """
    # Guard against a None result so the indexing below is always safe.
    speakers = list(extract_speakers(voice_selection) or [])
    # ``gr.update`` is used instead of ``gr.Dropdown.update`` because the
    # per-component ``update`` classmethods are not available in Gradio 4,
    # while ``gr.update`` works on both 3.x and 4.x.
    return gr.update(
        choices=speakers,
        visible=bool(speakers),
        value=speakers[0] if speakers else None,
    )
# Assemble the Gradio page: an upload field, a voice picker, a speaker picker
# that starts hidden, a button, and an audio output for the result.
# NOTE(review): the source's indentation was lost, so which widgets sit inside
# the Row is an assumption - confirm against the intended layout.
with gr.Blocks() as interface:
    gr.Markdown(
        "# Realistic Voiceover from DOCX\nUpload a .docx and choose a voice to generate a WAV audio."
    )

    with gr.Row():
        docx_input = gr.File(label="Upload .docx File", type="filepath")
        voice_dropdown = gr.Dropdown(
            choices=list(VOICE_MODELS),
            value="Jenny (Expressive Female)",
            label="Voice",
        )
        speaker_dropdown = gr.Dropdown(choices=[], label="Speaker", visible=False)

    generate_button = gr.Button("Generate Speech")
    audio_output = gr.Audio(label="Download WAV", type="filepath")

    # Changing the voice refreshes the speaker dropdown.
    voice_dropdown.change(
        fn=update_speaker_dropdown,
        inputs=voice_dropdown,
        outputs=speaker_dropdown,
    )
    # The button runs the synthesis and hands the WAV path to the audio widget.
    generate_button.click(
        fn=docx_to_wav,
        inputs=[docx_input, voice_dropdown, speaker_dropdown],
        outputs=audio_output,
    )


if __name__ == "__main__":
    interface.launch()
|