# NOTE: Hugging Face Spaces page artifacts (status lines, file size, commit
# hashes, line-number gutter) were captured by the scrape and removed here.
import os
os.environ["NUMBA_DISABLE_CACHE"] = "1"
import gradio as gr
from docx import Document
from TTS.api import TTS
import tempfile
import csv
from collections import defaultdict
# Model dictionary: maps a human-readable voice label (shown in the UI dropdown)
# to the Coqui TTS model identifier and whether that model exposes multiple speakers.
VOICE_MODELS = {
    "Jenny (Expressive Female)": {
        "model_name": "tts_models/en/jenny/jenny",
        "multi_speaker": False
    },
    "LJSpeech (Standard Female)": {
        "model_name": "tts_models/en/ljspeech/vits",
        "multi_speaker": False
    },
    # VCTK is the only multi-speaker model; selecting it reveals the speaker dropdown.
    "VCTK (Multiple Speakers)": {
        "model_name": "tts_models/en/vctk/vits",
        "multi_speaker": True
    }
}
# Cache of loaded models: voice label -> TTS instance, so each model loads once per process.
MODEL_CACHE = {}
SPEAKER_DROPDOWN_MAP = {}  # Maps label -> ID (dropdown label -> raw speaker ID; rebuilt by extract_speakers)
def load_tts_model(model_key):
    """Return the Coqui TTS instance for *model_key*, loading and caching it on first use."""
    cached = MODEL_CACHE.get(model_key)
    if cached is not None:
        return cached
    model_info = VOICE_MODELS[model_key]
    # CPU-only, no progress bar: this runs inside a web request handler.
    instance = TTS(model_name=model_info["model_name"], progress_bar=False, gpu=False)
    MODEL_CACHE[model_key] = instance
    return instance
def extract_speakers(model_key, metadata_path="metadata.csv"):
    """Build dropdown labels for every speaker of a multi-speaker model.

    Reads *metadata_path* (CSV with a header row; columns: audio_id, speaker_id),
    counts samples per speaker that the loaded model actually supports, and
    rebuilds the module-level SPEAKER_DROPDOWN_MAP (label -> speaker ID).

    Returns a list of "speaker_id (N samples)" labels, or [] for single-speaker
    models or when the metadata file is missing.
    """
    info = VOICE_MODELS[model_key]
    if not info["multi_speaker"]:
        return []
    tts = load_tts_model(model_key)
    # Some models expose no .speakers attribute (or None); treat that as "none known".
    available_speakers = set(getattr(tts, "speakers", None) or [])
    # Always reset the map so a failed load doesn't leave stale labels behind.
    SPEAKER_DROPDOWN_MAP.clear()
    speaker_audio_map = defaultdict(list)
    try:
        with open(metadata_path, newline='', encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile)
            # Skip the header row; next(reader, None) tolerates an empty file
            # (a bare next() would raise StopIteration).
            next(reader, None)
            for row in reader:
                if len(row) >= 2:
                    audio_id, speaker_id = row[0], row[1]
                    if speaker_id in available_speakers:
                        speaker_audio_map[speaker_id].append(audio_id)
    except FileNotFoundError:
        # Missing metadata file: fail soft instead of crashing the UI callback.
        return []
    dropdown_choices = []
    for speaker_id, audio_ids in speaker_audio_map.items():
        label = f"{speaker_id} ({len(audio_ids)} samples)"
        SPEAKER_DROPDOWN_MAP[label] = speaker_id
        dropdown_choices.append(label)
    return dropdown_choices
def docx_to_wav(doc_file, selected_voice, speaker_label=None):
    """Convert an uploaded .docx file to a WAV voiceover with the selected voice.

    Args:
        doc_file: The uploaded document — a path string (gr.File with
            type="filepath") or a tempfile-like object with a .name attribute
            (older Gradio versions).
        selected_voice: Key into VOICE_MODELS.
        speaker_label: Dropdown label for multi-speaker models; ignored otherwise.

    Returns:
        Path to the generated WAV file.

    Raises:
        ValueError: If the document has no readable text, or the speaker label
            is unknown.
    """
    info = VOICE_MODELS[selected_voice]
    tts = load_tts_model(selected_voice)
    # gr.File(type="filepath") hands the callback a plain path string; the
    # original `doc_file.name` only worked for file-object uploads. Accept both.
    doc_path = doc_file if isinstance(doc_file, str) else doc_file.name
    document = Document(doc_path)
    full_text = "\n".join(para.text for para in document.paragraphs if para.text.strip())
    if not full_text:
        raise ValueError("The document contains no readable text.")
    # Create a named temp file just to reserve a path; TTS writes to it afterwards.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
        wav_path = tmp_wav.name
    kwargs = {}
    if info["multi_speaker"] and speaker_label:
        speaker_id = SPEAKER_DROPDOWN_MAP.get(speaker_label)
        if not speaker_id:
            raise ValueError("Speaker ID not found.")
        kwargs["speaker"] = speaker_id
    tts.tts_to_file(text=full_text, file_path=wav_path, **kwargs)
    return wav_path
def show_load_button(voice_selection):
    """Reveal the 'Load Speakers' button only when the chosen voice is multi-speaker."""
    return gr.update(visible=VOICE_MODELS[voice_selection]["multi_speaker"])
def load_speakers_ui(voice_selection):
    """Populate and show the speaker dropdown, preselecting the first speaker if any."""
    choices = extract_speakers(voice_selection)
    default_choice = choices[0] if choices else None
    return gr.update(choices=choices, visible=True, value=default_choice)
# UI layout: upload a .docx, pick a voice (and optionally a VCTK speaker),
# then generate a WAV voiceover.
with gr.Blocks() as interface:
    gr.Markdown("# π£οΈ DOCX to Realistic Voiceover")
    with gr.Row():
        docx_input = gr.File(label="Upload .docx File", type="filepath")
        voice_dropdown = gr.Dropdown(choices=list(VOICE_MODELS.keys()), value="Jenny (Expressive Female)", label="Voice")
    # Hidden until a multi-speaker voice is selected (see show_load_button).
    load_button = gr.Button("π Load Speakers", visible=False)
    speaker_dropdown = gr.Dropdown(label="Speaker", visible=False)
    generate_button = gr.Button("ποΈ Generate Speech")
    audio_output = gr.Audio(label="π Output WAV", type="filepath")
    # Event bindings
    # Changing the voice toggles the speaker-loading button's visibility.
    voice_dropdown.change(fn=show_load_button, inputs=voice_dropdown, outputs=load_button)
    # Loading speakers fills and reveals the speaker dropdown.
    load_button.click(fn=load_speakers_ui, inputs=voice_dropdown, outputs=speaker_dropdown)
    generate_button.click(
        fn=docx_to_wav,
        inputs=[docx_input, voice_dropdown, speaker_dropdown],
        outputs=audio_output
    )
if __name__ == "__main__":
    interface.launch()