"""Gradio app that turns an uploaded .docx document into a spoken WAV file.

Text is read with python-docx and synthesised with a Coqui TTS model chosen
from ``VOICE_MODELS``. For multi-speaker models the user first loads a speaker
list and then picks one speaker from a dropdown.
"""
import os

# Set before the TTS import below so the variable is already present in the
# environment when that package (and whatever it imports) is loaded.
# NOTE(review): presumably meant to keep numba from using its on-disk cache --
# confirm that numba actually honours this variable.
os.environ["NUMBA_DISABLE_CACHE"] = "1"

import contextlib
import csv
import tempfile
from collections import defaultdict

import gradio as gr
from docx import Document
from TTS.api import TTS

# Voices offered in the UI: display label -> Coqui model name plus a flag
# telling whether a speaker has to be chosen for that model.
VOICE_MODELS = {
    "Jenny (Expressive Female)": {
        "model_name": "tts_models/en/jenny/jenny",
        "multi_speaker": False,
    },
    "LJSpeech (Standard Female)": {
        "model_name": "tts_models/en/ljspeech/vits",
        "multi_speaker": False,
    },
    "VCTK (Multiple Speakers)": {
        "model_name": "tts_models/en/vctk/vits",
        "multi_speaker": True,
    },
}

# Loaded TTS instances keyed by their VOICE_MODELS label, so each model is
# constructed at most once per process.
MODEL_CACHE = {}

# Dropdown label -> speaker ID, rebuilt by extract_speakers().
SPEAKER_DROPDOWN_MAP = {}


def load_tts_model(model_key):
    """Return the TTS instance for *model_key*, creating it on first use.

    Args:
        model_key: A key of ``VOICE_MODELS``.

    Returns:
        The cached ``TTS`` object (constructed with ``gpu=False``).

    Raises:
        KeyError: If *model_key* is not a key of ``VOICE_MODELS``.
    """
    if model_key in MODEL_CACHE:
        return MODEL_CACHE[model_key]

    info = VOICE_MODELS[model_key]
    tts = TTS(model_name=info["model_name"], progress_bar=False, gpu=False)
    MODEL_CACHE[model_key] = tts
    return tts


def _read_speaker_samples(metadata_path, available_speakers):
    """Group audio IDs by speaker ID from the metadata CSV.

    The first row is treated as a header and skipped. In every other row,
    column 0 is read as the speaker ID and column 1 as the audio ID; rows with
    fewer than two columns or with a speaker not in *available_speakers* are
    ignored.

    Returns:
        A dict mapping speaker ID -> list of audio IDs, or ``None`` when the
        metadata file does not exist.
    """
    samples = defaultdict(list)
    try:
        with open(metadata_path, newline="", encoding="utf-8") as csvfile:
            reader = csv.reader(csvfile)
            # Default of None keeps an empty file from raising StopIteration.
            next(reader, None)
            for row in reader:
                if len(row) < 2:
                    continue
                speaker_id, audio_id = row[0], row[1]
                if speaker_id in available_speakers:
                    samples[speaker_id].append(audio_id)
    except FileNotFoundError:
        return None
    return samples


def extract_speakers(model_key, metadata_path="metadata.csv"):
    """Build the speaker dropdown choices for a multi-speaker model.

    Rebuilds ``SPEAKER_DROPDOWN_MAP`` (label -> speaker ID) as a side effect.
    When the metadata CSV exists, each label has the form
    ``"<speaker_id> (<n> samples)"`` and only speakers that appear in the CSV
    and are known to the model are listed. When the CSV is missing, every
    speaker reported by the model is listed under its plain ID instead.

    Args:
        model_key: A key of ``VOICE_MODELS``.
        metadata_path: Path of the CSV file described above.

    Returns:
        The list of dropdown labels; empty for single-speaker models.
    """
    info = VOICE_MODELS[model_key]
    if not info["multi_speaker"]:
        return []

    tts = load_tts_model(model_key)
    # `or []` also covers an attribute that exists but is None.
    available_speakers = set(getattr(tts, "speakers", None) or [])

    samples = _read_speaker_samples(metadata_path, available_speakers)

    SPEAKER_DROPDOWN_MAP.clear()
    if samples is None:
        # No metadata file: still let the user pick among the model's speakers.
        for speaker_id in sorted(available_speakers, key=str):
            SPEAKER_DROPDOWN_MAP[str(speaker_id)] = speaker_id
    else:
        for speaker_id, audio_ids in samples.items():
            label = f"{speaker_id} ({len(audio_ids)} samples)"
            SPEAKER_DROPDOWN_MAP[label] = speaker_id
    return list(SPEAKER_DROPDOWN_MAP)


def _resolve_upload_path(doc_file):
    """Return the filesystem path of an uploaded file.

    Accepts a path (``str`` or ``os.PathLike``) or a file-like object that
    exposes the path as ``.name``.
    """
    if isinstance(doc_file, (str, os.PathLike)):
        return os.fspath(doc_file)
    return doc_file.name


def docx_to_wav(doc_file, selected_voice, speaker_label=None):
    """Synthesise the text of a .docx file into a temporary WAV file.

    Args:
        doc_file: The uploaded document. The UI below declares
            ``gr.File(type="filepath")``, for which Gradio is expected to pass
            a plain path string; file-like objects with a ``.name`` attribute
            are accepted as well.
        selected_voice: A key of ``VOICE_MODELS``.
        speaker_label: A label previously returned by ``extract_speakers``;
            only used when the selected model is multi-speaker.

    Returns:
        Path of the generated WAV file. The file is created with
        ``delete=False``, so removing it is left to the caller.

    Raises:
        ValueError: If no file was uploaded, the document contains no text,
            or *speaker_label* is not present in ``SPEAKER_DROPDOWN_MAP``.
    """
    if not doc_file:
        raise ValueError("No .docx file was uploaded.")

    info = VOICE_MODELS[selected_voice]

    # Read the document before loading the model so bad input fails fast.
    document = Document(_resolve_upload_path(doc_file))
    full_text = "\n".join(
        para.text for para in document.paragraphs if para.text.strip()
    )
    if not full_text:
        raise ValueError("The document contains no text to convert.")

    kwargs = {}
    if info["multi_speaker"] and speaker_label:
        speaker_id = SPEAKER_DROPDOWN_MAP.get(speaker_label)
        if not speaker_id:
            raise ValueError("Speaker ID not found.")
        kwargs["speaker"] = speaker_id

    tts = load_tts_model(selected_voice)

    # Only the unique name is needed here; the TTS call writes the content.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
        wav_path = tmp_wav.name

    try:
        tts.tts_to_file(text=full_text, file_path=wav_path, **kwargs)
    except Exception:
        # Do not leave an orphaned temp file behind when synthesis fails.
        with contextlib.suppress(OSError):
            os.remove(wav_path)
        raise
    return wav_path


def show_load_button(voice_selection):
    """Show the "Load Speakers" button only for multi-speaker voices."""
    is_multi = VOICE_MODELS[voice_selection]["multi_speaker"]
    return gr.update(visible=is_multi)


def load_speakers_ui(voice_selection):
    """Fill the speaker dropdown, make it visible and preselect the first entry."""
    speakers = extract_speakers(voice_selection)
    return gr.update(
        choices=speakers,
        visible=True,
        value=speakers[0] if speakers else None,
    )


with gr.Blocks() as interface:
    gr.Markdown("# 🗣️ DOCX to Realistic Voiceover")

    with gr.Row():
        docx_input = gr.File(label="Upload .docx File", type="filepath")
        voice_dropdown = gr.Dropdown(
            choices=list(VOICE_MODELS.keys()),
            value="Jenny (Expressive Female)",
            label="Voice",
        )

    load_button = gr.Button("🔄 Load Speakers", visible=False)
    speaker_dropdown = gr.Dropdown(label="Speaker", visible=False)

    generate_button = gr.Button("🎙️ Generate Speech")
    audio_output = gr.Audio(label="🔊 Output WAV", type="filepath")

    # Event bindings
    voice_dropdown.change(
        fn=show_load_button, inputs=voice_dropdown, outputs=load_button
    )
    load_button.click(
        fn=load_speakers_ui, inputs=voice_dropdown, outputs=speaker_dropdown
    )
    generate_button.click(
        fn=docx_to_wav,
        inputs=[docx_input, voice_dropdown, speaker_dropdown],
        outputs=audio_output,
    )

if __name__ == "__main__":
    interface.launch()