"""Gradio app that converts a .docx file into per-paragraph WAV voiceovers.

Each non-empty paragraph of the uploaded document is synthesized with the
selected Coqui TTS model and the resulting WAV files are returned as a single
ZIP archive.
"""

import os
import shutil
import tempfile
import zipfile

# Set before the TTS stack is imported so the setting is visible at import time.
# NOTE(review): presumably meant to stop numba from writing its on-disk cache
# in read-only/sandboxed deployments -- confirm numba honours this variable.
os.environ["NUMBA_DISABLE_CACHE"] = "1"

import gradio as gr  # noqa: E402
from docx import Document  # noqa: E402
from TTS.api import TTS  # noqa: E402

# Available TTS models with voice descriptions
VOICE_MODELS = {
    "Jenny (Expressive Female)": "tts_models/en/jenny/jenny",
    "LJSpeech (Standard Female)": "tts_models/en/ljspeech/vits",
    "VCTK (Multiple Speakers)": "tts_models/en/vctk/vits",
    # NOTE(review): verify this id against `TTS().list_models()`; the
    # published Blizzard capacitron ids may be named differently.
    "Blizzard (Deep Male Voice)": "tts_models/en/blizzard2013/capacitron-t2-cv-v1",
}


def _uploaded_path(doc_file):
    """Return the filesystem path of a Gradio upload.

    Accepts either a file-like object exposing ``.name`` or a plain path
    string, so the app works regardless of which form Gradio hands over.
    """
    return getattr(doc_file, "name", doc_file)


def _speaker_kwargs(tts):
    """Return the extra ``tts_to_file`` kwargs required by multi-speaker models.

    Multi-speaker models need an explicit ``speaker``; the first speaker the
    model reports is used. Single-speaker models get no extra arguments.
    """
    if getattr(tts, "is_multi_speaker", False):
        speakers = getattr(tts, "speakers", None)
        if speakers:
            return {"speaker": speakers[0]}
    return {}


def docx_to_wav_zip(doc_file, selected_voice):
    """Synthesize every non-empty paragraph of a .docx file and zip the WAVs.

    Args:
        doc_file: The upload from ``gr.File`` (file-like with ``.name`` or a
            path string). ``None`` when nothing was uploaded.
        selected_voice: A key of ``VOICE_MODELS`` selecting the TTS model.

    Returns:
        Path to ``voiceover_chunks.zip`` containing ``chunk_1.wav``,
        ``chunk_2.wav``, ... in paragraph order.

    Raises:
        gr.Error: If no file was uploaded or the document has no text.
        KeyError: If ``selected_voice`` is not a key of ``VOICE_MODELS``.
    """
    if doc_file is None:
        raise gr.Error("Please upload a .docx file first.")

    # Parse and validate the document before the (slow) model load so that
    # bad input fails fast.
    document = Document(_uploaded_path(doc_file))
    stripped = (para.text.strip() for para in document.paragraphs)
    paragraphs = [text for text in stripped if text]
    if not paragraphs:
        raise gr.Error("The uploaded document contains no text to convert.")

    tts = TTS(model_name=VOICE_MODELS[selected_voice], progress_bar=False, gpu=False)
    speaker_kwargs = _speaker_kwargs(tts)

    # The ZIP must outlive this call so Gradio can serve it; the individual
    # WAV files are only needed until they have been archived.
    output_dir = tempfile.mkdtemp()
    zip_path = os.path.join(output_dir, "voiceover_chunks.zip")
    try:
        with tempfile.TemporaryDirectory() as wav_dir, zipfile.ZipFile(zip_path, "w") as zipf:
            for index, chunk in enumerate(paragraphs, start=1):
                wav_path = os.path.join(wav_dir, f"chunk_{index}.wav")
                tts.tts_to_file(text=chunk, file_path=wav_path, **speaker_kwargs)
                zipf.write(wav_path, arcname=os.path.basename(wav_path))
    except Exception:
        # Do not leave a partial archive behind when synthesis fails.
        shutil.rmtree(output_dir, ignore_errors=True)
        raise

    return zip_path


# Gradio interface
interface = gr.Interface(
    fn=docx_to_wav_zip,
    inputs=[
        gr.File(label="Upload .docx File"),
        gr.Dropdown(
            choices=list(VOICE_MODELS.keys()),
            label="Choose Voice",
            value="Jenny (Expressive Female)",
        ),
    ],
    outputs=gr.File(label="Download ZIP of WAV Files"),
    title="Realistic Voiceover from DOCX (Multiple Voices)",
    description="Upload a .docx file and choose a realistic voice to generate WAV voiceover files chunked by paragraph, downloadable as a ZIP archive.",
)

if __name__ == "__main__":
    interface.launch()