"""Gradio app that turns an uploaded .docx document into a zipped WAV voiceover."""

import os
import tempfile
import zipfile

from docx import Document
from TTS.api import TTS
from pydub import AudioSegment
import gradio as gr

# Available TTS models, keyed by the label shown in the "Choose Voice" dropdown.
VOICE_MODELS = {
    "Jenny (Expressive Female)": "tts_models/en/jenny/jenny",
    "LJSpeech (Standard Female)": "tts_models/en/ljspeech/vits",
    "VCTK (Multiple Speakers)": "tts_models/en/vctk/vits",
}


def update_speaker_choices(selected_voice):
    """Return the speaker labels to offer for the given voice label.

    Args:
        selected_voice: A key of ``VOICE_MODELS``.

    Returns:
        A list of placeholder labels ("Speaker 1".."Speaker 3") for the VCTK
        entry, otherwise a single "Default Speaker" label.
    """
    if selected_voice == "VCTK (Multiple Speakers)":
        # Modify with actual speaker names or indices
        return ["Speaker 1", "Speaker 2", "Speaker 3"]
    return ["Default Speaker"]


def _speaker_index(speaker_name):
    """Convert a label such as "Speaker 2" into a zero-based index (0 as fallback)."""
    if not speaker_name or not speaker_name.startswith("Speaker"):
        return 0
    try:
        # Labels are one-based; never return a negative index.
        return max(int(speaker_name.split()[-1]) - 1, 0)
    except ValueError:
        # e.g. "Speaker" or "Speaker abc": no usable number in the label.
        return 0


def _uploaded_path(doc_file):
    """Return the filesystem path of an upload given as a path or a file-like wrapper."""
    # Depending on the gr.File ``type`` / Gradio version, the callback receives
    # either a plain path or an object exposing the path as ``.name``.
    if isinstance(doc_file, (str, os.PathLike)):
        return os.fspath(doc_file)
    return doc_file.name


def docx_to_wav_zip(doc_file, selected_voice, speaker_name):
    """Synthesize the text of a .docx file to speech and return a zip of the WAV.

    Args:
        doc_file: The uploaded document, either a path or an object whose
            ``.name`` attribute is the path.
        selected_voice: A key of ``VOICE_MODELS`` choosing the TTS model.
        speaker_name: A label such as "Speaker 2". Only used when the loaded
            model reports itself as multi-speaker; unparsable labels fall back
            to the first speaker.

    Returns:
        Path of a zip archive containing the generated WAV file.

    Raises:
        gr.Error: If no file was uploaded or the document has no text.
        KeyError: If ``selected_voice`` is not a key of ``VOICE_MODELS``.
    """
    if doc_file is None:
        raise gr.Error("Please upload a .docx file.")

    # Extract the text first so an empty document fails before the (slow)
    # model load.
    document = Document(_uploaded_path(doc_file))
    full_text = "\n".join(
        para.text for para in document.paragraphs if para.text.strip()
    )
    if not full_text:
        raise gr.Error("The uploaded document contains no text to read.")

    # Load the selected TTS model
    tts = TTS(model_name=VOICE_MODELS[selected_voice], progress_bar=False, gpu=False)

    # The speaker is selected by *name* through the ``speaker`` argument, and
    # only multi-speaker models accept one. Map the one-based UI label onto
    # the model's own speaker list, clamped to the speakers that exist.
    synth_kwargs = {}
    if tts.is_multi_speaker:
        speakers = tts.speakers
        index = min(_speaker_index(speaker_name), len(speakers) - 1)
        synth_kwargs["speaker"] = speakers[index]

    # Reserve a temporary WAV path; the zip sits next to it with the same stem.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
        wav_path = tmp_wav.name
    zip_path = os.path.splitext(wav_path)[0] + ".zip"

    try:
        tts.tts_to_file(text=full_text, file_path=wav_path, **synth_kwargs)

        # Round-trip the audio through pydub; the output stays in WAV format.
        sound = AudioSegment.from_wav(wav_path)
        sound.export(wav_path, format="wav")

        with zipfile.ZipFile(zip_path, "w") as zipf:
            zipf.write(wav_path, os.path.basename(wav_path))
    finally:
        # Only the archive is returned; do not leave the intermediate WAV behind.
        if os.path.exists(wav_path):
            os.remove(wav_path)

    return zip_path


# Gradio interface
# NOTE: the speaker dropdown is static (VCTK placeholders); the selection is
# ignored for single-speaker voices.
interface = gr.Interface(
    fn=docx_to_wav_zip,
    inputs=[
        gr.File(label="Upload .docx File"),
        gr.Dropdown(
            choices=list(VOICE_MODELS.keys()),
            label="Choose Voice",
            value="Jenny (Expressive Female)",
        ),
        gr.Dropdown(
            choices=update_speaker_choices("VCTK (Multiple Speakers)"),
            label="Choose Speaker",
            value="Speaker 1",
        ),  # Example
    ],
    outputs=gr.File(label="Download Zip File"),
    title="Realistic Voiceover from DOCX (Multiple Voices)",
    description="Upload a .docx file, choose a realistic voice, and pick a speaker to generate a voiceover in WAV format.",
)

if __name__ == "__main__":
    interface.launch()