SohomToom's picture
Update app.py
68f40ec verified
raw
history blame
2.53 kB
import os
import tempfile
import zipfile
from docx import Document
from TTS.api import TTS
from pydub import AudioSegment
import gradio as gr
# Voice labels offered in the UI, each mapped to the TTS model identifier
# that is handed to ``TTS(model_name=...)`` when that voice is selected.
VOICE_MODELS = {
    "Jenny (Expressive Female)": "tts_models/en/jenny/jenny",
    "LJSpeech (Standard Female)": "tts_models/en/ljspeech/vits",
    "VCTK (Multiple Speakers)": "tts_models/en/vctk/vits",
}
# Speaker labels to offer for a given voice selection.
def update_speaker_choices(selected_voice):
    """Return the list of speaker labels for ``selected_voice``.

    The "VCTK (Multiple Speakers)" voice yields three placeholder labels;
    any other value yields the single label "Default Speaker".
    """
    # Placeholder labels -- replace with actual speaker names or indices.
    vctk_labels = ["Speaker 1", "Speaker 2", "Speaker 3"]
    is_vctk = selected_voice == "VCTK (Multiple Speakers)"
    return vctk_labels if is_vctk else ["Default Speaker"]
def _speaker_index(speaker_name):
    """Translate a UI label such as "Speaker 2" into a zero-based index.

    Labels that are empty, ``None``, do not start with "Speaker", or do not
    end in an integer all map to index 0; negative results are clamped to 0.
    """
    if not speaker_name or not speaker_name.startswith("Speaker"):
        return 0
    try:
        return max(int(speaker_name.split()[-1]) - 1, 0)
    except ValueError:
        return 0


def docx_to_wav_zip(doc_file, selected_voice, speaker_name):
    """Read a .docx file aloud with the chosen voice and return a zip path.

    Args:
        doc_file: The uploaded document, either an object exposing the file
            path as ``.name`` or a plain filesystem path string.
        selected_voice: A key of ``VOICE_MODELS`` selecting the TTS model.
        speaker_name: A label such as "Speaker 2". It is only used when the
            loaded model reports a speaker list, in which case it selects
            an entry of that list by position.

    Returns:
        Path of a zip archive containing the generated WAV file.

    Raises:
        gr.Error: If no file was provided or the document has no text.
        KeyError: If ``selected_voice`` is not a key of ``VOICE_MODELS``.
    """
    if doc_file is None:
        raise gr.Error("Please upload a .docx file.")
    # Depending on the Gradio version/configuration the upload arrives as a
    # tempfile-like wrapper (with .name) or as a path string; accept both.
    doc_path = getattr(doc_file, "name", doc_file)

    # Extract the non-blank paragraphs before paying for the model load.
    document = Document(doc_path)
    full_text = "\n".join(
        para.text for para in document.paragraphs if para.text.strip()
    )
    if not full_text:
        raise gr.Error("The uploaded document contains no text to read aloud.")

    tts = TTS(model_name=VOICE_MODELS[selected_voice], progress_bar=False, gpu=False)

    # Multi-speaker models must be given a speaker *name* through the
    # ``speaker`` argument; single-speaker models must not receive one.
    synth_kwargs = {}
    available_speakers = getattr(tts, "speakers", None) or []
    if available_speakers:
        position = min(_speaker_index(speaker_name), len(available_speakers) - 1)
        synth_kwargs["speaker"] = available_speakers[position]

    # Reserve a unique WAV path; the zip sits next to it with the same stem.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
        wav_path = tmp_wav.name
    zip_path = os.path.splitext(wav_path)[0] + ".zip"

    try:
        tts.tts_to_file(text=full_text, file_path=wav_path, **synth_kwargs)
        with zipfile.ZipFile(zip_path, "w") as zipf:
            zipf.write(wav_path, os.path.basename(wav_path))
    finally:
        # The WAV is only an intermediate artifact; the zip is what callers
        # receive, so do not leave the WAV behind in the temp directory.
        if os.path.exists(wav_path):
            os.remove(wav_path)

    return zip_path
# Gradio interface. Components are created in the same order in which they
# are handed to docx_to_wav_zip: document, voice, speaker.
_doc_input = gr.File(label="Upload .docx File")
_voice_input = gr.Dropdown(
    choices=list(VOICE_MODELS),
    label="Choose Voice",
    value="Jenny (Expressive Female)",
)
# Example: the speaker list is populated with the VCTK placeholder labels.
_speaker_input = gr.Dropdown(
    choices=update_speaker_choices("VCTK (Multiple Speakers)"),
    label="Choose Speaker",
    value="Speaker 1",
)
_zip_output = gr.File(label="Download Zip File")

interface = gr.Interface(
    fn=docx_to_wav_zip,
    inputs=[_doc_input, _voice_input, _speaker_input],
    outputs=_zip_output,
    title="Realistic Voiceover from DOCX (Multiple Voices)",
    description="Upload a .docx file, choose a realistic voice, and pick a speaker to generate a voiceover in WAV format.",
)

# Start the web UI only when the file is executed as a script.
if __name__ == "__main__":
    interface.launch()