File size: 2,526 Bytes
a3e2313
b9bf9b2
152fe30
68f40ec
 
 
 
b9bf9b2
68f40ec
152fe30
68f40ec
 
 
152fe30
b9bf9b2
68f40ec
 
 
 
 
 
 
 
 
e929cde
 
a3e2313
e929cde
a3e2313
68f40ec
 
 
 
 
 
 
 
 
 
b9bf9b2
68f40ec
 
 
152fe30
68f40ec
 
 
152fe30
68f40ec
b9bf9b2
a3e2313
b9bf9b2
152fe30
 
 
68f40ec
 
152fe30
68f40ec
 
 
b9bf9b2
 
a3e2313
68f40ec
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import os
import tempfile
import zipfile
from docx import Document
from TTS.api import TTS
from pydub import AudioSegment
import gradio as gr

# Voice labels offered in the UI, each mapped to the TTS model name it loads
VOICE_MODELS = dict(
    (
        ("Jenny (Expressive Female)", "tts_models/en/jenny/jenny"),
        ("LJSpeech (Standard Female)", "tts_models/en/ljspeech/vits"),
        ("VCTK (Multiple Speakers)", "tts_models/en/vctk/vits"),
    )
)

# Return the speaker options that go with the selected voice model
def update_speaker_choices(selected_voice):
    """List the speaker labels to offer for ``selected_voice``.

    The "VCTK (Multiple Speakers)" voice yields three placeholder labels;
    any other value yields a single default entry.
    """
    if selected_voice != "VCTK (Multiple Speakers)":
        return ["Default Speaker"]
    # Placeholder labels: modify with actual speaker names or indices
    multi_speaker_labels = ["Speaker 1", "Speaker 2", "Speaker 3"]
    return multi_speaker_labels

def _speaker_index(speaker_name):
    """Translate a UI label such as ``"Speaker 2"`` into a zero-based index.

    Returns 0 when the label is missing, does not start with ``"Speaker"``,
    does not end in an integer, or carries a number below 1.
    """
    if not speaker_name or not speaker_name.startswith("Speaker"):
        return 0
    try:
        return max(int(speaker_name.split()[-1]) - 1, 0)
    except ValueError:
        return 0


def docx_to_wav_zip(doc_file, selected_voice, speaker_name):
    """Read a .docx file aloud with the chosen voice and return a zip of the WAV.

    Args:
        doc_file: The uploaded document, either a filesystem path or an
            object exposing the path as ``.name``.
        selected_voice: A key of ``VOICE_MODELS`` naming the TTS model to use.
        speaker_name: A label such as ``"Speaker 2"``; only used when the
            loaded model is multi-speaker, where it selects an entry of the
            model's speaker list by position.

    Returns:
        Path of a zip archive containing the generated WAV file.

    Raises:
        gr.Error: If no file was uploaded or the document has no text.
        KeyError: If ``selected_voice`` is not a key of ``VOICE_MODELS``.
    """
    if doc_file is None:
        raise gr.Error("Please upload a .docx file.")

    # NOTE: depending on the Gradio version/configuration the upload arrives
    # either as a path string or as an object with a `.name` path; accept both.
    doc_path = getattr(doc_file, "name", doc_file)

    # Extract the text first so an empty document fails before the model loads.
    document = Document(doc_path)
    full_text = "\n".join(
        para.text for para in document.paragraphs if para.text.strip()
    )
    if not full_text:
        raise gr.Error("The uploaded document contains no text to read.")

    # Load the selected TTS model
    tts = TTS(model_name=VOICE_MODELS[selected_voice], progress_bar=False, gpu=False)

    # `tts_to_file` selects a voice through `speaker=<name>`. Multi-speaker
    # models require it and single-speaker models must not be given one, so it
    # is only resolved when the model reports several speakers.
    speaker = None
    if tts.is_multi_speaker:
        available = tts.speakers
        if available:
            position = min(_speaker_index(speaker_name), len(available) - 1)
            speaker = available[position]

    # Reserve a temporary WAV path; the zip sits next to it with the same stem.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
        wav_path = tmp_wav.name
    # splitext swaps only the suffix, even if ".wav" appears elsewhere in the path.
    zip_path = os.path.splitext(wav_path)[0] + ".zip"

    try:
        tts.tts_to_file(text=full_text, speaker=speaker, file_path=wav_path)

        # The synthesized file is already WAV, so it is zipped as-is.
        with zipfile.ZipFile(zip_path, "w") as zipf:
            zipf.write(wav_path, os.path.basename(wav_path))
    finally:
        # The WAV is only an intermediate artifact; do not leave it in the temp dir.
        if os.path.exists(wav_path):
            os.remove(wav_path)

    return zip_path

# Gradio interface
def _build_interface():
    """Assemble the Gradio UI around ``docx_to_wav_zip``.

    Inputs are, in order: the .docx upload, the voice dropdown (one entry per
    ``VOICE_MODELS`` key) and the speaker dropdown. The speaker choices are
    computed once here for the VCTK voice; nothing in this block refreshes
    them when a different voice is picked.
    """
    upload_box = gr.File(label="Upload .docx File")
    voice_picker = gr.Dropdown(
        choices=list(VOICE_MODELS.keys()),
        label="Choose Voice",
        value="Jenny (Expressive Female)",
    )
    speaker_picker = gr.Dropdown(
        choices=update_speaker_choices("VCTK (Multiple Speakers)"),  # Example
        label="Choose Speaker",
        value="Speaker 1",
    )
    download_box = gr.File(label="Download Zip File")

    return gr.Interface(
        fn=docx_to_wav_zip,
        inputs=[upload_box, voice_picker, speaker_picker],
        outputs=download_box,
        title="Realistic Voiceover from DOCX (Multiple Voices)",
        description="Upload a .docx file, choose a realistic voice, and pick a speaker to generate a voiceover in WAV format.",
    )


interface = _build_interface()

# Start the web app only when this file is run as a script
if __name__ == "__main__":
    interface.launch()