File size: 3,158 Bytes
a3e2313
bebc496
 
 
68f40ec
 
bebc496
b9bf9b2
bebc496
152fe30
bebc496
 
 
 
 
 
 
 
 
 
 
 
152fe30
b9bf9b2
bebc496
 
 
 
 
 
 
 
 
 
68f40ec
ec3daa0
 
 
 
 
 
 
bebc496
 
ec3daa0
 
 
 
 
 
bebc496
e929cde
ec3daa0
bebc496
 
 
 
 
a3e2313
e929cde
a3e2313
bebc496
68f40ec
 
bebc496
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74f2f02
bebc496
 
 
 
 
 
 
 
 
 
 
 
 
b9bf9b2
a3e2313
68f40ec
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import os
os.environ["NUMBA_DISABLE_CACHE"] = "1"

import gradio as gr
from docx import Document
from TTS.api import TTS
import tempfile

# Voice catalogue shown in the UI. Each label maps to the TTS model identifier
# to load and a flag telling whether a speaker must be passed at synthesis time.
VOICE_MODELS = {
    label: {"model_name": model_name, "multi_speaker": multi_speaker}
    for label, model_name, multi_speaker in (
        ("Jenny (Expressive Female)", "tts_models/en/jenny/jenny", False),
        ("LJSpeech (Standard Female)", "tts_models/en/ljspeech/vits", False),
        ("VCTK (Multiple Speakers)", "tts_models/en/vctk/vits", True),
    )
}

# Already-constructed TTS objects keyed by voice label, so each model is only
# built once per process.
MODEL_CACHE = dict()

def load_tts_model(model_key):
    """Return the TTS object for ``model_key``, constructing it on first use.

    Args:
        model_key: A key of ``VOICE_MODELS`` (the label shown in the UI).

    Returns:
        The ``TTS`` instance stored in ``MODEL_CACHE`` for this key.

    Raises:
        KeyError: If ``model_key`` is not cached and not in ``VOICE_MODELS``.
    """
    if model_key not in MODEL_CACHE:
        # Cache miss: build the model once (CPU only, no progress bar) and
        # remember it for later calls.
        model_name = VOICE_MODELS[model_key]["model_name"]
        MODEL_CACHE[model_key] = TTS(model_name=model_name, progress_bar=False, gpu=False)
    return MODEL_CACHE[model_key]

# def extract_speakers(model_key):
#     info = VOICE_MODELS[model_key]
#     if info["multi_speaker"]:
#         tts = load_tts_model(model_key)
#         return list(tts.speakers)
#     return []

def extract_speakers(model_key):
    """Return the speaker ids that can be offered for a voice.

    Args:
        model_key: A key of ``VOICE_MODELS`` (the label shown in the UI).

    Returns:
        A list of speaker id strings; empty when the voice is not flagged
        ``multi_speaker``.

    Raises:
        KeyError: If ``model_key`` is not in ``VOICE_MODELS``.
    """
    info = VOICE_MODELS[model_key]

    # Single-speaker voices have nothing to choose from.
    if not info["multi_speaker"]:
        return []

    if info["model_name"] == "tts_models/en/vctk/vits":
        # Common VCTK speakers, returned without loading the model.
        return ["p225", "p226", "p227", "p228", "p229", "p230", "p231", "p232", "p233", "p234"]

    # Any other multi-speaker model: ask the loaded model. A missing or None
    # ``speakers`` attribute is treated as "no speakers".
    tts = load_tts_model(model_key)
    return list(getattr(tts, "speakers", None) or [])


def docx_to_wav(doc_file, selected_voice, selected_speaker=None):
    """Synthesize the text of an uploaded .docx file into a WAV file.

    Args:
        doc_file: The uploaded document, either a filesystem path (``str`` or
            ``os.PathLike``) or an object exposing the path as ``.name``.
        selected_voice: A key of ``VOICE_MODELS``.
        selected_speaker: Speaker id; required when the selected voice is
            flagged ``multi_speaker``, ignored otherwise.

    Returns:
        Path of the generated WAV file. The file is created with
        ``delete=False`` and is not removed by this function on success.

    Raises:
        gr.Error: If no file was uploaded, the document has no non-blank
            paragraphs, or a multi-speaker voice has no speaker selected.
        KeyError: If ``selected_voice`` is not in ``VOICE_MODELS``.
    """
    info = VOICE_MODELS[selected_voice]

    if doc_file is None:
        raise gr.Error("Please upload a .docx file first.")

    # The upload may arrive as a plain path or as a file wrapper carrying the
    # path in ``.name``; accept both.
    if isinstance(doc_file, (str, os.PathLike)):
        doc_path = os.fspath(doc_file)
    else:
        doc_path = doc_file.name

    # Extract text from docx, skipping blank paragraphs
    document = Document(doc_path)
    full_text = "\n".join(para.text for para in document.paragraphs if para.text.strip())
    if not full_text:
        raise gr.Error("The document does not contain any text to read.")

    kwargs = {}
    if info["multi_speaker"]:
        if not selected_speaker:
            raise gr.Error("Please choose a speaker for the selected voice.")
        kwargs["speaker"] = selected_speaker

    # Load the model only after the input has been validated, so bad requests
    # do not pay for model construction.
    tts = load_tts_model(selected_voice)

    # Reserve a temp path; the handle is closed before the model writes to it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
        wav_path = tmp_wav.name

    try:
        tts.tts_to_file(text=full_text, file_path=wav_path, **kwargs)
    except Exception:
        # Do not leave an orphaned temp file behind when synthesis fails.
        if os.path.exists(wav_path):
            os.remove(wav_path)
        raise

    return wav_path

def update_speaker_dropdown(voice_selection):
    """Build the update for the speaker dropdown after the voice changes.

    Args:
        voice_selection: The newly selected key of ``VOICE_MODELS``.

    Returns:
        A Gradio update that sets the dropdown's choices, shows it only when
        there is at least one speaker, and preselects the first speaker.
    """
    speakers = extract_speakers(voice_selection)
    # ``gr.update`` is the generic update helper; the per-component
    # ``gr.Dropdown.update`` classmethod is not available in newer Gradio.
    return gr.update(
        choices=speakers,
        visible=bool(speakers),
        value=speakers[0] if speakers else None,
    )

# Build the Gradio UI. Components are created inside the Blocks context in the
# order written here, so statement order is significant.
with gr.Blocks() as interface:
    gr.Markdown("# Realistic Voiceover from DOCX\nUpload a .docx and choose a voice to generate a WAV audio.")

    # Input row: document upload, voice picker, and a speaker picker that
    # starts hidden with no choices.
    with gr.Row():
        # NOTE(review): type="filepath" presumably hands the callback a path
        # string rather than a file object - confirm against the installed
        # Gradio version.
        docx_input = gr.File(label="Upload .docx File", type="filepath")
        voice_dropdown = gr.Dropdown(choices=list(VOICE_MODELS.keys()), value="Jenny (Expressive Female)", label="Voice")
        speaker_dropdown = gr.Dropdown(choices=[], label="Speaker", visible=False)

    generate_button = gr.Button("Generate Speech")
    audio_output = gr.Audio(label="Download WAV", type="filepath")

    # Refresh the speaker dropdown whenever a different voice is selected.
    voice_dropdown.change(fn=update_speaker_dropdown, inputs=voice_dropdown, outputs=speaker_dropdown)

    # Run synthesis on click; the value returned by docx_to_wav is sent to the
    # audio component.
    generate_button.click(
        fn=docx_to_wav,
        inputs=[docx_input, voice_dropdown, speaker_dropdown],
        outputs=audio_output
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()