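"""Gradio app that converts an uploaded .docx file into a spoken-word WAV using Coqui TTS.

Offers two single-speaker voices (Jenny, LJSpeech VITS) and the multi-speaker VCTK
model; for VCTK, available speakers are cross-referenced against a local metadata.csv
and exposed in a dropdown.
"""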
import os
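# Must be set before anything imports numba (TTS -> librosa -> numba) so the
# setting is picked up at import time.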
os.environ["NUMBA_DISABLE_CACHE"] = "1"

import gradio as gr
from docx import Document
from TTS.api import TTS
import tempfile
import csv
from collections import defaultdict

# Model dictionary
VOICE_MODELS = {
    "Jenny (Expressive Female)": {
        "model_name": "tts_models/en/jenny/jenny",
        "multi_speaker": False
    },
    "LJSpeech (Standard Female)": {
        "model_name": "tts_models/en/ljspeech/vits",
        "multi_speaker": False
    },
    "VCTK (Multiple Speakers)": {
        "model_name": "tts_models/en/vctk/vits",
        "multi_speaker": True
    }
}

# Caches
MODEL_CACHE = {}           # model_key -> loaded TTS instance
SPEAKER_DROPDOWN_MAP = {}  # dropdown label "id (N samples)" -> raw speaker ID

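# Load a TTS model on first use and memoize it so later requests reuse the same instance.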
def load_tts_model(model_key):
    if model_key in MODEL_CACHE:
        return MODEL_CACHE[model_key]
    info = VOICE_MODELS[model_key]
    tts = TTS(model_name=info["model_name"], progress_bar=False, gpu=False)
    MODEL_CACHE[model_key] = tts
    return tts

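# Build dropdown labels for a multi-speaker model: read metadata.csv (audio_id,
# speaker_id per row), keep only speakers the model actually exposes, and label
# each one with how many audio samples it has.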
def extract_speakers(model_key, metadata_path="metadata.csv"):
    global SPEAKER_DROPDOWN_MAP
    info = VOICE_MODELS[model_key]
    if not info["multi_speaker"]:
        return []

    tts = load_tts_model(model_key)
    available_speakers = set(getattr(tts, "speakers", []))

    speaker_audio_map = defaultdict(list)
    with open(metadata_path, newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the header row
        for row in reader:
            if len(row) >= 2:
                audio_id, speaker_id = row[0], row[1]
                if speaker_id in available_speakers:
                    speaker_audio_map[speaker_id].append(audio_id)

    SPEAKER_DROPDOWN_MAP.clear()
    dropdown_choices = []
    for speaker_id, audio_ids in speaker_audio_map.items():
        label = f"{speaker_id} ({len(audio_ids)} samples)"
        SPEAKER_DROPDOWN_MAP[label] = speaker_id
        dropdown_choices.append(label)

    return dropdown_choices

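# Synthesize speech for the uploaded .docx: join its non-empty paragraphs and
# write the result to a temporary WAV file, which Gradio serves as the output.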
def docx_to_wav(doc_file, selected_voice, speaker_label=None):
    info = VOICE_MODELS[selected_voice]
    tts = load_tts_model(selected_voice)

    # Extract text from the uploaded document. gr.File(type="filepath") passes a
    # plain path string; fall back to .name in case an older Gradio file wrapper is passed.
    docx_path = doc_file if isinstance(doc_file, str) else doc_file.name
    document = Document(docx_path)
    full_text = "\n".join(para.text for para in document.paragraphs if para.text.strip())

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
        wav_path = tmp_wav.name

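    # Only multi-speaker models accept a speaker argument; map the dropdown label
    # back to the raw speaker ID stored in SPEAKER_DROPDOWN_MAP.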
    kwargs = {}
    if info["multi_speaker"] and speaker_label:
        speaker_id = SPEAKER_DROPDOWN_MAP.get(speaker_label)
        if not speaker_id:
            raise ValueError(f"Speaker ID not found for selection: {speaker_label!r}")
        kwargs["speaker"] = speaker_id

    tts.tts_to_file(text=full_text, file_path=wav_path, **kwargs)
    return wav_path

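# Reveal the "Load Speakers" button only when the selected voice is multi-speaker.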
def show_load_button(voice_selection):
    is_multi = VOICE_MODELS[voice_selection]["multi_speaker"]
    return gr.update(visible=is_multi)

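# Populate the speaker dropdown from metadata.csv and make it visible.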
def load_speakers_ui(voice_selection):
    speakers = extract_speakers(voice_selection)
    return gr.update(choices=speakers, visible=True, value=speakers[0] if speakers else None)

with gr.Blocks() as interface:
    gr.Markdown("# 🗣️ DOCX to Realistic Voiceover")

    with gr.Row():
        docx_input = gr.File(label="Upload .docx File", type="filepath")
        voice_dropdown = gr.Dropdown(choices=list(VOICE_MODELS.keys()), value="Jenny (Expressive Female)", label="Voice")
        load_button = gr.Button("🔄 Load Speakers", visible=False)
        speaker_dropdown = gr.Dropdown(label="Speaker", visible=False)

    generate_button = gr.Button("🎙️ Generate Speech")
    audio_output = gr.Audio(label="🔊 Output WAV", type="filepath")

    # Event bindings
    voice_dropdown.change(fn=show_load_button, inputs=voice_dropdown, outputs=load_button)
    load_button.click(fn=load_speakers_ui, inputs=voice_dropdown, outputs=speaker_dropdown)

    generate_button.click(
        fn=docx_to_wav,
        inputs=[docx_input, voice_dropdown, speaker_dropdown],
        outputs=audio_output
    )

if __name__ == "__main__":
    interface.launch()