# Set writable cache locations for Hugging Face Spaces BEFORE importing the
# libraries that read them (transformers and friends consult these at import time).
import os

os.environ["TORCH_HOME"] = "/tmp/torch"
os.environ["HF_HOME"] = "/tmp/huggingface"
os.environ["HF_HUB_CACHE"] = "/tmp/huggingface"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
os.environ["MPLCONFIGDIR"] = "/tmp"
os.environ["XDG_CACHE_HOME"] = "/tmp"
os.environ["XDG_CONFIG_HOME"] = "/tmp"
os.environ["NUMBA_DISABLE_CACHE"] = "1"
os.makedirs("/tmp/torch", exist_ok=True)
os.makedirs("/tmp/huggingface", exist_ok=True)
os.makedirs("/tmp/flagged", exist_ok=True)

import time
import uuid

import nltk
import torch
import gradio as gr

from MeloTTS.melo.api import TTS
from openvoice.api import ToneColorConverter

# Download only the NLTK data MeloTTS needs; nltk.download('all') pulls
# gigabytes of corpora and slows startup dramatically.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# Output folder (Spaces only allows writes under /tmp)
output_dir = "/tmp/outputs"
os.makedirs(output_dir, exist_ok=True)

# Device setting (must be known before the models are created)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize the tone converter and load its weights. This assumes the standard
# OpenVoice checkpoint layout: config.json + checkpoint.pth in the same folder.
converter_dir = "checkpoints/converter"
tone_color_converter = ToneColorConverter(f"{converter_dir}/config.json", device=device)
tone_color_converter.load_ckpt(f"{converter_dir}/checkpoint.pth")

# Load the English MeloTTS model once at startup instead of on every request,
# and derive the speaker dropdown choices from its speaker table.
tts_model = TTS(language="EN", device=device)
base_speaker_choices = list(tts_model.hps.data.spk2id.keys())

def clone_and_speak(text, selected_speaker_key):
    if not text or not selected_speaker_key:
        return "Please enter text and select a speaker."

    base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
    tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
    final_output_path = f"{output_dir}/{base_name}_converted.wav"

    # Reuse the module-level English MeloTTS model
    speaker_ids = tts_model.hps.data.spk2id

    # Map speaker_key to speaker_id (model-specific)
    if selected_speaker_key not in speaker_ids:
        return f"Speaker '{selected_speaker_key}' not found in model."

    speaker_id = speaker_ids[selected_speaker_key]

    # Generate the base TTS voice with MeloTTS
    speed = 1.0
    tts_model.tts_to_file(text, speaker_id, tmp_melo_path, speed=speed)

    # Load the pre-saved speaker embedding (OpenVoice v2 ships these as
    # lowercase, dash-separated .pth files, e.g. EN_INDIA -> en-india.pth)
    normalized_key = selected_speaker_key.lower().replace("_", "-")
    se_path = f"checkpoints_v2/base_speakers/ses/{normalized_key}.pth"

    if not os.path.isfile(se_path):
        return f"SE file not found for speaker '{normalized_key}'."

    ref_se = torch.load(se_path, map_location=device)
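    # The same embedding serves as both src_se and tgt_se in the conversion
    # below, so this pass re-applies the base speaker's own tone color.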

    # Disable MPS if present but device is CPU
    if torch.backends.mps.is_available() and device == 'cpu':
        torch.backends.mps.is_available = lambda: False

    # Run the tone conversion
    tone_color_converter.convert(
        audio_src_path=tmp_melo_path,
        src_se=ref_se,
        tgt_se=ref_se,
        output_path=final_output_path,
        message="@HuggingFace",
    )

    return final_output_path

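# Minimal local sanity check (hypothetical usage; assumes the checkpoints above
# are in place and that "EN-US" is one of the MeloTTS speaker keys):
#
#   path = clone_and_speak("Hello from OpenVoice!", "EN-US")
#   print(path)  # e.g. /tmp/outputs/output_1712345678_ab12cd_converted.wav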

# Gradio interface
iface = gr.Interface(
    fn=clone_and_speak,
    inputs=[
        gr.Textbox(label="Input Text", placeholder="Enter text to synthesize..."),
        gr.Dropdown(choices=base_speaker_choices, label="Select Base Speaker"),
    ],
    outputs=gr.Audio(type="filepath", label="Cloned Voice Output"),
    flagging_dir="/tmp/flagged",  # Spaces only allows writes under /tmp
    title="Voice Cloning with OpenVoice Base Speakers",
    description="Choose a base speaker from OpenVoice and enter text to generate voice.",
)

iface.launch()
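
# Note: on Hugging Face Spaces the default launch() settings are fine; when
# running locally you can pass e.g. iface.launch(server_name="0.0.0.0") to
# expose the app on your network (optional tweak, not required on Spaces).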

