File size: 5,001 Bytes
ab0bdb4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0c20337
ed1a5ad
ab0bdb4
49c7767
5165e58
ab0bdb4
 
0c5c249
ab0bdb4
 
 
 
 
 
 
 
 
 
 
0c5c249
 
 
 
 
ed1a5ad
 
 
 
 
8e6abd8
ed1a5ad
 
 
 
ab0bdb4
0c5c249
ab0bdb4
ed1a5ad
0c20337
ab0bdb4
a0ea8bb
ed1a5ad
 
 
 
ab0bdb4
0c5c249
5165e58
ed1a5ad
0c20337
ab0bdb4
 
0c5c249
0c20337
ab0bdb4
 
0c20337
ab0bdb4
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# import os
# import uuid
# import time
# import torch
# import gradio as gr
# os.environ["NUMBA_DISABLE_CACHE"] = "1"
# import mecab_patch
# import english_patch
# from melo.api import TTS
# from openvoice.api import ToneColorConverter

# # Set temporary cache locations for Hugging Face Spaces
# os.environ["TORCH_HOME"] = "/tmp/torch"
# os.environ["HF_HOME"] = "/tmp/huggingface"
# os.environ["HF_HUB_CACHE"] = "/tmp/huggingface"
# os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
# os.environ["MPLCONFIGDIR"] = "/tmp"
# os.environ["XDG_CACHE_HOME"] = "/tmp"
# os.environ["XDG_CONFIG_HOME"] = "/tmp"
# os.environ["NUMBA_DISABLE_CACHE"] = "1"
# os.makedirs("/tmp/torch", exist_ok=True)
# os.makedirs("/tmp/huggingface", exist_ok=True)
# os.makedirs("/tmp/flagged", exist_ok=True)

# # Output folder
# output_dir = "/tmp/outputs"
# os.makedirs(output_dir, exist_ok=True)

# # Initialize tone converter
# ckpt_converter = "checkpoints/converter/config.json"
# tone_color_converter = ToneColorConverter(ckpt_converter)

# # Device setting
# device = "cuda" if torch.cuda.is_available() else "cpu"

# def clone_and_speak(text, speaker_wav):
#     if not speaker_wav:
#         return "Please upload a reference .wav file."

#     # import melo.text.english as english
#     # original_g2p = english.g2p

#     # def patched_g2p(text):
#     #     phones, tones, word2ph = original_g2p(text)
#     #     # Fix: wrap ints in list to avoid TypeError
#     #     word2ph_fixed = []
#     #     for item in word2ph:
#     #         if isinstance(item, int):
#     #             word2ph_fixed.append([item])
#     #         else:
#     #             word2ph_fixed.append(item)
#     #     return phones, tones, word2ph_fixed

#     # english.g2p = patched_g2p

#     base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
#     tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
#     final_output_path = f"{output_dir}/{base_name}_converted.wav"

#     # Use English speaker model
#     model = TTS(language="EN", device=device)
#     speaker_ids = model.hps.data.spk2id
#     default_speaker_id = next(iter(speaker_ids.values()))

#     # Generate base TTS voice
#     speed = 1.0
#     model.tts_to_file(text, default_speaker_id, tmp_melo_path,speed=speed)

#     # Use speaker_wav as reference to extract style embedding
#     from openvoice import se_extractor
#     ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=False)

#     # Run the tone conversion
#     tone_color_converter.convert(
#         audio_src_path=tmp_melo_path,
#         src_se=ref_se,
#         tgt_se=ref_se,
#         output_path=final_output_path,
#         message="@HuggingFace",
#     )

#     return final_output_path

# # Gradio interface
# gr.Interface(
#     fn=clone_and_speak,
#     inputs=[
#         gr.Textbox(label="Enter Text"),
#         gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)")
#     ],
#     outputs=gr.Audio(label="Synthesized Output"),
#     flagging_dir="/tmp/flagged",
#     title="Text to Voice using Melo TTS + OpenVoice",
#     description="Use Melo TTS for base synthesis and OpenVoice to apply a reference speaker's tone.",
# ).launch()


import os
import time
import uuid
import torch
import gradio as gr

# NOTE(review): the usage below (TTS(language="EN", ...), model.hps.data.spk2id)
# is the MeloTTS API, not Coqui's TTS.api.TTS — import from melo.api, exactly
# as the earlier version of this script did.
from melo.api import TTS
from openvoice import se_extractor
# ToneColorConverter lives in openvoice.api (importing it from the openvoice
# package root raises ImportError).
from openvoice.api import ToneColorConverter

# Import your local english.py logic
from meloTTS import english

# Device selection: torch.cuda.is_available() is the reliable check; shelling
# out to `nvidia-smi` via os.system() prints to stdout and fails on machines
# where the binary is missing even if CUDA is usable.
device = "cuda" if torch.cuda.is_available() else "cpu"

# All generated audio (intermediate Melo output and converted result) goes here.
output_dir = "outputs"
os.makedirs(output_dir, exist_ok=True)

# Load the OpenVoice tone-color converter. The constructor takes the converter
# *config file*, not the checkpoint directory, and weights are loaded
# separately via load_ckpt (ToneColorConverter has no load_model method).
# TODO confirm these paths match your checkpoint layout
# (checkpoints/converter/{config.json,checkpoint.pth}).
ckpt_converter_dir = os.path.join(os.getcwd(), "checkpoints", "converter")
tone_color_converter = ToneColorConverter(
    os.path.join(ckpt_converter_dir, "config.json"), device=device
)
tone_color_converter.load_ckpt(os.path.join(ckpt_converter_dir, "checkpoint.pth"))

def clone_and_speak(text, speaker_wav):
    """Synthesize ``text`` with MeloTTS, then apply the tone color of the
    uploaded reference voice via OpenVoice.

    Parameters
    ----------
    text : str
        Text to synthesize.
    speaker_wav : str or None
        Filesystem path of the reference ``.wav`` supplied by Gradio
        (``gr.Audio(type="filepath")``).

    Returns
    -------
    str
        Path of the converted ``.wav`` on success, or a human-readable
        error message when a required input is missing (Gradio surfaces
        the string to the user).
    """
    if not speaker_wav:
        return "Please upload a reference .wav file."
    if not text or not text.strip():
        return "Please enter some text to synthesize."

    # Unique per-request basename so concurrent users never clobber each other.
    base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
    tmp_melo_path = os.path.join(output_dir, f"{base_name}_tmp.wav")
    final_output_path = os.path.join(output_dir, f"{base_name}_converted.wav")

    # Base synthesis with the English model's first available speaker.
    model = TTS(language="EN", device=device)
    speaker_ids = model.hps.data.spk2id
    default_speaker_id = next(iter(speaker_ids.values()))

    # MeloTTS's tts_to_file signature is (text, speaker_id, output_path, ...);
    # the previous keyword `file_path=` does not exist there, so pass the
    # output path positionally as the earlier version of this script did.
    model.tts_to_file(text, default_speaker_id, tmp_melo_path, speed=1.0)

    # Style embedding extracted from the uploaded reference voice.
    ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=False)

    # Re-tint the Melo output with the reference tone color.
    # NOTE(review): src_se and tgt_se are both the reference embedding here;
    # conventionally src_se is the *base* speaker's embedding — confirm this
    # is intentional.
    tone_color_converter.convert(
        audio_src_path=tmp_melo_path,
        src_se=ref_se,
        tgt_se=ref_se,
        output_path=final_output_path,
        message="@HuggingFace",
    )

    return final_output_path

# Gradio Interface
# Gradio UI: one text box + one reference-voice upload in, cloned audio out.
text_input = gr.Textbox(label="Text to Synthesize")
voice_input = gr.Audio(label="Reference Voice (WAV)", type="filepath")
voice_output = gr.Audio(label="Cloned Voice Output")

demo = gr.Interface(
    fn=clone_and_speak,
    inputs=[text_input, voice_input],
    outputs=voice_output,
    title="Voice Cloner with MeloTTS + OpenVoice",
)

if __name__ == "__main__":
    demo.launch()