File size: 2,484 Bytes
0c20337
fbd01f5
ed1a5ad
 
49c7767
31fde06
32a4d99
ed1a5ad
81a8f5e
e16fd1e
ed1a5ad
81a8f5e
a647645
e03756e
 
e16fd1e
 
 
 
ed1a5ad
e16fd1e
43e8b5c
5165e58
ed1a5ad
8b02d24
0c5c249
5165e58
ed1a5ad
 
0c5c249
 
ed1a5ad
 
0c5c249
 
 
 
 
ed1a5ad
 
 
 
 
8e6abd8
ed1a5ad
 
 
 
0a48281
0c5c249
ed1a5ad
 
 
0c20337
ed1a5ad
a0ea8bb
ed1a5ad
 
 
 
 
0c5c249
5165e58
ed1a5ad
0c20337
ed1a5ad
e17e821
0c5c249
0c20337
0c5c249
 
0c20337
0c5c249
ed1a5ad
 
 
5ad697d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import os
import uuid
import time
import torch
import gradio as gr
os.environ["NUMBA_DISABLE_CACHE"] = "1"
import mecab_patch
from melo.api import TTS
from openvoice.api import ToneColorConverter

# Set temporary cache locations for Hugging Face Spaces: /tmp is the only
# writable path in a Space, so every framework cache is redirected there.
# NOTE(review): torch / melo / openvoice are imported ABOVE, before these
# variables are assigned — any library that reads them at import time will
# not see them. Consider moving these assignments ahead of the imports.
# NOTE(review): NUMBA_DISABLE_CACHE is also set earlier (before the
# mecab_patch import); this second assignment is redundant but harmless.
os.environ["TORCH_HOME"] = "/tmp/torch"
os.environ["HF_HOME"] = "/tmp/huggingface"
os.environ["HF_HUB_CACHE"] = "/tmp/huggingface"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
os.environ["MPLCONFIGDIR"] = "/tmp"
os.environ["XDG_CACHE_HOME"] = "/tmp"
os.environ["XDG_CONFIG_HOME"] = "/tmp"
os.environ["NUMBA_DISABLE_CACHE"] = "1"
os.makedirs("/tmp/torch", exist_ok=True)
os.makedirs("/tmp/huggingface", exist_ok=True)
os.makedirs("/tmp/flagged", exist_ok=True)

# Output folder for generated audio files.
output_dir = "/tmp/outputs"
os.makedirs(output_dir, exist_ok=True)

# Initialize tone converter from its config file.
# NOTE(review): despite the name, this is the converter's config.json, not a
# checkpoint; whether ToneColorConverter also needs an explicit checkpoint
# load (e.g. load_ckpt) is not visible here — verify against the OpenVoice
# checkpoints layout.
ckpt_converter = "checkpoints/converter/config.json"
tone_color_converter = ToneColorConverter(ckpt_converter)

# Device setting: prefer GPU when available.
device = "cuda" if torch.cuda.is_available() else "cpu"

def clone_and_speak(text, speaker_wav):
    """Synthesize `text` with Melo TTS, then convert its tone color to match
    the reference speaker in `speaker_wav` using OpenVoice.

    Args:
        text: Text to synthesize.
        speaker_wav: Filesystem path to a reference .wav file (Gradio
            ``filepath`` input); falsy when the user uploaded nothing.

    Returns:
        Path to the converted output .wav, or an error string when no
        reference file was supplied.
    """
    if not speaker_wav:
        # NOTE(review): returning a plain string to a gr.Audio output relies
        # on Gradio surfacing it as an error; `raise gr.Error(...)` would be
        # more explicit, but the original string return is preserved.
        return "Please upload a reference .wav file."

    # Unique names so concurrent requests never clobber each other's files.
    base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
    tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
    final_output_path = f"{output_dir}/{base_name}_converted.wav"

    # English base model; the first available speaker id is the source voice.
    # (Loading the model per request is slow — consider caching it at module
    # level if latency matters.)
    model = TTS(language="EN", device=device)
    default_speaker_id = next(iter(model.hps.data.spk2id.values()))

    # Generate the base TTS audio that will be tone-converted.
    model.tts_to_file(text, default_speaker_id, tmp_melo_path)

    from openvoice import se_extractor

    # Style embeddings: src_se describes the *source* audio (the Melo
    # output), tgt_se the *target* reference speaker. The original code
    # passed the reference embedding as BOTH src_se and tgt_se, which makes
    # the conversion a near no-op.
    src_se, _ = se_extractor.get_se(tmp_melo_path, tone_color_converter, vad=False)
    tgt_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=False)

    try:
        # Run the tone conversion from the base voice to the reference voice.
        tone_color_converter.convert(
            audio_src_path=tmp_melo_path,
            src_se=src_se,
            tgt_se=tgt_se,
            output_path=final_output_path,
            message="@HuggingFace",
        )
    finally:
        # The intermediate Melo file is no longer needed; the original code
        # leaked one such file into /tmp per request.
        if os.path.exists(tmp_melo_path):
            os.remove(tmp_melo_path)

    return final_output_path

# Gradio UI: text plus a reference recording in, cloned speech out.
demo = gr.Interface(
    fn=clone_and_speak,
    inputs=[
        gr.Textbox(label="Enter Text"),
        gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)"),
    ],
    outputs=gr.Audio(label="Synthesized Output"),
    flagging_dir="/tmp/flagged",
    title="Text to Voice using Melo TTS + OpenVoice",
    description="Use Melo TTS for base synthesis and OpenVoice to apply a reference speaker's tone.",
)
demo.launch()