File size: 2,123 Bytes
0c20337
a59b93c
 
 
 
 
 
e16fd1e
a59b93c
a647645
e03756e
 
e16fd1e
 
 
 
 
43e8b5c
5165e58
0c5c249
509a00f
8b02d24
0c5c249
5165e58
a59b93c
0c5c249
 
a59b93c
0c5c249
 
 
 
 
 
 
 
 
 
 
 
 
 
0c20337
a59b93c
0c5c249
 
 
 
 
 
 
 
 
 
5165e58
0c5c249
0c20337
a59b93c
0c5c249
 
0c20337
0c5c249
 
0c20337
0c5c249
a59b93c
0c5c249
 
0c20337
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# Module setup for the OpenVoice voice-cloning demo.
#
# Ordering matters: the cache/config environment variables below are exported
# BEFORE the third-party imports. Libraries that resolve their cache locations
# when first imported (e.g. huggingface_hub, which gradio pulls in) would
# otherwise ignore values that are set only after the import has happened.
import os
import time
import uuid

# Environment fixes for HF Spaces: point every cache/config location at /tmp.
os.environ["HF_HOME"] = "/tmp/huggingface"
os.environ["HF_HUB_CACHE"] = "/tmp/huggingface"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
os.environ["MPLCONFIGDIR"] = "/tmp"
os.environ["XDG_CACHE_HOME"] = "/tmp"
os.environ["XDG_CONFIG_HOME"] = "/tmp"
os.environ["NUMBA_DISABLE_CACHE"] = "1"
os.makedirs("/tmp/huggingface", exist_ok=True)
os.makedirs("/tmp/flagged", exist_ok=True)

# Third-party imports deliberately come after the environment is configured.
import gradio as gr  # noqa: E402
from openvoice.api import ToneColorConverter  # noqa: E402
from openvoice import se_extractor  # noqa: E402
import torch  # noqa: E402

# Set model paths
ckpt_converter = "checkpoints/converter/config.json"
output_dir = "/tmp/outputs"
os.makedirs(output_dir, exist_ok=True)

# Choose the device explicitly so the app also starts on CPU-only hardware.
# NOTE(review): upstream OpenVoice defaults to device="cuda:0" and asserts
# that CUDA is available -- confirm against the installed openvoice version.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Initialize OpenVoice converter
# NOTE(review): only the config is passed here; no checkpoint weights are
# loaded anywhere in this file. Upstream examples call
# `load_ckpt(".../checkpoint.pth")` after construction -- verify whether that
# call is needed for this deployment.
tone_color_converter = ToneColorConverter(ckpt_converter, device=device)

# Most recently extracted speaker embedding (overwritten on every request).
ref_speaker_embed = None

def clone_and_speak(text, speaker_wav):
    """Synthesize ``text`` in the voice of the uploaded reference recording.

    Args:
        text: The text to speak, as typed into the Gradio textbox.
        speaker_wav: File path of the uploaded reference audio, or a falsy
            value when nothing was uploaded.

    Returns:
        Path of the generated ``.wav`` file inside ``output_dir``.

    Raises:
        gr.Error: If no reference audio was uploaded or the text is blank.
            Raising (rather than returning a message string) keeps the
            message out of the audio output, which expects a file path.
    """
    global ref_speaker_embed

    if not speaker_wav:
        raise gr.Error("Please upload a reference .wav file.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # Unique output name: timestamp plus a short random suffix so that
    # requests arriving within the same second do not overwrite each other.
    base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
    output_wav = os.path.join(output_dir, f"{base_name}.wav")

    # Extract style from uploaded speaker voice.
    # NOTE(review): upstream OpenVoice's get_se returns a pair
    # (embedding, audio_name); unwrap it if that is what comes back --
    # confirm against the installed openvoice version.
    extracted = se_extractor.get_se(speaker_wav, tone_color_converter)
    speaker_embed = extracted[0] if isinstance(extracted, tuple) else extracted

    # Keep the module-level value updated for any external reader, but use the
    # local for inference so a concurrent request cannot swap the embedding
    # between extraction and use.
    ref_speaker_embed = speaker_embed

    # Generate speech using base model.
    # NOTE(review): could not match `infer` and these keyword arguments to
    # upstream OpenVoice's ToneColorConverter (which exposes `convert`) --
    # verify this call against the installed openvoice version.
    tone_color_converter.infer(
        text=text,
        speaker_id="openvoice",
        language="en",
        ref_speaker=speaker_wav,
        ref_embed=speaker_embed,
        output_path=output_wav,
        top_k=10,
        temperature=0.3,
    )

    return output_wav

# Gradio interface (exposed as global `demo` for HF Spaces).
# Components are created in the same order as before: text input, reference
# audio input, then the audio output.
_text_input = gr.Textbox(label="Enter Text")
_voice_input = gr.Audio(
    type="filepath",
    label="Upload a Reference Voice (.wav)",
)
_audio_output = gr.Audio(label="Synthesized Output")

demo = gr.Interface(
    title="Text to Voice using OpenVoice",
    description="Clone any voice (English) and generate speech using OpenVoice on CPU.",
    fn=clone_and_speak,
    inputs=[_text_input, _voice_input],
    outputs=_audio_output,
    flagging_dir="/tmp/flagged",  # flagged samples go to a writable temp dir
)