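"""Hugging Face Spaces demo: Melo TTS synthesizes base speech from text, then
OpenVoice's tone color converter re-voices it to match an uploaded reference .wav."""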
import os
import uuid
import time
import torch
import gradio as gr
from melo.api import TTS
from openvoice import se_extractor
from openvoice.api import ToneColorConverter
# Set temporary cache locations for Hugging Face Spaces
os.environ["TORCH_HOME"] = "/tmp/torch"
os.environ["HF_HOME"] = "/tmp/huggingface"
os.environ["HF_HUB_CACHE"] = "/tmp/huggingface"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
os.environ["MPLCONFIGDIR"] = "/tmp"
os.environ["XDG_CACHE_HOME"] = "/tmp"
os.environ["XDG_CONFIG_HOME"] = "/tmp"
os.environ["NUMBA_DISABLE_CACHE"] = "1"
os.makedirs("/tmp/torch", exist_ok=True)
os.makedirs("/tmp/huggingface", exist_ok=True)
os.makedirs("/tmp/flagged", exist_ok=True)
# Output folder
output_dir = "/tmp/outputs"
os.makedirs(output_dir, exist_ok=True)
# Device setting
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize the tone color converter and load its weights
# (assumes the standard OpenVoice layout: checkpoints/converter/{config.json, checkpoint.pth})
converter_dir = "checkpoints/converter"
tone_color_converter = ToneColorConverter(f"{converter_dir}/config.json", device=device)
tone_color_converter.load_ckpt(f"{converter_dir}/checkpoint.pth")
def clone_and_speak(text, speaker_wav):
    """Synthesize `text` with Melo TTS, then convert its tone color to the reference voice."""
    if not speaker_wav:
        raise gr.Error("Please upload a reference .wav file.")

    base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
    tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
    final_output_path = f"{output_dir}/{base_name}_converted.wav"

    # Use the English speaker model and its first available speaker id
    model = TTS(language="EN", device=device)
    speaker_ids = model.hps.data.spk2id
    default_speaker_id = next(iter(speaker_ids.values()))

    # Generate the base TTS voice
    model.tts_to_file(text, default_speaker_id, tmp_melo_path)

    # Extract tone color embeddings: the source from the freshly generated base
    # audio, the target from the uploaded reference recording
    src_se, _ = se_extractor.get_se(tmp_melo_path, tone_color_converter, vad=False)
    tgt_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=False)

    # Run the tone conversion
    tone_color_converter.convert(
        audio_src_path=tmp_melo_path,
        src_se=src_se,
        tgt_se=tgt_se,
        output_path=final_output_path,
        message="@HuggingFace",
    )

    return final_output_path
# Gradio interface
gr.Interface(
    fn=clone_and_speak,
    inputs=[
        gr.Textbox(label="Enter Text"),
        gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)"),
    ],
    outputs=gr.Audio(label="Synthesized Output"),
    flagging_dir="/tmp/flagged",
    title="Text to Voice using Melo TTS + OpenVoice",
    description="Use Melo TTS for base synthesis and OpenVoice to apply a reference speaker's tone.",
).launch()