import os

# Set cache/config locations BEFORE importing any third-party library:
# huggingface_hub, torch, numba, and matplotlib read these environment
# variables at import time, so setting them after the imports (as this
# file previously did) has no effect on those libraries.
os.environ["TORCH_HOME"] = "/tmp/torch"
os.environ["HF_HOME"] = "/tmp/huggingface"
os.environ["HF_HUB_CACHE"] = "/tmp/huggingface"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"  # legacy alias of HF_HOME
os.environ["MPLCONFIGDIR"] = "/tmp"
os.environ["XDG_CACHE_HOME"] = "/tmp"
os.environ["XDG_CONFIG_HOME"] = "/tmp"
os.environ["NUMBA_DISABLE_CACHE"] = "1"

import uuid
import time

import torch
import gradio as gr

# Project-local patch; imported before melo (presumably patches MeCab for
# melo's Japanese tokenizer — TODO confirm against mecab_patch).
import mecab_patch
from melo.api import TTS
from openvoice.api import ToneColorConverter

# Writable scratch directories — on Hugging Face Spaces only /tmp is writable.
os.makedirs("/tmp/torch", exist_ok=True)
os.makedirs("/tmp/huggingface", exist_ok=True)
os.makedirs("/tmp/flagged", exist_ok=True)

# All generated audio is written here.
output_dir = "/tmp/outputs"
os.makedirs(output_dir, exist_ok=True)

# Tone-color converter: re-colors base TTS audio with a reference speaker's timbre.
ckpt_converter = "checkpoints/converter/config.json"
tone_color_converter = ToneColorConverter(ckpt_converter)

# Prefer GPU when available.
device = "cuda" if torch.cuda.is_available() else "cpu"
def clone_and_speak(text, speaker_wav):
    """Synthesize *text* with Melo TTS, then apply the tone of the
    reference recording at *speaker_wav* using OpenVoice.

    Parameters
    ----------
    text : str
        Text to synthesize.
    speaker_wav : str | None
        Filesystem path to an uploaded reference .wav (Gradio filepath).

    Returns
    -------
    str
        Path to the converted output .wav, or an error message string
        when no reference file was supplied.
    """
    if not speaker_wav:
        return "Please upload a reference .wav file."

    # Unique file names so concurrent requests never clobber each other.
    base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
    tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
    final_output_path = f"{output_dir}/{base_name}_converted.wav"

    # Loading the Melo model is expensive; create it once on first use and
    # reuse it across requests instead of re-instantiating per call.
    model = getattr(clone_and_speak, "_tts_model", None)
    if model is None:
        model = TTS(language="EN", device=device)
        clone_and_speak._tts_model = model

    # Pick the model's first available English speaker.
    default_speaker_id = next(iter(model.hps.data.spk2id.values()))

    # Generate the base (un-cloned) TTS voice.
    model.tts_to_file(text, default_speaker_id, tmp_melo_path)

    # Extract the tone/style embedding from the reference recording.
    from openvoice import se_extractor
    ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=False)

    # NOTE(review): src_se and tgt_se are both the reference embedding.
    # OpenVoice examples use the *base speaker's* embedding as src_se;
    # passing ref_se for both may weaken the conversion — confirm intent.
    tone_color_converter.convert(
        audio_src_path=tmp_melo_path,
        src_se=ref_se,
        tgt_se=ref_se,
        output_path=final_output_path,
        message="@HuggingFace",
    )
    return final_output_path
# Gradio UI: text + reference .wav in, cloned speech out.
ui_inputs = [
    gr.Textbox(label="Enter Text"),
    gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)"),
]

demo = gr.Interface(
    fn=clone_and_speak,
    inputs=ui_inputs,
    outputs=gr.Audio(label="Synthesized Output"),
    flagging_dir="/tmp/flagged",
    title="Text to Voice using Melo TTS + OpenVoice",
    description="Use Melo TTS for base synthesis and OpenVoice to apply a reference speaker's tone.",
)
demo.launch()