|
import nltk |
|
nltk.download('all') |
|
nltk.download('averaged_perceptron_tagger') |
|
nltk.download('punkt') |
|
import os |
|
import uuid |
|
import time |
|
import torch |
|
import gradio as gr |
|
os.environ["NUMBA_DISABLE_CACHE"] = "1" |
|
|
|
|
|
|
|
from MeloTTS.melo.api import TTS |
|
from openvoice.api import ToneColorConverter |
|
|
|
|
|
|
|
# Redirect every cache/config location to /tmp so the app can run in a
# read-only container (e.g. a Hugging Face Space) where $HOME is not writable.
os.environ["TORCH_HOME"] = "/tmp/torch"
os.environ["HF_HOME"] = "/tmp/huggingface"
os.environ["HF_HUB_CACHE"] = "/tmp/huggingface"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
os.environ["MPLCONFIGDIR"] = "/tmp"
os.environ["XDG_CACHE_HOME"] = "/tmp"
os.environ["XDG_CONFIG_HOME"] = "/tmp"
# NUMBA_DISABLE_CACHE is already set once near the top of the file (before the
# MeloTTS/OpenVoice imports, where it matters); the duplicate assignment that
# used to be here was redundant and has been dropped.

os.makedirs("/tmp/torch", exist_ok=True)
os.makedirs("/tmp/huggingface", exist_ok=True)
os.makedirs("/tmp/flagged", exist_ok=True)

# All synthesized audio (intermediate and final) is written under /tmp/outputs.
output_dir = "/tmp/outputs"
os.makedirs(output_dir, exist_ok=True)

# Pick the inference device once; reused by TTS synthesis and embedding loads.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Tone-color converter used to transfer the reference speaker's timbre onto
# the MeloTTS output. `ckpt_converter` points at the converter's config file.
# NOTE(review): OpenVoice's documented usage also calls
# tone_color_converter.load_ckpt('checkpoints/converter/checkpoint.pth')
# right after construction — confirm the converter weights are actually
# being loaded somewhere, otherwise conversion runs with random weights.
ckpt_converter = "checkpoints/converter/config.json"
tone_color_converter = ToneColorConverter(ckpt_converter)
|
def clone_and_speak(text, speaker_wav):
    """Synthesize `text` with MeloTTS, then re-voice it to match `speaker_wav`.

    Parameters
    ----------
    text : str
        The text to synthesize.
    speaker_wav : str | None
        Filesystem path to the reference voice (.wav) uploaded via Gradio.

    Returns
    -------
    str
        Path to the tone-converted .wav file under ``output_dir``.

    Raises
    ------
    gr.Error
        If no reference file was uploaded.
    """
    if not speaker_wav:
        # Returning a plain string into a gr.Audio output component breaks
        # the UI; gr.Error surfaces a proper error message box instead.
        raise gr.Error("Please upload a reference .wav file.")

    # Unique filenames so concurrent requests never clobber each other.
    base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
    tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
    final_output_path = f"{output_dir}/{base_name}_converted.wav"

    # NOTE(review): instantiating the TTS model on every request is slow;
    # consider hoisting this to module level once startup ordering (env vars,
    # checkpoint availability) is confirmed.
    model = TTS(language="EN", device=device)
    speaker_ids = model.hps.data.spk2id

    # The original code looped over *every* base speaker, overwriting
    # tmp_melo_path on each pass, so only the final iteration's audio and
    # source embedding survived. Synthesize exactly once with that same
    # final speaker instead of redoing the work N times.
    # (presumably the last spk2id key is the intended default — TODO confirm)
    speaker_key = list(speaker_ids.keys())[-1]
    speaker_id = speaker_ids[speaker_key]
    speaker_key = speaker_key.lower().replace('_', '-')

    speed = 1.0
    # Source speaker embedding shipped with OpenVoice v2 checkpoints.
    source_se = torch.load(
        f'checkpoints_v2/base_speakers/ses/{speaker_key}.pth',
        map_location=device,
    )
    model.tts_to_file(text, speaker_id, tmp_melo_path, speed=speed)

    # Extract the tone-color embedding of the uploaded reference voice.
    # Imported lazily: se_extractor drags in heavy audio dependencies.
    from openvoice import se_extractor
    ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=True)

    # Re-color the MeloTTS output with the reference speaker's timbre.
    tone_color_converter.convert(
        audio_src_path=tmp_melo_path,
        src_se=source_se,
        tgt_se=ref_se,
        output_path=final_output_path,
        message="@HuggingFace",
    )

    return final_output_path
|
|
|
|
|
|
|
|
|
# Gradio interface
|
# Build and serve the web UI. launch() blocks until the server shuts down.
demo = gr.Interface(
    fn=clone_and_speak,
    inputs=[
        gr.Textbox(label="Enter Text"),
        gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)"),
    ],
    outputs=gr.Audio(label="Synthesized Output"),
    flagging_dir="/tmp/flagged",
    title="Text to Voice using Melo TTS + OpenVoice",
    description="Use Melo TTS for base synthesis and OpenVoice to apply a reference speaker's tone.",
)

# A second gr.Interface used to follow here, wired to `clone_with_base_speaker`
# and `base_speaker_choices` — neither of which is defined anywhere in this
# file, so reaching that code raised NameError. It was also unreachable while
# the first (blocking) launch() was serving. It has been removed.
demo.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|