"""Gradio app: synthesize speech with Melo TTS and re-colour it with OpenVoice.

The text is first spoken by Melo's default English speaker, then OpenVoice's
tone colour converter maps that base voice onto the tone of an uploaded
reference recording.
"""

import contextlib
import functools
import os
import time
import uuid

# Cache/config locations for Hugging Face Spaces. These must be exported BEFORE
# the third-party imports below: several of those libraries (huggingface_hub,
# transformers, numba, matplotlib) read their cache settings while being
# imported, so assigning them afterwards is too late to take effect.
os.environ.update(
    {
        "NUMBA_DISABLE_CACHE": "1",
        "TORCH_HOME": "/tmp/torch",
        "HF_HOME": "/tmp/huggingface",
        "HF_HUB_CACHE": "/tmp/huggingface",
        "TRANSFORMERS_CACHE": "/tmp/huggingface",
        "MPLCONFIGDIR": "/tmp",
        "XDG_CACHE_HOME": "/tmp",
        "XDG_CONFIG_HOME": "/tmp",
    }
)

import gradio as gr  # noqa: E402  (must follow the environment setup above)
import torch  # noqa: E402
from melo.api import TTS  # noqa: E402
from openvoice import se_extractor  # noqa: E402
from openvoice.api import ToneColorConverter  # noqa: E402

# Output folder
output_dir = "/tmp/outputs"

# Scratch folder for the audio segments se_extractor writes while analysing
# the reference recording (its default is a relative "processed" directory).
_SE_WORK_DIR = "/tmp/processed"

for _directory in (
    "/tmp/torch",
    "/tmp/huggingface",
    "/tmp/flagged",
    output_dir,
    _SE_WORK_DIR,
):
    os.makedirs(_directory, exist_ok=True)

# Device setting. Resolved before any model is built so that every model ends
# up on the same device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize tone converter. The constructor only reads the config; the trained
# weights have to be loaded explicitly.
# NOTE(review): assumes the weights live next to the config as
# "checkpoint.pth" (the standard OpenVoice checkpoint layout) - confirm.
ckpt_converter = "checkpoints/converter/config.json"
tone_color_converter = ToneColorConverter(ckpt_converter, device=device)
tone_color_converter.load_ckpt(
    os.path.join(os.path.dirname(ckpt_converter), "checkpoint.pth")
)


@functools.lru_cache(maxsize=1)
def _get_tts_model():
    """Return the English Melo TTS model, building it on first use only."""
    return TTS(language="EN", device=device)


def clone_and_speak(text, speaker_wav):
    """Speak *text* with Melo TTS and convert it to the reference voice's tone.

    Args:
        text: The text to synthesize.
        speaker_wav: File path of the uploaded reference recording.

    Returns:
        Path of the converted ``.wav`` file inside ``output_dir``.

    Raises:
        gr.Error: If no reference recording or no text was supplied. Gradio
            shows the message to the user instead of trying to play it.
    """
    if not speaker_wav:
        raise gr.Error("Please upload a reference .wav file.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
    tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
    final_output_path = f"{output_dir}/{base_name}_converted.wav"

    # Use English speaker model (cached across requests).
    model = _get_tts_model()
    default_speaker_id = next(iter(model.hps.data.spk2id.values()))

    try:
        # Generate base TTS voice
        model.tts_to_file(text, default_speaker_id, tmp_melo_path)

        # Source embedding: the tone of the base voice that was just generated.
        src_se = tone_color_converter.extract_se(tmp_melo_path)

        # Target embedding: the tone of the uploaded reference recording.
        tgt_se, _ = se_extractor.get_se(
            speaker_wav,
            tone_color_converter,
            target_dir=_SE_WORK_DIR,
            vad=False,
        )

        # Run the tone conversion (base voice -> reference voice).
        tone_color_converter.convert(
            audio_src_path=tmp_melo_path,
            src_se=src_se,
            tgt_se=tgt_se,
            output_path=final_output_path,
            message="@HuggingFace",
        )
    finally:
        # The intermediate base-voice file is not needed after conversion.
        with contextlib.suppress(FileNotFoundError):
            os.remove(tmp_melo_path)

    return final_output_path


# Gradio interface
demo = gr.Interface(
    fn=clone_and_speak,
    inputs=[
        gr.Textbox(label="Enter Text"),
        gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)"),
    ],
    outputs=gr.Audio(label="Synthesized Output"),
    flagging_dir="/tmp/flagged",
    title="Text to Voice using Melo TTS + OpenVoice",
    description="Use Melo TTS for base synthesis and OpenVoice to apply a reference speaker's tone.",
)

if __name__ == "__main__":
    demo.launch()