File size: 5,001 Bytes
ab0bdb4 0c20337 ed1a5ad ab0bdb4 49c7767 5165e58 ab0bdb4 0c5c249 ab0bdb4 0c5c249 ed1a5ad 8e6abd8 ed1a5ad ab0bdb4 0c5c249 ab0bdb4 ed1a5ad 0c20337 ab0bdb4 a0ea8bb ed1a5ad ab0bdb4 0c5c249 5165e58 ed1a5ad 0c20337 ab0bdb4 0c5c249 0c20337 ab0bdb4 0c20337 ab0bdb4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
# import os
# import uuid
# import time
# import torch
# import gradio as gr
# os.environ["NUMBA_DISABLE_CACHE"] = "1"
# import mecab_patch
# import english_patch
# from melo.api import TTS
# from openvoice.api import ToneColorConverter
# # Set temporary cache locations for Hugging Face Spaces
# os.environ["TORCH_HOME"] = "/tmp/torch"
# os.environ["HF_HOME"] = "/tmp/huggingface"
# os.environ["HF_HUB_CACHE"] = "/tmp/huggingface"
# os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
# os.environ["MPLCONFIGDIR"] = "/tmp"
# os.environ["XDG_CACHE_HOME"] = "/tmp"
# os.environ["XDG_CONFIG_HOME"] = "/tmp"
# os.environ["NUMBA_DISABLE_CACHE"] = "1"
# os.makedirs("/tmp/torch", exist_ok=True)
# os.makedirs("/tmp/huggingface", exist_ok=True)
# os.makedirs("/tmp/flagged", exist_ok=True)
# # Output folder
# output_dir = "/tmp/outputs"
# os.makedirs(output_dir, exist_ok=True)
# # Initialize tone converter
# ckpt_converter = "checkpoints/converter/config.json"
# tone_color_converter = ToneColorConverter(ckpt_converter)
# # Device setting
# device = "cuda" if torch.cuda.is_available() else "cpu"
# def clone_and_speak(text, speaker_wav):
# if not speaker_wav:
# return "Please upload a reference .wav file."
# # import melo.text.english as english
# # original_g2p = english.g2p
# # def patched_g2p(text):
# # phones, tones, word2ph = original_g2p(text)
# # # Fix: wrap ints in list to avoid TypeError
# # word2ph_fixed = []
# # for item in word2ph:
# # if isinstance(item, int):
# # word2ph_fixed.append([item])
# # else:
# # word2ph_fixed.append(item)
# # return phones, tones, word2ph_fixed
# # english.g2p = patched_g2p
# base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
# tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
# final_output_path = f"{output_dir}/{base_name}_converted.wav"
# # Use English speaker model
# model = TTS(language="EN", device=device)
# speaker_ids = model.hps.data.spk2id
# default_speaker_id = next(iter(speaker_ids.values()))
# # Generate base TTS voice
# speed = 1.0
# model.tts_to_file(text, default_speaker_id, tmp_melo_path,speed=speed)
# # Use speaker_wav as reference to extract style embedding
# from openvoice import se_extractor
# ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=False)
# # Run the tone conversion
# tone_color_converter.convert(
# audio_src_path=tmp_melo_path,
# src_se=ref_se,
# tgt_se=ref_se,
# output_path=final_output_path,
# message="@HuggingFace",
# )
# return final_output_path
# # Gradio interface
# gr.Interface(
# fn=clone_and_speak,
# inputs=[
# gr.Textbox(label="Enter Text"),
# gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)")
# ],
# outputs=gr.Audio(label="Synthesized Output"),
# flagging_dir="/tmp/flagged",
# title="Text to Voice using Melo TTS + OpenVoice",
# description="Use Melo TTS for base synthesis and OpenVoice to apply a reference speaker's tone.",
# ).launch()
import functools
import os
import time
import uuid

import gradio as gr
import torch
from openvoice import se_extractor, ToneColorConverter
from TTS.api import TTS

# Import your local english.py logic
from meloTTS import english
# Runtime configuration
# Ask PyTorch itself whether CUDA is usable. Shelling out to `nvidia-smi`
# only shows that a driver is installed, echoes the tool's output into the
# application log, and can select "cuda" on a machine whose torch build is
# CPU-only.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Directory (relative to the working directory) that receives generated audio.
output_dir = "outputs"
os.makedirs(output_dir, exist_ok=True)

# Load OpenVoice tone converter
# NOTE(review): the upstream OpenVoice examples pass the path of a
# config.json file and then call load_ckpt(<checkpoint path>) -- confirm that
# the ToneColorConverter imported by this file accepts a directory and
# exposes load_model().
tone_color_converter = ToneColorConverter(f"{os.getcwd()}/checkpoints", device=device)
tone_color_converter.load_model()
@functools.lru_cache(maxsize=1)
def _load_tts_model():
    """Build the English base TTS model once and reuse it for every request."""
    # NOTE(review): `language=` and `model.hps.data.spk2id` look like the
    # MeloTTS API rather than Coqui's `TTS.api.TTS` -- confirm that the TTS
    # class imported at the top of the file is the intended one.
    return TTS(language="EN", device=device)


def clone_and_speak(text, speaker_wav):
    """Synthesize *text* and apply the voice of *speaker_wav* to it.

    Args:
        text: The text to speak.
        speaker_wav: Filesystem path of the reference recording; falsy when
            the user has not uploaded one.

    Returns:
        Path of the converted WAV file inside ``output_dir``.

    Raises:
        gr.Error: If no reference recording was supplied.
    """
    if not speaker_wav:
        # The interface's output component is an audio player, so returning a
        # plain message would be treated as audio data. gr.Error shows the
        # message to the user instead.
        raise gr.Error("Please upload a reference .wav file.")

    base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
    tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
    final_output_path = f"{output_dir}/{base_name}_converted.wav"

    # Use English speaker model (cached: constructing it per request is slow).
    model = _load_tts_model()
    speaker_ids = model.hps.data.spk2id
    default_speaker_id = next(iter(speaker_ids.values()))

    try:
        # Generate base TTS voice
        # NOTE(review): MeloTTS names this argument `output_path`; confirm
        # that `file_path=` is accepted by the TTS class in use.
        model.tts_to_file(
            text,
            speaker_id=default_speaker_id,
            file_path=tmp_melo_path,
            speed=1.0,
        )

        # Extract style embedding
        ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=False)

        # Convert tone
        # NOTE(review): source and target embeddings are the same object
        # here; the OpenVoice examples use the base speaker's embedding as
        # `src_se` -- confirm this is intentional.
        tone_color_converter.convert(
            audio_src_path=tmp_melo_path,
            src_se=ref_se,
            tgt_se=ref_se,
            output_path=final_output_path,
            message="@HuggingFace",
        )
    finally:
        # The intermediate rendering is only an input to the conversion step;
        # remove it whether or not the conversion succeeded so the output
        # directory does not fill up with stale files.
        try:
            os.remove(tmp_melo_path)
        except FileNotFoundError:
            pass

    return final_output_path
# Gradio Interface
# Components are created up front so the Interface call reads as plain wiring.
_text_input = gr.Textbox(label="Text to Synthesize")
_reference_input = gr.Audio(label="Reference Voice (WAV)", type="filepath")
_cloned_output = gr.Audio(label="Cloned Voice Output")

demo = gr.Interface(
    fn=clone_and_speak,
    inputs=[_text_input, _reference_input],
    outputs=_cloned_output,
    title="Voice Cloner with MeloTTS + OpenVoice",
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|