import os
import time
import uuid

import nltk
import torch
import gradio as gr

# NLTK data used by MeloTTS's English text normalizer. Downloading 'all'
# is unnecessary and slow; the tagger and tokenizer below are sufficient.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# Set temporary cache locations for Hugging Face Spaces (the app's home
# directory is read-only there, so all caches must live under /tmp).
os.environ["TORCH_HOME"] = "/tmp/torch"
os.environ["HF_HOME"] = "/tmp/huggingface"
os.environ["HF_HUB_CACHE"] = "/tmp/huggingface"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
os.environ["MPLCONFIGDIR"] = "/tmp"
os.environ["XDG_CACHE_HOME"] = "/tmp"
os.environ["XDG_CONFIG_HOME"] = "/tmp"
os.environ["NUMBA_DISABLE_CACHE"] = "1"

os.makedirs("/tmp/torch", exist_ok=True)
os.makedirs("/tmp/huggingface", exist_ok=True)
os.makedirs("/tmp/flagged", exist_ok=True)

from MeloTTS.melo.api import TTS
from openvoice.api import ToneColorConverter

# Output folder
output_dir = "/tmp/outputs"
os.makedirs(output_dir, exist_ok=True)

# Device setting (must be decided before the converter is constructed,
# otherwise ToneColorConverter defaults to CUDA and fails on CPU-only hosts)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize the tone color converter and load its weights. The checkpoint
# path is assumed to sit next to the config, as in the OpenVoice repo layout.
ckpt_converter = "checkpoints/converter/config.json"
tone_color_converter = ToneColorConverter(ckpt_converter, device=device)
tone_color_converter.load_ckpt("checkpoints/converter/checkpoint.pth")


# Alternative workflow, kept for reference: clone from an uploaded .wav
# instead of a pre-saved base-speaker embedding.
#
# def clone_and_speak(text, speaker_wav):
#     if not speaker_wav:
#         return "Please upload a reference .wav file."
#
#     base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
#     tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
#     final_output_path = f"{output_dir}/{base_name}_converted.wav"
#
#     # Use English speaker model
#     model = TTS(language="EN", device=device)
#     speaker_ids = model.hps.data.spk2id
#     default_speaker_id = next(iter(speaker_ids.values()))
#
#     # Generate base TTS voice
#     model.tts_to_file(text, default_speaker_id, tmp_melo_path, speed=1.0)
#
#     # Use speaker_wav as reference to extract the style embedding
#     from openvoice import se_extractor
#     ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=True)
#
#     # Run the tone conversion
#     tone_color_converter.convert(
#         audio_src_path=tmp_melo_path,
#         src_se=ref_se,
#         tgt_se=ref_se,
#         output_path=final_output_path,
#         message="@HuggingFace",
#     )
#     return final_output_path


def clone_and_speak(text, selected_speaker_key):
    """Synthesize `text` with MeloTTS, then re-color it with a pre-saved
    OpenVoice base-speaker embedding."""
    if not text or not selected_speaker_key:
        return "Please enter text and select a speaker."

    base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
    tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
    final_output_path = f"{output_dir}/{base_name}_converted.wav"

    # Use the English MeloTTS model
    model = TTS(language="EN", device=device)
    speaker_ids = model.hps.data.spk2id

    # Map the UI key to the model-specific speaker id
    if selected_speaker_key not in speaker_ids:
        return f"Speaker '{selected_speaker_key}' not found in model."
    speaker_id = speaker_ids[selected_speaker_key]

    # Generate the base TTS voice
    speed = 1.0
    model.tts_to_file(text, speaker_id, tmp_melo_path, speed=speed)

    # Load the pre-saved speaker embedding; spk2id keys such as 'EN_INDIA'
    # map to filenames such as 'en-india.pth' in the OpenVoice v2 layout.
    normalized_key = selected_speaker_key.lower().replace("_", "-")
    se_path = f"checkpoints_v2/base_speakers/ses/{normalized_key}.pth"
    if not os.path.isfile(se_path):
        return f"SE file not found for speaker '{normalized_key}'."
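    # Tone-color embedding ("SE"): OpenVoice encodes a speaker's timbre as a
    # small tensor. Here it is loaded from disk rather than extracted from a
    # reference recording with se_extractor.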
    ref_se = torch.load(se_path, map_location=device)

    # Workaround: some torch builds report MPS as available even when we
    # want CPU, which confuses downstream device selection.
    if torch.backends.mps.is_available() and device == "cpu":
        torch.backends.mps.is_available = lambda: False

    # Run the tone conversion (source and target SE are the same here, so the
    # base voice is simply re-colored toward the chosen base speaker).
    tone_color_converter.convert(
        audio_src_path=tmp_melo_path,
        src_se=ref_se,
        tgt_se=ref_se,
        output_path=final_output_path,
        message="@HuggingFace",
    )

    return final_output_path


# Speaker keys offered in the UI. These are assumed to match the MeloTTS
# English checkpoint's spk2id keys; clone_and_speak validates the selection
# against the loaded model either way.
base_speaker_choices = ["EN-US", "EN-BR", "EN_INDIA", "EN-AU", "EN-Default"]

# Gradio interface
iface = gr.Interface(
    fn=clone_and_speak,
    inputs=[
        gr.Textbox(label="Input Text", placeholder="Enter text to synthesize..."),
        gr.Dropdown(choices=base_speaker_choices, label="Select Base Speaker"),
    ],
    outputs=gr.Audio(type="filepath", label="Cloned Voice Output"),
    title="Voice Cloning with OpenVoice Base Speakers",
    description="Choose a base speaker from OpenVoice and enter text to generate voice.",
)

iface.launch()
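# Minimal sketch of driving the pipeline without the UI (assumes the MeloTTS
# and OpenVoice checkpoints referenced above are in place):
#
#   out_path = clone_and_speak("Hello from MeloTTS and OpenVoice!", "EN-US")
#   print("Wrote", out_path)  # e.g. /tmp/outputs/output_..._converted.wav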