```python
import gradio as gr
import torch
import torchaudio
import numpy as np
import tempfile
import os
from pathlib import Path
import librosa
import soundfile as sf
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from datasets import load_dataset
import warnings
import gc
import requests
import json
import base64

warnings.filterwarnings("ignore")

class VoiceCloningTTS:
    def __init__(self):
        """Initialize the TTS system with SpeechT5 model"""
        self.device = torch.device("cpu")
        print(f"Using device: {self.device}")
        try:
            print("Loading SpeechT5 processor...")
            self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
            print("Loading SpeechT5 TTS model...")
            self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
            self.model.to(self.device)
            self.model.eval()
            print("Loading SpeechT5 vocoder...")
            self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
            self.vocoder.to(self.device)
            self.vocoder.eval()
            print("Loading Wav2Vec2 for speaker embedding...")
            self.wav2vec2_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
            self.wav2vec2_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
            self.wav2vec2_model.to(self.device)
            self.wav2vec2_model.eval()
            print("Loading speaker embeddings dataset...")
            embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
            self.speaker_embeddings_dataset = embeddings_dataset
            self.default_speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(self.device)
            self.user_speaker_embeddings = None
            self.sample_rate = 16000
            print("✅ TTS system initialized successfully!")
        except Exception as e:
            print(f"❌ Error initializing TTS system: {str(e)}")
            raise e

    def preprocess_audio(self, audio_path):
        """Preprocess audio for better speaker embedding extraction"""
        try:
            waveform, sample_rate = torchaudio.load(audio_path)
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)
            if sample_rate != self.sample_rate:
                resampler = torchaudio.transforms.Resample(sample_rate, self.sample_rate)
                waveform = resampler(waveform)
            waveform = waveform / (torch.max(torch.abs(waveform)) + 1e-8)
            min_length = 3 * self.sample_rate
            if waveform.shape[1] < min_length:
                repeat_times = int(np.ceil(min_length / waveform.shape[1]))
                waveform = waveform.repeat(1, repeat_times)[:, :min_length]
            max_length = 20 * self.sample_rate
            if waveform.shape[1] > max_length:
                waveform = waveform[:, :max_length]
            return waveform.squeeze()
        except Exception as e:
            print(f"Error in audio preprocessing: {e}")
            raise e

    def extract_speaker_embedding_advanced(self, audio_path):
        """Extract speaker embedding using advanced methods"""
        try:
            print(f"Processing audio file: {audio_path}")
            audio_tensor = self.preprocess_audio(audio_path)
            audio_numpy = audio_tensor.numpy()
            print("Extracting deep audio features with Wav2Vec2...")
            with torch.no_grad():
                inputs = self.wav2vec2_processor(audio_numpy, sampling_rate=self.sample_rate, return_tensors="pt", padding=True)
                outputs = self.wav2vec2_model(inputs.input_values.to(self.device))
                speaker_features = torch.mean(outputs.last_hidden_state, dim=1)
            print(f"Extracted Wav2Vec2 features: {speaker_features.shape}")
            best_embedding = self.find_best_matching_speaker(speaker_features, audio_numpy)
            print("✅ Advanced speaker embedding created successfully!")
            return best_embedding, "✅ Voice profile extracted using advanced neural analysis! You can now generate speech in this voice."
        except Exception as e:
            print(f"Error in advanced embedding extraction: {e}")
            return self.extract_speaker_embedding_improved(audio_path)

    def find_best_matching_speaker(self, target_features, audio_numpy):
        """Create a modified embedding based on acoustic features"""
        try:
            mfccs = librosa.feature.mfcc(y=audio_numpy, sr=self.sample_rate, n_mfcc=13)
            pitch, _ = librosa.piptrack(y=audio_numpy, sr=self.sample_rate)
            spectral_centroids = librosa.feature.spectral_centroid(y=audio_numpy, sr=self.sample_rate)
            acoustic_signature = np.concatenate([
                np.mean(mfccs, axis=1),
                np.std(mfccs, axis=1),
                [np.mean(pitch[pitch > 0]) if np.any(pitch > 0) else 200],
                [np.mean(spectral_centroids)]
            ])
            best_embedding = self.default_speaker_embeddings
            modification_factor = 0.3  # Increased for more distinct voice
            feature_mod = torch.tensor(acoustic_signature[:best_embedding.shape[1]], dtype=torch.float32).to(self.device)
            feature_mod = (feature_mod - torch.mean(feature_mod)) / (torch.std(feature_mod) + 1e-8)
            modified_embedding = best_embedding + modification_factor * feature_mod.unsqueeze(0)
            modified_embedding = torch.nn.functional.normalize(modified_embedding, p=2, dim=1)
            return modified_embedding
        except Exception as e:
            print(f"Error in speaker matching: {e}")
            return self.default_speaker_embeddings

    def extract_speaker_embedding_improved(self, audio_path):
        """Improved speaker embedding extraction with better acoustic analysis"""
        try:
            print("Using improved speaker embedding extraction...")
            audio_tensor = self.preprocess_audio(audio_path)
            audio_numpy = audio_tensor.numpy()
            print("Extracting comprehensive acoustic features...")
            mfccs = librosa.feature.mfcc(y=audio_numpy, sr=self.sample_rate, n_mfcc=20)
            delta_mfccs = librosa.feature.delta(mfccs)
            delta2_mfccs = librosa.feature.delta(mfccs, order=2)
            f0, _, _ = librosa.pyin(audio_numpy, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
            f0_clean = f0[~np.isnan(f0)]
            spectral_centroids = librosa.feature.spectral_centroid(y=audio_numpy, sr=self.sample_rate)
            spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio_numpy, sr=self.sample_rate)
            spectral_rolloff = librosa.feature.spectral_rolloff(y=audio_numpy, sr=self.sample_rate)
            spectral_contrast = librosa.feature.spectral_contrast(y=audio_numpy, sr=self.sample_rate)
            lpc_coeffs = librosa.lpc(audio_numpy, order=16)
            features = np.concatenate([
                np.mean(mfccs, axis=1),
                np.std(mfccs, axis=1),
                np.mean(delta_mfccs, axis=1),
                np.mean(delta2_mfccs, axis=1),
                [np.mean(f0_clean) if len(f0_clean) > 0 else 200],
                [np.std(f0_clean) if len(f0_clean) > 0 else 50],
                [np.mean(spectral_centroids)],
                [np.mean(spectral_bandwidth)],
                [np.mean(spectral_rolloff)],
                np.mean(spectral_contrast, axis=1),
                lpc_coeffs[1:]
            ])
            print(f"Extracted {len(features)} advanced acoustic features")
            base_embedding = self.default_speaker_embeddings
            embedding_size = base_embedding.shape[1]
            features_normalized = (features - np.mean(features)) / (np.std(features) + 1e-8)
            if len(features_normalized) > embedding_size:
                modification_vector = features_normalized[:embedding_size]
            else:
                modification_vector = np.pad(features_normalized, (0, embedding_size - len(features_normalized)), 'reflect')
            modification_tensor = torch.tensor(modification_vector, dtype=torch.float32).to(self.device)
            modification_strength = 0.3  # Increased for more distinct voice
            speaker_embedding = base_embedding + modification_strength * modification_tensor.unsqueeze(0)
            if len(f0_clean) > 0:
                pitch_factor = np.mean(f0_clean) / 200.0
                pitch_modification = 0.05 * (pitch_factor - 1.0)
                speaker_embedding = speaker_embedding * (1.0 + pitch_modification)
            speaker_embedding = torch.nn.functional.normalize(speaker_embedding, p=2, dim=1)
            return speaker_embedding, "✅ Voice profile extracted with enhanced acoustic analysis! Ready for speech generation."
        except Exception as e:
            print(f"❌ Error in improved embedding extraction: {str(e)}")
            return None, f"❌ Error processing audio: {str(e)}"

    def extract_speaker_embedding(self, audio_path):
        """Main method for speaker embedding extraction"""
        try:
            return self.extract_speaker_embedding_advanced(audio_path)
        except Exception as e:
            print(f"Advanced method failed: {e}")
            return self.extract_speaker_embedding_improved(audio_path)

    def synthesize_speech(self, text, use_cloned_voice=True):
        """Convert text to speech using the specified voice"""
        try:
            if not text.strip():
                return None, "❌ Please enter some text to convert."
            if len(text) > 500:
                text = text[:500]
                print("Text truncated to 500 characters")
            print(f"Synthesizing speech for: '{text[:50]}...'")
            if use_cloned_voice and self.user_speaker_embeddings is not None:
                speaker_embeddings = self.user_speaker_embeddings
                voice_type = "your cloned voice"
                print("Using cloned voice embeddings")
            else:
                speaker_embeddings = self.default_speaker_embeddings
                voice_type = "default voice"
                print("Using default voice embeddings")
            print(f"Speaker embedding shape: {speaker_embeddings.shape}")
            inputs = self.processor(text=text, return_tensors="pt")
            input_ids = inputs["input_ids"].to(self.device)
            print("Generating speech...")
            with torch.no_grad():
                speaker_embeddings = speaker_embeddings.to(self.device)
                if speaker_embeddings.dim() == 1:
                    speaker_embeddings = speaker_embeddings.unsqueeze(0)
                speech = self.model.generate_speech(input_ids, speaker_embeddings, vocoder=self.vocoder)
            speech_numpy = speech.cpu().numpy()
            print(f"Generated audio shape: {speech_numpy.shape}")
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                sf.write(tmp_file.name, speech_numpy, self.sample_rate)
                print(f"Audio saved to: {tmp_file.name}")
            del speech, input_ids
            gc.collect()
            return tmp_file.name, f"✅ Speech generated successfully using {voice_type}!"
        except Exception as e:
            print(f"❌ Error in synthesize_speech: {str(e)}")
            return None, f"❌ Error generating speech: {str(e)}"

print("π Initializing Enhanced Voice Cloning TTS System...") | |
tts_system = VoiceCloningTTS() | |
def process_voice_upload(audio_file): | |
if audio_file is None: | |
return "β Please upload an audio file first.", gr.update(interactive=False), gr.update(interactive=False) | |
try: | |
print(f"Processing uploaded file: {audio_file}") | |
speaker_embedding, message = tts_system.extract_speaker_embedding(audio_file) | |
if speaker_embedding is not None: | |
tts_system.user_speaker_embeddings = speaker_embedding | |
print("β Speaker embeddings saved successfully") | |
return message, gr.update(interactive=True), gr.update(interactive=True) | |
else: | |
return message, gr.update(interactive=False), gr.update(interactive=False) | |
except Exception as e: | |
error_msg = f"β Error processing audio: {str(e)}" | |
print(error_msg) | |
return error_msg, gr.update(interactive=False), gr.update(interactive=False) | |
def generate_speech(text, use_cloned_voice): | |
Rosin 42 recommends making sure the uploaded audio file is in a format `torchaudio.load()` can read, such as WAV, and that it is resampled to the expected 16 kHz. A small pre-check along those lines is sketched below, followed by a revised version of the app that should ensure the cloned voice is used correctly.
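This is a minimal sketch of such a pre-check, using librosa and soundfile (both already required by the app); the file names are hypothetical:

```python
import librosa
import soundfile as sf

TARGET_SR = 16000  # SpeechT5 and Wav2Vec2 both expect 16 kHz mono audio


def normalize_clip(in_path, out_path="voice_sample_16k.wav"):
    """Load any clip librosa can read, downmix to mono, resample to 16 kHz, and save as WAV."""
    audio, _ = librosa.load(in_path, sr=TARGET_SR, mono=True)  # resamples and downmixes in one step
    sf.write(out_path, audio, TARGET_SR)
    return out_path


# Hypothetical usage before uploading the sample to the Space:
# clean_path = normalize_clip("my_recording.m4a")
```

With the input format taken care of, here is the full revised app: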
```python
import gradio as gr
import torch
import torchaudio
import numpy as np
import tempfile
import os
from pathlib import Path
import librosa
import soundfile as sf
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from datasets import load_dataset
import warnings
import gc

warnings.filterwarnings("ignore")

class VoiceCloningTTS:
    def __init__(self):
        self.device = torch.device("cpu")
        print(f"Using device: {self.device}")
        try:
            print("Loading SpeechT5 processor...")
            self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
            print("Loading SpeechT5 TTS model...")
            self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
            self.model.to(self.device)
            self.model.eval()
            print("Loading SpeechT5 vocoder...")
            self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
            self.vocoder.to(self.device)
            self.vocoder.eval()
            print("Loading Wav2Vec2 for speaker embedding...")
            self.wav2vec2_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
            self.wav2vec2_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
            self.wav2vec2_model.to(self.device)
            self.wav2vec2_model.eval()
            print("Loading speaker embeddings dataset...")
            embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
            self.speaker_embeddings_dataset = embeddings_dataset
            self.default_speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(self.device)
            self.user_speaker_embeddings = None
            self.sample_rate = 16000
            print("✅ TTS system initialized successfully!")
        except Exception as e:
            print(f"❌ Error initializing TTS system: {str(e)}")
            raise e

    def preprocess_audio(self, audio_path):
        try:
            waveform, sample_rate = torchaudio.load(audio_path)
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)
            if sample_rate != self.sample_rate:
                resampler = torchaudio.transforms.Resample(sample_rate, self.sample_rate)
                waveform = resampler(waveform)
            waveform = waveform / (torch.max(torch.abs(waveform)) + 1e-8)
            min_length = 3 * self.sample_rate
            if waveform.shape[1] < min_length:
                repeat_times = int(np.ceil(min_length / waveform.shape[1]))
                waveform = waveform.repeat(1, repeat_times)[:, :min_length]
            max_length = 20 * self.sample_rate
            if waveform.shape[1] > max_length:
                waveform = waveform[:, :max_length]
            return waveform.squeeze()
        except Exception as e:
            print(f"Error in audio preprocessing: {e}")
            raise e

    def extract_speaker_embedding_advanced(self, audio_path):
        try:
            print(f"Processing audio file: {audio_path}")
            audio_tensor = self.preprocess_audio(audio_path)
            audio_numpy = audio_tensor.numpy()
            print("Extracting deep audio features with Wav2Vec2...")
            with torch.no_grad():
                inputs = self.wav2vec2_processor(audio_numpy, sampling_rate=self.sample_rate, return_tensors="pt", padding=True)
                outputs = self.wav2vec2_model(inputs.input_values.to(self.device))
                speaker_features = torch.mean(outputs.last_hidden_state, dim=1)
            print(f"Extracted Wav2Vec2 features: {speaker_features.shape}")
            best_embedding = self.find_best_matching_speaker(speaker_features, audio_numpy)
            print("✅ Advanced speaker embedding created successfully!")
            return best_embedding, "✅ Voice profile extracted using advanced neural analysis!"
        except Exception as e:
            print(f"Error in advanced embedding extraction: {e}")
            return self.extract_speaker_embedding_improved(audio_path)

    def find_best_matching_speaker(self, target_features, audio_numpy):
        try:
            mfccs = librosa.feature.mfcc(y=audio_numpy, sr=self.sample_rate, n_mfcc=13)
            pitch, _ = librosa.piptrack(y=audio_numpy, sr=self.sample_rate)
            spectral_centroids = librosa.feature.spectral_centroid(y=audio_numpy, sr=self.sample_rate)
            acoustic_signature = np.concatenate([
                np.mean(mfccs, axis=1),
                np.std(mfccs, axis=1),
                [np.mean(pitch[pitch > 0]) if np.any(pitch > 0) else 200],
                [np.mean(spectral_centroids)]
            ])
            best_embedding = self.default_speaker_embeddings
            modification_factor = 0.3  # Increased for more distinct voice
            embedding_size = best_embedding.shape[1]
            # The acoustic signature (28 values) is much shorter than the x-vector, so pad it
            # to the embedding size first; otherwise the addition below cannot broadcast and
            # the except branch silently falls back to the default voice.
            if len(acoustic_signature) < embedding_size:
                acoustic_signature = np.pad(acoustic_signature, (0, embedding_size - len(acoustic_signature)), 'reflect')
            feature_mod = torch.tensor(acoustic_signature[:embedding_size], dtype=torch.float32).to(self.device)
            feature_mod = (feature_mod - torch.mean(feature_mod)) / (torch.std(feature_mod) + 1e-8)
            modified_embedding = best_embedding + modification_factor * feature_mod.unsqueeze(0)
            modified_embedding = torch.nn.functional.normalize(modified_embedding, p=2, dim=1)
            return modified_embedding
        except Exception as e:
            print(f"Error in speaker matching: {e}")
            return self.default_speaker_embeddings

    def extract_speaker_embedding_improved(self, audio_path):
        try:
            print("Using improved speaker embedding extraction...")
            audio_tensor = self.preprocess_audio(audio_path)
            audio_numpy = audio_tensor.numpy()
            print("Extracting comprehensive acoustic features...")
            mfccs = librosa.feature.mfcc(y=audio_numpy, sr=self.sample_rate, n_mfcc=20)
            delta_mfccs = librosa.feature.delta(mfccs)
            delta2_mfccs = librosa.feature.delta(mfccs, order=2)
            f0, _, _ = librosa.pyin(audio_numpy, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
            f0_clean = f0[~np.isnan(f0)]
            spectral_centroids = librosa.feature.spectral_centroid(y=audio_numpy, sr=self.sample_rate)
            spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio_numpy, sr=self.sample_rate)
            spectral_rolloff = librosa.feature.spectral_rolloff(y=audio_numpy, sr=self.sample_rate)
            spectral_contrast = librosa.feature.spectral_contrast(y=audio_numpy, sr=self.sample_rate)
            lpc_coeffs = librosa.lpc(audio_numpy, order=16)
            features = np.concatenate([
                np.mean(mfccs, axis=1),
                np.std(mfccs, axis=1),
                np.mean(delta_mfccs, axis=1),
                np.mean(delta2_mfccs, axis=1),
                [np.mean(f0_clean) if len(f0_clean) > 0 else 200],
                [np.std(f0_clean) if len(f0_clean) > 0 else 50],
                [np.mean(spectral_centroids)],
                [np.mean(spectral_bandwidth)],
                [np.mean(spectral_rolloff)],
                np.mean(spectral_contrast, axis=1),
                lpc_coeffs[1:]
            ])
            print(f"Extracted {len(features)} advanced acoustic features")
            base_embedding = self.default_speaker_embeddings
            embedding_size = base_embedding.shape[1]
            features_normalized = (features - np.mean(features)) / (np.std(features) + 1e-8)
            if len(features_normalized) > embedding_size:
                modification_vector = features_normalized[:embedding_size]
            else:
                modification_vector = np.pad(features_normalized, (0, embedding_size - len(features_normalized)), 'reflect')
            modification_tensor = torch.tensor(modification_vector, dtype=torch.float32).to(self.device)
            modification_strength = 0.3  # Increased for more distinct voice
            speaker_embedding = base_embedding + modification_strength * modification_tensor.unsqueeze(0)
            if len(f0_clean) > 0:
                pitch_factor = np.mean(f0_clean) / 200.0
                pitch_modification = 0.05 * (pitch_factor - 1.0)
                speaker_embedding = speaker_embedding * (1.0 + pitch_modification)
            speaker_embedding = torch.nn.functional.normalize(speaker_embedding, p=2, dim=1)
            return speaker_embedding, "✅ Voice profile extracted with enhanced acoustic analysis!"
        except Exception as e:
            print(f"❌ Error in improved embedding extraction: {str(e)}")
            return None, f"❌ Error processing audio: {str(e)}"

    def extract_speaker_embedding(self, audio_path):
        try:
            return self.extract_speaker_embedding_advanced(audio_path)
        except Exception as e:
            print(f"Advanced method failed: {e}")
            return self.extract_speaker_embedding_improved(audio_path)

    def synthesize_speech(self, text, use_cloned_voice=True):
        try:
            if not text.strip():
                return None, "❌ Please enter some text to convert."
            if len(text) > 500:
                text = text[:500]
                print("Text truncated to 500 characters")
            print(f"Synthesizing speech for: '{text[:50]}...'")
            if use_cloned_voice and self.user_speaker_embeddings is not None:
                speaker_embeddings = self.user_speaker_embeddings
                voice_type = "your cloned voice"
                print("Using cloned voice embeddings")
            else:
                speaker_embeddings = self.default_speaker_embeddings
                voice_type = "default voice"
                print("Using default voice embeddings")
            print(f"Speaker embedding shape: {speaker_embeddings.shape}")
            inputs = self.processor(text=text, return_tensors="pt")
            input_ids = inputs["input_ids"].to(self.device)
            print("Generating speech...")
            with torch.no_grad():
                speaker_embeddings = speaker_embeddings.to(self.device)
                if speaker_embeddings.dim() == 1:
                    speaker_embeddings = speaker_embeddings.unsqueeze(0)
                speech = self.model.generate_speech(input_ids, speaker_embeddings, vocoder=self.vocoder)
            speech_numpy = speech.cpu().numpy()
            print(f"Generated audio shape: {speech_numpy.shape}")
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                sf.write(tmp_file.name, speech_numpy, self.sample_rate)
                print(f"Audio saved to: {tmp_file.name}")
            del speech, input_ids
            gc.collect()
            return tmp_file.name, f"✅ Speech generated successfully using {voice_type}!"
        except Exception as e:
            print(f"❌ Error in synthesize_speech: {str(e)}")
            return None, f"❌ Error generating speech: {str(e)}"

print("π Initializing Voice Cloning TTS System...") | |
tts_system = VoiceCloningTTS() | |
def process_voice_upload(audio_file): | |
if audio_file is None: | |
return "β Please upload an audio file first.", gr.update(interactive=False), gr.update(interactive=False) | |
try: | |
print(f"Processing uploaded file: {audio_file}") | |
speaker_embedding, message = tts_system.extract_speaker_embedding(audio_file) | |
if speaker_embedding is not None: | |
tts_system.user_speaker_embeddings = speaker_embedding | |
print("β Speaker embeddings saved successfully") | |
return message, gr.update(interactive=True), gr.update(interactive=True) | |
else: | |
return message, gr.update(interactive=False), gr.update(interactive=False) | |
except Exception as e: | |
error_msg = f"β Error processing audio: {str(e)}" | |
print(error_msg) | |
return error_msg, gr.update(interactive=False), gr.update(interactive=False) | |
def generate_speech(text, use_cloned_voice): | |
if not text.strip(): | |
return None, "β Please enter some text to convert." | |
try: | |
print(f"Generating speech - Use cloned voice: {use_cloned_voice}") | |
audio_file, message = tts_system.synthesize_speech(text, use_cloned_voice) | |
return audio_file, message | |
except Exception as e: | |
error_msg = f"β Error generating speech: {str(e)}" | |
print(error_msg) | |
return None, error_msg | |
def clear_voice_profile(): | |
tts_system.user_speaker_embeddings = None | |
return "π Voice profile cleared.", gr.update(interactive=False), gr.update(interactive=False) | |
def update_generate_button(text, use_cloned): | |
text_ready = bool(text.strip()) | |
voice_ready = (not use_cloned) or (tts_system.user_speaker_embeddings is not None) | |
return gr.update(interactive=text_ready and voice_ready) | |
with gr.Blocks(title="Voice Cloning TTS System") as demo:
    gr.Markdown("# Voice Cloning TTS System")
    gr.Markdown("Upload an audio file to clone your voice and generate speech.")
    with gr.Row():
        with gr.Column():
            voice_upload = gr.Audio(label="Upload Voice Sample", type="filepath", sources=["upload", "microphone"])
            upload_status = gr.Textbox(label="Status", interactive=False)
            clear_btn = gr.Button("Clear Voice Profile")
        with gr.Column():
            text_input = gr.Textbox(label="Text to Convert", lines=5)
            use_cloned_voice = gr.Checkbox(label="Use Cloned Voice", value=True, interactive=False)
            generate_btn = gr.Button("Generate Speech", interactive=False)
            output_audio = gr.Audio(label="Generated Speech", type="filepath")
            generation_status = gr.Textbox(label="Generation Status", interactive=False)

    voice_upload.change(fn=process_voice_upload, inputs=[voice_upload], outputs=[upload_status, use_cloned_voice, generate_btn])
    text_input.change(fn=update_generate_button, inputs=[text_input, use_cloned_voice], outputs=[generate_btn])
    use_cloned_voice.change(fn=update_generate_button, inputs=[text_input, use_cloned_voice], outputs=[generate_btn])
    generate_btn.click(fn=generate_speech, inputs=[text_input, use_cloned_voice], outputs=[output_audio, generation_status])
    clear_btn.click(fn=clear_voice_profile, outputs=[upload_status, use_cloned_voice, generate_btn])

if __name__ == "__main__":
    print("🚀 Starting Voice Cloning TTS System...")
    demo.launch()
```
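For quick testing outside the Gradio UI, the same class can be driven directly. This is a minimal sketch, assuming the script above is saved as `app.py` and that `voice_sample_16k.wav` is a hypothetical local recording:

```python
# Reuse the module-level instance so the models are only loaded once.
from app import tts_system

# Clone the voice from a sample, mirroring what process_voice_upload() does in the UI.
embedding, status = tts_system.extract_speaker_embedding("voice_sample_16k.wav")
print(status)
if embedding is not None:
    tts_system.user_speaker_embeddings = embedding

# Synthesize with the cloned profile (falls back to the default voice if none is stored).
wav_path, message = tts_system.synthesize_speech("Hello, this is a quick cloning test.", use_cloned_voice=True)
print(message, wav_path)
```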