import gradio as gr
import torch
import torchaudio
import numpy as np
import tempfile
import os
from pathlib import Path
import librosa
import soundfile as sf
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import warnings
import gc

warnings.filterwarnings("ignore")


class VoiceCloningTTS:
    def __init__(self):
        """Initialize the TTS system with the SpeechT5 model"""
        # Use CPU on HF Spaces to avoid memory issues
        self.device = torch.device("cpu")
        print(f"Using device: {self.device}")
        try:
            # Load SpeechT5 models with memory optimization
            print("Loading SpeechT5 processor...")
            self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
            print("Loading SpeechT5 TTS model...")
            self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
            self.model.to(self.device)
            self.model.eval()  # Set to evaluation mode
            print("Loading SpeechT5 vocoder...")
            self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
            self.vocoder.to(self.device)
            self.vocoder.eval()
            # Load default speaker embeddings
            print("Loading speaker embeddings...")
            embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
            self.default_speaker_embeddings = (
                torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(self.device)
            )
            self.user_speaker_embeddings = None
            self.sample_rate = 16000
            print("✅ TTS system initialized successfully!")
        except Exception as e:
            print(f"❌ Error initializing TTS system: {str(e)}")
            raise
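
    # Note: index 7306 of the cmu-arctic-xvectors validation split is the 512-dim
    # x-vector used in the official SpeechT5 examples (commonly described as a
    # US-English female speaker); it serves as the base voice that
    # extract_speaker_embedding() below perturbs.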
    def extract_speaker_embedding(self, audio_path):
        """Extract a speaker embedding from the uploaded audio"""
        try:
            print(f"Processing audio file: {audio_path}")
            # Load and preprocess audio
            waveform, sample_rate = torchaudio.load(audio_path)
            print(f"Original audio shape: {waveform.shape}, sample rate: {sample_rate}")
            # Resample if necessary
            if sample_rate != self.sample_rate:
                print(f"Resampling from {sample_rate} to {self.sample_rate}")
                resampler = torchaudio.transforms.Resample(sample_rate, self.sample_rate)
                waveform = resampler(waveform)
            # Convert to mono if stereo
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)
                print("Converted to mono")
            # Ensure minimum length (at least 1 second)
            min_length = self.sample_rate
            if waveform.shape[1] < min_length:
                # Pad with zeros if too short
                padding = min_length - waveform.shape[1]
                waveform = torch.nn.functional.pad(waveform, (0, padding))
                print("Padded audio to minimum length")
            # Limit maximum length (30 seconds max for memory efficiency)
            max_length = 30 * self.sample_rate
            if waveform.shape[1] > max_length:
                waveform = waveform[:, :max_length]
                print("Truncated audio to 30 seconds")
            # Normalize audio
            waveform = waveform / (torch.max(torch.abs(waveform)) + 1e-8)
            # Convert to numpy for librosa processing
            audio_numpy = waveform.squeeze().numpy()
            print("Extracting audio features...")
            # Extract comprehensive audio features
            try:
                # MFCC features (mel-frequency cepstral coefficients)
                mfccs = librosa.feature.mfcc(y=audio_numpy, sr=self.sample_rate, n_mfcc=13)
                mfcc_mean = np.mean(mfccs, axis=1)
                mfcc_std = np.std(mfccs, axis=1)
                # Spectral features
                spectral_centroids = librosa.feature.spectral_centroid(y=audio_numpy, sr=self.sample_rate)
                spectral_rolloff = librosa.feature.spectral_rolloff(y=audio_numpy, sr=self.sample_rate)
                spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio_numpy, sr=self.sample_rate)
                zero_crossing_rate = librosa.feature.zero_crossing_rate(audio_numpy)
                # Pitch features
                pitches, magnitudes = librosa.piptrack(y=audio_numpy, sr=self.sample_rate)
                pitch_mean = np.mean(pitches[pitches > 0]) if np.any(pitches > 0) else 0
                # Chroma features
                chroma = librosa.feature.chroma_stft(y=audio_numpy, sr=self.sample_rate)
                chroma_mean = np.mean(chroma, axis=1)
                # Combine all features (13 + 13 + 4 + 1 + 12 = 43 values)
                features = np.concatenate([
                    mfcc_mean,
                    mfcc_std,
                    [np.mean(spectral_centroids)],
                    [np.mean(spectral_rolloff)],
                    [np.mean(spectral_bandwidth)],
                    [np.mean(zero_crossing_rate)],
                    [pitch_mean],
                    chroma_mean,
                ])
                print(f"Extracted {len(features)} audio features")
            except Exception as e:
                print(f"Error extracting features: {e}")
                # Simple fallback feature extraction
                features = np.array([
                    np.mean(audio_numpy),
                    np.std(audio_numpy),
                    np.max(audio_numpy),
                    np.min(audio_numpy),
                ])
            # Create a speaker embedding by modifying the default embedding
            base_embedding = self.default_speaker_embeddings.clone()
            # Normalize features
            features_normalized = (features - np.mean(features)) / (np.std(features) + 1e-8)
            # Create a modification vector (pad or truncate to match the embedding size)
            embedding_size = base_embedding.shape[1]  # Should be 512
            if len(features_normalized) > embedding_size:
                modification_vector = features_normalized[:embedding_size]
            else:
                modification_vector = np.pad(
                    features_normalized,
                    (0, embedding_size - len(features_normalized)),
                    'constant', constant_values=0,
                )
            modification_tensor = torch.tensor(modification_vector, dtype=torch.float32).to(self.device)
            # Apply the modification to create a unique speaker embedding;
            # a small factor keeps the result close to a valid x-vector
            speaker_embedding = base_embedding + 0.05 * modification_tensor.unsqueeze(0)
            # L2-normalize the final embedding
            speaker_embedding = torch.nn.functional.normalize(speaker_embedding, p=2, dim=1)
            print("✅ Speaker embedding created successfully!")
            return speaker_embedding, "✅ Voice profile extracted successfully! You can now generate speech in this voice."
        except Exception as e:
            print(f"❌ Error in extract_speaker_embedding: {str(e)}")
            return None, f"❌ Error processing audio: {str(e)}"
    def synthesize_speech(self, text, use_cloned_voice=True):
        """Convert text to speech using the selected voice"""
        try:
            if not text.strip():
                return None, "❌ Please enter some text to convert."
            # Limit text length for memory efficiency
            if len(text) > 500:
                text = text[:500]
                print("Text truncated to 500 characters for memory efficiency")
            print(f"Synthesizing speech for text: '{text[:50]}...'")
            # Choose the speaker embedding
            if use_cloned_voice and self.user_speaker_embeddings is not None:
                speaker_embeddings = self.user_speaker_embeddings
                voice_type = "your cloned voice"
                print("Using cloned voice")
            else:
                speaker_embeddings = self.default_speaker_embeddings
                voice_type = "default voice"
                print("Using default voice")
            # Tokenize the text
            inputs = self.processor(text=text, return_tensors="pt")
            input_ids = inputs["input_ids"].to(self.device)
            print("Generating speech...")
            # Generate speech with memory optimization
            with torch.no_grad():
                # Clear the CUDA cache before generation (no-op on CPU)
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                speech = self.model.generate_speech(
                    input_ids,
                    speaker_embeddings,
                    vocoder=self.vocoder,
                )
            # Convert to numpy
            speech_numpy = speech.cpu().numpy()
            print(f"Generated audio shape: {speech_numpy.shape}")
            # Write the waveform to a temporary WAV file
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                sf.write(tmp_file.name, speech_numpy, self.sample_rate)
                print(f"Audio saved to: {tmp_file.name}")
            # Clean up memory
            del speech, input_ids
            gc.collect()
            return tmp_file.name, f"✅ Speech generated successfully using {voice_type}!"
        except Exception as e:
            print(f"❌ Error in synthesize_speech: {str(e)}")
            return None, f"❌ Error generating speech: {str(e)}"

# Initialize the TTS system
print("🚀 Initializing Voice Cloning TTS System...")
tts_system = VoiceCloningTTS()


def process_voice_upload(audio_file):
    """Process an uploaded voice file"""
    if audio_file is None:
        return "❌ Please upload an audio file first.", gr.update(interactive=False), gr.update(interactive=False)
    try:
        speaker_embedding, message = tts_system.extract_speaker_embedding(audio_file)
        if speaker_embedding is not None:
            tts_system.user_speaker_embeddings = speaker_embedding
            return message, gr.update(interactive=True), gr.update(interactive=True)
        else:
            return message, gr.update(interactive=False), gr.update(interactive=False)
    except Exception as e:
        error_msg = f"❌ Error processing audio: {str(e)}"
        return error_msg, gr.update(interactive=False), gr.update(interactive=False)


def generate_speech(text, use_cloned_voice):
    """Generate speech from text"""
    if not text.strip():
        return None, "❌ Please enter some text to convert."
    try:
        audio_file, message = tts_system.synthesize_speech(text, use_cloned_voice)
        return audio_file, message
    except Exception as e:
        error_msg = f"❌ Error generating speech: {str(e)}"
        return None, error_msg


def clear_voice_profile():
    """Clear the uploaded voice profile"""
    tts_system.user_speaker_embeddings = None
    return ("🔄 Voice profile cleared. Upload a new audio file to clone a voice.",
            gr.update(interactive=False),
            gr.update(interactive=False))


def update_generate_button(text, use_cloned):
    """Enable the generate button only when its inputs are ready"""
    text_ready = bool(text.strip())
    voice_ready = (not use_cloned) or (tts_system.user_speaker_embeddings is not None)
    return gr.update(interactive=text_ready and voice_ready)
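
# Note: gr.update(...) returns a lightweight property update that Gradio applies to
# the corresponding output component without rebuilding it; a handler returning
# gr.update(interactive=False) simply disables its target button or checkbox.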

# Create the Gradio interface, optimized for HF Spaces
with gr.Blocks(
    title="🎤 Voice Cloning TTS System",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1000px !important;
        margin: auto !important;
    }
    .header {
        text-align: center;
        margin-bottom: 30px;
        padding: 20px;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        border-radius: 15px;
        color: white;
    }
    .step-box {
        border: 2px solid #e1e5e9;
        border-radius: 12px;
        padding: 20px;
        margin: 15px 0;
        background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    }
    .tips-box {
        background: linear-gradient(135deg, #ffecd2 0%, #fcb69f 100%);
        border-radius: 12px;
        padding: 20px;
        margin: 20px 0;
        border-left: 5px solid #ff6b6b;
    }
    """,
) as demo:
    gr.HTML("""
    <div class="header">
        <h1>🎤 AI Voice Cloning TTS System</h1>
        <p>🎙 Upload your voice sample and convert any text to speech in YOUR voice!</p>
        <p>✨ Powered by Microsoft SpeechT5 & Advanced Voice Analysis</p>
    </div>
    """)
    with gr.Row():
        with gr.Column(scale=1):
            gr.HTML('<div class="step-box"><h3>🎙️ Step 1: Upload Your Voice Sample</h3><p>Record or upload 10-30 seconds of clear English speech</p></div>')
            voice_upload = gr.Audio(
                label="🎤 Voice Sample (English)",
                type="filepath",
                sources=["upload", "microphone"],
                format="wav",
            )
            upload_status = gr.Textbox(
                label="📊 Voice Analysis Status",
                interactive=False,
                value="⏳ Please upload an audio file to extract your voice profile.",
                lines=2,
            )
            clear_btn = gr.Button("🗑️ Clear Voice Profile", variant="secondary", size="sm")
        with gr.Column(scale=1):
            gr.HTML('<div class="step-box"><h3>✍️ Step 2: Enter Your Text</h3><p>Type the text you want to convert to speech</p></div>')
            text_input = gr.Textbox(
                label="📝 Text to Convert (Max 500 characters)",
                placeholder="Enter the text you want to convert to speech using your cloned voice...",
                lines=5,
                max_lines=8,
            )
            use_cloned_voice = gr.Checkbox(
                label="🎭 Use My Cloned Voice",
                value=True,
                interactive=False,
                info="Uncheck to use the default voice",
            )
            generate_btn = gr.Button(
                "🎵 Generate Speech",
                variant="primary",
                interactive=False,
                size="lg",
            )
    gr.HTML('<div class="step-box"><h3>🔊 Step 3: Your Generated Speech</h3></div>')
    with gr.Row():
        with gr.Column():
            output_audio = gr.Audio(
                label="🎧 Generated Speech Audio",
                type="filepath",
                interactive=False,
            )
            generation_status = gr.Textbox(
                label="⚡ Generation Status",
                interactive=False,
                lines=2,
            )
    # Tips and information section
    gr.HTML("""
    <div class="tips-box">
        <h3>💡 Pro Tips for Best Results:</h3>
        <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin-top: 15px;">
            <div>
                <h4>🎤 Voice Sample Quality:</h4>
                <ul>
                    <li>Use clear, natural English speech</li>
                    <li>10-30 seconds duration is optimal</li>
                    <li>Minimize background noise</li>
                    <li>Speak at a normal pace and volume</li>
                </ul>
            </div>
            <div>
                <h4>📝 Text Guidelines:</h4>
                <ul>
                    <li>English text works best</li>
                    <li>Keep sentences natural and clear</li>
                    <li>Avoid very long paragraphs</li>
                    <li>Punctuation helps with intonation</li>
                </ul>
            </div>
        </div>
        <div style="margin-top: 15px; padding: 10px; background: rgba(255,255,255,0.7); border-radius: 8px;">
            <strong>🔬 How it works:</strong> The system analyzes your voice's unique characteristics (pitch, tone, formants)
            and creates a personalized voice profile that is used to generate speech that sounds like you!
        </div>
    </div>
    """)
    # Event handlers with proper state management
    voice_upload.change(
        fn=process_voice_upload,
        inputs=[voice_upload],
        outputs=[upload_status, use_cloned_voice, generate_btn],
    )
    text_input.change(
        fn=update_generate_button,
        inputs=[text_input, use_cloned_voice],
        outputs=[generate_btn],
    )
    use_cloned_voice.change(
        fn=update_generate_button,
        inputs=[text_input, use_cloned_voice],
        outputs=[generate_btn],
    )
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, use_cloned_voice],
        outputs=[output_audio, generation_status],
    )
    clear_btn.click(
        fn=clear_voice_profile,
        outputs=[upload_status, use_cloned_voice, generate_btn],
    )

# Launch configuration for Hugging Face Spaces
if __name__ == "__main__":
    print("🚀 Starting Voice Cloning TTS System on Hugging Face Spaces...")
    # Spaces serves the app itself, so no share link is needed;
    # share=True is ignored (with a warning) when running on Spaces
    demo.launch()
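
# The Space is assumed to declare its dependencies in a requirements.txt alongside
# this file, roughly as follows (versions unpinned here for illustration):
#
#   gradio
#   torch
#   torchaudio
#   transformers
#   datasets
#   librosa
#   soundfile
#   sentencepiece  # needed by the SpeechT5 tokenizer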