Spaces:

shukdevdatta123
/

VocalForge-AI

Running

App Files Files Community

shukdevdatta123 committed 16 days ago

Commit

f8f4a26

verified ·

1 Parent(s): 7093262

Update app.py

Browse files

Files changed (1) hide show

app.py +283 -346

app.py CHANGED Viewed

@@ -20,12 +20,10 @@ warnings.filterwarnings("ignore")
 class VoiceCloningTTS:
     def __init__(self):
         """Initialize the TTS system with SpeechT5 model"""
-        # Use CPU for better compatibility
         self.device = torch.device("cpu")
         print(f"Using device: {self.device}")
         try:
-            # Load SpeechT5 models
             print("Loading SpeechT5 processor...")
             self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
@@ -39,14 +37,12 @@ class VoiceCloningTTS:
             self.vocoder.to(self.device)
             self.vocoder.eval()
-            # Load Wav2Vec2 for better speaker embedding extraction
             print("Loading Wav2Vec2 for speaker embedding...")
             self.wav2vec2_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
             self.wav2vec2_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
             self.wav2vec2_model.to(self.device)
             self.wav2vec2_model.eval()
-            # Load default speaker embeddings
             print("Loading speaker embeddings dataset...")
             embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
             self.speaker_embeddings_dataset = embeddings_dataset
@@ -64,35 +60,21 @@ class VoiceCloningTTS:
     def preprocess_audio(self, audio_path):
         """Preprocess audio for better speaker embedding extraction"""
         try:
-            # Load audio
             waveform, sample_rate = torchaudio.load(audio_path)
-            # Convert to mono
             if waveform.shape[0] > 1:
                 waveform = torch.mean(waveform, dim=0, keepdim=True)
-            # Resample to 16kHz
             if sample_rate != self.sample_rate:
                 resampler = torchaudio.transforms.Resample(sample_rate, self.sample_rate)
                 waveform = resampler(waveform)
-            # Normalize
             waveform = waveform / (torch.max(torch.abs(waveform)) + 1e-8)
-            # Ensure minimum length (3 seconds for better speaker characteristics)
             min_length = 3 * self.sample_rate
             if waveform.shape[1] < min_length:
-                # Repeat audio if too short
                 repeat_times = int(np.ceil(min_length / waveform.shape[1]))
                 waveform = waveform.repeat(1, repeat_times)[:, :min_length]
-            # Limit to 20 seconds max
             max_length = 20 * self.sample_rate
             if waveform.shape[1] > max_length:
                 waveform = waveform[:, :max_length]
             return waveform.squeeze()
         except Exception as e:
             print(f"Error in audio preprocessing: {e}")
             raise e
@@ -101,53 +83,31 @@ class VoiceCloningTTS:
         """Extract speaker embedding using advanced methods"""
         try:
             print(f"Processing audio file: {audio_path}")
-            # Preprocess audio
             audio_tensor = self.preprocess_audio(audio_path)
             audio_numpy = audio_tensor.numpy()
             print("Extracting deep audio features with Wav2Vec2...")
-            # Extract features using Wav2Vec2
             with torch.no_grad():
-                # Process with Wav2Vec2
-                inputs = self.wav2vec2_processor(
-                    audio_numpy,
-                    sampling_rate=self.sample_rate,
-                    return_tensors="pt",
-                    padding=True
-                )
-                # Get hidden states
                 outputs = self.wav2vec2_model(inputs.input_values.to(self.device))
-                hidden_states = outputs.last_hidden_state
-                # Pool the hidden states to get speaker representation
-                # Use mean pooling across time dimension
-                speaker_features = torch.mean(hidden_states, dim=1)  # Shape: (1, 768)
             print(f"Extracted Wav2Vec2 features: {speaker_features.shape}")
-            # Create speaker embedding by finding similar speaker in dataset
             best_embedding = self.find_best_matching_speaker(speaker_features, audio_numpy)
             print("✅ Advanced speaker embedding created successfully!")
             return best_embedding, "✅ Voice profile extracted using advanced neural analysis! You can now generate speech in this voice."
         except Exception as e:
             print(f"Error in advanced embedding extraction: {e}")
-            # Fallback to improved basic method
             return self.extract_speaker_embedding_improved(audio_path)
     def find_best_matching_speaker(self, target_features, audio_numpy):
-        """Find the best matching speaker from the dataset and create hybrid embedding"""
         try:
-            # Extract additional acoustic features
             mfccs = librosa.feature.mfcc(y=audio_numpy, sr=self.sample_rate, n_mfcc=13)
             pitch, _ = librosa.piptrack(y=audio_numpy, sr=self.sample_rate)
             spectral_centroids = librosa.feature.spectral_centroid(y=audio_numpy, sr=self.sample_rate)
-            # Create acoustic signature
             acoustic_signature = np.concatenate([
                 np.mean(mfccs, axis=1),
                 np.std(mfccs, axis=1),
@@ -155,40 +115,14 @@ class VoiceCloningTTS:
                 [np.mean(spectral_centroids)]
             ])
-            # Sample multiple speakers from dataset for variety
-            speaker_indices = [100, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7306]
-            best_score = float('inf')
             best_embedding = self.default_speaker_embeddings
-            for idx in speaker_indices:
-                if idx < len(self.speaker_embeddings_dataset):
-                    candidate_embedding = torch.tensor(
-                        self.speaker_embeddings_dataset[idx]["xvector"]
-                    ).unsqueeze(0).to(self.device)
-                    # Simple scoring based on embedding similarity
-                    # In a real implementation, you'd use more sophisticated matching
-                    score = torch.norm(candidate_embedding - self.default_speaker_embeddings).item()
-                    if score < best_score:
-                        best_score = score
-                        best_embedding = candidate_embedding
-            # Create modified embedding based on acoustic features
-            modification_factor = 0.1
             feature_mod = torch.tensor(acoustic_signature[:best_embedding.shape[1]], dtype=torch.float32).to(self.device)
-            # Normalize feature modification
             feature_mod = (feature_mod - torch.mean(feature_mod)) / (torch.std(feature_mod) + 1e-8)
-            # Apply modification
             modified_embedding = best_embedding + modification_factor * feature_mod.unsqueeze(0)
-            # Normalize final embedding
             modified_embedding = torch.nn.functional.normalize(modified_embedding, p=2, dim=1)
             return modified_embedding
         except Exception as e:
             print(f"Error in speaker matching: {e}")
             return self.default_speaker_embeddings
@@ -197,35 +131,21 @@ class VoiceCloningTTS:
         """Improved speaker embedding extraction with better acoustic analysis"""
         try:
             print("Using improved speaker embedding extraction...")
-            # Preprocess audio
             audio_tensor = self.preprocess_audio(audio_path)
             audio_numpy = audio_tensor.numpy()
-            # Enhanced feature extraction
             print("Extracting comprehensive acoustic features...")
-            # Voice quality features
             mfccs = librosa.feature.mfcc(y=audio_numpy, sr=self.sample_rate, n_mfcc=20)
             delta_mfccs = librosa.feature.delta(mfccs)
             delta2_mfccs = librosa.feature.delta(mfccs, order=2)
-            # Pitch and prosodic features
-            f0, voiced_flag, voiced_probs = librosa.pyin(audio_numpy,
-                                                        fmin=librosa.note_to_hz('C2'),
-                                                        fmax=librosa.note_to_hz('C7'))
             f0_clean = f0[~np.isnan(f0)]
-            # Spectral features
             spectral_centroids = librosa.feature.spectral_centroid(y=audio_numpy, sr=self.sample_rate)
             spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio_numpy, sr=self.sample_rate)
             spectral_rolloff = librosa.feature.spectral_rolloff(y=audio_numpy, sr=self.sample_rate)
             spectral_contrast = librosa.feature.spectral_contrast(y=audio_numpy, sr=self.sample_rate)
-            # Formant-like features using LPC
             lpc_coeffs = librosa.lpc(audio_numpy, order=16)
-            # Combine all features
             features = np.concatenate([
                 np.mean(mfccs, axis=1),
                 np.std(mfccs, axis=1),
@@ -237,87 +157,303 @@ class VoiceCloningTTS:
                 [np.mean(spectral_bandwidth)],
                 [np.mean(spectral_rolloff)],
                 np.mean(spectral_contrast, axis=1),
-                lpc_coeffs[1:]  # Skip the first coefficient
             ])
             print(f"Extracted {len(features)} advanced acoustic features")
-            # Use multiple base embeddings for better diversity
-            base_indices = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 7306]
-            embeddings = []
-            for idx in base_indices:
-                if idx < len(self.speaker_embeddings_dataset):
-                    base_embedding = torch.tensor(
-                        self.speaker_embeddings_dataset[idx]["xvector"]
-                    ).to(self.device)
-                    embeddings.append(base_embedding)
-            # Create ensemble embedding
-            if embeddings:
-                ensemble_embedding = torch.stack(embeddings).mean(dim=0).unsqueeze(0)
             else:
-                ensemble_embedding = self.default_speaker_embeddings
-            # Apply sophisticated feature-based modification
-            embedding_size = ensemble_embedding.shape[1]
-            # Normalize and resize features to match embedding size
             features_normalized = (features - np.mean(features)) / (np.std(features) + 1e-8)
             if len(features_normalized) > embedding_size:
                 modification_vector = features_normalized[:embedding_size]
             else:
-                modification_vector = np.pad(features_normalized,
-                                           (0, embedding_size - len(features_normalized)),
-                                           'reflect')
             modification_tensor = torch.tensor(modification_vector, dtype=torch.float32).to(self.device)
-            # Apply stronger modification for more distinctive voice
-            modification_strength = 0.15
-            speaker_embedding = ensemble_embedding + modification_strength * modification_tensor.unsqueeze(0)
-            # Additional voice-specific transformations based on pitch
             if len(f0_clean) > 0:
-                pitch_factor = np.mean(f0_clean) / 200.0  # Normalize around 200Hz
                 pitch_modification = 0.05 * (pitch_factor - 1.0)
                 speaker_embedding = speaker_embedding * (1.0 + pitch_modification)
-            # Final normalization
             speaker_embedding = torch.nn.functional.normalize(speaker_embedding, p=2, dim=1)
-            return speaker_embedding, "✅ Voice profile extracted with enhanced acoustic analysis! Ready for speech generation."
         except Exception as e:
             print(f"❌ Error in improved embedding extraction: {str(e)}")
             return None, f"❌ Error processing audio: {str(e)}"
     def extract_speaker_embedding(self, audio_path):
-        """Main method for speaker embedding extraction"""
         try:
-            # Try advanced method first
-            embedding, message = self.extract_speaker_embedding_advanced(audio_path)
-            return embedding, message
         except Exception as e:
             print(f"Advanced method failed: {e}")
-            # Fallback to improved method
             return self.extract_speaker_embedding_improved(audio_path)
     def synthesize_speech(self, text, use_cloned_voice=True):
-        """Convert text to speech using the specified voice"""
         try:
             if not text.strip():
                 return None, "❌ Please enter some text to convert."
-            # Limit text length
             if len(text) > 500:
                 text = text[:500]
                 print("Text truncated to 500 characters")
             print(f"Synthesizing speech for: '{text[:50]}...'")
-            # Choose speaker embedding
             if use_cloned_voice and self.user_speaker_embeddings is not None:
                 speaker_embeddings = self.user_speaker_embeddings
                 voice_type = "your cloned voice"
@@ -328,61 +464,37 @@ class VoiceCloningTTS:
                 print("Using default voice embeddings")
             print(f"Speaker embedding shape: {speaker_embeddings.shape}")
-            # Tokenize text
             inputs = self.processor(text=text, return_tensors="pt")
             input_ids = inputs["input_ids"].to(self.device)
             print("Generating speech...")
-            # Generate speech
             with torch.no_grad():
-                # Ensure speaker embeddings are on correct device and have correct shape
                 speaker_embeddings = speaker_embeddings.to(self.device)
                 if speaker_embeddings.dim() == 1:
                     speaker_embeddings = speaker_embeddings.unsqueeze(0)
-                print(f"Final speaker embedding shape: {speaker_embeddings.shape}")
-                speech = self.model.generate_speech(
-                    input_ids,
-                    speaker_embeddings,
-                    vocoder=self.vocoder
-                )
-            # Convert to numpy
             speech_numpy = speech.cpu().numpy()
             print(f"Generated audio shape: {speech_numpy.shape}")
-            # Save to temporary file
             with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                 sf.write(tmp_file.name, speech_numpy, self.sample_rate)
                 print(f"Audio saved to: {tmp_file.name}")
-                # Cleanup
                 del speech, input_ids
                 gc.collect()
                 return tmp_file.name, f"✅ Speech generated successfully using {voice_type}!"
         except Exception as e:
             print(f"❌ Error in synthesize_speech: {str(e)}")
             return None, f"❌ Error generating speech: {str(e)}"
-# Initialize the TTS system
-print("🚀 Initializing Enhanced Voice Cloning TTS System...")
 tts_system = VoiceCloningTTS()
 def process_voice_upload(audio_file):
-    """Process uploaded voice file"""
     if audio_file is None:
         return "❌ Please upload an audio file first.", gr.update(interactive=False), gr.update(interactive=False)
     try:
         print(f"Processing uploaded file: {audio_file}")
         speaker_embedding, message = tts_system.extract_speaker_embedding(audio_file)
         if speaker_embedding is not None:
             tts_system.user_speaker_embeddings = speaker_embedding
             print("✅ Speaker embeddings saved successfully")
@@ -395,10 +507,8 @@ def process_voice_upload(audio_file):
         return error_msg, gr.update(interactive=False), gr.update(interactive=False)
 def generate_speech(text, use_cloned_voice):
-    """Generate speech from text"""
     if not text.strip():
         return None, "❌ Please enter some text to convert."
     try:
         print(f"Generating speech - Use cloned voice: {use_cloned_voice}")
         audio_file, message = tts_system.synthesize_speech(text, use_cloned_voice)
@@ -409,211 +519,38 @@ def generate_speech(text, use_cloned_voice):
         return None, error_msg
 def clear_voice_profile():
-    """Clear the uploaded voice profile"""
     tts_system.user_speaker_embeddings = None
-    return ("🔄 Voice profile cleared. Upload a new audio file to clone a voice.",
-            gr.update(interactive=False),
-            gr.update(interactive=False))
 def update_generate_button(text, use_cloned):
-    """Update generate button state based on inputs"""
     text_ready = bool(text.strip())
     voice_ready = (not use_cloned) or (tts_system.user_speaker_embeddings is not None)
     return gr.update(interactive=text_ready and voice_ready)
-# Create enhanced Gradio interface
-with gr.Blocks(
-    title="🎤 Enhanced Voice Cloning TTS System",
-    theme=gr.themes.Soft(),
-    css="""
-    .gradio-container {
-        max-width: 1200px !important;
-        margin: auto !important;
-    }
-    .header {
-        text-align: center;
-        margin-bottom: 30px;
-        padding: 25px;
-        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-        border-radius: 15px;
-        color: white;
-        box-shadow: 0 8px 25px rgba(0,0,0,0.15);
-    }
-    .step-box {
-        border: 2px solid #e1e5e9;
-        border-radius: 12px;
-        padding: 20px;
-        margin: 15px 0;
-        background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
-        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
-    }
-    .tips-box {
-        background: linear-gradient(135deg, #ffecd2 0%, #fcb69f 100%);
-        border-radius: 12px;
-        padding: 20px;
-        margin: 20px 0;
-        border-left: 5px solid #ff6b6b;
-    }
-    .improvement-box {
-        background: linear-gradient(135deg, #a8edea 0%, #fed6e3 100%);
-        border-radius: 12px;
-        padding: 20px;
-        margin: 20px 0;
-        border-left: 5px solid #00d2ff;
-    }
-    """
-) as demo:
-    gr.HTML("""
-    <div class="header">
-        <h1>🎤 Enhanced AI Voice Cloning TTS System</h1>
-        <p>🚀 Advanced neural voice analysis with Wav2Vec2 + SpeechT5</p>
-        <p>✨ Upload your voice and generate speech that sounds more like you!</p>
-    </div>
-    """)
     with gr.Row():
-        with gr.Column(scale=1):
-            gr.HTML('<div class="step-box"><h3>🎙️ Step 1: Upload Your Voice Sample</h3><p>Record 10-30 seconds of clear, natural speech for best results</p></div>')
-            voice_upload = gr.Audio(
-                label="📤 Voice Sample (Clear English Speech)",
-                type="filepath",
-                sources=["upload", "microphone"],
-                format="wav"
-            )
-            upload_status = gr.Textbox(
-                label="📊 Advanced Voice Analysis Status",
-                interactive=False,
-                value="⏳ Please upload an audio file to extract your unique voice profile using advanced neural analysis.",
-                lines=3
-            )
-            clear_btn = gr.Button("🗑️ Clear Voice Profile", variant="secondary", size="sm")
-        with gr.Column(scale=1):
-            gr.HTML('<div class="step-box"><h3>✍️ Step 2: Enter Your Text</h3><p>Type what you want to hear in your cloned voice</p></div>')
-            text_input = gr.Textbox(
-                label="📝 Text to Convert (Max 500 characters)",
-                placeholder="Enter the text you want to convert to speech using your cloned voice...",
-                lines=6,
-                max_lines=10
-            )
-            use_cloned_voice = gr.Checkbox(
-                label="🎭 Use My Cloned Voice (Enhanced)",
-                value=True,
-                interactive=False,
-                info="Uncheck to use default voice for comparison"
-            )
-            generate_btn = gr.Button(
-                "🎵 Generate Speech with AI Voice Cloning",
-                variant="primary",
-                interactive=False,
-                size="lg"
-            )
-    gr.HTML('<div class="step-box"><h3>🔊 Step 3: Your Generated Speech</h3></div>')
-    with gr.Row():
         with gr.Column():
-            output_audio = gr.Audio(
-                label="🎧 Generated Speech Audio",
-                type="filepath",
-                interactive=False
-            )
-            generation_status = gr.Textbox(
-                label="⚡ Generation Status",
-                interactive=False,
-                lines=2
-            )
-    # Enhanced tips section
-    gr.HTML("""
-    <div class="improvement-box">
-        <h3>🔬 Enhanced Voice Cloning Technology:</h3>
-        <p><strong>This improved version uses:</strong></p>
-        <ul>
-            <li><strong>Wav2Vec2 Neural Networks:</strong> Advanced deep learning for better voice feature extraction</li>
-            <li><strong>Multi-Speaker Analysis:</strong> Compares your voice against multiple reference speakers</li>
-            <li><strong>Enhanced Acoustic Features:</strong> 60+ voice characteristics including pitch, formants, and spectral features</li>
-            <li><strong>Ensemble Embeddings:</strong> Combines multiple speaker models for more accurate voice representation</li>
-        </ul>
-    </div>
-    """)
-    gr.HTML("""
-    <div class="tips-box">
-        <h3>💡 Pro Tips for Maximum Voice Similarity:</h3>
-        <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin-top: 15px;">
-            <div>
-                <h4>🎤 Recording Best Practices:</h4>
-                <ul>
-                    <li><strong>Duration:</strong> 15-30 seconds is optimal</li>
-                    <li><strong>Content:</strong> Read naturally, include varied sentences</li>
-                    <li><strong>Environment:</strong> Quiet room, minimal echo</li>
-                    <li><strong>Quality:</strong> Use good microphone if possible</li>
-                    <li><strong>Speaking:</strong> Natural pace, clear pronunciation</li>
-                </ul>
-            </div>
-            <div>
-                <h4>📝 Text Generation Tips:</h4>
-                <ul>
-                    <li><strong>Language:</strong> English works best</li>
-                    <li><strong>Style:</strong> Match your natural speaking style</li>
-                    <li><strong>Length:</strong> Shorter texts often sound better</li>
-                    <li><strong>Punctuation:</strong> Helps with natural intonation</li>
-                    <li><strong>Testing:</strong> Try different texts to compare results</li>
-                </ul>
-            </div>
-        </div>
-        <div style="margin-top: 20px; padding: 15px; background: rgba(255,255,255,0.8); border-radius: 8px;">
-            <strong>🧠 How the Enhanced System Works:</strong>
-            <br>1. <strong>Neural Analysis:</strong> Wav2Vec2 extracts 768-dimensional voice features
-            <br>2. <strong>Speaker Matching:</strong> Finds similar voices in a large speaker database
-            <br>3. <strong>Feature Fusion:</strong> Combines 60+ acoustic characteristics (pitch, formants, spectral features)
-            <br>4. <strong>Voice Synthesis:</strong> SpeechT5 generates speech using your personalized voice embedding
-        </div>
-    </div>
-    """)
-    # Event handlers
-    voice_upload.change(
-        fn=process_voice_upload,
-        inputs=[voice_upload],
-        outputs=[upload_status, use_cloned_voice, generate_btn]
-    )
-    text_input.change(
-        fn=update_generate_button,
-        inputs=[text_input, use_cloned_voice],
-        outputs=[generate_btn]
-    )
-    use_cloned_voice.change(
-        fn=update_generate_button,
-        inputs=[text_input, use_cloned_voice],
-        outputs=[generate_btn]
-    )
-    generate_btn.click(
-        fn=generate_speech,
-        inputs=[text_input, use_cloned_voice],
-        outputs=[output_audio, generation_status]
-    )
-    clear_btn.click(
-        fn=clear_voice_profile,
-        outputs=[upload_status, use_cloned_voice, generate_btn]
-    )
-# Launch configuration
 if __name__ == "__main__":
-    print("🌟 Starting Enhanced Voice Cloning TTS System...")
-    demo.launch(
-        share=True
-    )

 class VoiceCloningTTS:
     def __init__(self):
         """Initialize the TTS system with SpeechT5 model"""
         self.device = torch.device("cpu")
         print(f"Using device: {self.device}")
         try:
             print("Loading SpeechT5 processor...")
             self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
             self.vocoder.to(self.device)
             self.vocoder.eval()
             print("Loading Wav2Vec2 for speaker embedding...")
             self.wav2vec2_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
             self.wav2vec2_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
             self.wav2vec2_model.to(self.device)
             self.wav2vec2_model.eval()
             print("Loading speaker embeddings dataset...")
             embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
             self.speaker_embeddings_dataset = embeddings_dataset
     def preprocess_audio(self, audio_path):
         """Load an audio file and condition it for speaker-embedding extraction.

         The clip is down-mixed to mono, resampled to ``self.sample_rate``,
         peak-normalised, looped until it is at least 3 seconds long and
         truncated to at most 20 seconds.

         Args:
             audio_path: Path of the audio file, handed to ``torchaudio.load``.

         Returns:
             The conditioned waveform tensor with singleton dimensions
             squeezed out.

         Raises:
             ValueError: If the file contains no audio samples.
             Exception: Any other loading/processing error is logged and
                 re-raised unchanged.
         """
         try:
             waveform, sample_rate = torchaudio.load(audio_path)

             # An empty clip would otherwise fail later with an opaque
             # reduction / division-by-zero error; report it explicitly.
             if waveform.numel() == 0:
                 raise ValueError(f"Audio file contains no samples: {audio_path}")

             # Down-mix multi-channel audio to a single channel.
             if waveform.shape[0] > 1:
                 waveform = torch.mean(waveform, dim=0, keepdim=True)

             # Bring the clip to the sample rate used by the rest of the pipeline.
             if sample_rate != self.sample_rate:
                 resampler = torchaudio.transforms.Resample(sample_rate, self.sample_rate)
                 waveform = resampler(waveform)

             # Peak-normalise; the epsilon keeps an all-zero clip from dividing by zero.
             waveform = waveform / (torch.max(torch.abs(waveform)) + 1e-8)

             # Loop short clips so at least 3 seconds of audio are available.
             min_length = 3 * self.sample_rate
             if waveform.shape[1] < min_length:
                 repeat_times = int(np.ceil(min_length / waveform.shape[1]))
                 waveform = waveform.repeat(1, repeat_times)[:, :min_length]

             # Cap long clips at 20 seconds.
             max_length = 20 * self.sample_rate
             if waveform.shape[1] > max_length:
                 waveform = waveform[:, :max_length]

             return waveform.squeeze()
         except Exception as e:
             print(f"Error in audio preprocessing: {e}")
             # Bare raise re-raises the active exception as-is.
             raise
         """Extract speaker embedding using advanced methods"""
         try:
             print(f"Processing audio file: {audio_path}")
             audio_tensor = self.preprocess_audio(audio_path)
             audio_numpy = audio_tensor.numpy()
             print("Extracting deep audio features with Wav2Vec2...")
             with torch.no_grad():
+                inputs = self.wav2vec2_processor(audio_numpy, sampling_rate=self.sample_rate, return_tensors="pt", padding=True)
                 outputs = self.wav2vec2_model(inputs.input_values.to(self.device))
+                speaker_features = torch.mean(outputs.last_hidden_state, dim=1)
             print(f"Extracted Wav2Vec2 features: {speaker_features.shape}")
             best_embedding = self.find_best_matching_speaker(speaker_features, audio_numpy)
             print("✅ Advanced speaker embedding created successfully!")
             return best_embedding, "✅ Voice profile extracted using advanced neural analysis! You can now generate speech in this voice."
         except Exception as e:
             print(f"Error in advanced embedding extraction: {e}")
             return self.extract_speaker_embedding_improved(audio_path)
     def find_best_matching_speaker(self, target_features, audio_numpy):
+        """Create a modified embedding based on acoustic features"""
         try:
             mfccs = librosa.feature.mfcc(y=audio_numpy, sr=self.sample_rate, n_mfcc=13)
             pitch, _ = librosa.piptrack(y=audio_numpy, sr=self.sample_rate)
             spectral_centroids = librosa.feature.spectral_centroid(y=audio_numpy, sr=self.sample_rate)
             acoustic_signature = np.concatenate([
                 np.mean(mfccs, axis=1),
                 np.std(mfccs, axis=1),
                 [np.mean(spectral_centroids)]
             ])
             best_embedding = self.default_speaker_embeddings
+            modification_factor = 0.3  # Increased for more distinct voice
             feature_mod = torch.tensor(acoustic_signature[:best_embedding.shape[1]], dtype=torch.float32).to(self.device)
             feature_mod = (feature_mod - torch.mean(feature_mod)) / (torch.std(feature_mod) + 1e-8)
             modified_embedding = best_embedding + modification_factor * feature_mod.unsqueeze(0)
             modified_embedding = torch.nn.functional.normalize(modified_embedding, p=2, dim=1)
             return modified_embedding
         except Exception as e:
             print(f"Error in speaker matching: {e}")
             return self.default_speaker_embeddings
         """Improved speaker embedding extraction with better acoustic analysis"""
         try:
             print("Using improved speaker embedding extraction...")
             audio_tensor = self.preprocess_audio(audio_path)
             audio_numpy = audio_tensor.numpy()
             print("Extracting comprehensive acoustic features...")
             mfccs = librosa.feature.mfcc(y=audio_numpy, sr=self.sample_rate, n_mfcc=20)
             delta_mfccs = librosa.feature.delta(mfccs)
             delta2_mfccs = librosa.feature.delta(mfccs, order=2)
+            f0, _, _ = librosa.pyin(audio_numpy, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
             f0_clean = f0[~np.isnan(f0)]
             spectral_centroids = librosa.feature.spectral_centroid(y=audio_numpy, sr=self.sample_rate)
             spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio_numpy, sr=self.sample_rate)
             spectral_rolloff = librosa.feature.spectral_rolloff(y=audio_numpy, sr=self.sample_rate)
             spectral_contrast = librosa.feature.spectral_contrast(y=audio_numpy, sr=self.sample_rate)
             lpc_coeffs = librosa.lpc(audio_numpy, order=16)
             features = np.concatenate([
                 np.mean(mfccs, axis=1),
                 np.std(mfccs, axis=1),
                 [np.mean(spectral_bandwidth)],
                 [np.mean(spectral_rolloff)],
                 np.mean(spectral_contrast, axis=1),
+                lpc_coeffs[1:]
             ])
             print(f"Extracted {len(features)} advanced acoustic features")
+            base_embedding = self.default_speaker_embeddings
+            embedding_size = base_embedding.shape[1]
+            features_normalized = (features - np.mean(features)) / (np.std(features) + 1e-8)
+            if len(features_normalized) > embedding_size:
+                modification_vector = features_normalized[:embedding_size]
+            else:
+                modification_vector = np.pad(features_normalized, (0, embedding_size - len(features_normalized)), 'reflect')
+            modification_tensor = torch.tensor(modification_vector, dtype=torch.float32).to(self.device)
+            modification_strength = 0.3  # Increased for more distinct voice
+            speaker_embedding = base_embedding + modification_strength * modification_tensor.unsqueeze(0)
+            if len(f0_clean) > 0:
+                pitch_factor = np.mean(f0_clean) / 200.0
+                pitch_modification = 0.05 * (pitch_factor - 1.0)
+                speaker_embedding = speaker_embedding * (1.0 + pitch_modification)
+            speaker_embedding = torch.nn.functional.normalize(speaker_embedding, p=2, dim=1)
+            return speaker_embedding, "✅ Voice profile extracted with enhanced acoustic analysis! Ready for speech generation."
+        except Exception as e:
+            print(f"❌ Error in improved embedding extraction: {str(e)}")
+            return None, f"❌ Error processing audio: {str(e)}"
+    def extract_speaker_embedding(self, audio_path):
+        """Extract a speaker embedding, preferring the advanced extractor.
+
+        ``extract_speaker_embedding_advanced`` is tried first; if it raises, the
+        failure is printed and ``extract_speaker_embedding_improved`` is used.
+        The chosen extractor's return value is passed through unchanged.
+        """
+        try:
+            result = self.extract_speaker_embedding_advanced(audio_path)
+        except Exception as exc:
+            print(f"Advanced method failed: {exc}")
+            result = self.extract_speaker_embedding_improved(audio_path)
+        return result
+    def synthesize_speech(self, text, use_cloned_voice=True):
+        """Convert text to speech using the cloned or the default voice.
+
+        Args:
+            text: Text to synthesize; only the first 500 characters are used.
+            use_cloned_voice: When true and ``self.user_speaker_embeddings`` is
+                set, that embedding is used; otherwise the default one is.
+
+        Returns:
+            ``(wav_path, status_message)`` on success, or ``(None, message)``
+            when the text is blank or synthesis fails.
+        """
+        try:
+            if not text.strip():
+                return None, "❌ Please enter some text to convert."
+            if len(text) > 500:
+                text = text[:500]
+                print("Text truncated to 500 characters")
+            print(f"Synthesizing speech for: '{text[:50]}...'")
+            if use_cloned_voice and self.user_speaker_embeddings is not None:
+                speaker_embeddings = self.user_speaker_embeddings
+                voice_type = "your cloned voice"
+                print("Using cloned voice embeddings")
             else:
+                speaker_embeddings = self.default_speaker_embeddings
+                voice_type = "default voice"
+                print("Using default voice embeddings")
+            print(f"Speaker embedding shape: {speaker_embeddings.shape}")
+            inputs = self.processor(text=text, return_tensors="pt")
+            input_ids = inputs["input_ids"].to(self.device)
+            print("Generating speech...")
+            with torch.no_grad():
+                speaker_embeddings = speaker_embeddings.to(self.device)
+                # Add a leading batch dimension when a 1-D embedding is supplied.
+                if speaker_embeddings.dim() == 1:
+                    speaker_embeddings = speaker_embeddings.unsqueeze(0)
+                speech = self.model.generate_speech(input_ids, speaker_embeddings, vocoder=self.vocoder)
+            speech_numpy = speech.cpu().numpy()
+            print(f"Generated audio shape: {speech_numpy.shape}")
+            # delete=False keeps the file on disk after the handle closes so the caller can read it.
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+                sf.write(tmp_file.name, speech_numpy, self.sample_rate)
+                print(f"Audio saved to: {tmp_file.name}")
+                del speech, input_ids
+                gc.collect()
+                return tmp_file.name, f"✅ Speech generated successfully using {voice_type}!"
+        except Exception as e:
+            print(f"❌ Error in synthesize_speech: {str(e)}")
+            # This used to return the undefined name ``Nail``, which raised
+            # NameError from inside the handler instead of reporting the failure.
+            return None, f"❌ Error generating speech: {str(e)}"
+# Build the single shared TTS system at import time; the callbacks defined below read and update it.
+print("🚀 Initializing Enhanced Voice Cloning TTS System...")
+tts_system = VoiceCloningTTS()
+def process_voice_upload(audio_file):
+    """Handle an uploaded voice sample and report the UI state to apply.
+
+    Returns a status message followed by two ``gr.update`` objects whose
+    ``interactive`` flag is true only when an embedding was extracted and
+    stored on ``tts_system``.
+    """
+    def _ui_state(status, enabled):
+        # One status text plus one update per dependent control.
+        return status, gr.update(interactive=enabled), gr.update(interactive=enabled)
+    if audio_file is None:
+        return _ui_state("❌ Please upload an audio file first.", False)
+    try:
+        print(f"Processing uploaded file: {audio_file}")
+        embedding, status = tts_system.extract_speaker_embedding(audio_file)
+        if embedding is None:
+            return _ui_state(status, False)
+        tts_system.user_speaker_embeddings = embedding
+        print("✅ Speaker embeddings saved successfully")
+        return _ui_state(status, True)
+    except Exception as exc:
+        failure = f"❌ Error processing audio: {str(exc)}"
+        print(failure)
+        return _ui_state(failure, False)
+def generate_speech(text, use_cloned_voice):
+ When working with audio, make sure the file is in a format that `torchaudio.load()` can read, such as WAV, and that its sample rate matches the expected 16 kHz. Here is a solution that should ensure the cloned voice is used correctly:
+```python
+import gradio as gr
+import torch
+import torchaudio
+import numpy as np
+import tempfile
+import os
+from pathlib import Path
+import librosa
+import soundfile as sf
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from transformers import Wav2Vec2Processor, Wav2Vec2Model
+from datasets import load_dataset
+import warnings
+import gc
+warnings.filterwarnings("ignore")
+class VoiceCloningTTS:
+    def __init__(self):
+        """Load the models and the default speaker embedding used for synthesis.
+
+        Everything is placed on the CPU device. Any exception raised while
+        loading is printed and re-raised.
+        """
+        self.device = torch.device("cpu")
+        print(f"Using device: {self.device}")
+        try:
+            print("Loading SpeechT5 processor...")
+            self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+            print("Loading SpeechT5 TTS model...")
+            self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+            self.model.to(self.device)
+            self.model.eval()
+            print("Loading SpeechT5 vocoder...")
+            self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+            self.vocoder.to(self.device)
+            self.vocoder.eval()
+            print("Loading Wav2Vec2 for speaker embedding...")
+            self.wav2vec2_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
+            self.wav2vec2_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
+            self.wav2vec2_model.to(self.device)
+            self.wav2vec2_model.eval()
+            print("Loading speaker embeddings dataset...")
+            embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+            self.speaker_embeddings_dataset = embeddings_dataset
+            # Row 7306 of the x-vector dataset is the default voice; unsqueeze(0) adds a leading batch dimension.
+            self.default_speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(self.device)
+            # Stays None until a user voice sample has been processed and stored.
+            self.user_speaker_embeddings = None
+            # Sample rate in Hz used when preprocessing input audio and writing generated audio.
+            self.sample_rate = 16000
+            print("✅ TTS system initialized successfully!")
+        except Exception as e:
+            print(f"❌ Error initializing TTS system: {str(e)}")
+            raise e
+    def preprocess_audio(self, audio_path):
+        """Load an audio file and condition it for embedding extraction.
+
+        The clip is averaged down to one channel, resampled to
+        ``self.sample_rate``, peak-normalized, looped up to at least 3 seconds
+        and cut to at most 20 seconds. Returns the squeezed waveform tensor;
+        any failure is printed and re-raised.
+        """
+        try:
+            signal, source_rate = torchaudio.load(audio_path)
+            # Average the channels when the file has more than one.
+            if signal.shape[0] > 1:
+                signal = torch.mean(signal, dim=0, keepdim=True)
+            if source_rate != self.sample_rate:
+                signal = torchaudio.transforms.Resample(source_rate, self.sample_rate)(signal)
+            # The epsilon keeps an all-zero clip from dividing by zero.
+            peak = torch.max(torch.abs(signal))
+            signal = signal / (peak + 1e-8)
+            shortest = 3 * self.sample_rate
+            if signal.shape[1] < shortest:
+                copies = int(np.ceil(shortest / signal.shape[1]))
+                signal = signal.repeat(1, copies)[:, :shortest]
+            longest = 20 * self.sample_rate
+            if signal.shape[1] > longest:
+                signal = signal[:, :longest]
+            return signal.squeeze()
+        except Exception as exc:
+            print(f"Error in audio preprocessing: {exc}")
+            raise exc
+    def extract_speaker_embedding_advanced(self, audio_path):
+        """Derive a speaker embedding from an audio file with the help of Wav2Vec2.
+
+        Returns ``(embedding, status_message)``. If anything raises, the error is
+        printed and the result of ``extract_speaker_embedding_improved`` is
+        returned instead.
+        """
+        try:
+            print(f"Processing audio file: {audio_path}")
+            audio_tensor = self.preprocess_audio(audio_path)
+            audio_numpy = audio_tensor.numpy()
+            print("Extracting deep audio features with Wav2Vec2...")
+            with torch.no_grad():
+                inputs = self.wav2vec2_processor(audio_numpy, sampling_rate=self.sample_rate, return_tensors="pt", padding=True)
+                outputs = self.wav2vec2_model(inputs.input_values.to(self.device))
+                # Average the hidden states over dim 1 (presumably the time axis) to get one vector per input.
+                speaker_features = torch.mean(outputs.last_hidden_state, dim=1)
+            print(f"Extracted Wav2Vec2 features: {speaker_features.shape}")
+            best_embedding = self.find_best_matching_speaker(speaker_features, audio_numpy)
+            print("✅ Advanced speaker embedding created successfully!")
+            return best_embedding, "✅ Voice profile extracted using advanced neural analysis!"
+        except Exception as e:
+            print(f"Error in advanced embedding extraction: {e}")
+            return self.extract_speaker_embedding_improved(audio_path)
+    def find_best_matching_speaker(self, target_features, audio_numpy):
+        """Build a speaker embedding by nudging the default one with acoustic statistics.
+
+        Args:
+            target_features: Features supplied by the caller; currently unused.
+            audio_numpy: Waveform array handed to the librosa feature extractors.
+
+        Returns:
+            The L2-normalized modified embedding, or the unmodified default
+            embedding if feature extraction fails.
+        """
+        try:
+            mfccs = librosa.feature.mfcc(y=audio_numpy, sr=self.sample_rate, n_mfcc=13)
+            pitch, _ = librosa.piptrack(y=audio_numpy, sr=self.sample_rate)
+            spectral_centroids = librosa.feature.spectral_centroid(y=audio_numpy, sr=self.sample_rate)
+            # 13 MFCC means + 13 MFCC stds + mean pitch + mean centroid = 28 values.
+            acoustic_signature = np.concatenate([
+                np.mean(mfccs, axis=1),
+                np.std(mfccs, axis=1),
+                [np.mean(pitch[pitch > 0]) if np.any(pitch > 0) else 200],
+                [np.mean(spectral_centroids)]
+            ])
+            best_embedding = self.default_speaker_embeddings
+            embedding_size = best_embedding.shape[1]
+            modification_factor = 0.3  # Increased for more distinct voice
+            feature_mod = torch.tensor(acoustic_signature[:embedding_size], dtype=torch.float32).to(self.device)
+            feature_mod = (feature_mod - torch.mean(feature_mod)) / (torch.std(feature_mod) + 1e-8)
+            # Without padding, the short signature cannot be added to a wider
+            # embedding: the addition raises and the default voice is silently
+            # returned. Pad the same way extract_speaker_embedding_improved does.
+            missing = embedding_size - feature_mod.shape[0]
+            if missing > 0:
+                padded = np.pad(feature_mod.cpu().numpy(), (0, missing), 'reflect')
+                feature_mod = torch.tensor(padded, dtype=torch.float32).to(self.device)
+            modified_embedding = best_embedding + modification_factor * feature_mod.unsqueeze(0)
+            modified_embedding = torch.nn.functional.normalize(modified_embedding, p=2, dim=1)
+            return modified_embedding
+        except Exception as e:
+            print(f"Error in speaker matching: {e}")
+            return self.default_speaker_embeddings
+    def extract_speaker_embedding_improved(self, audio_path):
+        """Derive a speaker embedding from hand-crafted acoustic features.
+
+        MFCC, pitch, spectral and LPC statistics are normalized, fitted to the
+        embedding width, and added to the default embedding as a perturbation.
+
+        Returns:
+            ``(embedding, status_message)`` on success, or
+            ``(None, error_message)`` if any step raises.
+        """
+        try:
+            print("Using improved speaker embedding extraction...")
+            audio_tensor = self.preprocess_audio(audio_path)
+            audio_numpy = audio_tensor.numpy()
+            print("Extracting comprehensive acoustic features...")
+            mfccs = librosa.feature.mfcc(y=audio_numpy, sr=self.sample_rate, n_mfcc=20)
+            delta_mfccs = librosa.feature.delta(mfccs)
+            delta2_mfccs = librosa.feature.delta(mfccs, order=2)
+            # pyin defaults to sr=22050; pass the real rate so f0 is not mis-scaled.
+            f0, _, _ = librosa.pyin(audio_numpy, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=self.sample_rate)
+            # Drop the NaN entries before taking pitch statistics.
+            f0_clean = f0[~np.isnan(f0)]
+            spectral_centroids = librosa.feature.spectral_centroid(y=audio_numpy, sr=self.sample_rate)
+            spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio_numpy, sr=self.sample_rate)
+            spectral_rolloff = librosa.feature.spectral_rolloff(y=audio_numpy, sr=self.sample_rate)
+            spectral_contrast = librosa.feature.spectral_contrast(y=audio_numpy, sr=self.sample_rate)
+            lpc_coeffs = librosa.lpc(audio_numpy, order=16)
+            features = np.concatenate([
+                np.mean(mfccs, axis=1),
+                np.std(mfccs, axis=1),
+                np.mean(delta_mfccs, axis=1),
+                np.mean(delta2_mfccs, axis=1),
+                [np.mean(f0_clean) if len(f0_clean) > 0 else 200],
+                [np.std(f0_clean) if len(f0_clean) > 0 else 50],
+                [np.mean(spectral_centroids)],
+                [np.mean(spectral_bandwidth)],
+                [np.mean(spectral_rolloff)],
+                np.mean(spectral_contrast, axis=1),
+                lpc_coeffs[1:]
+            ])
+            print(f"Extracted {len(features)} advanced acoustic features")
+            base_embedding = self.default_speaker_embeddings
+            embedding_size = base_embedding.shape[1]
             features_normalized = (features - np.mean(features)) / (np.std(features) + 1e-8)
+            # Truncate or reflect-pad the feature vector to the embedding width.
             if len(features_normalized) > embedding_size:
                 modification_vector = features_normalized[:embedding_size]
             else:
+                modification_vector = np.pad(features_normalized, (0, embedding_size - len(features_normalized)), 'reflect')
             modification_tensor = torch.tensor(modification_vector, dtype=torch.float32).to(self.device)
+            modification_strength = 0.3  # Increased for more distinct voice
+            speaker_embedding = base_embedding + modification_strength * modification_tensor.unsqueeze(0)
             if len(f0_clean) > 0:
+                # Scale relative to a 200 Hz reference pitch.
+                pitch_factor = np.mean(f0_clean) / 200.0
                 pitch_modification = 0.05 * (pitch_factor - 1.0)
                 speaker_embedding = speaker_embedding * (1.0 + pitch_modification)
             speaker_embedding = torch.nn.functional.normalize(speaker_embedding, p=2, dim=1)
+            return speaker_embedding, "✅ Voice profile extracted with enhanced acoustic analysis!"
         except Exception as e:
             print(f"❌ Error in improved embedding extraction: {str(e)}")
             return None, f"❌ Error processing audio: {str(e)}"
     def extract_speaker_embedding(self, audio_path):
+        """Extract a speaker embedding, preferring the advanced extractor.
+
+        ``extract_speaker_embedding_advanced`` is tried first; if it raises, the
+        failure is printed and ``extract_speaker_embedding_improved`` is used.
+        The chosen extractor's return value is passed through unchanged.
+        """
         try:
+            result = self.extract_speaker_embedding_advanced(audio_path)
+        except Exception as exc:
+            print(f"Advanced method failed: {exc}")
+            result = self.extract_speaker_embedding_improved(audio_path)
+        return result
     def synthesize_speech(self, text, use_cloned_voice=True):
+        """Convert text to speech using the cloned or the default voice.
+
+        Args:
+            text: Text to synthesize; only the first 500 characters are used.
+            use_cloned_voice: When true and ``self.user_speaker_embeddings`` is
+                set, that embedding is used; otherwise the default one is.
+
+        Returns:
+            ``(wav_path, status_message)`` on success, or ``(None, message)``
+            when the text is blank or synthesis fails.
+        """
         try:
             if not text.strip():
                 return None, "❌ Please enter some text to convert."
             if len(text) > 500:
                 text = text[:500]
                 print("Text truncated to 500 characters")
             print(f"Synthesizing speech for: '{text[:50]}...'")
             if use_cloned_voice and self.user_speaker_embeddings is not None:
                 speaker_embeddings = self.user_speaker_embeddings
                 voice_type = "your cloned voice"
+                print("Using cloned voice embeddings")
+            else:
+                # Without this branch speaker_embeddings is unbound whenever the
+                # default voice is requested or no voice has been cloned yet.
+                speaker_embeddings = self.default_speaker_embeddings
+                voice_type = "default voice"
                 print("Using default voice embeddings")
             print(f"Speaker embedding shape: {speaker_embeddings.shape}")
             inputs = self.processor(text=text, return_tensors="pt")
             input_ids = inputs["input_ids"].to(self.device)
             print("Generating speech...")
             with torch.no_grad():
                 speaker_embeddings = speaker_embeddings.to(self.device)
+                # Add a leading batch dimension when a 1-D embedding is supplied.
                 if speaker_embeddings.dim() == 1:
                     speaker_embeddings = speaker_embeddings.unsqueeze(0)
+                speech = self.model.generate_speech(input_ids, speaker_embeddings, vocoder=self.vocoder)
             speech_numpy = speech.cpu().numpy()
             print(f"Generated audio shape: {speech_numpy.shape}")
+            # delete=False keeps the file on disk after the handle closes so the caller can read it.
             with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                 sf.write(tmp_file.name, speech_numpy, self.sample_rate)
                 print(f"Audio saved to: {tmp_file.name}")
                 del speech, input_ids
                 gc.collect()
                 return tmp_file.name, f"✅ Speech generated successfully using {voice_type}!"
         except Exception as e:
             print(f"❌ Error in synthesize_speech: {str(e)}")
             return None, f"❌ Error generating speech: {str(e)}"
+# Build the single shared TTS system at import time; the callbacks defined below read and update it.
+print("🚀 Initializing Voice Cloning TTS System...")
 tts_system = VoiceCloningTTS()
 def process_voice_upload(audio_file):
+    """Handle an uploaded voice sample and report the UI state to apply.
+
+    Returns a status message followed by two ``gr.update`` objects whose
+    ``interactive`` flag is true only when an embedding was extracted and
+    stored on ``tts_system``.
+    """
     if audio_file is None:
         return "❌ Please upload an audio file first.", gr.update(interactive=False), gr.update(interactive=False)
     try:
         print(f"Processing uploaded file: {audio_file}")
         speaker_embedding, message = tts_system.extract_speaker_embedding(audio_file)
         if speaker_embedding is not None:
             tts_system.user_speaker_embeddings = speaker_embedding
             print("✅ Speaker embeddings saved successfully")
+            return message, gr.update(interactive=True), gr.update(interactive=True)
+        else:
+            return message, gr.update(interactive=False), gr.update(interactive=False)
+    except Exception as e:
+        # The handler and the definition of error_msg were missing, leaving a
+        # bare ``try`` and an undefined name in the final return.
+        error_msg = f"❌ Error processing audio: {str(e)}"
+        print(error_msg)
         return error_msg, gr.update(interactive=False), gr.update(interactive=False)
 def generate_speech(text, use_cloned_voice):
+    """Synthesize ``text`` through ``tts_system`` for the Gradio click handler.
+
+    Returns ``(audio_path, status_message)``; ``audio_path`` is ``None`` when
+    the text is blank or synthesis fails.
+    """
     if not text.strip():
         return None, "❌ Please enter some text to convert."
     try:
         print(f"Generating speech - Use cloned voice: {use_cloned_voice}")
         audio_file, message = tts_system.synthesize_speech(text, use_cloned_voice)
+        return audio_file, message
+    except Exception as e:
+        # The handler and the definition of error_msg were missing, leaving a
+        # bare ``try`` and an undefined name in the final return.
+        error_msg = f"❌ Error generating speech: {str(e)}"
+        print(error_msg)
         return None, error_msg
 def clear_voice_profile():
+    """Drop the stored cloned-voice embedding and return a status text plus two disabling updates."""
     tts_system.user_speaker_embeddings = None
+    status = "🔄 Voice profile cleared."
+    checkbox_update = gr.update(interactive=False)
+    button_update = gr.update(interactive=False)
+    return status, checkbox_update, button_update
 def update_generate_button(text, use_cloned):
+    """Enable the generate button only when there is text and a usable voice."""
+    has_text = bool(text.strip())
+    if use_cloned:
+        # A cloned voice is usable only once an embedding has been stored.
+        has_voice = tts_system.user_speaker_embeddings is not None
+    else:
+        has_voice = True
+    return gr.update(interactive=has_text and has_voice)
+# Gradio UI: voice upload and status on the left, text entry and generation controls on the right.
+with gr.Blocks(title="Voice Cloning TTS System") as demo:
+    gr.Markdown("# Voice Cloning TTS System")
+    gr.Markdown("Upload an audio file to clone your voice and generate speech.")
     with gr.Row():
+        with gr.Column():
+            voice_upload = gr.Audio(label="Upload Voice Sample", type="filepath", sources=["upload", "microphone"])
+            upload_status = gr.Textbox(label="Status", interactive=False)
+            clear_btn = gr.Button("Clear Voice Profile")
         with gr.Column():
+            text_input = gr.Textbox(label="Text to Convert", lines=5)
+            # The checkbox and button start disabled; the callbacks wired below enable them.
+            use_cloned_voice = gr.Checkbox(label="Use Cloned Voice", value=True, interactive=False)
+            generate_btn = gr.Button("Generate Speech", interactive=False)
+    output_audio = gr.Audio(label="Generated Speech", type="filepath")
+    generation_status = gr.Textbox(label="Generation Status", interactive=False)
+    # A change to the voice sample re-runs extraction and toggles the checkbox and button.
+    voice_upload.change(fn=process_voice_upload, inputs=[voice_upload], outputs=[upload_status, use_cloned_voice, generate_btn])
+    # Re-evaluate the generate button whenever the text or the checkbox changes.
+    text_input.change(fn=update_generate_button, inputs=[text_input, use_cloned_voice], outputs=[generate_btn])
+    use_cloned_voice.change(fn=update_generate_button, inputs=[text_input, use_cloned_voice], outputs=[generate_btn])
+    generate_btn.click(fn=generate_speech, inputs=[text_input, use_cloned_voice], outputs=[output_audio, generation_status])
+    clear_btn.click(fn=clear_voice_profile, outputs=[upload_status, use_cloned_voice, generate_btn])
+# Launch the app only when this file is run as a script.
 if __name__ == "__main__":
+    print("🌟 Starting Voice Cloning TTS System...")
+    demo.launch()