shukdevdatta123 committed on
Commit f8f4a26 · verified · 1 Parent(s): 7093262

Update app.py

Files changed (1)
  1. app.py +283 -346
app.py CHANGED
@@ -20,12 +20,10 @@ warnings.filterwarnings("ignore")
20
  class VoiceCloningTTS:
21
  def __init__(self):
22
  """Initialize the TTS system with SpeechT5 model"""
23
- # Use CPU for better compatibility
24
  self.device = torch.device("cpu")
25
  print(f"Using device: {self.device}")
26
 
27
  try:
28
- # Load SpeechT5 models
29
  print("Loading SpeechT5 processor...")
30
  self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
31
 
@@ -39,14 +37,12 @@ class VoiceCloningTTS:
39
  self.vocoder.to(self.device)
40
  self.vocoder.eval()
41
 
42
- # Load Wav2Vec2 for better speaker embedding extraction
43
  print("Loading Wav2Vec2 for speaker embedding...")
44
  self.wav2vec2_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
45
  self.wav2vec2_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
46
  self.wav2vec2_model.to(self.device)
47
  self.wav2vec2_model.eval()
48
 
49
- # Load default speaker embeddings
50
  print("Loading speaker embeddings dataset...")
51
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
52
  self.speaker_embeddings_dataset = embeddings_dataset
@@ -64,35 +60,21 @@ class VoiceCloningTTS:
64
  def preprocess_audio(self, audio_path):
65
  """Preprocess audio for better speaker embedding extraction"""
66
  try:
67
- # Load audio
68
  waveform, sample_rate = torchaudio.load(audio_path)
69
-
70
- # Convert to mono
71
  if waveform.shape[0] > 1:
72
  waveform = torch.mean(waveform, dim=0, keepdim=True)
73
-
74
- # Resample to 16kHz
75
  if sample_rate != self.sample_rate:
76
  resampler = torchaudio.transforms.Resample(sample_rate, self.sample_rate)
77
  waveform = resampler(waveform)
78
-
79
- # Normalize
80
  waveform = waveform / (torch.max(torch.abs(waveform)) + 1e-8)
81
-
82
- # Ensure minimum length (3 seconds for better speaker characteristics)
83
  min_length = 3 * self.sample_rate
84
  if waveform.shape[1] < min_length:
85
- # Repeat audio if too short
86
  repeat_times = int(np.ceil(min_length / waveform.shape[1]))
87
  waveform = waveform.repeat(1, repeat_times)[:, :min_length]
88
-
89
- # Limit to 20 seconds max
90
  max_length = 20 * self.sample_rate
91
  if waveform.shape[1] > max_length:
92
  waveform = waveform[:, :max_length]
93
-
94
  return waveform.squeeze()
95
-
96
  except Exception as e:
97
  print(f"Error in audio preprocessing: {e}")
98
  raise e
@@ -101,53 +83,31 @@ class VoiceCloningTTS:
101
  """Extract speaker embedding using advanced methods"""
102
  try:
103
  print(f"Processing audio file: {audio_path}")
104
-
105
- # Preprocess audio
106
  audio_tensor = self.preprocess_audio(audio_path)
107
  audio_numpy = audio_tensor.numpy()
108
 
109
  print("Extracting deep audio features with Wav2Vec2...")
110
-
111
- # Extract features using Wav2Vec2
112
  with torch.no_grad():
113
- # Process with Wav2Vec2
114
- inputs = self.wav2vec2_processor(
115
- audio_numpy,
116
- sampling_rate=self.sample_rate,
117
- return_tensors="pt",
118
- padding=True
119
- )
120
-
121
- # Get hidden states
122
  outputs = self.wav2vec2_model(inputs.input_values.to(self.device))
123
- hidden_states = outputs.last_hidden_state
124
-
125
- # Pool the hidden states to get speaker representation
126
- # Use mean pooling across time dimension
127
- speaker_features = torch.mean(hidden_states, dim=1) # Shape: (1, 768)
128
 
129
  print(f"Extracted Wav2Vec2 features: {speaker_features.shape}")
130
-
131
- # Create speaker embedding by finding similar speaker in dataset
132
  best_embedding = self.find_best_matching_speaker(speaker_features, audio_numpy)
133
 
134
  print("✅ Advanced speaker embedding created successfully!")
135
  return best_embedding, "✅ Voice profile extracted using advanced neural analysis! You can now generate speech in this voice."
136
-
137
  except Exception as e:
138
  print(f"Error in advanced embedding extraction: {e}")
139
- # Fallback to improved basic method
140
  return self.extract_speaker_embedding_improved(audio_path)
141
 
142
  def find_best_matching_speaker(self, target_features, audio_numpy):
143
- """Find the best matching speaker from the dataset and create hybrid embedding"""
144
  try:
145
- # Extract additional acoustic features
146
  mfccs = librosa.feature.mfcc(y=audio_numpy, sr=self.sample_rate, n_mfcc=13)
147
  pitch, _ = librosa.piptrack(y=audio_numpy, sr=self.sample_rate)
148
  spectral_centroids = librosa.feature.spectral_centroid(y=audio_numpy, sr=self.sample_rate)
149
 
150
- # Create acoustic signature
151
  acoustic_signature = np.concatenate([
152
  np.mean(mfccs, axis=1),
153
  np.std(mfccs, axis=1),
@@ -155,40 +115,14 @@ class VoiceCloningTTS:
155
  [np.mean(spectral_centroids)]
156
  ])
157
 
158
- # Sample multiple speakers from dataset for variety
159
- speaker_indices = [100, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7306]
160
- best_score = float('inf')
161
  best_embedding = self.default_speaker_embeddings
162
-
163
- for idx in speaker_indices:
164
- if idx < len(self.speaker_embeddings_dataset):
165
- candidate_embedding = torch.tensor(
166
- self.speaker_embeddings_dataset[idx]["xvector"]
167
- ).unsqueeze(0).to(self.device)
168
-
169
- # Simple scoring based on embedding similarity
170
- # In a real implementation, you'd use more sophisticated matching
171
- score = torch.norm(candidate_embedding - self.default_speaker_embeddings).item()
172
-
173
- if score < best_score:
174
- best_score = score
175
- best_embedding = candidate_embedding
176
-
177
- # Create modified embedding based on acoustic features
178
- modification_factor = 0.1
179
  feature_mod = torch.tensor(acoustic_signature[:best_embedding.shape[1]], dtype=torch.float32).to(self.device)
180
-
181
- # Normalize feature modification
182
  feature_mod = (feature_mod - torch.mean(feature_mod)) / (torch.std(feature_mod) + 1e-8)
183
-
184
- # Apply modification
185
  modified_embedding = best_embedding + modification_factor * feature_mod.unsqueeze(0)
186
-
187
- # Normalize final embedding
188
  modified_embedding = torch.nn.functional.normalize(modified_embedding, p=2, dim=1)
189
 
190
  return modified_embedding
191
-
192
  except Exception as e:
193
  print(f"Error in speaker matching: {e}")
194
  return self.default_speaker_embeddings
@@ -197,35 +131,21 @@ class VoiceCloningTTS:
197
  """Improved speaker embedding extraction with better acoustic analysis"""
198
  try:
199
  print("Using improved speaker embedding extraction...")
200
-
201
- # Preprocess audio
202
  audio_tensor = self.preprocess_audio(audio_path)
203
  audio_numpy = audio_tensor.numpy()
204
 
205
- # Enhanced feature extraction
206
  print("Extracting comprehensive acoustic features...")
207
-
208
- # Voice quality features
209
  mfccs = librosa.feature.mfcc(y=audio_numpy, sr=self.sample_rate, n_mfcc=20)
210
  delta_mfccs = librosa.feature.delta(mfccs)
211
  delta2_mfccs = librosa.feature.delta(mfccs, order=2)
212
-
213
- # Pitch and prosodic features
214
- f0, voiced_flag, voiced_probs = librosa.pyin(audio_numpy,
215
- fmin=librosa.note_to_hz('C2'),
216
- fmax=librosa.note_to_hz('C7'))
217
  f0_clean = f0[~np.isnan(f0)]
218
-
219
- # Spectral features
220
  spectral_centroids = librosa.feature.spectral_centroid(y=audio_numpy, sr=self.sample_rate)
221
  spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio_numpy, sr=self.sample_rate)
222
  spectral_rolloff = librosa.feature.spectral_rolloff(y=audio_numpy, sr=self.sample_rate)
223
  spectral_contrast = librosa.feature.spectral_contrast(y=audio_numpy, sr=self.sample_rate)
224
-
225
- # Formant-like features using LPC
226
  lpc_coeffs = librosa.lpc(audio_numpy, order=16)
227
 
228
- # Combine all features
229
  features = np.concatenate([
230
  np.mean(mfccs, axis=1),
231
  np.std(mfccs, axis=1),
@@ -237,87 +157,303 @@ class VoiceCloningTTS:
237
  [np.mean(spectral_bandwidth)],
238
  [np.mean(spectral_rolloff)],
239
  np.mean(spectral_contrast, axis=1),
240
- lpc_coeffs[1:] # Skip the first coefficient
241
  ])
242
 
243
  print(f"Extracted {len(features)} advanced acoustic features")
244
 
245
- # Use multiple base embeddings for better diversity
246
- base_indices = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 7306]
247
- embeddings = []
248
 
249
- for idx in base_indices:
250
- if idx < len(self.speaker_embeddings_dataset):
251
- base_embedding = torch.tensor(
252
- self.speaker_embeddings_dataset[idx]["xvector"]
253
- ).to(self.device)
254
- embeddings.append(base_embedding)
255
 
256
- # Create ensemble embedding
257
- if embeddings:
258
- ensemble_embedding = torch.stack(embeddings).mean(dim=0).unsqueeze(0)
259
  else:
260
- ensemble_embedding = self.default_speaker_embeddings
261
 
262
- # Apply sophisticated feature-based modification
263
- embedding_size = ensemble_embedding.shape[1]
264
 
265
- # Normalize and resize features to match embedding size
266
  features_normalized = (features - np.mean(features)) / (np.std(features) + 1e-8)
267
 
268
  if len(features_normalized) > embedding_size:
269
  modification_vector = features_normalized[:embedding_size]
270
  else:
271
- modification_vector = np.pad(features_normalized,
272
- (0, embedding_size - len(features_normalized)),
273
- 'reflect')
274
 
275
  modification_tensor = torch.tensor(modification_vector, dtype=torch.float32).to(self.device)
 
 
276
 
277
- # Apply stronger modification for more distinctive voice
278
- modification_strength = 0.15
279
- speaker_embedding = ensemble_embedding + modification_strength * modification_tensor.unsqueeze(0)
280
-
281
- # Additional voice-specific transformations based on pitch
282
  if len(f0_clean) > 0:
283
- pitch_factor = np.mean(f0_clean) / 200.0 # Normalize around 200Hz
284
  pitch_modification = 0.05 * (pitch_factor - 1.0)
285
  speaker_embedding = speaker_embedding * (1.0 + pitch_modification)
286
 
287
- # Final normalization
288
  speaker_embedding = torch.nn.functional.normalize(speaker_embedding, p=2, dim=1)
289
-
290
- return speaker_embedding, "✅ Voice profile extracted with enhanced acoustic analysis! Ready for speech generation."
291
-
292
  except Exception as e:
293
  print(f"❌ Error in improved embedding extraction: {str(e)}")
294
  return None, f"❌ Error processing audio: {str(e)}"
295
 
296
  def extract_speaker_embedding(self, audio_path):
297
- """Main method for speaker embedding extraction"""
298
  try:
299
- # Try advanced method first
300
- embedding, message = self.extract_speaker_embedding_advanced(audio_path)
301
- return embedding, message
302
  except Exception as e:
303
  print(f"Advanced method failed: {e}")
304
- # Fallback to improved method
305
  return self.extract_speaker_embedding_improved(audio_path)
306
 
307
  def synthesize_speech(self, text, use_cloned_voice=True):
308
- """Convert text to speech using the specified voice"""
309
  try:
310
  if not text.strip():
311
  return None, "❌ Please enter some text to convert."
312
-
313
- # Limit text length
314
  if len(text) > 500:
315
  text = text[:500]
316
  print("Text truncated to 500 characters")
317
 
318
  print(f"Synthesizing speech for: '{text[:50]}...'")
319
-
320
- # Choose speaker embedding
321
  if use_cloned_voice and self.user_speaker_embeddings is not None:
322
  speaker_embeddings = self.user_speaker_embeddings
323
  voice_type = "your cloned voice"
@@ -328,61 +464,37 @@ class VoiceCloningTTS:
328
  print("Using default voice embeddings")
329
 
330
  print(f"Speaker embedding shape: {speaker_embeddings.shape}")
331
-
332
- # Tokenize text
333
  inputs = self.processor(text=text, return_tensors="pt")
334
  input_ids = inputs["input_ids"].to(self.device)
335
 
336
  print("Generating speech...")
337
-
338
- # Generate speech
339
  with torch.no_grad():
340
- # Ensure speaker embeddings are on correct device and have correct shape
341
  speaker_embeddings = speaker_embeddings.to(self.device)
342
  if speaker_embeddings.dim() == 1:
343
  speaker_embeddings = speaker_embeddings.unsqueeze(0)
344
-
345
- print(f"Final speaker embedding shape: {speaker_embeddings.shape}")
346
-
347
- speech = self.model.generate_speech(
348
- input_ids,
349
- speaker_embeddings,
350
- vocoder=self.vocoder
351
- )
352
 
353
- # Convert to numpy
354
  speech_numpy = speech.cpu().numpy()
355
-
356
  print(f"Generated audio shape: {speech_numpy.shape}")
357
-
358
- # Save to temporary file
359
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
360
  sf.write(tmp_file.name, speech_numpy, self.sample_rate)
361
  print(f"Audio saved to: {tmp_file.name}")
362
-
363
- # Cleanup
364
  del speech, input_ids
365
  gc.collect()
366
-
367
  return tmp_file.name, f"✅ Speech generated successfully using {voice_type}!"
368
-
369
  except Exception as e:
370
  print(f"❌ Error in synthesize_speech: {str(e)}")
371
  return None, f"❌ Error generating speech: {str(e)}"
372
 
373
- # Initialize the TTS system
374
- print("🚀 Initializing Enhanced Voice Cloning TTS System...")
375
  tts_system = VoiceCloningTTS()
376
 
377
  def process_voice_upload(audio_file):
378
- """Process uploaded voice file"""
379
  if audio_file is None:
380
  return "❌ Please upload an audio file first.", gr.update(interactive=False), gr.update(interactive=False)
381
-
382
  try:
383
  print(f"Processing uploaded file: {audio_file}")
384
  speaker_embedding, message = tts_system.extract_speaker_embedding(audio_file)
385
-
386
  if speaker_embedding is not None:
387
  tts_system.user_speaker_embeddings = speaker_embedding
388
  print("✅ Speaker embeddings saved successfully")
@@ -395,10 +507,8 @@ def process_voice_upload(audio_file):
395
  return error_msg, gr.update(interactive=False), gr.update(interactive=False)
396
 
397
  def generate_speech(text, use_cloned_voice):
398
- """Generate speech from text"""
399
  if not text.strip():
400
  return None, "❌ Please enter some text to convert."
401
-
402
  try:
403
  print(f"Generating speech - Use cloned voice: {use_cloned_voice}")
404
  audio_file, message = tts_system.synthesize_speech(text, use_cloned_voice)
@@ -409,211 +519,38 @@ def generate_speech(text, use_cloned_voice):
409
  return None, error_msg
410
 
411
  def clear_voice_profile():
412
- """Clear the uploaded voice profile"""
413
  tts_system.user_speaker_embeddings = None
414
- return ("🔄 Voice profile cleared. Upload a new audio file to clone a voice.",
415
- gr.update(interactive=False),
416
- gr.update(interactive=False))
417
 
418
  def update_generate_button(text, use_cloned):
419
- """Update generate button state based on inputs"""
420
  text_ready = bool(text.strip())
421
  voice_ready = (not use_cloned) or (tts_system.user_speaker_embeddings is not None)
422
  return gr.update(interactive=text_ready and voice_ready)
423
 
424
- # Create enhanced Gradio interface
425
- with gr.Blocks(
426
- title="🎤 Enhanced Voice Cloning TTS System",
427
- theme=gr.themes.Soft(),
428
- css="""
429
- .gradio-container {
430
- max-width: 1200px !important;
431
- margin: auto !important;
432
- }
433
- .header {
434
- text-align: center;
435
- margin-bottom: 30px;
436
- padding: 25px;
437
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
438
- border-radius: 15px;
439
- color: white;
440
- box-shadow: 0 8px 25px rgba(0,0,0,0.15);
441
- }
442
- .step-box {
443
- border: 2px solid #e1e5e9;
444
- border-radius: 12px;
445
- padding: 20px;
446
- margin: 15px 0;
447
- background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
448
- box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
449
- }
450
- .tips-box {
451
- background: linear-gradient(135deg, #ffecd2 0%, #fcb69f 100%);
452
- border-radius: 12px;
453
- padding: 20px;
454
- margin: 20px 0;
455
- border-left: 5px solid #ff6b6b;
456
- }
457
- .improvement-box {
458
- background: linear-gradient(135deg, #a8edea 0%, #fed6e3 100%);
459
- border-radius: 12px;
460
- padding: 20px;
461
- margin: 20px 0;
462
- border-left: 5px solid #00d2ff;
463
- }
464
- """
465
- ) as demo:
466
-
467
- gr.HTML("""
468
- <div class="header">
469
- <h1>🎤 Enhanced AI Voice Cloning TTS System</h1>
470
- <p>🚀 Advanced neural voice analysis with Wav2Vec2 + SpeechT5</p>
471
- <p>✨ Upload your voice and generate speech that sounds more like you!</p>
472
- </div>
473
- """)
474
 
475
  with gr.Row():
476
- with gr.Column(scale=1):
477
- gr.HTML('<div class="step-box"><h3>🎙️ Step 1: Upload Your Voice Sample</h3><p>Record 10-30 seconds of clear, natural speech for best results</p></div>')
478
-
479
- voice_upload = gr.Audio(
480
- label="📤 Voice Sample (Clear English Speech)",
481
- type="filepath",
482
- sources=["upload", "microphone"],
483
- format="wav"
484
- )
485
-
486
- upload_status = gr.Textbox(
487
- label="📊 Advanced Voice Analysis Status",
488
- interactive=False,
489
- value="⏳ Please upload an audio file to extract your unique voice profile using advanced neural analysis.",
490
- lines=3
491
- )
492
-
493
- clear_btn = gr.Button("🗑️ Clear Voice Profile", variant="secondary", size="sm")
494
 
495
- with gr.Column(scale=1):
496
- gr.HTML('<div class="step-box"><h3>✍️ Step 2: Enter Your Text</h3><p>Type what you want to hear in your cloned voice</p></div>')
497
-
498
- text_input = gr.Textbox(
499
- label="📝 Text to Convert (Max 500 characters)",
500
- placeholder="Enter the text you want to convert to speech using your cloned voice...",
501
- lines=6,
502
- max_lines=10
503
- )
504
-
505
- use_cloned_voice = gr.Checkbox(
506
- label="🎭 Use My Cloned Voice (Enhanced)",
507
- value=True,
508
- interactive=False,
509
- info="Uncheck to use default voice for comparison"
510
- )
511
-
512
- generate_btn = gr.Button(
513
- "🎵 Generate Speech with AI Voice Cloning",
514
- variant="primary",
515
- interactive=False,
516
- size="lg"
517
- )
518
-
519
- gr.HTML('<div class="step-box"><h3>🔊 Step 3: Your Generated Speech</h3></div>')
520
-
521
- with gr.Row():
522
  with gr.Column():
523
- output_audio = gr.Audio(
524
- label="🎧 Generated Speech Audio",
525
- type="filepath",
526
- interactive=False
527
- )
528
-
529
- generation_status = gr.Textbox(
530
- label="⚡ Generation Status",
531
- interactive=False,
532
- lines=2
533
- )
534
-
535
- # Enhanced tips section
536
- gr.HTML("""
537
- <div class="improvement-box">
538
- <h3>🔬 Enhanced Voice Cloning Technology:</h3>
539
- <p><strong>This improved version uses:</strong></p>
540
- <ul>
541
- <li><strong>Wav2Vec2 Neural Networks:</strong> Advanced deep learning for better voice feature extraction</li>
542
- <li><strong>Multi-Speaker Analysis:</strong> Compares your voice against multiple reference speakers</li>
543
- <li><strong>Enhanced Acoustic Features:</strong> 60+ voice characteristics including pitch, formants, and spectral features</li>
544
- <li><strong>Ensemble Embeddings:</strong> Combines multiple speaker models for more accurate voice representation</li>
545
- </ul>
546
- </div>
547
- """)
548
-
549
- gr.HTML("""
550
- <div class="tips-box">
551
- <h3>💡 Pro Tips for Maximum Voice Similarity:</h3>
552
- <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin-top: 15px;">
553
- <div>
554
- <h4>🎤 Recording Best Practices:</h4>
555
- <ul>
556
- <li><strong>Duration:</strong> 15-30 seconds is optimal</li>
557
- <li><strong>Content:</strong> Read naturally, include varied sentences</li>
558
- <li><strong>Environment:</strong> Quiet room, minimal echo</li>
559
- <li><strong>Quality:</strong> Use good microphone if possible</li>
560
- <li><strong>Speaking:</strong> Natural pace, clear pronunciation</li>
561
- </ul>
562
- </div>
563
- <div>
564
- <h4>📝 Text Generation Tips:</h4>
565
- <ul>
566
- <li><strong>Language:</strong> English works best</li>
567
- <li><strong>Style:</strong> Match your natural speaking style</li>
568
- <li><strong>Length:</strong> Shorter texts often sound better</li>
569
- <li><strong>Punctuation:</strong> Helps with natural intonation</li>
570
- <li><strong>Testing:</strong> Try different texts to compare results</li>
571
- </ul>
572
- </div>
573
- </div>
574
- <div style="margin-top: 20px; padding: 15px; background: rgba(255,255,255,0.8); border-radius: 8px;">
575
- <strong>🧠 How the Enhanced System Works:</strong>
576
- <br>1. <strong>Neural Analysis:</strong> Wav2Vec2 extracts 768-dimensional voice features
577
- <br>2. <strong>Speaker Matching:</strong> Finds similar voices in a large speaker database
578
- <br>3. <strong>Feature Fusion:</strong> Combines 60+ acoustic characteristics (pitch, formants, spectral features)
579
- <br>4. <strong>Voice Synthesis:</strong> SpeechT5 generates speech using your personalized voice embedding
580
- </div>
581
- </div>
582
- """)
583
-
584
- # Event handlers
585
- voice_upload.change(
586
- fn=process_voice_upload,
587
- inputs=[voice_upload],
588
- outputs=[upload_status, use_cloned_voice, generate_btn]
589
- )
590
-
591
- text_input.change(
592
- fn=update_generate_button,
593
- inputs=[text_input, use_cloned_voice],
594
- outputs=[generate_btn]
595
- )
596
-
597
- use_cloned_voice.change(
598
- fn=update_generate_button,
599
- inputs=[text_input, use_cloned_voice],
600
- outputs=[generate_btn]
601
- )
602
 
603
- generate_btn.click(
604
- fn=generate_speech,
605
- inputs=[text_input, use_cloned_voice],
606
- outputs=[output_audio, generation_status]
607
- )
608
 
609
- clear_btn.click(
610
- fn=clear_voice_profile,
611
- outputs=[upload_status, use_cloned_voice, generate_btn]
612
- )
 
613
 
614
- # Launch configuration
615
  if __name__ == "__main__":
616
- print("🌟 Starting Enhanced Voice Cloning TTS System...")
617
- demo.launch(
618
- share=True
619
- )
 
20
  class VoiceCloningTTS:
21
  def __init__(self):
22
  """Initialize the TTS system with SpeechT5 model"""
 
23
  self.device = torch.device("cpu")
24
  print(f"Using device: {self.device}")
25
 
26
  try:
 
27
  print("Loading SpeechT5 processor...")
28
  self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
29
 
 
37
  self.vocoder.to(self.device)
38
  self.vocoder.eval()
39
 
 
40
  print("Loading Wav2Vec2 for speaker embedding...")
41
  self.wav2vec2_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
42
  self.wav2vec2_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
43
  self.wav2vec2_model.to(self.device)
44
  self.wav2vec2_model.eval()
45
 
 
46
  print("Loading speaker embeddings dataset...")
47
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
48
  self.speaker_embeddings_dataset = embeddings_dataset
 
60
  def preprocess_audio(self, audio_path):
61
  """Preprocess audio for better speaker embedding extraction"""
62
  try:
 
63
  waveform, sample_rate = torchaudio.load(audio_path)
 
 
64
  if waveform.shape[0] > 1:
65
  waveform = torch.mean(waveform, dim=0, keepdim=True)
 
 
66
  if sample_rate != self.sample_rate:
67
  resampler = torchaudio.transforms.Resample(sample_rate, self.sample_rate)
68
  waveform = resampler(waveform)
 
 
69
  waveform = waveform / (torch.max(torch.abs(waveform)) + 1e-8)
 
 
70
  min_length = 3 * self.sample_rate
71
  if waveform.shape[1] < min_length:
 
72
  repeat_times = int(np.ceil(min_length / waveform.shape[1]))
73
  waveform = waveform.repeat(1, repeat_times)[:, :min_length]
 
 
74
  max_length = 20 * self.sample_rate
75
  if waveform.shape[1] > max_length:
76
  waveform = waveform[:, :max_length]
 
77
  return waveform.squeeze()
 
78
  except Exception as e:
79
  print(f"Error in audio preprocessing: {e}")
80
  raise e
 
83
  """Extract speaker embedding using advanced methods"""
84
  try:
85
  print(f"Processing audio file: {audio_path}")
 
 
86
  audio_tensor = self.preprocess_audio(audio_path)
87
  audio_numpy = audio_tensor.numpy()
88
 
89
  print("Extracting deep audio features with Wav2Vec2...")
 
 
90
  with torch.no_grad():
91
+ inputs = self.wav2vec2_processor(audio_numpy, sampling_rate=self.sample_rate, return_tensors="pt", padding=True)
92
  outputs = self.wav2vec2_model(inputs.input_values.to(self.device))
93
+ speaker_features = torch.mean(outputs.last_hidden_state, dim=1)
94
 
95
  print(f"Extracted Wav2Vec2 features: {speaker_features.shape}")
 
 
96
  best_embedding = self.find_best_matching_speaker(speaker_features, audio_numpy)
97
 
98
  print("✅ Advanced speaker embedding created successfully!")
99
  return best_embedding, "✅ Voice profile extracted using advanced neural analysis! You can now generate speech in this voice."
 
100
  except Exception as e:
101
  print(f"Error in advanced embedding extraction: {e}")
 
102
  return self.extract_speaker_embedding_improved(audio_path)
103
 
104
  def find_best_matching_speaker(self, target_features, audio_numpy):
105
+ """Create a modified embedding based on acoustic features"""
106
  try:
 
107
  mfccs = librosa.feature.mfcc(y=audio_numpy, sr=self.sample_rate, n_mfcc=13)
108
  pitch, _ = librosa.piptrack(y=audio_numpy, sr=self.sample_rate)
109
  spectral_centroids = librosa.feature.spectral_centroid(y=audio_numpy, sr=self.sample_rate)
110
 
 
111
  acoustic_signature = np.concatenate([
112
  np.mean(mfccs, axis=1),
113
  np.std(mfccs, axis=1),
 
115
  [np.mean(spectral_centroids)]
116
  ])
117
 
 
 
 
118
  best_embedding = self.default_speaker_embeddings
119
+ modification_factor = 0.3 # Increased for more distinct voice
120
  feature_mod = torch.tensor(acoustic_signature[:best_embedding.shape[1]], dtype=torch.float32).to(self.device)
 
 
121
  feature_mod = (feature_mod - torch.mean(feature_mod)) / (torch.std(feature_mod) + 1e-8)
 
 
122
  modified_embedding = best_embedding + modification_factor * feature_mod.unsqueeze(0)
 
 
123
  modified_embedding = torch.nn.functional.normalize(modified_embedding, p=2, dim=1)
124
 
125
  return modified_embedding
 
126
  except Exception as e:
127
  print(f"Error in speaker matching: {e}")
128
  return self.default_speaker_embeddings
 
131
  """Improved speaker embedding extraction with better acoustic analysis"""
132
  try:
133
  print("Using improved speaker embedding extraction...")
 
 
134
  audio_tensor = self.preprocess_audio(audio_path)
135
  audio_numpy = audio_tensor.numpy()
136
 
 
137
  print("Extracting comprehensive acoustic features...")
 
 
138
  mfccs = librosa.feature.mfcc(y=audio_numpy, sr=self.sample_rate, n_mfcc=20)
139
  delta_mfccs = librosa.feature.delta(mfccs)
140
  delta2_mfccs = librosa.feature.delta(mfccs, order=2)
141
+ f0, _, _ = librosa.pyin(audio_numpy, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
142
  f0_clean = f0[~np.isnan(f0)]
 
 
143
  spectral_centroids = librosa.feature.spectral_centroid(y=audio_numpy, sr=self.sample_rate)
144
  spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio_numpy, sr=self.sample_rate)
145
  spectral_rolloff = librosa.feature.spectral_rolloff(y=audio_numpy, sr=self.sample_rate)
146
  spectral_contrast = librosa.feature.spectral_contrast(y=audio_numpy, sr=self.sample_rate)
 
 
147
  lpc_coeffs = librosa.lpc(audio_numpy, order=16)
148
 
 
149
  features = np.concatenate([
150
  np.mean(mfccs, axis=1),
151
  np.std(mfccs, axis=1),
 
157
  [np.mean(spectral_bandwidth)],
158
  [np.mean(spectral_rolloff)],
159
  np.mean(spectral_contrast, axis=1),
160
+ lpc_coeffs[1:]
161
  ])
162
 
163
  print(f"Extracted {len(features)} advanced acoustic features")
164
+ base_embedding = self.default_speaker_embeddings
165
+ embedding_size = base_embedding.shape[1]
166
+ features_normalized = (features - np.mean(features)) / (np.std(features) + 1e-8)
167
+
168
+ if len(features_normalized) > embedding_size:
169
+ modification_vector = features_normalized[:embedding_size]
170
+ else:
171
+ modification_vector = np.pad(features_normalized, (0, embedding_size - len(features_normalized)), 'reflect')
172
 
173
+ modification_tensor = torch.tensor(modification_vector, dtype=torch.float32).to(self.device)
174
+ modification_strength = 0.3 # Increased for more distinct voice
175
+ speaker_embedding = base_embedding + modification_strength * modification_tensor.unsqueeze(0)
176
 
177
+ if len(f0_clean) > 0:
178
+ pitch_factor = np.mean(f0_clean) / 200.0
179
+ pitch_modification = 0.05 * (pitch_factor - 1.0)
180
+ speaker_embedding = speaker_embedding * (1.0 + pitch_modification)
 
 
181
 
182
+ speaker_embedding = torch.nn.functional.normalize(speaker_embedding, p=2, dim=1)
183
+ return speaker_embedding, "✅ Voice profile extracted with enhanced acoustic analysis! Ready for speech generation."
184
+ except Exception as e:
185
+ print(f"❌ Error in improved embedding extraction: {str(e)}")
186
+ return None, f"❌ Error processing audio: {str(e)}"
187
+
188
+ def extract_speaker_embedding(self, audio_path):
189
+ """Main method for speaker embedding extraction"""
190
+ try:
191
+ return self.extract_speaker_embedding_advanced(audio_path)
192
+ except Exception as e:
193
+ print(f"Advanced method failed: {e}")
194
+ return self.extract_speaker_embedding_improved(audio_path)
195
+
196
+ def synthesize_speech(self, text, use_cloned_voice=True):
197
+ """Convert text to speech using the specified voice"""
198
+ try:
199
+ if not text.strip():
200
+ return None, "❌ Please enter some text to convert."
201
+ if len(text) > 500:
202
+ text = text[:500]
203
+ print("Text truncated to 500 characters")
204
+
205
+ print(f"Synthesizing speech for: '{text[:50]}...'")
206
+ if use_cloned_voice and self.user_speaker_embeddings is not None:
207
+ speaker_embeddings = self.user_speaker_embeddings
208
+ voice_type = "your cloned voice"
209
+ print("Using cloned voice embeddings")
210
  else:
211
+ speaker_embeddings = self.default_speaker_embeddings
212
+ voice_type = "default voice"
213
+ print("Using default voice embeddings")
214
+
215
+ print(f"Speaker embedding shape: {speaker_embeddings.shape}")
216
+ inputs = self.processor(text=text, return_tensors="pt")
217
+ input_ids = inputs["input_ids"].to(self.device)
218
+
219
+ print("Generating speech...")
220
+ with torch.no_grad():
221
+ speaker_embeddings = speaker_embeddings.to(self.device)
222
+ if speaker_embeddings.dim() == 1:
223
+ speaker_embeddings = speaker_embeddings.unsqueeze(0)
224
+ speech = self.model.generate_speech(input_ids, speaker_embeddings, vocoder=self.vocoder)
225
+
226
+ speech_numpy = speech.cpu().numpy()
227
+ print(f"Generated audio shape: {speech_numpy.shape}")
228
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
229
+ sf.write(tmp_file.name, speech_numpy, self.sample_rate)
230
+ print(f"Audio saved to: {tmp_file.name}")
231
+ del speech, input_ids
232
+ gc.collect()
233
+ return tmp_file.name, f"✅ Speech generated successfully using {voice_type}!"
234
+ except Exception as e:
235
+ print(f"❌ Error in synthesize_speech: {str(e)}")
236
+ return None, f"❌ Error generating speech: {str(e)}"
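Outside Gradio, the class as committed can also be exercised directly; a minimal usage sketch follows (the WAV path is a placeholder).

```python
# Hypothetical standalone usage of the committed VoiceCloningTTS class.
tts = VoiceCloningTTS()

embedding, status = tts.extract_speaker_embedding("my_voice_sample.wav")  # placeholder recording
print(status)
if embedding is not None:
    tts.user_speaker_embeddings = embedding

wav_path, message = tts.synthesize_speech("Testing the cloned voice.", use_cloned_voice=True)
print(message, wav_path)
```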
237
+
238
+ print("🚀 Initializing Enhanced Voice Cloning TTS System...")
239
+ tts_system = VoiceCloningTTS()
240
+
241
+ def process_voice_upload(audio_file):
242
+ if audio_file is None:
243
+ return "❌ Please upload an audio file first.", gr.update(interactive=False), gr.update(interactive=False)
244
+ try:
245
+ print(f"Processing uploaded file: {audio_file}")
246
+ speaker_embedding, message = tts_system.extract_speaker_embedding(audio_file)
247
+ if speaker_embedding is not None:
248
+ tts_system.user_speaker_embeddings = speaker_embedding
249
+ print("✅ Speaker embeddings saved successfully")
250
+ return message, gr.update(interactive=True), gr.update(interactive=True)
251
+ else:
252
+ return message, gr.update(interactive=False), gr.update(interactive=False)
253
+ except Exception as e:
254
+ error_msg = f"❌ Error processing audio: {str(e)}"
255
+ print(error_msg)
256
+ return error_msg, gr.update(interactive=False), gr.update(interactive=False)
257
+
258
+ def generate_speech(text, use_cloned_voice):
259
+ # Note: uploaded audio should be in a format torchaudio.load() can read (e.g. WAV);
260
+ # it is resampled to the expected 16 kHz in preprocess_audio before embedding extraction.
261
+
262
+ import gradio as gr
263
+ import torch
264
+ import torchaudio
265
+ import numpy as np
266
+ import tempfile
267
+ import os
268
+ from pathlib import Path
269
+ import librosa
270
+ import soundfile as sf
271
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
272
+ from transformers import Wav2Vec2Processor, Wav2Vec2Model
273
+ from datasets import load_dataset
274
+ import warnings
275
+ import gc
276
+
277
+ warnings.filterwarnings("ignore")
278
+
279
+ class VoiceCloningTTS:
280
+ def __init__(self):
281
+ self.device = torch.device("cpu")
282
+ print(f"Using device: {self.device}")
283
+
284
+ try:
285
+ print("Loading SpeechT5 processor...")
286
+ self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
287
+
288
+ print("Loading SpeechT5 TTS model...")
289
+ self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
290
+ self.model.to(self.device)
291
+ self.model.eval()
292
+
293
+ print("Loading SpeechT5 vocoder...")
294
+ self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
295
+ self.vocoder.to(self.device)
296
+ self.vocoder.eval()
297
+
298
+ print("Loading Wav2Vec2 for speaker embedding...")
299
+ self.wav2vec2_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
300
+ self.wav2vec2_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
301
+ self.wav2vec2_model.to(self.device)
302
+ self.wav2vec2_model.eval()
303
+
304
+ print("Loading speaker embeddings dataset...")
305
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
306
+ self.speaker_embeddings_dataset = embeddings_dataset
307
+ self.default_speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(self.device)
308
+
309
+ self.user_speaker_embeddings = None
310
+ self.sample_rate = 16000
311
+
312
+ print("✅ TTS system initialized successfully!")
313
+ except Exception as e:
314
+ print(f"❌ Error initializing TTS system: {str(e)}")
315
+ raise e
316
+
317
+ def preprocess_audio(self, audio_path):
318
+ try:
319
+ waveform, sample_rate = torchaudio.load(audio_path)
320
+ if waveform.shape[0] > 1:
321
+ waveform = torch.mean(waveform, dim=0, keepdim=True)
322
+ if sample_rate != self.sample_rate:
323
+ resampler = torchaudio.transforms.Resample(sample_rate, self.sample_rate)
324
+ waveform = resampler(waveform)
325
+ waveform = waveform / (torch.max(torch.abs(waveform)) + 1e-8)
326
+ min_length = 3 * self.sample_rate
327
+ if waveform.shape[1] < min_length:
328
+ repeat_times = int(np.ceil(min_length / waveform.shape[1]))
329
+ waveform = waveform.repeat(1, repeat_times)[:, :min_length]
330
+ max_length = 20 * self.sample_rate
331
+ if waveform.shape[1] > max_length:
332
+ waveform = waveform[:, :max_length]
333
+ return waveform.squeeze()
334
+ except Exception as e:
335
+ print(f"Error in audio preprocessing: {e}")
336
+ raise e
337
+
338
+ def extract_speaker_embedding_advanced(self, audio_path):
339
+ try:
340
+ print(f"Processing audio file: {audio_path}")
341
+ audio_tensor = self.preprocess_audio(audio_path)
342
+ audio_numpy = audio_tensor.numpy()
343
+
344
+ print("Extracting deep audio features with Wav2Vec2...")
345
+ with torch.no_grad():
346
+ inputs = self.wav2vec2_processor(audio_numpy, sampling_rate=self.sample_rate, return_tensors="pt", padding=True)
347
+ outputs = self.wav2vec2_model(inputs.input_values.to(self.device))
348
+ speaker_features = torch.mean(outputs.last_hidden_state, dim=1)
349
+
350
+ print(f"Extracted Wav2Vec2 features: {speaker_features.shape}")
351
+ best_embedding = self.find_best_matching_speaker(speaker_features, audio_numpy)
352
+
353
+ print("✅ Advanced speaker embedding created successfully!")
354
+ return best_embedding, "✅ Voice profile extracted using advanced neural analysis!"
355
+ except Exception as e:
356
+ print(f"Error in advanced embedding extraction: {e}")
357
+ return self.extract_speaker_embedding_improved(audio_path)
358
+
359
+ def find_best_matching_speaker(self, target_features, audio_numpy):
360
+ try:
361
+ mfccs = librosa.feature.mfcc(y=audio_numpy, sr=self.sample_rate, n_mfcc=13)
362
+ pitch, _ = librosa.piptrack(y=audio_numpy, sr=self.sample_rate)
363
+ spectral_centroids = librosa.feature.spectral_centroid(y=audio_numpy, sr=self.sample_rate)
364
+
365
+ acoustic_signature = np.concatenate([
366
+ np.mean(mfccs, axis=1),
367
+ np.std(mfccs, axis=1),
368
+ [np.mean(pitch[pitch > 0]) if np.any(pitch > 0) else 200],
369
+ [np.mean(spectral_centroids)]
370
+ ])
371
+
372
+ best_embedding = self.default_speaker_embeddings
373
+ modification_factor = 0.3 # Increased for more distinct voice
374
+ feature_mod = torch.tensor(acoustic_signature[:best_embedding.shape[1]], dtype=torch.float32).to(self.device)
375
+ feature_mod = (feature_mod - torch.mean(feature_mod)) / (torch.std(feature_mod) + 1e-8)
376
+ modified_embedding = best_embedding + modification_factor * feature_mod.unsqueeze(0)
377
+ modified_embedding = torch.nn.functional.normalize(modified_embedding, p=2, dim=1)
378
+
379
+ return modified_embedding
380
+ except Exception as e:
381
+ print(f"Error in speaker matching: {e}")
382
+ return self.default_speaker_embeddings
383
+
384
+ def extract_speaker_embedding_improved(self, audio_path):
385
+ try:
386
+ print("Using improved speaker embedding extraction...")
387
+ audio_tensor = self.preprocess_audio(audio_path)
388
+ audio_numpy = audio_tensor.numpy()
389
 
390
+ print("Extracting comprehensive acoustic features...")
391
+ mfccs = librosa.feature.mfcc(y=audio_numpy, sr=self.sample_rate, n_mfcc=20)
392
+ delta_mfccs = librosa.feature.delta(mfccs)
393
+ delta2_mfccs = librosa.feature.delta(mfccs, order=2)
394
+ f0, _, _ = librosa.pyin(audio_numpy, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
395
+ f0_clean = f0[~np.isnan(f0)]
396
+ spectral_centroids = librosa.feature.spectral_centroid(y=audio_numpy, sr=self.sample_rate)
397
+ spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio_numpy, sr=self.sample_rate)
398
+ spectral_rolloff = librosa.feature.spectral_rolloff(y=audio_numpy, sr=self.sample_rate)
399
+ spectral_contrast = librosa.feature.spectral_contrast(y=audio_numpy, sr=self.sample_rate)
400
+ lpc_coeffs = librosa.lpc(audio_numpy, order=16)
401
 
402
+ features = np.concatenate([
403
+ np.mean(mfccs, axis=1),
404
+ np.std(mfccs, axis=1),
405
+ np.mean(delta_mfccs, axis=1),
406
+ np.mean(delta2_mfccs, axis=1),
407
+ [np.mean(f0_clean) if len(f0_clean) > 0 else 200],
408
+ [np.std(f0_clean) if len(f0_clean) > 0 else 50],
409
+ [np.mean(spectral_centroids)],
410
+ [np.mean(spectral_bandwidth)],
411
+ [np.mean(spectral_rolloff)],
412
+ np.mean(spectral_contrast, axis=1),
413
+ lpc_coeffs[1:]
414
+ ])
415
+
416
+ print(f"Extracted {len(features)} advanced acoustic features")
417
+ base_embedding = self.default_speaker_embeddings
418
+ embedding_size = base_embedding.shape[1]
419
  features_normalized = (features - np.mean(features)) / (np.std(features) + 1e-8)
420
 
421
  if len(features_normalized) > embedding_size:
422
  modification_vector = features_normalized[:embedding_size]
423
  else:
424
+ modification_vector = np.pad(features_normalized, (0, embedding_size - len(features_normalized)), 'reflect')
 
 
425
 
426
  modification_tensor = torch.tensor(modification_vector, dtype=torch.float32).to(self.device)
427
+ modification_strength = 0.3 # Increased for more distinct voice
428
+ speaker_embedding = base_embedding + modification_strength * modification_tensor.unsqueeze(0)
429
 
430
  if len(f0_clean) > 0:
431
+ pitch_factor = np.mean(f0_clean) / 200.0
432
  pitch_modification = 0.05 * (pitch_factor - 1.0)
433
  speaker_embedding = speaker_embedding * (1.0 + pitch_modification)
434
 
 
435
  speaker_embedding = torch.nn.functional.normalize(speaker_embedding, p=2, dim=1)
436
+ return speaker_embedding, "✅ Voice profile extracted with enhanced acoustic analysis!"
 
 
437
  except Exception as e:
438
  print(f"❌ Error in improved embedding extraction: {str(e)}")
439
  return None, f"❌ Error processing audio: {str(e)}"
440
 
441
  def extract_speaker_embedding(self, audio_path):
 
442
  try:
443
+ return self.extract_speaker_embedding_advanced(audio_path)
 
 
444
  except Exception as e:
445
  print(f"Advanced method failed: {e}")
 
446
  return self.extract_speaker_embedding_improved(audio_path)
447
 
448
  def synthesize_speech(self, text, use_cloned_voice=True):
 
449
  try:
450
  if not text.strip():
451
  return None, "❌ Please enter some text to convert."
 
 
452
  if len(text) > 500:
453
  text = text[:500]
454
  print("Text truncated to 500 characters")
455
 
456
  print(f"Synthesizing speech for: '{text[:50]}...'")
 
 
457
  if use_cloned_voice and self.user_speaker_embeddings is not None:
458
  speaker_embeddings = self.user_speaker_embeddings
459
  voice_type = "your cloned voice"
 
464
  print("Using default voice embeddings")
465
 
466
  print(f"Speaker embedding shape: {speaker_embeddings.shape}")
 
 
467
  inputs = self.processor(text=text, return_tensors="pt")
468
  input_ids = inputs["input_ids"].to(self.device)
469
 
470
  print("Generating speech...")
 
 
471
  with torch.no_grad():
 
472
  speaker_embeddings = speaker_embeddings.to(self.device)
473
  if speaker_embeddings.dim() == 1:
474
  speaker_embeddings = speaker_embeddings.unsqueeze(0)
475
+ speech = self.model.generate_speech(input_ids, speaker_embeddings, vocoder=self.vocoder)
476
 
 
477
  speech_numpy = speech.cpu().numpy()
 
478
  print(f"Generated audio shape: {speech_numpy.shape}")
 
 
479
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
480
  sf.write(tmp_file.name, speech_numpy, self.sample_rate)
481
  print(f"Audio saved to: {tmp_file.name}")
 
 
482
  del speech, input_ids
483
  gc.collect()
 
484
  return tmp_file.name, f"✅ Speech generated successfully using {voice_type}!"
 
485
  except Exception as e:
486
  print(f"❌ Error in synthesize_speech: {str(e)}")
487
  return None, f"❌ Error generating speech: {str(e)}"
488
 
489
+ print("🚀 Initializing Voice Cloning TTS System...")
 
490
  tts_system = VoiceCloningTTS()
491
 
492
  def process_voice_upload(audio_file):
 
493
  if audio_file is None:
494
  return "❌ Please upload an audio file first.", gr.update(interactive=False), gr.update(interactive=False)
 
495
  try:
496
  print(f"Processing uploaded file: {audio_file}")
497
  speaker_embedding, message = tts_system.extract_speaker_embedding(audio_file)
 
498
  if speaker_embedding is not None:
499
  tts_system.user_speaker_embeddings = speaker_embedding
500
  print("✅ Speaker embeddings saved successfully")
 
507
  return error_msg, gr.update(interactive=False), gr.update(interactive=False)
508
 
509
  def generate_speech(text, use_cloned_voice):
 
510
  if not text.strip():
511
  return None, "❌ Please enter some text to convert."
 
512
  try:
513
  print(f"Generating speech - Use cloned voice: {use_cloned_voice}")
514
  audio_file, message = tts_system.synthesize_speech(text, use_cloned_voice)
 
519
  return None, error_msg
520
 
521
  def clear_voice_profile():
 
522
  tts_system.user_speaker_embeddings = None
523
+ return "🔄 Voice profile cleared.", gr.update(interactive=False), gr.update(interactive=False)
 
 
524
 
525
  def update_generate_button(text, use_cloned):
 
526
  text_ready = bool(text.strip())
527
  voice_ready = (not use_cloned) or (tts_system.user_speaker_embeddings is not None)
528
  return gr.update(interactive=text_ready and voice_ready)
529
 
530
+ with gr.Blocks(title="Voice Cloning TTS System") as demo:
531
+ gr.Markdown("# Voice Cloning TTS System")
532
+ gr.Markdown("Upload an audio file to clone your voice and generate speech.")
 
533
 
534
  with gr.Row():
535
+ with gr.Column():
536
+ voice_upload = gr.Audio(label="Upload Voice Sample", type="filepath", sources=["upload", "microphone"])
537
+ upload_status = gr.Textbox(label="Status", interactive=False)
538
+ clear_btn = gr.Button("Clear Voice Profile")
539
 
540
  with gr.Column():
541
+ text_input = gr.Textbox(label="Text to Convert", lines=5)
542
+ use_cloned_voice = gr.Checkbox(label="Use Cloned Voice", value=True, interactive=False)
543
+ generate_btn = gr.Button("Generate Speech", interactive=False)
544
 
545
+ output_audio = gr.Audio(label="Generated Speech", type="filepath")
546
+ generation_status = gr.Textbox(label="Generation Status", interactive=False)
547
 
548
+ voice_upload.change(fn=process_voice_upload, inputs=[voice_upload], outputs=[upload_status, use_cloned_voice, generate_btn])
549
+ text_input.change(fn=update_generate_button, inputs=[text_input, use_cloned_voice], outputs=[generate_btn])
550
+ use_cloned_voice.change(fn=update_generate_button, inputs=[text_input, use_cloned_voice], outputs=[generate_btn])
551
+ generate_btn.click(fn=generate_speech, inputs=[text_input, use_cloned_voice], outputs=[output_audio, generation_status])
552
+ clear_btn.click(fn=clear_voice_profile, outputs=[upload_status, use_cloned_voice, generate_btn])
553
 
 
554
  if __name__ == "__main__":
555
+ print("🌟 Starting Voice Cloning TTS System...")
556
+ demo.launch()
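The earlier revision launched with a public share link; the committed version launches with defaults. If a temporary public link is still wanted when running the file locally, `launch()` accepts the same flag as before (a sketch; not needed on Hugging Face Spaces, which hosts the app itself):

```python
if __name__ == "__main__":
    print("🌟 Starting Voice Cloning TTS System...")
    # share=True recreates the temporary public Gradio link used by the previous revision.
    demo.launch(share=True)
```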