shukdevdatta123 commited on
Commit
3220f5e
·
verified ·
1 Parent(s): 7efcfa3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +446 -0
app.py ADDED
@@ -0,0 +1,446 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import torchaudio
4
+ import numpy as np
5
+ import tempfile
6
+ import os
7
+ from pathlib import Path
8
+ import librosa
9
+ import soundfile as sf
10
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
11
+ from datasets import load_dataset
12
+ import warnings
13
+ import gc
14
+ warnings.filterwarnings("ignore")
15
+
16
class VoiceCloningTTS:
    """SpeechT5 text-to-speech engine with an optional user "voice profile".

    The voice profile is not a learned speaker encoder output: it is the
    default CMU-Arctic x-vector nudged by a small vector of hand-crafted
    audio statistics (MFCC, spectral, pitch, chroma) computed from the
    uploaded sample, then L2-normalized.
    """

    # Tunables; the values match the literals the code has always used.
    DEFAULT_SPEAKER_INDEX = 7306   # row of the x-vector dataset used as base voice
    MIN_AUDIO_SECONDS = 1          # shorter samples are zero-padded to this length
    MAX_AUDIO_SECONDS = 30         # longer samples are truncated to this length
    MAX_TEXT_CHARS = 500           # longer text is truncated before synthesis
    EMBEDDING_MIX = 0.05           # weight of the feature vector added to the base

    def __init__(self):
        """Load the processor, acoustic model, vocoder and default speaker vector.

        Raises:
            Exception: whatever the underlying model/dataset loading raised;
                the error is logged and re-raised unchanged.
        """
        # Use CPU for HF Spaces to avoid memory issues
        self.device = torch.device("cpu")
        print(f"Using device: {self.device}")

        try:
            print("Loading SpeechT5 processor...")
            self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

            print("Loading SpeechT5 TTS model...")
            self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
            self.model.to(self.device)
            self.model.eval()  # inference only

            print("Loading SpeechT5 vocoder...")
            self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
            self.vocoder.to(self.device)
            self.vocoder.eval()

            print("Loading speaker embeddings...")
            embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
            xvector = embeddings_dataset[self.DEFAULT_SPEAKER_INDEX]["xvector"]
            # unsqueeze(0) adds the leading batch dimension
            self.default_speaker_embeddings = torch.tensor(xvector).unsqueeze(0).to(self.device)

            self.user_speaker_embeddings = None
            self.sample_rate = 16000

            print("βœ… TTS system initialized successfully!")

        except Exception as e:
            print(f"❌ Error initializing TTS system: {str(e)}")
            # Bare raise keeps the original traceback intact.
            raise

    def _load_reference_audio(self, audio_path):
        """Load a sample as mono, resampled, length-clamped, peak-normalized numpy audio."""
        waveform, sample_rate = torchaudio.load(audio_path)
        print(f"Original audio shape: {waveform.shape}, sample rate: {sample_rate}")

        # Downmix first so the resampler only has to process one channel.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
            print("Converted to mono")

        if sample_rate != self.sample_rate:
            print(f"Resampling from {sample_rate} to {self.sample_rate}")
            resampler = torchaudio.transforms.Resample(sample_rate, self.sample_rate)
            waveform = resampler(waveform)

        # Zero-pad very short clips up to the minimum length.
        min_length = self.MIN_AUDIO_SECONDS * self.sample_rate
        if waveform.shape[1] < min_length:
            waveform = torch.nn.functional.pad(waveform, (0, min_length - waveform.shape[1]))
            print("Padded audio to minimum length")

        # Cap the length to bound memory use during feature extraction.
        max_length = self.MAX_AUDIO_SECONDS * self.sample_rate
        if waveform.shape[1] > max_length:
            waveform = waveform[:, :max_length]
            print(f"Truncated audio to {self.MAX_AUDIO_SECONDS} seconds")

        # Peak-normalize; the epsilon avoids division by zero on silent input.
        waveform = waveform / (torch.max(torch.abs(waveform)) + 1e-8)
        return waveform.squeeze(0).numpy()

    def _extract_features(self, audio_numpy):
        """Return a 1-D vector of summary audio statistics for the sample.

        Falls back to four basic amplitude statistics if the librosa feature
        extraction raises. Non-finite values are replaced with 0 so they
        cannot propagate into the speaker embedding.
        """
        sr = self.sample_rate
        try:
            mfccs = librosa.feature.mfcc(y=audio_numpy, sr=sr, n_mfcc=13)
            spectral_centroids = librosa.feature.spectral_centroid(y=audio_numpy, sr=sr)
            spectral_rolloff = librosa.feature.spectral_rolloff(y=audio_numpy, sr=sr)
            spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio_numpy, sr=sr)
            zero_crossing_rate = librosa.feature.zero_crossing_rate(audio_numpy)

            # Average only the bins where piptrack reported a positive pitch.
            pitches, _ = librosa.piptrack(y=audio_numpy, sr=sr)
            voiced = pitches[pitches > 0]
            pitch_mean = np.mean(voiced) if voiced.size else 0

            chroma = librosa.feature.chroma_stft(y=audio_numpy, sr=sr)

            features = np.concatenate([
                np.mean(mfccs, axis=1),
                np.std(mfccs, axis=1),
                [np.mean(spectral_centroids)],
                [np.mean(spectral_rolloff)],
                [np.mean(spectral_bandwidth)],
                [np.mean(zero_crossing_rate)],
                [pitch_mean],
                np.mean(chroma, axis=1),
            ])
            print(f"Extracted {len(features)} audio features")

        except Exception as e:
            # Deliberate best-effort: keep going with a minimal feature set.
            print(f"Error extracting features: {e}")
            features = np.array([
                np.mean(audio_numpy),
                np.std(audio_numpy),
                np.max(audio_numpy),
                np.min(audio_numpy),
            ])

        return np.nan_to_num(features, nan=0.0, posinf=0.0, neginf=0.0)

    def _features_to_embedding(self, features):
        """Blend standardized features into the default x-vector and L2-normalize."""
        base_embedding = self.default_speaker_embeddings.clone()

        # Standardize so no single raw feature scale dominates the offset.
        features_normalized = (features - np.mean(features)) / (np.std(features) + 1e-8)

        # Truncate or zero-pad the features to the embedding width.
        embedding_size = base_embedding.shape[1]
        if len(features_normalized) > embedding_size:
            modification_vector = features_normalized[:embedding_size]
        else:
            modification_vector = np.pad(
                features_normalized,
                (0, embedding_size - len(features_normalized)),
                'constant',
                constant_values=0,
            )

        modification_tensor = torch.tensor(modification_vector, dtype=torch.float32).to(self.device)

        # Small mixing weight keeps the result close to the base embedding.
        speaker_embedding = base_embedding + self.EMBEDDING_MIX * modification_tensor.unsqueeze(0)
        return torch.nn.functional.normalize(speaker_embedding, p=2, dim=1)

    def extract_speaker_embedding(self, audio_path):
        """Build a speaker embedding from an uploaded audio file.

        Args:
            audio_path: path of the audio file to analyze.

        Returns:
            ``(embedding, message)``. On any failure ``embedding`` is ``None``
            and ``message`` describes the error; this method does not raise.
        """
        try:
            print(f"Processing audio file: {audio_path}")
            audio_numpy = self._load_reference_audio(audio_path)

            print("Extracting audio features...")
            features = self._extract_features(audio_numpy)

            speaker_embedding = self._features_to_embedding(features)

            print("βœ… Speaker embedding created successfully!")
            return speaker_embedding, "βœ… Voice profile extracted successfully! You can now generate speech in this voice."

        except Exception as e:
            print(f"❌ Error in extract_speaker_embedding: {str(e)}")
            return None, f"❌ Error processing audio: {str(e)}"

    def synthesize_speech(self, text, use_cloned_voice=True):
        """Convert text to speech and write it to a temporary WAV file.

        Args:
            text: text to speak; ``None`` or blank text is rejected, and text
                longer than ``MAX_TEXT_CHARS`` is truncated.
            use_cloned_voice: use the stored user embedding when one exists,
                otherwise fall back to the default voice.

        Returns:
            ``(wav_path, message)``. On failure ``wav_path`` is ``None`` and
            ``message`` describes the problem; this method does not raise.
        """
        try:
            # None-safe: a missing value gets the same prompt as blank text.
            if text is None or not text.strip():
                return None, "❌ Please enter some text to convert."

            if len(text) > self.MAX_TEXT_CHARS:
                text = text[:self.MAX_TEXT_CHARS]
                print(f"Text truncated to {self.MAX_TEXT_CHARS} characters for memory efficiency")

            print(f"Synthesizing speech for text: '{text[:50]}...'")

            if use_cloned_voice and self.user_speaker_embeddings is not None:
                speaker_embeddings = self.user_speaker_embeddings
                voice_type = "your cloned voice"
                print("Using cloned voice")
            else:
                speaker_embeddings = self.default_speaker_embeddings
                voice_type = "default voice"
                print("Using default voice")

            inputs = self.processor(text=text, return_tensors="pt")
            input_ids = inputs["input_ids"].to(self.device)

            print("Generating speech...")

            with torch.no_grad():
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

                speech = self.model.generate_speech(
                    input_ids,
                    speaker_embeddings,
                    vocoder=self.vocoder,
                )

            speech_numpy = speech.cpu().numpy()
            print(f"Generated audio shape: {speech_numpy.shape}")

            # Reserve a path, close the handle, then write: avoids having the
            # same file open twice while soundfile writes to it.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                output_path = tmp_file.name
            sf.write(output_path, speech_numpy, self.sample_rate)
            print(f"Audio saved to: {output_path}")

            # Drop the large tensors before returning to keep memory down.
            del speech, input_ids
            gc.collect()

            return output_path, f"βœ… Speech generated successfully using {voice_type}!"

        except Exception as e:
            print(f"❌ Error in synthesize_speech: {str(e)}")
            return None, f"❌ Error generating speech: {str(e)}"
230
+
231
# Build the single module-level TTS engine that all UI callbacks share.
# Constructing VoiceCloningTTS runs its __init__, which loads the models.
_startup_banner = "πŸš€ Initializing Voice Cloning TTS System..."
print(_startup_banner)
tts_system = VoiceCloningTTS()
234
+
235
def process_voice_upload(audio_file):
    """Handle a new voice sample and toggle the dependent controls.

    Returns a 3-tuple: status text, an update for the "use cloned voice"
    checkbox and an update for the generate button. Both controls are
    enabled only when an embedding was extracted and stored on
    ``tts_system``.
    """
    def controls(enabled):
        # One fresh update object per output component.
        return gr.update(interactive=enabled), gr.update(interactive=enabled)

    if audio_file is None:
        return ("❌ Please upload an audio file first.", *controls(False))

    try:
        embedding, status = tts_system.extract_speaker_embedding(audio_file)

        if embedding is None:
            return (status, *controls(False))

        tts_system.user_speaker_embeddings = embedding
        return (status, *controls(True))
    except Exception as exc:
        failure = f"❌ Error processing audio: {str(exc)}"
        return (failure, *controls(False))
251
+
252
def generate_speech(text, use_cloned_voice):
    """Generate speech for ``text`` through the shared ``tts_system``.

    Args:
        text: text to speak; ``None`` or blank text is rejected up front.
        use_cloned_voice: forwarded to ``tts_system.synthesize_speech``.

    Returns:
        ``(audio_path, message)``; ``audio_path`` is ``None`` when nothing
        was generated. This callback does not raise.
    """
    # None-safe guard: the check runs outside the try block, so calling
    # .strip() on None here would escape as an unhandled AttributeError.
    if text is None or not text.strip():
        return None, "❌ Please enter some text to convert."

    try:
        audio_file, message = tts_system.synthesize_speech(text, use_cloned_voice)
        return audio_file, message
    except Exception as e:
        error_msg = f"❌ Error generating speech: {str(e)}"
        return None, error_msg
263
+
264
def clear_voice_profile():
    """Forget the stored user embedding and disable the dependent controls.

    Returns a 3-tuple: status text, an update for the "use cloned voice"
    checkbox and an update for the generate button.
    """
    tts_system.user_speaker_embeddings = None

    status = "πŸ”„ Voice profile cleared. Upload a new audio file to clone a voice."
    checkbox_state = gr.update(interactive=False)
    button_state = gr.update(interactive=False)
    return status, checkbox_state, button_state
270
+
271
def update_generate_button(text, use_cloned):
    """Enable the generate button only when generation can proceed.

    The button is interactive when ``text`` is non-blank and either the
    default voice is selected or a user embedding is stored on
    ``tts_system``.

    Args:
        text: current textbox content; ``None`` is treated as empty.
        use_cloned: whether the "use cloned voice" checkbox is ticked.

    Returns:
        A ``gr.update`` setting the button's ``interactive`` flag.
    """
    # `text and ...` short-circuits on None/"" instead of calling .strip() on None.
    text_ready = bool(text and text.strip())
    voice_ready = (not use_cloned) or (tts_system.user_speaker_embeddings is not None)
    return gr.update(interactive=text_ready and voice_ready)
276
+
277
# Gradio interface for HF Spaces. The long CSS/HTML literals are named up
# front so the layout code below reads as structure only.
_APP_CSS = """
    .gradio-container {
        max-width: 1000px !important;
        margin: auto !important;
    }
    .header {
        text-align: center;
        margin-bottom: 30px;
        padding: 20px;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        border-radius: 15px;
        color: white;
    }
    .step-box {
        border: 2px solid #e1e5e9;
        border-radius: 12px;
        padding: 20px;
        margin: 15px 0;
        background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    }
    .tips-box {
        background: linear-gradient(135deg, #ffecd2 0%, #fcb69f 100%);
        border-radius: 12px;
        padding: 20px;
        margin: 20px 0;
        border-left: 5px solid #ff6b6b;
    }
    """

_HEADER_HTML = """
    <div class="header">
        <h1>🎀 AI Voice Cloning TTS System</h1>
        <p>πŸš€ Upload your voice sample and convert any text to speech in YOUR voice!</p>
        <p>✨ Powered by Microsoft SpeechT5 & Advanced Voice Analysis</p>
    </div>
    """

_STEP1_HTML = '<div class="step-box"><h3>πŸŽ™οΈ Step 1: Upload Your Voice Sample</h3><p>Record or upload 10-30 seconds of clear English speech</p></div>'
_STEP2_HTML = '<div class="step-box"><h3>✍️ Step 2: Enter Your Text</h3><p>Type the text you want to convert to speech</p></div>'
_STEP3_HTML = '<div class="step-box"><h3>πŸ”Š Step 3: Your Generated Speech</h3></div>'

_TIPS_HTML = """
    <div class="tips-box">
        <h3>πŸ’‘ Pro Tips for Best Results:</h3>
        <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin-top: 15px;">
            <div>
                <h4>🎀 Voice Sample Quality:</h4>
                <ul>
                    <li>Use clear, natural English speech</li>
                    <li>10-30 seconds duration is optimal</li>
                    <li>Minimize background noise</li>
                    <li>Speak at normal pace and volume</li>
                </ul>
            </div>
            <div>
                <h4>πŸ“ Text Guidelines:</h4>
                <ul>
                    <li>English text works best</li>
                    <li>Keep sentences natural and clear</li>
                    <li>Avoid very long paragraphs</li>
                    <li>Punctuation helps with intonation</li>
                </ul>
            </div>
        </div>
        <div style="margin-top: 15px; padding: 10px; background: rgba(255,255,255,0.7); border-radius: 8px;">
            <strong>πŸ”¬ How it works:</strong> The system analyzes your voice's unique characteristics (pitch, tone, formants)
            and creates a personalized voice profile that's used to generate speech that sounds like you!
        </div>
    </div>
    """

with gr.Blocks(
    title="🎀 Voice Cloning TTS System",
    theme=gr.themes.Soft(),
    css=_APP_CSS,
) as demo:

    gr.HTML(_HEADER_HTML)

    with gr.Row():
        # Left column: voice sample upload and its status.
        with gr.Column(scale=1):
            gr.HTML(_STEP1_HTML)

            voice_upload = gr.Audio(
                label="πŸ“€ Voice Sample (English)",
                type="filepath",
                sources=["upload", "microphone"],
                format="wav",
            )

            upload_status = gr.Textbox(
                label="πŸ“Š Voice Analysis Status",
                interactive=False,
                value="⏳ Please upload an audio file to extract your voice profile.",
                lines=2,
            )

            clear_btn = gr.Button("πŸ—‘οΈ Clear Voice Profile", variant="secondary", size="sm")

        # Right column: text entry and generation controls. The checkbox and
        # button start disabled; the callbacks below enable them.
        with gr.Column(scale=1):
            gr.HTML(_STEP2_HTML)

            text_input = gr.Textbox(
                label="πŸ“ Text to Convert (Max 500 characters)",
                placeholder="Enter the text you want to convert to speech using your cloned voice...",
                lines=5,
                max_lines=8,
            )

            use_cloned_voice = gr.Checkbox(
                label="🎭 Use My Cloned Voice",
                value=True,
                interactive=False,
                info="Uncheck to use default voice",
            )

            generate_btn = gr.Button(
                "🎡 Generate Speech",
                variant="primary",
                interactive=False,
                size="lg",
            )

    gr.HTML(_STEP3_HTML)

    with gr.Row():
        with gr.Column():
            output_audio = gr.Audio(
                label="🎧 Generated Speech Audio",
                type="filepath",
                interactive=False,
            )

            generation_status = gr.Textbox(
                label="⚑ Generation Status",
                interactive=False,
                lines=2,
            )

    # Tips and information section
    gr.HTML(_TIPS_HTML)

    # --- Event wiring ---
    # A new upload (or removal) re-evaluates the voice-dependent controls.
    voice_upload.change(
        fn=process_voice_upload,
        inputs=[voice_upload],
        outputs=[upload_status, use_cloned_voice, generate_btn],
    )

    # Editing the text or toggling the checkbox re-checks button readiness.
    for trigger in (text_input, use_cloned_voice):
        trigger.change(
            fn=update_generate_button,
            inputs=[text_input, use_cloned_voice],
            outputs=[generate_btn],
        )

    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, use_cloned_voice],
        outputs=[output_audio, generation_status],
    )

    clear_btn.click(
        fn=clear_voice_profile,
        outputs=[upload_status, use_cloned_voice, generate_btn],
    )
440
+
441
# Entry point: start the Gradio server only when run as a script.
if __name__ == "__main__":
    print("🌟 Starting Voice Cloning TTS System on Hugging Face Spaces...")
    # share=True is passed through to Gradio's launch() as in the original call.
    launch_options = {"share": True}
    demo.launch(**launch_options)