Nick021402 committed
Commit 02f6d83 · verified · 1 Parent(s): 84fefb5

Update app.py

Files changed (1):
  app.py +115 -111
app.py CHANGED
@@ -30,17 +30,18 @@ except ImportError:
     print("SVC not available, using basic voice conversion")
 
 class AICoverGenerator:
-    def __init__(self):
+    def \
+    __init__(self):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.temp_dir = tempfile.mkdtemp()
         self.voice_models = {
             "drake": "Drake Style Voice",
-            "ariana": "Ariana Style Voice",
+            "ariana": "Ariana Style Voice",
             "weeknd": "The Weeknd Style Voice",
             "taylor": "Taylor Swift Style Voice",
             "custom": "Custom Voice Model"
         }
-
+
         # Initialize audio separation model
         if DEMUCS_AVAILABLE:
             try:
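For context, the initialization elided inside this `try:` loads a pretrained Demucs model. A minimal sketch of that pattern, assuming the demucs v4 package (`get_model` and the `htdemucs` model name are assumptions, not taken from this commit):

```python
# Hedged sketch of a typical Demucs setup; names are assumptions.
import torch
from demucs.pretrained import get_model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
separation_model = get_model("htdemucs").to(device)  # 4-stem model: drums, bass, other, vocals
separation_model.eval()
```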
@@ -51,24 +52,24 @@ class AICoverGenerator:
                 self.separation_model = None
         else:
             self.separation_model = None
-
+
     def separate_vocals(self, audio_path: str) -> Tuple[str, str]:
         """Separate vocals and instrumentals from audio"""
         try:
             # Load audio
             audio, sr = librosa.load(audio_path, sr=44100, mono=False)
-
+
             if self.separation_model and DEMUCS_AVAILABLE:
                 # Use Demucs for high-quality separation
                 return self._demucs_separate(audio_path)
             else:
                 # Use basic spectral subtraction
                 return self._basic_separate(audio, sr)
-
+
         except Exception as e:
             print(f"Error in vocal separation: {e}")
             return None, None
-
+
     def _demucs_separate(self, audio_path: str) -> Tuple[str, str]:
         """Use Demucs for audio separation"""
         try:
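One detail that matters for `separate_vocals`: with `mono=False`, `librosa.load` returns a `(channels, samples)` float32 array for stereo input but a 1-D array for mono files, which is why `_demucs_separate` below re-checks `audio.ndim`. A quick illustration (the filename is a placeholder):

```python
import librosa

# Stereo file -> shape (2, n_samples); mono file -> shape (n_samples,)
audio, sr = librosa.load("song.wav", sr=44100, mono=False)
print(audio.ndim, audio.shape, sr)
```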
@@ -76,220 +77,223 @@
             audio, sr = librosa.load(audio_path, sr=44100, mono=False)
             if audio.ndim == 1:
                 audio = np.stack([audio, audio])
-
+
             # Convert to tensor
             audio_tensor = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
-
+
             # Apply separation
             with torch.no_grad():
                 sources = apply_model(self.separation_model, audio_tensor)
-
+
             # Extract vocals and instrumental
             vocals = sources[0, 3].cpu().numpy()  # vocals channel
             instrumental = sources[0, 0].cpu().numpy()  # drums + bass + other
-
+
             # Save separated audio
             vocals_path = os.path.join(self.temp_dir, "vocals.wav")
             instrumental_path = os.path.join(self.temp_dir, "instrumental.wav")
-
+
             sf.write(vocals_path, vocals.T, 44100)
             sf.write(instrumental_path, instrumental.T, 44100)
-
+
             return vocals_path, instrumental_path
-
+
         except Exception as e:
             print(f"Demucs separation error: {e}")
             return self._basic_separate(audio, 44100)
-
+
     def _basic_separate(self, audio: np.ndarray, sr: int) -> Tuple[str, str]:
         """Basic vocal separation using spectral subtraction"""
         try:
             # Convert to mono if stereo
             if audio.ndim > 1:
                 audio = librosa.to_mono(audio)
-
+
             # Compute STFT
             stft = librosa.stft(audio, n_fft=2048, hop_length=512)
             magnitude, phase = np.abs(stft), np.angle(stft)
-
+
             # Simple vocal isolation (center channel extraction)
             # This is a basic approach - real implementation would be more sophisticated
             vocal_mask = np.ones_like(magnitude)
             vocal_mask[:, :magnitude.shape[1]//4] *= 0.3  # Reduce low frequencies
             vocal_mask[:, 3*magnitude.shape[1]//4:] *= 0.3  # Reduce high frequencies
-
+
             # Apply mask
             vocal_magnitude = magnitude * vocal_mask
             instrumental_magnitude = magnitude * (1 - vocal_mask * 0.7)
-
+
             # Reconstruct audio
             vocal_stft = vocal_magnitude * np.exp(1j * phase)
             instrumental_stft = instrumental_magnitude * np.exp(1j * phase)
-
+
             vocals = librosa.istft(vocal_stft, hop_length=512)
             instrumental = librosa.istft(instrumental_stft, hop_length=512)
-
+
             # Save files
             vocals_path = os.path.join(self.temp_dir, "vocals.wav")
             instrumental_path = os.path.join(self.temp_dir, "instrumental.wav")
-
+
             sf.write(vocals_path, vocals, sr)
             sf.write(instrumental_path, instrumental, sr)
-
+
             return vocals_path, instrumental_path
-
+
+
         except Exception as e:
             print(f"Basic separation error: {e}")
             return None, None
-
+
     def convert_voice(self, vocals_path: str, voice_model: str, pitch_shift: int = 0, voice_strength: float = 0.8) -> str:
         """Convert vocals to target voice"""
         try:
             # Load vocal audio
             vocals, sr = librosa.load(vocals_path, sr=44100)
-
+
             # Apply pitch shifting if requested
             if pitch_shift != 0:
                 vocals = librosa.effects.pitch_shift(vocals, sr=sr, n_steps=pitch_shift)
-
+
             # Simulate voice conversion (in real app, this would use trained models)
             converted_vocals = self._simulate_voice_conversion(vocals, voice_model, voice_strength)
-
+
             # Save converted vocals
             converted_path = os.path.join(self.temp_dir, "converted_vocals.wav")
             sf.write(converted_path, converted_vocals, sr)
-
+
             return converted_path
-
+
         except Exception as e:
             print(f"Voice conversion error: {e}")
             return vocals_path  # Return original if conversion fails
-
+
     def _simulate_voice_conversion(self, vocals: np.ndarray, voice_model: str, strength: float) -> np.ndarray:
-        """Simulate voice conversion (placeholder for actual model inference)"""
+        """Simulate voice conversion \
+        (placeholder for actual model inference)"""
         # This is a simplified simulation - real implementation would use trained models
-
+
         # Apply different effects based on voice model
         if voice_model == "drake":
             # Simulate Drake's voice characteristics
-            vocals = self._apply_voice_characteristics(vocals,
-                                                       pitch_factor=0.85,
-                                                       formant_shift=-0.1,
-                                                       roughness=0.3)
+            vocals = self._apply_voice_characteristics(vocals,
+                                                       pitch_factor=0.85,
+                                                       formant_shift=-0.1,
+                                                       roughness=0.3)
         elif voice_model == "ariana":
             # Simulate Ariana's voice characteristics
             vocals = self._apply_voice_characteristics(vocals,
-                                                       pitch_factor=1.2,
-                                                       formant_shift=0.2,
-                                                       breathiness=0.4)
+                                                       pitch_factor=1.2,
+                                                       formant_shift=0.2,
+                                                       breathiness=0.4)
         elif voice_model == "weeknd":
             # Simulate The Weeknd's voice characteristics
             vocals = self._apply_voice_characteristics(vocals,
-                                                       pitch_factor=0.9,
-                                                       formant_shift=-0.05,
-                                                       reverb=0.3)
+                                                       pitch_factor=0.9,
+                                                       formant_shift=-0.05,
+                                                       reverb=0.3)
         elif voice_model == "taylor":
             # Simulate Taylor Swift's voice characteristics
             vocals = self._apply_voice_characteristics(vocals,
-                                                       pitch_factor=1.1,
-                                                       formant_shift=0.1,
-                                                       clarity=0.8)
-
+                                                       pitch_factor=1.1,
+                                                       formant_shift=0.1,
+                                                       clarity=0.8)
+
         # Blend with original based on strength
         return vocals * strength + vocals * (1 - strength) * 0.3
-
+
     def _apply_voice_characteristics(self, vocals: np.ndarray, **kwargs) -> np.ndarray:
         """Apply voice characteristics transformation"""
         sr = 44100
-
+
         # Apply pitch factor
         if 'pitch_factor' in kwargs and kwargs['pitch_factor'] != 1.0:
-            vocals = librosa.effects.pitch_shift(vocals, sr=sr,
-                                                 n_steps=12 * np.log2(kwargs['pitch_factor']))
-
+            vocals = librosa.effects.pitch_shift(vocals, sr=sr,
+                                                 n_steps=12 * np.log2(kwargs['pitch_factor']))
+
         # Apply formant shifting (simplified)
         if 'formant_shift' in kwargs:
             # This is a simplified formant shift - real implementation would be more complex
             stft = librosa.stft(vocals)
             magnitude = np.abs(stft)
             phase = np.angle(stft)
-
+
             # Shift formants by stretching frequency axis
             shift_factor = 1 + kwargs['formant_shift']
             shifted_magnitude = np.zeros_like(magnitude)
-
+
             for i in range(magnitude.shape[0]):
                 shifted_idx = int(i * shift_factor)
                 if shifted_idx < magnitude.shape[0]:
                     shifted_magnitude[shifted_idx] = magnitude[i]
-
+
             shifted_stft = shifted_magnitude * np.exp(1j * phase)
             vocals = librosa.istft(shifted_stft)
-
+
         # Apply effects
         if 'roughness' in kwargs:
             # Add slight distortion for roughness
             vocals = np.tanh(vocals * (1 + kwargs['roughness']))
-
+
         if 'breathiness' in kwargs:
             # Add noise for breathiness
             noise = np.random.normal(0, 0.01, vocals.shape)
             vocals = vocals + noise * kwargs['breathiness']
-
+
         return vocals
-
+
     def mix_audio(self, instrumental_path: str, vocals_path: str, vocal_volume: float = 1.0) -> str:
         """Mix instrumental and converted vocals"""
         try:
             # Load audio files
             instrumental, sr = librosa.load(instrumental_path, sr=44100)
             vocals, _ = librosa.load(vocals_path, sr=44100)
-
+
             # Ensure same length
             min_len = min(len(instrumental), len(vocals))
             instrumental = instrumental[:min_len]
             vocals = vocals[:min_len]
-
+
             # Mix audio
             mixed = instrumental + vocals * vocal_volume
-
+
             # Normalize to prevent clipping
             max_amplitude = np.max(np.abs(mixed))
             if max_amplitude > 0.95:
                 mixed = mixed / max_amplitude * 0.95
-
+
             # Save mixed audio
             output_path = os.path.join(self.temp_dir, "final_cover.wav")
             sf.write(output_path, mixed, sr)
-
+
             return output_path
-
+
         except Exception as e:
             print(f"Audio mixing error: {e}")
             return None
-
+
     def process_custom_voice(self, voice_samples: list) -> str:
         """Process custom voice samples for training"""
         if not voice_samples:
             return "No voice samples provided"
-
+
         try:
             # In a real implementation, this would train a voice model
             # For demo, we'll just validate the samples
             total_duration = 0
+
             for sample in voice_samples:
                 if sample is not None:
                     audio, sr = librosa.load(sample, sr=44100)
                     duration = len(audio) / sr
                     total_duration += duration
-
+
             if total_duration < 30:
                 return "Need at least 30 seconds of voice samples"
             elif total_duration > 300:
                 return "Voice samples too long (max 5 minutes)"
             else:
-                return f"Custom voice model ready! ({total_duration:.1f}s of training data)"
-
+                return f"Custom voice model ready!\n({total_duration:.1f}s of training data)"
+
         except Exception as e:
             return f"Error processing voice samples: {e}"
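The `n_steps=12 * np.log2(pitch_factor)` expression in `_apply_voice_characteristics` is the standard conversion from a frequency ratio to semitones; worked out for the four preset factors:

```python
import numpy as np

# A frequency ratio r corresponds to 12 * log2(r) semitones.
for r in (0.85, 0.9, 1.1, 1.2):
    print(f"pitch_factor {r:>4} -> {12 * np.log2(r):+.2f} semitones")
# 0.85 -> -2.81, 0.90 -> -1.82, 1.10 -> +1.65, 1.20 -> +3.16
```

Two quirks worth noting in the surrounding code: the blend `vocals * strength + vocals * (1 - strength) * 0.3` mixes the converted signal with itself rather than with the unprocessed vocals, and `_basic_separate` masks along `magnitude.shape[1]` (the time axis of an STFT) even though its comments describe reducing low and high frequencies, which live on axis 0.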
 
@@ -304,48 +308,50 @@ def generate_cover(
     auto_tune: bool = False,
     output_format: str = "wav"
 ) -> Tuple[Optional[str], str]:
-    """Main function to generate AI cover"""
-
+    """Main \
+    function to generate AI cover"""
+
     if audio_file is None:
         return None, "Please upload an audio file"
-
+
     try:
         # Step 1: Separate vocals and instrumentals
         yield None, "🎵 Separating vocals and instrumentals..."
         vocals_path, instrumental_path = cover_generator.separate_vocals(audio_file.name)
-
+
         if vocals_path is None:
             return None, "❌ Failed to separate vocals"
-
+
         # Step 2: Convert vocals to target voice
         yield None, f"🎤 Converting vocals to {voice_model} style..."
         converted_vocals_path = cover_generator.convert_voice(
-            vocals_path,
-            voice_model,
-            pitch_shift,
+            vocals_path,
+            voice_model,
+            pitch_shift,
             voice_strength / 100
         )
-
+
         # Step 3: Apply auto-tune if requested
         if auto_tune:
             yield None, "🎼 Applying auto-tune..."
             # Auto-tune implementation would go here
             pass
-
+
         # Step 4: Mix final audio
         yield None, "🎧 Mixing final audio..."
         final_path = cover_generator.mix_audio(instrumental_path, converted_vocals_path)
-
+
         if final_path is None:
             return None, "❌ Failed to mix audio"
-
-        # Convert to requested format if needed
+
+        # Convert to requested \
+        format if needed
         if output_format != "wav":
             yield None, f"💾 Converting to {output_format.upper()}..."
             # Format conversion would go here
-
+
         return final_path, "✅ AI Cover generated successfully!"
-
+
     except Exception as e:
         return None, f"❌ Error: {str(e)}"
 
@@ -353,18 +359,14 @@ def process_voice_samples(voice_files) -> str:
     """Process uploaded voice samples for custom voice training"""
     if not voice_files:
         return "No voice samples uploaded"
-
+
     return cover_generator.process_custom_voice(voice_files)
 
 # Create Gradio interface
 def create_interface():
     with gr.Blocks(
         title="🎵 AI Cover Song Platform",
-        theme=gr.themes.Soft(
-            primary_hue="indigo",
-            secondary_hue="purple",
-            neutral_hue="slate"
-        ),
+        # Removed theme=gr.themes.Soft for compatibility with Gradio versions < 4.0.0
         css="""
         .gradio-container {
             font-family: 'Inter', sans-serif;
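This hunk drops `gr.themes.Soft` outright. If the goal is compatibility across Gradio versions, a guarded sketch like the following would keep the theme where it exists; the `hasattr` check is an assumption about how one might detect support, not code from this commit:

```python
import gradio as gr

# Assumption: fall back to the default theme when gr.themes is absent.
theme = None
if hasattr(gr, "themes"):  # themes ship with newer Gradio releases
    theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="purple", neutral_hue="slate")

with gr.Blocks(title="🎵 AI Cover Song Platform", theme=theme) as app:
    ...  # layout as above
```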
@@ -388,7 +390,7 @@ def create_interface():
         }
         """
     ) as app:
-
+
         # Header
         with gr.Row():
             gr.Markdown("""
@@ -402,7 +404,7 @@
                 </div>
             </div>
             """)
-
+
         # Step 1: Upload Audio
         with gr.Row():
             with gr.Column():
@@ -413,7 +415,7 @@
                     format="wav"
                 )
                 gr.Markdown("*Supports MP3, WAV, FLAC files*")
-
+
         # Step 2: Voice Selection
         with gr.Row():
             with gr.Column():
@@ -424,7 +426,7 @@
                     value="Drake Style Voice",
                     interactive=True
                 )
-
+
                 # Custom voice training section
                 with gr.Accordion("🎙️ Train Custom Voice (Optional)", open=False):
                     voice_samples = gr.File(
@@ -434,18 +436,18 @@
                     )
                     train_btn = gr.Button("Train Custom Voice", variant="secondary")
                     training_status = gr.Textbox(label="Training Status", interactive=False)
-
+
                     train_btn.click(
                         process_voice_samples,
                         inputs=[voice_samples],
                         outputs=[training_status]
                     )
-
+
         # Step 3: Audio Settings
         with gr.Row():
             with gr.Column():
                 gr.Markdown("## ⚙️ Step 3: Audio Settings")
-
+
                 with gr.Row():
                     pitch_shift = gr.Slider(
                         minimum=-12,
@@ -461,7 +463,7 @@
                         step=5,
                         label="Voice Strength (%)"
                     )
-
+
                 with gr.Row():
                     auto_tune = gr.Checkbox(label="Apply Auto-tune", value=False)
                     output_format = gr.Dropdown(
@@ -469,7 +471,7 @@
                         label="Output Format",
                         value="wav"
                     )
-
+
         # Step 4: Generate Cover
         with gr.Row():
             with gr.Column():
@@ -479,33 +481,35 @@
                 variant="primary",
                 size="lg"
             )
-
+
             progress_text = gr.Textbox(
                 label="Progress",
                 value="Ready to generate cover...",
                 interactive=False
             )
-
+
         # Results
         with gr.Row():
             with gr.Column():
                 gr.Markdown("## 🎉 Results")
-
+
                 with gr.Row():
                     original_audio = gr.Audio(label="Original Song", interactive=False)
                     cover_audio = gr.Audio(label="AI Cover", interactive=False)
-
+
         # Legal Notice
         with gr.Row():
             gr.Markdown("""
-            <div style="background: rgba(255, 193, 7, 0.1); border: 1px solid rgba(255, 193, 7, 0.3); border-radius: 10px; padding: 1rem; margin: 1rem 0;">
+            <div style="background: rgba(255, 193, 7, 0.1);
+            border: 1px solid rgba(255, 193, 7, 0.3); border-radius: 10px; padding: 1rem;
+            margin: 1rem 0;">
             <h3>⚠️ Legal & Ethical Notice</h3>
-            <p>This platform is for educational and demonstration purposes only. Voice cloning technology should be used responsibly.
-            Always obtain proper consent before cloning someone's voice. Do not use this tool to create misleading or harmful content.
+            <p>This platform is for educational and demonstration purposes only. Voice cloning technology should be used responsibly.
+            Always obtain proper consent before cloning someone's voice. Do not use this tool to create misleading or harmful content.
             Respect copyright laws and artist rights.</p>
             </div>
             """)
-
+
         # Event handlers
         generate_btn.click(
             generate_cover,
@@ -519,14 +523,14 @@
             ],
             outputs=[cover_audio, progress_text]
         )
-
+
         # Update original audio when file is uploaded
         audio_input.change(
             lambda x: x,
             inputs=[audio_input],
             outputs=[original_audio]
         )
-
+
     return app
 
 # Launch the app
@@ -537,4 +541,4 @@ if __name__ == "__main__":
         server_port=7860,
         share=True,
         show_error=True
-    )
+    )
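Because `generate_cover` is a generator, Gradio 3.x only streams its progress messages when the queue is enabled; `app.queue().launch(...)` is the usual pattern. A sketch (the `server_name` value is assumed, since it is not shown in this hunk; the other arguments match it):

```python
if __name__ == "__main__":
    app = create_interface()
    app.queue().launch(         # queue() lets generator handlers stream updates
        server_name="0.0.0.0",  # assumed; not shown in the diff
        server_port=7860,
        share=True,
        show_error=True
    )
```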
 
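A caveat for anyone applying this diff: the `def \` and docstring splits introduced by the new revision are legal (a trailing backslash continues code lines and string literals), but a backslash does not continue a `#` comment in Python. The split `# Convert to requested \` / `format if needed` therefore leaves `format if needed` as a bare statement, and the file fails to parse. The intended lines, restored to a single comment:

```python
        # Convert to requested format if needed
        if output_format != "wav":
            yield None, f"💾 Converting to {output_format.upper()}..."
            # Format conversion would go here
```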