Spaces:
Build error
Update app.py
app.py CHANGED
@@ -30,17 +30,18 @@ except ImportError:
     print("SVC not available, using basic voice conversion")
 
 class AICoverGenerator:
-    def
+    def \
+    __init__(self):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.temp_dir = tempfile.mkdtemp()
         self.voice_models = {
             "drake": "Drake Style Voice",
-            "ariana": "Ariana Style Voice",
+            "ariana": "Ariana Style Voice",
             "weeknd": "The Weeknd Style Voice",
             "taylor": "Taylor Swift Style Voice",
             "custom": "Custom Voice Model"
         }
-
+
         # Initialize audio separation model
         if DEMUCS_AVAILABLE:
             try:
@@ -51,24 +52,24 @@ class AICoverGenerator:
                 self.separation_model = None
         else:
             self.separation_model = None
-
+
     def separate_vocals(self, audio_path: str) -> Tuple[str, str]:
         """Separate vocals and instrumentals from audio"""
         try:
             # Load audio
             audio, sr = librosa.load(audio_path, sr=44100, mono=False)
-
+
             if self.separation_model and DEMUCS_AVAILABLE:
                 # Use Demucs for high-quality separation
                 return self._demucs_separate(audio_path)
             else:
                 # Use basic spectral subtraction
                 return self._basic_separate(audio, sr)
-
+
         except Exception as e:
             print(f"Error in vocal separation: {e}")
             return None, None
-
+
     def _demucs_separate(self, audio_path: str) -> Tuple[str, str]:
         """Use Demucs for audio separation"""
         try:
@@ -76,220 +77,223 @@ class AICoverGenerator:
             audio, sr = librosa.load(audio_path, sr=44100, mono=False)
             if audio.ndim == 1:
                 audio = np.stack([audio, audio])
-
+
             # Convert to tensor
             audio_tensor = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
-
+
             # Apply separation
             with torch.no_grad():
                 sources = apply_model(self.separation_model, audio_tensor)
-
+
             # Extract vocals and instrumental
             vocals = sources[0, 3].cpu().numpy()  # vocals channel
             instrumental = sources[0, 0].cpu().numpy()  # drums + bass + other
-
+
             # Save separated audio
             vocals_path = os.path.join(self.temp_dir, "vocals.wav")
             instrumental_path = os.path.join(self.temp_dir, "instrumental.wav")
-
+
             sf.write(vocals_path, vocals.T, 44100)
             sf.write(instrumental_path, instrumental.T, 44100)
-
+
             return vocals_path, instrumental_path
-
+
         except Exception as e:
             print(f"Demucs separation error: {e}")
             return self._basic_separate(audio, 44100)
-
+
     def _basic_separate(self, audio: np.ndarray, sr: int) -> Tuple[str, str]:
         """Basic vocal separation using spectral subtraction"""
         try:
             # Convert to mono if stereo
             if audio.ndim > 1:
                 audio = librosa.to_mono(audio)
-
+
             # Compute STFT
             stft = librosa.stft(audio, n_fft=2048, hop_length=512)
             magnitude, phase = np.abs(stft), np.angle(stft)
-
+
             # Simple vocal isolation (center channel extraction)
             # This is a basic approach - real implementation would be more sophisticated
             vocal_mask = np.ones_like(magnitude)
             vocal_mask[:, :magnitude.shape[1]//4] *= 0.3  # Reduce low frequencies
             vocal_mask[:, 3*magnitude.shape[1]//4:] *= 0.3  # Reduce high frequencies
-
+
             # Apply mask
             vocal_magnitude = magnitude * vocal_mask
             instrumental_magnitude = magnitude * (1 - vocal_mask * 0.7)
-
+
             # Reconstruct audio
             vocal_stft = vocal_magnitude * np.exp(1j * phase)
             instrumental_stft = instrumental_magnitude * np.exp(1j * phase)
-
+
             vocals = librosa.istft(vocal_stft, hop_length=512)
             instrumental = librosa.istft(instrumental_stft, hop_length=512)
-
+
             # Save files
             vocals_path = os.path.join(self.temp_dir, "vocals.wav")
             instrumental_path = os.path.join(self.temp_dir, "instrumental.wav")
-
+
             sf.write(vocals_path, vocals, sr)
             sf.write(instrumental_path, instrumental, sr)
-
+
             return vocals_path, instrumental_path
-
+
+
         except Exception as e:
             print(f"Basic separation error: {e}")
             return None, None
-
+
     def convert_voice(self, vocals_path: str, voice_model: str, pitch_shift: int = 0, voice_strength: float = 0.8) -> str:
         """Convert vocals to target voice"""
         try:
             # Load vocal audio
             vocals, sr = librosa.load(vocals_path, sr=44100)
-
+
             # Apply pitch shifting if requested
             if pitch_shift != 0:
                 vocals = librosa.effects.pitch_shift(vocals, sr=sr, n_steps=pitch_shift)
-
+
             # Simulate voice conversion (in real app, this would use trained models)
             converted_vocals = self._simulate_voice_conversion(vocals, voice_model, voice_strength)
-
+
             # Save converted vocals
             converted_path = os.path.join(self.temp_dir, "converted_vocals.wav")
             sf.write(converted_path, converted_vocals, sr)
-
+
             return converted_path
-
+
         except Exception as e:
             print(f"Voice conversion error: {e}")
             return vocals_path  # Return original if conversion fails
-
+
     def _simulate_voice_conversion(self, vocals: np.ndarray, voice_model: str, strength: float) -> np.ndarray:
-        """Simulate voice conversion
+        """Simulate voice conversion \
+        (placeholder for actual model inference)"""
         # This is a simplified simulation - real implementation would use trained models
-
+
         # Apply different effects based on voice model
         if voice_model == "drake":
             # Simulate Drake's voice characteristics
-            vocals = self._apply_voice_characteristics(vocals,
-
-
-
+            vocals = self._apply_voice_characteristics(vocals,
+                                                       pitch_factor=0.85,
+                                                       formant_shift=-0.1,
+                                                       roughness=0.3)
         elif voice_model == "ariana":
             # Simulate Ariana's voice characteristics
             vocals = self._apply_voice_characteristics(vocals,
-
-
-
+                                                       pitch_factor=1.2,
+                                                       formant_shift=0.2,
+                                                       breathiness=0.4)
         elif voice_model == "weeknd":
             # Simulate The Weeknd's voice characteristics
             vocals = self._apply_voice_characteristics(vocals,
-
-
-
+                                                       pitch_factor=0.9,
+                                                       formant_shift=-0.05,
+                                                       reverb=0.3)
         elif voice_model == "taylor":
             # Simulate Taylor Swift's voice characteristics
             vocals = self._apply_voice_characteristics(vocals,
-
-
-
-
+                                                       pitch_factor=1.1,
+                                                       formant_shift=0.1,
+                                                       clarity=0.8)
+
         # Blend with original based on strength
         return vocals * strength + vocals * (1 - strength) * 0.3
-
+
     def _apply_voice_characteristics(self, vocals: np.ndarray, **kwargs) -> np.ndarray:
         """Apply voice characteristics transformation"""
         sr = 44100
-
+
         # Apply pitch factor
         if 'pitch_factor' in kwargs and kwargs['pitch_factor'] != 1.0:
-            vocals = librosa.effects.pitch_shift(vocals, sr=sr,
-
-
+            vocals = librosa.effects.pitch_shift(vocals, sr=sr,
+                                                 n_steps=12 * np.log2(kwargs['pitch_factor']))
+
         # Apply formant shifting (simplified)
         if 'formant_shift' in kwargs:
             # This is a simplified formant shift - real implementation would be more complex
             stft = librosa.stft(vocals)
             magnitude = np.abs(stft)
             phase = np.angle(stft)
-
+
             # Shift formants by stretching frequency axis
             shift_factor = 1 + kwargs['formant_shift']
             shifted_magnitude = np.zeros_like(magnitude)
-
+
             for i in range(magnitude.shape[0]):
                 shifted_idx = int(i * shift_factor)
                 if shifted_idx < magnitude.shape[0]:
                     shifted_magnitude[shifted_idx] = magnitude[i]
-
+
             shifted_stft = shifted_magnitude * np.exp(1j * phase)
             vocals = librosa.istft(shifted_stft)
-
+
         # Apply effects
         if 'roughness' in kwargs:
             # Add slight distortion for roughness
             vocals = np.tanh(vocals * (1 + kwargs['roughness']))
-
+
         if 'breathiness' in kwargs:
             # Add noise for breathiness
             noise = np.random.normal(0, 0.01, vocals.shape)
             vocals = vocals + noise * kwargs['breathiness']
-
+
         return vocals
-
+
     def mix_audio(self, instrumental_path: str, vocals_path: str, vocal_volume: float = 1.0) -> str:
         """Mix instrumental and converted vocals"""
         try:
             # Load audio files
             instrumental, sr = librosa.load(instrumental_path, sr=44100)
             vocals, _ = librosa.load(vocals_path, sr=44100)
-
+
             # Ensure same length
             min_len = min(len(instrumental), len(vocals))
             instrumental = instrumental[:min_len]
             vocals = vocals[:min_len]
-
+
             # Mix audio
             mixed = instrumental + vocals * vocal_volume
-
+
             # Normalize to prevent clipping
             max_amplitude = np.max(np.abs(mixed))
             if max_amplitude > 0.95:
                 mixed = mixed / max_amplitude * 0.95
-
+
             # Save mixed audio
             output_path = os.path.join(self.temp_dir, "final_cover.wav")
             sf.write(output_path, mixed, sr)
-
+
             return output_path
-
+
         except Exception as e:
             print(f"Audio mixing error: {e}")
             return None
-
+
     def process_custom_voice(self, voice_samples: list) -> str:
         """Process custom voice samples for training"""
         if not voice_samples:
             return "No voice samples provided"
-
+
         try:
             # In a real implementation, this would train a voice model
             # For demo, we'll just validate the samples
             total_duration = 0
+
             for sample in voice_samples:
                 if sample is not None:
                     audio, sr = librosa.load(sample, sr=44100)
                     duration = len(audio) / sr
                     total_duration += duration
-
+
             if total_duration < 30:
                 return "Need at least 30 seconds of voice samples"
             elif total_duration > 300:
                 return "Voice samples too long (max 5 minutes)"
             else:
-                return f"Custom voice model ready
-
+                return f"Custom voice model ready!\n({total_duration:.1f}s of training data)"
+
         except Exception as e:
             return f"Error processing voice samples: {e}"
 
@@ -304,48 +308,50 @@ def generate_cover(
     auto_tune: bool = False,
     output_format: str = "wav"
 ) -> Tuple[Optional[str], str]:
-    """Main
-
+    """Main \
+    function to generate AI cover"""
+
     if audio_file is None:
         return None, "Please upload an audio file"
-
+
     try:
         # Step 1: Separate vocals and instrumentals
         yield None, "🎵 Separating vocals and instrumentals..."
         vocals_path, instrumental_path = cover_generator.separate_vocals(audio_file.name)
-
+
         if vocals_path is None:
             return None, "❌ Failed to separate vocals"
-
+
         # Step 2: Convert vocals to target voice
         yield None, f"🎤 Converting vocals to {voice_model} style..."
         converted_vocals_path = cover_generator.convert_voice(
-            vocals_path,
-            voice_model,
-            pitch_shift,
+            vocals_path,
+            voice_model,
+            pitch_shift,
             voice_strength / 100
         )
-
+
         # Step 3: Apply auto-tune if requested
         if auto_tune:
             yield None, "🎼 Applying auto-tune..."
            # Auto-tune implementation would go here
            pass
-
+
         # Step 4: Mix final audio
         yield None, "🎧 Mixing final audio..."
         final_path = cover_generator.mix_audio(instrumental_path, converted_vocals_path)
-
+
         if final_path is None:
             return None, "❌ Failed to mix audio"
-
-        # Convert to requested
+
+        # Convert to requested \
+        format if needed
         if output_format != "wav":
             yield None, f"💾 Converting to {output_format.upper()}..."
             # Format conversion would go here
-
+
         return final_path, "✅ AI Cover generated successfully!"
-
+
     except Exception as e:
         return None, f"❌ Error: {str(e)}"
 
@@ -353,18 +359,14 @@ def process_voice_samples(voice_files) -> str:
     """Process uploaded voice samples for custom voice training"""
     if not voice_files:
         return "No voice samples uploaded"
-
+
     return cover_generator.process_custom_voice(voice_files)
 
 # Create Gradio interface
 def create_interface():
     with gr.Blocks(
         title="🎵 AI Cover Song Platform",
-        theme=gr.themes.Soft(
-            primary_hue="indigo",
-            secondary_hue="purple",
-            neutral_hue="slate"
-        ),
+        # Removed theme=gr.themes.Soft for compatibility with Gradio versions < 4.0.0
         css="""
         .gradio-container {
             font-family: 'Inter', sans-serif;
@@ -388,7 +390,7 @@ def create_interface():
         }
         """
     ) as app:
-
+
         # Header
         with gr.Row():
             gr.Markdown("""
@@ -402,7 +404,7 @@ def create_interface():
             </div>
            </div>
            """)
-
+
         # Step 1: Upload Audio
         with gr.Row():
             with gr.Column():
@@ -413,7 +415,7 @@ def create_interface():
                     format="wav"
                 )
                 gr.Markdown("*Supports MP3, WAV, FLAC files*")
-
+
         # Step 2: Voice Selection
         with gr.Row():
             with gr.Column():
@@ -424,7 +426,7 @@ def create_interface():
                     value="Drake Style Voice",
                     interactive=True
                 )
-
+
                 # Custom voice training section
                 with gr.Accordion("🎙️ Train Custom Voice (Optional)", open=False):
                     voice_samples = gr.File(
@@ -434,18 +436,18 @@ def create_interface():
                    )
                    train_btn = gr.Button("Train Custom Voice", variant="secondary")
                    training_status = gr.Textbox(label="Training Status", interactive=False)
-
+
                    train_btn.click(
                        process_voice_samples,
                        inputs=[voice_samples],
                        outputs=[training_status]
                    )
-
+
         # Step 3: Audio Settings
         with gr.Row():
             with gr.Column():
                 gr.Markdown("## ⚙️ Step 3: Audio Settings")
-
+
                 with gr.Row():
                     pitch_shift = gr.Slider(
                         minimum=-12,
@@ -461,7 +463,7 @@ def create_interface():
                        step=5,
                        label="Voice Strength (%)"
                    )
-
+
                 with gr.Row():
                     auto_tune = gr.Checkbox(label="Apply Auto-tune", value=False)
                     output_format = gr.Dropdown(
@@ -469,7 +471,7 @@ def create_interface():
                        label="Output Format",
                        value="wav"
                    )
-
+
         # Step 4: Generate Cover
         with gr.Row():
             with gr.Column():
@@ -479,33 +481,35 @@ def create_interface():
                    variant="primary",
                    size="lg"
                )
-
+
                 progress_text = gr.Textbox(
                     label="Progress",
                     value="Ready to generate cover...",
                     interactive=False
                 )
-
+
         # Results
         with gr.Row():
             with gr.Column():
                 gr.Markdown("## 🎉 Results")
-
+
                 with gr.Row():
                     original_audio = gr.Audio(label="Original Song", interactive=False)
                     cover_audio = gr.Audio(label="AI Cover", interactive=False)
-
+
         # Legal Notice
         with gr.Row():
             gr.Markdown("""
-            <div style="background: rgba(255, 193, 7, 0.1);
+            <div style="background: rgba(255, 193, 7, 0.1);
+            border: 1px solid rgba(255, 193, 7, 0.3); border-radius: 10px; padding: 1rem;
+            margin: 1rem 0;">
            <h3>⚠️ Legal & Ethical Notice</h3>
-            <p>This platform is for educational and demonstration purposes only. Voice cloning technology should be used responsibly.
-            Always obtain proper consent before cloning someone's voice. Do not use this tool to create misleading or harmful content.
+            <p>This platform is for educational and demonstration purposes only. Voice cloning technology should be used responsibly.
+            Always obtain proper consent before cloning someone's voice. Do not use this tool to create misleading or harmful content.
            Respect copyright laws and artist rights.</p>
            </div>
            """)
-
+
         # Event handlers
         generate_btn.click(
             generate_cover,
@@ -519,14 +523,14 @@ def create_interface():
            ],
            outputs=[cover_audio, progress_text]
        )
-
+
        # Update original audio when file is uploaded
        audio_input.change(
            lambda x: x,
            inputs=[audio_input],
            outputs=[original_audio]
        )
-
+
    return app
 
 # Launch the app
@@ -537,4 +541,4 @@ if __name__ == "__main__":
        server_port=7860,
        share=True,
        show_error=True
-    )
+    )