import gradio as gr
import torch
import librosa
import numpy as np
import soundfile as sf
import os
import tempfile
from pathlib import Path
import json
from typing import Tuple, Optional
import subprocess
import shutil
import warnings
warnings.filterwarnings("ignore")
# NLTK download for 'punkt' tokenizer data
# (nltk.data.find raises LookupError when the data is missing)
import nltk
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
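# Note (an assumption about newer NLTK releases): recent versions moved the
# pretrained sentence tokenizer tables to a separate 'punkt_tab' package; if
# tokenization still raises LookupError at runtime, repeating this
# find/download pattern with 'punkt_tab' may be needed.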
# Import audio processing libraries
try:
    from demucs.pretrained import get_model
    from demucs.apply import apply_model
    DEMUCS_AVAILABLE = True
except ImportError:
    DEMUCS_AVAILABLE = False
    print("Demucs not available, using basic separation")
try:
    import so_vits_svc_fork as svc
    SVC_AVAILABLE = True
except ImportError:
    SVC_AVAILABLE = False
    print("SVC not available, using basic voice conversion")
class AICoverGenerator:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.temp_dir = tempfile.mkdtemp()
        self.voice_models = {
            "drake": "Drake Style Voice",
            "ariana": "Ariana Style Voice",
            "weeknd": "The Weeknd Style Voice",
            "taylor": "Taylor Swift Style Voice",
            "custom": "Custom Voice Model"
        }
        # Initialize audio separation model
        if DEMUCS_AVAILABLE:
            try:
                self.separation_model = get_model('htdemucs')
                self.separation_model.to(self.device)
            except Exception as e:
                print(f"Error loading Demucs: {e}")
                self.separation_model = None
        else:
            self.separation_model = None
    def separate_vocals(self, audio_path: str) -> Tuple[str, str]:
        """Separate vocals and instrumentals from audio"""
        try:
            if self.separation_model and DEMUCS_AVAILABLE:
                # Use Demucs for high-quality separation (loads its own audio)
                return self._demucs_separate(audio_path)
            else:
                # Fall back to basic frequency-band masking
                audio, sr = librosa.load(audio_path, sr=44100, mono=False)
                return self._basic_separate(audio, sr)
        except Exception as e:
            print(f"Error in vocal separation: {e}")
            return None, None
    def _demucs_separate(self, audio_path: str) -> Tuple[str, str]:
        """Use Demucs for audio separation"""
        try:
            # Load audio for Demucs (stereo, 44.1 kHz)
            audio, sr = librosa.load(audio_path, sr=44100, mono=False)
            if audio.ndim == 1:
                audio = np.stack([audio, audio])
            # Convert to tensor with a batch dimension
            audio_tensor = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
            # Apply separation
            with torch.no_grad():
                sources = apply_model(self.separation_model, audio_tensor)
            # htdemucs source order is [drums, bass, other, vocals]
            vocals = sources[0, 3].cpu().numpy()
            instrumental = sources[0, :3].sum(dim=0).cpu().numpy()  # drums + bass + other
            # Save separated audio
            vocals_path = os.path.join(self.temp_dir, "vocals.wav")
            instrumental_path = os.path.join(self.temp_dir, "instrumental.wav")
            sf.write(vocals_path, vocals.T, 44100)
            sf.write(instrumental_path, instrumental.T, 44100)
            return vocals_path, instrumental_path
        except Exception as e:
            print(f"Demucs separation error: {e}")
            # Fall back to the basic method; reload here since `audio` may be
            # unbound if the failure happened during loading
            audio, sr = librosa.load(audio_path, sr=44100, mono=True)
            return self._basic_separate(audio, sr)
    def _basic_separate(self, audio: np.ndarray, sr: int) -> Tuple[str, str]:
        """Basic vocal separation using a crude frequency-band mask"""
        try:
            # Convert to mono if stereo
            if audio.ndim > 1:
                audio = librosa.to_mono(audio)
            # Compute STFT (shape: freq_bins x time_frames)
            stft = librosa.stft(audio, n_fft=2048, hop_length=512)
            magnitude, phase = np.abs(stft), np.angle(stft)
            # Simple vocal isolation: vocals concentrate in the mid band, so
            # attenuate the lowest and highest quarters of the frequency axis.
            # This is a basic approach - a real implementation would be more sophisticated.
            n_bins = magnitude.shape[0]
            vocal_mask = np.ones_like(magnitude)
            vocal_mask[:n_bins // 4, :] *= 0.3  # reduce low frequencies
            vocal_mask[3 * n_bins // 4:, :] *= 0.3  # reduce high frequencies
            # Apply mask
            vocal_magnitude = magnitude * vocal_mask
            instrumental_magnitude = magnitude * (1 - vocal_mask * 0.7)
            # Reconstruct audio
            vocal_stft = vocal_magnitude * np.exp(1j * phase)
            instrumental_stft = instrumental_magnitude * np.exp(1j * phase)
            vocals = librosa.istft(vocal_stft, hop_length=512)
            instrumental = librosa.istft(instrumental_stft, hop_length=512)
            # Save files
            vocals_path = os.path.join(self.temp_dir, "vocals.wav")
            instrumental_path = os.path.join(self.temp_dir, "instrumental.wav")
            sf.write(vocals_path, vocals, sr)
            sf.write(instrumental_path, instrumental, sr)
            return vocals_path, instrumental_path
        except Exception as e:
            print(f"Basic separation error: {e}")
            return None, None
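    # A slightly stronger fallback, offered only as a sketch (not wired into
    # separate_vocals): librosa's harmonic/percussive separation often isolates
    # voice better than the fixed band mask above, under the same assumptions
    # (mono float audio, files written to self.temp_dir).
    def _hpss_separate(self, audio: np.ndarray, sr: int) -> Tuple[str, str]:
        """Fallback separation via harmonic/percussive source separation"""
        # Vocals are mostly harmonic; drums and transients are percussive
        harmonic, percussive = librosa.effects.hpss(audio)
        vocals_path = os.path.join(self.temp_dir, "vocals_hpss.wav")
        instrumental_path = os.path.join(self.temp_dir, "instrumental_hpss.wav")
        sf.write(vocals_path, harmonic, sr)
        sf.write(instrumental_path, percussive, sr)
        return vocals_path, instrumental_path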
    def convert_voice(self, vocals_path: str, voice_model: str, pitch_shift: int = 0, voice_strength: float = 0.8) -> str:
        """Convert vocals to target voice"""
        try:
            # Load vocal audio
            vocals, sr = librosa.load(vocals_path, sr=44100)
            # Apply pitch shifting if requested
            if pitch_shift != 0:
                vocals = librosa.effects.pitch_shift(vocals, sr=sr, n_steps=pitch_shift)
            # Simulate voice conversion (a real app would use trained models)
            converted_vocals = self._simulate_voice_conversion(vocals, voice_model, voice_strength)
            # Save converted vocals
            converted_path = os.path.join(self.temp_dir, "converted_vocals.wav")
            sf.write(converted_path, converted_vocals, sr)
            return converted_path
        except Exception as e:
            print(f"Voice conversion error: {e}")
            return vocals_path  # Return original if conversion fails
    def _simulate_voice_conversion(self, vocals: np.ndarray, voice_model: str, strength: float) -> np.ndarray:
        """Simulate voice conversion (placeholder for actual model inference)"""
        # This is a simplified simulation - a real implementation would run a
        # trained model (e.g. so-vits-svc when SVC_AVAILABLE) instead of DSP tricks.
        # Keep the untouched input so we can blend it back in at the end
        original = vocals.copy()
        # Apply different effects based on voice model
        if voice_model == "drake":
            # Simulate Drake's voice characteristics
            vocals = self._apply_voice_characteristics(vocals,
                                                       pitch_factor=0.85,
                                                       formant_shift=-0.1,
                                                       roughness=0.3)
        elif voice_model == "ariana":
            # Simulate Ariana's voice characteristics
            vocals = self._apply_voice_characteristics(vocals,
                                                       pitch_factor=1.2,
                                                       formant_shift=0.2,
                                                       breathiness=0.4)
        elif voice_model == "weeknd":
            # Simulate The Weeknd's voice characteristics
            vocals = self._apply_voice_characteristics(vocals,
                                                       pitch_factor=0.9,
                                                       formant_shift=-0.05,
                                                       reverb=0.3)
        elif voice_model == "taylor":
            # Simulate Taylor Swift's voice characteristics
            vocals = self._apply_voice_characteristics(vocals,
                                                       pitch_factor=1.1,
                                                       formant_shift=0.1,
                                                       clarity=0.8)
        # Crossfade between processed and original vocals based on strength
        # (STFT round-trips can change length slightly, so trim both)
        min_len = min(len(vocals), len(original))
        return vocals[:min_len] * strength + original[:min_len] * (1 - strength)
    def _apply_voice_characteristics(self, vocals: np.ndarray, **kwargs) -> np.ndarray:
        """Apply voice characteristics transformation"""
        sr = 44100
        # Apply pitch factor (convert frequency ratio to semitones)
        if 'pitch_factor' in kwargs and kwargs['pitch_factor'] != 1.0:
            vocals = librosa.effects.pitch_shift(vocals, sr=sr,
                                                 n_steps=12 * np.log2(kwargs['pitch_factor']))
        # Apply formant shifting (simplified)
        if 'formant_shift' in kwargs:
            # A simplified formant shift - a real implementation would be more complex
            stft = librosa.stft(vocals)
            magnitude = np.abs(stft)
            phase = np.angle(stft)
            # Shift formants by stretching the frequency axis
            shift_factor = 1 + kwargs['formant_shift']
            shifted_magnitude = np.zeros_like(magnitude)
            for i in range(magnitude.shape[0]):
                shifted_idx = int(i * shift_factor)
                if shifted_idx < magnitude.shape[0]:
                    shifted_magnitude[shifted_idx] = magnitude[i]
            shifted_stft = shifted_magnitude * np.exp(1j * phase)
            vocals = librosa.istft(shifted_stft)
        # Apply effects
        if 'roughness' in kwargs:
            # Add slight distortion for roughness
            vocals = np.tanh(vocals * (1 + kwargs['roughness']))
        if 'breathiness' in kwargs:
            # Add noise for breathiness
            noise = np.random.normal(0, 0.01, vocals.shape)
            vocals = vocals + noise * kwargs['breathiness']
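        # 'reverb' and 'clarity' are accepted above but are currently no-ops.
        # One hedged sketch for reverb: convolve with a decaying-noise impulse
        # response, then mix wet/dry by kwargs['reverb'], e.g.
        #   n = int(0.3 * sr)
        #   ir = np.random.randn(n) * np.exp(-np.linspace(0.0, 6.0, n))
        #   wet = np.convolve(vocals, ir, mode="same")
        #   vocals = wet * kwargs['reverb'] + vocals * (1 - kwargs['reverb'])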
        return vocals
    def mix_audio(self, instrumental_path: str, vocals_path: str, vocal_volume: float = 1.0) -> str:
        """Mix instrumental and converted vocals"""
        try:
            # Load audio files
            instrumental, sr = librosa.load(instrumental_path, sr=44100)
            vocals, _ = librosa.load(vocals_path, sr=44100)
            # Trim both to the same length
            min_len = min(len(instrumental), len(vocals))
            instrumental = instrumental[:min_len]
            vocals = vocals[:min_len]
            # Mix audio
            mixed = instrumental + vocals * vocal_volume
            # Normalize to prevent clipping
            max_amplitude = np.max(np.abs(mixed))
            if max_amplitude > 0.95:
                mixed = mixed / max_amplitude * 0.95
            # Save mixed audio
            output_path = os.path.join(self.temp_dir, "final_cover.wav")
            sf.write(output_path, mixed, sr)
            return output_path
        except Exception as e:
            print(f"Audio mixing error: {e}")
            return None
    def process_custom_voice(self, voice_samples: list) -> str:
        """Process custom voice samples for training"""
        if not voice_samples:
            return "No voice samples provided"
        try:
            # In a real implementation, this would train a voice model;
            # for the demo we just validate the total sample duration
            total_duration = 0
            for sample in voice_samples:
                if sample is not None:
                    audio, sr = librosa.load(sample, sr=44100)
                    total_duration += len(audio) / sr
            if total_duration < 30:
                return "Need at least 30 seconds of voice samples"
            elif total_duration > 300:
                return "Voice samples too long (max 5 minutes)"
            else:
                return f"Custom voice model ready!\n({total_duration:.1f}s of training data)"
        except Exception as e:
            return f"Error processing voice samples: {e}"

# Initialize the AI Cover Generator
cover_generator = AICoverGenerator()
def generate_cover(
    audio_file,
    voice_model: str,
    pitch_shift: int = 0,
    voice_strength: float = 80,
    auto_tune: bool = False,
    output_format: str = "wav"
):
    """Main generator function to produce an AI cover, yielding progress updates"""
    # Note: this is a generator, so every outcome must be yielded (a plain
    # `return value` inside a generator never reaches the Gradio outputs)
    if audio_file is None:
        yield None, "Please upload an audio file"
        return
    try:
        # Step 1: Separate vocals and instrumentals
        yield None, "🎵 Separating vocals and instrumentals..."
        # gr.Audio(type="filepath") passes a plain path string, not a file object
        vocals_path, instrumental_path = cover_generator.separate_vocals(audio_file)
        if vocals_path is None:
            yield None, "❌ Failed to separate vocals"
            return
        # Step 2: Convert vocals to target voice
        # Map the dropdown's display label back to its model key (e.g. "drake")
        voice_key = next(
            (k for k, v in cover_generator.voice_models.items() if v == voice_model),
            voice_model,
        )
        yield None, f"🎤 Converting vocals to {voice_model} style..."
        converted_vocals_path = cover_generator.convert_voice(
            vocals_path,
            voice_key,
            pitch_shift,
            voice_strength / 100
        )
        # Step 3: Apply auto-tune if requested
        if auto_tune:
            yield None, "🎼 Applying auto-tune..."
            # Auto-tune implementation would go here
            pass
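            # A hedged sketch (not implemented here): estimate f0 per frame
            # with librosa.pyin, snap each frame to the nearest semitone, and
            # pitch-shift frame-wise toward the snapped track, e.g.
            #   f0, _, _ = librosa.pyin(vocals, fmin=65.0, fmax=1047.0, sr=sr)
            #   target = 440.0 * 2 ** (np.round(12 * np.log2(f0 / 440.0)) / 12)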
        # Step 4: Mix final audio
        yield None, "🎧 Mixing final audio..."
        final_path = cover_generator.mix_audio(instrumental_path, converted_vocals_path)
        if final_path is None:
            yield None, "❌ Failed to mix audio"
            return
        # Convert to requested format if needed
        if output_format != "wav":
            yield None, f"💾 Converting to {output_format.upper()}..."
            # Format conversion would go here
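            # A hedged option (assumes an ffmpeg binary is on PATH; check with
            # shutil.which("ffmpeg") first), e.g.
            #   converted = final_path.rsplit(".", 1)[0] + f".{output_format}"
            #   subprocess.run(["ffmpeg", "-y", "-i", final_path, converted], check=True)
            #   final_path = converted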
        yield final_path, "✅ AI Cover generated successfully!"
    except Exception as e:
        yield None, f"❌ Error: {str(e)}"
def process_voice_samples(voice_files) -> str:
    """Process uploaded voice samples for custom voice training"""
    if not voice_files:
        return "No voice samples uploaded"
    return cover_generator.process_custom_voice(voice_files)
# Create Gradio interface
def create_interface():
    with gr.Blocks(
        title="🎵 AI Cover Song Platform",
        # Removed theme=gr.themes.Soft for compatibility with Gradio versions < 4.0.0 (as per requirements.txt change)
        css="""
        .gradio-container {
            font-family: 'Inter', sans-serif;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        }
        .main-header {
            text-align: center;
            padding: 2rem;
            background: rgba(255, 255, 255, 0.1);
            backdrop-filter: blur(10px);
            border-radius: 20px;
            margin: 1rem;
        }
        .step-container {
            background: rgba(255, 255, 255, 0.05);
            backdrop-filter: blur(10px);
            border-radius: 15px;
            padding: 1.5rem;
            margin: 1rem 0;
            border: 1px solid rgba(255, 255, 255, 0.1);
        }
        """
    ) as app:
        # Header
        with gr.Row():
            gr.Markdown("""
            <div class="main-header">
                <h1 style="font-size: 3rem; margin-bottom: 1rem;">🎵 AI Cover Song Platform</h1>
                <p style="font-size: 1.2rem; opacity: 0.9;">Transform any song with AI voice synthesis</p>
                <div style="margin-top: 1rem;">
                    <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; margin: 0 0.5rem;">🎵 Voice Separation</span>
                    <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; margin: 0 0.5rem;">🎤 Voice Cloning</span>
                    <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; margin: 0 0.5rem;">🎧 High Quality Audio</span>
                </div>
            </div>
            """)
        # Step 1: Upload Audio
        with gr.Row():
            with gr.Column():
                gr.Markdown("## 🎵 Step 1: Upload Your Song")
                audio_input = gr.Audio(
                    label="Upload Audio File",
                    type="filepath",
                    format="wav"
                )
                gr.Markdown("*Supports MP3, WAV, FLAC files*")
        # Step 2: Voice Selection
        with gr.Row():
            with gr.Column():
                gr.Markdown("## 🎤 Step 2: Choose Voice Model")
                voice_model = gr.Dropdown(
                    choices=list(cover_generator.voice_models.values()),
                    label="Voice Model",
                    value="Drake Style Voice",
                    interactive=True
                )
                # Custom voice training section
                with gr.Accordion("🎙️ Train Custom Voice (Optional)", open=False):
                    voice_samples = gr.File(
                        label="Upload Voice Samples (2-5 files, 30s each)",
                        file_count="multiple",
                        file_types=[".wav", ".mp3"]
                    )
                    train_btn = gr.Button("Train Custom Voice", variant="secondary")
                    training_status = gr.Textbox(label="Training Status", interactive=False)
                    train_btn.click(
                        process_voice_samples,
                        inputs=[voice_samples],
                        outputs=[training_status]
                    )
        # Step 3: Audio Settings
        with gr.Row():
            with gr.Column():
                gr.Markdown("## ⚙️ Step 3: Audio Settings")
                with gr.Row():
                    pitch_shift = gr.Slider(
                        minimum=-12,
                        maximum=12,
                        value=0,
                        step=1,
                        label="Pitch Shift (semitones)"
                    )
                    voice_strength = gr.Slider(
                        minimum=0,
                        maximum=100,
                        value=80,
                        step=5,
                        label="Voice Strength (%)"
                    )
                with gr.Row():
                    auto_tune = gr.Checkbox(label="Apply Auto-tune", value=False)
                    output_format = gr.Dropdown(
                        choices=["wav", "mp3", "flac"],
                        label="Output Format",
                        value="wav"
                    )
        # Step 4: Generate Cover
        with gr.Row():
            with gr.Column():
                gr.Markdown("## 🎧 Step 4: Generate Cover")
                generate_btn = gr.Button(
                    "🎵 Generate AI Cover",
                    variant="primary",
                    size="lg"
                )
                progress_text = gr.Textbox(
                    label="Progress",
                    value="Ready to generate cover...",
                    interactive=False
                )
        # Results
        with gr.Row():
            with gr.Column():
                gr.Markdown("## 🎉 Results")
                with gr.Row():
                    original_audio = gr.Audio(label="Original Song", interactive=False)
                    cover_audio = gr.Audio(label="AI Cover", interactive=False)
        # Legal Notice
        with gr.Row():
            gr.Markdown("""
            <div style="background: rgba(255, 193, 7, 0.1);
                        border: 1px solid rgba(255, 193, 7, 0.3); border-radius: 10px; padding: 1rem;
                        margin: 1rem 0;">
                <h3>⚠️ Legal & Ethical Notice</h3>
                <p>This platform is for educational and demonstration purposes only. Voice cloning technology should be used responsibly.
                Always obtain proper consent before cloning someone's voice. Do not use this tool to create misleading or harmful content.
                Respect copyright laws and artist rights.</p>
            </div>
            """)
        # Event handlers
        generate_btn.click(
            generate_cover,
            inputs=[
                audio_input,
                voice_model,
                pitch_shift,
                voice_strength,
                auto_tune,
                output_format
            ],
            outputs=[cover_audio, progress_text]
        )
        # Update original audio player when a file is uploaded
        audio_input.change(
            lambda x: x,
            inputs=[audio_input],
            outputs=[original_audio]
        )
    return app
# Launch the app
if __name__ == "__main__":
    app = create_interface()
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        show_error=True
    )