# NOTE(review): the three lines below were Hugging Face Spaces page chrome
# ("Spaces: / Sleeping / Sleeping") captured by the scrape — not program
# source. Kept as a comment so the module parses.
"""Audio Singing Helper — Gradio app for source separation, vocal effects and style coaching."""

# Standard library
import os
import shutil
import tempfile
import warnings
from pathlib import Path

# Third-party (required)
import gradio as gr
import librosa
import numpy as np
import soundfile as sf

# librosa/audioread emit frequent deprecation and runtime warnings that
# would clutter the Space logs; silence them globally.
warnings.filterwarnings("ignore")

# Optional dependency: Spleeter powers vocal/instrument source separation.
# The app degrades gracefully when it is missing (flag checked at call sites).
try:
    from spleeter.separator import Separator
    SPLEETER_AVAILABLE = True
except ImportError:
    SPLEETER_AVAILABLE = False
    print("Spleeter not available - source separation disabled")

# Optional dependencies: scipy (reverb convolution) and dtw (style analysis).
try:
    import scipy.signal
    from scipy.spatial.distance import euclidean
    from dtw import dtw
    ADVANCED_FEATURES = True
except ImportError:
    ADVANCED_FEATURES = False
    print("Advanced features not available")
class AudioEngine:
    """Clean, professional audio processing engine.

    Wraps librosa feature analysis, Spleeter source separation and simple
    vocal effects.  All intermediate files live in a private temp directory
    that is removed by ``cleanup()``.  Every public method returns a dict
    with ``'success': True`` plus results, or ``'success': False`` and an
    ``'error'`` message — callers never see exceptions.
    """

    def __init__(self):
        # Working directory for separated stems and processed audio.
        self.temp_dir = tempfile.mkdtemp()
        # Cache of Spleeter Separator instances keyed by model type
        # ("2stems"/"4stems") — constructing one loads model weights.
        self.separators = {}

    @staticmethod
    def _dominant_pitches(y, sr):
        """Return per-frame dominant pitch values (Hz) via librosa.piptrack.

        For each frame, keeps the pitch of the strongest-magnitude bin,
        skipping unvoiced frames (pitch == 0).  Shared by analyze_audio()
        and extract_vocal_features(), which previously duplicated this loop.
        """
        pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
        values = []
        for t in range(pitches.shape[1]):
            index = magnitudes[:, t].argmax()
            pitch = pitches[index, t]
            if pitch > 0:
                values.append(pitch)
        return values

    def analyze_audio(self, audio_path):
        """Extract comprehensive audio features from the file at audio_path.

        Returns duration, tempo, sample rate, spectral statistics, RMS
        energy, average dominant pitch and beat count (values rounded for
        display), or an error dict on failure.
        """
        try:
            y, sr = librosa.load(audio_path)
            duration = len(y) / sr
            tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
            # BUG FIX: librosa >= 0.10 returns tempo as a 1-element ndarray;
            # round() on it then produces an array (or raises) — coerce first.
            tempo = float(np.atleast_1d(tempo)[0])
            spectral_centroid = float(np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)))
            spectral_rolloff = float(np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr)))
            zero_crossing_rate = float(np.mean(librosa.feature.zero_crossing_rate(y)))
            rms_energy = float(np.mean(librosa.feature.rms(y=y)))
            pitch_values = self._dominant_pitches(y, sr)
            avg_pitch = float(np.mean(pitch_values)) if pitch_values else 0
            return {
                'success': True,
                'duration': round(duration, 2),
                'tempo': round(tempo, 1),
                'sample_rate': sr,
                'spectral_centroid': round(spectral_centroid, 2),
                'spectral_rolloff': round(spectral_rolloff, 2),
                'zero_crossing_rate': round(zero_crossing_rate, 4),
                'rms_energy': round(rms_energy, 4),
                'average_pitch': round(avg_pitch, 2),
                'pitch_count': len(pitch_values),
                'beats_detected': len(beats)
            }
        except Exception as e:
            return {'success': False, 'error': str(e)}

    def separate_vocals(self, audio_path, model_type="2stems"):
        """Separate audio into stems using Spleeter.

        model_type: "2stems" (vocals + accompaniment) or "4stems"
        (vocals + drums + bass + other).  Returns a dict mapping each stem
        name to its WAV path (None when Spleeter did not produce it), or an
        error dict.
        """
        if not SPLEETER_AVAILABLE:
            return {'success': False, 'error': 'Spleeter not available'}
        stem_names = {
            '2stems': ('vocals', 'accompaniment'),
            '4stems': ('vocals', 'drums', 'bass', 'other'),
        }
        # BUG FIX: the original fell through its if/elif and implicitly
        # returned None for an unknown model_type; fail fast instead.
        if model_type not in stem_names:
            return {'success': False, 'error': f'Unknown model type: {model_type}'}
        try:
            # Load the separator lazily and cache it — model load is slow.
            if model_type not in self.separators:
                self.separators[model_type] = Separator(f'spleeter:{model_type}-16kHz')
            separator = self.separators[model_type]
            # BUG FIX: mkdtemp guarantees a unique directory; the original
            # used np.random.randint(10000), which can collide across calls.
            output_dir = tempfile.mkdtemp(prefix="separation_", dir=self.temp_dir)
            separator.separate_to_file(audio_path, output_dir)
            # Spleeter writes stems into <output_dir>/<input file stem>/.
            result_dir = os.path.join(output_dir, Path(audio_path).stem)
            result = {'success': True}
            for stem in stem_names[model_type]:
                stem_path = os.path.join(result_dir, f"{stem}.wav")
                result[stem] = stem_path if os.path.exists(stem_path) else None
            return result
        except Exception as e:
            return {'success': False, 'error': str(e)}

    def apply_effects(self, audio_path, pitch_shift=0, reverb=0):
        """Apply vocal effects and write the result to a temp WAV file.

        pitch_shift: semitones (positive or negative); reverb: wet amount
        (0 disables; requires scipy).  Returns {'success': True, 'output':
        path} or an error dict.
        """
        try:
            y, sr = librosa.load(audio_path)
            if pitch_shift != 0:
                y = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch_shift)
            if reverb > 0 and ADVANCED_FEATURES:
                # Synthetic impulse response: ~0.5 s of exponentially
                # decaying white noise, scaled by the wet amount.
                reverb_length = int(0.5 * sr)
                impulse = np.random.randn(reverb_length) * np.exp(-np.arange(reverb_length) / (sr * 0.1))
                y = scipy.signal.convolve(y, impulse * reverb, mode='same')
                # BUG FIX: guard against an all-zero signal — the original
                # divided by the peak unconditionally (0/0 -> NaN output).
                peak = np.max(np.abs(y))
                if peak > 0:
                    y = y / peak  # Normalize to [-1, 1]
            # BUG FIX: mkstemp yields a unique path; the original's
            # np.random.randint-based name could silently overwrite output.
            fd, output_path = tempfile.mkstemp(prefix="processed_", suffix=".wav", dir=self.temp_dir)
            os.close(fd)  # sf.write reopens by path
            sf.write(output_path, y, sr)
            return {'success': True, 'output': output_path}
        except Exception as e:
            return {'success': False, 'error': str(e)}

    def extract_vocal_features(self, audio_path):
        """Extract the vocal metrics used by style coaching.

        Returns mean/std/range of the dominant pitch, tempo, spectral
        centroid and RMS energy, or an error dict when no voiced frames
        are found.
        """
        try:
            y, sr = librosa.load(audio_path)
            pitch_values = self._dominant_pitches(y, sr)
            if not pitch_values:
                return {'success': False, 'error': 'No pitch detected'}
            tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
            return {
                'success': True,
                'mean_pitch': np.mean(pitch_values),
                'pitch_std': np.std(pitch_values),
                'pitch_range': max(pitch_values) - min(pitch_values),
                # BUG FIX: coerce — librosa >= 0.10 returns an ndarray tempo.
                'tempo': float(np.atleast_1d(tempo)[0]),
                'spectral_centroid': np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)),
                'rms_energy': np.mean(librosa.feature.rms(y=y))
            }
        except Exception as e:
            return {'success': False, 'error': str(e)}

    def compare_vocal_styles(self, user_features, reference_features_list):
        """Compare user vocal features against averaged reference features.

        Produces human-readable feedback lines, per-metric absolute
        differences, and an overall score in [0, 100] (higher = closer to
        the reference style).
        """
        if not ADVANCED_FEATURES:
            return {'success': False, 'error': 'Advanced features not available'}
        try:
            # Average each metric over however many references provide it.
            ref_avg = {}
            for key in ['mean_pitch', 'pitch_std', 'pitch_range', 'tempo',
                        'spectral_centroid', 'rms_energy']:
                values = [ref[key] for ref in reference_features_list if key in ref]
                ref_avg[key] = np.mean(values) if values else 0
            pitch_diff = abs(user_features['mean_pitch'] - ref_avg['mean_pitch'])
            tempo_diff = abs(user_features['tempo'] - ref_avg['tempo'])
            timbre_diff = abs(user_features['spectral_centroid'] - ref_avg['spectral_centroid'])
            energy_diff = abs(user_features['rms_energy'] - ref_avg['rms_energy'])
            # Heuristic thresholds: 50 Hz pitch, 10 BPM tempo, 500 Hz
            # centroid, 0.1 RMS energy.
            feedback = []
            if pitch_diff > 50:
                feedback.append(f"🎵 Pitch: Your average pitch differs by {pitch_diff:.1f} Hz. Practice matching the reference key.")
            else:
                feedback.append("🎵 Pitch: Good pitch accuracy!")
            if tempo_diff > 10:
                feedback.append(f"⏱️ Tempo: Your tempo differs by {tempo_diff:.1f} BPM. Work on timing consistency.")
            else:
                feedback.append("⏱️ Tempo: Good timing!")
            if timbre_diff > 500:
                feedback.append("🗣️ Timbre: Try adjusting your vocal tone to match the reference style.")
            else:
                feedback.append("🗣️ Timbre: Good vocal tone match!")
            if energy_diff > 0.1:
                feedback.append("🔊 Energy: Adjust your vocal intensity to match the reference.")
            else:
                feedback.append("🔊 Energy: Good energy level!")
            # Weighted penalty sum, clamped below at 0.
            overall_score = max(0, 100 - (pitch_diff/2 + tempo_diff + timbre_diff/10 + energy_diff*100))
            return {
                'success': True,
                'score': round(overall_score, 1),
                'feedback': feedback,
                'metrics': {
                    'pitch_diff': round(pitch_diff, 1),
                    'tempo_diff': round(tempo_diff, 1),
                    'timbre_diff': round(timbre_diff, 1),
                    'energy_diff': round(energy_diff, 3)
                }
            }
        except Exception as e:
            return {'success': False, 'error': str(e)}

    def cleanup(self):
        """Remove the engine's temporary working directory (best effort).

        Never raises — ignore_errors covers both a missing directory and
        files held open on Windows.
        """
        shutil.rmtree(self.temp_dir, ignore_errors=True)
# Global engine instance
# Shared by every Gradio callback below.  Its temp directory lives for the
# whole process; cleanup() is not wired to app shutdown — TODO confirm that
# is acceptable for the deployment target.
engine = AudioEngine()
def format_analysis_results(analysis):
    """Render an AudioEngine.analyze_audio() result dict as display text.

    Returns a single error line when the analysis failed, otherwise a
    multi-section, emoji-headed report string.
    """
    if not analysis['success']:
        return f"❌ Analysis failed: {analysis['error']}"
    report_lines = [
        "📊 Audio Analysis Results",
        "🎵 Basic Properties:",
        f"• Duration: {analysis['duration']} seconds",
        f"• Sample Rate: {analysis['sample_rate']} Hz",
        f"• Tempo: {analysis['tempo']} BPM",
        "📈 Audio Characteristics:",
        f"• Spectral Centroid: {analysis['spectral_centroid']} Hz",
        f"• Spectral Rolloff: {analysis['spectral_rolloff']} Hz",
        f"• Zero Crossing Rate: {analysis['zero_crossing_rate']}",
        f"• RMS Energy: {analysis['rms_energy']}",
        "🎤 Vocal Information:",
        f"• Average Pitch: {analysis['average_pitch']} Hz",
        f"• Pitch Points Detected: {analysis['pitch_count']}",
        f"• Beats Detected: {analysis['beats_detected']}",
    ]
    return "\n".join(report_lines)
def process_audio_separation(audio_file, separation_mode):
    """Gradio callback: analyze the upload, then split it into stems.

    Returns a 6-tuple: (status message, vocals, instrumental/drums, bass,
    other, analysis text).  Stem slots not produced by the chosen mode are
    None.
    """
    no_stems = (None, None, None, None)
    if not audio_file:
        return ("❌ Please upload an audio file",) + no_stems + ("",)
    if not SPLEETER_AVAILABLE:
        return ("❌ Spleeter not available for source separation",) + no_stems + ("",)
    try:
        # Analyze the original file up front so the report is shown even
        # when separation succeeds or fails later.
        report = format_analysis_results(engine.analyze_audio(audio_file))
        stems = "2stems" if "2-stem" in separation_mode else "4stems"
        result = engine.separate_vocals(audio_file, stems)
        if not result['success']:
            return (f"❌ Separation failed: {result['error']}",) + no_stems + (report,)
        if stems == "2stems":
            return ("✅ 2-stem separation completed successfully!",
                    result.get('vocals'), result.get('accompaniment'),
                    None, None, report)
        return ("✅ 4-stem separation completed successfully!",
                result.get('vocals'), result.get('drums'),
                result.get('bass'), result.get('other'), report)
    except Exception as e:
        return (f"❌ Processing error: {str(e)}",) + no_stems + ("",)
def process_vocal_effects(audio_file, pitch_shift, reverb_amount):
    """Gradio callback: run pitch-shift/reverb on an upload.

    Returns (status message, processed audio path or None, analysis text).
    """
    if not audio_file:
        return "❌ Please upload an audio file", None, ""
    try:
        # Report describes the ORIGINAL audio, not the processed output.
        report = format_analysis_results(engine.analyze_audio(audio_file))
        result = engine.apply_effects(audio_file, pitch_shift, reverb_amount)
        if not result['success']:
            return f"❌ Effects failed: {result['error']}", None, report
        applied = []
        if pitch_shift != 0:
            applied.append(f"Pitch: {pitch_shift:+.1f} semitones")
        if reverb_amount > 0:
            applied.append(f"Reverb: {reverb_amount:.2f}")
        if applied:
            status = f"✅ Effects applied: {', '.join(applied)}"
        else:
            status = "✅ Audio processed (no effects)"
        return status, result['output'], report
    except Exception as e:
        return f"❌ Processing error: {str(e)}", None, ""
def process_style_coaching(reference_files, user_audio):
    """Gradio callback: score the user's vocals against 2-5 references.

    Separates vocals from each reference and from the user's take,
    extracts vocal features, compares them, and returns
    (status, per-reference processing log, formatted coaching feedback).
    """
    if not reference_files or len(reference_files) < 2:
        return "❌ Upload at least 2 reference tracks", "", ""
    if not user_audio:
        return "❌ Please record or upload your performance", "", ""
    if not SPLEETER_AVAILABLE or not ADVANCED_FEATURES:
        return "❌ Style coaching requires advanced features", "", ""
    try:
        collected = []   # feature dicts from successfully processed refs
        progress = []    # one status line per reference (first five only)
        for idx, ref in enumerate(reference_files[:5], start=1):
            sep = engine.separate_vocals(ref.name, "2stems")
            if not (sep['success'] and sep.get('vocals')):
                progress.append(f"❌ Reference {idx}: Vocal separation failed")
                continue
            feats = engine.extract_vocal_features(sep['vocals'])
            if feats['success']:
                collected.append(feats)
                progress.append(f"✅ Reference {idx}: Processed")
            else:
                progress.append(f"❌ Reference {idx}: Feature extraction failed")
        progress_text = "\n".join(progress)
        if len(collected) < 2:
            return "❌ Need at least 2 valid reference tracks", progress_text, ""
        # Same pipeline for the user's performance.
        user_sep = engine.separate_vocals(user_audio, "2stems")
        if not user_sep['success'] or not user_sep.get('vocals'):
            return "❌ Could not separate vocals from your performance", progress_text, ""
        user_feats = engine.extract_vocal_features(user_sep['vocals'])
        if not user_feats['success']:
            return "❌ Could not analyze your vocal features", progress_text, ""
        comparison = engine.compare_vocal_styles(user_feats, collected)
        if not comparison['success']:
            return f"❌ Style comparison failed: {comparison['error']}", progress_text, ""
        score = comparison['score']
        if score > 80:
            advice = "🔥 Excellent! You're very close to the target style."
        elif score > 60:
            advice = "🎵 Good progress! Focus on the areas mentioned above."
        else:
            advice = "💪 Keep practicing! Work on basic vocal technique first."
        metrics = comparison['metrics']
        feedback_text = "\n".join([
            "🎯 Vocal Style Coaching Results",
            f"📊 Overall Score: {score}/100",
            "🎵 Detailed Feedback:",
            "\n".join(comparison['feedback']),
            "📈 Technical Metrics:",
            f"• Pitch Difference: {metrics['pitch_diff']} Hz",
            f"• Tempo Difference: {metrics['tempo_diff']} BPM",
            f"• Timbre Difference: {metrics['timbre_diff']} Hz",
            f"• Energy Difference: {metrics['energy_diff']}",
            "🎯 Recommendations:",
            advice,
            f"References analyzed: {len(collected)}/5",
        ])
        return f"✅ Style coaching complete! Score: {score}/100", progress_text, feedback_text
    except Exception as e:
        return f"❌ Coaching failed: {str(e)}", "", ""
# Create main interface
def create_app():
    """Build and return the Gradio Blocks application.

    Four feature tabs (separation, effects, live recording, coaching) plus
    a help tab; all click handlers are wired at the bottom of the Blocks
    context.  Pure UI construction — no audio work happens here.
    """
    with gr.Blocks(title="Audio Singing Helper") as app:
        # Gradient header banner shown above the tabs.
        gr.HTML("""
        <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 10px; margin-bottom: 20px;">
        <h1>🎤 Audio Singing Helper</h1>
        <p>Professional audio processing for singers and musicians</p>
        </div>
        """)
        with gr.Tabs():
            # Audio Separation Tab
            with gr.Tab("🎵 Audio Separation"):
                gr.Markdown("### Separate vocals from instrumental tracks")
                with gr.Row():
                    with gr.Column():
                        sep_audio_input = gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"])
                        sep_mode = gr.Dropdown(
                            choices=["2-stem (Vocals + Instrumental)", "4-stem (Vocals + Drums + Bass + Other)"],
                            value="2-stem (Vocals + Instrumental)",
                            label="Separation Mode"
                        )
                        sep_button = gr.Button("🎯 Separate Audio", variant="primary")
                    with gr.Column():
                        sep_status = gr.Textbox(label="Status", lines=2, interactive=False)
                        sep_analysis = gr.Textbox(label="Audio Analysis", lines=12, interactive=False)
                # Output slots: 2-stem mode fills only the first row; the
                # second slot doubles as drums in 4-stem mode (see label).
                with gr.Row():
                    sep_vocals = gr.Audio(label="🎤 Vocals", show_download_button=True)
                    sep_instrumental = gr.Audio(label="🎼 Instrumental/Drums", show_download_button=True)
                with gr.Row():
                    sep_bass = gr.Audio(label="🎸 Bass", show_download_button=True)
                    sep_other = gr.Audio(label="🎹 Other", show_download_button=True)
            # Vocal Effects Tab
            with gr.Tab("🎛️ Vocal Effects"):
                gr.Markdown("### Apply professional vocal effects")
                with gr.Row():
                    with gr.Column():
                        fx_audio_input = gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"])
                        fx_pitch = gr.Slider(-12, 12, 0, step=0.5, label="Pitch Shift (semitones)")
                        fx_reverb = gr.Slider(0, 0.5, 0, step=0.05, label="Reverb Amount")
                        fx_button = gr.Button("🎵 Apply Effects", variant="primary")
                    with gr.Column():
                        fx_status = gr.Textbox(label="Status", lines=2, interactive=False)
                        fx_analysis = gr.Textbox(label="Audio Analysis", lines=10, interactive=False)
                        fx_output = gr.Audio(label="🎧 Processed Audio", show_download_button=True)
            # Live Recording Tab
            with gr.Tab("🎙️ Live Recording"):
                gr.Markdown("### Record and process your voice in real-time")
                with gr.Row():
                    with gr.Column():
                        live_audio = gr.Audio(type="filepath", sources=["microphone"], label="Record Your Voice")
                        live_pitch = gr.Slider(-12, 12, 0, step=0.5, label="Pitch Correction")
                        live_reverb = gr.Slider(0, 0.5, 0, step=0.05, label="Reverb")
                        live_button = gr.Button("🎤 Process Recording", variant="primary")
                    with gr.Column():
                        live_status = gr.Textbox(label="Status", lines=2, interactive=False)
                        live_analysis = gr.Textbox(label="Recording Analysis", lines=10, interactive=False)
                        live_output = gr.Audio(label="🎧 Processed Recording", show_download_button=True)
            # Style Coaching Tab
            with gr.Tab("📊 Style Coaching"):
                gr.Markdown("### Get personalized vocal coaching feedback")
                with gr.Row():
                    with gr.Column():
                        coach_refs = gr.File(
                            label="Reference Tracks (2-5 files)",
                            file_count="multiple",
                            file_types=["audio"]
                        )
                        coach_user = gr.Audio(
                            type="filepath",
                            label="Your Performance",
                            sources=["upload", "microphone"]
                        )
                        coach_button = gr.Button("🎯 Get Coaching", variant="primary")
                    with gr.Column():
                        coach_status = gr.Textbox(label="Status", lines=3, interactive=False)
                        coach_refs_status = gr.Textbox(label="Reference Processing", lines=8, interactive=False)
                        coach_feedback = gr.Textbox(label="🎯 Coaching Feedback", lines=15, interactive=False)
            # Help Tab
            with gr.Tab("ℹ️ Help"):
                gr.Markdown("""
                # 🎤 Audio Singing Helper - User Guide
                ## Features
                ### 🎵 Audio Separation
                - Upload any song to separate vocals from instruments
                - Choose 2-stem (vocals + instrumental) or 4-stem (vocals + drums + bass + other)
                - Get detailed audio analysis of your tracks
                ### 🎛️ Vocal Effects
                - Apply pitch shifting (-12 to +12 semitones)
                - Add reverb for spatial depth
                - Process any audio file with professional effects
                ### 🎙️ Live Recording
                - Record directly from your microphone
                - Apply real-time pitch correction and reverb
                - Perfect for vocal practice and experimentation
                ### 📊 Style Coaching
                - Upload 2-5 reference tracks from artists you want to emulate
                - Record or upload your performance
                - Get AI-powered feedback on pitch, timing, and vocal characteristics
                - Receive a score and specific improvement suggestions
                ## Tips for Best Results
                - **Use high-quality audio files** - better input = better results
                - **Keep files under 5 minutes** for faster processing
                - **For style coaching**: Choose references from similar genres
                - **Record in quiet environments** for best analysis
                ## Supported Formats
                - Input: MP3, WAV, FLAC, M4A, OGG
                - Output: High-quality WAV files
                ## Technical Requirements
                - Some features require additional dependencies
                - Processing time varies based on file length and complexity
                ---
                Built for singers and musicians worldwide 🌍
                """)
        # Connect all the event handlers
        sep_button.click(
            process_audio_separation,
            inputs=[sep_audio_input, sep_mode],
            outputs=[sep_status, sep_vocals, sep_instrumental, sep_bass, sep_other, sep_analysis]
        )
        fx_button.click(
            process_vocal_effects,
            inputs=[fx_audio_input, fx_pitch, fx_reverb],
            outputs=[fx_status, fx_output, fx_analysis]
        )
        # The live-recording tab reuses the same effects pipeline as the
        # upload-based effects tab.
        live_button.click(
            process_vocal_effects,
            inputs=[live_audio, live_pitch, live_reverb],
            outputs=[live_status, live_output, live_analysis]
        )
        coach_button.click(
            process_style_coaching,
            inputs=[coach_refs, coach_user],
            outputs=[coach_status, coach_refs_status, coach_feedback]
        )
    return app
if __name__ == "__main__":
    # Build the Gradio UI and start the web server (blocks until exit).
    app = create_app()
    app.launch()