|
import gradio as gr |
|
import librosa |
|
import numpy as np |
|
import soundfile as sf |
|
import os |
|
import tempfile |
|
import shutil |
|
from pathlib import Path |
|
import warnings |
|
warnings.filterwarnings("ignore") |
|
|
|
|
|
# Optional dependency: Spleeter provides the source-separation features.
# The flag lets the rest of the app degrade gracefully (UI still loads,
# separation-dependent paths return an error dict) instead of crashing
# at import time.
try:
    from spleeter.separator import Separator
    SPLEETER_AVAILABLE = True
except ImportError:
    SPLEETER_AVAILABLE = False
    print("Spleeter not available - source separation disabled")
|
|
|
# Optional DSP dependencies: scipy.signal powers the reverb effect and
# gates compare_vocal_styles. `euclidean` and `dtw` are imported but not
# referenced in the code visible here — presumably intended for
# alignment-based comparison; verify before removing.
try:
    import scipy.signal
    from scipy.spatial.distance import euclidean
    from dtw import dtw
    ADVANCED_FEATURES = True
except ImportError:
    ADVANCED_FEATURES = False
    print("Advanced features not available")
|
|
|
class AudioEngine:
    """Clean, professional audio processing engine.

    Thin wrapper around librosa (analysis), Spleeter (source separation)
    and scipy (effects).  Every public method returns a result dict with
    a boolean 'success' key instead of raising, so UI callers can show
    the 'error' string directly.
    """

    # Stems each Spleeter model produces; doubles as the whitelist of
    # supported model types.
    _STEM_NAMES = {
        "2stems": ("vocals", "accompaniment"),
        "4stems": ("vocals", "drums", "bass", "other"),
    }

    def __init__(self):
        # Per-instance scratch directory; removed by cleanup().
        self.temp_dir = tempfile.mkdtemp()
        # Lazily instantiated Spleeter separators, keyed by model type.
        self.separators = {}

    @staticmethod
    def _extract_pitch_values(y, sr):
        """Return the dominant pitch (Hz) of each voiced frame.

        Frames where piptrack reports no energy peak (pitch == 0) are
        treated as unvoiced and skipped.  Shared by analyze_audio and
        extract_vocal_features (previously duplicated inline).
        """
        pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
        pitch_values = []
        for t in range(pitches.shape[1]):
            index = magnitudes[:, t].argmax()
            pitch = pitches[index, t]
            if pitch > 0:
                pitch_values.append(pitch)
        return pitch_values

    @staticmethod
    def _scalar_tempo(tempo):
        """Coerce librosa's tempo (scalar or 1-element ndarray) to float.

        Newer librosa versions return an ndarray from beat_track, which
        breaks round() and ``:.1f`` formatting downstream.
        """
        return float(np.atleast_1d(tempo)[0])

    def analyze_audio(self, audio_path):
        """Extract comprehensive audio features.

        Parameters:
            audio_path: path to any file librosa can decode.

        Returns:
            dict with 'success': True plus duration/tempo/spectral/pitch
            statistics, or {'success': False, 'error': str} on failure.
        """
        try:
            y, sr = librosa.load(audio_path)

            duration = len(y) / sr
            tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
            tempo = self._scalar_tempo(tempo)

            spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
            spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
            zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(y))
            rms_energy = np.mean(librosa.feature.rms(y=y))

            pitch_values = self._extract_pitch_values(y, sr)
            avg_pitch = np.mean(pitch_values) if pitch_values else 0

            return {
                'success': True,
                'duration': round(duration, 2),
                'tempo': round(tempo, 1),
                'sample_rate': sr,
                'spectral_centroid': round(spectral_centroid, 2),
                'spectral_rolloff': round(spectral_rolloff, 2),
                'zero_crossing_rate': round(zero_crossing_rate, 4),
                'rms_energy': round(rms_energy, 4),
                'average_pitch': round(avg_pitch, 2),
                'pitch_count': len(pitch_values),
                'beats_detected': len(beats)
            }

        except Exception as e:
            return {'success': False, 'error': str(e)}

    def separate_vocals(self, audio_path, model_type="2stems"):
        """Separate an audio file into stems using Spleeter.

        Parameters:
            audio_path: path of the mix to separate.
            model_type: "2stems" or "4stems".

        Returns:
            {'success': True, <stem>: path-or-None, ...} on success, or
            {'success': False, 'error': str} on failure.
        """
        if not SPLEETER_AVAILABLE:
            return {'success': False, 'error': 'Spleeter not available'}
        if model_type not in self._STEM_NAMES:
            # The original fell through and implicitly returned None here,
            # crashing callers that index result['success'].
            return {'success': False, 'error': f'Unknown model type: {model_type}'}

        try:
            # Separator start-up is expensive; cache one per model type.
            if model_type not in self.separators:
                self.separators[model_type] = Separator(f'spleeter:{model_type}-16kHz')
            separator = self.separators[model_type]

            # mkdtemp guarantees a unique directory (the previous random
            # integer suffix could collide between calls).
            output_dir = tempfile.mkdtemp(prefix="separation_", dir=self.temp_dir)
            separator.separate_to_file(audio_path, output_dir)

            # Spleeter writes <output_dir>/<input stem>/<stem>.wav
            result_dir = os.path.join(output_dir, Path(audio_path).stem)
            result = {'success': True}
            for stem in self._STEM_NAMES[model_type]:
                stem_path = os.path.join(result_dir, f"{stem}.wav")
                result[stem] = stem_path if os.path.exists(stem_path) else None
            return result

        except Exception as e:
            return {'success': False, 'error': str(e)}

    def apply_effects(self, audio_path, pitch_shift=0, reverb=0):
        """Apply vocal effects and write the result to a new wav file.

        Parameters:
            audio_path: input audio file.
            pitch_shift: semitones to shift (0 = no shift).
            reverb: wet amount; > 0 enables the synthetic reverb
                (requires scipy via ADVANCED_FEATURES).

        Returns:
            {'success': True, 'output': wav_path} or a failure dict.
        """
        try:
            y, sr = librosa.load(audio_path)

            if pitch_shift != 0:
                y = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch_shift)

            if reverb > 0 and ADVANCED_FEATURES:
                # Cheap reverb: convolve with 0.5 s of exponentially
                # decaying noise, then re-normalize the peak.
                reverb_length = int(0.5 * sr)
                impulse = np.random.randn(reverb_length) * np.exp(-np.arange(reverb_length) / (sr * 0.1))
                y = scipy.signal.convolve(y, impulse * reverb, mode='same')
                peak = np.max(np.abs(y))
                if peak > 0:  # guard against 0/0 on silent input
                    y = y / peak

            # mkstemp avoids the filename collisions possible with the
            # old random integer suffix.
            fd, output_path = tempfile.mkstemp(prefix="processed_", suffix=".wav", dir=self.temp_dir)
            os.close(fd)  # soundfile reopens the path itself
            sf.write(output_path, y, sr)

            return {'success': True, 'output': output_path}

        except Exception as e:
            return {'success': False, 'error': str(e)}

    def extract_vocal_features(self, audio_path):
        """Extract pitch/tempo/timbre statistics used for style coaching.

        Returns a failure dict when no voiced frames are found, since the
        comparison metrics would be meaningless without pitch data.
        """
        try:
            y, sr = librosa.load(audio_path)

            pitch_values = self._extract_pitch_values(y, sr)
            if not pitch_values:
                return {'success': False, 'error': 'No pitch detected'}

            tempo, _ = librosa.beat.beat_track(y=y, sr=sr)

            return {
                'success': True,
                'mean_pitch': np.mean(pitch_values),
                'pitch_std': np.std(pitch_values),
                'pitch_range': max(pitch_values) - min(pitch_values),
                'tempo': self._scalar_tempo(tempo),
                'spectral_centroid': np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)),
                'rms_energy': np.mean(librosa.feature.rms(y=y))
            }

        except Exception as e:
            return {'success': False, 'error': str(e)}

    def compare_vocal_styles(self, user_features, reference_features_list):
        """Compare user vocal features to the average of reference tracks.

        Parameters:
            user_features: dict produced by extract_vocal_features.
            reference_features_list: list of such dicts for the references.

        Returns:
            {'success': True, 'score': 0-100, 'feedback': [str],
             'metrics': {...}} or a failure dict.
        """
        if not ADVANCED_FEATURES:
            return {'success': False, 'error': 'Advanced features not available'}

        try:
            # Average each metric over the references; a metric missing
            # from every reference falls back to 0.
            ref_avg = {}
            for key in ['mean_pitch', 'pitch_std', 'pitch_range', 'tempo', 'spectral_centroid', 'rms_energy']:
                values = [ref[key] for ref in reference_features_list if key in ref]
                ref_avg[key] = np.mean(values) if values else 0

            pitch_diff = abs(user_features['mean_pitch'] - ref_avg['mean_pitch'])
            tempo_diff = abs(user_features['tempo'] - ref_avg['tempo'])
            timbre_diff = abs(user_features['spectral_centroid'] - ref_avg['spectral_centroid'])
            energy_diff = abs(user_features['rms_energy'] - ref_avg['rms_energy'])

            # Thresholds are heuristic: 50 Hz pitch, 10 BPM tempo,
            # 500 Hz centroid, 0.1 RMS.
            feedback = []

            if pitch_diff > 50:
                feedback.append(f"π΅ Pitch: Your average pitch differs by {pitch_diff:.1f} Hz. Practice matching the reference key.")
            else:
                feedback.append("π΅ Pitch: Good pitch accuracy!")

            if tempo_diff > 10:
                feedback.append(f"β±οΈ Tempo: Your tempo differs by {tempo_diff:.1f} BPM. Work on timing consistency.")
            else:
                feedback.append("β±οΈ Tempo: Good timing!")

            if timbre_diff > 500:
                feedback.append("π£οΈ Timbre: Try adjusting your vocal tone to match the reference style.")
            else:
                feedback.append("π£οΈ Timbre: Good vocal tone match!")

            if energy_diff > 0.1:
                feedback.append("π Energy: Adjust your vocal intensity to match the reference.")
            else:
                feedback.append("π Energy: Good energy level!")

            # Weighted penalty sum, clamped at 0.
            overall_score = max(0, 100 - (pitch_diff/2 + tempo_diff + timbre_diff/10 + energy_diff*100))

            return {
                'success': True,
                'score': round(overall_score, 1),
                'feedback': feedback,
                'metrics': {
                    'pitch_diff': round(pitch_diff, 1),
                    'tempo_diff': round(tempo_diff, 1),
                    'timbre_diff': round(timbre_diff, 1),
                    'energy_diff': round(energy_diff, 3)
                }
            }

        except Exception as e:
            return {'success': False, 'error': str(e)}

    def cleanup(self):
        """Best-effort removal of the scratch directory; never raises."""
        try:
            if os.path.exists(self.temp_dir):
                shutil.rmtree(self.temp_dir)
        except Exception:
            # Cleanup is best-effort; a busy/locked temp dir is not fatal.
            pass
|
|
|
|
|
# Shared module-level engine used by all Gradio callbacks below.
engine = AudioEngine()
|
|
|
def format_analysis_results(analysis):
    """Render an engine analysis dict as a human-readable report.

    Failure dicts produce a one-line error message; success dicts
    produce the multi-section summary shown in the UI text boxes.
    """
    if not analysis['success']:
        return f"β Analysis failed: {analysis['error']}"

    report = f"""π Audio Analysis Results

π΅ Basic Properties:
β’ Duration: {analysis['duration']} seconds
β’ Sample Rate: {analysis['sample_rate']} Hz
β’ Tempo: {analysis['tempo']} BPM

π Audio Characteristics:
β’ Spectral Centroid: {analysis['spectral_centroid']} Hz
β’ Spectral Rolloff: {analysis['spectral_rolloff']} Hz
β’ Zero Crossing Rate: {analysis['zero_crossing_rate']}
β’ RMS Energy: {analysis['rms_energy']}

π€ Vocal Information:
β’ Average Pitch: {analysis['average_pitch']} Hz
β’ Pitch Points Detected: {analysis['pitch_count']}
β’ Beats Detected: {analysis['beats_detected']}"""
    return report
|
|
|
def process_audio_separation(audio_file, separation_mode):
    """Gradio callback: separate an uploaded file into stems.

    Returns a 6-tuple matching the output widgets:
    (status, vocals, instrumental/drums, bass, other, analysis_text).
    Unused stem slots are None.
    """
    if not audio_file:
        return "β Please upload an audio file", None, None, None, None, ""

    if not SPLEETER_AVAILABLE:
        return "β Spleeter not available for source separation", None, None, None, None, ""

    try:
        analysis = engine.analyze_audio(audio_file)
        analysis_text = format_analysis_results(analysis)

        # The dropdown label starts with "2-stem" or "4-stem".
        model_type = "2stems" if "2-stem" in separation_mode else "4stems"
        separation_result = engine.separate_vocals(audio_file, model_type)

        if not separation_result['success']:
            return f"β Separation failed: {separation_result['error']}", None, None, None, None, analysis_text

        # NOTE: the success-message literals below were broken across two
        # physical lines in the original (a SyntaxError); rejoined here.
        if model_type == "2stems":
            return (
                "β 2-stem separation completed successfully!",
                separation_result.get('vocals'),
                separation_result.get('accompaniment'),
                None,
                None,
                analysis_text
            )
        return (
            "β 4-stem separation completed successfully!",
            separation_result.get('vocals'),
            separation_result.get('drums'),
            separation_result.get('bass'),
            separation_result.get('other'),
            analysis_text
        )

    except Exception as e:
        return f"β Processing error: {str(e)}", None, None, None, None, ""
|
|
|
def process_vocal_effects(audio_file, pitch_shift, reverb_amount):
    """Gradio callback: apply pitch shift / reverb to an audio file.

    Returns (status, processed_audio_path_or_None, analysis_text),
    matching the effects tab's output widgets.
    """
    if not audio_file:
        return "β Please upload an audio file", None, ""

    try:
        analysis = engine.analyze_audio(audio_file)
        analysis_text = format_analysis_results(analysis)

        effects_result = engine.apply_effects(audio_file, pitch_shift, reverb_amount)

        if not effects_result['success']:
            return f"β Effects failed: {effects_result['error']}", None, analysis_text

        effects_applied = []
        if pitch_shift != 0:
            effects_applied.append(f"Pitch: {pitch_shift:+.1f} semitones")
        if reverb_amount > 0:
            effects_applied.append(f"Reverb: {reverb_amount:.2f}")

        # NOTE: both status literals were broken across two physical lines
        # in the original (a SyntaxError); rejoined here.
        if effects_applied:
            status = f"β Effects applied: {', '.join(effects_applied)}"
        else:
            status = "β Audio processed (no effects)"

        return status, effects_result['output'], analysis_text

    except Exception as e:
        return f"β Processing error: {str(e)}", None, ""
|
|
|
def process_style_coaching(reference_files, user_audio):
    """Gradio callback: style coaching analysis.

    Separates vocals from up to five reference tracks and from the
    user's performance, extracts vocal features from each, and compares
    them.  Returns (status, per-reference processing log, feedback text).
    """
    if not reference_files or len(reference_files) < 2:
        return "β Upload at least 2 reference tracks", "", ""

    if not user_audio:
        return "β Please record or upload your performance", "", ""

    if not SPLEETER_AVAILABLE or not ADVANCED_FEATURES:
        return "β Style coaching requires advanced features", "", ""

    try:
        ref_features = []
        ref_status = []

        # Cap at five references to bound processing time.
        for i, ref_file in enumerate(reference_files[:5]):
            separation_result = engine.separate_vocals(ref_file.name, "2stems")
            if separation_result['success'] and separation_result.get('vocals'):
                features = engine.extract_vocal_features(separation_result['vocals'])
                if features['success']:
                    ref_features.append(features)
                    # NOTE: this literal was broken across two physical
                    # lines in the original (a SyntaxError); rejoined.
                    ref_status.append(f"β Reference {i+1}: Processed")
                else:
                    ref_status.append(f"β Reference {i+1}: Feature extraction failed")
            else:
                ref_status.append(f"β Reference {i+1}: Vocal separation failed")

        if len(ref_features) < 2:
            return "β Need at least 2 valid reference tracks", "\n".join(ref_status), ""

        user_separation = engine.separate_vocals(user_audio, "2stems")
        if not user_separation['success'] or not user_separation.get('vocals'):
            return "β Could not separate vocals from your performance", "\n".join(ref_status), ""

        user_features = engine.extract_vocal_features(user_separation['vocals'])
        if not user_features['success']:
            return "β Could not analyze your vocal features", "\n".join(ref_status), ""

        comparison = engine.compare_vocal_styles(user_features, ref_features)
        if not comparison['success']:
            return f"β Style comparison failed: {comparison['error']}", "\n".join(ref_status), ""

        # Pick the recommendation before formatting: the original nested
        # a multi-line conditional expression inside an f-string brace,
        # which is fragile on Python < 3.12 and hard to read.
        if comparison['score'] > 80:
            recommendation = "π₯ Excellent! You're very close to the target style."
        elif comparison['score'] > 60:
            recommendation = "π Good progress! Focus on the areas mentioned above."
        else:
            recommendation = "πͺ Keep practicing! Work on basic vocal technique first."

        feedback_text = f"""π― Vocal Style Coaching Results

π Overall Score: {comparison['score']}/100

π΅ Detailed Feedback:
{chr(10).join(comparison['feedback'])}

π Technical Metrics:
β’ Pitch Difference: {comparison['metrics']['pitch_diff']} Hz
β’ Tempo Difference: {comparison['metrics']['tempo_diff']} BPM
β’ Timbre Difference: {comparison['metrics']['timbre_diff']} Hz
β’ Energy Difference: {comparison['metrics']['energy_diff']}

π― Recommendations:
{recommendation}

References analyzed: {len(ref_features)}/5"""

        return f"β Style coaching complete! Score: {comparison['score']}/100", "\n".join(ref_status), feedback_text

    except Exception as e:
        return f"β Coaching failed: {str(e)}", "", ""
|
|
|
|
|
def create_app():
    """Assemble the Gradio Blocks interface.

    Builds a header banner, five tabs of controls (separation, effects,
    live recording, style coaching, help), and binds each tab's button
    to the corresponding module-level processing function.  Returns the
    unlaunched gr.Blocks app.
    """
    with gr.Blocks(title="Audio Singing Helper") as app:

        # Static header banner.
        gr.HTML("""
        <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 10px; margin-bottom: 20px;">
            <h1>π€ Audio Singing Helper</h1>
            <p>Professional audio processing for singers and musicians</p>
        </div>
        """)

        with gr.Tabs():

            # --- Tab 1: source separation ---------------------------------
            with gr.Tab("π΅ Audio Separation"):
                gr.Markdown("### Separate vocals from instrumental tracks")

                with gr.Row():
                    with gr.Column():
                        sep_audio_input = gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"])
                        sep_mode = gr.Dropdown(
                            choices=["2-stem (Vocals + Instrumental)", "4-stem (Vocals + Drums + Bass + Other)"],
                            value="2-stem (Vocals + Instrumental)",
                            label="Separation Mode"
                        )
                        sep_button = gr.Button("π― Separate Audio", variant="primary")

                    with gr.Column():
                        sep_status = gr.Textbox(label="Status", lines=2, interactive=False)
                        sep_analysis = gr.Textbox(label="Audio Analysis", lines=12, interactive=False)

                # Second output row is only populated in 4-stem mode.
                with gr.Row():
                    sep_vocals = gr.Audio(label="π€ Vocals", show_download_button=True)
                    sep_instrumental = gr.Audio(label="πΌ Instrumental/Drums", show_download_button=True)

                with gr.Row():
                    sep_bass = gr.Audio(label="πΈ Bass", show_download_button=True)
                    sep_other = gr.Audio(label="πΉ Other", show_download_button=True)

            # --- Tab 2: vocal effects on an uploaded file ------------------
            with gr.Tab("ποΈ Vocal Effects"):
                gr.Markdown("### Apply professional vocal effects")

                with gr.Row():
                    with gr.Column():
                        fx_audio_input = gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"])
                        fx_pitch = gr.Slider(-12, 12, 0, step=0.5, label="Pitch Shift (semitones)")
                        fx_reverb = gr.Slider(0, 0.5, 0, step=0.05, label="Reverb Amount")
                        fx_button = gr.Button("π΅ Apply Effects", variant="primary")

                    with gr.Column():
                        fx_status = gr.Textbox(label="Status", lines=2, interactive=False)
                        fx_analysis = gr.Textbox(label="Audio Analysis", lines=10, interactive=False)

                fx_output = gr.Audio(label="π§ Processed Audio", show_download_button=True)

            # --- Tab 3: microphone recording (same processing as effects) --
            with gr.Tab("ποΈ Live Recording"):
                gr.Markdown("### Record and process your voice in real-time")

                with gr.Row():
                    with gr.Column():
                        live_audio = gr.Audio(type="filepath", sources=["microphone"], label="Record Your Voice")
                        live_pitch = gr.Slider(-12, 12, 0, step=0.5, label="Pitch Correction")
                        live_reverb = gr.Slider(0, 0.5, 0, step=0.05, label="Reverb")
                        live_button = gr.Button("π€ Process Recording", variant="primary")

                    with gr.Column():
                        live_status = gr.Textbox(label="Status", lines=2, interactive=False)
                        live_analysis = gr.Textbox(label="Recording Analysis", lines=10, interactive=False)

                live_output = gr.Audio(label="π§ Processed Recording", show_download_button=True)

            # --- Tab 4: style coaching -------------------------------------
            with gr.Tab("π Style Coaching"):
                gr.Markdown("### Get personalized vocal coaching feedback")

                with gr.Row():
                    with gr.Column():
                        coach_refs = gr.File(
                            label="Reference Tracks (2-5 files)",
                            file_count="multiple",
                            file_types=["audio"]
                        )
                        coach_user = gr.Audio(
                            type="filepath",
                            label="Your Performance",
                            sources=["upload", "microphone"]
                        )
                        coach_button = gr.Button("π― Get Coaching", variant="primary")

                    with gr.Column():
                        coach_status = gr.Textbox(label="Status", lines=3, interactive=False)
                        coach_refs_status = gr.Textbox(label="Reference Processing", lines=8, interactive=False)

                coach_feedback = gr.Textbox(label="π― Coaching Feedback", lines=15, interactive=False)

            # --- Tab 5: static help text -----------------------------------
            with gr.Tab("βΉοΈ Help"):
                gr.Markdown("""
                # π€ Audio Singing Helper - User Guide

                ## Features

                ### π΅ Audio Separation
                - Upload any song to separate vocals from instruments
                - Choose 2-stem (vocals + instrumental) or 4-stem (vocals + drums + bass + other)
                - Get detailed audio analysis of your tracks

                ### ποΈ Vocal Effects
                - Apply pitch shifting (-12 to +12 semitones)
                - Add reverb for spatial depth
                - Process any audio file with professional effects

                ### ποΈ Live Recording
                - Record directly from your microphone
                - Apply real-time pitch correction and reverb
                - Perfect for vocal practice and experimentation

                ### π Style Coaching
                - Upload 2-5 reference tracks from artists you want to emulate
                - Record or upload your performance
                - Get AI-powered feedback on pitch, timing, and vocal characteristics
                - Receive a score and specific improvement suggestions

                ## Tips for Best Results

                - **Use high-quality audio files** - better input = better results
                - **Keep files under 5 minutes** for faster processing
                - **For style coaching**: Choose references from similar genres
                - **Record in quiet environments** for best analysis

                ## Supported Formats
                - Input: MP3, WAV, FLAC, M4A, OGG
                - Output: High-quality WAV files

                ## Technical Requirements
                - Some features require additional dependencies
                - Processing time varies based on file length and complexity

                ---
                Built for singers and musicians worldwide π
                """)

        # Wire buttons to the processing functions; the outputs list order
        # must match each function's return tuple.
        sep_button.click(
            process_audio_separation,
            inputs=[sep_audio_input, sep_mode],
            outputs=[sep_status, sep_vocals, sep_instrumental, sep_bass, sep_other, sep_analysis]
        )

        fx_button.click(
            process_vocal_effects,
            inputs=[fx_audio_input, fx_pitch, fx_reverb],
            outputs=[fx_status, fx_output, fx_analysis]
        )

        # Live recording reuses the effects pipeline on the mic capture.
        live_button.click(
            process_vocal_effects,
            inputs=[live_audio, live_pitch, live_reverb],
            outputs=[live_status, live_output, live_analysis]
        )

        coach_button.click(
            process_style_coaching,
            inputs=[coach_refs, coach_user],
            outputs=[coach_status, coach_refs_status, coach_feedback]
        )

    return app
|
|
|
if __name__ == "__main__":
    # Build the UI and start the Gradio server with default settings.
    app = create_app()
    app.launch()