# NOTE(review): the three lines below were Hugging Face Spaces page chrome
# ("Spaces: / Sleeping / Sleeping") captured by the scrape — not program
# source. Kept as a comment so the module parses.
"""Audio Singing Helper — Gradio app for source separation, vocal effects and style coaching."""

# Standard library
import os
import shutil
import tempfile
import warnings
from pathlib import Path

# Third-party (required)
import gradio as gr
import librosa
import numpy as np
import soundfile as sf

# librosa/audioread emit frequent deprecation and runtime warnings that
# would clutter the Space logs; silence them globally.
warnings.filterwarnings("ignore")

# Optional dependency: Spleeter powers vocal/instrument source separation.
# The app degrades gracefully when it is missing (flag checked at call sites).
try:
    from spleeter.separator import Separator
    SPLEETER_AVAILABLE = True
except ImportError:
    SPLEETER_AVAILABLE = False
    print("Spleeter not available - source separation disabled")

# Optional dependencies: scipy (reverb convolution) and dtw (style analysis).
try:
    import scipy.signal
    from scipy.spatial.distance import euclidean
    from dtw import dtw
    ADVANCED_FEATURES = True
except ImportError:
    ADVANCED_FEATURES = False
    print("Advanced features not available")
class AudioEngine:
    """Clean, professional audio processing engine.

    Wraps librosa feature analysis, Spleeter source separation and simple
    vocal effects.  All intermediate files live in a private temp directory
    that is removed by ``cleanup()``.  Every public method returns a dict
    with ``'success': True`` plus results, or ``'success': False`` and an
    ``'error'`` message — callers never see exceptions.
    """

    def __init__(self):
        # Working directory for separated stems and processed audio.
        self.temp_dir = tempfile.mkdtemp()
        # Cache of Spleeter Separator instances keyed by model type
        # ("2stems"/"4stems") — constructing one loads model weights.
        self.separators = {}

    @staticmethod
    def _dominant_pitches(y, sr):
        """Return per-frame dominant pitch values (Hz) via librosa.piptrack.

        For each frame, keeps the pitch of the strongest-magnitude bin,
        skipping unvoiced frames (pitch == 0).  Shared by analyze_audio()
        and extract_vocal_features(), which previously duplicated this loop.
        """
        pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
        values = []
        for t in range(pitches.shape[1]):
            index = magnitudes[:, t].argmax()
            pitch = pitches[index, t]
            if pitch > 0:
                values.append(pitch)
        return values

    def analyze_audio(self, audio_path):
        """Extract comprehensive audio features from the file at audio_path.

        Returns duration, tempo, sample rate, spectral statistics, RMS
        energy, average dominant pitch and beat count (values rounded for
        display), or an error dict on failure.
        """
        try:
            y, sr = librosa.load(audio_path)
            duration = len(y) / sr
            tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
            # BUG FIX: librosa >= 0.10 returns tempo as a 1-element ndarray;
            # round() on it then produces an array (or raises) — coerce first.
            tempo = float(np.atleast_1d(tempo)[0])
            spectral_centroid = float(np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)))
            spectral_rolloff = float(np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr)))
            zero_crossing_rate = float(np.mean(librosa.feature.zero_crossing_rate(y)))
            rms_energy = float(np.mean(librosa.feature.rms(y=y)))
            pitch_values = self._dominant_pitches(y, sr)
            avg_pitch = float(np.mean(pitch_values)) if pitch_values else 0
            return {
                'success': True,
                'duration': round(duration, 2),
                'tempo': round(tempo, 1),
                'sample_rate': sr,
                'spectral_centroid': round(spectral_centroid, 2),
                'spectral_rolloff': round(spectral_rolloff, 2),
                'zero_crossing_rate': round(zero_crossing_rate, 4),
                'rms_energy': round(rms_energy, 4),
                'average_pitch': round(avg_pitch, 2),
                'pitch_count': len(pitch_values),
                'beats_detected': len(beats)
            }
        except Exception as e:
            return {'success': False, 'error': str(e)}

    def separate_vocals(self, audio_path, model_type="2stems"):
        """Separate audio into stems using Spleeter.

        model_type: "2stems" (vocals + accompaniment) or "4stems"
        (vocals + drums + bass + other).  Returns a dict mapping each stem
        name to its WAV path (None when Spleeter did not produce it), or an
        error dict.
        """
        if not SPLEETER_AVAILABLE:
            return {'success': False, 'error': 'Spleeter not available'}
        stem_names = {
            '2stems': ('vocals', 'accompaniment'),
            '4stems': ('vocals', 'drums', 'bass', 'other'),
        }
        # BUG FIX: the original fell through its if/elif and implicitly
        # returned None for an unknown model_type; fail fast instead.
        if model_type not in stem_names:
            return {'success': False, 'error': f'Unknown model type: {model_type}'}
        try:
            # Load the separator lazily and cache it — model load is slow.
            if model_type not in self.separators:
                self.separators[model_type] = Separator(f'spleeter:{model_type}-16kHz')
            separator = self.separators[model_type]
            # BUG FIX: mkdtemp guarantees a unique directory; the original
            # used np.random.randint(10000), which can collide across calls.
            output_dir = tempfile.mkdtemp(prefix="separation_", dir=self.temp_dir)
            separator.separate_to_file(audio_path, output_dir)
            # Spleeter writes stems into <output_dir>/<input file stem>/.
            result_dir = os.path.join(output_dir, Path(audio_path).stem)
            result = {'success': True}
            for stem in stem_names[model_type]:
                stem_path = os.path.join(result_dir, f"{stem}.wav")
                result[stem] = stem_path if os.path.exists(stem_path) else None
            return result
        except Exception as e:
            return {'success': False, 'error': str(e)}

    def apply_effects(self, audio_path, pitch_shift=0, reverb=0):
        """Apply vocal effects and write the result to a temp WAV file.

        pitch_shift: semitones (positive or negative); reverb: wet amount
        (0 disables; requires scipy).  Returns {'success': True, 'output':
        path} or an error dict.
        """
        try:
            y, sr = librosa.load(audio_path)
            if pitch_shift != 0:
                y = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch_shift)
            if reverb > 0 and ADVANCED_FEATURES:
                # Synthetic impulse response: ~0.5 s of exponentially
                # decaying white noise, scaled by the wet amount.
                reverb_length = int(0.5 * sr)
                impulse = np.random.randn(reverb_length) * np.exp(-np.arange(reverb_length) / (sr * 0.1))
                y = scipy.signal.convolve(y, impulse * reverb, mode='same')
                # BUG FIX: guard against an all-zero signal — the original
                # divided by the peak unconditionally (0/0 -> NaN output).
                peak = np.max(np.abs(y))
                if peak > 0:
                    y = y / peak  # Normalize to [-1, 1]
            # BUG FIX: mkstemp yields a unique path; the original's
            # np.random.randint-based name could silently overwrite output.
            fd, output_path = tempfile.mkstemp(prefix="processed_", suffix=".wav", dir=self.temp_dir)
            os.close(fd)  # sf.write reopens by path
            sf.write(output_path, y, sr)
            return {'success': True, 'output': output_path}
        except Exception as e:
            return {'success': False, 'error': str(e)}

    def extract_vocal_features(self, audio_path):
        """Extract the vocal metrics used by style coaching.

        Returns mean/std/range of the dominant pitch, tempo, spectral
        centroid and RMS energy, or an error dict when no voiced frames
        are found.
        """
        try:
            y, sr = librosa.load(audio_path)
            pitch_values = self._dominant_pitches(y, sr)
            if not pitch_values:
                return {'success': False, 'error': 'No pitch detected'}
            tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
            return {
                'success': True,
                'mean_pitch': np.mean(pitch_values),
                'pitch_std': np.std(pitch_values),
                'pitch_range': max(pitch_values) - min(pitch_values),
                # BUG FIX: coerce — librosa >= 0.10 returns an ndarray tempo.
                'tempo': float(np.atleast_1d(tempo)[0]),
                'spectral_centroid': np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)),
                'rms_energy': np.mean(librosa.feature.rms(y=y))
            }
        except Exception as e:
            return {'success': False, 'error': str(e)}

    def compare_vocal_styles(self, user_features, reference_features_list):
        """Compare user vocal features against averaged reference features.

        Produces human-readable feedback lines, per-metric absolute
        differences, and an overall score in [0, 100] (higher = closer to
        the reference style).
        """
        if not ADVANCED_FEATURES:
            return {'success': False, 'error': 'Advanced features not available'}
        try:
            # Average each metric over however many references provide it.
            ref_avg = {}
            for key in ['mean_pitch', 'pitch_std', 'pitch_range', 'tempo',
                        'spectral_centroid', 'rms_energy']:
                values = [ref[key] for ref in reference_features_list if key in ref]
                ref_avg[key] = np.mean(values) if values else 0
            pitch_diff = abs(user_features['mean_pitch'] - ref_avg['mean_pitch'])
            tempo_diff = abs(user_features['tempo'] - ref_avg['tempo'])
            timbre_diff = abs(user_features['spectral_centroid'] - ref_avg['spectral_centroid'])
            energy_diff = abs(user_features['rms_energy'] - ref_avg['rms_energy'])
            # Heuristic thresholds: 50 Hz pitch, 10 BPM tempo, 500 Hz
            # centroid, 0.1 RMS energy.
            feedback = []
            if pitch_diff > 50:
                feedback.append(f"🎵 Pitch: Your average pitch differs by {pitch_diff:.1f} Hz. Practice matching the reference key.")
            else:
                feedback.append("🎵 Pitch: Good pitch accuracy!")
            if tempo_diff > 10:
                feedback.append(f"⏱️ Tempo: Your tempo differs by {tempo_diff:.1f} BPM. Work on timing consistency.")
            else:
                feedback.append("⏱️ Tempo: Good timing!")
            if timbre_diff > 500:
                feedback.append("🗣️ Timbre: Try adjusting your vocal tone to match the reference style.")
            else:
                feedback.append("🗣️ Timbre: Good vocal tone match!")
            if energy_diff > 0.1:
                feedback.append("🔊 Energy: Adjust your vocal intensity to match the reference.")
            else:
                feedback.append("🔊 Energy: Good energy level!")
            # Weighted penalty sum, clamped below at 0.
            overall_score = max(0, 100 - (pitch_diff/2 + tempo_diff + timbre_diff/10 + energy_diff*100))
            return {
                'success': True,
                'score': round(overall_score, 1),
                'feedback': feedback,
                'metrics': {
                    'pitch_diff': round(pitch_diff, 1),
                    'tempo_diff': round(tempo_diff, 1),
                    'timbre_diff': round(timbre_diff, 1),
                    'energy_diff': round(energy_diff, 3)
                }
            }
        except Exception as e:
            return {'success': False, 'error': str(e)}

    def cleanup(self):
        """Remove the engine's temporary working directory (best effort).

        Never raises — ignore_errors covers both a missing directory and
        files held open on Windows.
        """
        shutil.rmtree(self.temp_dir, ignore_errors=True)
# Global engine instance
# Shared by every Gradio callback below.  Its temp directory lives for the
# whole process; cleanup() is not wired to app shutdown — TODO confirm that
# is acceptable for the deployment target.
engine = AudioEngine()
def format_analysis_results(analysis):
    """Render an AudioEngine.analyze_audio() result dict as display text.

    Returns a single error line when the analysis failed, otherwise a
    multi-section, emoji-headed report string.
    """
    if not analysis['success']:
        return f"❌ Analysis failed: {analysis['error']}"
    report_lines = [
        "📊 Audio Analysis Results",
        "🎵 Basic Properties:",
        f"• Duration: {analysis['duration']} seconds",
        f"• Sample Rate: {analysis['sample_rate']} Hz",
        f"• Tempo: {analysis['tempo']} BPM",
        "📈 Audio Characteristics:",
        f"• Spectral Centroid: {analysis['spectral_centroid']} Hz",
        f"• Spectral Rolloff: {analysis['spectral_rolloff']} Hz",
        f"• Zero Crossing Rate: {analysis['zero_crossing_rate']}",
        f"• RMS Energy: {analysis['rms_energy']}",
        "🎤 Vocal Information:",
        f"• Average Pitch: {analysis['average_pitch']} Hz",
        f"• Pitch Points Detected: {analysis['pitch_count']}",
        f"• Beats Detected: {analysis['beats_detected']}",
    ]
    return "\n".join(report_lines)
def process_audio_separation(audio_file, separation_mode):
    """Gradio callback: analyze the upload, then split it into stems.

    Returns a 6-tuple: (status message, vocals, instrumental/drums, bass,
    other, analysis text).  Stem slots not produced by the chosen mode are
    None.
    """
    no_stems = (None, None, None, None)
    if not audio_file:
        return ("❌ Please upload an audio file",) + no_stems + ("",)
    if not SPLEETER_AVAILABLE:
        return ("❌ Spleeter not available for source separation",) + no_stems + ("",)
    try:
        # Analyze the original file up front so the report is shown even
        # when separation succeeds or fails later.
        report = format_analysis_results(engine.analyze_audio(audio_file))
        stems = "2stems" if "2-stem" in separation_mode else "4stems"
        result = engine.separate_vocals(audio_file, stems)
        if not result['success']:
            return (f"❌ Separation failed: {result['error']}",) + no_stems + (report,)
        if stems == "2stems":
            return ("✅ 2-stem separation completed successfully!",
                    result.get('vocals'), result.get('accompaniment'),
                    None, None, report)
        return ("✅ 4-stem separation completed successfully!",
                result.get('vocals'), result.get('drums'),
                result.get('bass'), result.get('other'), report)
    except Exception as e:
        return (f"❌ Processing error: {str(e)}",) + no_stems + ("",)
def process_vocal_effects(audio_file, pitch_shift, reverb_amount):
    """Gradio callback: run pitch-shift/reverb on an upload.

    Returns (status message, processed audio path or None, analysis text).
    """
    if not audio_file:
        return "❌ Please upload an audio file", None, ""
    try:
        # Report describes the ORIGINAL audio, not the processed output.
        report = format_analysis_results(engine.analyze_audio(audio_file))
        result = engine.apply_effects(audio_file, pitch_shift, reverb_amount)
        if not result['success']:
            return f"❌ Effects failed: {result['error']}", None, report
        applied = []
        if pitch_shift != 0:
            applied.append(f"Pitch: {pitch_shift:+.1f} semitones")
        if reverb_amount > 0:
            applied.append(f"Reverb: {reverb_amount:.2f}")
        if applied:
            status = f"✅ Effects applied: {', '.join(applied)}"
        else:
            status = "✅ Audio processed (no effects)"
        return status, result['output'], report
    except Exception as e:
        return f"❌ Processing error: {str(e)}", None, ""
def process_style_coaching(reference_files, user_audio):
    """Gradio callback: score the user's vocals against 2-5 references.

    Separates vocals from each reference and from the user's take,
    extracts vocal features, compares them, and returns
    (status, per-reference processing log, formatted coaching feedback).
    """
    if not reference_files or len(reference_files) < 2:
        return "❌ Upload at least 2 reference tracks", "", ""
    if not user_audio:
        return "❌ Please record or upload your performance", "", ""
    if not SPLEETER_AVAILABLE or not ADVANCED_FEATURES:
        return "❌ Style coaching requires advanced features", "", ""
    try:
        collected = []   # feature dicts from successfully processed refs
        progress = []    # one status line per reference (first five only)
        for idx, ref in enumerate(reference_files[:5], start=1):
            sep = engine.separate_vocals(ref.name, "2stems")
            if not (sep['success'] and sep.get('vocals')):
                progress.append(f"❌ Reference {idx}: Vocal separation failed")
                continue
            feats = engine.extract_vocal_features(sep['vocals'])
            if feats['success']:
                collected.append(feats)
                progress.append(f"✅ Reference {idx}: Processed")
            else:
                progress.append(f"❌ Reference {idx}: Feature extraction failed")
        progress_text = "\n".join(progress)
        if len(collected) < 2:
            return "❌ Need at least 2 valid reference tracks", progress_text, ""
        # Same pipeline for the user's performance.
        user_sep = engine.separate_vocals(user_audio, "2stems")
        if not user_sep['success'] or not user_sep.get('vocals'):
            return "❌ Could not separate vocals from your performance", progress_text, ""
        user_feats = engine.extract_vocal_features(user_sep['vocals'])
        if not user_feats['success']:
            return "❌ Could not analyze your vocal features", progress_text, ""
        comparison = engine.compare_vocal_styles(user_feats, collected)
        if not comparison['success']:
            return f"❌ Style comparison failed: {comparison['error']}", progress_text, ""
        score = comparison['score']
        if score > 80:
            advice = "🔥 Excellent! You're very close to the target style."
        elif score > 60:
            advice = "🎵 Good progress! Focus on the areas mentioned above."
        else:
            advice = "💪 Keep practicing! Work on basic vocal technique first."
        metrics = comparison['metrics']
        feedback_text = "\n".join([
            "🎯 Vocal Style Coaching Results",
            f"📊 Overall Score: {score}/100",
            "🎵 Detailed Feedback:",
            "\n".join(comparison['feedback']),
            "📈 Technical Metrics:",
            f"• Pitch Difference: {metrics['pitch_diff']} Hz",
            f"• Tempo Difference: {metrics['tempo_diff']} BPM",
            f"• Timbre Difference: {metrics['timbre_diff']} Hz",
            f"• Energy Difference: {metrics['energy_diff']}",
            "🎯 Recommendations:",
            advice,
            f"References analyzed: {len(collected)}/5",
        ])
        return f"✅ Style coaching complete! Score: {score}/100", progress_text, feedback_text
    except Exception as e:
        return f"❌ Coaching failed: {str(e)}", "", ""
# Create main interface
def create_app():
    """Build and return the Gradio Blocks application.

    Four feature tabs (separation, effects, live recording, coaching) plus
    a help tab; all click handlers are wired at the bottom of the Blocks
    context.  Pure UI construction — no audio work happens here.
    """
    with gr.Blocks(title="Audio Singing Helper") as app:
        # Gradient header banner shown above the tabs.
        gr.HTML("""
        <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 10px; margin-bottom: 20px;">
        <h1>🎤 Audio Singing Helper</h1>
        <p>Professional audio processing for singers and musicians</p>
        </div>
        """)
        with gr.Tabs():
            # Audio Separation Tab
            with gr.Tab("🎵 Audio Separation"):
                gr.Markdown("### Separate vocals from instrumental tracks")
                with gr.Row():
                    with gr.Column():
                        sep_audio_input = gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"])
                        sep_mode = gr.Dropdown(
                            choices=["2-stem (Vocals + Instrumental)", "4-stem (Vocals + Drums + Bass + Other)"],
                            value="2-stem (Vocals + Instrumental)",
                            label="Separation Mode"
                        )
                        sep_button = gr.Button("🎯 Separate Audio", variant="primary")
                    with gr.Column():
                        sep_status = gr.Textbox(label="Status", lines=2, interactive=False)
                        sep_analysis = gr.Textbox(label="Audio Analysis", lines=12, interactive=False)
                # Output slots: 2-stem mode fills only the first row; the
                # second slot doubles as drums in 4-stem mode (see label).
                with gr.Row():
                    sep_vocals = gr.Audio(label="🎤 Vocals", show_download_button=True)
                    sep_instrumental = gr.Audio(label="🎼 Instrumental/Drums", show_download_button=True)
                with gr.Row():
                    sep_bass = gr.Audio(label="🎸 Bass", show_download_button=True)
                    sep_other = gr.Audio(label="🎹 Other", show_download_button=True)
            # Vocal Effects Tab
            with gr.Tab("🎛️ Vocal Effects"):
                gr.Markdown("### Apply professional vocal effects")
                with gr.Row():
                    with gr.Column():
                        fx_audio_input = gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"])
                        fx_pitch = gr.Slider(-12, 12, 0, step=0.5, label="Pitch Shift (semitones)")
                        fx_reverb = gr.Slider(0, 0.5, 0, step=0.05, label="Reverb Amount")
                        fx_button = gr.Button("🎵 Apply Effects", variant="primary")
                    with gr.Column():
                        fx_status = gr.Textbox(label="Status", lines=2, interactive=False)
                        fx_analysis = gr.Textbox(label="Audio Analysis", lines=10, interactive=False)
                        fx_output = gr.Audio(label="🎧 Processed Audio", show_download_button=True)
            # Live Recording Tab
            with gr.Tab("🎙️ Live Recording"):
                gr.Markdown("### Record and process your voice in real-time")
                with gr.Row():
                    with gr.Column():
                        live_audio = gr.Audio(type="filepath", sources=["microphone"], label="Record Your Voice")
                        live_pitch = gr.Slider(-12, 12, 0, step=0.5, label="Pitch Correction")
                        live_reverb = gr.Slider(0, 0.5, 0, step=0.05, label="Reverb")
                        live_button = gr.Button("🎤 Process Recording", variant="primary")
                    with gr.Column():
                        live_status = gr.Textbox(label="Status", lines=2, interactive=False)
                        live_analysis = gr.Textbox(label="Recording Analysis", lines=10, interactive=False)
                        live_output = gr.Audio(label="🎧 Processed Recording", show_download_button=True)
            # Style Coaching Tab
            with gr.Tab("📊 Style Coaching"):
                gr.Markdown("### Get personalized vocal coaching feedback")
                with gr.Row():
                    with gr.Column():
                        coach_refs = gr.File(
                            label="Reference Tracks (2-5 files)",
                            file_count="multiple",
                            file_types=["audio"]
                        )
                        coach_user = gr.Audio(
                            type="filepath",
                            label="Your Performance",
                            sources=["upload", "microphone"]
                        )
                        coach_button = gr.Button("🎯 Get Coaching", variant="primary")
                    with gr.Column():
                        coach_status = gr.Textbox(label="Status", lines=3, interactive=False)
                        coach_refs_status = gr.Textbox(label="Reference Processing", lines=8, interactive=False)
                        coach_feedback = gr.Textbox(label="🎯 Coaching Feedback", lines=15, interactive=False)
            # Help Tab
            with gr.Tab("ℹ️ Help"):
                gr.Markdown("""
                # 🎤 Audio Singing Helper - User Guide
                ## Features
                ### 🎵 Audio Separation
                - Upload any song to separate vocals from instruments
                - Choose 2-stem (vocals + instrumental) or 4-stem (vocals + drums + bass + other)
                - Get detailed audio analysis of your tracks
                ### 🎛️ Vocal Effects
                - Apply pitch shifting (-12 to +12 semitones)
                - Add reverb for spatial depth
                - Process any audio file with professional effects
                ### 🎙️ Live Recording
                - Record directly from your microphone
                - Apply real-time pitch correction and reverb
                - Perfect for vocal practice and experimentation
                ### 📊 Style Coaching
                - Upload 2-5 reference tracks from artists you want to emulate
                - Record or upload your performance
                - Get AI-powered feedback on pitch, timing, and vocal characteristics
                - Receive a score and specific improvement suggestions
                ## Tips for Best Results
                - **Use high-quality audio files** - better input = better results
                - **Keep files under 5 minutes** for faster processing
                - **For style coaching**: Choose references from similar genres
                - **Record in quiet environments** for best analysis
                ## Supported Formats
                - Input: MP3, WAV, FLAC, M4A, OGG
                - Output: High-quality WAV files
                ## Technical Requirements
                - Some features require additional dependencies
                - Processing time varies based on file length and complexity
                ---
                Built for singers and musicians worldwide 🌍
                """)
        # Connect all the event handlers
        sep_button.click(
            process_audio_separation,
            inputs=[sep_audio_input, sep_mode],
            outputs=[sep_status, sep_vocals, sep_instrumental, sep_bass, sep_other, sep_analysis]
        )
        fx_button.click(
            process_vocal_effects,
            inputs=[fx_audio_input, fx_pitch, fx_reverb],
            outputs=[fx_status, fx_output, fx_analysis]
        )
        # The live-recording tab reuses the same effects pipeline as the
        # upload-based effects tab.
        live_button.click(
            process_vocal_effects,
            inputs=[live_audio, live_pitch, live_reverb],
            outputs=[live_status, live_output, live_analysis]
        )
        coach_button.click(
            process_style_coaching,
            inputs=[coach_refs, coach_user],
            outputs=[coach_status, coach_refs_status, coach_feedback]
        )
    return app
if __name__ == "__main__":
    # Build the Gradio UI and start the web server (blocks until exit).
    app = create_app()
    app.launch()