# A2
#
# Sleeping
#
# File size: 25,695 Bytes
#
# b98e447

import atexit
import os
import shutil
import tempfile
import warnings
from pathlib import Path

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
warnings.filterwarnings("ignore")

# Import for advanced features
try:
    from spleeter.separator import Separator
    SPLEETER_AVAILABLE = True
except ImportError:
    SPLEETER_AVAILABLE = False
    print("Spleeter not available - source separation disabled")

try:
    import scipy.signal
    from scipy.spatial.distance import euclidean
    from dtw import dtw
    ADVANCED_FEATURES = True
except ImportError:
    ADVANCED_FEATURES = False
    print("Advanced features not available")

class AudioEngine:
    """Audio processing engine: analysis, stem separation, effects, style comparison.

    Every public method returns a plain ``dict`` with a boolean ``'success'``
    key. On failure the dict carries an ``'error'`` message instead of raising,
    so UI callbacks can show the problem directly.

    Attributes:
        temp_dir: Scratch directory created in ``__init__``; every file this
            engine writes lives below it and ``cleanup`` removes it.
        separators: Cache of loaded Spleeter ``Separator`` objects keyed by
            model type, so each model is constructed at most once.
    """

    # Stem files looked up in Spleeter's output for each supported model type.
    _STEMS = {
        "2stems": ("vocals", "accompaniment"),
        "4stems": ("vocals", "drums", "bass", "other"),
    }

    # Feature keys averaged over the reference tracks in compare_vocal_styles.
    _STYLE_KEYS = (
        'mean_pitch', 'pitch_std', 'pitch_range',
        'tempo', 'spectral_centroid', 'rms_energy',
    )

    def __init__(self):
        self.temp_dir = tempfile.mkdtemp()
        self.separators = {}  # Cache for Spleeter models

    @staticmethod
    def _to_float(value):
        """Collapse a NumPy scalar or one-element array into a plain ``float``.

        NOTE(review): some librosa releases appear to return the tempo from
        ``beat_track`` as a one-element array rather than a scalar; arrays
        cannot be formatted with ``:.1f``, so values are normalised here.
        """
        return float(np.asarray(value).reshape(-1)[0])

    @staticmethod
    def _dominant_pitches(pitches, magnitudes):
        """Return, per frame, the pitch of the strongest bin (positive values only).

        Args:
            pitches: 2-D array indexed ``[bin, frame]`` of pitch candidates.
            magnitudes: Array of the same shape with each candidate's magnitude.

        Returns:
            1-D array containing one pitch per frame whose strongest bin holds a
            pitch greater than zero; frames without such a pitch are dropped.
        """
        pitches = np.asarray(pitches)
        magnitudes = np.asarray(magnitudes)
        if pitches.ndim != 2 or pitches.size == 0:
            return np.empty(0)
        strongest_bin = magnitudes.argmax(axis=0)
        per_frame = pitches[strongest_bin, np.arange(pitches.shape[1])]
        return per_frame[per_frame > 0]

    @staticmethod
    def _apply_reverb(y, sr, amount, decay_seconds=0.1, tail_seconds=0.5):
        """Mix a synthetic reverb tail into ``y``.

        The tail is exponentially decaying noise. ``amount`` (clamped to 0..1)
        is the wet share of the dry/wet mix, so the setting actually changes the
        result. The previous implementation scaled the impulse by ``amount`` and
        then normalised, which cancelled the setting out entirely.

        Args:
            y: 1-D signal.
            sr: Sample rate of ``y`` in Hz.
            amount: Wet share of the mix.
            decay_seconds: Time constant of the exponential decay envelope.
            tail_seconds: Length of the generated impulse response.

        Returns:
            Signal of the same length as ``y``, peak-normalised to 1.0. Silent
            or empty input is returned unchanged (avoids a 0/0 division).
        """
        y = np.asarray(y, dtype=float)
        amount = float(min(max(amount, 0.0), 1.0))
        dry_peak = float(np.max(np.abs(y))) if y.size else 0.0
        if amount == 0.0 or dry_peak == 0.0:
            return y

        tail_len = max(1, int(tail_seconds * sr))
        envelope = np.exp(-np.arange(tail_len) / (sr * decay_seconds))
        impulse = np.random.randn(tail_len) * envelope

        # 'full' truncated to the input length keeps the tail causal; 'same'
        # would centre the kernel and smear energy ahead of each sound.
        wet = scipy.signal.convolve(y, impulse, mode='full')[:y.size]
        wet_peak = float(np.max(np.abs(wet)))
        if wet_peak > 0:
            wet *= dry_peak / wet_peak  # match levels so `amount` is a true ratio

        mixed = (1.0 - amount) * y + amount * wet
        peak = float(np.max(np.abs(mixed)))
        if peak > 0:
            mixed /= peak  # Normalize
        return mixed

    def analyze_audio(self, audio_path) -> dict:
        """Extract summary features from an audio file.

        The file is loaded with librosa's default settings, so ``sample_rate``
        in the result is the rate librosa loaded the audio at (presumably its
        22050 Hz default, not necessarily the file's native rate).

        Args:
            audio_path: Path of the audio file to analyse.

        Returns:
            On success: ``success``, ``duration`` (s), ``tempo`` (BPM),
            ``sample_rate``, ``spectral_centroid``, ``spectral_rolloff``,
            ``zero_crossing_rate``, ``rms_energy``, ``average_pitch`` (0 when
            no pitch was found), ``pitch_count`` and ``beats_detected``.
            On failure: ``{'success': False, 'error': <message>}``.
        """
        try:
            y, sr = librosa.load(audio_path)

            duration = len(y) / sr
            tempo, beats = librosa.beat.beat_track(y=y, sr=sr)

            spectral_centroid = float(np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)))
            spectral_rolloff = float(np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr)))
            zero_crossing_rate = float(np.mean(librosa.feature.zero_crossing_rate(y)))
            rms_energy = float(np.mean(librosa.feature.rms(y=y)))

            pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
            pitch_values = self._dominant_pitches(pitches, magnitudes)
            avg_pitch = float(pitch_values.mean()) if pitch_values.size else 0.0

            return {
                'success': True,
                'duration': round(duration, 2),
                'tempo': round(self._to_float(tempo), 1),
                'sample_rate': sr,
                'spectral_centroid': round(spectral_centroid, 2),
                'spectral_rolloff': round(spectral_rolloff, 2),
                'zero_crossing_rate': round(zero_crossing_rate, 4),
                'rms_energy': round(rms_energy, 4),
                'average_pitch': round(avg_pitch, 2),
                'pitch_count': int(pitch_values.size),
                'beats_detected': len(beats)
            }

        except Exception as e:
            return {'success': False, 'error': str(e)}

    def separate_vocals(self, audio_path, model_type="2stems") -> dict:
        """Split an audio file into stems with Spleeter.

        Args:
            audio_path: Path of the audio file to separate.
            model_type: ``"2stems"`` (vocals, accompaniment) or ``"4stems"``
                (vocals, drums, bass, other).

        Returns:
            On success: ``{'success': True, <stem>: <wav path or None>, ...}``
            with one key per stem of the chosen model; a stem is ``None`` when
            its output file does not exist.
            On failure (unsupported model, Spleeter missing, or any error
            during separation): ``{'success': False, 'error': <message>}``.
        """
        stems = self._STEMS.get(model_type)
        if stems is None:
            # Previously this fell off the end and returned None, which made
            # callers crash on result['success'].
            return {'success': False, 'error': f"Unsupported separation model: {model_type!r}"}

        if not SPLEETER_AVAILABLE:
            return {'success': False, 'error': 'Spleeter not available'}

        try:
            separator = self.separators.get(model_type)
            if separator is None:
                separator = Separator(f'spleeter:{model_type}-16kHz')
                self.separators[model_type] = separator

            # A unique directory per call: random numeric names could collide
            # and silently expose stems left over from an earlier run.
            os.makedirs(self.temp_dir, exist_ok=True)
            output_dir = tempfile.mkdtemp(prefix="separation_", dir=self.temp_dir)

            separator.separate_to_file(audio_path, output_dir)

            # Stems are looked up in a sub-folder named after the input file.
            result_dir = os.path.join(output_dir, Path(audio_path).stem)

            result = {'success': True}
            for stem in stems:
                stem_path = os.path.join(result_dir, f"{stem}.wav")
                result[stem] = stem_path if os.path.exists(stem_path) else None
            return result

        except Exception as e:
            return {'success': False, 'error': str(e)}

    def apply_effects(self, audio_path, pitch_shift=0, reverb=0) -> dict:
        """Apply pitch shift and/or reverb and write the result as a WAV file.

        Args:
            audio_path: Path of the audio file to process.
            pitch_shift: Shift in semitones; 0 (or None) leaves pitch untouched.
            reverb: Wet share of the reverb mix; applied only when greater than
                zero and the optional SciPy features were imported.

        Returns:
            ``{'success': True, 'output': <wav path inside temp_dir>}`` or
            ``{'success': False, 'error': <message>}``.
        """
        try:
            y, sr = librosa.load(audio_path)

            if pitch_shift:
                y = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch_shift)

            if reverb and reverb > 0 and ADVANCED_FEATURES:
                y = self._apply_reverb(y, sr, reverb)

            # mkstemp guarantees a fresh file name (no random-number clashes).
            os.makedirs(self.temp_dir, exist_ok=True)
            handle, output_path = tempfile.mkstemp(
                prefix="processed_", suffix=".wav", dir=self.temp_dir
            )
            os.close(handle)
            sf.write(output_path, y, sr)

            return {'success': True, 'output': output_path}

        except Exception as e:
            return {'success': False, 'error': str(e)}

    def extract_vocal_features(self, audio_path) -> dict:
        """Extract the pitch, tempo, timbre and energy figures used for coaching.

        Args:
            audio_path: Path of the (ideally vocals-only) audio file.

        Returns:
            On success: ``success``, ``mean_pitch``, ``pitch_std``,
            ``pitch_range``, ``tempo``, ``spectral_centroid`` and
            ``rms_energy`` as plain floats.
            ``{'success': False, 'error': 'No pitch detected'}`` when no frame
            has a positive dominant pitch; any other failure is reported with
            its exception message.
        """
        try:
            y, sr = librosa.load(audio_path)

            pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
            pitch_values = self._dominant_pitches(pitches, magnitudes)
            if pitch_values.size == 0:
                return {'success': False, 'error': 'No pitch detected'}

            tempo, _ = librosa.beat.beat_track(y=y, sr=sr)

            return {
                'success': True,
                'mean_pitch': float(pitch_values.mean()),
                'pitch_std': float(pitch_values.std()),
                'pitch_range': float(pitch_values.max() - pitch_values.min()),
                'tempo': self._to_float(tempo),
                'spectral_centroid': float(np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))),
                'rms_energy': float(np.mean(librosa.feature.rms(y=y)))
            }

        except Exception as e:
            return {'success': False, 'error': str(e)}

    def compare_vocal_styles(self, user_features, reference_features_list) -> dict:
        """Score a performance against the average of several reference tracks.

        Only NumPy is needed here, so the comparison no longer refuses to run
        when the optional SciPy/dtw imports are missing.

        Args:
            user_features: Feature dict as produced by ``extract_vocal_features``.
            reference_features_list: Non-empty list of such dicts for the
                reference tracks; each feature is averaged over the references
                that contain it.

        Returns:
            On success: ``success``, ``score`` (0-100, one decimal),
            ``feedback`` (four messages: pitch, tempo, timbre, energy) and
            ``metrics`` (the four absolute differences).
            On failure: ``{'success': False, 'error': <message>}``.
        """
        if not reference_features_list:
            # Averaging nothing would compare the singer against all-zero
            # "references" and produce a meaningless score.
            return {'success': False, 'error': 'No reference features provided'}

        try:
            ref_avg = {}
            for key in self._STYLE_KEYS:
                values = [self._to_float(ref[key]) for ref in reference_features_list if key in ref]
                ref_avg[key] = float(np.mean(values)) if values else 0.0

            def gap(key):
                return abs(self._to_float(user_features[key]) - ref_avg[key])

            pitch_diff = gap('mean_pitch')
            tempo_diff = gap('tempo')
            timbre_diff = gap('spectral_centroid')
            energy_diff = gap('rms_energy')

            feedback = []

            if pitch_diff > 50:
                feedback.append(f"🎵 Pitch: Your average pitch differs by {pitch_diff:.1f} Hz. Practice matching the reference key.")
            else:
                feedback.append("🎵 Pitch: Good pitch accuracy!")

            if tempo_diff > 10:
                feedback.append(f"⏱️ Tempo: Your tempo differs by {tempo_diff:.1f} BPM. Work on timing consistency.")
            else:
                feedback.append("⏱️ Tempo: Good timing!")

            if timbre_diff > 500:
                feedback.append("🗣️ Timbre: Try adjusting your vocal tone to match the reference style.")
            else:
                feedback.append("🗣️ Timbre: Good vocal tone match!")

            if energy_diff > 0.1:
                feedback.append("🔊 Energy: Adjust your vocal intensity to match the reference.")
            else:
                feedback.append("🔊 Energy: Good energy level!")

            # Weighted penalty: Hz/2 for pitch, BPM for tempo, Hz/10 for
            # timbre, and RMS difference x100; floored at zero.
            penalty = pitch_diff / 2 + tempo_diff + timbre_diff / 10 + energy_diff * 100
            overall_score = max(0.0, 100 - penalty)

            return {
                'success': True,
                'score': round(overall_score, 1),
                'feedback': feedback,
                'metrics': {
                    'pitch_diff': round(pitch_diff, 1),
                    'tempo_diff': round(tempo_diff, 1),
                    'timbre_diff': round(timbre_diff, 1),
                    'energy_diff': round(energy_diff, 3)
                }
            }

        except Exception as e:
            return {'success': False, 'error': str(e)}

    def cleanup(self):
        """Remove the scratch directory and everything in it (best effort).

        Safe to call more than once; a missing directory or a file that cannot
        be deleted is ignored rather than raised.
        """
        shutil.rmtree(self.temp_dir, ignore_errors=True)

# Global engine instance shared by every Gradio callback in this module.
engine = AudioEngine()

# cleanup() is not called anywhere else in this file, so without this hook the
# engine's scratch directory would be left behind on disk after every run.
atexit.register(engine.cleanup)

def format_analysis_results(analysis):
    """Turn an analysis result dict into the text report shown in the UI.

    Args:
        analysis: Dict with a ``'success'`` flag. When truthy, the feature keys
            read below must be present; otherwise an ``'error'`` message is
            expected.

    Returns:
        A multi-line report on success, or a one-line failure message.
    """
    if analysis['success']:
        report_lines = [
            "📊 Audio Analysis Results",
            "",
            "🎵 Basic Properties:",
            f"• Duration: {analysis['duration']} seconds",
            f"• Sample Rate: {analysis['sample_rate']} Hz",
            f"• Tempo: {analysis['tempo']} BPM",
            "",
            "🔊 Audio Characteristics:",
            f"• Spectral Centroid: {analysis['spectral_centroid']} Hz",
            f"• Spectral Rolloff: {analysis['spectral_rolloff']} Hz",
            f"• Zero Crossing Rate: {analysis['zero_crossing_rate']}",
            f"• RMS Energy: {analysis['rms_energy']}",
            "",
            "🎤 Vocal Information:",
            f"• Average Pitch: {analysis['average_pitch']} Hz",
            f"• Pitch Points Detected: {analysis['pitch_count']}",
            f"• Beats Detected: {analysis['beats_detected']}",
        ]
        return "\n".join(report_lines)

    return f"❌ Analysis failed: {analysis['error']}"

def process_audio_separation(audio_file, separation_mode):
    """Gradio callback: analyse an upload and split it into stems.

    Args:
        audio_file: Path of the uploaded audio (falsy when nothing was uploaded).
        separation_mode: Dropdown label; any label containing ``"2-stem"``
            selects the 2-stem model, everything else the 4-stem model.

    Returns:
        A 6-tuple ``(status, stem1, stem2, stem3, stem4, analysis_text)``.
        In 2-stem mode the stems are vocals and accompaniment followed by two
        ``None`` values; in 4-stem mode they are vocals, drums, bass and other.
        All four stems are ``None`` on any failure.
    """
    no_stems = (None, None, None, None)

    if not audio_file:
        return ("❌ Please upload an audio file", *no_stems, "")

    if not SPLEETER_AVAILABLE:
        return ("❌ Spleeter not available for source separation", *no_stems, "")

    try:
        # The analysis is computed first so it can be shown even if the
        # separation step reports a failure.
        analysis_text = format_analysis_results(engine.analyze_audio(audio_file))

        two_stem_requested = "2-stem" in separation_mode
        model_type = "2stems" if two_stem_requested else "4stems"
        outcome = engine.separate_vocals(audio_file, model_type)

        if not outcome['success']:
            return (f"❌ Separation failed: {outcome['error']}", *no_stems, analysis_text)

        if two_stem_requested:
            headline = "✅ 2-stem separation completed successfully!"
            stems = (outcome.get('vocals'), outcome.get('accompaniment'), None, None)
        else:
            headline = "✅ 4-stem separation completed successfully!"
            stems = tuple(outcome.get(part) for part in ('vocals', 'drums', 'bass', 'other'))

        return (headline, *stems, analysis_text)

    except Exception as e:
        return (f"❌ Processing error: {str(e)}", *no_stems, "")

def process_vocal_effects(audio_file, pitch_shift, reverb_amount):
    """Gradio callback: analyse an audio file and apply pitch shift / reverb.

    Args:
        audio_file: Path of the uploaded or recorded audio (falsy when missing).
        pitch_shift: Pitch shift in semitones.
        reverb_amount: Reverb amount; values above zero request reverb.

    Returns:
        A 3-tuple ``(status, processed_audio_path_or_None, analysis_text)``.
        The status lists only effects that were really applied: the engine
        skips reverb when ``ADVANCED_FEATURES`` is False, and that is reported
        as skipped instead of being claimed as applied.
    """
    if not audio_file:
        return "❌ Please upload an audio file", None, ""

    # Defined before the try so a late failure can still return the analysis.
    analysis_text = ""

    try:
        # Analyze original
        analysis = engine.analyze_audio(audio_file)
        analysis_text = format_analysis_results(analysis)

        # Apply effects
        effects_result = engine.apply_effects(audio_file, pitch_shift, reverb_amount)

        if not effects_result['success']:
            return f"❌ Effects failed: {effects_result['error']}", None, analysis_text

        effects_applied = []
        reverb_skipped = False

        if pitch_shift != 0:
            effects_applied.append(f"Pitch: {pitch_shift:+.1f} semitones")

        if reverb_amount > 0:
            if ADVANCED_FEATURES:
                effects_applied.append(f"Reverb: {reverb_amount:.2f}")
            else:
                # Mirrors the engine's own gate on reverb.
                reverb_skipped = True

        if effects_applied:
            status = f"✅ Effects applied: {', '.join(effects_applied)}"
        else:
            status = "✅ Audio processed (no effects)"

        if reverb_skipped:
            status += " ⚠️ Reverb skipped: advanced features not available"

        return status, effects_result['output'], analysis_text

    except Exception as e:
        return f"❌ Processing error: {str(e)}", None, analysis_text

def process_style_coaching(reference_files, user_audio):
    """Gradio callback: compare a performance against reference tracks.

    Vocals are separated from each reference (at most the first five) and from
    the user's recording, vocal features are extracted, and the user's features
    are scored against the reference average.

    Args:
        reference_files: Uploaded reference tracks. Each item may be a path
            (``str`` / ``os.PathLike``) or an object exposing the path as
            ``.name``. At least two are required.
        user_audio: Path of the user's performance.

    Returns:
        A 3-tuple ``(status, reference_processing_log, feedback_text)``.
        ``feedback_text`` is empty unless the whole pipeline succeeded; the log
        is kept even when a later step fails or raises.
    """
    if not reference_files or len(reference_files) < 2:
        return "❌ Upload at least 2 reference tracks", "", ""

    if not user_audio:
        return "❌ Please record or upload your performance", "", ""

    if not SPLEETER_AVAILABLE or not ADVANCED_FEATURES:
        return "❌ Style coaching requires advanced features", "", ""

    max_references = 5
    ref_features = []
    ref_status = []  # lives outside the try so the log survives an exception

    try:
        selected = reference_files[:max_references]

        for i, ref_file in enumerate(selected, start=1):
            # A plain path has no usable `.name` (str has none, and
            # Path.name is only the basename), so it is passed through as-is.
            if isinstance(ref_file, (str, os.PathLike)):
                ref_path = os.fspath(ref_file)
            else:
                ref_path = ref_file.name

            # Separate vocals
            separation_result = engine.separate_vocals(ref_path, "2stems")
            if not (separation_result['success'] and separation_result.get('vocals')):
                ref_status.append(f"❌ Reference {i}: Vocal separation failed")
                continue

            # Extract features
            features = engine.extract_vocal_features(separation_result['vocals'])
            if features['success']:
                ref_features.append(features)
                ref_status.append(f"✅ Reference {i}: Processed")
            else:
                ref_status.append(f"❌ Reference {i}: Feature extraction failed")

        if len(ref_features) < 2:
            return "❌ Need at least 2 valid reference tracks", "\n".join(ref_status), ""

        # Process user audio
        user_separation = engine.separate_vocals(user_audio, "2stems")
        if not user_separation['success'] or not user_separation.get('vocals'):
            return "❌ Could not separate vocals from your performance", "\n".join(ref_status), ""

        user_features = engine.extract_vocal_features(user_separation['vocals'])
        if not user_features['success']:
            return "❌ Could not analyze your vocal features", "\n".join(ref_status), ""

        # Compare styles
        comparison = engine.compare_vocal_styles(user_features, ref_features)
        if not comparison['success']:
            return f"❌ Style comparison failed: {comparison['error']}", "\n".join(ref_status), ""

        score = comparison['score']
        metrics = comparison['metrics']
        detailed_feedback = "\n".join(comparison['feedback'])

        if score > 80:
            recommendation = "🔥 Excellent! You're very close to the target style."
        elif score > 60:
            recommendation = "📈 Good progress! Focus on the areas mentioned above."
        else:
            recommendation = "💪 Keep practicing! Work on basic vocal technique first."

        # The denominator is the number of references actually processed, so
        # "2/2" no longer reads as "2/5" when only two tracks were uploaded.
        feedback_text = f"""🎯 Vocal Style Coaching Results

📊 Overall Score: {score}/100

🎵 Detailed Feedback:
{detailed_feedback}

📈 Technical Metrics:
• Pitch Difference: {metrics['pitch_diff']} Hz
• Tempo Difference: {metrics['tempo_diff']} BPM
• Timbre Difference: {metrics['timbre_diff']} Hz
• Energy Difference: {metrics['energy_diff']}

🎯 Recommendations:
{recommendation}

References analyzed: {len(ref_features)}/{len(selected)}"""

        return f"✅ Style coaching complete! Score: {score}/100", "\n".join(ref_status), feedback_text

    except Exception as e:
        return f"❌ Coaching failed: {str(e)}", "\n".join(ref_status), ""

# Create main interface
def create_app():
    """Build the Gradio Blocks interface and wire its buttons to the callbacks.

    The UI has five tabs: Audio Separation, Vocal Effects, Live Recording,
    Style Coaching and Help. Four buttons are connected to the module-level
    ``process_*`` functions; the Live Recording tab reuses
    ``process_vocal_effects`` with microphone input.

    Returns:
        The ``gr.Blocks`` app, not yet launched.
    """
    
    with gr.Blocks(title="Audio Singing Helper") as app:
        
        # Page banner.
        gr.HTML("""
        <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 10px; margin-bottom: 20px;">
            <h1>🎤 Audio Singing Helper</h1>
            <p>Professional audio processing for singers and musicians</p>
        </div>
        """)
        
        with gr.Tabs():
            
            # Audio Separation Tab
            with gr.Tab("🎵 Audio Separation"):
                gr.Markdown("### Separate vocals from instrumental tracks")
                
                with gr.Row():
                    # Left column: inputs. Right column: status and analysis text.
                    with gr.Column():
                        sep_audio_input = gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"])
                        # process_audio_separation picks the model by looking for
                        # the substring "2-stem" in the selected label.
                        sep_mode = gr.Dropdown(
                            choices=["2-stem (Vocals + Instrumental)", "4-stem (Vocals + Drums + Bass + Other)"],
                            value="2-stem (Vocals + Instrumental)",
                            label="Separation Mode"
                        )
                        sep_button = gr.Button("🎯 Separate Audio", variant="primary")
                    
                    with gr.Column():
                        sep_status = gr.Textbox(label="Status", lines=2, interactive=False)
                        sep_analysis = gr.Textbox(label="Audio Analysis", lines=12, interactive=False)
                
                # Stem players. The second player receives the accompaniment in
                # 2-stem mode and the drums in 4-stem mode (hence its label).
                with gr.Row():
                    sep_vocals = gr.Audio(label="🎤 Vocals", show_download_button=True)
                    sep_instrumental = gr.Audio(label="🎼 Instrumental/Drums", show_download_button=True)
                
                # These two receive None in 2-stem mode.
                with gr.Row():
                    sep_bass = gr.Audio(label="🎸 Bass", show_download_button=True)
                    sep_other = gr.Audio(label="🎹 Other", show_download_button=True)
            
            # Vocal Effects Tab
            with gr.Tab("🎛️ Vocal Effects"):
                gr.Markdown("### Apply professional vocal effects")
                
                with gr.Row():
                    with gr.Column():
                        fx_audio_input = gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"])
                        # Slider positional arguments are (minimum, maximum, initial value).
                        fx_pitch = gr.Slider(-12, 12, 0, step=0.5, label="Pitch Shift (semitones)")
                        fx_reverb = gr.Slider(0, 0.5, 0, step=0.05, label="Reverb Amount")
                        fx_button = gr.Button("🎵 Apply Effects", variant="primary")
                    
                    with gr.Column():
                        fx_status = gr.Textbox(label="Status", lines=2, interactive=False)
                        fx_analysis = gr.Textbox(label="Audio Analysis", lines=10, interactive=False)
                
                fx_output = gr.Audio(label="🎧 Processed Audio", show_download_button=True)
            
            # Live Recording Tab
            # Same controls as the Vocal Effects tab, but fed from the microphone;
            # processing runs when the button is clicked.
            with gr.Tab("🎙️ Live Recording"):
                gr.Markdown("### Record and process your voice in real-time")
                
                with gr.Row():
                    with gr.Column():
                        live_audio = gr.Audio(type="filepath", sources=["microphone"], label="Record Your Voice")
                        live_pitch = gr.Slider(-12, 12, 0, step=0.5, label="Pitch Correction")
                        live_reverb = gr.Slider(0, 0.5, 0, step=0.05, label="Reverb")
                        live_button = gr.Button("🎤 Process Recording", variant="primary")
                    
                    with gr.Column():
                        live_status = gr.Textbox(label="Status", lines=2, interactive=False)
                        live_analysis = gr.Textbox(label="Recording Analysis", lines=10, interactive=False)
                
                live_output = gr.Audio(label="🎧 Processed Recording", show_download_button=True)
            
            # Style Coaching Tab
            with gr.Tab("🎭 Style Coaching"):
                gr.Markdown("### Get personalized vocal coaching feedback")
                
                with gr.Row():
                    with gr.Column():
                        # Multiple reference uploads; process_style_coaching
                        # requires at least two and uses at most the first five.
                        coach_refs = gr.File(
                            label="Reference Tracks (2-5 files)", 
                            file_count="multiple",
                            file_types=["audio"]
                        )
                        coach_user = gr.Audio(
                            type="filepath", 
                            label="Your Performance",
                            sources=["upload", "microphone"]
                        )
                        coach_button = gr.Button("🎯 Get Coaching", variant="primary")
                    
                    with gr.Column():
                        coach_status = gr.Textbox(label="Status", lines=3, interactive=False)
                        coach_refs_status = gr.Textbox(label="Reference Processing", lines=8, interactive=False)
                
                coach_feedback = gr.Textbox(label="🎯 Coaching Feedback", lines=15, interactive=False)
            
            # Help Tab
            # Static user guide; no inputs or outputs are wired here.
            with gr.Tab("ℹ️ Help"):
                gr.Markdown("""
                # 🎤 Audio Singing Helper - User Guide
                
                ## Features
                
                ### 🎵 Audio Separation
                - Upload any song to separate vocals from instruments
                - Choose 2-stem (vocals + instrumental) or 4-stem (vocals + drums + bass + other)
                - Get detailed audio analysis of your tracks
                
                ### 🎛️ Vocal Effects
                - Apply pitch shifting (-12 to +12 semitones)
                - Add reverb for spatial depth
                - Process any audio file with professional effects
                
                ### 🎙️ Live Recording
                - Record directly from your microphone
                - Apply real-time pitch correction and reverb
                - Perfect for vocal practice and experimentation
                
                ### 🎭 Style Coaching
                - Upload 2-5 reference tracks from artists you want to emulate
                - Record or upload your performance
                - Get AI-powered feedback on pitch, timing, and vocal characteristics
                - Receive a score and specific improvement suggestions
                
                ## Tips for Best Results
                
                - **Use high-quality audio files** - better input = better results
                - **Keep files under 5 minutes** for faster processing
                - **For style coaching**: Choose references from similar genres
                - **Record in quiet environments** for best analysis
                
                ## Supported Formats
                - Input: MP3, WAV, FLAC, M4A, OGG
                - Output: High-quality WAV files
                
                ## Technical Requirements
                - Some features require additional dependencies
                - Processing time varies based on file length and complexity
                
                ---
                Built for singers and musicians worldwide 🌍
                """)
        
        # Connect all the event handlers
        # The order of each `outputs` list must match the order of the tuple
        # returned by the corresponding callback.
        sep_button.click(
            process_audio_separation,
            inputs=[sep_audio_input, sep_mode],
            outputs=[sep_status, sep_vocals, sep_instrumental, sep_bass, sep_other, sep_analysis]
        )
        
        fx_button.click(
            process_vocal_effects,
            inputs=[fx_audio_input, fx_pitch, fx_reverb],
            outputs=[fx_status, fx_output, fx_analysis]
        )
        
        # The live tab shares process_vocal_effects with the effects tab.
        live_button.click(
            process_vocal_effects,
            inputs=[live_audio, live_pitch, live_reverb],
            outputs=[live_status, live_output, live_analysis]
        )
        
        coach_button.click(
            process_style_coaching,
            inputs=[coach_refs, coach_user],
            outputs=[coach_status, coach_refs_status, coach_feedback]
        )
    
    return app

# Script entry point: build the interface and start the Gradio server only
# when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    app = create_app()
    # launch() is called without arguments, i.e. with Gradio's default settings.
    app.launch()