latterworks committed (verified)
Commit d7fc5bc · 1 Parent(s): d870ff6

Update app.py

Files changed (1):
  1. app.py +387 -548
app.py CHANGED
@@ -1,611 +1,450 @@
 import gradio as gr
-import librosa
-import numpy as np
-import soundfile as sf
 import os
 import tempfile
-import shutil
 from pathlib import Path
 import warnings
-warnings.filterwarnings("ignore")

-# Import for advanced features
-try:
-    from spleeter.separator import Separator
-    SPLEETER_AVAILABLE = True
-except ImportError:
-    SPLEETER_AVAILABLE = False
-    print("Spleeter not available - source separation disabled")

-try:
-    import scipy.signal
-    from scipy.spatial.distance import euclidean
-    from dtw import dtw
-    ADVANCED_FEATURES = True
-except ImportError:
-    ADVANCED_FEATURES = False
-    print("Advanced features not available")
-
-class AudioEngine:
-    """Clean, professional audio processing engine"""
-
     def __init__(self):
         self.temp_dir = tempfile.mkdtemp()
-        self.separators = {}  # Cache for Spleeter models

-    def analyze_audio(self, audio_path):
-        """Extract comprehensive audio features"""
         try:
-            # Load audio
-            y, sr = librosa.load(audio_path)
-
-            # Basic properties
-            duration = len(y) / sr
-            tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
             # Spectral features
-            spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
-            spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
-            zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(y))
-
-            # Energy features
-            rms_energy = np.mean(librosa.feature.rms(y=y))
-
-            # Pitch estimation
-            pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
-            pitch_values = []
-            for t in range(pitches.shape[1]):
-                index = magnitudes[:, t].argmax()
-                pitch = pitches[index, t]
-                if pitch > 0:
-                    pitch_values.append(pitch)
-
-            avg_pitch = np.mean(pitch_values) if pitch_values else 0
-
-            return {
-                'success': True,
-                'duration': round(duration, 2),
-                'tempo': round(tempo, 1),
-                'sample_rate': sr,
-                'spectral_centroid': round(spectral_centroid, 2),
-                'spectral_rolloff': round(spectral_rolloff, 2),
-                'zero_crossing_rate': round(zero_crossing_rate, 4),
-                'rms_energy': round(rms_energy, 4),
-                'average_pitch': round(avg_pitch, 2),
-                'pitch_count': len(pitch_values),
-                'beats_detected': len(beats)
-            }

         except Exception as e:
-            return {'success': False, 'error': str(e)}

-    def separate_vocals(self, audio_path, model_type="2stems"):
-        """Separate vocals using Spleeter"""
-        if not SPLEETER_AVAILABLE:
-            return {'success': False, 'error': 'Spleeter not available'}

         try:
-            # Load or create separator
-            if model_type not in self.separators:
-                self.separators[model_type] = Separator(f'spleeter:{model_type}-16kHz')

-            separator = self.separators[model_type]

-            # Create output directory
-            output_dir = os.path.join(self.temp_dir, f"separation_{np.random.randint(10000)}")
-            os.makedirs(output_dir, exist_ok=True)

-            # Separate
-            separator.separate_to_file(audio_path, output_dir)

-            # Get results
-            audio_name = Path(audio_path).stem
-            result_dir = os.path.join(output_dir, audio_name)

-            if model_type == "2stems":
-                vocals_path = os.path.join(result_dir, "vocals.wav")
-                accompaniment_path = os.path.join(result_dir, "accompaniment.wav")
-
-                return {
-                    'success': True,
-                    'vocals': vocals_path if os.path.exists(vocals_path) else None,
-                    'accompaniment': accompaniment_path if os.path.exists(accompaniment_path) else None
-                }
-
-            elif model_type == "4stems":
-                vocals_path = os.path.join(result_dir, "vocals.wav")
-                drums_path = os.path.join(result_dir, "drums.wav")
-                bass_path = os.path.join(result_dir, "bass.wav")
-                other_path = os.path.join(result_dir, "other.wav")
-
-                return {
-                    'success': True,
-                    'vocals': vocals_path if os.path.exists(vocals_path) else None,
-                    'drums': drums_path if os.path.exists(drums_path) else None,
-                    'bass': bass_path if os.path.exists(bass_path) else None,
-                    'other': other_path if os.path.exists(other_path) else None
-                }

-        except Exception as e:
-            return {'success': False, 'error': str(e)}
-
-    def apply_effects(self, audio_path, pitch_shift=0, reverb=0):
-        """Apply vocal effects"""
-        try:
-            y, sr = librosa.load(audio_path)

-            # Apply pitch shift
-            if pitch_shift != 0:
-                y = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch_shift)

-            # Apply reverb (simple convolution)
-            if reverb > 0 and ADVANCED_FEATURES:
-                reverb_length = int(0.5 * sr)
-                impulse = np.random.randn(reverb_length) * np.exp(-np.arange(reverb_length) / (sr * 0.1))
-                y = scipy.signal.convolve(y, impulse * reverb, mode='same')
-                y = y / np.max(np.abs(y))  # Normalize

-            # Save processed audio
-            output_path = os.path.join(self.temp_dir, f"processed_{np.random.randint(10000)}.wav")
-            sf.write(output_path, y, sr)

-            return {'success': True, 'output': output_path}

-        except Exception as e:
-            return {'success': False, 'error': str(e)}
-
-    def extract_vocal_features(self, audio_path):
-        """Extract features for style coaching"""
-        try:
-            y, sr = librosa.load(audio_path)

-            # Pitch analysis
-            pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
-            pitch_values = []
-            for t in range(pitches.shape[1]):
-                index = magnitudes[:, t].argmax()
-                pitch = pitches[index, t]
-                if pitch > 0:
-                    pitch_values.append(pitch)

-            if not pitch_values:
-                return {'success': False, 'error': 'No pitch detected'}

-            # Basic vocal metrics
-            mean_pitch = np.mean(pitch_values)
-            pitch_std = np.std(pitch_values)
-            pitch_range = max(pitch_values) - min(pitch_values)

-            # Tempo
-            tempo, _ = librosa.beat.beat_track(y=y, sr=sr)

-            # Spectral features
-            spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))

-            # Energy
-            rms_energy = np.mean(librosa.feature.rms(y=y))

-            return {
-                'success': True,
-                'mean_pitch': mean_pitch,
-                'pitch_std': pitch_std,
-                'pitch_range': pitch_range,
-                'tempo': tempo,
-                'spectral_centroid': spectral_centroid,
-                'rms_energy': rms_energy
-            }

         except Exception as e:
-            return {'success': False, 'error': str(e)}

-    def compare_vocal_styles(self, user_features, reference_features_list):
-        """Compare user vocals to reference style"""
-        if not ADVANCED_FEATURES:
-            return {'success': False, 'error': 'Advanced features not available'}

         try:
-            # Average reference features
-            ref_avg = {}
-            for key in ['mean_pitch', 'pitch_std', 'pitch_range', 'tempo', 'spectral_centroid', 'rms_energy']:
-                values = [ref[key] for ref in reference_features_list if key in ref]
-                ref_avg[key] = np.mean(values) if values else 0
-
-            # Calculate differences
-            pitch_diff = abs(user_features['mean_pitch'] - ref_avg['mean_pitch'])
-            tempo_diff = abs(user_features['tempo'] - ref_avg['tempo'])
-            timbre_diff = abs(user_features['spectral_centroid'] - ref_avg['spectral_centroid'])
-            energy_diff = abs(user_features['rms_energy'] - ref_avg['rms_energy'])
-
-            # Generate feedback
-            feedback = []
-
-            if pitch_diff > 50:
-                feedback.append(f"🎵 Pitch: Your average pitch differs by {pitch_diff:.1f} Hz. Practice matching the reference key.")
-            else:
-                feedback.append("🎵 Pitch: Good pitch accuracy!")
-
-            if tempo_diff > 10:
-                feedback.append(f"⏱️ Tempo: Your tempo differs by {tempo_diff:.1f} BPM. Work on timing consistency.")
-            else:
-                feedback.append("⏱️ Tempo: Good timing!")
-
-            if timbre_diff > 500:
-                feedback.append("🗣️ Timbre: Try adjusting your vocal tone to match the reference style.")
-            else:
-                feedback.append("🗣️ Timbre: Good vocal tone match!")
-
-            if energy_diff > 0.1:
-                feedback.append("🔊 Energy: Adjust your vocal intensity to match the reference.")
-            else:
-                feedback.append("🔊 Energy: Good energy level!")
-
-            overall_score = max(0, 100 - (pitch_diff/2 + tempo_diff + timbre_diff/10 + energy_diff*100))
-
-            return {
-                'success': True,
-                'score': round(overall_score, 1),
-                'feedback': feedback,
-                'metrics': {
-                    'pitch_diff': round(pitch_diff, 1),
-                    'tempo_diff': round(tempo_diff, 1),
-                    'timbre_diff': round(timbre_diff, 1),
-                    'energy_diff': round(energy_diff, 3)
-                }
-            }

         except Exception as e:
-            return {'success': False, 'error': str(e)}
-
-    def cleanup(self):
-        """Clean up temporary files"""
-        try:
-            if os.path.exists(self.temp_dir):
-                shutil.rmtree(self.temp_dir)
-        except Exception:
-            pass

-# Global engine instance
-engine = AudioEngine()

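For a feel for the removed scoring heuristic: the four difference terms are weighted so that 2 Hz of pitch error, 1 BPM of tempo error, 10 Hz of centroid error, and 0.01 of RMS error each cost one point. A worked example with illustrative values:

```python
# Illustrative values plugged into the removed overall_score formula
pitch_diff, tempo_diff, timbre_diff, energy_diff = 20.0, 5.0, 200.0, 0.02
overall_score = max(0, 100 - (pitch_diff/2 + tempo_diff + timbre_diff/10 + energy_diff*100))
print(overall_score)  # 100 - (10 + 5 + 20 + 2) = 63.0
```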
-def format_analysis_results(analysis):
-    """Format analysis results for display"""
-    if not analysis['success']:
-        return f"❌ Analysis failed: {analysis['error']}"
-
-    return f"""📊 Audio Analysis Results

-🎵 Basic Properties:
-• Duration: {analysis['duration']} seconds
-• Sample Rate: {analysis['sample_rate']} Hz
-• Tempo: {analysis['tempo']} BPM
-
-🔊 Audio Characteristics:
-• Spectral Centroid: {analysis['spectral_centroid']} Hz
-• Spectral Rolloff: {analysis['spectral_rolloff']} Hz
-• Zero Crossing Rate: {analysis['zero_crossing_rate']}
-• RMS Energy: {analysis['rms_energy']}
-
-🎤 Vocal Information:
-• Average Pitch: {analysis['average_pitch']} Hz
-• Pitch Points Detected: {analysis['pitch_count']}
-• Beats Detected: {analysis['beats_detected']}"""
-
-def process_audio_separation(audio_file, separation_mode):
-    """Main audio separation function"""
-    if not audio_file:
-        return "❌ Please upload an audio file", None, None, None, None, ""
-
-    if not SPLEETER_AVAILABLE:
-        return "❌ Spleeter not available for source separation", None, None, None, None, ""

-    try:
-        # Analyze audio first
-        analysis = engine.analyze_audio(audio_file)
-        analysis_text = format_analysis_results(analysis)
-
-        # Separate audio
-        model_type = "2stems" if "2-stem" in separation_mode else "4stems"
-        separation_result = engine.separate_vocals(audio_file, model_type)
-
-        if not separation_result['success']:
-            return f"❌ Separation failed: {separation_result['error']}", None, None, None, None, analysis_text
-
-        if model_type == "2stems":
-            return (
-                "✅ 2-stem separation completed successfully!",
-                separation_result.get('vocals'),
-                separation_result.get('accompaniment'),
-                None,
-                None,
-                analysis_text
-            )
-        else:
-            return (
-                "✅ 4-stem separation completed successfully!",
-                separation_result.get('vocals'),
-                separation_result.get('drums'),
-                separation_result.get('bass'),
-                separation_result.get('other'),
-                analysis_text
-            )
-
-    except Exception as e:
-        return f"❌ Processing error: {str(e)}", None, None, None, None, ""

-def process_vocal_effects(audio_file, pitch_shift, reverb_amount):
-    """Apply vocal effects to audio"""
-    if not audio_file:
-        return "Please upload an audio file", None, ""

-    try:
-        # Analyze original
-        analysis = engine.analyze_audio(audio_file)
-        analysis_text = format_analysis_results(analysis)
-
-        # Apply effects
-        effects_result = engine.apply_effects(audio_file, pitch_shift, reverb_amount)
-
-        if not effects_result['success']:
-            return f"❌ Effects failed: {effects_result['error']}", None, analysis_text
-
-        effects_applied = []
-        if pitch_shift != 0:
-            effects_applied.append(f"Pitch: {pitch_shift:+.1f} semitones")
-        if reverb_amount > 0:
-            effects_applied.append(f"Reverb: {reverb_amount:.2f}")
-
-        status = f"✅ Effects applied: {', '.join(effects_applied)}" if effects_applied else "✅ Audio processed (no effects)"
-
-        return status, effects_result['output'], analysis_text
-
-    except Exception as e:
-        return f"❌ Processing error: {str(e)}", None, ""

-def process_style_coaching(reference_files, user_audio):
-    """Style coaching analysis"""
-    if not reference_files or len(reference_files) < 2:
-        return "❌ Upload at least 2 reference tracks", "", ""
-
-    if not user_audio:
-        return "❌ Please record or upload your performance", "", ""

-    if not SPLEETER_AVAILABLE or not ADVANCED_FEATURES:
-        return "❌ Style coaching requires advanced features", "", ""
-
-    try:
-        # Process reference tracks
-        ref_features = []
-        ref_status = []
-
-        for i, ref_file in enumerate(reference_files[:5]):
-            # Separate vocals
-            separation_result = engine.separate_vocals(ref_file.name, "2stems")
-            if separation_result['success'] and separation_result.get('vocals'):
-                # Extract features
-                features = engine.extract_vocal_features(separation_result['vocals'])
-                if features['success']:
-                    ref_features.append(features)
-                    ref_status.append(f"✅ Reference {i+1}: Processed")
-                else:
-                    ref_status.append(f"❌ Reference {i+1}: Feature extraction failed")
-            else:
-                ref_status.append(f"❌ Reference {i+1}: Vocal separation failed")
-
-        if len(ref_features) < 2:
-            return "❌ Need at least 2 valid reference tracks", "\n".join(ref_status), ""
-
-        # Process user audio
-        user_separation = engine.separate_vocals(user_audio, "2stems")
-        if not user_separation['success'] or not user_separation.get('vocals'):
-            return "❌ Could not separate vocals from your performance", "\n".join(ref_status), ""
-
-        user_features = engine.extract_vocal_features(user_separation['vocals'])
-        if not user_features['success']:
-            return "❌ Could not analyze your vocal features", "\n".join(ref_status), ""
-
-        # Compare styles
-        comparison = engine.compare_vocal_styles(user_features, ref_features)
-        if not comparison['success']:
-            return f"❌ Style comparison failed: {comparison['error']}", "\n".join(ref_status), ""
-
-        # Format feedback
-        feedback_text = f"""🎯 Vocal Style Coaching Results
-
-📊 Overall Score: {comparison['score']}/100
-
-🎵 Detailed Feedback:
-{chr(10).join(comparison['feedback'])}

-📈 Technical Metrics:
-• Pitch Difference: {comparison['metrics']['pitch_diff']} Hz
-• Tempo Difference: {comparison['metrics']['tempo_diff']} BPM
-• Timbre Difference: {comparison['metrics']['timbre_diff']} Hz
-• Energy Difference: {comparison['metrics']['energy_diff']}
-
-🎯 Recommendations:
-{f"🔥 Excellent! You're very close to the target style." if comparison['score'] > 80 else
-f"📈 Good progress! Focus on the areas mentioned above." if comparison['score'] > 60 else
-f"💪 Keep practicing! Work on basic vocal technique first."}
-
-References analyzed: {len(ref_features)}/5"""
-
-        return f"✅ Style coaching complete! Score: {comparison['score']}/100", "\n".join(ref_status), feedback_text

-    except Exception as e:
-        return f"❌ Coaching failed: {str(e)}", "", ""
-
-# Create main interface
-def create_app():

-    with gr.Blocks(title="Audio Singing Helper") as app:
-
-        gr.HTML("""
-        <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 10px; margin-bottom: 20px;">
-            <h1>🎤 Audio Singing Helper</h1>
-            <p>Professional audio processing for singers and musicians</p>
-        </div>
-        """)

-        with gr.Tabs():
-
-            # Audio Separation Tab
-            with gr.Tab("🎵 Audio Separation"):
-                gr.Markdown("### Separate vocals from instrumental tracks")
-
-                with gr.Row():
-                    with gr.Column():
-                        sep_audio_input = gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"])
-                        sep_mode = gr.Dropdown(
-                            choices=["2-stem (Vocals + Instrumental)", "4-stem (Vocals + Drums + Bass + Other)"],
-                            value="2-stem (Vocals + Instrumental)",
-                            label="Separation Mode"
-                        )
-                        sep_button = gr.Button("🎯 Separate Audio", variant="primary")
-
-                    with gr.Column():
-                        sep_status = gr.Textbox(label="Status", lines=2, interactive=False)
-                        sep_analysis = gr.Textbox(label="Audio Analysis", lines=12, interactive=False)

-                with gr.Row():
-                    sep_vocals = gr.Audio(label="🎤 Vocals", show_download_button=True)
-                    sep_instrumental = gr.Audio(label="🎼 Instrumental/Drums", show_download_button=True)
-
-                with gr.Row():
-                    sep_bass = gr.Audio(label="🎸 Bass", show_download_button=True)
-                    sep_other = gr.Audio(label="🎹 Other", show_download_button=True)
-
-            # Vocal Effects Tab
-            with gr.Tab("🎛️ Vocal Effects"):
-                gr.Markdown("### Apply professional vocal effects")
-
-                with gr.Row():
-                    with gr.Column():
-                        fx_audio_input = gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"])
-                        fx_pitch = gr.Slider(-12, 12, 0, step=0.5, label="Pitch Shift (semitones)")
-                        fx_reverb = gr.Slider(0, 0.5, 0, step=0.05, label="Reverb Amount")
-                        fx_button = gr.Button("🎵 Apply Effects", variant="primary")
-
-                    with gr.Column():
-                        fx_status = gr.Textbox(label="Status", lines=2, interactive=False)
-                        fx_analysis = gr.Textbox(label="Audio Analysis", lines=10, interactive=False)
-
-                fx_output = gr.Audio(label="🎧 Processed Audio", show_download_button=True)
-
-            # Live Recording Tab
-            with gr.Tab("🎙️ Live Recording"):
-                gr.Markdown("### Record and process your voice in real-time")

-                with gr.Row():
-                    with gr.Column():
-                        live_audio = gr.Audio(type="filepath", sources=["microphone"], label="Record Your Voice")
-                        live_pitch = gr.Slider(-12, 12, 0, step=0.5, label="Pitch Correction")
-                        live_reverb = gr.Slider(0, 0.5, 0, step=0.05, label="Reverb")
-                        live_button = gr.Button("🎤 Process Recording", variant="primary")
-
-                    with gr.Column():
-                        live_status = gr.Textbox(label="Status", lines=2, interactive=False)
-                        live_analysis = gr.Textbox(label="Recording Analysis", lines=10, interactive=False)
-
-                live_output = gr.Audio(label="🎧 Processed Recording", show_download_button=True)
-
-            # Style Coaching Tab
-            with gr.Tab("🎭 Style Coaching"):
-                gr.Markdown("### Get personalized vocal coaching feedback")
-
-                with gr.Row():
-                    with gr.Column():
-                        coach_refs = gr.File(
-                            label="Reference Tracks (2-5 files)",
-                            file_count="multiple",
-                            file_types=["audio"]
                         )
-                        coach_user = gr.Audio(
-                            type="filepath",
-                            label="Your Performance",
-                            sources=["upload", "microphone"]
                         )
-                        coach_button = gr.Button("🎯 Get Coaching", variant="primary")

-                    with gr.Column():
-                        coach_status = gr.Textbox(label="Status", lines=3, interactive=False)
-                        coach_refs_status = gr.Textbox(label="Reference Processing", lines=8, interactive=False)
-
-                coach_feedback = gr.Textbox(label="🎯 Coaching Feedback", lines=15, interactive=False)
-
-            # Help Tab
-            with gr.Tab("ℹ️ Help"):
-                gr.Markdown("""
-                # 🎤 Audio Singing Helper - User Guide
-
-                ## Features
-
-                ### 🎵 Audio Separation
-                - Upload any song to separate vocals from instruments
-                - Choose 2-stem (vocals + instrumental) or 4-stem (vocals + drums + bass + other)
-                - Get detailed audio analysis of your tracks
-
-                ### 🎛️ Vocal Effects
-                - Apply pitch shifting (-12 to +12 semitones)
-                - Add reverb for spatial depth
-                - Process any audio file with professional effects
-
-                ### 🎙️ Live Recording
-                - Record directly from your microphone
-                - Apply real-time pitch correction and reverb
-                - Perfect for vocal practice and experimentation
-
-                ### 🎭 Style Coaching
-                - Upload 2-5 reference tracks from artists you want to emulate
-                - Record or upload your performance
-                - Get AI-powered feedback on pitch, timing, and vocal characteristics
-                - Receive a score and specific improvement suggestions
-
-                ## Tips for Best Results
-
-                - **Use high-quality audio files** - better input = better results
-                - **Keep files under 5 minutes** for faster processing
-                - **For style coaching**: Choose references from similar genres
-                - **Record in quiet environments** for best analysis
-
-                ## Supported Formats
-                - Input: MP3, WAV, FLAC, M4A, OGG
-                - Output: High-quality WAV files
-
-                ## Technical Requirements
-                - Some features require additional dependencies
-                - Processing time varies based on file length and complexity
-
-                ---
-                Built for singers and musicians worldwide 🌍
-                """)
-
-        # Connect all the event handlers
-        sep_button.click(
-            process_audio_separation,
-            inputs=[sep_audio_input, sep_mode],
-            outputs=[sep_status, sep_vocals, sep_instrumental, sep_bass, sep_other, sep_analysis]
-        )
-
-        fx_button.click(
-            process_vocal_effects,
-            inputs=[fx_audio_input, fx_pitch, fx_reverb],
-            outputs=[fx_status, fx_output, fx_analysis]
-        )
-
-        live_button.click(
-            process_vocal_effects,
-            inputs=[live_audio, live_pitch, live_reverb],
-            outputs=[live_status, live_output, live_analysis]
-        )
-
-        coach_button.click(
-            process_style_coaching,
-            inputs=[coach_refs, coach_user],
-            outputs=[coach_status, coach_refs_status, coach_feedback]
-        )

-    return app

 if __name__ == "__main__":
-    app = create_app()
-    app.launch()

 import gradio as gr
+import subprocess
 import os
 import tempfile
+import librosa
+import librosa.display
+import matplotlib.pyplot as plt
+import numpy as np
+import scipy.ndimage
 from pathlib import Path
 import warnings
+warnings.filterwarnings('ignore')

+# Set matplotlib backend for web display
+plt.switch_backend('Agg')

+class AudioAnalyzer:
     def __init__(self):
         self.temp_dir = tempfile.mkdtemp()
+
+    def download_youtube_audio(self, video_url, progress=gr.Progress()):
+        """Download audio from YouTube video using yt-dlp."""
+        if not video_url:
+            return None, "Please provide a YouTube URL"
+
+        progress(0.1, desc="Initializing download...")
+
+        output_dir = os.path.join(self.temp_dir, "downloaded_audio")
+        os.makedirs(output_dir, exist_ok=True)
+
+        # yt-dlp command to extract audio in mp3 format
+        command = [
+            "yt-dlp",
+            "-x",
+            "--audio-format", "mp3",
+            "-o", os.path.join(output_dir, "%(title)s.%(ext)s"),
+            "--no-playlist",
+            "--restrict-filenames",
+            video_url
+        ]
+
+        try:
+            progress(0.3, desc="Downloading audio...")
+            result = subprocess.run(command, check=True, capture_output=True, text=True)
+
+            # Find the downloaded file
+            for file in os.listdir(output_dir):
+                if file.endswith('.mp3'):
+                    file_path = os.path.join(output_dir, file)
+                    progress(1.0, desc="Download complete!")
+                    return file_path, f"Successfully downloaded: {file}"
+
+            return None, "Download completed but no audio file found"
+
+        except FileNotFoundError:
+            return None, "yt-dlp not found. Please install it: pip install yt-dlp"
+        except subprocess.CalledProcessError as e:
+            return None, f"Download failed: {e.stderr}"
+        except Exception as e:
+            return None, f"Unexpected error: {str(e)}"

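The same download could also go through yt-dlp's Python API rather than a subprocess, which removes the PATH dependency and surfaces failures as exceptions. A minimal sketch under that assumption (the `FFmpegExtractAudio` post-processor still requires ffmpeg to be installed; the function name is illustrative):

```python
from pathlib import Path
from yt_dlp import YoutubeDL

def download_audio_api(url: str, out_dir: str) -> str:
    """Sketch: download best audio and convert to mp3 via yt-dlp's Python API."""
    opts = {
        "format": "bestaudio/best",
        "outtmpl": f"{out_dir}/%(title)s.%(ext)s",
        "noplaylist": True,
        "restrictfilenames": True,
        "postprocessors": [{"key": "FFmpegExtractAudio", "preferredcodec": "mp3"}],
    }
    with YoutubeDL(opts) as ydl:
        info = ydl.extract_info(url, download=True)
        # prepare_filename reports the pre-postprocessing name; the mp3 replaces it
        return str(Path(ydl.prepare_filename(info)).with_suffix(".mp3"))
```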
+    def extract_basic_features(self, audio_path, sr=16000, progress=gr.Progress()):
+        """Extract basic audio features and create visualizations."""
+        if not audio_path or not os.path.exists(audio_path):
+            return None, None, "Invalid audio file"

         try:
+            progress(0.1, desc="Loading audio...")
+            y, sr = librosa.load(audio_path, sr=sr)
+            duration = librosa.get_duration(y=y, sr=sr)
+
+            # Limit to first 60 seconds for processing speed
+            max_duration = 60
+            if duration > max_duration:
+                y = y[:sr * max_duration]
+                duration = max_duration
+
+            progress(0.3, desc="Computing features...")
+
+            # Basic features
+            features = {}
+            features['duration'] = duration
+            features['sample_rate'] = sr
+            features['samples'] = len(y)
+
+            # Mel spectrogram
+            progress(0.5, desc="Computing mel spectrogram...")
+            hop_length = 512
+            S_mel = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=hop_length)
+            S_dB = librosa.power_to_db(S_mel, ref=np.max)
+
+            # Other features
+            features['tempo'], _ = librosa.beat.beat_track(y=y, sr=sr)
+            features['mfcc'] = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
+            features['spectral_centroid'] = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
+            features['spectral_rolloff'] = librosa.feature.spectral_rolloff(y=y, sr=sr)[0]
+            features['zero_crossing_rate'] = librosa.feature.zero_crossing_rate(y)[0]
+
+            progress(0.8, desc="Creating visualizations...")
+
+            # Create visualizations
+            fig, axes = plt.subplots(2, 2, figsize=(15, 10))
+
+            # Waveform (time axis from sample indices, not frame indices)
+            time_axis = librosa.samples_to_time(np.arange(len(y)), sr=sr)
+            axes[0, 0].plot(time_axis, y)
+            axes[0, 0].set_title('Waveform')
+            axes[0, 0].set_xlabel('Time (s)')
+            axes[0, 0].set_ylabel('Amplitude')
+
+            # Mel spectrogram
+            librosa.display.specshow(S_dB, sr=sr, hop_length=hop_length,
+                                     x_axis='time', y_axis='mel', ax=axes[0, 1])
+            axes[0, 1].set_title('Mel Spectrogram')
+
+            # MFCC
+            librosa.display.specshow(features['mfcc'], sr=sr, x_axis='time', ax=axes[1, 0])
+            axes[1, 0].set_title('MFCC')

             # Spectral features
+            times = librosa.frames_to_time(range(len(features['spectral_centroid'])), sr=sr, hop_length=hop_length)
+            axes[1, 1].plot(times, features['spectral_centroid'], label='Spectral Centroid')
+            axes[1, 1].plot(times, features['spectral_rolloff'], label='Spectral Rolloff')
+            axes[1, 1].set_title('Spectral Features')
+            axes[1, 1].set_xlabel('Time (s)')
+            axes[1, 1].legend()
+
+            plt.tight_layout()
+
+            # Save plot
+            plot_path = os.path.join(self.temp_dir, f"basic_features_{np.random.randint(10000)}.png")
+            plt.savefig(plot_path, dpi=150, bbox_inches='tight')
+            plt.close()
+
+            # Create summary text
+            summary = f"""
+**Audio Summary:**
+- Duration: {duration:.2f} seconds
+- Sample Rate: {sr} Hz
+- Estimated Tempo: {features['tempo']:.1f} BPM
+- Number of Samples: {len(y):,}
+
+**Feature Shapes:**
+- MFCC: {features['mfcc'].shape}
+- Spectral Centroid: {features['spectral_centroid'].shape}
+- Spectral Rolloff: {features['spectral_rolloff'].shape}
+- Zero Crossing Rate: {features['zero_crossing_rate'].shape}
+"""
+
+            progress(1.0, desc="Analysis complete!")
+            return plot_path, summary, None

         except Exception as e:
+            return None, None, f"Error processing audio: {str(e)}"

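Stripped of the plotting and progress plumbing, the feature extraction above reduces to a handful of librosa calls; this standalone sketch reproduces the core numbers on librosa's bundled demo clip (any local file path works in place of the example):

```python
import librosa
import numpy as np

# librosa ships a short demo recording; substitute any local audio path
y, sr = librosa.load(librosa.example("trumpet"), sr=16000)

S_mel = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=512)
S_dB = librosa.power_to_db(S_mel, ref=np.max)       # log-scaled mel spectrogram
tempo, beats = librosa.beat.beat_track(y=y, sr=sr)  # global tempo + beat frames
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)  # compact timbre descriptor

print(f"tempo ~ {float(tempo):.1f} BPM, mel {S_dB.shape}, mfcc {mfcc.shape}")
```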
+    def extract_chroma_features(self, audio_path, sr=16000, progress=gr.Progress()):
+        """Extract and visualize enhanced chroma features."""
+        if not audio_path or not os.path.exists(audio_path):
+            return None, "Invalid audio file"

         try:
+            progress(0.1, desc="Loading audio...")
+            y, sr = librosa.load(audio_path, sr=sr)
+
+            # Limit to first 30 seconds for processing speed
+            max_duration = 30
+            if len(y) > sr * max_duration:
+                y = y[:sr * max_duration]
+
+            progress(0.3, desc="Computing chroma variants...")
+
+            # Original chroma
+            chroma_orig = librosa.feature.chroma_cqt(y=y, sr=sr)
+
+            # Harmonic-percussive separation
+            y_harm = librosa.effects.harmonic(y=y, margin=8)
+            chroma_harm = librosa.feature.chroma_cqt(y=y_harm, sr=sr)
+
+            progress(0.6, desc="Applying filters...")
+
+            # Non-local filtering
+            chroma_filter = np.minimum(chroma_harm,
+                                       librosa.decompose.nn_filter(chroma_harm,
+                                                                   aggregate=np.median,
+                                                                   metric='cosine'))
+
+            # Median filtering
+            chroma_smooth = scipy.ndimage.median_filter(chroma_filter, size=(1, 9))
+
+            # STFT-based chroma
+            chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
+
+            # CENS features
+            chroma_cens = librosa.feature.chroma_cens(y=y, sr=sr)
+
+            progress(0.8, desc="Creating visualizations...")
+
+            # Create comprehensive visualization
+            fig, axes = plt.subplots(3, 2, figsize=(15, 12))
+
+            # Original vs Harmonic
+            librosa.display.specshow(chroma_orig, y_axis='chroma', x_axis='time', ax=axes[0, 0])
+            axes[0, 0].set_title('Original Chroma (CQT)')
+
+            librosa.display.specshow(chroma_harm, y_axis='chroma', x_axis='time', ax=axes[0, 1])
+            axes[0, 1].set_title('Harmonic Chroma')
+
+            # Filtered vs Smooth
+            librosa.display.specshow(chroma_filter, y_axis='chroma', x_axis='time', ax=axes[1, 0])
+            axes[1, 0].set_title('Non-local Filtered')
+
+            librosa.display.specshow(chroma_smooth, y_axis='chroma', x_axis='time', ax=axes[1, 1])
+            axes[1, 1].set_title('Median Filtered')
+
+            # STFT vs CENS
+            librosa.display.specshow(chroma_stft, y_axis='chroma', x_axis='time', ax=axes[2, 0])
+            axes[2, 0].set_title('Chroma (STFT)')
+
+            librosa.display.specshow(chroma_cens, y_axis='chroma', x_axis='time', ax=axes[2, 1])
+            axes[2, 1].set_title('CENS Features')
+
+            plt.tight_layout()
+
+            # Save plot
+            plot_path = os.path.join(self.temp_dir, f"chroma_features_{np.random.randint(10000)}.png")
+            plt.savefig(plot_path, dpi=150, bbox_inches='tight')
+            plt.close()
+
+            progress(1.0, desc="Chroma analysis complete!")
+            return plot_path, None

         except Exception as e:
+            return None, f"Error processing chroma features: {str(e)}"

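The harmonic-separation → nn_filter → median-filter chain mirrors librosa's enhanced-chroma recipe. A quick sanity check on any of the variants is to collapse the time axis into a 12-bin pitch-class profile (a sketch; `chroma_smooth` is the array computed above):

```python
import numpy as np

PITCH_CLASSES = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]

# Rows of a librosa chroma matrix are pitch classes starting at C;
# averaging over time gives a rough tonal fingerprint of the excerpt.
profile = chroma_smooth.mean(axis=1)
top3 = sorted(zip(PITCH_CLASSES, profile), key=lambda kv: -kv[1])[:3]
for name, value in top3:
    print(f"{name}: {value:.3f}")  # the three most active pitch classes
```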
+    def generate_patches(self, audio_path, sr=16000, patch_duration=5.0, hop_duration=1.0, progress=gr.Progress()):
+        """Generate fixed-duration patches for transformer input."""
+        if not audio_path or not os.path.exists(audio_path):
+            return None, None, "Invalid audio file"

         try:
+            progress(0.1, desc="Loading audio...")
+            y, sr = librosa.load(audio_path, sr=sr)
+
+            progress(0.3, desc="Computing mel spectrogram...")
+            hop_length = 512
+            S_mel = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=hop_length, n_mels=80)
+            S_dB = librosa.power_to_db(S_mel, ref=np.max)
+
+            progress(0.5, desc="Generating patches...")
+
+            # Convert time to frames
+            patch_frames = librosa.time_to_frames(patch_duration, sr=sr, hop_length=hop_length)
+            hop_frames = librosa.time_to_frames(hop_duration, sr=sr, hop_length=hop_length)
+
+            # Generate patches using librosa.util.frame
+            patches = librosa.util.frame(S_dB, frame_length=patch_frames, hop_length=hop_frames)
+
+            progress(0.8, desc="Creating visualizations...")
+
+            # Visualize patches
+            num_patches_to_show = min(6, patches.shape[-1])
+            fig, axes = plt.subplots(2, 3, figsize=(18, 8))
+            axes = axes.flatten()
+
+            for i in range(num_patches_to_show):
+                librosa.display.specshow(patches[..., i], y_axis='mel', x_axis='time',
+                                         ax=axes[i], sr=sr, hop_length=hop_length)
+                axes[i].set_title(f'Patch {i+1}')
+
+            # Hide unused subplots
+            for i in range(num_patches_to_show, len(axes)):
+                axes[i].set_visible(False)
+
+            plt.tight_layout()
+
+            # Save plot
+            plot_path = os.path.join(self.temp_dir, f"patches_{np.random.randint(10000)}.png")
+            plt.savefig(plot_path, dpi=150, bbox_inches='tight')
+            plt.close()
+
+            # Summary
+            summary = f"""
+**Patch Generation Summary:**
+- Total patches generated: {patches.shape[-1]}
+- Patch duration: {patch_duration} seconds
+- Hop duration: {hop_duration} seconds
+- Patch shape (mels, time, patches): {patches.shape}
+- Each patch covers {patch_frames} time frames
+"""
+
+            progress(1.0, desc="Patch generation complete!")
+            return plot_path, summary, None

         except Exception as e:
+            return None, None, f"Error generating patches: {str(e)}"

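`librosa.util.frame` puts the patch index on the last axis, so the `(n_mels, patch_frames, n_patches)` array usually needs a transpose before it can be batched into a model. A minimal sketch (the downstream model is hypothetical; only the array handling is shown):

```python
import numpy as np

# patches: (n_mels, patch_frames, n_patches) as returned by librosa.util.frame
batch = np.transpose(patches, (2, 0, 1))   # -> (n_patches, n_mels, patch_frames)
tokens = batch.reshape(len(batch), -1)     # flatten each patch to one vector
print(batch.shape, tokens.shape)           # e.g. (N, 80, 156) and (N, 12480)
```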

+# Initialize analyzer
+analyzer = AudioAnalyzer()

+# Gradio interface functions
+def process_youtube_url(url):
+    """Process YouTube URL and return audio file."""
+    file_path, message = analyzer.download_youtube_audio(url)
+    if file_path:
+        return file_path, message, gr.update(visible=True)
+    else:
+        return None, message, gr.update(visible=False)

+def analyze_audio_basic(audio_file):
+    """Analyze audio file and return basic features."""
+    if audio_file is None:
+        return None, "Please upload an audio file or download from YouTube first."
+
+    plot_path, summary, error = analyzer.extract_basic_features(audio_file)
+    if error:
+        return None, error
+    return plot_path, summary

+def analyze_audio_chroma(audio_file):
+    """Analyze audio file for chroma features."""
+    if audio_file is None:
+        return None, "Please upload an audio file or download from YouTube first."
+
+    plot_path, error = analyzer.extract_chroma_features(audio_file)
+    if error:
+        return None, error
+    return plot_path, "Chroma feature analysis complete! This shows different chroma extraction methods for harmonic analysis."

+def analyze_audio_patches(audio_file, patch_duration, hop_duration):
+    """Generate transformer patches from audio."""
+    if audio_file is None:
+        return None, "Please upload an audio file or download from YouTube first."
+
+    plot_path, summary, error = analyzer.generate_patches(audio_file, patch_duration=patch_duration, hop_duration=hop_duration)
+    if error:
+        return None, error
+    return plot_path, summary

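The third return value of `process_youtube_url` drives the `visible` flag of the analyze button via `gr.update`, which patches output-component properties without recreating the component. A self-contained sketch of that pattern (the components here are illustrative, not this app's):

```python
import gradio as gr

def toggle(show):
    # Patch the target component's properties from a handler return value
    return gr.update(visible=show)

with gr.Blocks() as demo:
    chk = gr.Checkbox(label="Show the textbox", value=True)
    box = gr.Textbox(label="Target")
    chk.change(toggle, inputs=chk, outputs=box)

if __name__ == "__main__":
    demo.launch()
```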
+# Create Gradio interface
+with gr.Blocks(title="🎵 Audio Analysis Suite", theme=gr.themes.Soft()) as app:
+    gr.Markdown("""
+    # 🎵 Audio Analysis Suite
+
+    A comprehensive tool for audio feature extraction and analysis. Upload an audio file or download from YouTube to get started!
+
+    **Features:**
+    - 📊 **Basic Features**: Waveform, Mel Spectrogram, MFCC, Spectral Analysis, Tempo Detection
+    - 🎼 **Chroma Features**: Advanced harmonic content analysis with multiple extraction methods
+    - 🧩 **Transformer Patches**: Generate fixed-duration patches for deep learning applications
+    """)
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown("### 📁 Audio Input")
+
+            # YouTube downloader
+            with gr.Group():
+                gr.Markdown("**Download from YouTube:**")
+                youtube_url = gr.Textbox(
+                    label="YouTube URL",
+                    placeholder="https://www.youtube.com/watch?v=...",
+                    info="Paste a YouTube video URL to extract audio"
+                )
+                download_btn = gr.Button("📥 Download Audio", variant="primary")
+                download_status = gr.Textbox(label="Download Status", interactive=False)
+
+            # File upload
+            with gr.Group():
+                gr.Markdown("**Or upload an audio file** (MP3, WAV, FLAC, etc.):")
+                audio_file = gr.Audio(
+                    label="Upload Audio File",
+                    type="filepath"
+                )

+        with gr.Column(scale=2):
+            gr.Markdown("### 🔍 Analysis Results")
+
+            with gr.Tabs():
+                with gr.Tab("📊 Basic Features"):
+                    basic_plot = gr.Image(label="Feature Visualizations")
+                    basic_summary = gr.Markdown()
+                    basic_analyze_btn = gr.Button("🔍 Analyze Basic Features", variant="secondary")

+                with gr.Tab("🎼 Chroma Features"):
+                    chroma_plot = gr.Image(label="Chroma Visualizations")
+                    chroma_summary = gr.Markdown()
+                    chroma_analyze_btn = gr.Button("🎼 Analyze Chroma Features", variant="secondary")

+                with gr.Tab("🧩 Transformer Patches"):
+                    with gr.Row():
+                        patch_duration = gr.Slider(
+                            label="Patch Duration (seconds)",
+                            minimum=1.0, maximum=10.0, value=5.0, step=0.5,
+                            info="Duration of each patch"
                         )
+                        hop_duration = gr.Slider(
+                            label="Hop Duration (seconds)",
+                            minimum=0.1, maximum=5.0, value=1.0, step=0.1,
+                            info="Time between patch starts"
                         )

+                    patches_plot = gr.Image(label="Generated Patches")
+                    patches_summary = gr.Markdown()
+                    patches_analyze_btn = gr.Button("🧩 Generate Patches", variant="secondary")

+    gr.Markdown("""
+    ### ℹ️ Usage Tips
+    - **Processing is limited to 60 seconds** for basic features and 30 seconds for chroma analysis to ensure fast response times
+    - **YouTube downloads** must respect the platform's terms of service
+    - **Visualizations** are high-quality and suitable for research/educational use
+    - **All processing** runs server-side within your session; temporary files are not kept
+    """)
+
+    # Event handlers
+    download_btn.click(
+        process_youtube_url,
+        inputs=[youtube_url],
+        outputs=[audio_file, download_status, basic_analyze_btn]
+    )
+
+    basic_analyze_btn.click(
+        analyze_audio_basic,
+        inputs=[audio_file],
+        outputs=[basic_plot, basic_summary]
+    )
+
+    chroma_analyze_btn.click(
+        analyze_audio_chroma,
+        inputs=[audio_file],
+        outputs=[chroma_plot, chroma_summary]
+    )
+
+    patches_analyze_btn.click(
+        analyze_audio_patches,
+        inputs=[audio_file, patch_duration, hop_duration],
+        outputs=[patches_plot, patches_summary]
+    )
+
+    # Auto-analyze when file is uploaded
+    audio_file.change(
+        analyze_audio_basic,
+        inputs=[audio_file],
+        outputs=[basic_plot, basic_summary]
+    )

 if __name__ == "__main__":
+    app.launch()
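Inferred from the imports in this revision (the commit itself pins nothing), running the new app locally needs `gradio`, `librosa`, `matplotlib`, `numpy`, and `scipy`, plus the `yt-dlp` CLI and `ffmpeg` for the download path. A sketch of a local entry point under those assumptions:

```python
# Hypothetical local runner; `app` is the gr.Blocks instance defined in app.py
from app import app

app.launch()  # serves the UI on http://127.0.0.1:7860 by default
```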