import gradio as gr
import subprocess
import tempfile
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import logging
import warnings
import shutil
from typing import Tuple, Optional

# Configure matplotlib and logging
plt.switch_backend('Agg')
warnings.filterwarnings('ignore')
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)


class AudioAnalyzer:
    def __init__(self):
        self.temp_dir = Path(tempfile.mkdtemp())
        self.plot_files = []

    def cleanup(self):
        for plot_file in self.plot_files:
            Path(plot_file).unlink(missing_ok=True)
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def download_youtube_audio(self, video_url: str, progress=gr.Progress()) -> Tuple[Optional[str], str]:
        if not video_url:
            return None, "Please provide a valid YouTube URL"
        progress(0.1, desc="Downloading...")

        # Use an %(ext)s output template so yt-dlp does not append a second
        # extension; the mp3 post-processor writes the final file as audio.mp3
        output_template = self.temp_dir / "audio.%(ext)s"
        output_file = self.temp_dir / "audio.mp3"
        try:
            subprocess.run([
                "yt-dlp", "-x", "--audio-format", "mp3",
                "-o", str(output_template), video_url
            ], check=True, capture_output=True, text=True)
            progress(1.0, desc="Complete!")
            return str(output_file), "Download successful"
        except FileNotFoundError:
            return None, "yt-dlp not found. Install with: pip install yt-dlp"
        except subprocess.CalledProcessError as e:
            # text=True above ensures e.stderr is a str, not bytes
            return None, f"Download failed: {e.stderr}"

    def save_plot(self, fig) -> str:
        plot_path = self.temp_dir / f"plot_{len(self.plot_files)}.png"
        fig.savefig(plot_path, dpi=150, bbox_inches='tight')
        plt.close(fig)
        self.plot_files.append(str(plot_path))
        return str(plot_path)

    def analyze_audio(self, audio_path: str, analysis_type: str = "basic",
                      patch_duration: float = 5.0, progress=gr.Progress()) -> Tuple[Optional[str], str]:
        if not audio_path or not Path(audio_path).exists():
            return None, "No audio file provided"
        try:
            progress(0.1, desc="Loading audio...")
            y, sr = librosa.load(audio_path, sr=22050)
            duration = len(y) / sr

            # Limit duration for processing
            max_duration = 60 if analysis_type == "basic" else 30
            if duration > max_duration:
                y = y[:int(sr * max_duration)]
                duration = max_duration

            if analysis_type == "basic":
                return self._basic_analysis(y, sr, duration, progress)
            elif analysis_type == "chroma":
                return self._chroma_analysis(y, sr, progress)
            elif analysis_type == "patches":
                return self._patch_analysis(y, sr, patch_duration, progress)
            else:
                return None, f"Unknown analysis type: {analysis_type}"
        except Exception as e:
            logger.error(f"Analysis error: {e}")
            return None, f"Analysis failed: {str(e)}"

    def _basic_analysis(self, y, sr, duration, progress):
        progress(0.3, desc="Computing features...")

        # Extract features; beat_track may return tempo as a 1-element array
        # in newer librosa versions, so normalize it to a plain float
        tempo_raw, _ = librosa.beat.beat_track(y=y, sr=sr)
        tempo = float(np.atleast_1d(tempo_raw)[0])
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
        spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)[0]

        progress(0.6, desc="Creating visualizations...")

        # Create mel spectrogram
        S_mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=80)
        S_dB = librosa.power_to_db(S_mel, ref=np.max)

        # Plot
        fig, axes = plt.subplots(2, 2, figsize=(12, 8))

        # Waveform
        time = np.linspace(0, duration, len(y))
        axes[0, 0].plot(time, y, alpha=0.8)
        axes[0, 0].set_title('Waveform', fontweight='bold')
        axes[0, 0].set_xlabel('Time (s)')

        # Mel Spectrogram
        librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel', ax=axes[0, 1])
        axes[0, 1].set_title('Mel Spectrogram', fontweight='bold')

        # MFCC
        librosa.display.specshow(mfcc, sr=sr, x_axis='time', ax=axes[1, 0])
        axes[1, 0].set_title('MFCC Features', fontweight='bold')

        # Spectral features
        times = librosa.frames_to_time(range(len(spectral_centroid)), sr=sr)
        axes[1, 1].plot(times, spectral_centroid, label='Centroid', linewidth=2)
        axes[1, 1].plot(times, spectral_rolloff, label='Rolloff', linewidth=2)
        axes[1, 1].set_title('Spectral Features', fontweight='bold')
        axes[1, 1].legend()
        axes[1, 1].set_xlabel('Time (s)')

        plt.tight_layout()
        plot_path = self.save_plot(fig)

        summary = f"""**Audio Analysis Results**
- Duration: {duration:.1f}s | Sample Rate: {sr:,} Hz
- Tempo: {tempo:.1f} BPM | Samples: {len(y):,}
- MFCC shape: {mfcc.shape} | Features extracted successfully"""

        progress(1.0, desc="Complete!")
        return plot_path, summary

    def _chroma_analysis(self, y, sr, progress):
        progress(0.3, desc="Computing chroma features...")

        # Different chroma extraction methods
        chroma_cqt = librosa.feature.chroma_cqt(y=y, sr=sr)
        chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)

        # Harmonic separation
        y_harm = librosa.effects.harmonic(y=y)
        chroma_harm = librosa.feature.chroma_cqt(y=y_harm, sr=sr)

        progress(0.7, desc="Creating visualizations...")
        fig, axes = plt.subplots(2, 2, figsize=(12, 8))

        # Plot different chroma features
        chromas = [
            (chroma_cqt, 'Chroma (CQT)'),
            (chroma_stft, 'Chroma (STFT)'),
            (chroma_harm, 'Harmonic Chroma'),
            (chroma_cqt - chroma_harm, 'Chroma Difference')
        ]
        for i, (chroma, title) in enumerate(chromas):
            ax = axes[i // 2, i % 2]
            librosa.display.specshow(chroma, y_axis='chroma', x_axis='time', ax=ax)
            ax.set_title(title, fontweight='bold')

        plt.tight_layout()
        plot_path = self.save_plot(fig)

        summary = f"""**Chroma Analysis Results**
- Multiple chroma extraction methods compared
- CQT vs STFT analysis | Harmonic separation applied
- Chroma shape: {chroma_cqt.shape}"""

        progress(1.0, desc="Complete!")
        return plot_path, summary

    def _patch_analysis(self, y, sr, patch_duration, progress):
        progress(0.3, desc="Generating patches...")

        # Create mel spectrogram
        hop_length = 512
        S_mel = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=hop_length, n_mels=80)
        S_dB = librosa.power_to_db(S_mel, ref=np.max)

        # Generate patches
        patch_frames = librosa.time_to_frames(patch_duration, sr=sr, hop_length=hop_length)
        hop_frames = patch_frames // 2  # 50% overlap
        if S_dB.shape[-1] < patch_frames:
            # librosa.util.frame raises if the signal is shorter than one frame
            return None, "Audio is shorter than the requested patch duration"
        # Framing adds a trailing axis: (n_mels, patch_frames, num_patches)
        patches = librosa.util.frame(S_dB, frame_length=patch_frames, hop_length=hop_frames)

        progress(0.7, desc="Creating visualizations...")

        # Show first 6 patches
        num_show = min(6, patches.shape[-1])
        fig, axes = plt.subplots(2, 3, figsize=(15, 8))
        axes = axes.flatten()
        for i in range(num_show):
            librosa.display.specshow(patches[..., i], y_axis='mel', x_axis='time',
                                     ax=axes[i], sr=sr, hop_length=hop_length)
            axes[i].set_title(f'Patch {i+1}', fontweight='bold')

        # Hide unused subplots
        for i in range(num_show, 6):
            axes[i].set_visible(False)

        plt.tight_layout()
        plot_path = self.save_plot(fig)

        summary = f"""**Patch Generation Results**
- Total patches: {patches.shape[-1]} | Duration: {patch_duration}s each
- Patch shape: {patches.shape} | 50% overlap between patches
- Ready for transformer input"""

        progress(1.0, desc="Complete!")
        return plot_path, summary


def create_interface():
    analyzer = AudioAnalyzer()

    with gr.Blocks(title="Audio Analysis Suite") as demo:
        gr.Markdown("# 🎵 Audio Analysis Suite")

        with gr.Row():
            with gr.Column():
                # Input section
                gr.Markdown("### Input")
                youtube_url = gr.Textbox(label="YouTube URL", placeholder="https://youtube.com/watch?v=...")
                download_btn = gr.Button("Download Audio")
                audio_file = gr.Audio(label="Or upload audio file", type="filepath")

                # Analysis options
                gr.Markdown("### Analysis Options")
                analysis_type = gr.Radio(
choices=["basic", "chroma", "patches"], value="basic", label="Analysis Type" ) patch_duration = gr.Slider(1, 10, 5, step=0.5, label="Patch Duration (s)", visible=False) analyze_btn = gr.Button("Analyze Audio", variant="primary") with gr.Column(): # Results gr.Markdown("### Results") plot_output = gr.Image(label="Visualizations") summary_output = gr.Markdown() status_output = gr.Textbox(label="Status", interactive=False) # Event handlers download_btn.click( analyzer.download_youtube_audio, inputs=[youtube_url], outputs=[audio_file, status_output] ) analyze_btn.click( analyzer.analyze_audio, inputs=[audio_file, analysis_type, patch_duration], outputs=[plot_output, summary_output] ) # Show patch duration slider only for patches analysis analysis_type.change( lambda x: gr.update(visible=(x == "patches")), inputs=[analysis_type], outputs=[patch_duration] ) demo.unload(analyzer.cleanup) return demo if __name__ == "__main__": demo = create_interface() demo.launch()