# 🎵 Audio Analysis Suite — Gradio app (HuggingFace Spaces status banner removed; it was scraping residue, not source code)
import gradio as gr | |
import subprocess | |
import os | |
import tempfile | |
import librosa | |
import librosa.display | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import scipy.ndimage | |
from pathlib import Path | |
import warnings | |
warnings.filterwarnings('ignore') | |
# Set matplotlib backend for web display | |
plt.switch_backend('Agg') | |
class AudioAnalyzer:
    """Download audio (via yt-dlp) and extract librosa features/visualizations.

    Rendered figures are written as PNG files into a per-instance temporary
    directory.  All public methods return Gradio-friendly tuples whose last
    element is an error message (None on success).
    """

    def __init__(self):
        # Per-instance scratch dir for downloads and rendered plots.
        # NOTE(review): never cleaned up — consider tempfile.TemporaryDirectory.
        self.temp_dir = tempfile.mkdtemp()

    def download_youtube_audio(self, video_url, progress=gr.Progress()):
        """Download audio from a YouTube video as MP3 using yt-dlp.

        Args:
            video_url: URL of the video to download.
            progress: Gradio progress callback.

        Returns:
            (file_path, status_message); file_path is None on failure.
        """
        if not video_url:
            return None, "Please provide a YouTube URL"
        progress(0.1, desc="Initializing download...")
        output_dir = os.path.join(self.temp_dir, "downloaded_audio")
        os.makedirs(output_dir, exist_ok=True)
        # Argument-list form (shell=False) avoids shell injection via the URL.
        command = [
            "yt-dlp",
            "-x",                                # extract audio only
            "--audio-format", "mp3",
            "-o", os.path.join(output_dir, "%(title)s.%(ext)s"),
            "--no-playlist",
            "--restrict-filenames",              # ASCII-safe filenames
            video_url
        ]
        try:
            progress(0.3, desc="Downloading audio...")
            subprocess.run(command, check=True, capture_output=True, text=True)
            # Locate the extracted MP3 (exact name depends on the video title).
            for file in os.listdir(output_dir):
                if file.endswith('.mp3'):
                    file_path = os.path.join(output_dir, file)
                    progress(1.0, desc="Download complete!")
                    return file_path, f"Successfully downloaded: {file}"
            return None, "Download completed but no audio file found"
        except FileNotFoundError:
            return None, "yt-dlp not found. Please install it: pip install yt-dlp"
        except subprocess.CalledProcessError as e:
            return None, f"Download failed: {e.stderr}"
        except Exception as e:
            return None, f"Unexpected error: {str(e)}"

    def extract_basic_features(self, audio_path, sr=16000, progress=gr.Progress()):
        """Extract basic audio features and render a 2x2 summary figure.

        Args:
            audio_path: path to an audio file readable by librosa.
            sr: target sample rate; audio is resampled on load.
            progress: Gradio progress callback.

        Returns:
            (plot_path, summary_markdown, error) — on failure the first two
            are None and `error` carries a message; on success error is None.
        """
        if not audio_path or not os.path.exists(audio_path):
            return None, None, "Invalid audio file"
        try:
            progress(0.1, desc="Loading audio...")
            y, sr = librosa.load(audio_path, sr=sr)
            duration = librosa.get_duration(y=y, sr=sr)
            # Limit to the first 60 seconds for processing speed.
            max_duration = 60
            if duration > max_duration:
                y = y[:sr * max_duration]
                duration = max_duration
            progress(0.3, desc="Computing features...")
            features = {
                'duration': duration,
                'sample_rate': sr,
                'samples': len(y),
            }
            progress(0.5, desc="Computing mel spectrogram...")
            hop_length = 512
            S_mel = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=hop_length)
            S_dB = librosa.power_to_db(S_mel, ref=np.max)
            # BUG FIX: librosa >= 0.10 returns tempo as a 1-element ndarray,
            # which breaks the "{:.1f}" format in the summary — coerce to float.
            tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
            features['tempo'] = float(np.atleast_1d(tempo)[0])
            features['mfcc'] = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
            features['spectral_centroid'] = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
            features['spectral_rolloff'] = librosa.feature.spectral_rolloff(y=y, sr=sr)[0]
            features['zero_crossing_rate'] = librosa.feature.zero_crossing_rate(y)[0]
            progress(0.8, desc="Creating visualizations...")
            fig, axes = plt.subplots(2, 2, figsize=(15, 10))
            # Waveform.  BUG FIX: the old code passed raw *sample* indices to
            # librosa.frames_to_time, which treats them as frame indices and
            # stretches the x-axis by hop_length (512x).  sample/sr is correct.
            time_axis = np.arange(len(y)) / sr
            axes[0, 0].plot(time_axis, y)
            axes[0, 0].set_title('Waveform')
            axes[0, 0].set_xlabel('Time (s)')
            axes[0, 0].set_ylabel('Amplitude')
            # Mel spectrogram
            librosa.display.specshow(S_dB, sr=sr, hop_length=hop_length,
                                     x_axis='time', y_axis='mel', ax=axes[0, 1])
            axes[0, 1].set_title('Mel Spectrogram')
            # MFCC
            librosa.display.specshow(features['mfcc'], sr=sr, x_axis='time', ax=axes[1, 0])
            axes[1, 0].set_title('MFCC')
            # Spectral features: these ARE frame-indexed, so frames_to_time is correct here.
            times = librosa.frames_to_time(range(len(features['spectral_centroid'])), sr=sr, hop_length=hop_length)
            axes[1, 1].plot(times, features['spectral_centroid'], label='Spectral Centroid')
            axes[1, 1].plot(times, features['spectral_rolloff'], label='Spectral Rolloff')
            axes[1, 1].set_title('Spectral Features')
            axes[1, 1].set_xlabel('Time (s)')
            axes[1, 1].legend()
            plt.tight_layout()
            # Save plot.  NOTE(review): random suffix can collide; a uuid or
            # mkstemp name would be safer.
            plot_path = os.path.join(self.temp_dir, f"basic_features_{np.random.randint(10000)}.png")
            plt.savefig(plot_path, dpi=150, bbox_inches='tight')
            plt.close()
            # Markdown summary (content kept flush-left so it renders cleanly).
            summary = f"""
**Audio Summary:**
- Duration: {duration:.2f} seconds
- Sample Rate: {sr} Hz
- Estimated Tempo: {features['tempo']:.1f} BPM
- Number of Samples: {len(y):,}
**Feature Shapes:**
- MFCC: {features['mfcc'].shape}
- Spectral Centroid: {features['spectral_centroid'].shape}
- Spectral Rolloff: {features['spectral_rolloff'].shape}
- Zero Crossing Rate: {features['zero_crossing_rate'].shape}
"""
            progress(1.0, desc="Analysis complete!")
            return plot_path, summary, None
        except Exception as e:
            return None, None, f"Error processing audio: {str(e)}"

    def extract_chroma_features(self, audio_path, sr=16000, progress=gr.Progress()):
        """Extract and visualize several chroma variants (CQT, harmonic,
        non-locally filtered, median-smoothed, STFT, CENS).

        Returns:
            (plot_path, error) — plot_path is None on failure.
        """
        if not audio_path or not os.path.exists(audio_path):
            return None, "Invalid audio file"
        try:
            progress(0.1, desc="Loading audio...")
            y, sr = librosa.load(audio_path, sr=sr)
            # Limit to the first 30 seconds for processing speed.
            max_duration = 30
            if len(y) > sr * max_duration:
                y = y[:sr * max_duration]
            progress(0.3, desc="Computing chroma variants...")
            # Original chroma
            chroma_orig = librosa.feature.chroma_cqt(y=y, sr=sr)
            # Harmonic component only (large margin = aggressive separation).
            y_harm = librosa.effects.harmonic(y=y, margin=8)
            chroma_harm = librosa.feature.chroma_cqt(y=y_harm, sr=sr)
            progress(0.6, desc="Applying filters...")
            # Non-local filtering: suppress transients by taking the pointwise
            # minimum with a cosine-similarity median filter.
            chroma_filter = np.minimum(chroma_harm,
                                       librosa.decompose.nn_filter(chroma_harm,
                                                                   aggregate=np.median,
                                                                   metric='cosine'))
            # Horizontal (time-axis) median smoothing.
            chroma_smooth = scipy.ndimage.median_filter(chroma_filter, size=(1, 9))
            # STFT-based chroma
            chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
            # CENS features
            chroma_cens = librosa.feature.chroma_cens(y=y, sr=sr)
            progress(0.8, desc="Creating visualizations...")
            fig, axes = plt.subplots(3, 2, figsize=(15, 12))
            # Original vs Harmonic
            librosa.display.specshow(chroma_orig, y_axis='chroma', x_axis='time', ax=axes[0, 0])
            axes[0, 0].set_title('Original Chroma (CQT)')
            librosa.display.specshow(chroma_harm, y_axis='chroma', x_axis='time', ax=axes[0, 1])
            axes[0, 1].set_title('Harmonic Chroma')
            # Filtered vs Smooth
            librosa.display.specshow(chroma_filter, y_axis='chroma', x_axis='time', ax=axes[1, 0])
            axes[1, 0].set_title('Non-local Filtered')
            librosa.display.specshow(chroma_smooth, y_axis='chroma', x_axis='time', ax=axes[1, 1])
            axes[1, 1].set_title('Median Filtered')
            # STFT vs CENS
            librosa.display.specshow(chroma_stft, y_axis='chroma', x_axis='time', ax=axes[2, 0])
            axes[2, 0].set_title('Chroma (STFT)')
            librosa.display.specshow(chroma_cens, y_axis='chroma', x_axis='time', ax=axes[2, 1])
            axes[2, 1].set_title('CENS Features')
            plt.tight_layout()
            # Save plot
            plot_path = os.path.join(self.temp_dir, f"chroma_features_{np.random.randint(10000)}.png")
            plt.savefig(plot_path, dpi=150, bbox_inches='tight')
            plt.close()
            progress(1.0, desc="Chroma analysis complete!")
            return plot_path, None
        except Exception as e:
            return None, f"Error processing chroma features: {str(e)}"

    def generate_patches(self, audio_path, sr=16000, patch_duration=5.0, hop_duration=1.0, progress=gr.Progress()):
        """Slice the mel spectrogram into fixed-duration overlapping patches.

        Args:
            audio_path: path to the audio file.
            sr: target sample rate.
            patch_duration: length of each patch in seconds.
            hop_duration: stride between patch starts in seconds.
            progress: Gradio progress callback.

        Returns:
            (plot_path, summary_markdown, error) — same convention as
            extract_basic_features.
        """
        if not audio_path or not os.path.exists(audio_path):
            return None, None, "Invalid audio file"
        try:
            progress(0.1, desc="Loading audio...")
            y, sr = librosa.load(audio_path, sr=sr)
            progress(0.3, desc="Computing mel spectrogram...")
            hop_length = 512
            S_mel = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=hop_length, n_mels=80)
            S_dB = librosa.power_to_db(S_mel, ref=np.max)
            progress(0.5, desc="Generating patches...")
            # Convert seconds to spectrogram frames.
            patch_frames = int(librosa.time_to_frames(patch_duration, sr=sr, hop_length=hop_length))
            # BUG FIX: a hop_duration below one frame rounds to 0, which
            # librosa.util.frame rejects — clamp to at least one frame.
            hop_frames = max(int(librosa.time_to_frames(hop_duration, sr=sr, hop_length=hop_length)), 1)
            # BUG FIX: librosa.util.frame raises an opaque error when the
            # audio is shorter than one patch; fail with a clear message.
            if S_dB.shape[1] < patch_frames:
                return None, None, (
                    f"Audio is shorter than one patch ({patch_duration} s); "
                    "upload longer audio or reduce the patch duration."
                )
            # Sliding-window view: last axis indexes patches.
            patches = librosa.util.frame(S_dB, frame_length=patch_frames, hop_length=hop_frames)
            progress(0.8, desc="Creating visualizations...")
            # Visualize up to the first six patches.
            num_patches_to_show = min(6, patches.shape[-1])
            fig, axes = plt.subplots(2, 3, figsize=(18, 8))
            axes = axes.flatten()
            for i in range(num_patches_to_show):
                librosa.display.specshow(patches[..., i], y_axis='mel', x_axis='time',
                                         ax=axes[i], sr=sr, hop_length=hop_length)
                axes[i].set_title(f'Patch {i+1}')
            # Hide unused subplots
            for i in range(num_patches_to_show, len(axes)):
                axes[i].set_visible(False)
            plt.tight_layout()
            # Save plot
            plot_path = os.path.join(self.temp_dir, f"patches_{np.random.randint(10000)}.png")
            plt.savefig(plot_path, dpi=150, bbox_inches='tight')
            plt.close()
            # Markdown summary
            summary = f"""
**Patch Generation Summary:**
- Total patches generated: {patches.shape[-1]}
- Patch duration: {patch_duration} seconds
- Hop duration: {hop_duration} seconds
- Patch shape (mels, time, patches): {patches.shape}
- Each patch covers {patch_frames} time frames
"""
            progress(1.0, desc="Patch generation complete!")
            return plot_path, summary, None
        except Exception as e:
            return None, None, f"Error generating patches: {str(e)}"
# Initialize analyzer
# Single module-level instance shared by every Gradio callback below; its
# temp_dir holds all downloads and rendered plots for the process lifetime.
analyzer = AudioAnalyzer()
# Gradio interface functions | |
def process_youtube_url(url):
    """Download audio for *url*; feed the result into the UI.

    Returns (audio_filepath_or_None, status_message, visibility_update) —
    the third value toggles the basic-analysis button on success.
    """
    file_path, message = analyzer.download_youtube_audio(url)
    succeeded = bool(file_path)
    return (file_path if succeeded else None,
            message,
            gr.update(visible=succeeded))
def analyze_audio_basic(audio_file):
    """Run basic-feature analysis; return (plot_path, summary_markdown)."""
    if audio_file is None:
        return None, "Please upload an audio file or download from YouTube first."
    plot_path, summary, error = analyzer.extract_basic_features(audio_file)
    # On failure, surface the error message in the summary slot.
    return (None, error) if error else (plot_path, summary)
def analyze_audio_chroma(audio_file):
    """Run chroma-feature analysis; return (plot_path, message)."""
    if audio_file is None:
        return None, "Please upload an audio file or download from YouTube first."
    plot_path, error = analyzer.extract_chroma_features(audio_file)
    if error is not None:
        return None, error
    done_message = ("Chroma feature analysis complete! This shows different chroma "
                    "extraction methods for harmonic analysis.")
    return plot_path, done_message
def analyze_audio_patches(audio_file, patch_duration, hop_duration):
    """Generate transformer patches; return (plot_path, summary_markdown).

    BUG FIX: the error paths used to return THREE values while the click
    handler wires only two outputs (patches_plot, patches_summary), so any
    failure crashed the Gradio callback.  All paths now return two values;
    errors are surfaced in the summary Markdown component.
    """
    if audio_file is None:
        return None, "Please upload an audio file or download from YouTube first."
    plot_path, summary, error = analyzer.generate_patches(
        audio_file, patch_duration=patch_duration, hop_duration=hop_duration)
    if error:
        return None, error
    return plot_path, summary
# Create Gradio interface
# Layout: input column (YouTube download OR file upload) on the left, tabbed
# analysis results on the right.  Markdown string bodies are kept flush-left
# so their content is unchanged.
with gr.Blocks(title="🎵 Audio Analysis Suite", theme=gr.themes.Soft()) as app:
    gr.Markdown("""
# 🎵 Audio Analysis Suite
A comprehensive tool for audio feature extraction and analysis. Upload an audio file or download from YouTube to get started!
**Features:**
- 📊 **Basic Features**: Waveform, Mel Spectrogram, MFCC, Spectral Analysis, Tempo Detection
- 🎼 **Chroma Features**: Advanced harmonic content analysis with multiple extraction methods
- 🧩 **Transformer Patches**: Generate fixed-duration patches for deep learning applications
""")
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 📁 Audio Input")
            # YouTube downloader
            with gr.Group():
                gr.Markdown("**Download from YouTube:**")
                youtube_url = gr.Textbox(
                    label="YouTube URL",
                    placeholder="https://www.youtube.com/watch?v=...",
                    info="Paste a YouTube video URL to extract audio"
                )
                download_btn = gr.Button("📥 Download Audio", variant="primary")
                download_status = gr.Textbox(label="Download Status", interactive=False)
            # File upload
            with gr.Group():
                gr.Markdown("**Or upload audio file:**")
                # NOTE(review): gr.Audio may not accept an `info` kwarg in all
                # Gradio versions — verify against the installed version.
                audio_file = gr.Audio(
                    label="Upload Audio File",
                    type="filepath",
                    info="Supported formats: MP3, WAV, FLAC, etc."
                )
        with gr.Column(scale=2):
            gr.Markdown("### 📊 Analysis Results")
            with gr.Tabs():
                with gr.Tab("📊 Basic Features"):
                    basic_plot = gr.Image(label="Feature Visualizations")
                    basic_summary = gr.Markdown()
                    basic_analyze_btn = gr.Button("🔍 Analyze Basic Features", variant="secondary")
                with gr.Tab("🎼 Chroma Features"):
                    chroma_plot = gr.Image(label="Chroma Visualizations")
                    chroma_summary = gr.Markdown()
                    chroma_analyze_btn = gr.Button("🎼 Analyze Chroma Features", variant="secondary")
                with gr.Tab("🧩 Transformer Patches"):
                    with gr.Row():
                        patch_duration = gr.Slider(
                            label="Patch Duration (seconds)",
                            minimum=1.0, maximum=10.0, value=5.0, step=0.5,
                            info="Duration of each patch"
                        )
                        hop_duration = gr.Slider(
                            label="Hop Duration (seconds)",
                            minimum=0.1, maximum=5.0, value=1.0, step=0.1,
                            info="Time between patch starts"
                        )
                    patches_plot = gr.Image(label="Generated Patches")
                    patches_summary = gr.Markdown()
                    patches_analyze_btn = gr.Button("🧩 Generate Patches", variant="secondary")
    gr.Markdown("""
### ℹ️ Usage Tips
- **Processing is limited to 60 seconds** for basic features and 30 seconds for chroma analysis to ensure fast response times
- **YouTube downloads** respect platform terms of service
- **Visualizations** are high-quality and suitable for research/educational use
- **All processing** is done locally in your browser session
""")
    # Event handlers
    # The download callback also toggles the basic-analysis button's
    # visibility (third output) based on download success.
    download_btn.click(
        process_youtube_url,
        inputs=[youtube_url],
        outputs=[audio_file, download_status, basic_analyze_btn]
    )
    basic_analyze_btn.click(
        analyze_audio_basic,
        inputs=[audio_file],
        outputs=[basic_plot, basic_summary]
    )
    chroma_analyze_btn.click(
        analyze_audio_chroma,
        inputs=[audio_file],
        outputs=[chroma_plot, chroma_summary]
    )
    # NOTE(review): analyze_audio_patches returns three values on error paths
    # but only two outputs are wired here — confirm the return arity matches.
    patches_analyze_btn.click(
        analyze_audio_patches,
        inputs=[audio_file, patch_duration, hop_duration],
        outputs=[patches_plot, patches_summary]
    )
    # Auto-analyze when file is uploaded (also fires after a YouTube download
    # populates the component).
    audio_file.change(
        analyze_audio_basic,
        inputs=[audio_file],
        outputs=[basic_plot, basic_summary]
    )

if __name__ == "__main__":
    app.launch()