|
import gradio as gr |
|
import subprocess |
|
import os |
|
import tempfile |
|
import librosa |
|
import librosa.display |
|
import matplotlib.pyplot as plt |
|
import numpy as np |
|
import scipy.ndimage |
|
from pathlib import Path |
|
import warnings |
|
# Silence librosa/matplotlib deprecation and audioread warnings so they do
# not clutter the server log while the app runs.
warnings.filterwarnings('ignore')


# Headless matplotlib backend: figures are rendered straight to PNG files
# for display in the Gradio UI; nothing is ever shown on screen.
plt.switch_backend('Agg')
|
|
|
class AudioAnalyzer:
    """Download audio and compute/visualize audio features.

    Provides the three analyses used by the Gradio UI:
      * basic features (waveform, mel spectrogram, MFCC, spectral stats, tempo)
      * enhanced chroma variants (CQT, harmonic, filtered, STFT, CENS)
      * fixed-duration mel-spectrogram patches for transformer-style models

    Downloaded audio and rendered plot PNGs are written to a per-instance
    temporary directory.
    """

    def __init__(self):
        # Scratch directory for downloads and plot images.
        # NOTE(review): never removed; consider shutil.rmtree(self.temp_dir)
        # on shutdown if disk usage matters.
        self.temp_dir = tempfile.mkdtemp()

    def download_youtube_audio(self, video_url, progress=gr.Progress()):
        """Download the audio track of a YouTube video as MP3 using yt-dlp.

        Args:
            video_url: Full YouTube video URL.
            progress: Gradio progress tracker (injected by the UI).

        Returns:
            Tuple ``(file_path, message)``: path to the downloaded MP3 (or
            None on failure) and a human-readable status string.
        """
        if not video_url:
            return None, "Please provide a YouTube URL"

        progress(0.1, desc="Initializing download...")

        # BUG FIX: use a fresh directory per call. The original reused one
        # "downloaded_audio" dir, so the listdir scan below could return a
        # stale MP3 from an earlier download.
        output_dir = tempfile.mkdtemp(dir=self.temp_dir)

        command = [
            "yt-dlp",
            "-x",                    # extract audio only
            "--audio-format", "mp3",
            "-o", os.path.join(output_dir, "%(title)s.%(ext)s"),
            "--no-playlist",         # a playlist URL downloads one video only
            "--restrict-filenames",  # ASCII-safe output filenames
            video_url,
        ]

        try:
            progress(0.3, desc="Downloading audio...")
            subprocess.run(command, check=True, capture_output=True, text=True)

            # yt-dlp names the file after the video title, so scan for it.
            for name in os.listdir(output_dir):
                if name.endswith('.mp3'):
                    progress(1.0, desc="Download complete!")
                    return os.path.join(output_dir, name), f"Successfully downloaded: {name}"

            return None, "Download completed but no audio file found"

        except FileNotFoundError:
            # The yt-dlp binary is missing from PATH.
            return None, "yt-dlp not found. Please install it: pip install yt-dlp"
        except subprocess.CalledProcessError as e:
            return None, f"Download failed: {e.stderr}"
        except Exception as e:
            return None, f"Unexpected error: {str(e)}"

    def extract_basic_features(self, audio_path, sr=16000, progress=gr.Progress()):
        """Extract basic audio features and render a 2x2 visualization grid.

        Args:
            audio_path: Path to an audio file readable by librosa.
            sr: Target sample rate; audio is resampled on load.
            progress: Gradio progress tracker.

        Returns:
            Tuple ``(plot_path, summary, error)``. On success ``error`` is
            None; on failure the first two elements are None.
        """
        if not audio_path or not os.path.exists(audio_path):
            return None, None, "Invalid audio file"

        try:
            progress(0.1, desc="Loading audio...")
            y, sr = librosa.load(audio_path, sr=sr)
            duration = librosa.get_duration(y=y, sr=sr)

            # Cap the analysis window to keep response times reasonable.
            max_duration = 60
            if duration > max_duration:
                y = y[:sr * max_duration]
                duration = max_duration

            progress(0.3, desc="Computing features...")
            features = {
                'duration': duration,
                'sample_rate': sr,
                'samples': len(y),
            }

            progress(0.5, desc="Computing mel spectrogram...")
            hop_length = 512
            S_mel = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=hop_length)
            S_dB = librosa.power_to_db(S_mel, ref=np.max)

            # BUG FIX: librosa >= 0.10 returns tempo as a 1-element ndarray;
            # coerce to a plain float so the ":.1f" formatting below works.
            tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
            features['tempo'] = float(np.atleast_1d(tempo)[0])
            features['mfcc'] = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
            features['spectral_centroid'] = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
            features['spectral_rolloff'] = librosa.feature.spectral_rolloff(y=y, sr=sr)[0]
            features['zero_crossing_rate'] = librosa.feature.zero_crossing_rate(y)[0]

            progress(0.8, desc="Creating visualizations...")
            fig, axes = plt.subplots(2, 2, figsize=(15, 10))
            try:
                # Waveform. BUG FIX: the original passed sample indices to
                # frames_to_time(), stretching the x-axis by hop_length (512x).
                # A sample index maps to time as index / sr.
                time_axis = np.arange(len(y)) / sr
                axes[0, 0].plot(time_axis, y)
                axes[0, 0].set_title('Waveform')
                axes[0, 0].set_xlabel('Time (s)')
                axes[0, 0].set_ylabel('Amplitude')

                librosa.display.specshow(S_dB, sr=sr, hop_length=hop_length,
                                         x_axis='time', y_axis='mel', ax=axes[0, 1])
                axes[0, 1].set_title('Mel Spectrogram')

                librosa.display.specshow(features['mfcc'], sr=sr, x_axis='time', ax=axes[1, 0])
                axes[1, 0].set_title('MFCC')

                # Spectral features are per-frame, so frames_to_time IS correct
                # here (default feature hop of 512 matches hop_length).
                times = librosa.frames_to_time(range(len(features['spectral_centroid'])), sr=sr, hop_length=hop_length)
                axes[1, 1].plot(times, features['spectral_centroid'], label='Spectral Centroid')
                axes[1, 1].plot(times, features['spectral_rolloff'], label='Spectral Rolloff')
                axes[1, 1].set_title('Spectral Features')
                axes[1, 1].set_xlabel('Time (s)')
                axes[1, 1].legend()

                plt.tight_layout()
                plot_path = os.path.join(self.temp_dir, f"basic_features_{np.random.randint(10000)}.png")
                fig.savefig(plot_path, dpi=150, bbox_inches='tight')
            finally:
                # Always release the figure, even if rendering/saving fails.
                plt.close(fig)

            summary = f"""
**Audio Summary:**
- Duration: {duration:.2f} seconds
- Sample Rate: {sr} Hz
- Estimated Tempo: {features['tempo']:.1f} BPM
- Number of Samples: {len(y):,}

**Feature Shapes:**
- MFCC: {features['mfcc'].shape}
- Spectral Centroid: {features['spectral_centroid'].shape}
- Spectral Rolloff: {features['spectral_rolloff'].shape}
- Zero Crossing Rate: {features['zero_crossing_rate'].shape}
"""

            progress(1.0, desc="Analysis complete!")
            return plot_path, summary, None

        except Exception as e:
            return None, None, f"Error processing audio: {str(e)}"

    def extract_chroma_features(self, audio_path, sr=16000, progress=gr.Progress()):
        """Extract and visualize several chroma feature variants.

        Compares raw CQT chroma against harmonic-only, non-locally filtered,
        median-smoothed, STFT, and CENS chroma in a 3x2 grid.

        Args:
            audio_path: Path to an audio file readable by librosa.
            sr: Target sample rate; audio is resampled on load.
            progress: Gradio progress tracker.

        Returns:
            Tuple ``(plot_path, error)``; exactly one element is non-None.
        """
        if not audio_path or not os.path.exists(audio_path):
            return None, "Invalid audio file"

        try:
            progress(0.1, desc="Loading audio...")
            y, sr = librosa.load(audio_path, sr=sr)

            # Chroma variants are expensive; analyze at most 30 seconds.
            max_duration = 30
            if len(y) > sr * max_duration:
                y = y[:sr * max_duration]

            progress(0.3, desc="Computing chroma variants...")
            chroma_orig = librosa.feature.chroma_cqt(y=y, sr=sr)

            # Suppress percussive transients before chroma extraction.
            y_harm = librosa.effects.harmonic(y=y, margin=8)
            chroma_harm = librosa.feature.chroma_cqt(y=y_harm, sr=sr)

            progress(0.6, desc="Applying filters...")
            # Non-local filtering: replace each frame by the median of its
            # nearest neighbors (cosine distance), then take the pointwise
            # minimum so energy can only be removed, never added.
            chroma_filter = np.minimum(chroma_harm,
                                       librosa.decompose.nn_filter(chroma_harm,
                                                                   aggregate=np.median,
                                                                   metric='cosine'))

            # Horizontal median filter smooths short glitches over time.
            chroma_smooth = scipy.ndimage.median_filter(chroma_filter, size=(1, 9))

            chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
            chroma_cens = librosa.feature.chroma_cens(y=y, sr=sr)

            progress(0.8, desc="Creating visualizations...")
            fig, axes = plt.subplots(3, 2, figsize=(15, 12))
            try:
                # (data, title, axis) for each panel of the 3x2 grid.
                panels = [
                    (chroma_orig, 'Original Chroma (CQT)', axes[0, 0]),
                    (chroma_harm, 'Harmonic Chroma', axes[0, 1]),
                    (chroma_filter, 'Non-local Filtered', axes[1, 0]),
                    (chroma_smooth, 'Median Filtered', axes[1, 1]),
                    (chroma_stft, 'Chroma (STFT)', axes[2, 0]),
                    (chroma_cens, 'CENS Features', axes[2, 1]),
                ]
                for data, title, ax in panels:
                    librosa.display.specshow(data, y_axis='chroma', x_axis='time', ax=ax)
                    ax.set_title(title)

                plt.tight_layout()
                plot_path = os.path.join(self.temp_dir, f"chroma_features_{np.random.randint(10000)}.png")
                fig.savefig(plot_path, dpi=150, bbox_inches='tight')
            finally:
                # Always release the figure, even if rendering/saving fails.
                plt.close(fig)

            progress(1.0, desc="Chroma analysis complete!")
            return plot_path, None

        except Exception as e:
            return None, f"Error processing chroma features: {str(e)}"

    def generate_patches(self, audio_path, sr=16000, patch_duration=5.0, hop_duration=1.0, progress=gr.Progress()):
        """Slice a mel spectrogram into fixed-duration overlapping patches.

        Args:
            audio_path: Path to an audio file readable by librosa.
            sr: Target sample rate; audio is resampled on load.
            patch_duration: Length of each patch in seconds.
            hop_duration: Time between consecutive patch starts in seconds.
            progress: Gradio progress tracker.

        Returns:
            Tuple ``(plot_path, summary, error)``. On success ``error`` is
            None; on failure the first two elements are None.
        """
        if not audio_path or not os.path.exists(audio_path):
            return None, None, "Invalid audio file"

        try:
            progress(0.1, desc="Loading audio...")
            y, sr = librosa.load(audio_path, sr=sr)

            progress(0.3, desc="Computing mel spectrogram...")
            hop_length = 512
            S_mel = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=hop_length, n_mels=80)
            S_dB = librosa.power_to_db(S_mel, ref=np.max)

            progress(0.5, desc="Generating patches...")
            # Convert the requested durations from seconds to spectrogram
            # frames; clamp to >= 1 so librosa.util.frame never sees zero.
            patch_frames = max(int(librosa.time_to_frames(patch_duration, sr=sr, hop_length=hop_length)), 1)
            hop_frames = max(int(librosa.time_to_frames(hop_duration, sr=sr, hop_length=hop_length)), 1)

            # Friendly message instead of an opaque librosa ParameterError
            # when the clip is shorter than a single patch.
            if S_dB.shape[-1] < patch_frames:
                return None, None, f"Audio is too short for a {patch_duration}s patch; use a longer clip."

            # Resulting shape: (n_mels, patch_frames, n_patches).
            patches = librosa.util.frame(S_dB, frame_length=patch_frames, hop_length=hop_frames)

            progress(0.8, desc="Creating visualizations...")
            num_patches_to_show = min(6, patches.shape[-1])
            fig, axes = plt.subplots(2, 3, figsize=(18, 8))
            try:
                axes = axes.flatten()
                for i in range(num_patches_to_show):
                    librosa.display.specshow(patches[..., i], y_axis='mel', x_axis='time',
                                             ax=axes[i], sr=sr, hop_length=hop_length)
                    axes[i].set_title(f'Patch {i+1}')

                # Hide any unused cells of the 2x3 grid.
                for i in range(num_patches_to_show, len(axes)):
                    axes[i].set_visible(False)

                plt.tight_layout()
                plot_path = os.path.join(self.temp_dir, f"patches_{np.random.randint(10000)}.png")
                fig.savefig(plot_path, dpi=150, bbox_inches='tight')
            finally:
                # Always release the figure, even if rendering/saving fails.
                plt.close(fig)

            summary = f"""
**Patch Generation Summary:**
- Total patches generated: {patches.shape[-1]}
- Patch duration: {patch_duration} seconds
- Hop duration: {hop_duration} seconds
- Patch shape (mels, time, patches): {patches.shape}
- Each patch covers {patch_frames} time frames
"""

            progress(1.0, desc="Patch generation complete!")
            return plot_path, summary, None

        except Exception as e:
            return None, None, f"Error generating patches: {str(e)}"
|
|
|
|
|
# Single shared analyzer instance; its temp directory lives for the whole
# process lifetime and is shared by every UI session.
analyzer = AudioAnalyzer()
|
|
|
|
|
def process_youtube_url(url):
    """Download audio for *url* and reveal the analyze button on success."""
    file_path, message = analyzer.download_youtube_audio(url)
    succeeded = bool(file_path)
    audio_value = file_path if succeeded else None
    return audio_value, message, gr.update(visible=succeeded)
|
|
|
def analyze_audio_basic(audio_file):
    """Run basic feature extraction and adapt the result for the UI outputs."""
    if audio_file is None:
        return None, "Please upload an audio file or download from YouTube first."

    plot_path, summary, error = analyzer.extract_basic_features(audio_file)
    return (None, error) if error else (plot_path, summary)
|
|
|
def analyze_audio_chroma(audio_file):
    """Run chroma analysis and adapt the result for the UI outputs."""
    if audio_file is None:
        return None, "Please upload an audio file or download from YouTube first."

    plot_path, error = analyzer.extract_chroma_features(audio_file)
    if error:
        return None, error
    done_note = "Chroma feature analysis complete! This shows different chroma extraction methods for harmonic analysis."
    return plot_path, done_note
|
|
|
def analyze_audio_patches(audio_file, patch_duration, hop_duration):
    """Generate transformer patches from audio and return (plot, summary).

    BUG FIX: the original returned 3-tuples on its error paths, but this
    handler is wired to only two Gradio output components
    (patches_plot, patches_summary), so Gradio raised a value-count error
    instead of showing the message. Error text now goes to the summary slot.
    """
    if audio_file is None:
        return None, "Please upload an audio file or download from YouTube first."

    plot_path, summary, error = analyzer.generate_patches(
        audio_file, patch_duration=patch_duration, hop_duration=hop_duration)
    if error:
        # Surface the error message in the summary panel.
        return None, error
    return plot_path, summary
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI: left column takes audio input (YouTube download or file upload);
# right column shows tabbed analysis results wired to the handlers above.
# NOTE(review): the emoji string literals below appear mojibake-encoded
# (e.g. "π΅" was presumably a musical-note emoji) — confirm the file's
# encoding before editing any of these strings.
# ---------------------------------------------------------------------------
with gr.Blocks(title="π΅ Audio Analysis Suite", theme=gr.themes.Soft()) as app:
    gr.Markdown("""
# π΅ Audio Analysis Suite

A comprehensive tool for audio feature extraction and analysis. Upload an audio file or download from YouTube to get started!

**Features:**
- π **Basic Features**: Waveform, Mel Spectrogram, MFCC, Spectral Analysis, Tempo Detection
- πΌ **Chroma Features**: Advanced harmonic content analysis with multiple extraction methods
- π§© **Transformer Patches**: Generate fixed-duration patches for deep learning applications
""")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### π Audio Input")

            # Input source 1: download the audio track of a YouTube video.
            with gr.Group():
                gr.Markdown("**Download from YouTube:**")
                youtube_url = gr.Textbox(
                    label="YouTube URL",
                    placeholder="https://www.youtube.com/watch?v=...",
                    info="Paste a YouTube video URL to extract audio"
                )
                download_btn = gr.Button("π₯ Download Audio", variant="primary")
                download_status = gr.Textbox(label="Download Status", interactive=False)

            # Input source 2: direct file upload.
            with gr.Group():
                gr.Markdown("**Or upload audio file:**")
                # NOTE(review): gr.Audio may not accept an `info` kwarg in all
                # Gradio releases — confirm against the installed version.
                audio_file = gr.Audio(
                    label="Upload Audio File",
                    type="filepath",
                    info="Supported formats: MP3, WAV, FLAC, etc."
                )

        with gr.Column(scale=2):
            gr.Markdown("### π Analysis Results")

            # One tab per analysis; each tab owns its plot, summary, and
            # trigger button.
            with gr.Tabs():
                with gr.Tab("π Basic Features"):
                    basic_plot = gr.Image(label="Feature Visualizations")
                    basic_summary = gr.Markdown()
                    basic_analyze_btn = gr.Button("π Analyze Basic Features", variant="secondary")

                with gr.Tab("πΌ Chroma Features"):
                    chroma_plot = gr.Image(label="Chroma Visualizations")
                    chroma_summary = gr.Markdown()
                    chroma_analyze_btn = gr.Button("πΌ Analyze Chroma Features", variant="secondary")

                with gr.Tab("π§© Transformer Patches"):
                    with gr.Row():
                        patch_duration = gr.Slider(
                            label="Patch Duration (seconds)",
                            minimum=1.0, maximum=10.0, value=5.0, step=0.5,
                            info="Duration of each patch"
                        )
                        hop_duration = gr.Slider(
                            label="Hop Duration (seconds)",
                            minimum=0.1, maximum=5.0, value=1.0, step=0.1,
                            info="Time between patch starts"
                        )

                    patches_plot = gr.Image(label="Generated Patches")
                    patches_summary = gr.Markdown()
                    patches_analyze_btn = gr.Button("π§© Generate Patches", variant="secondary")

    gr.Markdown("""
### βΉοΈ Usage Tips
- **Processing is limited to 60 seconds** for basic features and 30 seconds for chroma analysis to ensure fast response times
- **YouTube downloads** respect platform terms of service
- **Visualizations** are high-quality and suitable for research/educational use
- **All processing** is done locally in your browser session
""")

    # --- Event wiring ---
    # Download fills the audio component, shows the status message, and
    # toggles the basic-analyze button's visibility via gr.update().
    download_btn.click(
        process_youtube_url,
        inputs=[youtube_url],
        outputs=[audio_file, download_status, basic_analyze_btn]
    )

    basic_analyze_btn.click(
        analyze_audio_basic,
        inputs=[audio_file],
        outputs=[basic_plot, basic_summary]
    )

    chroma_analyze_btn.click(
        analyze_audio_chroma,
        inputs=[audio_file],
        outputs=[chroma_plot, chroma_summary]
    )

    patches_analyze_btn.click(
        analyze_audio_patches,
        inputs=[audio_file, patch_duration, hop_duration],
        outputs=[patches_plot, patches_summary]
    )

    # Auto-run the basic analysis whenever the audio input changes — fires on
    # upload, on clear, and when the download handler sets the component.
    audio_file.change(
        analyze_audio_basic,
        inputs=[audio_file],
        outputs=[basic_plot, basic_summary]
    )

if __name__ == "__main__":
    app.launch()
|
|
|
|