Spaces:
g0th
/
Sleeping

latterworks committed on
Commit
1902030
·
verified ·
1 Parent(s): d7fc5bc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +252 -327
app.py CHANGED
@@ -8,443 +8,368 @@ import matplotlib.pyplot as plt
8
  import numpy as np
9
  import scipy.ndimage
10
  from pathlib import Path
 
11
  import warnings
12
- warnings.filterwarnings('ignore')
 
13
 
14
- # Set matplotlib backend for web display
15
  plt.switch_backend('Agg')
 
 
 
 
 
 
 
 
 
16
 
17
  class AudioAnalyzer:
18
- def __init__(self):
19
- self.temp_dir = tempfile.mkdtemp()
20
-
21
- def download_youtube_audio(self, video_url, progress=gr.Progress()):
22
- """Download audio from YouTube video using yt-dlp."""
 
 
 
 
 
 
 
 
 
 
 
23
  if not video_url:
24
- return None, "Please provide a YouTube URL"
25
-
26
  progress(0.1, desc="Initializing download...")
27
-
28
- output_dir = os.path.join(self.temp_dir, "downloaded_audio")
29
- os.makedirs(output_dir, exist_ok=True)
30
-
31
- # yt-dlp command to extract audio in mp3 format
32
  command = [
33
  "yt-dlp",
34
  "-x",
35
  "--audio-format", "mp3",
36
- "-o", os.path.join(output_dir, "%(title)s.%(ext)s"),
37
  "--no-playlist",
38
  "--restrict-filenames",
39
  video_url
40
  ]
41
-
42
  try:
43
  progress(0.3, desc="Downloading audio...")
44
- result = subprocess.run(command, check=True, capture_output=True, text=True)
45
-
46
- # Find the downloaded file
47
- for file in os.listdir(output_dir):
48
- if file.endswith('.mp3'):
49
- file_path = os.path.join(output_dir, file)
50
- progress(1.0, desc="Download complete!")
51
- return file_path, f"Successfully downloaded: {file}"
52
-
53
- return None, "Download completed but no audio file found"
54
-
55
  except FileNotFoundError:
56
- return None, "yt-dlp not found. Please install it: pip install yt-dlp"
57
  except subprocess.CalledProcessError as e:
58
  return None, f"Download failed: {e.stderr}"
59
  except Exception as e:
 
60
  return None, f"Unexpected error: {str(e)}"
61
-
62
- def extract_basic_features(self, audio_path, sr=16000, progress=gr.Progress()):
63
- """Extract basic audio features and create visualizations."""
64
- if not audio_path or not os.path.exists(audio_path):
65
- return None, None, "Invalid audio file"
66
-
 
67
  try:
68
  progress(0.1, desc="Loading audio...")
69
  y, sr = librosa.load(audio_path, sr=sr)
70
  duration = librosa.get_duration(y=y, sr=sr)
71
-
72
- # Limit to first 60 seconds for processing speed
73
- max_duration = 60
74
  if duration > max_duration:
75
- y = y[:sr * max_duration]
76
  duration = max_duration
77
-
78
  progress(0.3, desc="Computing features...")
79
-
80
- # Basic features
81
- features = {}
82
- features['duration'] = duration
83
- features['sample_rate'] = sr
84
- features['samples'] = len(y)
85
-
86
- # Mel spectrogram
 
 
 
87
  progress(0.5, desc="Computing mel spectrogram...")
88
  hop_length = 512
89
- S_mel = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=hop_length)
90
  S_dB = librosa.power_to_db(S_mel, ref=np.max)
91
-
92
- # Other features
93
- features['tempo'], _ = librosa.beat.beat_track(y=y, sr=sr)
94
- features['mfcc'] = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
95
- features['spectral_centroid'] = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
96
- features['spectral_rolloff'] = librosa.feature.spectral_rolloff(y=y, sr=sr)[0]
97
- features['zero_crossing_rate'] = librosa.feature.zero_crossing_rate(y)[0]
98
-
99
  progress(0.8, desc="Creating visualizations...")
100
-
101
- # Create visualizations
102
  fig, axes = plt.subplots(2, 2, figsize=(15, 10))
103
-
104
- # Waveform
105
  time_axis = librosa.frames_to_time(range(len(y)), sr=sr)
106
  axes[0, 0].plot(time_axis, y)
107
  axes[0, 0].set_title('Waveform')
108
  axes[0, 0].set_xlabel('Time (s)')
109
  axes[0, 0].set_ylabel('Amplitude')
110
-
111
- # Mel spectrogram
112
  librosa.display.specshow(S_dB, sr=sr, hop_length=hop_length,
113
- x_axis='time', y_axis='mel', ax=axes[0, 1])
114
  axes[0, 1].set_title('Mel Spectrogram')
115
-
116
- # MFCC
117
  librosa.display.specshow(features['mfcc'], sr=sr, x_axis='time', ax=axes[1, 0])
118
  axes[1, 0].set_title('MFCC')
119
-
120
- # Spectral features
121
  times = librosa.frames_to_time(range(len(features['spectral_centroid'])), sr=sr, hop_length=hop_length)
122
  axes[1, 1].plot(times, features['spectral_centroid'], label='Spectral Centroid')
123
  axes[1, 1].plot(times, features['spectral_rolloff'], label='Spectral Rolloff')
124
  axes[1, 1].set_title('Spectral Features')
125
  axes[1, 1].set_xlabel('Time (s)')
126
  axes[1, 1].legend()
127
-
128
  plt.tight_layout()
129
-
130
- # Save plot
131
- plot_path = os.path.join(self.temp_dir, f"basic_features_{np.random.randint(10000)}.png")
132
  plt.savefig(plot_path, dpi=150, bbox_inches='tight')
133
  plt.close()
134
-
135
- # Create summary text
136
  summary = f"""
137
- **Audio Summary:**
138
- - Duration: {duration:.2f} seconds
139
- - Sample Rate: {sr} Hz
140
- - Estimated Tempo: {features['tempo']:.1f} BPM
141
- - Number of Samples: {len(y):,}
142
-
143
- **Feature Shapes:**
144
- - MFCC: {features['mfcc'].shape}
145
- - Spectral Centroid: {features['spectral_centroid'].shape}
146
- - Spectral Rolloff: {features['spectral_rolloff'].shape}
147
- - Zero Crossing Rate: {features['zero_crossing_rate'].shape}
148
  """
149
-
150
  progress(1.0, desc="Analysis complete!")
151
- return plot_path, summary, None
152
-
153
  except Exception as e:
 
154
  return None, None, f"Error processing audio: {str(e)}"
155
-
156
- def extract_chroma_features(self, audio_path, sr=16000, progress=gr.Progress()):
 
157
  """Extract and visualize enhanced chroma features."""
158
- if not audio_path or not os.path.exists(audio_path):
159
- return None, "Invalid audio file"
160
-
161
  try:
162
  progress(0.1, desc="Loading audio...")
163
  y, sr = librosa.load(audio_path, sr=sr)
164
-
165
- # Limit to first 30 seconds for processing speed
166
- max_duration = 30
167
  if len(y) > sr * max_duration:
168
- y = y[:sr * max_duration]
169
-
170
  progress(0.3, desc="Computing chroma variants...")
171
-
172
- # Original chroma
173
  chroma_orig = librosa.feature.chroma_cqt(y=y, sr=sr)
174
-
175
- # Harmonic-percussive separation
176
  y_harm = librosa.effects.harmonic(y=y, margin=8)
177
  chroma_harm = librosa.feature.chroma_cqt(y=y_harm, sr=sr)
178
-
179
- progress(0.6, desc="Applying filters...")
180
-
181
- # Non-local filtering
182
  chroma_filter = np.minimum(chroma_harm,
183
  librosa.decompose.nn_filter(chroma_harm,
184
  aggregate=np.median,
185
  metric='cosine'))
186
-
187
- # Median filtering
188
  chroma_smooth = scipy.ndimage.median_filter(chroma_filter, size=(1, 9))
189
-
190
- # STFT-based chroma
191
  chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
192
-
193
- # CENS features
194
  chroma_cens = librosa.feature.chroma_cens(y=y, sr=sr)
195
-
196
  progress(0.8, desc="Creating visualizations...")
197
-
198
- # Create comprehensive visualization
199
  fig, axes = plt.subplots(3, 2, figsize=(15, 12))
200
-
201
- # Original vs Harmonic
202
- librosa.display.specshow(chroma_orig, y_axis='chroma', x_axis='time', ax=axes[0, 0])
203
- axes[0, 0].set_title('Original Chroma (CQT)')
204
-
205
- librosa.display.specshow(chroma_harm, y_axis='chroma', x_axis='time', ax=axes[0, 1])
206
- axes[0, 1].set_title('Harmonic Chroma')
207
-
208
- # Filtered vs Smooth
209
- librosa.display.specshow(chroma_filter, y_axis='chroma', x_axis='time', ax=axes[1, 0])
210
- axes[1, 0].set_title('Non-local Filtered')
211
-
212
- librosa.display.specshow(chroma_smooth, y_axis='chroma', x_axis='time', ax=axes[1, 1])
213
- axes[1, 1].set_title('Median Filtered')
214
-
215
- # STFT vs CENS
216
- librosa.display.specshow(chroma_stft, y_axis='chroma', x_axis='time', ax=axes[2, 0])
217
- axes[2, 0].set_title('Chroma (STFT)')
218
-
219
- librosa.display.specshow(chroma_cens, y_axis='chroma', x_axis='time', ax=axes[2, 1])
220
- axes[2, 1].set_title('CENS Features')
221
-
222
  plt.tight_layout()
223
-
224
- # Save plot
225
- plot_path = os.path.join(self.temp_dir, f"chroma_features_{np.random.randint(10000)}.png")
226
  plt.savefig(plot_path, dpi=150, bbox_inches='tight')
227
  plt.close()
228
-
 
229
  progress(1.0, desc="Chroma analysis complete!")
230
- return plot_path, None
231
-
232
  except Exception as e:
233
- return None, f"Error processing chroma features: {str(e)}"
234
-
235
- def generate_patches(self, audio_path, sr=16000, patch_duration=5.0, hop_duration=1.0, progress=gr.Progress()):
 
 
236
  """Generate fixed-duration patches for transformer input."""
237
- if not audio_path or not os.path.exists(audio_path):
238
- return None, None, "Invalid audio file"
239
-
240
  try:
241
  progress(0.1, desc="Loading audio...")
242
  y, sr = librosa.load(audio_path, sr=sr)
243
-
244
  progress(0.3, desc="Computing mel spectrogram...")
245
  hop_length = 512
246
  S_mel = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=hop_length, n_mels=80)
247
  S_dB = librosa.power_to_db(S_mel, ref=np.max)
248
-
249
  progress(0.5, desc="Generating patches...")
250
-
251
- # Convert time to frames
252
  patch_frames = librosa.time_to_frames(patch_duration, sr=sr, hop_length=hop_length)
253
  hop_frames = librosa.time_to_frames(hop_duration, sr=sr, hop_length=hop_length)
254
-
255
- # Generate patches using librosa.util.frame
256
  patches = librosa.util.frame(S_dB, frame_length=patch_frames, hop_length=hop_frames)
257
-
258
  progress(0.8, desc="Creating visualizations...")
259
-
260
- # Visualize patches
261
  num_patches_to_show = min(6, patches.shape[-1])
262
  fig, axes = plt.subplots(2, 3, figsize=(18, 8))
263
  axes = axes.flatten()
264
-
265
  for i in range(num_patches_to_show):
266
  librosa.display.specshow(patches[..., i], y_axis='mel', x_axis='time',
267
  ax=axes[i], sr=sr, hop_length=hop_length)
268
  axes[i].set_title(f'Patch {i+1}')
269
-
270
- # Hide unused subplots
271
  for i in range(num_patches_to_show, len(axes)):
272
  axes[i].set_visible(False)
273
-
274
  plt.tight_layout()
275
-
276
- # Save plot
277
- plot_path = os.path.join(self.temp_dir, f"patches_{np.random.randint(10000)}.png")
278
  plt.savefig(plot_path, dpi=150, bbox_inches='tight')
279
  plt.close()
280
-
281
- # Summary
282
  summary = f"""
283
- **Patch Generation Summary:**
284
- - Total patches generated: {patches.shape[-1]}
285
- - Patch duration: {patch_duration} seconds
286
- - Hop duration: {hop_duration} seconds
287
- - Patch shape (mels, time, patches): {patches.shape}
288
- - Each patch covers {patch_frames} time frames
289
  """
290
-
291
  progress(1.0, desc="Patch generation complete!")
292
- return plot_path, summary, None
293
-
294
  except Exception as e:
 
295
  return None, None, f"Error generating patches: {str(e)}"
296
 
297
- # Initialize analyzer
298
- analyzer = AudioAnalyzer()
299
-
300
- # Gradio interface functions
301
- def process_youtube_url(url):
302
- """Process YouTube URL and return audio file."""
303
- file_path, message = analyzer.download_youtube_audio(url)
304
- if file_path:
305
- return file_path, message, gr.update(visible=True)
306
- else:
307
- return None, message, gr.update(visible=False)
308
-
309
- def analyze_audio_basic(audio_file):
310
- """Analyze audio file and return basic features."""
311
- if audio_file is None:
312
- return None, "Please upload an audio file or download from YouTube first."
313
-
314
- plot_path, summary, error = analyzer.extract_basic_features(audio_file)
315
- if error:
316
- return None, error
317
- return plot_path, summary
318
-
319
- def analyze_audio_chroma(audio_file):
320
- """Analyze audio file for chroma features."""
321
- if audio_file is None:
322
- return None, "Please upload an audio file or download from YouTube first."
323
-
324
- plot_path, error = analyzer.extract_chroma_features(audio_file)
325
- if error:
326
- return None, error
327
- return plot_path, "Chroma feature analysis complete! This shows different chroma extraction methods for harmonic analysis."
328
-
329
- def analyze_audio_patches(audio_file, patch_duration, hop_duration):
330
- """Generate transformer patches from audio."""
331
- if audio_file is None:
332
- return None, None, "Please upload an audio file or download from YouTube first."
333
-
334
- plot_path, summary, error = analyzer.generate_patches(audio_file, patch_duration=patch_duration, hop_duration=hop_duration)
335
- if error:
336
- return None, None, error
337
- return plot_path, summary
338
-
339
- # Create Gradio interface
340
- with gr.Blocks(title="🎡 Audio Analysis Suite", theme=gr.themes.Soft()) as app:
341
- gr.Markdown("""
342
- # 🎡 Audio Analysis Suite
343
-
344
- A comprehensive tool for audio feature extraction and analysis. Upload an audio file or download from YouTube to get started!
345
-
346
- **Features:**
347
- - πŸ“Š **Basic Features**: Waveform, Mel Spectrogram, MFCC, Spectral Analysis, Tempo Detection
348
- - 🎼 **Chroma Features**: Advanced harmonic content analysis with multiple extraction methods
349
- - 🧩 **Transformer Patches**: Generate fixed-duration patches for deep learning applications
350
- """)
351
-
352
- with gr.Row():
353
- with gr.Column(scale=1):
354
- gr.Markdown("### πŸ“ Audio Input")
355
-
356
- # YouTube downloader
357
- with gr.Group():
358
- gr.Markdown("**Download from YouTube:**")
359
- youtube_url = gr.Textbox(
360
- label="YouTube URL",
361
- placeholder="https://www.youtube.com/watch?v=...",
362
- info="Paste a YouTube video URL to extract audio"
363
- )
364
- download_btn = gr.Button("πŸ“₯ Download Audio", variant="primary")
365
- download_status = gr.Textbox(label="Download Status", interactive=False)
366
-
367
- # File upload
368
- with gr.Group():
369
- gr.Markdown("**Or upload audio file:**")
370
- audio_file = gr.Audio(
371
- label="Upload Audio File",
372
- type="filepath",
373
- info="Supported formats: MP3, WAV, FLAC, etc."
374
- )
375
-
376
- with gr.Column(scale=2):
377
- gr.Markdown("### πŸ” Analysis Results")
378
-
379
- with gr.Tabs():
380
- with gr.Tab("πŸ“Š Basic Features"):
381
- basic_plot = gr.Image(label="Feature Visualizations")
382
- basic_summary = gr.Markdown()
383
- basic_analyze_btn = gr.Button("πŸ” Analyze Basic Features", variant="secondary")
384
-
385
- with gr.Tab("🎼 Chroma Features"):
386
- chroma_plot = gr.Image(label="Chroma Visualizations")
387
- chroma_summary = gr.Markdown()
388
- chroma_analyze_btn = gr.Button("🎼 Analyze Chroma Features", variant="secondary")
389
-
390
- with gr.Tab("🧩 Transformer Patches"):
391
- with gr.Row():
392
- patch_duration = gr.Slider(
393
- label="Patch Duration (seconds)",
394
- minimum=1.0, maximum=10.0, value=5.0, step=0.5,
395
- info="Duration of each patch"
396
- )
397
- hop_duration = gr.Slider(
398
- label="Hop Duration (seconds)",
399
- minimum=0.1, maximum=5.0, value=1.0, step=0.1,
400
- info="Time between patch starts"
401
- )
402
-
403
- patches_plot = gr.Image(label="Generated Patches")
404
- patches_summary = gr.Markdown()
405
- patches_analyze_btn = gr.Button("🧩 Generate Patches", variant="secondary")
406
-
407
- gr.Markdown("""
408
- ### ℹ️ Usage Tips
409
- - **Processing is limited to 60 seconds** for basic features and 30 seconds for chroma analysis to ensure fast response times
410
- - **YouTube downloads** respect platform terms of service
411
- - **Visualizations** are high-quality and suitable for research/educational use
412
- - **All processing** is done locally in your browser session
413
- """)
414
-
415
- # Event handlers
416
- download_btn.click(
417
- process_youtube_url,
418
- inputs=[youtube_url],
419
- outputs=[audio_file, download_status, basic_analyze_btn]
420
- )
421
-
422
- basic_analyze_btn.click(
423
- analyze_audio_basic,
424
- inputs=[audio_file],
425
- outputs=[basic_plot, basic_summary]
426
- )
427
-
428
- chroma_analyze_btn.click(
429
- analyze_audio_chroma,
430
- inputs=[audio_file],
431
- outputs=[chroma_plot, chroma_summary]
432
- )
433
-
434
- patches_analyze_btn.click(
435
- analyze_audio_patches,
436
- inputs=[audio_file, patch_duration, hop_duration],
437
- outputs=[patches_plot, patches_summary]
438
- )
439
-
440
- # Auto-analyze when file is uploaded
441
- audio_file.change(
442
- analyze_audio_basic,
443
- inputs=[audio_file],
444
- outputs=[basic_plot, basic_summary]
445
- )
446
 
447
- if __name__ == "__main__":
448
- app.launch()
 
 
 
 
 
 
 
 
449
 
450
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  import numpy as np
9
  import scipy.ndimage
10
  from pathlib import Path
11
+ import logging
12
  import warnings
13
+ import shutil
14
+ from typing import Tuple, Optional, Dict, Any
15
 
16
# Plots are only ever written to image files, so select the non-interactive
# Agg backend before any figure is created.
plt.switch_backend('Agg')
warnings.filterwarnings('ignore')

# Route INFO-and-above log records to the console with a timestamped format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler()],
)
logger = logging.getLogger(__name__)
27
 
28
class AudioAnalyzer:
    """Core class for audio analysis with modular feature extraction methods.

    Public methods report failures through their return values instead of
    raising, so they can be used directly as UI callbacks. Downloads and
    rendered plots are written below ``self.temp_dir``.
    """

    def __init__(self, temp_dir: Optional[str] = None):
        """Initialize with a temporary directory for file storage.

        Args:
            temp_dir: Directory for downloads and plots. When omitted, a fresh
                directory from ``tempfile.mkdtemp()`` is used.
        """
        self.temp_dir = Path(temp_dir or tempfile.mkdtemp())
        self.temp_dir.mkdir(parents=True, exist_ok=True)
        logger.info("Initialized temporary directory: %s", self.temp_dir)

    def cleanup(self) -> None:
        """Remove the temporary directory and its contents (best effort)."""
        if self.temp_dir.exists():
            shutil.rmtree(self.temp_dir, ignore_errors=True)
            logger.info("Cleaned up temporary directory: %s", self.temp_dir)

    def _new_output_path(self, prefix: str, suffix: str = ".png") -> Path:
        """Reserve a uniquely named file under ``temp_dir`` and return its path."""
        # cleanup() may already have removed the directory; recreate it so a
        # later request can still write its output.
        self.temp_dir.mkdir(parents=True, exist_ok=True)
        with tempfile.NamedTemporaryFile(
            prefix=f"{prefix}_", suffix=suffix, dir=self.temp_dir, delete=False
        ) as handle:
            return Path(handle.name)

    def _save_figure(self, fig, prefix: str) -> str:
        """Write *fig* as a PNG under ``temp_dir`` and return the file path."""
        plot_path = self._new_output_path(prefix)
        fig.savefig(plot_path, dpi=150, bbox_inches='tight')
        return str(plot_path)

    def download_youtube_audio(self, video_url: str, progress=gr.Progress()) -> Tuple[Optional[str], str]:
        """Download audio from YouTube using yt-dlp.

        Args:
            video_url: URL of the video whose audio track should be extracted.
            progress: Progress callback, called with a fraction and ``desc``.

        Returns:
            ``(path, message)``. ``path`` is the MP3 file on success and
            ``None`` on failure; ``message`` is a human-readable status.
        """
        url = (video_url or "").strip()
        if not url:
            return None, "Please provide a valid YouTube URL"

        progress(0.1, desc="Initializing download...")
        downloads_root = self.temp_dir / "downloaded_audio"
        downloads_root.mkdir(parents=True, exist_ok=True)
        # One fresh directory per request: with a fixed output path, yt-dlp
        # would treat an earlier download as "already downloaded" and return
        # stale audio, and concurrent requests would overwrite each other.
        output_dir = Path(tempfile.mkdtemp(dir=downloads_root))
        output_file = output_dir / "audio.mp3"

        command = [
            "yt-dlp",
            "-x",
            "--audio-format", "mp3",
            # Let yt-dlp pick the intermediate extension; the extracted audio
            # ends up as audio.mp3 because of --audio-format.
            "-o", str(output_dir / "audio.%(ext)s"),
            "--no-playlist",
            "--restrict-filenames",
            # End of options: a user-supplied value starting with "-" must be
            # treated as a URL, never as a yt-dlp flag.
            "--",
            url,
        ]

        try:
            progress(0.3, desc="Downloading audio...")
            subprocess.run(command, check=True, capture_output=True, text=True)
        except FileNotFoundError:
            return None, "yt-dlp not found. Install it with: pip install yt-dlp"
        except subprocess.CalledProcessError as e:
            return None, f"Download failed: {e.stderr}"
        except Exception as e:
            logger.exception("Unexpected error during download: %s", e)
            return None, f"Unexpected error: {str(e)}"

        # A zero exit status does not guarantee the expected file was produced.
        if not output_file.exists():
            return None, "Download completed but no audio file found"

        progress(1.0, desc="Download complete!")
        return str(output_file), f"Successfully downloaded audio: {output_file.name}"

    def extract_basic_features(self, audio_path: str, sr: int = 16000, max_duration: float = 60.0,
                               progress=gr.Progress()) -> Tuple[Optional[str], Optional[str], Optional[str]]:
        """Extract basic audio features and generate visualizations.

        Args:
            audio_path: Path of the audio file to analyze.
            sr: Sample rate the audio is resampled to on load.
            max_duration: Only the first ``max_duration`` seconds are analyzed.
            progress: Progress callback, called with a fraction and ``desc``.

        Returns:
            ``(plot_path, summary, error)``. On success ``error`` is ``None``;
            on failure the first two elements are ``None``.
        """
        if not audio_path or not Path(audio_path).exists():
            return None, None, "Invalid or missing audio file"

        try:
            progress(0.1, desc="Loading audio...")
            y, sr = librosa.load(audio_path, sr=sr)
            if len(y) == 0:
                return None, None, "Audio file contains no samples"

            duration = librosa.get_duration(y=y, sr=sr)
            if duration > max_duration:
                y = y[:int(sr * max_duration)]
                duration = max_duration

            progress(0.3, desc="Computing features...")
            tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
            features: Dict[str, Any] = {
                'duration': duration,
                'sample_rate': sr,
                'samples': len(y),
                # beat_track may return the tempo as a scalar or as an array
                # depending on the librosa version; normalize to a float so
                # the ':.1f' format below always works.
                'tempo': float(np.atleast_1d(tempo)[0]),
                'mfcc': librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13),
                'spectral_centroid': librosa.feature.spectral_centroid(y=y, sr=sr)[0],
                'spectral_rolloff': librosa.feature.spectral_rolloff(y=y, sr=sr)[0],
                'zero_crossing_rate': librosa.feature.zero_crossing_rate(y)[0],
            }

            progress(0.5, desc="Computing mel spectrogram...")
            hop_length = 512
            S_mel = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=hop_length, n_mels=80)
            S_dB = librosa.power_to_db(S_mel, ref=np.max)

            progress(0.8, desc="Creating visualizations...")
            fig, axes = plt.subplots(2, 2, figsize=(15, 10))
            try:
                # y is indexed by sample, not by frame, so the time axis is
                # sample_index / sr (frames_to_time would scale it by the hop).
                time_axis = np.arange(len(y)) / sr
                axes[0, 0].plot(time_axis, y)
                axes[0, 0].set_title('Waveform')
                axes[0, 0].set_xlabel('Time (s)')
                axes[0, 0].set_ylabel('Amplitude')

                librosa.display.specshow(S_dB, sr=sr, hop_length=hop_length,
                                         x_axis='time', y_axis='mel', ax=axes[0, 1])
                axes[0, 1].set_title('Mel Spectrogram')

                librosa.display.specshow(features['mfcc'], sr=sr, x_axis='time', ax=axes[1, 0])
                axes[1, 0].set_title('MFCC')

                times = librosa.frames_to_time(
                    range(len(features['spectral_centroid'])), sr=sr, hop_length=hop_length
                )
                axes[1, 1].plot(times, features['spectral_centroid'], label='Spectral Centroid')
                axes[1, 1].plot(times, features['spectral_rolloff'], label='Spectral Rolloff')
                axes[1, 1].set_title('Spectral Features')
                axes[1, 1].set_xlabel('Time (s)')
                axes[1, 1].legend()

                fig.tight_layout()
                plot_path = self._save_figure(fig, "basic_features")
            finally:
                # Always release the figure, even if plotting or saving failed.
                plt.close(fig)

            summary = "\n".join([
                "**Audio Summary:**",
                f"- Duration: {duration:.2f} seconds",
                f"- Sample Rate: {sr} Hz",
                f"- Estimated Tempo: {features['tempo']:.1f} BPM",
                f"- Number of Samples: {len(y):,}",
                "",
                "**Feature Shapes:**",
                f"- MFCC: {features['mfcc'].shape}",
                f"- Spectral Centroid: {features['spectral_centroid'].shape}",
                f"- Spectral Rolloff: {features['spectral_rolloff'].shape}",
                f"- Zero Crossing Rate: {features['zero_crossing_rate'].shape}",
            ])

            progress(1.0, desc="Analysis complete!")
            return plot_path, summary, None

        except Exception as e:
            logger.exception("Error processing audio: %s", e)
            return None, None, f"Error processing audio: {str(e)}"

    def extract_chroma_features(self, audio_path: str, sr: int = 16000, max_duration: float = 30.0,
                                progress=gr.Progress()) -> Tuple[Optional[str], Optional[str], Optional[str]]:
        """Extract and visualize enhanced chroma features.

        Six chroma variants are computed and plotted side by side: CQT,
        harmonic-only CQT, nearest-neighbour filtered, median filtered,
        STFT-based and CENS.

        Args:
            audio_path: Path of the audio file to analyze.
            sr: Sample rate the audio is resampled to on load.
            max_duration: Only the first ``max_duration`` seconds are analyzed.
            progress: Progress callback, called with a fraction and ``desc``.

        Returns:
            ``(plot_path, summary, error)``. On success ``error`` is ``None``;
            on failure the first two elements are ``None``.
        """
        if not audio_path or not Path(audio_path).exists():
            return None, None, "Invalid or missing audio file"

        try:
            progress(0.1, desc="Loading audio...")
            y, sr = librosa.load(audio_path, sr=sr)
            if len(y) == 0:
                return None, None, "Audio file contains no samples"
            if len(y) > sr * max_duration:
                y = y[:int(sr * max_duration)]

            progress(0.3, desc="Computing chroma variants...")
            chroma_orig = librosa.feature.chroma_cqt(y=y, sr=sr)
            y_harm = librosa.effects.harmonic(y=y, margin=8)
            chroma_harm = librosa.feature.chroma_cqt(y=y_harm, sr=sr)
            chroma_filter = np.minimum(
                chroma_harm,
                librosa.decompose.nn_filter(chroma_harm, aggregate=np.median, metric='cosine'),
            )
            chroma_smooth = scipy.ndimage.median_filter(chroma_filter, size=(1, 9))
            chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
            chroma_cens = librosa.feature.chroma_cens(y=y, sr=sr)

            progress(0.8, desc="Creating visualizations...")
            panels = [
                (chroma_orig, 'Original Chroma (CQT)'),
                (chroma_harm, 'Harmonic Chroma'),
                (chroma_filter, 'Non-local Filtered'),
                (chroma_smooth, 'Median Filtered'),
                (chroma_stft, 'Chroma (STFT)'),
                (chroma_cens, 'CENS Features'),
            ]
            fig, axes = plt.subplots(3, 2, figsize=(15, 12))
            try:
                for ax, (chroma, title) in zip(axes.flatten(), panels):
                    librosa.display.specshow(chroma, y_axis='chroma', x_axis='time', ax=ax)
                    ax.set_title(title)

                fig.tight_layout()
                plot_path = self._save_figure(fig, "chroma_features")
            finally:
                # Always release the figure, even if plotting or saving failed.
                plt.close(fig)

            summary = "Chroma feature analysis complete! Visualizations show different chroma extraction methods for harmonic analysis."
            progress(1.0, desc="Chroma analysis complete!")
            return plot_path, summary, None

        except Exception as e:
            logger.exception("Error processing chroma features: %s", e)
            return None, None, f"Error processing chroma features: {str(e)}"

    def generate_patches(self, audio_path: str, sr: int = 16000, patch_duration: float = 5.0,
                         hop_duration: float = 1.0, progress=gr.Progress()) -> Tuple[Optional[str], Optional[str], Optional[str]]:
        """Generate fixed-duration patches for transformer input.

        The audio is converted to an 80-band mel spectrogram in dB, which is
        then cut into overlapping windows along the time axis. Up to six of
        the windows are plotted.

        Args:
            audio_path: Path of the audio file to analyze.
            sr: Sample rate the audio is resampled to on load.
            patch_duration: Length of each patch in seconds.
            hop_duration: Time between consecutive patch starts in seconds.
            progress: Progress callback, called with a fraction and ``desc``.

        Returns:
            ``(plot_path, summary, error)``. On success ``error`` is ``None``;
            on failure the first two elements are ``None``.
        """
        if not audio_path or not Path(audio_path).exists():
            return None, None, "Invalid or missing audio file"

        try:
            progress(0.1, desc="Loading audio...")
            y, sr = librosa.load(audio_path, sr=sr)
            if len(y) == 0:
                return None, None, "Audio file contains no samples"

            progress(0.3, desc="Computing mel spectrogram...")
            hop_length = 512
            S_mel = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=hop_length, n_mels=80)
            S_dB = librosa.power_to_db(S_mel, ref=np.max)

            progress(0.5, desc="Generating patches...")
            patch_frames = int(librosa.time_to_frames(patch_duration, sr=sr, hop_length=hop_length))
            # A hop shorter than one spectrogram frame rounds down to 0, which
            # is not a valid stride; fall back to a single frame.
            hop_frames = max(1, int(librosa.time_to_frames(hop_duration, sr=sr, hop_length=hop_length)))
            if patch_frames < 1:
                return None, None, "Patch duration is too short to cover a single spectrogram frame"
            if S_dB.shape[-1] < patch_frames:
                return None, None, f"Audio is too short for a {patch_duration} second patch"

            patches = librosa.util.frame(S_dB, frame_length=patch_frames, hop_length=hop_frames)

            progress(0.8, desc="Creating visualizations...")
            num_patches_to_show = min(6, patches.shape[-1])
            fig, axes = plt.subplots(2, 3, figsize=(18, 8))
            try:
                axes = axes.flatten()
                for i, ax in enumerate(axes):
                    if i < num_patches_to_show:
                        librosa.display.specshow(patches[..., i], y_axis='mel', x_axis='time',
                                                 ax=ax, sr=sr, hop_length=hop_length)
                        ax.set_title(f'Patch {i+1}')
                    else:
                        # Hide grid cells that have no patch to display.
                        ax.set_visible(False)

                fig.tight_layout()
                plot_path = self._save_figure(fig, "patches")
            finally:
                # Always release the figure, even if plotting or saving failed.
                plt.close(fig)

            summary = "\n".join([
                "**Patch Generation Summary:**",
                f"- Total patches generated: {patches.shape[-1]}",
                f"- Patch duration: {patch_duration} seconds",
                f"- Hop duration: {hop_duration} seconds",
                f"- Patch shape (mels, time, patches): {patches.shape}",
                f"- Each patch covers {patch_frames} time frames",
            ])

            progress(1.0, desc="Patch generation complete!")
            return plot_path, summary, None

        except Exception as e:
            logger.exception("Error generating patches: %s", e)
            return None, None, f"Error generating patches: {str(e)}"
262
 
263
def create_gradio_interface() -> gr.Blocks:
    """Create a modular Gradio interface for audio analysis.

    Builds the input column (YouTube download and file upload), the tabbed
    results column, and wires each button to the matching ``AudioAnalyzer``
    method.

    Returns:
        The assembled ``gr.Blocks`` app, not yet launched.
    """
    import atexit

    analyzer = AudioAnalyzer()
    # The analyzer and its temp directory are shared by every browser
    # session, so remove the directory once at interpreter exit rather than
    # whenever a single user closes their tab.
    atexit.register(analyzer.cleanup)

    def run_patches(audio_path, patch_seconds, hop_seconds, progress=gr.Progress()):
        """Forward the slider values to ``generate_patches`` by keyword.

        Gradio passes inputs positionally; without this adapter the sliders
        would land in ``sr`` and ``patch_duration`` instead of
        ``patch_duration`` and ``hop_duration``.
        """
        return analyzer.generate_patches(
            audio_path,
            patch_duration=patch_seconds,
            hop_duration=hop_seconds,
            progress=progress,
        )

    with gr.Blocks(title="🎵 Audio Analysis Suite", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🎵 Audio Analysis Suite

        Analyze audio from YouTube videos or uploaded files. Extract features or generate transformer patches for deep learning applications.

        **Features:**
        - 📊 **Basic Features**: Waveform, Mel Spectrogram, MFCC, Spectral Analysis, Tempo Detection
        - 🎼 **Chroma Features**: Harmonic content analysis with multiple extraction methods
        - 🧩 **Transformer Patches**: Fixed-duration patches for deep learning

        **Requirements**: Install `yt-dlp` with `pip install yt-dlp`.
        """)

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 📁 Audio Input")
                with gr.Group():
                    gr.Markdown("**Download from YouTube** (Supported formats: MP3, WAV, etc.)")
                    youtube_url = gr.Textbox(
                        label="YouTube URL",
                        placeholder="https://www.youtube.com/watch?v=...",
                    )
                    download_btn = gr.Button("📥 Download Audio", variant="primary")
                    download_status = gr.Textbox(label="Download Status", interactive=False)

                with gr.Group():
                    gr.Markdown("**Or upload audio file** (Supported formats: MP3, WAV, FLAC, etc.)")
                    audio_file = gr.Audio(
                        label="Upload Audio File",
                        type="filepath",
                    )

            with gr.Column(scale=2):
                gr.Markdown("### 🔍 Analysis Results")
                with gr.Tabs():
                    with gr.Tab("📊 Basic Features"):
                        basic_plot = gr.Image(label="Feature Visualizations")
                        basic_summary = gr.Markdown(label="Feature Summary")
                        basic_btn = gr.Button("🔍 Analyze Basic Features", variant="secondary")

                    with gr.Tab("🎼 Chroma Features"):
                        chroma_plot = gr.Image(label="Chroma Visualizations")
                        chroma_summary = gr.Markdown(label="Chroma Summary")
                        chroma_btn = gr.Button("🎼 Analyze Chroma Features", variant="secondary")

                    with gr.Tab("🧩 Transformer Patches"):
                        with gr.Row():
                            patch_duration = gr.Slider(
                                label="Patch Duration (seconds)",
                                minimum=1.0, maximum=10.0, value=5.0, step=0.5,
                            )
                            hop_duration = gr.Slider(
                                label="Hop Duration (seconds)",
                                minimum=0.1, maximum=5.0, value=1.0, step=0.1,
                            )
                        patches_plot = gr.Image(label="Generated Patches")
                        patches_summary = gr.Markdown(label="Patch Summary")
                        patches_btn = gr.Button("🧩 Generate Patches", variant="secondary")

                error_output = gr.Textbox(label="Error Messages", interactive=False)

        gr.Markdown("""
        ### ℹ️ Usage Tips
        - **Processing Limits**: 60s for basic features, 30s for chroma features to ensure fast response times
        - **YouTube Downloads**: Ensure URLs are valid and respect YouTube's terms of service
        - **Visualizations**: High-quality, suitable for research and educational use
        - **Storage**: Temporary files are automatically cleaned up when the app shuts down
        - **Support**: For issues, check the [GitHub repository](https://github.com/your-repo) or contact the developer
        """)

        # Event handlers
        download_btn.click(
            fn=analyzer.download_youtube_audio,
            inputs=[youtube_url],
            outputs=[audio_file, download_status],
        )

        basic_btn.click(
            fn=analyzer.extract_basic_features,
            inputs=[audio_file],
            outputs=[basic_plot, basic_summary, error_output],
        )

        chroma_btn.click(
            fn=analyzer.extract_chroma_features,
            inputs=[audio_file],
            outputs=[chroma_plot, chroma_summary, error_output],
        )

        patches_btn.click(
            fn=run_patches,
            inputs=[audio_file, patch_duration, hop_duration],
            outputs=[patches_plot, patches_summary, error_output],
        )

        # Run the basic analysis automatically whenever the audio changes.
        audio_file.change(
            fn=analyzer.extract_basic_features,
            inputs=[audio_file],
            outputs=[basic_plot, basic_summary, error_output],
        )

    return demo
372
+
373
# Script entry point: build the interface and start serving it.
if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.launch()