latterworks committed
Commit 6310c31 · verified · 1 Parent(s): 3a1b5d1

Update app.py

Files changed (1)
  1. app.py +443 -194
app.py CHANGED
@@ -1,205 +1,454 @@
  import gradio as gr
- from pathlib import Path
- import yt_dlp
- import logging
  import librosa
  import numpy as np
- from PIL import Image
- import ffmpeg
  import shutil
- import tempfile
- import time
-
- # Set up logging for debugging
- logging.basicConfig(level=logging.DEBUG)
-
- def analyze_audio(youtube_url, input_text, input_image=None, slider_value=50, checkbox_value=False):
-     """
-     Downloads YouTube audio, performs automatic audio feature analysis with librosa, and processes inputs.
-     Automatically handles file and folder management.
-
-     Args:
-         youtube_url (str): YouTube video URL (optional).
-         input_text (str): Text input for processing.
-         input_image (PIL.Image, optional): Image input for processing.
-         slider_value (float): Numerical parameter (e.g., analysis threshold).
-         checkbox_value (bool): Toggle for enhanced analysis.
-
-     Returns:
-         tuple: (processed_text, output_image_display, output_audio, extra_info)
-     """
-     # Create a unique temporary directory for this run
-     temp_dir = Path(tempfile.mkdtemp(prefix="audio_analysis_"))
-     output_dir = temp_dir / "downloaded_media"
-     output_dir.mkdir(parents=True, exist_ok=True)
-     logging.debug(f"Created temporary directory: {temp_dir}, output directory: {output_dir}")
-
-     try:
-         # Initialize outputs
-         processed_text = f"Processed: '{input_text}'."
-         output_image_display = input_image
-         output_audio = None
-         extra_info = f"Threshold: {slider_value/100:.2f}"
-
-         # Handle YouTube download if URL is provided
-         if youtube_url:
-             try:
-                 # Validate YouTube URL
-                 if not youtube_url.startswith(("https://www.youtube.com/", "https://youtu.be/")):
-                     return "Error: Invalid YouTube URL", None, None, "Processing failed."
-
-                 # YouTube download options (audio only)
-                 ydl_opts = {
-                     'format': 'bestaudio/best',
-                     'outtmpl': str(output_dir / '%(title)s.%(ext)s'),
-                     'postprocessors': [{
-                         'key': 'FFmpegExtractAudio',
-                         'preferredcodec': 'mp3',
-                         'preferredquality': '192',
-                     }],
-                     'restrictfilenames': True,
-                     'noplaylist': True,
-                 }
-
-                 # Download audio
-                 with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-                     info = ydl.extract_info(youtube_url, download=True)
-                     audio_file = output_dir / f"{info['title']}.mp3"
-                     logging.debug(f"Downloaded audio: {audio_file}")
-                     output_audio = str(audio_file)
-
-                 # Perform automatic audio feature analysis with librosa
-                 y, sr = librosa.load(audio_file)
-                 hop_length = 512  # Valid hop_length to fix "Invalid hop_length: 0" error
-                 logging.debug(f"Using hop_length: {hop_length}")
-
-                 # Extract features
-                 mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length)
-                 spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr, hop_length=hop_length)
-                 tempo, _ = librosa.beat.beat_track(y=y, sr=sr, hop_length=hop_length)
-
-                 # Aggregate features
-                 mfcc_mean = np.mean(mfcc, axis=1).tolist()[:3]  # Mean of first 3 MFCC coefficients
-                 spectral_centroid_mean = np.mean(spectral_centroid)
-                 features_summary = (
-                     f"Audio Features: MFCC (mean of first 3 coeffs): {mfcc_mean}, "
-                     f"Spectral Centroid: {spectral_centroid_mean:.2f} Hz, "
-                     f"Tempo: {tempo:.2f} BPM"
-                 )
-
-                 processed_text += f" {features_summary}."
-                 extra_info += f", Audio: {audio_file.name}"
-
-             except Exception as e:
-                 logging.error(f"YouTube download or audio processing error: {str(e)}")
-                 processed_text += f" Error processing YouTube audio: {str(e)}."
-
-         # Handle image processing if provided
-         if input_image is not None:
-             from PIL import ImageEnhance
-             enhancer = ImageEnhance.Brightness(input_image)
-             output_image_display = enhancer.enhance(1.5)
-             processed_text += " Image processed (brightened)."
-         else:
-             processed_text += " No image provided."
-
-         # Incorporate slider and checkbox
-         processed_text += f" Slider: {slider_value}, Enhanced Analysis: {checkbox_value}."
-         if checkbox_value:
-             processed_text += " Enhanced analysis enabled."
-             if youtube_url and slider_value > 50:
-                 processed_text += f" High threshold ({slider_value}) applied for deeper analysis."
-
-         return processed_text, output_image_display, output_audio, extra_info
-
-     except Exception as e:
-         logging.error(f"Error in analyze_audio: {str(e)}")
-         return f"Error: {str(e)}", None, None, "Processing failed."
-
-     finally:
-         # Clean up temporary directory after a delay to ensure file access
          try:
-             time.sleep(1)  # Brief delay to ensure Gradio can serve the audio file
-             if temp_dir.exists():
-                 shutil.rmtree(temp_dir)
-                 logging.debug(f"Cleaned up temporary directory: {temp_dir}")
          except Exception as e:
-             logging.error(f"Error cleaning up temporary directory: {str(e)}")

- # Define input components
- input_youtube_url = gr.Textbox(
-     label="YouTube Video URL",
-     placeholder="e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ",
-     info="Optional: Enter a YouTube URL to download and analyze audio."
- )
- input_text_component = gr.Textbox(
-     label="Input Text",
-     placeholder="e.g., Analyze this audio track",
-     info="Type a description or query for processing."
- )
- input_image_component = gr.Image(
-     type="pil",
-     label="Upload Image (Optional)",
-     sources=["upload", "webcam", "clipboard"]
- )
- input_slider_component = gr.Slider(
-     minimum=0,
-     maximum=100,
-     value=50,
-     step=1,
-     label="Analysis Threshold",
-     info="Adjusts sensitivity of audio feature analysis."
- )
- input_checkbox_component = gr.Checkbox(
-     label="Enable Enhanced Analysis",
-     info="Toggle for deeper audio feature extraction."
- )

- # Define output components
- output_text_component = gr.Textbox(
-     label="Analysis Results",
-     info="Text results including audio feature analysis."
- )
- output_image_component = gr.Image(
-     label="Processed Image (if any)",
-     info="Processed image output (if provided)."
- )
- output_audio_component = gr.Audio(
-     label="Downloaded Audio",
-     type="filepath",
-     info="Audio downloaded from YouTube."
- )
- output_label_component = gr.Label(
-     label="Analysis Summary",
-     info="Feature analysis details and processing info."
- )

- # Create the Gradio interface
- iface = gr.Interface(
-     fn=analyze_audio,
-     inputs=[
-         input_youtube_url,
-         input_text_component,
-         input_image_component,
-         input_slider_component,
-         input_checkbox_component
-     ],
-     outputs=[
-         output_text_component,
-         output_image_component,
-         output_audio_component,
-         output_label_component
-     ],
-     title="YouTube Audio Feature Analysis",
-     description="Download YouTube audio, analyze features with librosa, and process text/image inputs. Customize with slider and checkbox.",
-     examples=[
-         ["https://www.youtube.com/watch?v=dQw4w9WgXcQ", "Analyze this track", None, 75, True],
-         [None, "Describe a music track", None, 30, False],
-         ["https://www.youtube.com/watch?v=9bZkp7q19f0", "Extract audio features", None, 60, True]
-     ],
-     allow_flagging="never",
-     theme=gr.themes.Soft()
- )

  if __name__ == "__main__":
-     iface.launch()

  import gradio as gr
+ import subprocess
+ import os
+ import tempfile
  import librosa
+ import librosa.display
+ import matplotlib.pyplot as plt
  import numpy as np
+ import scipy.ndimage
+ from pathlib import Path
+
+ import logging
+ import warnings
+
  import shutil
+ from typing import Tuple, Optional, Dict, Any
+
+ # Configure matplotlib for web display
+ plt.switch_backend('Agg')
+ warnings.filterwarnings('ignore')
+
+ # Setup logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s - %(levelname)s - %(message)s",
+     handlers=[logging.StreamHandler()]
+ )
+ logger = logging.getLogger(__name__)
+
+ class AudioAnalyzer:
+     """Core class for audio analysis with modular feature extraction methods."""
+
+     def __init__(self, temp_dir: Optional[str] = None):
+         """Initialize with a temporary directory for file storage."""
+         self.temp_dir = Path(temp_dir or tempfile.mkdtemp())
+         self.temp_dir.mkdir(exist_ok=True)
+         self.plot_files = []  # Track plot files for cleanup
+         logger.info(f"Initialized temporary directory: {self.temp_dir}")
+
+     def cleanup(self) -> None:
+         """Remove temporary directory and plot files."""
+         for plot_file in self.plot_files:
+             if Path(plot_file).exists():
+                 try:
+                     Path(plot_file).unlink()
+                     logger.info(f"Removed plot file: {plot_file}")
+                 except Exception as e:
+                     logger.warning(f"Failed to remove plot file {plot_file}: {str(e)}")
+         if self.temp_dir.exists():
+             shutil.rmtree(self.temp_dir, ignore_errors=True)
+             logger.info(f"Cleaned up temporary directory: {self.temp_dir}")
+
+     def download_youtube_audio(self, video_url: str, progress=gr.Progress()) -> Tuple[Optional[str], str]:
+         """Download audio from YouTube using yt-dlp."""
+         if not video_url:
+             return None, "Please provide a valid YouTube URL"
+
+         progress(0.1, desc="Initializing download...")
+         output_dir = self.temp_dir / "downloaded_audio"
+         output_dir.mkdir(exist_ok=True)
+         output_file = output_dir / "audio.mp3"
+
+         command = [
+             "yt-dlp",
+             "-x",
+             "--audio-format", "mp3",
+             "-o", str(output_file),
+             "--no-playlist",
+             "--restrict-filenames",
+             video_url
+         ]
+
          try:
+             progress(0.3, desc="Downloading audio...")
+             subprocess.run(command, check=True, capture_output=True, text=True)
+             progress(1.0, desc="Download complete!")
+             return str(output_file), f"Successfully downloaded audio: {output_file.name}"
+         except FileNotFoundError:
+             return None, "yt-dlp not found. Install it with: pip install yt-dlp"
+         except subprocess.CalledProcessError as e:
+             return None, f"Download failed: {e.stderr}"
          except Exception as e:
+             logger.error(f"Unexpected error during download: {str(e)}")
+             return None, f"Error: {str(e)}"
+
+     def save_plot(self, fig, filename: str) -> Optional[str]:
+         """Save matplotlib figure to a temporary file and verify existence."""
+         try:
+             # Use NamedTemporaryFile to ensure persistence
+             with tempfile.NamedTemporaryFile(suffix='.png', delete=False, dir=self.temp_dir) as tmp_file:
+                 plot_path = tmp_file.name
+             fig.savefig(plot_path, dpi=300, bbox_inches='tight', format='png')
+             plt.close(fig)
+             if not Path(plot_path).exists():
+                 logger.error(f"Plot file not created: {plot_path}")
+                 return None
+             self.plot_files.append(plot_path)
+             logger.info(f"Saved plot: {plot_path}")
+             return str(plot_path)
+         except Exception as e:
+             logger.error(f"Error saving plot {filename}: {str(e)}")
+             plt.close(fig)
+             return None
+
+     def extract_basic_features(self, audio_path: str, sr: int = 16000, max_duration: float = 60.0,
+                                progress=gr.Progress()) -> Tuple[Optional[str], Optional[str], Optional[str]]:
+         """Extract basic audio features and generate visualizations."""
+         if not audio_path or not Path(audio_path).exists():
+             return None, None, "Invalid or missing audio file"
+
+         try:
+             progress(0.1, desc="Loading audio...")
+             y, sr = librosa.load(audio_path, sr=sr)
+             duration = librosa.get_duration(y=y, sr=sr)
+
+             if duration > max_duration:
+                 y = y[:int(sr * max_duration)]
+                 duration = max_duration
+
+             progress(0.3, desc="Computing features...")
+             features: Dict[str, Any] = {
+                 'duration': duration,
+                 'sample_rate': sr,
+                 'samples': len(y),
+                 'tempo': float(librosa.beat.beat_track(y=y, sr=sr)[0]),  # Convert to float
+                 'mfcc': librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13),
+                 'spectral_centroid': librosa.feature.spectral_centroid(y=y, sr=sr)[0],
+                 'spectral_rolloff': librosa.feature.spectral_rolloff(y=y, sr=sr)[0],
+                 'zero_crossing_rate': librosa.feature.zero_crossing_rate(y)[0]
+             }
+
+             progress(0.5, desc="Computing mel spectrogram...")
+             hop_length = 512
+             S_mel = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=hop_length, n_mels=80)
+             S_dB = librosa.power_to_db(S_mel, ref=np.max)
+
+             progress(0.8, desc="Creating visualizations...")
+             fig, axes = plt.subplots(2, 2, figsize=(15, 10))
+
+             time_axis = np.linspace(0, duration, len(y))
+             axes[0, 0].plot(time_axis, y)
+             axes[0, 0].set_title('Waveform')
+             axes[0, 0].set_xlabel('Time (s)')
+             axes[0, 0].set_ylabel('Amplitude')
+
+             librosa.display.specshow(S_dB, sr=sr, hop_length=hop_length,
+                                      x_axis='time', y_axis='mel', ax=axes[0, 1])
+             axes[0, 1].set_title('Mel Spectrogram')
+
+             librosa.display.specshow(features['mfcc'], sr=sr, x_axis='time', ax=axes[1, 0])
+             axes[1, 0].set_title('MFCC')
+
+             times = librosa.frames_to_time(range(len(features['spectral_centroid'])), sr=sr, hop_length=hop_length)
+             axes[1, 1].plot(times, features['spectral_centroid'], label='Spectral Centroid')
+             axes[1, 1].plot(times, features['spectral_rolloff'], label='Spectral Rolloff')
+             axes[1, 1].set_title('Spectral Features')
+             axes[1, 1].set_xlabel('Time (s)')
+             axes[1, 1].legend()
+
+             plt.tight_layout()
+             plot_path = self.save_plot(fig, "basic_features")
+             if not plot_path:
+                 return None, None, "Failed to save feature visualizations"
+
+             # Validate feature shapes
+             for key in ['mfcc', 'spectral_centroid', 'spectral_rolloff', 'zero_crossing_rate']:
+                 if not isinstance(features[key].shape, tuple):
+                     logger.error(f"Invalid shape for {key}: {features[key].shape}")
+                     return None, None, f"Invalid feature shape for {key}"
+
+             summary = f"""
+ **Audio Summary:**
+ - Duration: {duration:.2f} seconds
+ - Sample Rate: {sr} Hz
+ - Estimated Tempo: {features['tempo']:.1f} BPM
+ - Number of Samples: {features['samples']:,}
+
+ **Feature Shapes:**
+ - MFCC: {features['mfcc'].shape}
+ - Spectral Centroid: {features['spectral_centroid'].shape}
+ - Spectral Rolloff: {features['spectral_rolloff'].shape}
+ - Zero Crossing Rate: {features['zero_crossing_rate'].shape}
+ """
+
+             progress(1.0, desc="Analysis complete!")
+             return plot_path, summary, None
+
+         except Exception as e:
+             logger.error(f"Error processing audio: {str(e)}")
+             return None, None, f"Error processing audio: {str(e)}"
+
+     def extract_chroma_features(self, audio_path: str, sr: int = 16000, max_duration: float = 30.0,
+                                 progress=gr.Progress()) -> Tuple[Optional[str], Optional[str], Optional[str]]:
+         """Extract and visualize enhanced chroma features."""
+         if not audio_path or not Path(audio_path).exists():
+             return None, None, "Invalid or missing audio file"
+
+         try:
+             progress(0.1, desc="Loading audio...")
+             y, sr = librosa.load(audio_path, sr=sr)
+             if len(y) > sr * max_duration:
+                 y = y[:int(sr * max_duration)]
+
+             progress(0.3, desc="Computing chroma variants...")
+             chroma_orig = librosa.feature.chroma_cqt(y=y, sr=sr)
+             y_harm = librosa.effects.harmonic(y=y, margin=8)
+             chroma_harm = librosa.feature.chroma_cqt(y=y_harm, sr=sr)
+             chroma_filter = np.minimum(chroma_harm,
+                                        librosa.decompose.nn_filter(chroma_harm,
+                                                                    aggregate=np.median,
+                                                                    metric='cosine'))
+             chroma_smooth = scipy.ndimage.median_filter(chroma_filter, size=(1, 9))
+             chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
+             chroma_cens = librosa.feature.chroma_cens(y=y, sr=sr)
+
+             progress(0.8, desc="Creating visualizations...")
+             fig, axes = plt.subplots(3, 2, figsize=(15, 12))
+             axes = axes.flatten()
+
+             for i, (chroma, title) in enumerate([
+                 (chroma_orig, 'Original Chroma (CQT)'),
+                 (chroma_harm, 'Harmonic Chroma'),
+                 (chroma_filter, 'Non-local Filtered'),
+                 (chroma_smooth, 'Median Filtered'),
+                 (chroma_stft, 'Chroma (STFT)'),
+                 (chroma_cens, 'CENS Features')
+             ]):
+                 librosa.display.specshow(chroma, y_axis='chroma', x_axis='time', ax=axes[i])
+                 axes[i].set_title(title)
+
+             plt.tight_layout()
+             plot_path = self.save_plot(fig, "chroma_features")
+             if not plot_path:
+                 return None, None, "Failed to save chroma visualizations"
+
+             summary = "Chroma feature analysis complete! Visualizations show different chroma extraction methods for harmonic analysis."
+             progress(1.0, desc="Chroma analysis complete!")
+             return plot_path, summary, None
+
+         except Exception as e:
+             logger.error(f"Error processing chroma features: {str(e)}")
+             return None, None, f"Error processing chroma features: {str(e)}"
+
+     def generate_patches(self, audio_path: str, sr: int = 16000, patch_duration: float = 5.0,
+                          hop_duration: float = 1.0, progress=gr.Progress()) -> Tuple[Optional[str], Optional[str], Optional[str]]:
+         """Generate fixed-duration patches for transformer input."""
+         if not audio_path or not Path(audio_path).exists():
+             return None, None, "Invalid or missing audio file"
+
+         try:
+             progress(0.1, desc="Loading audio...")
+             y, sr = librosa.load(audio_path, sr=sr)
+
+             progress(0.3, desc="Computing mel spectrogram...")
+             hop_length = 512
+             S_mel = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=hop_length, n_mels=80)
+             S_dB = librosa.power_to_db(S_mel, ref=np.max)
+
+             progress(0.5, desc="Generating patches...")
+             patch_frames = librosa.time_to_frames(patch_duration, sr=sr, hop_length=hop_length)
+             hop_frames = librosa.time_to_frames(hop_duration, sr=sr, hop_length=hop_length)
+             patches = librosa.util.frame(S_dB, frame_length=patch_frames, hop_length=hop_frames)
+
+             progress(0.8, desc="Creating visualizations...")
+             num_patches_to_show = min(6, patches.shape[-1])
+             fig, axes = plt.subplots(2, 3, figsize=(18, 8))
+             axes = axes.flatten()
+
+             for i in range(num_patches_to_show):
+                 librosa.display.specshow(patches[..., i], y_axis='mel', x_axis='time',
+                                          ax=axes[i], sr=sr, hop_length=hop_length)
+                 axes[i].set_title(f'Patch {i+1}')
+
+             for i in range(num_patches_to_show, len(axes)):
+                 axes[i].set_visible(False)
+
+             plt.tight_layout()
+             plot_path = self.save_plot(fig, "patches")
+             if not plot_path:
+                 return None, None, "Failed to save patch visualizations"
+
+             summary = f"""
+ **Patch Generation Summary:**
+ - Total patches generated: {patches.shape[-1]}
+ - Patch duration: {patch_duration:.1f} seconds
+ - Hop duration: {hop_duration:.1f} seconds
+ - Patch shape (mels, time, patches): {patches.shape}
+ - Each patch covers {patch_frames} time frames
+ """
+
+             progress(1.0, desc="Patch generation complete!")
+             return plot_path, summary, None
+
+         except Exception as e:
+             logger.error(f"Error generating patches: {str(e)}")
+             return None, None, f"Error generating patches: {str(e)}"
+
+ def create_gradio_interface() -> gr.Blocks:
+     """Create a modular Gradio interface for audio analysis."""
+     analyzer = AudioAnalyzer()
+
+     with gr.Blocks(title="🎵 Audio Analysis Suite", theme=gr.themes.Soft()) as demo:
+         gr.Markdown("""
+ # 🎵 Audio Analysis Suite
+
+ Analyze audio from YouTube videos or uploaded files. Extract features or generate transformer patches for deep learning applications.
+
+ **Features:**
+ - 📊 **Basic Features**: Waveform, Mel Spectrogram, MFCC, Spectral Analysis, Tempo Detection
+ - 🎼 **Chroma Features**: Harmonic content analysis with multiple extraction methods
+ - 🧩 **Transformer Patches**: Fixed-duration patches for deep learning
+
+ **Requirements**: Dependencies are automatically installed in Hugging Face Spaces via `requirements.txt`.
+         """)
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 gr.Markdown("### 📁 Audio Input")
+                 with gr.Group():
+                     gr.Markdown("**Download from YouTube** (Supported formats: MP3, WAV, etc.)")
+                     youtube_url = gr.Textbox(
+                         label="YouTube URL",
+                         placeholder="https://www.youtube.com/watch?v=...",
+                     )
+                     download_btn = gr.Button("📥 Download Audio", variant="primary")
+                     download_status = gr.Textbox(label="Download Status", interactive=False)
+
+                 with gr.Group():
+                     gr.Markdown("**Or upload audio file** (Supported formats: MP3, WAV, FLAC, etc.)")
+                     audio_file = gr.Audio(
+                         label="Upload Audio File",
+                         type="filepath",
+                     )
+
+             with gr.Column(scale=2):
+                 gr.Markdown("### 🔍 Analysis Results")
+                 with gr.Tabs():
+                     with gr.Tab("📊 Basic Features"):
+                         basic_plot = gr.Image(label="Feature Visualizations")
+                         basic_summary = gr.Markdown(label="Feature Summary")
+                         basic_btn = gr.Button("🔍 Analyze Basic Features", variant="secondary")
+
+                     with gr.Tab("🎼 Chroma Features"):
+                         chroma_plot = gr.Image(label="Chroma Visualizations")
+                         chroma_summary = gr.Markdown(label="Chroma Summary")
+                         chroma_btn = gr.Button("🎼 Analyze Chroma Features", variant="secondary")
+
+                     with gr.Tab("🧩 Transformer Patches"):
+                         with gr.Row():
+                             patch_duration = gr.Slider(
+                                 label="Patch Duration (seconds)",
+                                 minimum=1.0, maximum=10.0, value=5.0, step=0.5,
+                             )
+                             hop_duration = gr.Slider(
+                                 label="Hop Duration (seconds)",
+                                 minimum=0.1, maximum=5.0, value=1.0, step=0.1,
+                             )
+                         patches_plot = gr.Image(label="Generated Patches")
+                         patches_summary = gr.Markdown(label="Patch Summary")
+                         patches_btn = gr.Button("🧩 Generate Patches", variant="secondary")
+
+                 error_output = gr.Textbox(label="Error Messages", interactive=False)
+
+         gr.Markdown("""
+ ### ℹ️ Usage Tips
+ - **Processing Limits**: 60s for basic features, 30s for chroma features for fast response
+ - **YouTube Downloads**: Ensure URLs are valid and respect YouTube's terms of service
+ - **Visualizations**: High-quality, suitable for research and education
+ - **Storage**: Temporary files are cleaned up when the interface closes
+ - **Support**: For issues, check the [GitHub repository](https://github.com/your-repo)
+         """)
+
+         # Event handlers
+         download_btn.click(
+             fn=analyzer.download_youtube_audio,
+             inputs=[youtube_url],
+             outputs=[audio_file, download_status]
+         )
+
+         basic_btn.click(
+             fn=analyzer.extract_basic_features,
+             inputs=[audio_file],
+             outputs=[basic_plot, basic_summary, error_output]
+         )
+
+         chroma_btn.click(
+             fn=analyzer.extract_chroma_features,
+             inputs=[audio_file],
+             outputs=[chroma_plot, chroma_summary, error_output]
+         )
+
+         patches_btn.click(
+             fn=analyzer.generate_patches,
+             inputs=[audio_file, patch_duration, hop_duration],
+             outputs=[patches_plot, patches_summary, error_output]
+         )
+
+         audio_file.change(
+             fn=analyzer.extract_basic_features,
+             inputs=[audio_file],
+             outputs=[basic_plot, basic_summary, error_output]
+         )
+
+         demo.unload(fn=analyzer.cleanup)
+
+     return demo
+

  if __name__ == "__main__":
+     demo = create_gradio_interface()
+     demo.launch()
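For anyone pulling this commit locally, here is a minimal sketch of how the new AudioAnalyzer class can be exercised without launching the interface. It assumes the file is importable as `app` and that a local `sample.wav` exists (both are assumptions, not part of the commit); the no-op lambda stands in for `gr.Progress` when a method is called outside a Gradio event. Running it also needs the packages the module imports (gradio, librosa, matplotlib, numpy, scipy), and downloads additionally need the `yt-dlp` and `ffmpeg` binaries on PATH.

# Hypothetical smoke test for the committed AudioAnalyzer (not part of the commit).
from app import AudioAnalyzer

analyzer = AudioAnalyzer()
try:
    # progress is normally injected by Gradio; pass a no-op stand-in here
    plot_path, summary, error = analyzer.extract_basic_features(
        "sample.wav", progress=lambda *args, **kwargs: None
    )
    if error:
        print(f"Analysis failed: {error}")
    else:
        print(f"Feature plot written to {plot_path}")
        print(summary)
finally:
    analyzer.cleanup()  # removes the temporary directory and tracked plot files

The same pattern works for extract_chroma_features and generate_patches, since all three methods share the (plot_path, summary, error) return convention.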