latterworks committed (verified)
Commit d7fc5bc · 1 Parent(s): d870ff6

Update app.py

Files changed (1):
  1. app.py +387 -548
app.py CHANGED
@@ -1,611 +1,450 @@
 import gradio as gr
-import librosa
-import numpy as np
-import soundfile as sf
 import os
 import tempfile
-import shutil
 from pathlib import Path
 import warnings
-warnings.filterwarnings("ignore")

-# Import for advanced features
-try:
-    from spleeter.separator import Separator
-    SPLEETER_AVAILABLE = True
-except ImportError:
-    SPLEETER_AVAILABLE = False
-    print("Spleeter not available - source separation disabled")

-try:
-    import scipy.signal
-    from scipy.spatial.distance import euclidean
-    from dtw import dtw
-    ADVANCED_FEATURES = True
-except ImportError:
-    ADVANCED_FEATURES = False
-    print("Advanced features not available")
-
-class AudioEngine:
-    """Clean, professional audio processing engine"""
-
     def __init__(self):
         self.temp_dir = tempfile.mkdtemp()
-        self.separators = {}  # Cache for Spleeter models

-    def analyze_audio(self, audio_path):
-        """Extract comprehensive audio features"""
         try:
-            # Load audio
-            y, sr = librosa.load(audio_path)
-
-            # Basic properties
-            duration = len(y) / sr
-            tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
             # Spectral features
-            spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
-            spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
-            zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(y))
-
-            # Energy features
-            rms_energy = np.mean(librosa.feature.rms(y=y))
-
-            # Pitch estimation
-            pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
-            pitch_values = []
-            for t in range(pitches.shape[1]):
-                index = magnitudes[:, t].argmax()
-                pitch = pitches[index, t]
-                if pitch > 0:
-                    pitch_values.append(pitch)
-
-            avg_pitch = np.mean(pitch_values) if pitch_values else 0
-
-            return {
-                'success': True,
-                'duration': round(duration, 2),
-                'tempo': round(tempo, 1),
-                'sample_rate': sr,
-                'spectral_centroid': round(spectral_centroid, 2),
-                'spectral_rolloff': round(spectral_rolloff, 2),
-                'zero_crossing_rate': round(zero_crossing_rate, 4),
-                'rms_energy': round(rms_energy, 4),
-                'average_pitch': round(avg_pitch, 2),
-                'pitch_count': len(pitch_values),
-                'beats_detected': len(beats)
-            }

         except Exception as e:
-            return {'success': False, 'error': str(e)}

-    def separate_vocals(self, audio_path, model_type="2stems"):
-        """Separate vocals using Spleeter"""
-        if not SPLEETER_AVAILABLE:
-            return {'success': False, 'error': 'Spleeter not available'}

         try:
-            # Load or create separator
-            if model_type not in self.separators:
-                self.separators[model_type] = Separator(f'spleeter:{model_type}-16kHz')

-            separator = self.separators[model_type]

-            # Create output directory
-            output_dir = os.path.join(self.temp_dir, f"separation_{np.random.randint(10000)}")
-            os.makedirs(output_dir, exist_ok=True)

-            # Separate
-            separator.separate_to_file(audio_path, output_dir)

-            # Get results
-            audio_name = Path(audio_path).stem
-            result_dir = os.path.join(output_dir, audio_name)

-            if model_type == "2stems":
-                vocals_path = os.path.join(result_dir, "vocals.wav")
-                accompaniment_path = os.path.join(result_dir, "accompaniment.wav")
-
-                return {
-                    'success': True,
-                    'vocals': vocals_path if os.path.exists(vocals_path) else None,
-                    'accompaniment': accompaniment_path if os.path.exists(accompaniment_path) else None
-                }
-
-            elif model_type == "4stems":
-                vocals_path = os.path.join(result_dir, "vocals.wav")
-                drums_path = os.path.join(result_dir, "drums.wav")
-                bass_path = os.path.join(result_dir, "bass.wav")
-                other_path = os.path.join(result_dir, "other.wav")
-
-                return {
-                    'success': True,
-                    'vocals': vocals_path if os.path.exists(vocals_path) else None,
-                    'drums': drums_path if os.path.exists(drums_path) else None,
-                    'bass': bass_path if os.path.exists(bass_path) else None,
-                    'other': other_path if os.path.exists(other_path) else None
-                }

-        except Exception as e:
-            return {'success': False, 'error': str(e)}
-
-    def apply_effects(self, audio_path, pitch_shift=0, reverb=0):
-        """Apply vocal effects"""
-        try:
-            y, sr = librosa.load(audio_path)

-            # Apply pitch shift
-            if pitch_shift != 0:
-                y = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch_shift)

-            # Apply reverb (simple convolution)
-            if reverb > 0 and ADVANCED_FEATURES:
-                reverb_length = int(0.5 * sr)
-                impulse = np.random.randn(reverb_length) * np.exp(-np.arange(reverb_length) / (sr * 0.1))
-                y = scipy.signal.convolve(y, impulse * reverb, mode='same')
-                y = y / np.max(np.abs(y))  # Normalize

-            # Save processed audio
-            output_path = os.path.join(self.temp_dir, f"processed_{np.random.randint(10000)}.wav")
-            sf.write(output_path, y, sr)

-            return {'success': True, 'output': output_path}

-        except Exception as e:
-            return {'success': False, 'error': str(e)}
-
-    def extract_vocal_features(self, audio_path):
-        """Extract features for style coaching"""
-        try:
-            y, sr = librosa.load(audio_path)

-            # Pitch analysis
-            pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
-            pitch_values = []
-            for t in range(pitches.shape[1]):
-                index = magnitudes[:, t].argmax()
-                pitch = pitches[index, t]
-                if pitch > 0:
-                    pitch_values.append(pitch)

-            if not pitch_values:
-                return {'success': False, 'error': 'No pitch detected'}

-            # Basic vocal metrics
-            mean_pitch = np.mean(pitch_values)
-            pitch_std = np.std(pitch_values)
-            pitch_range = max(pitch_values) - min(pitch_values)

-            # Tempo
-            tempo, _ = librosa.beat.beat_track(y=y, sr=sr)

-            # Spectral features
-            spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))

-            # Energy
-            rms_energy = np.mean(librosa.feature.rms(y=y))

-            return {
-                'success': True,
-                'mean_pitch': mean_pitch,
-                'pitch_std': pitch_std,
-                'pitch_range': pitch_range,
-                'tempo': tempo,
-                'spectral_centroid': spectral_centroid,
-                'rms_energy': rms_energy
-            }

         except Exception as e:
-            return {'success': False, 'error': str(e)}

-    def compare_vocal_styles(self, user_features, reference_features_list):
-        """Compare user vocals to reference style"""
-        if not ADVANCED_FEATURES:
-            return {'success': False, 'error': 'Advanced features not available'}

         try:
-            # Average reference features
-            ref_avg = {}
-            for key in ['mean_pitch', 'pitch_std', 'pitch_range', 'tempo', 'spectral_centroid', 'rms_energy']:
-                values = [ref[key] for ref in reference_features_list if key in ref]
-                ref_avg[key] = np.mean(values) if values else 0
-
-            # Calculate differences
-            pitch_diff = abs(user_features['mean_pitch'] - ref_avg['mean_pitch'])
-            tempo_diff = abs(user_features['tempo'] - ref_avg['tempo'])
-            timbre_diff = abs(user_features['spectral_centroid'] - ref_avg['spectral_centroid'])
-            energy_diff = abs(user_features['rms_energy'] - ref_avg['rms_energy'])
-
-            # Generate feedback
-            feedback = []
-
-            if pitch_diff > 50:
-                feedback.append(f"🎵 Pitch: Your average pitch differs by {pitch_diff:.1f} Hz. Practice matching the reference key.")
-            else:
-                feedback.append("🎵 Pitch: Good pitch accuracy!")
-
-            if tempo_diff > 10:
-                feedback.append(f"⏱️ Tempo: Your tempo differs by {tempo_diff:.1f} BPM. Work on timing consistency.")
-            else:
-                feedback.append("⏱️ Tempo: Good timing!")
-
-            if timbre_diff > 500:
-                feedback.append("🗣️ Timbre: Try adjusting your vocal tone to match the reference style.")
-            else:
-                feedback.append("🗣️ Timbre: Good vocal tone match!")
-
-            if energy_diff > 0.1:
-                feedback.append("🔊 Energy: Adjust your vocal intensity to match the reference.")
-            else:
-                feedback.append("🔊 Energy: Good energy level!")
-
-            overall_score = max(0, 100 - (pitch_diff/2 + tempo_diff + timbre_diff/10 + energy_diff*100))
-
-            return {
-                'success': True,
-                'score': round(overall_score, 1),
-                'feedback': feedback,
-                'metrics': {
-                    'pitch_diff': round(pitch_diff, 1),
-                    'tempo_diff': round(tempo_diff, 1),
-                    'timbre_diff': round(timbre_diff, 1),
-                    'energy_diff': round(energy_diff, 3)
-                }
-            }

         except Exception as e:
-            return {'success': False, 'error': str(e)}
-
-    def cleanup(self):
-        """Clean up temporary files"""
-        try:
-            if os.path.exists(self.temp_dir):
-                shutil.rmtree(self.temp_dir)
-        except Exception:
-            pass

-# Global engine instance
-engine = AudioEngine()

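For a feel for the removed scoring heuristic: the four difference terms are weighted so that 2 Hz of pitch error, 1 BPM of tempo error, 10 Hz of centroid error, and 0.01 of RMS error each cost one point. A worked example with illustrative values:

```python
# Illustrative values plugged into the removed overall_score formula
pitch_diff, tempo_diff, timbre_diff, energy_diff = 20.0, 5.0, 200.0, 0.02
overall_score = max(0, 100 - (pitch_diff/2 + tempo_diff + timbre_diff/10 + energy_diff*100))
print(overall_score)  # 100 - (10 + 5 + 20 + 2) = 63.0
```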
-def format_analysis_results(analysis):
-    """Format analysis results for display"""
-    if not analysis['success']:
-        return f"❌ Analysis failed: {analysis['error']}"
-
-    return f"""📊 Audio Analysis Results

-🎵 Basic Properties:
-• Duration: {analysis['duration']} seconds
-• Sample Rate: {analysis['sample_rate']} Hz
-• Tempo: {analysis['tempo']} BPM
-
-🔊 Audio Characteristics:
-• Spectral Centroid: {analysis['spectral_centroid']} Hz
-• Spectral Rolloff: {analysis['spectral_rolloff']} Hz
-• Zero Crossing Rate: {analysis['zero_crossing_rate']}
-• RMS Energy: {analysis['rms_energy']}
-
-🎤 Vocal Information:
-• Average Pitch: {analysis['average_pitch']} Hz
-• Pitch Points Detected: {analysis['pitch_count']}
-• Beats Detected: {analysis['beats_detected']}"""
-
-def process_audio_separation(audio_file, separation_mode):
-    """Main audio separation function"""
-    if not audio_file:
-        return "❌ Please upload an audio file", None, None, None, None, ""
-
-    if not SPLEETER_AVAILABLE:
-        return "❌ Spleeter not available for source separation", None, None, None, None, ""

-    try:
-        # Analyze audio first
-        analysis = engine.analyze_audio(audio_file)
-        analysis_text = format_analysis_results(analysis)
-
-        # Separate audio
-        model_type = "2stems" if "2-stem" in separation_mode else "4stems"
-        separation_result = engine.separate_vocals(audio_file, model_type)
-
-        if not separation_result['success']:
-            return f"❌ Separation failed: {separation_result['error']}", None, None, None, None, analysis_text
-
-        if model_type == "2stems":
-            return (
-                "✅ 2-stem separation completed successfully!",
-                separation_result.get('vocals'),
-                separation_result.get('accompaniment'),
-                None,
-                None,
-                analysis_text
-            )
-        else:
-            return (
-                "✅ 4-stem separation completed successfully!",
-                separation_result.get('vocals'),
-                separation_result.get('drums'),
-                separation_result.get('bass'),
-                separation_result.get('other'),
-                analysis_text
-            )
-
-    except Exception as e:
-        return f"❌ Processing error: {str(e)}", None, None, None, None, ""

-def process_vocal_effects(audio_file, pitch_shift, reverb_amount):
-    """Apply vocal effects to audio"""
-    if not audio_file:
-        return "Please upload an audio file", None, ""

-    try:
-        # Analyze original
-        analysis = engine.analyze_audio(audio_file)
-        analysis_text = format_analysis_results(analysis)
-
-        # Apply effects
-        effects_result = engine.apply_effects(audio_file, pitch_shift, reverb_amount)
-
-        if not effects_result['success']:
-            return f"❌ Effects failed: {effects_result['error']}", None, analysis_text
-
-        effects_applied = []
-        if pitch_shift != 0:
-            effects_applied.append(f"Pitch: {pitch_shift:+.1f} semitones")
-        if reverb_amount > 0:
-            effects_applied.append(f"Reverb: {reverb_amount:.2f}")
-
-        status = f"✅ Effects applied: {', '.join(effects_applied)}" if effects_applied else "✅ Audio processed (no effects)"
-
-        return status, effects_result['output'], analysis_text
-
-    except Exception as e:
-        return f"❌ Processing error: {str(e)}", None, ""

-def process_style_coaching(reference_files, user_audio):
-    """Style coaching analysis"""
-    if not reference_files or len(reference_files) < 2:
-        return "❌ Upload at least 2 reference tracks", "", ""
-
-    if not user_audio:
-        return "❌ Please record or upload your performance", "", ""

-    if not SPLEETER_AVAILABLE or not ADVANCED_FEATURES:
-        return "❌ Style coaching requires advanced features", "", ""
-
-    try:
-        # Process reference tracks
-        ref_features = []
-        ref_status = []
-
-        for i, ref_file in enumerate(reference_files[:5]):
-            # Separate vocals
-            separation_result = engine.separate_vocals(ref_file.name, "2stems")
-            if separation_result['success'] and separation_result.get('vocals'):
-                # Extract features
-                features = engine.extract_vocal_features(separation_result['vocals'])
-                if features['success']:
-                    ref_features.append(features)
-                    ref_status.append(f"✅ Reference {i+1}: Processed")
-                else:
-                    ref_status.append(f"❌ Reference {i+1}: Feature extraction failed")
-            else:
-                ref_status.append(f"❌ Reference {i+1}: Vocal separation failed")
-
-        if len(ref_features) < 2:
-            return "❌ Need at least 2 valid reference tracks", "\n".join(ref_status), ""
-
-        # Process user audio
-        user_separation = engine.separate_vocals(user_audio, "2stems")
-        if not user_separation['success'] or not user_separation.get('vocals'):
-            return "❌ Could not separate vocals from your performance", "\n".join(ref_status), ""
-
-        user_features = engine.extract_vocal_features(user_separation['vocals'])
-        if not user_features['success']:
-            return "❌ Could not analyze your vocal features", "\n".join(ref_status), ""
-
-        # Compare styles
-        comparison = engine.compare_vocal_styles(user_features, ref_features)
-        if not comparison['success']:
-            return f"❌ Style comparison failed: {comparison['error']}", "\n".join(ref_status), ""
-
-        # Format feedback
-        feedback_text = f"""🎯 Vocal Style Coaching Results
-
-📊 Overall Score: {comparison['score']}/100
-
-🎵 Detailed Feedback:
-{chr(10).join(comparison['feedback'])}

-📈 Technical Metrics:
-• Pitch Difference: {comparison['metrics']['pitch_diff']} Hz
-• Tempo Difference: {comparison['metrics']['tempo_diff']} BPM
-• Timbre Difference: {comparison['metrics']['timbre_diff']} Hz
-• Energy Difference: {comparison['metrics']['energy_diff']}
-
-🎯 Recommendations:
-{f"🔥 Excellent! You're very close to the target style." if comparison['score'] > 80 else
-f"📈 Good progress! Focus on the areas mentioned above." if comparison['score'] > 60 else
-f"💪 Keep practicing! Work on basic vocal technique first."}
-
-References analyzed: {len(ref_features)}/5"""
-
-        return f"✅ Style coaching complete! Score: {comparison['score']}/100", "\n".join(ref_status), feedback_text

-    except Exception as e:
-        return f"❌ Coaching failed: {str(e)}", "", ""
-
-# Create main interface
-def create_app():

-    with gr.Blocks(title="Audio Singing Helper") as app:
-
-        gr.HTML("""
-        <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 10px; margin-bottom: 20px;">
-            <h1>🎤 Audio Singing Helper</h1>
-            <p>Professional audio processing for singers and musicians</p>
-        </div>
-        """)

-        with gr.Tabs():
-
-            # Audio Separation Tab
-            with gr.Tab("🎵 Audio Separation"):
-                gr.Markdown("### Separate vocals from instrumental tracks")
-
-                with gr.Row():
-                    with gr.Column():
-                        sep_audio_input = gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"])
-                        sep_mode = gr.Dropdown(
-                            choices=["2-stem (Vocals + Instrumental)", "4-stem (Vocals + Drums + Bass + Other)"],
-                            value="2-stem (Vocals + Instrumental)",
-                            label="Separation Mode"
-                        )
-                        sep_button = gr.Button("🎯 Separate Audio", variant="primary")
-
-                    with gr.Column():
-                        sep_status = gr.Textbox(label="Status", lines=2, interactive=False)
-                        sep_analysis = gr.Textbox(label="Audio Analysis", lines=12, interactive=False)

-                with gr.Row():
-                    sep_vocals = gr.Audio(label="🎤 Vocals", show_download_button=True)
-                    sep_instrumental = gr.Audio(label="🎼 Instrumental/Drums", show_download_button=True)
-
-                with gr.Row():
-                    sep_bass = gr.Audio(label="🎸 Bass", show_download_button=True)
-                    sep_other = gr.Audio(label="🎹 Other", show_download_button=True)
-
-            # Vocal Effects Tab
-            with gr.Tab("🎛️ Vocal Effects"):
-                gr.Markdown("### Apply professional vocal effects")
-
-                with gr.Row():
-                    with gr.Column():
-                        fx_audio_input = gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"])
-                        fx_pitch = gr.Slider(-12, 12, 0, step=0.5, label="Pitch Shift (semitones)")
-                        fx_reverb = gr.Slider(0, 0.5, 0, step=0.05, label="Reverb Amount")
-                        fx_button = gr.Button("🎵 Apply Effects", variant="primary")
-
-                    with gr.Column():
-                        fx_status = gr.Textbox(label="Status", lines=2, interactive=False)
-                        fx_analysis = gr.Textbox(label="Audio Analysis", lines=10, interactive=False)
-
-                fx_output = gr.Audio(label="🎧 Processed Audio", show_download_button=True)
-
-            # Live Recording Tab
-            with gr.Tab("🎙️ Live Recording"):
-                gr.Markdown("### Record and process your voice in real-time")

-                with gr.Row():
-                    with gr.Column():
-                        live_audio = gr.Audio(type="filepath", sources=["microphone"], label="Record Your Voice")
-                        live_pitch = gr.Slider(-12, 12, 0, step=0.5, label="Pitch Correction")
-                        live_reverb = gr.Slider(0, 0.5, 0, step=0.05, label="Reverb")
-                        live_button = gr.Button("🎤 Process Recording", variant="primary")
-
-                    with gr.Column():
-                        live_status = gr.Textbox(label="Status", lines=2, interactive=False)
-                        live_analysis = gr.Textbox(label="Recording Analysis", lines=10, interactive=False)
-
-                live_output = gr.Audio(label="🎧 Processed Recording", show_download_button=True)
-
-            # Style Coaching Tab
-            with gr.Tab("🎭 Style Coaching"):
-                gr.Markdown("### Get personalized vocal coaching feedback")
-
-                with gr.Row():
-                    with gr.Column():
-                        coach_refs = gr.File(
-                            label="Reference Tracks (2-5 files)",
-                            file_count="multiple",
-                            file_types=["audio"]
                         )
-                        coach_user = gr.Audio(
-                            type="filepath",
-                            label="Your Performance",
-                            sources=["upload", "microphone"]
                         )
-                        coach_button = gr.Button("🎯 Get Coaching", variant="primary")

-                    with gr.Column():
-                        coach_status = gr.Textbox(label="Status", lines=3, interactive=False)
-                        coach_refs_status = gr.Textbox(label="Reference Processing", lines=8, interactive=False)
-
-                coach_feedback = gr.Textbox(label="🎯 Coaching Feedback", lines=15, interactive=False)
-
-            # Help Tab
-            with gr.Tab("ℹ️ Help"):
-                gr.Markdown("""
-                # 🎤 Audio Singing Helper - User Guide
-
-                ## Features
-
-                ### 🎵 Audio Separation
-                - Upload any song to separate vocals from instruments
-                - Choose 2-stem (vocals + instrumental) or 4-stem (vocals + drums + bass + other)
-                - Get detailed audio analysis of your tracks
-
-                ### 🎛️ Vocal Effects
-                - Apply pitch shifting (-12 to +12 semitones)
-                - Add reverb for spatial depth
-                - Process any audio file with professional effects
-
-                ### 🎙️ Live Recording
-                - Record directly from your microphone
-                - Apply real-time pitch correction and reverb
-                - Perfect for vocal practice and experimentation
-
-                ### 🎭 Style Coaching
-                - Upload 2-5 reference tracks from artists you want to emulate
-                - Record or upload your performance
-                - Get AI-powered feedback on pitch, timing, and vocal characteristics
-                - Receive a score and specific improvement suggestions
-
-                ## Tips for Best Results
-
-                - **Use high-quality audio files** - better input = better results
-                - **Keep files under 5 minutes** for faster processing
-                - **For style coaching**: Choose references from similar genres
-                - **Record in quiet environments** for best analysis
-
-                ## Supported Formats
-                - Input: MP3, WAV, FLAC, M4A, OGG
-                - Output: High-quality WAV files
-
-                ## Technical Requirements
-                - Some features require additional dependencies
-                - Processing time varies based on file length and complexity
-
-                ---
-                Built for singers and musicians worldwide 🌍
-                """)
-
-        # Connect all the event handlers
-        sep_button.click(
-            process_audio_separation,
-            inputs=[sep_audio_input, sep_mode],
-            outputs=[sep_status, sep_vocals, sep_instrumental, sep_bass, sep_other, sep_analysis]
-        )
-
-        fx_button.click(
-            process_vocal_effects,
-            inputs=[fx_audio_input, fx_pitch, fx_reverb],
-            outputs=[fx_status, fx_output, fx_analysis]
-        )
-
-        live_button.click(
-            process_vocal_effects,
-            inputs=[live_audio, live_pitch, live_reverb],
-            outputs=[live_status, live_output, live_analysis]
-        )
-
-        coach_button.click(
-            process_style_coaching,
-            inputs=[coach_refs, coach_user],
-            outputs=[coach_status, coach_refs_status, coach_feedback]
-        )

-    return app

 if __name__ == "__main__":
-    app = create_app()
-    app.launch()

 import gradio as gr
+import subprocess
 import os
 import tempfile
+import librosa
+import librosa.display
+import matplotlib.pyplot as plt
+import numpy as np
+import scipy.ndimage
 from pathlib import Path
 import warnings
+warnings.filterwarnings('ignore')

+# Set matplotlib backend for web display
+plt.switch_backend('Agg')

+class AudioAnalyzer:
     def __init__(self):
         self.temp_dir = tempfile.mkdtemp()
+
+    def download_youtube_audio(self, video_url, progress=gr.Progress()):
+        """Download audio from YouTube video using yt-dlp."""
+        if not video_url:
+            return None, "Please provide a YouTube URL"
+
+        progress(0.1, desc="Initializing download...")
+
+        output_dir = os.path.join(self.temp_dir, "downloaded_audio")
+        os.makedirs(output_dir, exist_ok=True)
+
+        # yt-dlp command to extract audio in mp3 format
+        command = [
+            "yt-dlp",
+            "-x",
+            "--audio-format", "mp3",
+            "-o", os.path.join(output_dir, "%(title)s.%(ext)s"),
+            "--no-playlist",
+            "--restrict-filenames",
+            video_url
+        ]
+
+        try:
+            progress(0.3, desc="Downloading audio...")
+            result = subprocess.run(command, check=True, capture_output=True, text=True)
+
+            # Find the downloaded file
+            for file in os.listdir(output_dir):
+                if file.endswith('.mp3'):
+                    file_path = os.path.join(output_dir, file)
+                    progress(1.0, desc="Download complete!")
+                    return file_path, f"Successfully downloaded: {file}"
+
+            return None, "Download completed but no audio file found"
+
+        except FileNotFoundError:
+            return None, "yt-dlp not found. Please install it: pip install yt-dlp"
+        except subprocess.CalledProcessError as e:
+            return None, f"Download failed: {e.stderr}"
+        except Exception as e:
+            return None, f"Unexpected error: {str(e)}"

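The same download could also go through yt-dlp's Python API rather than a subprocess, which removes the PATH dependency and surfaces failures as exceptions. A minimal sketch under that assumption (the `FFmpegExtractAudio` post-processor still requires ffmpeg to be installed; the function name is illustrative):

```python
from pathlib import Path
from yt_dlp import YoutubeDL

def download_audio_api(url: str, out_dir: str) -> str:
    """Sketch: download best audio and convert to mp3 via yt-dlp's Python API."""
    opts = {
        "format": "bestaudio/best",
        "outtmpl": f"{out_dir}/%(title)s.%(ext)s",
        "noplaylist": True,
        "restrictfilenames": True,
        "postprocessors": [{"key": "FFmpegExtractAudio", "preferredcodec": "mp3"}],
    }
    with YoutubeDL(opts) as ydl:
        info = ydl.extract_info(url, download=True)
        # prepare_filename reports the pre-postprocessing name; the mp3 replaces it
        return str(Path(ydl.prepare_filename(info)).with_suffix(".mp3"))
```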
+    def extract_basic_features(self, audio_path, sr=16000, progress=gr.Progress()):
+        """Extract basic audio features and create visualizations."""
+        if not audio_path or not os.path.exists(audio_path):
+            return None, None, "Invalid audio file"

         try:
+            progress(0.1, desc="Loading audio...")
+            y, sr = librosa.load(audio_path, sr=sr)
+            duration = librosa.get_duration(y=y, sr=sr)
+
+            # Limit to first 60 seconds for processing speed
+            max_duration = 60
+            if duration > max_duration:
+                y = y[:sr * max_duration]
+                duration = max_duration
+
+            progress(0.3, desc="Computing features...")
+
+            # Basic features
+            features = {}
+            features['duration'] = duration
+            features['sample_rate'] = sr
+            features['samples'] = len(y)
+
+            # Mel spectrogram
+            progress(0.5, desc="Computing mel spectrogram...")
+            hop_length = 512
+            S_mel = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=hop_length)
+            S_dB = librosa.power_to_db(S_mel, ref=np.max)
+
+            # Other features
+            features['tempo'], _ = librosa.beat.beat_track(y=y, sr=sr)
+            features['mfcc'] = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
+            features['spectral_centroid'] = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
+            features['spectral_rolloff'] = librosa.feature.spectral_rolloff(y=y, sr=sr)[0]
+            features['zero_crossing_rate'] = librosa.feature.zero_crossing_rate(y)[0]
+
+            progress(0.8, desc="Creating visualizations...")
+
+            # Create visualizations
+            fig, axes = plt.subplots(2, 2, figsize=(15, 10))
+
+            # Waveform (time axis from sample indices, not frame indices)
+            time_axis = librosa.samples_to_time(np.arange(len(y)), sr=sr)
+            axes[0, 0].plot(time_axis, y)
+            axes[0, 0].set_title('Waveform')
+            axes[0, 0].set_xlabel('Time (s)')
+            axes[0, 0].set_ylabel('Amplitude')
+
+            # Mel spectrogram
+            librosa.display.specshow(S_dB, sr=sr, hop_length=hop_length,
+                                     x_axis='time', y_axis='mel', ax=axes[0, 1])
+            axes[0, 1].set_title('Mel Spectrogram')
+
+            # MFCC
+            librosa.display.specshow(features['mfcc'], sr=sr, x_axis='time', ax=axes[1, 0])
+            axes[1, 0].set_title('MFCC')

             # Spectral features
+            times = librosa.frames_to_time(range(len(features['spectral_centroid'])), sr=sr, hop_length=hop_length)
+            axes[1, 1].plot(times, features['spectral_centroid'], label='Spectral Centroid')
+            axes[1, 1].plot(times, features['spectral_rolloff'], label='Spectral Rolloff')
+            axes[1, 1].set_title('Spectral Features')
+            axes[1, 1].set_xlabel('Time (s)')
+            axes[1, 1].legend()
+
+            plt.tight_layout()
+
+            # Save plot
+            plot_path = os.path.join(self.temp_dir, f"basic_features_{np.random.randint(10000)}.png")
+            plt.savefig(plot_path, dpi=150, bbox_inches='tight')
+            plt.close()
+
+            # Create summary text
+            summary = f"""
+**Audio Summary:**
+- Duration: {duration:.2f} seconds
+- Sample Rate: {sr} Hz
+- Estimated Tempo: {features['tempo']:.1f} BPM
+- Number of Samples: {len(y):,}
+
+**Feature Shapes:**
+- MFCC: {features['mfcc'].shape}
+- Spectral Centroid: {features['spectral_centroid'].shape}
+- Spectral Rolloff: {features['spectral_rolloff'].shape}
+- Zero Crossing Rate: {features['zero_crossing_rate'].shape}
+"""
+
+            progress(1.0, desc="Analysis complete!")
+            return plot_path, summary, None

         except Exception as e:
+            return None, None, f"Error processing audio: {str(e)}"

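Stripped of the plotting and progress plumbing, the feature extraction above reduces to a handful of librosa calls; this standalone sketch reproduces the core numbers on librosa's bundled demo clip (any local file path works in place of the example):

```python
import librosa
import numpy as np

# librosa ships a short demo recording; substitute any local audio path
y, sr = librosa.load(librosa.example("trumpet"), sr=16000)

S_mel = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=512)
S_dB = librosa.power_to_db(S_mel, ref=np.max)       # log-scaled mel spectrogram
tempo, beats = librosa.beat.beat_track(y=y, sr=sr)  # global tempo + beat frames
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)  # compact timbre descriptor

print(f"tempo ~ {float(tempo):.1f} BPM, mel {S_dB.shape}, mfcc {mfcc.shape}")
```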
+    def extract_chroma_features(self, audio_path, sr=16000, progress=gr.Progress()):
+        """Extract and visualize enhanced chroma features."""
+        if not audio_path or not os.path.exists(audio_path):
+            return None, "Invalid audio file"

         try:
+            progress(0.1, desc="Loading audio...")
+            y, sr = librosa.load(audio_path, sr=sr)
+
+            # Limit to first 30 seconds for processing speed
+            max_duration = 30
+            if len(y) > sr * max_duration:
+                y = y[:sr * max_duration]
+
+            progress(0.3, desc="Computing chroma variants...")
+
+            # Original chroma
+            chroma_orig = librosa.feature.chroma_cqt(y=y, sr=sr)
+
+            # Harmonic-percussive separation
+            y_harm = librosa.effects.harmonic(y=y, margin=8)
+            chroma_harm = librosa.feature.chroma_cqt(y=y_harm, sr=sr)
+
+            progress(0.6, desc="Applying filters...")
+
+            # Non-local filtering
+            chroma_filter = np.minimum(chroma_harm,
+                                       librosa.decompose.nn_filter(chroma_harm,
+                                                                   aggregate=np.median,
+                                                                   metric='cosine'))
+
+            # Median filtering
+            chroma_smooth = scipy.ndimage.median_filter(chroma_filter, size=(1, 9))
+
+            # STFT-based chroma
+            chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
+
+            # CENS features
+            chroma_cens = librosa.feature.chroma_cens(y=y, sr=sr)
+
+            progress(0.8, desc="Creating visualizations...")
+
+            # Create comprehensive visualization
+            fig, axes = plt.subplots(3, 2, figsize=(15, 12))
+
+            # Original vs Harmonic
+            librosa.display.specshow(chroma_orig, y_axis='chroma', x_axis='time', ax=axes[0, 0])
+            axes[0, 0].set_title('Original Chroma (CQT)')
+
+            librosa.display.specshow(chroma_harm, y_axis='chroma', x_axis='time', ax=axes[0, 1])
+            axes[0, 1].set_title('Harmonic Chroma')
+
+            # Filtered vs Smooth
+            librosa.display.specshow(chroma_filter, y_axis='chroma', x_axis='time', ax=axes[1, 0])
+            axes[1, 0].set_title('Non-local Filtered')
+
+            librosa.display.specshow(chroma_smooth, y_axis='chroma', x_axis='time', ax=axes[1, 1])
+            axes[1, 1].set_title('Median Filtered')
+
+            # STFT vs CENS
+            librosa.display.specshow(chroma_stft, y_axis='chroma', x_axis='time', ax=axes[2, 0])
+            axes[2, 0].set_title('Chroma (STFT)')
+
+            librosa.display.specshow(chroma_cens, y_axis='chroma', x_axis='time', ax=axes[2, 1])
+            axes[2, 1].set_title('CENS Features')
+
+            plt.tight_layout()
+
+            # Save plot
+            plot_path = os.path.join(self.temp_dir, f"chroma_features_{np.random.randint(10000)}.png")
+            plt.savefig(plot_path, dpi=150, bbox_inches='tight')
+            plt.close()
+
+            progress(1.0, desc="Chroma analysis complete!")
+            return plot_path, None

         except Exception as e:
+            return None, f"Error processing chroma features: {str(e)}"

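The harmonic-separation → nn_filter → median-filter chain mirrors librosa's enhanced-chroma recipe. A quick sanity check on any of the variants is to collapse the time axis into a 12-bin pitch-class profile (a sketch; `chroma_smooth` is the array computed above):

```python
import numpy as np

PITCH_CLASSES = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]

# Rows of a librosa chroma matrix are pitch classes starting at C;
# averaging over time gives a rough tonal fingerprint of the excerpt.
profile = chroma_smooth.mean(axis=1)
top3 = sorted(zip(PITCH_CLASSES, profile), key=lambda kv: -kv[1])[:3]
for name, value in top3:
    print(f"{name}: {value:.3f}")  # the three most active pitch classes
```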
+    def generate_patches(self, audio_path, sr=16000, patch_duration=5.0, hop_duration=1.0, progress=gr.Progress()):
+        """Generate fixed-duration patches for transformer input."""
+        if not audio_path or not os.path.exists(audio_path):
+            return None, None, "Invalid audio file"

         try:
+            progress(0.1, desc="Loading audio...")
+            y, sr = librosa.load(audio_path, sr=sr)
+
+            progress(0.3, desc="Computing mel spectrogram...")
+            hop_length = 512
+            S_mel = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=hop_length, n_mels=80)
+            S_dB = librosa.power_to_db(S_mel, ref=np.max)
+
+            progress(0.5, desc="Generating patches...")
+
+            # Convert time to frames
+            patch_frames = librosa.time_to_frames(patch_duration, sr=sr, hop_length=hop_length)
+            hop_frames = librosa.time_to_frames(hop_duration, sr=sr, hop_length=hop_length)
+
+            # Generate patches using librosa.util.frame
+            patches = librosa.util.frame(S_dB, frame_length=patch_frames, hop_length=hop_frames)
+
+            progress(0.8, desc="Creating visualizations...")
+
+            # Visualize patches
+            num_patches_to_show = min(6, patches.shape[-1])
+            fig, axes = plt.subplots(2, 3, figsize=(18, 8))
+            axes = axes.flatten()
+
+            for i in range(num_patches_to_show):
+                librosa.display.specshow(patches[..., i], y_axis='mel', x_axis='time',
+                                         ax=axes[i], sr=sr, hop_length=hop_length)
+                axes[i].set_title(f'Patch {i+1}')
+
+            # Hide unused subplots
+            for i in range(num_patches_to_show, len(axes)):
+                axes[i].set_visible(False)
+
+            plt.tight_layout()
+
+            # Save plot
+            plot_path = os.path.join(self.temp_dir, f"patches_{np.random.randint(10000)}.png")
+            plt.savefig(plot_path, dpi=150, bbox_inches='tight')
+            plt.close()
+
+            # Summary
+            summary = f"""
+**Patch Generation Summary:**
+- Total patches generated: {patches.shape[-1]}
+- Patch duration: {patch_duration} seconds
+- Hop duration: {hop_duration} seconds
+- Patch shape (mels, time, patches): {patches.shape}
+- Each patch covers {patch_frames} time frames
+"""
+
+            progress(1.0, desc="Patch generation complete!")
+            return plot_path, summary, None

         except Exception as e:
+            return None, None, f"Error generating patches: {str(e)}"

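`librosa.util.frame` puts the patch index on the last axis, so the `(n_mels, patch_frames, n_patches)` array usually needs a transpose before it can be batched into a model. A minimal sketch (the downstream model is hypothetical; only the array handling is shown):

```python
import numpy as np

# patches: (n_mels, patch_frames, n_patches) as returned by librosa.util.frame
batch = np.transpose(patches, (2, 0, 1))   # -> (n_patches, n_mels, patch_frames)
tokens = batch.reshape(len(batch), -1)     # flatten each patch to one vector
print(batch.shape, tokens.shape)           # e.g. (N, 80, 156) and (N, 12480)
```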

+# Initialize analyzer
+analyzer = AudioAnalyzer()

+# Gradio interface functions
+def process_youtube_url(url):
+    """Process YouTube URL and return audio file."""
+    file_path, message = analyzer.download_youtube_audio(url)
+    if file_path:
+        return file_path, message, gr.update(visible=True)
+    else:
+        return None, message, gr.update(visible=False)

+def analyze_audio_basic(audio_file):
+    """Analyze audio file and return basic features."""
+    if audio_file is None:
+        return None, "Please upload an audio file or download from YouTube first."
+
+    plot_path, summary, error = analyzer.extract_basic_features(audio_file)
+    if error:
+        return None, error
+    return plot_path, summary

+def analyze_audio_chroma(audio_file):
+    """Analyze audio file for chroma features."""
+    if audio_file is None:
+        return None, "Please upload an audio file or download from YouTube first."
+
+    plot_path, error = analyzer.extract_chroma_features(audio_file)
+    if error:
+        return None, error
+    return plot_path, "Chroma feature analysis complete! This shows different chroma extraction methods for harmonic analysis."

+def analyze_audio_patches(audio_file, patch_duration, hop_duration):
+    """Generate transformer patches from audio."""
+    if audio_file is None:
+        return None, "Please upload an audio file or download from YouTube first."
+
+    plot_path, summary, error = analyzer.generate_patches(audio_file, patch_duration=patch_duration, hop_duration=hop_duration)
+    if error:
+        return None, error
+    return plot_path, summary

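The third return value of `process_youtube_url` drives the `visible` flag of the analyze button via `gr.update`, which patches output-component properties without recreating the component. A self-contained sketch of that pattern (the components here are illustrative, not this app's):

```python
import gradio as gr

def toggle(show):
    # Patch the target component's properties from a handler return value
    return gr.update(visible=show)

with gr.Blocks() as demo:
    chk = gr.Checkbox(label="Show the textbox", value=True)
    box = gr.Textbox(label="Target")
    chk.change(toggle, inputs=chk, outputs=box)

if __name__ == "__main__":
    demo.launch()
```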
+# Create Gradio interface
+with gr.Blocks(title="🎵 Audio Analysis Suite", theme=gr.themes.Soft()) as app:
+    gr.Markdown("""
+    # 🎵 Audio Analysis Suite
+
+    A comprehensive tool for audio feature extraction and analysis. Upload an audio file or download from YouTube to get started!
+
+    **Features:**
+    - 📊 **Basic Features**: Waveform, Mel Spectrogram, MFCC, Spectral Analysis, Tempo Detection
+    - 🎼 **Chroma Features**: Advanced harmonic content analysis with multiple extraction methods
+    - 🧩 **Transformer Patches**: Generate fixed-duration patches for deep learning applications
+    """)
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown("### 📁 Audio Input")
+
+            # YouTube downloader
+            with gr.Group():
+                gr.Markdown("**Download from YouTube:**")
+                youtube_url = gr.Textbox(
+                    label="YouTube URL",
+                    placeholder="https://www.youtube.com/watch?v=...",
+                    info="Paste a YouTube video URL to extract audio"
+                )
+                download_btn = gr.Button("📥 Download Audio", variant="primary")
+                download_status = gr.Textbox(label="Download Status", interactive=False)
+
+            # File upload
+            with gr.Group():
+                gr.Markdown("**Or upload an audio file** (MP3, WAV, FLAC, etc.):")
+                audio_file = gr.Audio(
+                    label="Upload Audio File",
+                    type="filepath"
+                )

+        with gr.Column(scale=2):
+            gr.Markdown("### 🔍 Analysis Results")
+
+            with gr.Tabs():
+                with gr.Tab("📊 Basic Features"):
+                    basic_plot = gr.Image(label="Feature Visualizations")
+                    basic_summary = gr.Markdown()
+                    basic_analyze_btn = gr.Button("🔍 Analyze Basic Features", variant="secondary")

+                with gr.Tab("🎼 Chroma Features"):
+                    chroma_plot = gr.Image(label="Chroma Visualizations")
+                    chroma_summary = gr.Markdown()
+                    chroma_analyze_btn = gr.Button("🎼 Analyze Chroma Features", variant="secondary")

+                with gr.Tab("🧩 Transformer Patches"):
+                    with gr.Row():
+                        patch_duration = gr.Slider(
+                            label="Patch Duration (seconds)",
+                            minimum=1.0, maximum=10.0, value=5.0, step=0.5,
+                            info="Duration of each patch"
                         )
+                        hop_duration = gr.Slider(
+                            label="Hop Duration (seconds)",
+                            minimum=0.1, maximum=5.0, value=1.0, step=0.1,
+                            info="Time between patch starts"
                         )

+                    patches_plot = gr.Image(label="Generated Patches")
+                    patches_summary = gr.Markdown()
+                    patches_analyze_btn = gr.Button("🧩 Generate Patches", variant="secondary")

+    gr.Markdown("""
+    ### ℹ️ Usage Tips
+    - **Processing is limited to 60 seconds** for basic features and 30 seconds for chroma analysis to ensure fast response times
+    - **YouTube downloads** must respect the platform's terms of service
+    - **Visualizations** are high-quality and suitable for research/educational use
+    - **All processing** runs server-side within your session; temporary files are not kept
+    """)
+
+    # Event handlers
+    download_btn.click(
+        process_youtube_url,
+        inputs=[youtube_url],
+        outputs=[audio_file, download_status, basic_analyze_btn]
+    )
+
+    basic_analyze_btn.click(
+        analyze_audio_basic,
+        inputs=[audio_file],
+        outputs=[basic_plot, basic_summary]
+    )
+
+    chroma_analyze_btn.click(
+        analyze_audio_chroma,
+        inputs=[audio_file],
+        outputs=[chroma_plot, chroma_summary]
+    )
+
+    patches_analyze_btn.click(
+        analyze_audio_patches,
+        inputs=[audio_file, patch_duration, hop_duration],
+        outputs=[patches_plot, patches_summary]
+    )
+
+    # Auto-analyze when file is uploaded
+    audio_file.change(
+        analyze_audio_basic,
+        inputs=[audio_file],
+        outputs=[basic_plot, basic_summary]
+    )

 if __name__ == "__main__":
+    app.launch()
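Inferred from the imports in this revision (the commit itself pins nothing), running the new app locally needs `gradio`, `librosa`, `matplotlib`, `numpy`, and `scipy`, plus the `yt-dlp` CLI and `ffmpeg` for the download path. A sketch of a local entry point under those assumptions:

```python
# Hypothetical local runner; `app` is the gr.Blocks instance defined in app.py
from app import app

app.launch()  # serves the UI on http://127.0.0.1:7860 by default
```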