File size: 1,063 Bytes
01e938d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import librosa
import numpy as np

class AudioConfig:
    """Default parameters for fixed-length mel-spectrogram extraction."""
    sr = 16000                 # target sample rate (Hz)
    duration = 3               # clip length (seconds)
    samples = sr * duration    # samples per clip: 48000
    n_fft = 2560               # STFT window size (= 128 * 20)
    hop_length = 1020          # STFT hop (= 340 * duration)
    n_mels = 128               # number of mel bands
    fmin = 20                  # lowest mel filter frequency (Hz)
    fmax = sr // 2             # highest mel filter frequency: Nyquist (8000 Hz)

def preprocess_audio(audio_path, config=None):
    """Load an audio file and convert it to a fixed-size log-mel spectrogram.

    The waveform is resampled to ``config.sr``, then center-trimmed or
    center-padded to exactly ``config.samples`` samples, so every clip
    yields a spectrogram of identical width.

    Args:
        audio_path: Path to the audio file (anything ``librosa.load`` accepts).
        config: Object exposing the attributes of ``AudioConfig``; defaults
            to a fresh ``AudioConfig`` instance.

    Returns:
        numpy array of shape ``(1, n_mels, frames)`` — a single-channel
        dB-scaled mel spectrogram ready for PyTorch.
    """
    cfg = AudioConfig() if config is None else config

    # Resample on load; the returned rate is cfg.sr by construction.
    waveform, _ = librosa.load(audio_path, sr=cfg.sr)

    # Force the waveform to exactly cfg.samples samples.
    deficit = cfg.samples - len(waveform)
    if deficit < 0:
        # Too long: keep the leading cfg.samples samples.
        waveform = waveform[:cfg.samples]
    else:
        # Too short (or exact): zero-pad, split as evenly as possible,
        # with the extra sample (if any) going on the right.
        left = deficit // 2
        waveform = np.pad(waveform, (left, deficit - left), 'constant')

    # Mel power spectrogram, then convert power to decibels.
    mel = librosa.feature.melspectrogram(
        y=waveform,
        sr=cfg.sr,
        n_mels=cfg.n_mels,
        hop_length=cfg.hop_length,
        n_fft=cfg.n_fft,
        fmin=cfg.fmin,
        fmax=cfg.fmax,
    )
    mel_db = librosa.power_to_db(mel)

    # Prepend a channel axis: (n_mels, frames) -> (1, n_mels, frames).
    return mel_db[np.newaxis, ...]