# Nature-Nexus / utils / audio_processing.py
# Vector73
# Add audio model.
# 01e938d
import librosa
import numpy as np
class AudioConfig:
    """Default parameters for loading audio and computing mel spectrograms.

    Every value is a plain class attribute, so it can be read either from
    the class itself or from an instance.  The derived values
    (``hop_length``, ``fmax``, ``samples``) are evaluated once, when the
    class body runs; overriding ``sr`` or ``duration`` on an instance or
    subclass does NOT recompute them.
    """

    # Sample rate, in Hz, that audio is resampled to on load.
    sr = 16000
    # Fixed clip length in seconds; shorter/longer clips are padded/trimmed.
    duration = 3
    # Hop between analysis frames, in samples.  It scales with ``duration``
    # so that a clip of ``samples`` samples spans about sr / 340 hops
    # regardless of the clip length chosen here.
    hop_length = 340 * duration
    # Lowest frequency (Hz) covered by the mel filter bank.
    fmin = 20
    # Highest frequency (Hz) covered by the mel filter bank: the Nyquist
    # frequency for ``sr``.
    fmax = sr // 2
    # Number of mel bands (height of the resulting spectrogram).
    n_mels = 128
    # FFT window size in samples (2560).
    # NOTE(review): whether the 128 here is meant to track n_mels is not
    # evident from this file -- confirm before changing either value.
    n_fft = 128 * 20
    # Fixed clip length in samples.
    samples = sr * duration
def preprocess_audio(audio_path, config=None):
    """Load an audio file and convert it to a fixed-size log-mel spectrogram.

    The signal is loaded at ``config.sr``, forced to exactly
    ``config.samples`` samples, turned into a mel power spectrogram and
    converted to decibels.

    Args:
        audio_path: Path (or other source accepted by ``librosa.load``) of
            the audio to process.
        config: Object exposing ``sr``, ``samples``, ``n_mels``,
            ``hop_length``, ``n_fft``, ``fmin`` and ``fmax``.  Defaults to a
            fresh ``AudioConfig()``.

    Returns:
        The dB-scaled mel spectrogram with one extra leading axis, i.e.
        ``(1, n_mels, frames)`` for a mono signal -- the
        (channels, height, width) layout expected by PyTorch.
    """
    if config is None:
        config = AudioConfig()

    # Load and resample; the returned rate equals config.sr, so discard it.
    y, _ = librosa.load(audio_path, sr=config.sr)

    # Force the clip to exactly config.samples samples.
    if len(y) > config.samples:
        # Too long: keep only the beginning of the recording.
        y = y[:config.samples]
    elif len(y) < config.samples:
        # Too short: zero-pad on both sides so the audio stays centred
        # (any odd leftover sample goes on the right).
        padding = config.samples - len(y)
        offset = padding // 2
        y = np.pad(y, (offset, padding - offset), 'constant')

    # Mel-scaled power spectrogram.
    spectrogram = librosa.feature.melspectrogram(
        y=y,
        sr=config.sr,
        n_mels=config.n_mels,
        hop_length=config.hop_length,
        n_fft=config.n_fft,
        fmin=config.fmin,
        fmax=config.fmax,
    )
    # Power -> decibels (librosa's default reference and floor).
    spectrogram = librosa.power_to_db(spectrogram)

    # Prepend a singleton channel axis: (channels, height, width).
    return spectrogram[np.newaxis, ...]