# Spaces: Sleeping
import librosa | |
import numpy as np | |
class AudioConfig:
    """Audio loading and mel-spectrogram parameters, stored as class attributes.

    Derived values (``hop_length``, ``fmax``, ``n_fft``, ``samples``) are
    computed once, when the class body executes, from ``sr`` and ``duration``.
    """

    # Sample rate in Hz.
    sr = 16000
    # Clip length in seconds.
    duration = 3
    # STFT hop in samples: 340 * 3 = 1020.
    hop_length = 340 * duration
    # Lowest mel filter edge in Hz.
    fmin = 20
    # Highest mel filter edge in Hz: half the sample rate (Nyquist) = 8000.
    fmax = sr // 2
    # Number of mel bands.
    n_mels = 128
    # FFT window size in samples: 128 * 20 = 2560.
    n_fft = 128 * 20
    # Fixed clip length in samples: 16000 * 3 = 48000.
    samples = sr * duration
def preprocess_audio(audio_path, config=None) -> np.ndarray:
    """Load an audio file and convert it to a fixed-length log-mel spectrogram.

    The signal is resampled to ``config.sr`` on load, then forced to exactly
    ``config.samples`` samples: longer clips keep only their leading samples,
    shorter clips are zero-padded on both sides so the audio is centred.

    Args:
        audio_path: Path (or other source accepted by ``librosa.load``) of
            the audio to process.
        config: Object exposing ``sr``, ``samples``, ``n_mels``,
            ``hop_length``, ``n_fft``, ``fmin`` and ``fmax``. A fresh
            ``AudioConfig`` is used when ``None``.

    Returns:
        The mel spectrogram converted with ``librosa.power_to_db`` (library
        defaults), with a leading singleton axis prepended, i.e.
        ``(1, n_mels, n_frames)`` for a mono signal.
    """
    if config is None:
        config = AudioConfig()

    # librosa resamples to config.sr while loading, so the returned rate is
    # redundant and deliberately discarded.
    y, _ = librosa.load(audio_path, sr=config.sr)

    # Trim or pad to exactly config.samples samples.
    target = config.samples
    if len(y) > target:
        y = y[:target]
    else:
        padding = target - len(y)
        # Split the padding as evenly as possible; any odd sample goes last.
        offset = padding // 2
        y = np.pad(y, (offset, padding - offset), mode="constant")

    # Mel-scaled power spectrogram.
    spectrogram = librosa.feature.melspectrogram(
        y=y,
        sr=config.sr,
        n_mels=config.n_mels,
        hop_length=config.hop_length,
        n_fft=config.n_fft,
        fmin=config.fmin,
        fmax=config.fmax,
    )
    spectrogram = librosa.power_to_db(spectrogram)

    # Prepend a channel axis for PyTorch: (channels, height, width).
    return spectrogram[np.newaxis, ...]