import librosa
import numpy as np


class AudioConfig:
    """Parameters for converting a fixed-length audio clip to a mel spectrogram.

    All values are plain class attributes, so they can be read either from the
    class itself or from an instance.

    NOTE: the derived attributes (``hop_length``, ``fmax``, ``n_fft``,
    ``samples``) are evaluated once, when the class body runs. Overriding
    ``sr`` or ``duration`` on an instance or subclass does NOT recompute
    them; override the derived attributes as well if you need that.
    """

    sr = 16000  # sample rate (Hz) passed to librosa.load and melspectrogram
    duration = 3  # clip length in seconds
    hop_length = 340 * duration  # STFT hop size in samples (1020)
    fmin = 20  # lower frequency bound of the mel filter bank
    fmax = sr // 2  # upper frequency bound: half the sample rate (8000)
    n_mels = 128  # number of mel bands (height of the spectrogram)
    n_fft = 128 * 20  # FFT window size in samples (2560)
    samples = sr * duration  # exact clip length in samples (48000)


def _fit_to_length(y, samples):
    """Return *y* cut or zero-padded to ``samples`` elements."""
    if len(y) > samples:
        # Too long: keep only the leading ``samples`` values.
        return y[:samples]

    # Too short (or exact): distribute the zero padding on both sides so the
    # signal stays centred; any odd leftover sample goes on the right.
    padding = samples - len(y)
    offset = padding // 2
    return np.pad(y, (offset, padding - offset), 'constant')


def preprocess_audio(audio_path, config=None):
    """Load an audio file and return its dB-scaled mel spectrogram.

    The audio is loaded at ``config.sr``, trimmed or centre-padded with zeros
    to exactly ``config.samples`` samples, converted to a mel power
    spectrogram and then to decibels via ``librosa.power_to_db``.

    Args:
        audio_path: Location of the audio file; forwarded unchanged to
            ``librosa.load``.
        config: Object exposing the ``AudioConfig`` attributes (``sr``,
            ``samples``, ``n_mels``, ``hop_length``, ``n_fft``, ``fmin``,
            ``fmax``). A default ``AudioConfig()`` is used when ``None``.

    Returns:
        A NumPy array with a leading singleton channel axis, i.e. shaped
        ``(1, n_mels, frames)``, where the frame count is determined by
        librosa from the clip length and ``hop_length``.

    Raises:
        Whatever ``librosa.load`` raises for a missing or unreadable file.
    """
    if config is None:
        config = AudioConfig()

    # librosa.load also returns the sample rate; it is the one we requested,
    # so it is discarded instead of being bound to an unused local.
    y, _ = librosa.load(audio_path, sr=config.sr)

    # Force every clip to the same length so all spectrograms share a shape.
    y = _fit_to_length(y, config.samples)

    spectrogram = librosa.feature.melspectrogram(
        y=y,
        sr=config.sr,
        n_mels=config.n_mels,
        hop_length=config.hop_length,
        n_fft=config.n_fft,
        fmin=config.fmin,
        fmax=config.fmax,
    )
    spectrogram = librosa.power_to_db(spectrogram)

    # Prepend a channel axis: (height, width) -> (channels, height, width),
    # the layout expected for PyTorch image-like input.
    return spectrogram[np.newaxis, ...]