voidful
/

gemma-3-omni-processor

Feature Extraction

Transformers

gemma_3_omni

custom_code

Model card Files Files and versions Community

voidful committed on May 16

Commit

701891b

verified ·

1 Parent(s): ddf58eb

Update processing_gemma3_omni.py

Browse files

Files changed (1) hide show

processing_gemma3_omni.py +56 -43

processing_gemma3_omni.py CHANGED Viewed

@@ -28,42 +28,61 @@ DEFAULT_MAX_LENGTH = 16384
 logger = logging.get_logger(__name__)
-def create_mel_filterbank(sampling_rate: int, n_fft: int, n_mels: int, fmin: float = 0.0,
-                          fmax: Optional[float] = None) -> np.ndarray:
-    """Build a triangular Mel filterbank over the one-sided FFT bins.
-
-    Args:
-        sampling_rate: Sampling rate in Hz.
-        n_fft: FFT size; the filterbank has n_fft // 2 + 1 columns.
-        n_mels: Number of Mel filters (rows of the result).
-        fmin: Lower band edge in Hz.
-        fmax: Upper band edge in Hz. Any falsy value (None or 0) is replaced
-            by sampling_rate / 2.
-
-    Returns:
-        float32 array of shape (n_mels, n_fft // 2 + 1).
-
-    Raises:
-        ValueError: If fmin >= fmax.
-    """
-    fmax = fmax or sampling_rate / 2.0
-    def hz_to_mel(f: float) -> float: # 1127 * ln(1 + f / 700); NOTE(review): this is the HTK-style formula, not Slaney's scale
-        return 1127.0 * math.log(1 + f / 700.0)
-    if fmin >= fmax:
-        raise ValueError(f"fmin ({fmin}) must be smaller than fmax ({fmax}).")
-    # n_mels + 2 points evenly spaced on the mel axis; each filter uses three consecutive points.
-    mel_points = np.linspace(hz_to_mel(fmin), hz_to_mel(fmax), n_mels + 2)
-    freq_points = 700.0 * (np.exp(mel_points / 1127.0) - 1) # Inverse of hz_to_mel above (mel -> Hz)
-    freq_points = np.clip(freq_points, 0, sampling_rate / 2.0)
-    # Map each edge frequency to an FFT bin index, kept inside the one-sided spectrum.
-    bins = np.floor((n_fft + 1) * freq_points / sampling_rate).astype(int)
-    bins = np.clip(bins, 0, n_fft // 2)
-    filterbank = np.zeros((n_mels, n_fft // 2 + 1), dtype=np.float32)
-    for m_idx in range(n_mels):
-        left, center, right = bins[m_idx], bins[m_idx + 1], bins[m_idx + 2]
-        # Rising slope: 0 at `left` up to 1 at `center` (skipped when both share a bin).
-        if center > left:
-            filterbank[m_idx, left:center + 1] = (np.arange(left, center + 1) - left) / (center - left)
-        # Falling slope: 1 at `center` down to 0 at `right` (skipped when both share a bin).
-        if right > center:
-            filterbank[m_idx, center:right + 1] = (right - np.arange(center, right + 1)) / (right - center)
-        # Force the peak bin to 1.0 in the remaining cases, including the one where
-        # all three edges share a single bin and neither slope above was written.
-        if left <= center <= right:
-            if filterbank.shape[1] > center:
-                if (center > left and filterbank[m_idx, center] < 1.0 and center < right) or \
-                   (left == center and center < right) or \
-                   (right == center and left < center):
-                    filterbank[m_idx, center] = 1.0
-                elif left == center and right == center :
-                    filterbank[m_idx, center] = 1.0
-    return filterbank
 # --- Start of Refactored Audio Feature Extractor (to match Phi4M - Snippet A) ---
@@ -106,13 +125,7 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):  # MODIFIED CLASS N
                 "This might lead to inconsistencies if the input audio is not resampled to 16000 Hz by this extractor."
             )
-        self._mel = create_mel_filterbank(
-            sampling_rate=16000,  # Phi4M Mel params are for 16kHz.
-            n_fft=512,
-            n_mels=_feature_size,  # Use the effective feature_size (should be 80)
-            fmin=0.0,
-            fmax=7690.0
-        ).T
         self._hamming400 = np.hamming(400)
         self._hamming200 = np.hamming(200)

 logger = logging.get_logger(__name__)
+def speechlib_mel(sample_rate, n_fft, n_mels, fmin=None, fmax=None):
+    """Create a Mel filter-bank the same as SpeechLib FbankFC.
+
+    Triangles are defined in Mel space: each FFT bin in the selected range is
+    converted to Mel and weighted by its distance from the filter center.
+
+    Args:
+        sample_rate (int): Sample rate in Hz. number > 0 [scalar]
+        n_fft (int): FFT size. int > 0 [scalar]
+        n_mels (int): Number of Mel filters. int > 0 [scalar]
+        fmin (float): Lowest frequency (in Hz). If None use 0.
+            float >= 0 [scalar]
+        fmax (float): Highest frequency (in Hz). If None use sample_rate / 2.
+            fmin < fmax <= sample_rate / 2 [scalar]
+
+    Returns:
+        out (numpy.ndarray): float32 Mel transform matrix
+            [shape=(n_mels, 1 + n_fft // 2)]
+
+    Raises:
+        AssertionError: If fmin is negative, or fmax is not in
+            (fmin, sample_rate / 2]. These checks are plain ``assert``
+            statements, so they are skipped when Python runs with -O.
+    """
+    bank_width = int(n_fft // 2 + 1)
+    if fmax is None:
+        fmax = sample_rate / 2
+    if fmin is None:
+        fmin = 0
+    assert fmin >= 0, "fmin cannot be negtive"
+    assert fmin < fmax <= sample_rate / 2, "fmax must be between (fmin, samplerate / 2]"
+    def mel(f):
+        # Hz -> Mel.
+        return 1127.0 * np.log(1.0 + f / 700.0)
+    def bin2mel(fft_bin):
+        # FFT bin index -> Mel, using the bin frequency fft_bin * sample_rate / n_fft.
+        return 1127.0 * np.log(1.0 + fft_bin * sample_rate / (n_fft * 700.0))
+    def f2bin(f):
+        # Hz -> FFT bin index; adding 0.5 before int() rounds to the nearest bin.
+        return int((f * n_fft / sample_rate) + 0.5)
+    # Spec 1: FFT bin range [f2bin(fmin) + 1, f2bin(fmax) - 1]
+    # klo is the first bin visited; khi is the exclusive end of the range() below.
+    klo = f2bin(fmin) + 1
+    khi = f2bin(fmax)
+    # Never let the end fall below the start (the bin range is then empty).
+    khi = max(khi, klo)
+    # Spec 2: SpeechLib uses triangles in Mel space
+    mlo = mel(fmin)
+    mhi = mel(fmax)
+    # n_mels + 2 evenly spaced Mel points; filter m uses points m, m + 1, m + 2.
+    m_centers = np.linspace(mlo, mhi, n_mels + 2)
+    # Spacing between neighbouring points of m_centers, i.e. each triangle's half-width.
+    ms = (mhi - mlo) / (n_mels + 1)
+    matrix = np.zeros((n_mels, bank_width), dtype=np.float32)
+    for m in range(0, n_mels):
+        left = m_centers[m]
+        center = m_centers[m + 1]
+        right = m_centers[m + 2]
+        for fft_bin in range(klo, khi):
+            mbin = bin2mel(fft_bin)
+            # Bins strictly inside (left, right) get a weight that falls off
+            # linearly with Mel distance from the center; all other bins stay 0.
+            if left < mbin < right:
+                matrix[m, fft_bin] = 1.0 - abs(center - mbin) / ms
+    return matrix
 # --- Start of Refactored Audio Feature Extractor (to match Phi4M - Snippet A) ---
                 "This might lead to inconsistencies if the input audio is not resampled to 16000 Hz by this extractor."
             )
+        self._mel = speechlib_mel(16000, 512, 80, fmin=None, fmax=7690).T
         self._hamming400 = np.hamming(400)
         self._hamming200 = np.hamming(200)