Update processing_gemma3_omni.py
Browse files- processing_gemma3_omni.py +56 -43
processing_gemma3_omni.py
CHANGED
@@ -28,42 +28,61 @@ DEFAULT_MAX_LENGTH = 16384
|
|
28 |
|
29 |
logger = logging.get_logger(__name__)
|
30 |
|
31 |
-
def
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
|
68 |
|
69 |
# --- Start of Refactored Audio Feature Extractor (to match Phi4M - Snippet A) ---
|
@@ -106,13 +125,7 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor): # MODIFIED CLASS N
|
|
106 |
"This might lead to inconsistencies if the input audio is not resampled to 16000 Hz by this extractor."
|
107 |
)
|
108 |
|
109 |
-
self._mel =
|
110 |
-
sampling_rate=16000, # Phi4M Mel params are for 16kHz.
|
111 |
-
n_fft=512,
|
112 |
-
n_mels=_feature_size, # Use the effective feature_size (should be 80)
|
113 |
-
fmin=0.0,
|
114 |
-
fmax=7690.0
|
115 |
-
).T
|
116 |
self._hamming400 = np.hamming(400)
|
117 |
self._hamming200 = np.hamming(200)
|
118 |
|
|
|
28 |
|
29 |
logger = logging.get_logger(__name__)
|
30 |
|
31 |
+
def speechlib_mel(sample_rate, n_fft, n_mels, fmin=None, fmax=None):
    """Create a Mel filter-bank the same as SpeechLib FbankFC.

    Each of the ``n_mels`` filters is a triangle in Mel space whose corners
    are three consecutive points of an evenly spaced Mel grid running from
    ``mel(fmin)`` to ``mel(fmax)``. Only FFT bins in
    ``[f2bin(fmin) + 1, f2bin(fmax) - 1]`` can receive a non-zero weight.

    Args:
        sample_rate (int): Sample rate in Hz. number > 0 [scalar]
        n_fft (int): FFT size. int > 0 [scalar]
        n_mels (int): Number of Mel filters. int > 0 [scalar]
        fmin (float): Lowest frequency (in Hz). If None use 0.0.
            float >= 0 [scalar]
        fmax (float): Highest frequency (in Hz). If None use sample_rate / 2.
            Must satisfy fmin < fmax <= sample_rate / 2. [scalar]

    Returns:
        out (numpy.ndarray): float32 Mel transform matrix
            [shape=(n_mels, 1 + n_fft // 2)]

    Raises:
        ValueError: If ``fmin`` is negative, or ``fmax`` is not in the
            half-open interval ``(fmin, sample_rate / 2]``.
    """
    bank_width = int(n_fft // 2 + 1)
    if fmax is None:
        fmax = sample_rate / 2
    if fmin is None:
        fmin = 0

    # Explicit raises instead of `assert`: assertions are stripped under
    # `python -O`, which would let invalid ranges through silently.
    # `not (x >= 0)` (rather than `x < 0`) also rejects NaN.
    if not fmin >= 0:
        raise ValueError("fmin cannot be negative")
    if not fmin < fmax <= sample_rate / 2:
        raise ValueError("fmax must be between (fmin, samplerate / 2]")

    def mel(f):
        # Hz -> Mel (natural-log form with the 1127 / 700 constants).
        return 1127.0 * np.log(1.0 + f / 700.0)

    def bin2mel(fft_bin):
        # Mel value of the centre frequency of an FFT bin.
        return 1127.0 * np.log(1.0 + fft_bin * sample_rate / (n_fft * 700.0))

    def f2bin(f):
        # Hz -> nearest FFT bin index (round half up).
        return int((f * n_fft / sample_rate) + 0.5)

    # Spec 1: FFT bin range [f2bin(fmin) + 1, f2bin(fmax) - 1]
    klo = f2bin(fmin) + 1
    # Clamp so the bin range is empty (not reversed) when fmax maps at or
    # below klo.
    khi = max(f2bin(fmax), klo)

    # Spec 2: SpeechLib uses triangles in Mel space
    mlo = mel(fmin)
    mhi = mel(fmax)
    m_centers = np.linspace(mlo, mhi, n_mels + 2)
    ms = (mhi - mlo) / (n_mels + 1)  # Mel distance between adjacent corners

    # The Mel value of a bin does not depend on the filter index, so compute
    # it once per bin instead of once per (filter, bin) pair. The same scalar
    # arithmetic is used, so the resulting weights are unchanged.
    bin_mels = [(fft_bin, bin2mel(fft_bin)) for fft_bin in range(klo, khi)]

    matrix = np.zeros((n_mels, bank_width), dtype=np.float32)
    for m in range(n_mels):
        left, center, right = m_centers[m:m + 3]
        for fft_bin, mbin in bin_mels:
            # Strictly inside the triangle's support; weight falls off
            # linearly (in Mel) from 1 at the centre to 0 at either corner.
            if left < mbin < right:
                matrix[m, fft_bin] = 1.0 - abs(center - mbin) / ms

    return matrix
|
86 |
|
87 |
|
88 |
# --- Start of Refactored Audio Feature Extractor (to match Phi4M - Snippet A) ---
|
|
|
125 |
"This might lead to inconsistencies if the input audio is not resampled to 16000 Hz by this extractor."
|
126 |
)
|
127 |
|
128 |
+
self._mel = speechlib_mel(16000, 512, 80, fmin=None, fmax=7690).T
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
self._hamming400 = np.hamming(400)
|
130 |
self._hamming200 = np.hamming(200)
|
131 |
|