Update processing_gemma3_omni.py
processing_gemma3_omni.py · CHANGED · +113 -85

@@ -16,116 +16,133 @@ from transformers.utils import TensorType, to_py_obj, logging
# Constants
DEFAULT_SAMPLING_RATE = 16000
DEFAULT_N_FFT = 512
DEFAULT_WIN_LENGTH = 400
DEFAULT_HOP_LENGTH = 160
DEFAULT_N_MELS = 80
DEFAULT_COMPRESSION_RATE = 4   # default for __init__
DEFAULT_QFORMER_RATE = 2       # default for __init__ (as audio_downsample_rate)
DEFAULT_FEAT_STRIDE = 4        # default for __init__
IMAGE_TOKEN_PATTERN = r"<\|image_\d+\|>"
AUDIO_TOKEN_PATTERN = r"<\|audio_\d+\|>"
DEFAULT_MAX_LENGTH = 16384

logger = logging.get_logger(__name__)

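(The token patterns match the numbered placeholders used in prompts; for example, re.findall(AUDIO_TOKEN_PATTERN, "hi <|audio_1|>") returns ['<|audio_1|>'].)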
def create_mel_filterbank(sampling_rate: int, n_fft: int, n_mels: int, fmin: float = 0.0,
                          fmax: Optional[float] = None) -> np.ndarray:
    """Create a Mel filterbank for audio processing."""
    fmax = fmax or sampling_rate / 2.0

    def hz_to_mel(f: float) -> float:  # Slaney-style scale, as in the original Snippet B
        return 1127.0 * math.log(1 + f / 700.0)

    if fmin >= fmax:
        raise ValueError(f"fmin ({fmin}) must be smaller than fmax ({fmax}).")

    mel_points = np.linspace(hz_to_mel(fmin), hz_to_mel(fmax), n_mels + 2)
    freq_points = 700.0 * (np.exp(mel_points / 1127.0) - 1)  # inverse of hz_to_mel

    freq_points = np.clip(freq_points, 0, sampling_rate / 2.0)
    bins = np.floor((n_fft + 1) * freq_points / sampling_rate).astype(int)
    bins = np.clip(bins, 0, n_fft // 2)

    filterbank = np.zeros((n_mels, n_fft // 2 + 1), dtype=np.float32)
    for m_idx in range(n_mels):
        left, center, right = bins[m_idx], bins[m_idx + 1], bins[m_idx + 2]

        if center > left:
            filterbank[m_idx, left:center + 1] = (np.arange(left, center + 1) - left) / (center - left)
        if right > center:
            filterbank[m_idx, center:right + 1] = (right - np.arange(center, right + 1)) / (right - center)

        # Guarantee the peak of each triangular filter is exactly 1.0, including the
        # degenerate cases where bin edges coincide.
        if left <= center <= right and filterbank.shape[1] > center:
            if (center > left and filterbank[m_idx, center] < 1.0 and center < right) or \
                    (left == center and center < right) or \
                    (right == center and left < center):
                filterbank[m_idx, center] = 1.0
            elif left == center and right == center:
                filterbank[m_idx, center] = 1.0
    return filterbank
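As a quick sanity check (a minimal standalone sketch, not part of the file): with the Phi4M-style parameters used below (16 kHz, n_fft=512, 80 mels), the filterbank has one row per mel band and n_fft // 2 + 1 = 257 frequency columns, and it is applied with a plain matrix product:

    import numpy as np

    mel_fb = create_mel_filterbank(sampling_rate=16000, n_fft=512, n_mels=80, fmax=7690.0)
    assert mel_fb.shape == (80, 257)                          # (n_mels, n_fft // 2 + 1)

    dummy_power = np.random.rand(10, 257).astype(np.float32)  # 10 frames of |FFT|^2
    mel_energies = dummy_power.dot(mel_fb.T)                  # -> (10, 80)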


# --- Start of Refactored Audio Feature Extractor (to match Phi4M - Snippet A) ---
class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):  # MODIFIED CLASS NAME AND __INIT__
    model_input_names = ["input_audio_embeds", "audio_embed_sizes", "audio_attention_mask"]

    def __init__(self,
                 audio_compression_rate: int = DEFAULT_COMPRESSION_RATE,
                 audio_downsample_rate: int = DEFAULT_QFORMER_RATE,   # maps to the qformer rate
                 audio_feat_stride: int = DEFAULT_FEAT_STRIDE,
                 feature_size: int = DEFAULT_N_MELS,
                 sampling_rate: int = DEFAULT_SAMPLING_RATE,
                 padding_value: float = 0.0,
                 eightk_method: str = "fillzero",
                 **kwargs):
        # super().__init__ expects feature_size, sampling_rate, and padding_value; if any
        # of them arrive via kwargs they override the named defaults above.
        _feature_size = kwargs.pop("feature_size", feature_size)
        _sampling_rate = kwargs.pop("sampling_rate", sampling_rate)
        _padding_value = kwargs.pop("padding_value", padding_value)

        super().__init__(feature_size=_feature_size, sampling_rate=_sampling_rate,
                         padding_value=_padding_value, **kwargs)

        self.compression_rate = audio_compression_rate
        self.qformer_compression_rate = audio_downsample_rate
        self.feat_stride = audio_feat_stride

        self._eightk_method = eightk_method

        # Phi4M hardcodes 16000 Hz for its Mel parameters; self.sampling_rate (set by
        # super()) is the target sampling rate, so warn if the two disagree.
        if self.sampling_rate != 16000:
            logger.warning(
                f"The feature extractor's target sampling rate is {self.sampling_rate}, "
                "but Phi4M-consistent Mel parameters are based on 16000 Hz. "
                "This might lead to inconsistencies if the input audio is not resampled to 16000 Hz by this extractor."
            )

        self._mel = create_mel_filterbank(
            sampling_rate=16000,   # Phi4M Mel params are for 16 kHz
            n_fft=512,
            n_mels=_feature_size,  # effective feature_size (should be 80)
            fmin=0.0,
            fmax=7690.0
        ).T
        self._hamming400 = np.hamming(400)
        self._hamming200 = np.hamming(200)
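For illustration, a minimal construction sketch (assuming the surrounding imports are in place and SequenceFeatureExtractor is the standard transformers base class): the defaults reproduce the Phi4M configuration, and feature_size / sampling_rate / padding_value passed via kwargs take precedence over the named defaults.

    extractor = Gemma3AudioFeatureExtractor()        # 80 mels, 16 kHz target, stride 4
    assert extractor.compression_rate == 4
    assert extractor.qformer_compression_rate == 2
    assert extractor._mel.shape == (257, 80)         # transposed filterbank

    resampling_extractor = Gemma3AudioFeatureExtractor(eightk_method="resample")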

    def __call__(
            self,
            audios: List[Union[AudioInput, Tuple[np.ndarray, int]]],
            return_tensors: Optional[Union[str, TensorType]] = None,
            # sampling_rate: Optional[int] = None,  # was in original B; Phi4M gets the rate from each AudioInput
    ):
        returned_input_audio_embeds = []
        returned_audio_embed_sizes = []
        audio_frames_list = []

        for audio_input_item in audios:
            if not isinstance(audio_input_item, tuple) or len(audio_input_item) != 2:
                raise ValueError(
                    "Each item in 'audios' must be a tuple (waveform: np.ndarray, sample_rate: int)."
                )
            audio_data, sample_rate = audio_input_item  # sample_rate comes from the input audio

            if isinstance(audio_data, list):
                audio_data = np.array(audio_data, dtype=np.float32)
            if not isinstance(audio_data, np.ndarray):
                raise TypeError(f"Waveform data must be a numpy array, got {type(audio_data)}")

            # _extract_features handles resampling to self.sampling_rate (16000 Hz).
            audio_embeds_np = self._extract_features(audio_data, sample_rate)

            num_mel_frames = audio_embeds_np.shape[0]
            current_audio_frames = num_mel_frames * self.feat_stride

            audio_embed_size = self._compute_audio_embed_size(current_audio_frames)
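            # Worked example of the bookkeeping above (illustrative numbers): a 1 s clip
            # at 16 kHz yields (16000 - 400) // 160 + 1 = 98 mel frames, so with the
            # default feat_stride of 4, current_audio_frames = 98 * 4 = 392.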
@@ -145,12 +162,12 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
        max_audio_frames = tensor_audio_frames_list.max().item()

        returned_audio_attention_mask = None
        if max_audio_frames > 0:
            if len(audios) > 1:
                returned_audio_attention_mask = torch.arange(0, max_audio_frames,
                                                             device=tensor_audio_frames_list.device).unsqueeze(
                    0) < tensor_audio_frames_list.unsqueeze(1)
            elif len(audios) == 1:
                returned_audio_attention_mask = torch.ones(1, max_audio_frames, dtype=torch.bool,
                                                           device=tensor_audio_frames_list.device)
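The mask above is an ordinary broadcast comparison; a minimal standalone sketch with made-up frame counts (two clips, 392 and 160 frames):

    import torch

    frames = torch.tensor([392, 160])        # per-clip audio frame counts
    mask = torch.arange(0, frames.max().item()).unsqueeze(0) < frames.unsqueeze(1)
    assert mask.shape == (2, 392)
    assert mask[1, :160].all() and not mask[1, 160:].any()   # clip 2 masked after 160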
@@ -164,50 +181,59 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
        return BatchFeature(data=data, tensor_type=return_tensors)

    def _extract_spectrogram(self, wav: np.ndarray, fs: int) -> np.ndarray:
        # This method expects fs to be the original sampling rate of wav.
        # It resamples to self.sampling_rate (16000 Hz) or 8000 Hz as needed.
        if wav.ndim > 1:
            wav = np.squeeze(wav)
        if len(wav.shape) == 2:
            wav = wav.mean(axis=1).astype(np.float32)

        wav = wav.astype(np.float32)

        current_fs = fs
        if current_fs > self.sampling_rate:  # self.sampling_rate is 16000
            wav = scipy.signal.resample_poly(wav, self.sampling_rate, current_fs)
            current_fs = self.sampling_rate
        elif 8000 < current_fs < self.sampling_rate:
            wav = scipy.signal.resample_poly(wav, 8000, current_fs)
            current_fs = 8000
        elif 0 < current_fs < 8000:
            logger.warning(f"Sample rate {current_fs} is less than 8000 Hz. Resampling to 8000 Hz.")
            wav = scipy.signal.resample_poly(wav, 8000, current_fs)
            current_fs = 8000
        elif current_fs <= 0:
            raise RuntimeError(f"Unsupported sample rate {current_fs}")

        # After this block current_fs is 16000 or 8000 (or it was already one of those).
        if current_fs == 8000:
            if self._eightk_method == "resample":
                wav = scipy.signal.resample_poly(wav, self.sampling_rate, 8000)
                current_fs = self.sampling_rate
        elif current_fs != self.sampling_rate:
            # Should not be reachable if the logic above is correct and self.sampling_rate is 16000.
            raise RuntimeError(
                f"Audio sample rate {current_fs} not supported. Expected {self.sampling_rate} or 8000 for 8k methods.")

        preemphasis_coeff = 0.97

        # current_fs now selects the STFT parameters (16000, or 8000 with fillzero).
        if current_fs == 8000:  # implies _eightk_method == "fillzero"
            n_fft, win_length, hop_length, fft_window = 256, 200, 80, self._hamming200
        elif current_fs == 16000:  # the standard path
            n_fft, win_length, hop_length, fft_window = 512, 400, 160, self._hamming400
        else:
            raise RuntimeError(f"Inconsistent fs {current_fs} for parameter selection. Should be 16000 or 8000.")

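        # For reference: at 16 kHz these parameters give a 25 ms window (400 samples)
        # with a 10 ms hop (160 samples); the 8 kHz fillzero path halves all three
        # (n_fft=256, 200/80 samples), so the frame timing is identical.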
        if len(wav) < win_length:
            wav = np.pad(wav, (0, win_length - len(wav)), 'constant', constant_values=(0.0,))

        num_frames = (wav.shape[0] - win_length) // hop_length + 1
        if num_frames <= 0:
            # Output bins depend on n_fft here: 257 for n_fft=512 (16 kHz), 129 for
            # n_fft=256 (8 kHz); the fillzero path pads 8 kHz frames to 257 bins later.
            return np.zeros((0, n_fft // 2 + 1), dtype=np.float32)

        y_frames = np.array(
@@ -216,19 +242,16 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
        )

        _y_frames_rolled = np.roll(y_frames, 1, axis=1)
        _y_frames_rolled[:, 0] = _y_frames_rolled[:, 1]
        y_frames_preemphasized = (y_frames - preemphasis_coeff * _y_frames_rolled) * 32768.0

        S = np.fft.rfft(fft_window * y_frames_preemphasized, n=n_fft, axis=1).astype(np.complex64)

        if current_fs == 8000 and self._eightk_method == "fillzero":
            # S has (256 // 2) + 1 = 129 bins; the target is 257, for n_fft=512 equivalence.
            target_bins = (512 // 2) + 1
            S_core = S[:, :-1]  # drop the 8 kHz Nyquist bin
            padarray = np.zeros((S_core.shape[0], target_bins - S_core.shape[1]), dtype=S.dtype)
            S = np.concatenate((S_core, padarray), axis=1)

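        # Bin arithmetic for the fillzero branch: n_fft=256 gives 129 bins; dropping
        # the 8 kHz Nyquist bin leaves 128, and 257 - 128 = 129 zero columns are
        # appended so S matches the 257-bin layout of the 16 kHz path.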
@@ -238,15 +261,15 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
    def _extract_features(self, wav: np.ndarray, fs: int) -> np.ndarray:
        spec = self._extract_spectrogram(wav, fs)
        if spec.shape[0] == 0:
            return np.zeros((0, self.feature_size), dtype=np.float32)  # feature_size is n_mels (e.g. 80)

        spec_power = spec ** 2
        fbank_power = np.clip(spec_power.dot(self._mel), 1.0, None)
        log_fbank = np.log(fbank_power).astype(np.float32)
        return log_fbank

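The clip at 1.0 before the log both avoids log(0) and floors the log-mel features at exactly 0.0 (the original Snippet B used a LOG_MEL_CLIP_EPSILON = 1e-5 constant instead, per a comment removed in this commit). A minimal numeric check:

    import numpy as np

    fbank_power = np.clip(np.array([[0.0, 0.5, 4.0]]), 1.0, None)   # -> [[1.0, 1.0, 4.0]]
    log_fbank = np.log(fbank_power)
    assert (log_fbank >= 0.0).all()                  # silence maps to 0.0, not -inf
    assert np.isclose(log_fbank[0, 2], np.log(4.0))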
    def _compute_audio_embed_size(self, audio_frames: int) -> int:
        integer = audio_frames // self.compression_rate
        remainder = audio_frames % self.compression_rate
        result = integer if remainder == 0 else integer + 1

@@ -257,6 +280,11 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
        return result
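Worked example of the embed-size computation (a sketch under one assumption: the diff collapses new lines 276-279, which, per the Phi4M logic this class mirrors and the qformer_compression_rate field set in __init__, apply the same ceil-division a second time with qformer_compression_rate):

    def ceil_div(n: int, d: int) -> int:
        return n // d + (1 if n % d else 0)

    # 1 s of 16 kHz audio: 98 mel frames * feat_stride 4 = 392 audio frames
    after_compression = ceil_div(392, 4)               # 98 (compression_rate = 4)
    audio_embed_size = ceil_div(after_compression, 2)  # 49 (qformer rate = 2, assumed)
    assert audio_embed_size == 49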


# The rest of the script (Gemma3ImagesKwargs, Gemma3ProcessorKwargs, Gemma3OmniProcessor) follows.
# Make sure this Gemma3AudioFeatureExtractor class replaces the old one, or is correctly
# registered/named if your AutoProcessor setup relies on a specific class name.


# --- End of Refactored Audio Feature Extractor ---