voidful
/

gemma-3-omni-processor

Feature Extraction

Transformers

gemma_3_omni

custom_code

Model card Files Files and versions Community

voidful committed 19 days ago

Commit

315e5b5

verified ·

1 Parent(s): 52ca1d3

Update processing_gemma3_omni.py

Browse files

Files changed (1) hide show

processing_gemma3_omni.py +57 -113

processing_gemma3_omni.py CHANGED Viewed

@@ -6,10 +6,10 @@ import numpy as np
 import scipy.signal
 import torch
 from torch.nn.utils.rnn import pad_sequence
-from transformers.audio_utils import AudioInput  # type: ignore
 from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
 from transformers.feature_extraction_utils import BatchFeature
-from transformers.image_utils import make_nested_list_of_images  # If image processing is used
 from transformers.processing_utils import ProcessorMixin, ProcessingKwargs, ImagesKwargs
 from transformers.utils import TensorType, to_py_obj, logging
@@ -19,12 +19,12 @@ DEFAULT_N_FFT = 512
 DEFAULT_WIN_LENGTH = 400
 DEFAULT_HOP_LENGTH = 160
 DEFAULT_N_MELS = 80
-DEFAULT_COMPRESSION_RATE = 4  # For _calculate_embed_length
-DEFAULT_QFORMER_RATE = 2  # For _calculate_embed_length
-DEFAULT_FEAT_STRIDE = 4  # For _calculate_embed_length / 'frames'
-IMAGE_TOKEN_PATTERN = r"<\|image_\d+\|>"  # Not used in this file directly
-AUDIO_TOKEN_PATTERN = r"<\|audio_\d+\|>"  # Not used in this file directly
-DEFAULT_MAX_LENGTH = 16384  # For tokenizer default
 LOG_MEL_CLIP_EPSILON = 1e-5
 logger = logging.get_logger(__name__)
@@ -35,39 +35,35 @@ def create_mel_filterbank(sampling_rate: int, n_fft: int, n_mels: int, fmin: flo
     """Create Mel filterbank for audio processing."""
     fmax = fmax or sampling_rate / 2.0
-    def hz_to_mel(f: float) -> float:  # User's formula
         return 1127.0 * math.log(1 + f / 700.0)
     if fmin >= fmax:
         raise ValueError(f"fmin ({fmin}) must be smaller than fmax ({fmax}).")
     mel_points = np.linspace(hz_to_mel(fmin), hz_to_mel(fmax), n_mels + 2)
-    freq_points = 700.0 * (np.exp(mel_points / 1127.0) - 1)  # Inverse of user's hz_to_mel
     freq_points = np.clip(freq_points, 0, sampling_rate / 2.0)
-    bins = np.floor((n_fft + 1) * freq_points / sampling_rate).astype(
-        int)  # (n_fft+1) or n_fft/2 ? Librosa uses n_fft//2 * hz / sr_nyquist
-    bins = np.clip(bins, 0, n_fft // 2)  # Max index for rfft output is n_fft//2
     filterbank = np.zeros((n_mels, n_fft // 2 + 1), dtype=np.float32)
     for m_idx in range(n_mels):
         left, center, right = bins[m_idx], bins[m_idx + 1], bins[m_idx + 2]
-        if center > left:  # Rising slope
             filterbank[m_idx, left:center + 1] = (np.arange(left, center + 1) - left) / (center - left)
-        if right > center:  # Falling slope
-            # Need to ensure the peak is 1 if center was part of rising slope
-            # If left==center, this part creates the full triangle (rising is skipped)
             filterbank[m_idx, center:right + 1] = (right - np.arange(center, right + 1)) / (right - center)
         # Ensure the peak at 'center' is 1.0 if it's a valid point.
-        # This handles cases where left=center or center=right if the slopes don't perfectly set it.
         if left <= center <= right:
-            if filterbank.shape[1] > center:  # Check bounds for center index
                 if (center > left and filterbank[m_idx, center] < 1.0) or \
-                        (center < right and filterbank[m_idx, center] < 1.0) or \
-                        (left == center and center < right) or \
-                        (right == center and left < center):
                     filterbank[m_idx, center] = 1.0
     return filterbank
@@ -92,14 +88,14 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
     ):
         _win_length = win_length if win_length is not None else n_fft
         _hop_length = hop_length if hop_length is not None else _win_length // 4
         kwargs.pop("feature_size", None)
         kwargs.pop("sampling_rate", None)
         kwargs.pop("padding_value", None)
         super().__init__(
-            feature_size=n_mels,  # This is num_mel_bins
-            sampling_rate=sampling_rate,  # This is the target sampling rate for featurization
             padding_value=padding_value,
             **kwargs
         )
@@ -129,7 +125,7 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
     def __call__(
             self,
             audios: Union[AudioInput, List[AudioInput]],
-            sampling_rate: Optional[int] = None,  # SR of input raw audio arrays
             return_tensors: Union[TensorType, str, None] = TensorType.PYTORCH
     ) -> BatchFeature:
@@ -138,19 +134,17 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
         processed_mels: List[torch.Tensor] = []
         actual_mel_lengths: List[int] = []
-        # These lists are from your original code; their values might be used by Gemma3OmniProcessor later.
         sizes_for_downstream_calc: List[torch.Tensor] = []
         frames_scaled_for_downstream_calc: List[int] = []
         for audio_item in audios:
             current_wav_array: np.ndarray
-            source_sr: int  # Original sampling rate of the current_wav_array
             if isinstance(audio_item, tuple) and len(audio_item) == 2 and isinstance(audio_item[1], int):
                 current_wav_array, source_sr = audio_item
                 current_wav_array = np.asarray(current_wav_array, dtype=np.float32)
-            elif isinstance(audio_item, (np.ndarray, list)):  # Raw waveform as array/list
                 current_wav_array = np.asarray(audio_item, dtype=np.float32)
                 if sampling_rate is None:
                     raise ValueError(
@@ -159,44 +153,27 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
                     )
                 source_sr = sampling_rate
             else:
-                # If you expect to load from paths/bytes, you'd use transformers.audio_utils.load_audio here
                 raise TypeError(
                     f"Unsupported audio_item type: {type(audio_item)}. Expected np.ndarray, list of floats, "
                     "or Tuple[np.ndarray, int (sampling_rate)]."
                 )
-            logger.debug(
-                f"Gemma3AudioFeatureExtractor: Processing audio item with original shape {current_wav_array.shape}, source_sr {source_sr}")
-            # 1. Preprocess: convert to mono, resample to self.sampling_rate, normalize
             processed_wav_for_mel = self._preprocess_audio(current_wav_array, source_sr)
-            # 2. Compute Log-Mel Spectrogram: results in (NumFrames, self.n_mels)
             mel_spectrogram_np = self._compute_log_mel_spectrogram(processed_wav_for_mel)
-            logger.debug(f"Gemma3AudioFeatureExtractor: Computed mel_spectrogram shape: {mel_spectrogram_np.shape}")
             if not (mel_spectrogram_np.ndim == 2 and mel_spectrogram_np.shape[1] == self.n_mels):
-                # This check is important if _compute_log_mel_spectrogram could return variable shapes
-                logger.error(
-                    f"Mel spectrogram computation resulted in unexpected shape {mel_spectrogram_np.shape}. Expected (NumFrames, {self.n_mels})")
-                # Fallback to a zero-feature tensor of correct feature dimension but zero time, or handle error
-                # This indicates a problem in _compute_log_mel_spectrogram or very unusual input
-                # For now, let it proceed, but this would be an issue.
-                # If num_frames was 0, shape would be (0, n_mels), which is valid.
-            feature_tensor = torch.from_numpy(mel_spectrogram_np)  # Already float32
             processed_mels.append(feature_tensor)
-            actual_mel_lengths.append(feature_tensor.shape[0])  # Number of time frames
-            # Original logic for 'sizes' and 'frames' (kept for compatibility with your processor)
             sizes_for_downstream_calc.append(torch.tensor(self._calculate_embed_length(feature_tensor.shape[0])))
             frames_scaled_for_downstream_calc.append(feature_tensor.shape[0] * self.feat_stride)
-        # Pad the list of 2D Mel spectrograms to form a 3D batch
-        # Output shape: (Batch, MaxNumFrames, NumMels)
         audio_values_batched = pad_sequence(processed_mels, batch_first=True, padding_value=self.padding_value)
-        # Create attention mask for the padded batch
         max_t_mel_in_batch = audio_values_batched.shape[1]
         attention_mask_batched = torch.zeros(len(audios), max_t_mel_in_batch, dtype=torch.bool)
@@ -204,15 +181,13 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
             attention_mask_batched[i, :length] = True
         output_data = {
-            "audio_values": audio_values_batched,  # Expected by model as (B, T, F)
-            "audio_attention_mask": attention_mask_batched  # Mask for "audio_values"
         }
-        if sizes_for_downstream_calc:  # If these are used by the OmniProcessor
             output_data["audio_values_sizes"] = torch.stack(sizes_for_downstream_calc)
-        logger.info(
-            f"Gemma3AudioFeatureExtractor: Final 'audio_values' batch shape: {output_data['audio_values'].shape}")
         return BatchFeature(data=output_data, tensor_type=return_tensors)
     def _preprocess_audio(self, wav: np.ndarray, source_sr: int) -> np.ndarray:
@@ -229,15 +204,14 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
             wav = wav.mean(axis=0)
         if source_sr != self.sampling_rate:
-            # logger.info(f"Resampling audio from {source_sr} Hz to {self.sampling_rate} Hz.")
             common_divisor = math.gcd(self.sampling_rate, source_sr)
             up_factor = self.sampling_rate // common_divisor
             down_factor = source_sr // common_divisor
-            if up_factor != down_factor:
                 wav = scipy.signal.resample_poly(wav, up=up_factor, down=down_factor)
         max_abs_val = np.abs(wav).max()
-        if max_abs_val > 1e-7:  # Avoid division by zero/small numbers for silent/near-silent audio
             wav = wav / max_abs_val
         return wav
@@ -249,11 +223,10 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
         if len(wav) >= self.win_length:
             num_frames = 1 + (len(wav) - self.win_length) // self.hop_length
         else:
-            num_frames = 0  # Should be caught by the padding above, but defensive.
         if num_frames <= 0:
-            # logger.warning(...)
-            return np.zeros((0, self.n_mels), dtype=np.float32)  # Return shape (0, N_Mels)
         frames_view = np.lib.stride_tricks.as_strided(
             wav,
@@ -261,7 +234,7 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
             strides=(wav.strides[0] * self.hop_length, wav.strides[0]),
             writeable=False
         )
-        frames_data = frames_view.copy()  # Ensure it's a copy before in-place modification
         frames_data *= self.window
         spectrum = np.fft.rfft(frames_data, n=self.n_fft, axis=-1).astype(np.complex64)
@@ -301,7 +274,7 @@ class Gemma3OmniProcessor(ProcessorMixin):
     valid_kwargs = ["chat_template", "image_seq_length"]
     image_processor_class = "AutoImageProcessor"
-    audio_processor_class = "AutoFeatureExtractor"  # CRITICAL: Must be string name of your custom class
     tokenizer_class = "AutoTokenizer"
     def __init__(
@@ -313,9 +286,6 @@ class Gemma3OmniProcessor(ProcessorMixin):
             image_seq_length: int = 256,
             **kwargs
     ):
-        # ProcessorMixin.__init__ handles instantiation of audio_processor, image_processor, tokenizer
-        # if they are None when passed to it, using the *_class attributes defined above.
-        # If actual instances are passed (e.g., from from_pretrained), they will be used.
         super().__init__(
             image_processor=image_processor,
             audio_processor=audio_processor,
@@ -324,21 +294,16 @@ class Gemma3OmniProcessor(ProcessorMixin):
             **kwargs
         )
-        # These attributes depend on self.tokenizer being properly initialized by super()
         self.image_seq_length = image_seq_length
         if self.tokenizer is not None:
-            # Use getattr for robustness, providing defaults if attributes are missing
             self.image_token_id = getattr(self.tokenizer, "image_token_id",
                                           self.tokenizer.unk_token_id if hasattr(self.tokenizer,
                                                                                  "unk_token_id") else None)
-            self.boi_token = getattr(self.tokenizer, "boi_token", "<image>")  # More common default
             self.image_token = getattr(self.tokenizer, "image_token", "<image>")
-            self.eoi_token = getattr(self.tokenizer, "eoi_token", "")  # Default to empty if not present
-            # User's original attributes for audio tokens
             self.audio_token_str_from_user_code = "<audio_soft_token>"
-            # self.expected_audio_token_id = 262143 # User's reference, keep commented for minimal change
             self.audio_token_id = self.tokenizer.convert_tokens_to_ids(self.audio_token_str_from_user_code)
             if hasattr(self.tokenizer, "unk_token_id") and self.audio_token_id == self.tokenizer.unk_token_id:
                 logger.warning(
@@ -347,7 +312,6 @@ class Gemma3OmniProcessor(ProcessorMixin):
                 )
             self.full_image_sequence = f"\n\n{self.boi_token}{''.join([self.image_token] * image_seq_length)}{self.eoi_token}\n\n"
         else:
-            # This state (tokenizer is None after super init) should ideally not occur if from_pretrained works.
             logger.error(
                 "Gemma3OmniProcessor initialized, but self.tokenizer is None. Token-dependent attributes will use placeholders or defaults.")
             self.image_token_id = None
@@ -355,17 +319,15 @@ class Gemma3OmniProcessor(ProcessorMixin):
             self.image_token = "<image>"
             self.eoi_token = ""
             self.audio_token_str_from_user_code = "<audio_soft_token>"
-            self.audio_token_id = -1  # Placeholder
             self.full_image_sequence = ""
-        # These are parameters for this processor's logic for number of audio tokens in prompt
         self.prompt_audio_compression_rate = kwargs.pop("audio_prompt_compression_rate", 8)
         self.prompt_audio_qformer_rate = kwargs.pop("audio_prompt_qformer_rate", 1)
         self.prompt_audio_feat_stride = kwargs.pop("audio_prompt_feat_stride", 1)
         self.audio_placeholder_token = kwargs.pop("audio_placeholder_token", "<|audio_placeholder|>")
     def _merge_kwargs(self, KwargsClassWithDefaults, tokenizer_init_kwargs, **kwargs_from_call):
-        # This method merges default kwargs, tokenizer init kwargs, and call-specific kwargs
         final_kwargs = {}
         _defaults = getattr(KwargsClassWithDefaults, "_defaults", {})
         if not isinstance(_defaults, dict): _defaults = {}
@@ -374,20 +336,20 @@ class Gemma3OmniProcessor(ProcessorMixin):
             final_kwargs[modality_key] = default_modality_kwargs.copy()
         for modality_key_in_call, modality_kwargs_in_call in kwargs_from_call.items():
-            if modality_key_in_call in final_kwargs:  # e.g. "text_kwargs"
                 if isinstance(modality_kwargs_in_call, dict):
                     final_kwargs[modality_key_in_call].update(modality_kwargs_in_call)
-            elif isinstance(modality_kwargs_in_call, dict):  # New modality not in _defaults (e.g. "video_kwargs")
                 final_kwargs[modality_key_in_call] = modality_kwargs_in_call.copy()
-        if self.tokenizer:  # Ensure tokenizer is available for its init_kwargs
             for modality_key in final_kwargs:
                 modality_dict = final_kwargs[modality_key]
                 if isinstance(modality_dict, dict):
                     for key_in_mod_dict in list(modality_dict.keys()):
-                        if key_in_mod_dict in tokenizer_init_kwargs:  # tokenizer_init_kwargs from self.tokenizer.init_kwargs
                             value = (
-                                getattr(self.tokenizer, key_in_mod_dict)  # Check actual tokenizer attribute first
                                 if hasattr(self.tokenizer, key_in_mod_dict)
                                 else tokenizer_init_kwargs[key_in_mod_dict]
                             )
@@ -395,7 +357,6 @@ class Gemma3OmniProcessor(ProcessorMixin):
         if "text_kwargs" not in final_kwargs:
             final_kwargs["text_kwargs"] = {}
-        # Ensure these text_kwargs have defaults if not set otherwise
         final_kwargs["text_kwargs"]["truncation"] = final_kwargs["text_kwargs"].get("truncation", False)
         final_kwargs["text_kwargs"]["max_length"] = final_kwargs["text_kwargs"].get("max_length", DEFAULT_MAX_LENGTH)
@@ -418,23 +379,19 @@ class Gemma3OmniProcessor(ProcessorMixin):
         if text is None and images is None and audios is None:
             raise ValueError("Provide at least one of `text`, `images`, or `audios`.")
-        # Determine final return_tensors strategy (explicit __call__ arg > from text_kwargs > default)
         final_rt = return_tensors
-        # _merge_kwargs uses Gemma3ProcessorKwargs to structure the **kwargs from __call__
         merged_call_kwargs = self._merge_kwargs(
-            Gemma3ProcessorKwargs,  # Class defining _defaults structure
             self.tokenizer.init_kwargs if hasattr(self.tokenizer, 'init_kwargs') else {},
             **kwargs
         )
-        if final_rt is None:  # If not passed directly to __call__
-            # Get from merged_call_kwargs (which would have picked it up from kwargs['text_kwargs'])
-            # and remove it to prevent passing twice to tokenizer
             final_rt = merged_call_kwargs.get("text_kwargs", {}).pop("return_tensors", TensorType.PYTORCH)
-        else:  # If passed directly, ensure it's removed from text_kwargs to avoid conflict
             merged_call_kwargs.get("text_kwargs", {}).pop("return_tensors", None)
-        if text is None:  # Default text if only other modalities are provided
             num_samples = 0
             if images is not None:
                 _images_list = images if isinstance(images, list) and (
@@ -443,13 +400,12 @@ class Gemma3OmniProcessor(ProcessorMixin):
             elif audios is not None:
                 _audios_list = audios if isinstance(audios, list) else [audios]
                 num_samples = len(_audios_list)
-            text = [""] * num_samples if num_samples > 0 else [""]  # Create empty strings or one if no samples
         if isinstance(text, str): text = [text]
         if not (isinstance(text, list) and all(isinstance(t, str) for t in text)):
             raise ValueError("Input `text` must be a string or a list of strings.")
-        # --- Image Processing (User's structure) ---
         image_features_dict = {}
         if images is not None:
             if self.image_processor is None: raise ValueError("Images provided but self.image_processor is None.")
@@ -459,7 +415,6 @@ class Gemma3OmniProcessor(ProcessorMixin):
             image_features_dict = _img_proc_output.data if isinstance(_img_proc_output,
                                                                       BatchFeature) else _img_proc_output
-            # Adjust text based on images (user's original logic)
             if len(text) == 0 and len(batched_images) > 0: text = [" ".join([self.boi_token] * len(img_batch)) for
                                                                    img_batch in batched_images]
             if len(batched_images) != len(text): raise ValueError(
@@ -496,18 +451,14 @@ class Gemma3OmniProcessor(ProcessorMixin):
                 text = temp_text_img
                 text = [p.replace(self.boi_token, self.full_image_sequence) for p in text]
-        # --- Audio Processing ---
         audio_features_dict = {}
         if audios is not None:
             if self.audio_processor is None: raise ValueError("Audios provided but self.audio_processor is None.")
             audio_call_kwargs = merged_call_kwargs.get("audio_kwargs", {})
-            if sampling_rate is not None: audio_call_kwargs[
-                "sampling_rate"] = sampling_rate  # Pass SR to feature extractor
             _audio_proc_output = self.audio_processor(audios=audios, return_tensors=None, **audio_call_kwargs)
             audio_features_dict = _audio_proc_output.data
-            logger.info(
-                f"Gemma3OmniProcessor: 'audio_values' shape from Feature Extractor: {audio_features_dict['audio_values'].shape}")
             new_text_with_audio, actual_mel_frames_per_sample = [], to_py_obj(
                 audio_features_dict["audio_attention_mask"].sum(axis=-1))
@@ -516,9 +467,8 @@ class Gemma3OmniProcessor(ProcessorMixin):
             for i, prompt in enumerate(text):
                 num_soft_tokens = self._compute_audio_embed_size(actual_mel_frames_per_sample[i])
-                audio_token_sequence_str = self.audio_token_str_from_user_code * num_soft_tokens  # e.g. "<audio_soft_token>" * N
-                # User's original boa_token for replacement was " ", which is risky. Using defined placeholder.
                 if self.audio_placeholder_token in prompt:
                     prompt = prompt.replace(self.audio_placeholder_token, audio_token_sequence_str, 1)
                 else:
@@ -526,10 +476,9 @@ class Gemma3OmniProcessor(ProcessorMixin):
                 new_text_with_audio.append(prompt)
             text = new_text_with_audio
-        # --- Text Tokenization ---
         text_tokenizer_kwargs = merged_call_kwargs.get("text_kwargs", {})
         text_features_dict = self.tokenizer(text=text, return_tensors=None,
-                                            **text_tokenizer_kwargs)  # Get lists/np.arrays
         input_ids_list_of_lists = text_features_dict["input_ids"]
         if not isinstance(input_ids_list_of_lists, list) or not (
@@ -551,11 +500,6 @@ class Gemma3OmniProcessor(ProcessorMixin):
             token_type_ids_list.append(types)
         text_features_dict["token_type_ids"] = token_type_ids_list
-        # Ensure text_features_dict also has 'attention_mask' if tokenizer applied padding
-        # If tokenizer was called with padding=True/strategy, it would add 'attention_mask'
-        # If called with padding=False (default), 'attention_mask' might be missing or all 1s.
-        # BatchFeature will handle final tensor conversion and padding based on final_rt.
         final_batch_data = {**text_features_dict}
         if image_features_dict: final_batch_data.update(image_features_dict)
         if audio_features_dict: final_batch_data.update(audio_features_dict)
@@ -581,7 +525,7 @@ class Gemma3OmniProcessor(ProcessorMixin):
                 hasattr(self.audio_processor, 'model_input_names'):
             input_names.update(self.audio_processor.model_input_names)
         elif hasattr(self,
-                     'audio_processor') and self.audio_processor is not None:  # Fallback if model_input_names not on custom audio_processor
             input_names.update(["audio_values", "audio_attention_mask"])
         return list(input_names)

 import scipy.signal
 import torch
 from torch.nn.utils.rnn import pad_sequence
+from transformers.audio_utils import AudioInput # type: ignore
 from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
 from transformers.feature_extraction_utils import BatchFeature
+from transformers.image_utils import make_nested_list_of_images # If image processing is used
 from transformers.processing_utils import ProcessorMixin, ProcessingKwargs, ImagesKwargs
 from transformers.utils import TensorType, to_py_obj, logging
 DEFAULT_WIN_LENGTH = 400
 DEFAULT_HOP_LENGTH = 160
 DEFAULT_N_MELS = 80
+DEFAULT_COMPRESSION_RATE = 4
+DEFAULT_QFORMER_RATE = 2
+DEFAULT_FEAT_STRIDE = 4
+IMAGE_TOKEN_PATTERN = r"<\|image_\d+\|>"
+AUDIO_TOKEN_PATTERN = r"<\|audio_\d+\|>"
+DEFAULT_MAX_LENGTH = 16384
 LOG_MEL_CLIP_EPSILON = 1e-5
 logger = logging.get_logger(__name__)
     """Create Mel filterbank for audio processing."""
     fmax = fmax or sampling_rate / 2.0
+    def hz_to_mel(f: float) -> float:
         return 1127.0 * math.log(1 + f / 700.0)
     if fmin >= fmax:
         raise ValueError(f"fmin ({fmin}) must be smaller than fmax ({fmax}).")
     mel_points = np.linspace(hz_to_mel(fmin), hz_to_mel(fmax), n_mels + 2)
+    freq_points = 700.0 * (np.exp(mel_points / 1127.0) - 1) # Inverse of user's hz_to_mel
     freq_points = np.clip(freq_points, 0, sampling_rate / 2.0)
+    bins = np.floor((n_fft + 1) * freq_points / sampling_rate).astype(int)
+    bins = np.clip(bins, 0, n_fft // 2) # Max index for rfft output is n_fft//2
     filterbank = np.zeros((n_mels, n_fft // 2 + 1), dtype=np.float32)
     for m_idx in range(n_mels):
         left, center, right = bins[m_idx], bins[m_idx + 1], bins[m_idx + 2]
+        if center > left: # Rising slope
             filterbank[m_idx, left:center + 1] = (np.arange(left, center + 1) - left) / (center - left)
+        if right > center: # Falling slope
             filterbank[m_idx, center:right + 1] = (right - np.arange(center, right + 1)) / (right - center)
         # Ensure the peak at 'center' is 1.0 if it's a valid point.
         if left <= center <= right:
+            if filterbank.shape[1] > center:
                 if (center > left and filterbank[m_idx, center] < 1.0) or \
+                   (center < right and filterbank[m_idx, center] < 1.0) or \
+                   (left == center and center < right) or \
+                   (right == center and left < center):
                     filterbank[m_idx, center] = 1.0
     return filterbank
     ):
         _win_length = win_length if win_length is not None else n_fft
         _hop_length = hop_length if hop_length is not None else _win_length // 4
         kwargs.pop("feature_size", None)
         kwargs.pop("sampling_rate", None)
         kwargs.pop("padding_value", None)
         super().__init__(
+            feature_size=n_mels,
+            sampling_rate=sampling_rate,
             padding_value=padding_value,
             **kwargs
         )
     def __call__(
             self,
             audios: Union[AudioInput, List[AudioInput]],
+            sampling_rate: Optional[int] = None,
             return_tensors: Union[TensorType, str, None] = TensorType.PYTORCH
     ) -> BatchFeature:
         processed_mels: List[torch.Tensor] = []
         actual_mel_lengths: List[int] = []
         sizes_for_downstream_calc: List[torch.Tensor] = []
         frames_scaled_for_downstream_calc: List[int] = []
         for audio_item in audios:
             current_wav_array: np.ndarray
+            source_sr: int
             if isinstance(audio_item, tuple) and len(audio_item) == 2 and isinstance(audio_item[1], int):
                 current_wav_array, source_sr = audio_item
                 current_wav_array = np.asarray(current_wav_array, dtype=np.float32)
+            elif isinstance(audio_item, (np.ndarray, list)):
                 current_wav_array = np.asarray(audio_item, dtype=np.float32)
                 if sampling_rate is None:
                     raise ValueError(
                     )
                 source_sr = sampling_rate
             else:
                 raise TypeError(
                     f"Unsupported audio_item type: {type(audio_item)}. Expected np.ndarray, list of floats, "
                     "or Tuple[np.ndarray, int (sampling_rate)]."
                 )
             processed_wav_for_mel = self._preprocess_audio(current_wav_array, source_sr)
             mel_spectrogram_np = self._compute_log_mel_spectrogram(processed_wav_for_mel)
             if not (mel_spectrogram_np.ndim == 2 and mel_spectrogram_np.shape[1] == self.n_mels):
+                # This could indicate an issue in _compute_log_mel_spectrogram or very unusual input.
+                # Depending on downstream requirements, this might need more robust error handling or a clear fallback.
+                pass # Allowing to proceed, but output shape might be unexpected.
+            feature_tensor = torch.from_numpy(mel_spectrogram_np)
             processed_mels.append(feature_tensor)
+            actual_mel_lengths.append(feature_tensor.shape[0])
             sizes_for_downstream_calc.append(torch.tensor(self._calculate_embed_length(feature_tensor.shape[0])))
             frames_scaled_for_downstream_calc.append(feature_tensor.shape[0] * self.feat_stride)
         audio_values_batched = pad_sequence(processed_mels, batch_first=True, padding_value=self.padding_value)
         max_t_mel_in_batch = audio_values_batched.shape[1]
         attention_mask_batched = torch.zeros(len(audios), max_t_mel_in_batch, dtype=torch.bool)
             attention_mask_batched[i, :length] = True
         output_data = {
+            "audio_values": audio_values_batched,
+            "audio_attention_mask": attention_mask_batched
         }
+        if sizes_for_downstream_calc:
             output_data["audio_values_sizes"] = torch.stack(sizes_for_downstream_calc)
         return BatchFeature(data=output_data, tensor_type=return_tensors)
     def _preprocess_audio(self, wav: np.ndarray, source_sr: int) -> np.ndarray:
             wav = wav.mean(axis=0)
         if source_sr != self.sampling_rate:
             common_divisor = math.gcd(self.sampling_rate, source_sr)
             up_factor = self.sampling_rate // common_divisor
             down_factor = source_sr // common_divisor
+            if up_factor != down_factor: # Avoid resampling if factors are identical
                 wav = scipy.signal.resample_poly(wav, up=up_factor, down=down_factor)
         max_abs_val = np.abs(wav).max()
+        if max_abs_val > 1e-7:
             wav = wav / max_abs_val
         return wav
         if len(wav) >= self.win_length:
             num_frames = 1 + (len(wav) - self.win_length) // self.hop_length
         else:
+            num_frames = 0
         if num_frames <= 0:
+            return np.zeros((0, self.n_mels), dtype=np.float32) # Return shape (0, N_Mels)
         frames_view = np.lib.stride_tricks.as_strided(
             wav,
             strides=(wav.strides[0] * self.hop_length, wav.strides[0]),
             writeable=False
         )
+        frames_data = frames_view.copy()
         frames_data *= self.window
         spectrum = np.fft.rfft(frames_data, n=self.n_fft, axis=-1).astype(np.complex64)
     valid_kwargs = ["chat_template", "image_seq_length"]
     image_processor_class = "AutoImageProcessor"
+    audio_processor_class = "AutoFeatureExtractor"
     tokenizer_class = "AutoTokenizer"
     # Initialise the processor: hand the sub-processors to the parent initialiser,
     # then derive the image/audio token attributes from the attached tokenizer
     # (or fall back to fixed placeholders when there is none).
     def __init__(
             image_seq_length: int = 256,
             **kwargs
     ):
         super().__init__(
             image_processor=image_processor,
             audio_processor=audio_processor,
             **kwargs
         )
         self.image_seq_length = image_seq_length
         # Token attributes are read from the tokenizer when one is attached; every
         # getattr below falls back to a literal default if the attribute is missing.
         if self.tokenizer is not None:
             # Prefer the tokenizer's image_token_id; otherwise use its unk_token_id
             # when it has one, else None.
             self.image_token_id = getattr(self.tokenizer, "image_token_id",
                                           self.tokenizer.unk_token_id if hasattr(self.tokenizer,
                                                                                  "unk_token_id") else None)
+            self.boi_token = getattr(self.tokenizer, "boi_token", "<image>")
             self.image_token = getattr(self.tokenizer, "image_token", "<image>")
+            self.eoi_token = getattr(self.tokenizer, "eoi_token", "")
             self.audio_token_str_from_user_code = "<audio_soft_token>"
             # Resolve the audio soft-token string to its id in the tokenizer.
             self.audio_token_id = self.tokenizer.convert_tokens_to_ids(self.audio_token_str_from_user_code)
             # Warn when the audio token resolved to the tokenizer's unknown-token id.
             if hasattr(self.tokenizer, "unk_token_id") and self.audio_token_id == self.tokenizer.unk_token_id:
                 logger.warning(
                 )
             # Text built once per processor: boi_token, image_seq_length copies of
             # image_token, then eoi_token, wrapped in "\n\n" on both sides.
             self.full_image_sequence = f"\n\n{self.boi_token}{''.join([self.image_token] * image_seq_length)}{self.eoi_token}\n\n"
         else:
             # No tokenizer: log an error and use fixed placeholder values instead.
             # NOTE(review): boi_token appears to be assigned only in the tokenizer
             # branch — confirm it is also defined on this path before it is read.
             logger.error(
                 "Gemma3OmniProcessor initialized, but self.tokenizer is None. Token-dependent attributes will use placeholders or defaults.")
             self.image_token_id = None
             self.image_token = "<image>"
             self.eoi_token = ""
             self.audio_token_str_from_user_code = "<audio_soft_token>"
             # presumably -1 is a sentinel for "no audio token id available" — TODO confirm
+            self.audio_token_id = -1
             self.full_image_sequence = ""
         # Audio-prompt settings are taken from kwargs, with defaults when absent.
         # NOTE(review): the same kwargs dict was already forwarded to
         # super().__init__ above, before these keys are popped — confirm the parent
         # initialiser tolerates them.
         self.prompt_audio_compression_rate = kwargs.pop("audio_prompt_compression_rate", 8)
         self.prompt_audio_qformer_rate = kwargs.pop("audio_prompt_qformer_rate", 1)
         self.prompt_audio_feat_stride = kwargs.pop("audio_prompt_feat_stride", 1)
         self.audio_placeholder_token = kwargs.pop("audio_placeholder_token", "<|audio_placeholder|>")
     # Build one dict of per-modality keyword arguments by layering the kwargs
     # passed at call time over the defaults declared on KwargsClassWithDefaults.
     #
     # KwargsClassWithDefaults: class whose optional `_defaults` attribute holds the
     #     default kwargs.
     # tokenizer_init_kwargs: mapping checked for keys that also appear in a
     #     modality dict (see the tokenizer section below).
     # kwargs_from_call: call-time overrides keyed by modality name.
     def _merge_kwargs(self, KwargsClassWithDefaults, tokenizer_init_kwargs, **kwargs_from_call):
         final_kwargs = {}
         # A missing or non-dict `_defaults` is treated as "no defaults".
         _defaults = getattr(KwargsClassWithDefaults, "_defaults", {})
         if not isinstance(_defaults, dict): _defaults = {}
             # Store a (shallow) copy so the later .update() calls do not modify
             # the defaults object itself.
             final_kwargs[modality_key] = default_modality_kwargs.copy()
         # Call-time values: a dict is merged into an existing modality entry or
         # copied in as a new entry; non-dict values are not stored by this loop.
         for modality_key_in_call, modality_kwargs_in_call in kwargs_from_call.items():
+            if modality_key_in_call in final_kwargs:
                 if isinstance(modality_kwargs_in_call, dict):
                     final_kwargs[modality_key_in_call].update(modality_kwargs_in_call)
+            elif isinstance(modality_kwargs_in_call, dict):
                 final_kwargs[modality_key_in_call] = modality_kwargs_in_call.copy()
         # With a tokenizer attached, look at every key of every modality dict that
         # also occurs in tokenizer_init_kwargs.
+        if self.tokenizer:
             for modality_key in final_kwargs:
                 modality_dict = final_kwargs[modality_key]
                 if isinstance(modality_dict, dict):
                     # list(...) snapshots the keys before iterating over them.
                     for key_in_mod_dict in list(modality_dict.keys()):
+                        if key_in_mod_dict in tokenizer_init_kwargs:
                             # Prefer the tokenizer's current attribute value; fall
                             # back to the value recorded in its init kwargs.
                             value = (
+                                getattr(self.tokenizer, key_in_mod_dict)
                                 if hasattr(self.tokenizer, key_in_mod_dict)
                                 else tokenizer_init_kwargs[key_in_mod_dict]
                             )
         # Always provide text_kwargs, with truncation defaulting to False and
         # max_length defaulting to DEFAULT_MAX_LENGTH unless already set.
         if "text_kwargs" not in final_kwargs:
             final_kwargs["text_kwargs"] = {}
         final_kwargs["text_kwargs"]["truncation"] = final_kwargs["text_kwargs"].get("truncation", False)
         final_kwargs["text_kwargs"]["max_length"] = final_kwargs["text_kwargs"].get("max_length", DEFAULT_MAX_LENGTH)
         if text is None and images is None and audios is None:
             raise ValueError("Provide at least one of `text`, `images`, or `audios`.")
         final_rt = return_tensors
         merged_call_kwargs = self._merge_kwargs(
+            Gemma3ProcessorKwargs,
             self.tokenizer.init_kwargs if hasattr(self.tokenizer, 'init_kwargs') else {},
             **kwargs
         )
+        if final_rt is None:
             final_rt = merged_call_kwargs.get("text_kwargs", {}).pop("return_tensors", TensorType.PYTORCH)
+        else:
             merged_call_kwargs.get("text_kwargs", {}).pop("return_tensors", None)
+        if text is None:
             num_samples = 0
             if images is not None:
                 _images_list = images if isinstance(images, list) and (
             elif audios is not None:
                 _audios_list = audios if isinstance(audios, list) else [audios]
                 num_samples = len(_audios_list)
+            text = [""] * num_samples if num_samples > 0 else [""]
         if isinstance(text, str): text = [text]
         if not (isinstance(text, list) and all(isinstance(t, str) for t in text)):
             raise ValueError("Input `text` must be a string or a list of strings.")
         image_features_dict = {}
         if images is not None:
             if self.image_processor is None: raise ValueError("Images provided but self.image_processor is None.")
             image_features_dict = _img_proc_output.data if isinstance(_img_proc_output,
                                                                       BatchFeature) else _img_proc_output
             if len(text) == 0 and len(batched_images) > 0: text = [" ".join([self.boi_token] * len(img_batch)) for
                                                                    img_batch in batched_images]
             if len(batched_images) != len(text): raise ValueError(
                 text = temp_text_img
                 text = [p.replace(self.boi_token, self.full_image_sequence) for p in text]
         audio_features_dict = {}
         if audios is not None:
             if self.audio_processor is None: raise ValueError("Audios provided but self.audio_processor is None.")
             audio_call_kwargs = merged_call_kwargs.get("audio_kwargs", {})
+            if sampling_rate is not None: audio_call_kwargs["sampling_rate"] = sampling_rate
             _audio_proc_output = self.audio_processor(audios=audios, return_tensors=None, **audio_call_kwargs)
             audio_features_dict = _audio_proc_output.data
             new_text_with_audio, actual_mel_frames_per_sample = [], to_py_obj(
                 audio_features_dict["audio_attention_mask"].sum(axis=-1))
             for i, prompt in enumerate(text):
                 num_soft_tokens = self._compute_audio_embed_size(actual_mel_frames_per_sample[i])
+                audio_token_sequence_str = self.audio_token_str_from_user_code * num_soft_tokens
                 if self.audio_placeholder_token in prompt:
                     prompt = prompt.replace(self.audio_placeholder_token, audio_token_sequence_str, 1)
                 else:
                 new_text_with_audio.append(prompt)
             text = new_text_with_audio
         text_tokenizer_kwargs = merged_call_kwargs.get("text_kwargs", {})
         text_features_dict = self.tokenizer(text=text, return_tensors=None,
+                                            **text_tokenizer_kwargs)
         input_ids_list_of_lists = text_features_dict["input_ids"]
         if not isinstance(input_ids_list_of_lists, list) or not (
             token_type_ids_list.append(types)
         text_features_dict["token_type_ids"] = token_type_ids_list
         final_batch_data = {**text_features_dict}
         if image_features_dict: final_batch_data.update(image_features_dict)
         if audio_features_dict: final_batch_data.update(audio_features_dict)
                 hasattr(self.audio_processor, 'model_input_names'):
             input_names.update(self.audio_processor.model_input_names)
         elif hasattr(self,
+                     'audio_processor') and self.audio_processor is not None:
             input_names.update(["audio_values", "audio_attention_mask"])
         return list(input_names)