voidful committed on
Commit 3fa62c9 · verified · 1 Parent(s): 315e5b5

Update processing_gemma3_omni.py

Files changed (1)
  1. processing_gemma3_omni.py +347 -250
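For readers comparing the old and new versions below: the commit swaps the generic Mel-spectrogram extractor for a Phi4M-style pipeline, and the audio embed size is now computed from frames already scaled by feat_stride, via two rounds of ceiling division (written in the new code as floor division plus a remainder check). A minimal sketch of that arithmetic, using illustrative frame counts and rates rather than values fixed by this commit:

import math

def embed_size(audio_frames: int, compression_rate: int, qformer_rate: int) -> int:
    # Equivalent to the floor-division-plus-remainder form in the new
    # _compute_audio_embed_size: ceil(frames / compression), then ceil(result / qformer).
    first = math.ceil(audio_frames / compression_rate)
    return math.ceil(first / qformer_rate)

# Illustrative only: 998 scaled frames, compression rate 8, Q-Former rate 1
# -> ceil(998 / 8) = 125 -> ceil(125 / 1) = 125 audio soft tokens.
assert embed_size(998, 8, 1) == 125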
processing_gemma3_omni.py CHANGED
@@ -1,253 +1,263 @@
1
  import re
2
- from typing import List, Optional, Union, Dict, Any
3
 
4
  import math
5
  import numpy as np
6
  import scipy.signal
7
  import torch
8
  from torch.nn.utils.rnn import pad_sequence
9
- from transformers.audio_utils import AudioInput # type: ignore
10
  from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
11
  from transformers.feature_extraction_utils import BatchFeature
12
- from transformers.image_utils import make_nested_list_of_images # If image processing is used
13
  from transformers.processing_utils import ProcessorMixin, ProcessingKwargs, ImagesKwargs
14
  from transformers.utils import TensorType, to_py_obj, logging
15
 
16
  # Constants
17
  DEFAULT_SAMPLING_RATE = 16000
18
  DEFAULT_N_FFT = 512
19
- DEFAULT_WIN_LENGTH = 400
20
- DEFAULT_HOP_LENGTH = 160
21
  DEFAULT_N_MELS = 80
22
- DEFAULT_COMPRESSION_RATE = 4
23
- DEFAULT_QFORMER_RATE = 2
24
- DEFAULT_FEAT_STRIDE = 4
25
  IMAGE_TOKEN_PATTERN = r"<\|image_\d+\|>"
26
  AUDIO_TOKEN_PATTERN = r"<\|audio_\d+\|>"
27
  DEFAULT_MAX_LENGTH = 16384
28
- LOG_MEL_CLIP_EPSILON = 1e-5
29
 
30
  logger = logging.get_logger(__name__)
31
 
32
 
 
 
33
  def create_mel_filterbank(sampling_rate: int, n_fft: int, n_mels: int, fmin: float = 0.0,
34
  fmax: Optional[float] = None) -> np.ndarray:
35
  """Create Mel filterbank for audio processing."""
36
  fmax = fmax or sampling_rate / 2.0
37
 
38
- def hz_to_mel(f: float) -> float:
39
  return 1127.0 * math.log(1 + f / 700.0)
40
 
41
  if fmin >= fmax:
42
  raise ValueError(f"fmin ({fmin}) must be smaller than fmax ({fmax}).")
43
 
44
  mel_points = np.linspace(hz_to_mel(fmin), hz_to_mel(fmax), n_mels + 2)
45
- freq_points = 700.0 * (np.exp(mel_points / 1127.0) - 1) # Inverse of user's hz_to_mel
46
 
47
  freq_points = np.clip(freq_points, 0, sampling_rate / 2.0)
48
  bins = np.floor((n_fft + 1) * freq_points / sampling_rate).astype(int)
49
- bins = np.clip(bins, 0, n_fft // 2) # Max index for rfft output is n_fft//2
50
 
51
  filterbank = np.zeros((n_mels, n_fft // 2 + 1), dtype=np.float32)
52
  for m_idx in range(n_mels):
53
  left, center, right = bins[m_idx], bins[m_idx + 1], bins[m_idx + 2]
54
 
55
- if center > left: # Rising slope
56
  filterbank[m_idx, left:center + 1] = (np.arange(left, center + 1) - left) / (center - left)
57
- if right > center: # Falling slope
58
  filterbank[m_idx, center:right + 1] = (right - np.arange(center, right + 1)) / (right - center)
59
 
60
  # Ensure the peak at 'center' is 1.0 if it's a valid point.
61
- if left <= center <= right:
62
- if filterbank.shape[1] > center:
63
- if (center > left and filterbank[m_idx, center] < 1.0) or \
64
- (center < right and filterbank[m_idx, center] < 1.0) or \
65
- (left == center and center < right) or \
66
- (right == center and left < center):
67
  filterbank[m_idx, center] = 1.0
68
  return filterbank
69
 
70
 
 
71
  class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
72
- model_input_names = ["audio_values", "audio_attention_mask"]
73
 
74
- def __init__(
75
  self,
76
- compression_rate: int = DEFAULT_COMPRESSION_RATE,
77
- qformer_rate: int = DEFAULT_QFORMER_RATE,
78
- feat_stride: int = DEFAULT_FEAT_STRIDE,
79
- sampling_rate: int = DEFAULT_SAMPLING_RATE,
80
- n_fft: int = DEFAULT_N_FFT,
81
- win_length: Optional[int] = None,
82
- hop_length: Optional[int] = None,
83
- n_mels: int = DEFAULT_N_MELS,
84
- f_min: float = 0.0,
85
- f_max: Optional[float] = None,
86
- padding_value: float = 0.0,
87
- **kwargs
88
  ):
89
- _win_length = win_length if win_length is not None else n_fft
90
- _hop_length = hop_length if hop_length is not None else _win_length // 4
91
 
92
- kwargs.pop("feature_size", None)
93
- kwargs.pop("sampling_rate", None)
94
- kwargs.pop("padding_value", None)
 
95
 
96
- super().__init__(
97
- feature_size=n_mels,
98
- sampling_rate=sampling_rate,
99
- padding_value=padding_value,
100
- **kwargs
101
- )
102
 
103
- self.compression_rate = compression_rate
104
- self.qformer_rate = qformer_rate
105
- self.feat_stride = feat_stride
106
- # self.sampling_rate is set by super() to the target rate
107
-
108
- self.n_fft = n_fft
109
- self.win_length = _win_length
110
- self.hop_length = _hop_length
111
- self.n_mels = n_mels
112
- self.f_min = f_min
113
- self.f_max = f_max if f_max is not None else self.sampling_rate / 2.0
114
-
115
- if self.win_length > self.n_fft:
116
- logger.warning(
117
- f"win_length ({self.win_length}) is greater than n_fft ({self.n_fft}). "
118
- "Window will be applied, then data zero-padded/truncated to n_fft by np.fft.rfft."
119
- )
120
- self.window = np.hamming(self.win_length).astype(np.float32)
121
- self.mel_filterbank = create_mel_filterbank(
122
- self.sampling_rate, self.n_fft, self.n_mels, fmin=self.f_min, fmax=self.f_max
123
- ).T
124
 
125
- def __call__(
126
- self,
127
- audios: Union[AudioInput, List[AudioInput]],
128
- sampling_rate: Optional[int] = None,
129
- return_tensors: Union[TensorType, str, None] = TensorType.PYTORCH
130
- ) -> BatchFeature:
131
 
132
- if not isinstance(audios, list):
133
- audios = [audios]
134
-
135
- processed_mels: List[torch.Tensor] = []
136
- actual_mel_lengths: List[int] = []
137
- sizes_for_downstream_calc: List[torch.Tensor] = []
138
- frames_scaled_for_downstream_calc: List[int] = []
139
-
140
- for audio_item in audios:
141
- current_wav_array: np.ndarray
142
- source_sr: int
143
-
144
- if isinstance(audio_item, tuple) and len(audio_item) == 2 and isinstance(audio_item[1], int):
145
- current_wav_array, source_sr = audio_item
146
- current_wav_array = np.asarray(current_wav_array, dtype=np.float32)
147
- elif isinstance(audio_item, (np.ndarray, list)):
148
- current_wav_array = np.asarray(audio_item, dtype=np.float32)
149
- if sampling_rate is None:
150
- raise ValueError(
151
- "sampling_rate argument must be provided to __call__ if 'audios' items "
152
- "are raw numpy arrays or lists (without embedded sampling rate info)."
153
- )
154
- source_sr = sampling_rate
155
- else:
156
- raise TypeError(
157
- f"Unsupported audio_item type: {type(audio_item)}. Expected np.ndarray, list of floats, "
158
- "or Tuple[np.ndarray, int (sampling_rate)]."
159
- )
160
 
161
- processed_wav_for_mel = self._preprocess_audio(current_wav_array, source_sr)
162
- mel_spectrogram_np = self._compute_log_mel_spectrogram(processed_wav_for_mel)
163
 
164
- if not (mel_spectrogram_np.ndim == 2 and mel_spectrogram_np.shape[1] == self.n_mels):
165
- # This could indicate an issue in _compute_log_mel_spectrogram or very unusual input.
166
- # Depending on downstream requirements, this might need more robust error handling or a clear fallback.
167
- pass # Allowing to proceed, but output shape might be unexpected.
168
 
169
- feature_tensor = torch.from_numpy(mel_spectrogram_np)
170
- processed_mels.append(feature_tensor)
171
- actual_mel_lengths.append(feature_tensor.shape[0])
172
 
173
- sizes_for_downstream_calc.append(torch.tensor(self._calculate_embed_length(feature_tensor.shape[0])))
174
- frames_scaled_for_downstream_calc.append(feature_tensor.shape[0] * self.feat_stride)
175
 
176
- audio_values_batched = pad_sequence(processed_mels, batch_first=True, padding_value=self.padding_value)
177
- max_t_mel_in_batch = audio_values_batched.shape[1]
 
178
 
179
- attention_mask_batched = torch.zeros(len(audios), max_t_mel_in_batch, dtype=torch.bool)
180
- for i, length in enumerate(actual_mel_lengths):
181
- attention_mask_batched[i, :length] = True
 
182
 
183
- output_data = {
184
- "audio_values": audio_values_batched,
185
- "audio_attention_mask": attention_mask_batched
186
- }
187
 
188
- if sizes_for_downstream_calc:
189
- output_data["audio_values_sizes"] = torch.stack(sizes_for_downstream_calc)
190
 
191
- return BatchFeature(data=output_data, tensor_type=return_tensors)
192
 
193
- def _preprocess_audio(self, wav: np.ndarray, source_sr: int) -> np.ndarray:
194
- if wav.dtype not in [np.float32, np.float64]:
195
- if np.issubdtype(wav.dtype, np.integer):
196
- max_val = np.iinfo(wav.dtype).max if wav.size > 0 else 1.0
197
- wav = wav.astype(np.float32) / max_val
198
- else:
199
- wav = wav.astype(np.float32)
200
- elif wav.dtype == np.float64:
201
- wav = wav.astype(np.float32)
202
 
203
- if wav.ndim > 1:
204
- wav = wav.mean(axis=0)
205
-
206
- if source_sr != self.sampling_rate:
207
- common_divisor = math.gcd(self.sampling_rate, source_sr)
208
- up_factor = self.sampling_rate // common_divisor
209
- down_factor = source_sr // common_divisor
210
- if up_factor != down_factor: # Avoid resampling if factors are identical
211
- wav = scipy.signal.resample_poly(wav, up=up_factor, down=down_factor)
212
-
213
- max_abs_val = np.abs(wav).max()
214
- if max_abs_val > 1e-7:
215
- wav = wav / max_abs_val
216
- return wav
217
-
218
- def _compute_log_mel_spectrogram(self, wav: np.ndarray) -> np.ndarray:
219
- if len(wav) < self.win_length:
220
- padding = self.win_length - len(wav)
221
- wav = np.pad(wav, (0, padding), mode='constant', constant_values=0.0)
222
-
223
- if len(wav) >= self.win_length:
224
- num_frames = 1 + (len(wav) - self.win_length) // self.hop_length
225
- else:
226
- num_frames = 0
227
 
228
- if num_frames <= 0:
229
- return np.zeros((0, self.n_mels), dtype=np.float32) # Return shape (0, N_Mels)
 
 
230
 
231
- frames_view = np.lib.stride_tricks.as_strided(
232
- wav,
233
- shape=(num_frames, self.win_length),
234
- strides=(wav.strides[0] * self.hop_length, wav.strides[0]),
235
- writeable=False
236
- )
237
- frames_data = frames_view.copy()
238
- frames_data *= self.window
239
 
240
- spectrum = np.fft.rfft(frames_data, n=self.n_fft, axis=-1).astype(np.complex64)
241
- power = np.abs(spectrum) ** 2
242
- mel_spectrogram = np.dot(power, self.mel_filterbank)
243
- mel_spectrogram = np.clip(mel_spectrogram, LOG_MEL_CLIP_EPSILON, None)
244
- log_mel_spectrogram = np.log(mel_spectrogram)
245
 
246
- return log_mel_spectrogram.astype(np.float32)
247
 
248
- def _calculate_embed_length(self, frame_count: int) -> int:
249
- compressed = math.ceil(frame_count / self.compression_rate)
250
- return math.ceil(compressed / self.qformer_rate)
251
 
252
 
253
  class Gemma3ImagesKwargs(ImagesKwargs):
@@ -280,7 +290,7 @@ class Gemma3OmniProcessor(ProcessorMixin):
280
  def __init__(
281
  self,
282
  image_processor=None,
283
- audio_processor=None,
284
  tokenizer=None,
285
  chat_template=None,
286
  image_seq_length: int = 256,
@@ -303,7 +313,8 @@ class Gemma3OmniProcessor(ProcessorMixin):
303
  self.image_token = getattr(self.tokenizer, "image_token", "<image>")
304
  self.eoi_token = getattr(self.tokenizer, "eoi_token", "")
305
 
306
- self.audio_token_str_from_user_code = "<audio_soft_token>"
 
307
  self.audio_token_id = self.tokenizer.convert_tokens_to_ids(self.audio_token_str_from_user_code)
308
  if hasattr(self.tokenizer, "unk_token_id") and self.audio_token_id == self.tokenizer.unk_token_id:
309
  logger.warning(
@@ -319,12 +330,14 @@ class Gemma3OmniProcessor(ProcessorMixin):
319
  self.image_token = "<image>"
320
  self.eoi_token = ""
321
  self.audio_token_str_from_user_code = "<audio_soft_token>"
322
- self.audio_token_id = -1
323
  self.full_image_sequence = ""
324
 
325
- self.prompt_audio_compression_rate = kwargs.pop("audio_prompt_compression_rate", 8)
326
- self.prompt_audio_qformer_rate = kwargs.pop("audio_prompt_qformer_rate", 1)
327
- self.prompt_audio_feat_stride = kwargs.pop("audio_prompt_feat_stride", 1)
 
 
328
  self.audio_placeholder_token = kwargs.pop("audio_placeholder_token", "<|audio_placeholder|>")
329
 
330
  def _merge_kwargs(self, KwargsClassWithDefaults, tokenizer_init_kwargs, **kwargs_from_call):
@@ -339,14 +352,14 @@ class Gemma3OmniProcessor(ProcessorMixin):
339
  if modality_key_in_call in final_kwargs:
340
  if isinstance(modality_kwargs_in_call, dict):
341
  final_kwargs[modality_key_in_call].update(modality_kwargs_in_call)
342
- elif isinstance(modality_kwargs_in_call, dict):
343
  final_kwargs[modality_key_in_call] = modality_kwargs_in_call.copy()
344
 
345
- if self.tokenizer:
346
  for modality_key in final_kwargs:
347
  modality_dict = final_kwargs[modality_key]
348
- if isinstance(modality_dict, dict):
349
- for key_in_mod_dict in list(modality_dict.keys()):
350
  if key_in_mod_dict in tokenizer_init_kwargs:
351
  value = (
352
  getattr(self.tokenizer, key_in_mod_dict)
@@ -355,148 +368,218 @@ class Gemma3OmniProcessor(ProcessorMixin):
355
  )
356
  modality_dict[key_in_mod_dict] = value
357
 
358
- if "text_kwargs" not in final_kwargs:
359
- final_kwargs["text_kwargs"] = {}
360
  final_kwargs["text_kwargs"]["truncation"] = final_kwargs["text_kwargs"].get("truncation", False)
361
  final_kwargs["text_kwargs"]["max_length"] = final_kwargs["text_kwargs"].get("max_length", DEFAULT_MAX_LENGTH)
362
 
363
  return final_kwargs
364
 
365
  def _compute_audio_embed_size(self, audio_mel_frames: int) -> int:
366
- scaled_frames = audio_mel_frames * self.prompt_audio_feat_stride
367
- result = math.ceil(scaled_frames / self.prompt_audio_compression_rate)
368
- return math.ceil(result / self.prompt_audio_qformer_rate)
369
 
370
  def __call__(
371
  self,
372
  text: Union[str, List[str]] = None,
373
  images: Optional[Any] = None,
374
  audios: Optional[Union[AudioInput, List[AudioInput]]] = None,
375
- sampling_rate: Optional[int] = None,
376
  return_tensors: Optional[Union[str, TensorType]] = None,
377
  **kwargs: Any
378
  ) -> BatchFeature:
379
  if text is None and images is None and audios is None:
380
  raise ValueError("Provide at least one of `text`, `images`, or `audios`.")
381
 
382
- final_rt = return_tensors
 
383
  merged_call_kwargs = self._merge_kwargs(
384
- Gemma3ProcessorKwargs,
385
- self.tokenizer.init_kwargs if hasattr(self.tokenizer, 'init_kwargs') else {},
386
- **kwargs
387
  )
388
 
389
- if final_rt is None:
 
390
  final_rt = merged_call_kwargs.get("text_kwargs", {}).pop("return_tensors", TensorType.PYTORCH)
391
- else:
392
  merged_call_kwargs.get("text_kwargs", {}).pop("return_tensors", None)
393
 
394
- if text is None:
395
  num_samples = 0
396
  if images is not None:
397
  _images_list = images if isinstance(images, list) and (
398
  not images or not isinstance(images[0], (int, float))) else [images]
399
  num_samples = len(_images_list)
400
  elif audios is not None:
401
- _audios_list = audios if isinstance(audios, list) else [audios]
 
 
402
  num_samples = len(_audios_list)
403
- text = [""] * num_samples if num_samples > 0 else [""]
404
 
405
- if isinstance(text, str): text = [text]
406
  if not (isinstance(text, list) and all(isinstance(t, str) for t in text)):
407
  raise ValueError("Input `text` must be a string or a list of strings.")
408
 
409
  image_features_dict = {}
410
  if images is not None:
411
  if self.image_processor is None: raise ValueError("Images provided but self.image_processor is None.")
412
- batched_images = make_nested_list_of_images(images)
413
  _img_proc_output = self.image_processor(batched_images, return_tensors=None,
414
- **merged_call_kwargs.get("images_kwargs", {}))
415
  image_features_dict = _img_proc_output.data if isinstance(_img_proc_output,
416
  BatchFeature) else _img_proc_output
417
 
418
- if len(text) == 0 and len(batched_images) > 0: text = [" ".join([self.boi_token] * len(img_batch)) for
419
- img_batch in batched_images]
420
- if len(batched_images) != len(text): raise ValueError(
421
- f"Inconsistent batch: {len(batched_images)} images, {len(text)} texts")
 
 
 
422
 
423
  num_crops_popped = image_features_dict.pop("num_crops", None)
424
  if num_crops_popped is not None:
425
  num_crops_all = to_py_obj(num_crops_popped)
426
  temp_text_img, current_crop_idx_offset = [], 0
427
  for batch_idx, (prompt, current_imgs_in_batch) in enumerate(zip(text, batched_images)):
428
- crops_for_this_batch_sample = []
429
- if num_crops_all:
430
- for _ in current_imgs_in_batch:
431
  if current_crop_idx_offset < len(num_crops_all):
432
- crops_for_this_batch_sample.append(
433
- num_crops_all[current_crop_idx_offset]); current_crop_idx_offset += 1
 
 
434
  else:
435
- crops_for_this_batch_sample.append(0)
436
- image_indexes = [m.start() for m in re.finditer(re.escape(self.boi_token), prompt)]
 
437
  processed_prompt = prompt
438
- iter_count = min(len(crops_for_this_batch_sample), len(image_indexes))
439
- for i_crop_idx in range(iter_count - 1, -1, -1):
440
- num_additional_crops = crops_for_this_batch_sample[i_crop_idx]
441
- original_token_idx = image_indexes[i_crop_idx]
442
- if num_additional_crops > 0:
443
- replacement_text = (
444
- f"Here is the original image {self.boi_token} and here are some crops to help you see better " + " ".join(
445
- [self.boi_token] * num_additional_crops))
446
- processed_prompt = processed_prompt[
447
- :original_token_idx] + replacement_text + processed_prompt[
448
- original_token_idx + len(
449
- self.boi_token):]
450
  temp_text_img.append(processed_prompt)
451
  text = temp_text_img
452
- text = [p.replace(self.boi_token, self.full_image_sequence) for p in text]
 
 
453
 
454
  audio_features_dict = {}
455
  if audios is not None:
456
  if self.audio_processor is None: raise ValueError("Audios provided but self.audio_processor is None.")
 
457
  audio_call_kwargs = merged_call_kwargs.get("audio_kwargs", {})
 
458
  if sampling_rate is not None: audio_call_kwargs["sampling_rate"] = sampling_rate
459
 
 
 
460
  _audio_proc_output = self.audio_processor(audios=audios, return_tensors=None, **audio_call_kwargs)
461
  audio_features_dict = _audio_proc_output.data
462
 
463
- new_text_with_audio, actual_mel_frames_per_sample = [], to_py_obj(
464
- audio_features_dict["audio_attention_mask"].sum(axis=-1))
465
- if len(actual_mel_frames_per_sample) != len(text): raise ValueError(
466
- f"Inconsistent batch for audio/text: {len(actual_mel_frames_per_sample)} audio, {len(text)} text.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
467
 
468
  for i, prompt in enumerate(text):
469
- num_soft_tokens = self._compute_audio_embed_size(actual_mel_frames_per_sample[i])
470
  audio_token_sequence_str = self.audio_token_str_from_user_code * num_soft_tokens
471
 
472
  if self.audio_placeholder_token in prompt:
473
- prompt = prompt.replace(self.audio_placeholder_token, audio_token_sequence_str, 1)
 
474
  else:
475
- prompt += audio_token_sequence_str
476
  new_text_with_audio.append(prompt)
477
  text = new_text_with_audio
478
 
479
  text_tokenizer_kwargs = merged_call_kwargs.get("text_kwargs", {})
480
  text_features_dict = self.tokenizer(text=text, return_tensors=None,
481
- **text_tokenizer_kwargs)
482
 
 
483
  input_ids_list_of_lists = text_features_dict["input_ids"]
 
484
  if not isinstance(input_ids_list_of_lists, list) or not (
485
  input_ids_list_of_lists and isinstance(input_ids_list_of_lists[0], list)):
486
  if isinstance(input_ids_list_of_lists, (torch.Tensor, np.ndarray)):
487
- input_ids_list_of_lists = to_py_obj(input_ids_list_of_lists)
488
  elif isinstance(input_ids_list_of_lists, list) and (
489
  not input_ids_list_of_lists or isinstance(input_ids_list_of_lists[0], int)):
490
- input_ids_list_of_lists = [input_ids_list_of_lists]
491
 
492
  token_type_ids_list = []
493
  for ids_sample in input_ids_list_of_lists:
494
- types = [0] * len(ids_sample)
495
  for j, token_id_val in enumerate(ids_sample):
496
  if self.image_token_id is not None and token_id_val == self.image_token_id:
497
- types[j] = 1
498
- elif self.audio_token_id != -1 and token_id_val == self.audio_token_id:
499
- types[j] = 2
500
  token_type_ids_list.append(types)
501
  text_features_dict["token_type_ids"] = token_type_ids_list
502
 
@@ -504,6 +587,7 @@ class Gemma3OmniProcessor(ProcessorMixin):
504
  if image_features_dict: final_batch_data.update(image_features_dict)
505
  if audio_features_dict: final_batch_data.update(audio_features_dict)
506
 
 
507
  return BatchFeature(data=final_batch_data, tensor_type=final_rt)
508
 
509
  def batch_decode(self, *args, **kwargs):
@@ -516,16 +600,29 @@ class Gemma3OmniProcessor(ProcessorMixin):
516
  def model_input_names(self) -> List[str]:
517
  input_names = set()
518
  if hasattr(self, 'tokenizer') and self.tokenizer is not None:
519
- input_names.update(self.tokenizer.model_input_names + ["token_type_ids"])
520
 
521
  if hasattr(self, 'image_processor') and self.image_processor is not None:
522
- input_names.update(self.image_processor.model_input_names)
523
-
524
- if hasattr(self, 'audio_processor') and self.audio_processor is not None and \
525
- hasattr(self.audio_processor, 'model_input_names'):
526
- input_names.update(self.audio_processor.model_input_names)
527
- elif hasattr(self,
528
- 'audio_processor') and self.audio_processor is not None:
529
- input_names.update(["audio_values", "audio_attention_mask"])
 
 
 
 
 
 
 
530
 
531
  return list(input_names)
 
1
  import re
2
+ from typing import List, Optional, Union, Dict, Any, Tuple # Added Tuple
3
 
4
  import math
5
  import numpy as np
6
  import scipy.signal
7
  import torch
8
  from torch.nn.utils.rnn import pad_sequence
9
+ from transformers.audio_utils import AudioInput # type: ignore
10
  from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
11
  from transformers.feature_extraction_utils import BatchFeature
12
+ from transformers.image_utils import make_nested_list_of_images # If image processing is used
13
  from transformers.processing_utils import ProcessorMixin, ProcessingKwargs, ImagesKwargs
14
  from transformers.utils import TensorType, to_py_obj, logging
15
 
16
  # Constants
17
  DEFAULT_SAMPLING_RATE = 16000
18
  DEFAULT_N_FFT = 512
19
+ DEFAULT_WIN_LENGTH = 400 # Matches Phi4M's 16kHz win_length for reference
20
+ DEFAULT_HOP_LENGTH = 160 # Matches Phi4M's 16kHz hop_length for reference
21
  DEFAULT_N_MELS = 80
22
+ DEFAULT_COMPRESSION_RATE = 4 # Generic default
23
+ DEFAULT_QFORMER_RATE = 2 # Generic default
24
+ DEFAULT_FEAT_STRIDE = 4 # Generic default
25
  IMAGE_TOKEN_PATTERN = r"<\|image_\d+\|>"
26
  AUDIO_TOKEN_PATTERN = r"<\|audio_\d+\|>"
27
  DEFAULT_MAX_LENGTH = 16384
28
+ # LOG_MEL_CLIP_EPSILON = 1e-5 # Original B's constant, A clips at 1.0
29
 
30
  logger = logging.get_logger(__name__)
31
 
32
 
33
+ # This create_mel_filterbank function is from your original Snippet B.
34
+ # It will be used by the Gemma3AudioFeatureExtractor.
35
  def create_mel_filterbank(sampling_rate: int, n_fft: int, n_mels: int, fmin: float = 0.0,
36
  fmax: Optional[float] = None) -> np.ndarray:
37
  """Create Mel filterbank for audio processing."""
38
  fmax = fmax or sampling_rate / 2.0
39
 
40
+ def hz_to_mel(f: float) -> float: # Slaney scale from Snippet B
41
  return 1127.0 * math.log(1 + f / 700.0)
42
 
43
  if fmin >= fmax:
44
  raise ValueError(f"fmin ({fmin}) must be smaller than fmax ({fmax}).")
45
 
46
  mel_points = np.linspace(hz_to_mel(fmin), hz_to_mel(fmax), n_mels + 2)
47
+ freq_points = 700.0 * (np.exp(mel_points / 1127.0) - 1) # Inverse of Slaney hz_to_mel
48
 
49
  freq_points = np.clip(freq_points, 0, sampling_rate / 2.0)
50
  bins = np.floor((n_fft + 1) * freq_points / sampling_rate).astype(int)
51
+ bins = np.clip(bins, 0, n_fft // 2) # Max index for rfft output is n_fft//2
52
 
53
  filterbank = np.zeros((n_mels, n_fft // 2 + 1), dtype=np.float32)
54
  for m_idx in range(n_mels):
55
  left, center, right = bins[m_idx], bins[m_idx + 1], bins[m_idx + 2]
56
 
57
+ if center > left: # Rising slope
58
  filterbank[m_idx, left:center + 1] = (np.arange(left, center + 1) - left) / (center - left)
59
+ if right > center: # Falling slope
60
  filterbank[m_idx, center:right + 1] = (right - np.arange(center, right + 1)) / (right - center)
61
 
62
  # Ensure the peak at 'center' is 1.0 if it's a valid point.
63
+ # This logic is from original Snippet B. Phi4M's speechlib_mel might normalize differently.
64
+ if left <= center <= right: # Check if center is within the bounds of the filter
65
+ if filterbank.shape[1] > center: # Check if center index is within filterbank columns
66
+ if (center > left and filterbank[m_idx, center] < 1.0 and center < right) or \
67
+ (left == center and center < right) or \
68
+ (right == center and left < center): # Ensure it's a triangular filter with a slope
69
  filterbank[m_idx, center] = 1.0
70
+ elif left == center and right == center: # Handles the case of a filter with zero width if bins are identical
71
+ filterbank[m_idx, center] = 1.0
72
+
73
  return filterbank
74
 
75
 
76
+ # --- Start of Refactored Audio Feature Extractor (to match Phi4M - Snippet A) ---
77
  class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
78
+ model_input_names = ["input_audio_embeds", "audio_embed_sizes", "audio_attention_mask"]
79
+
80
+ def __init__(self, audio_compression_rate, audio_downsample_rate, audio_feat_stride, **kwargs):
81
+ feature_size = 80 # From Phi4M
82
+ sampling_rate = 16000 # From Phi4M (target sampling rate)
83
+ padding_value = 0.0 # From Phi4M
84
+ super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
85
+
86
+ self.compression_rate = audio_compression_rate
87
+ self.qformer_compression_rate = audio_downsample_rate # In Phi4M, audio_downsample_rate is qformer_compression_rate
88
+ self.feat_stride = audio_feat_stride
89
+
90
+ self._eightk_method = kwargs.get("eightk_method", "fillzero") # 'fillzero' or 'resample'
91
+
92
+ # Using the provided create_mel_filterbank (Slaney scale)
93
+ # Parameters for Mel filterbank match Phi4M's speechlib_mel call
94
+ self._mel = create_mel_filterbank(
95
+ sampling_rate=16000, # Target sampling rate
96
+ n_fft=512, # n_fft for 16kHz audio in Phi4M
97
+ n_mels=80, # feature_size
98
+ fmin=0.0, # Phi4M's fmin is None, typically defaults to 0
99
+ fmax=7690.0 # Specific fmax from Phi4M
100
+ ).T
101
+ self._hamming400 = np.hamming(400) # for 16k audio, from Phi4M
102
+ self._hamming200 = np.hamming(200) # for 8k audio, from Phi4M
103
 
104
+ def __call__(
105
  self,
106
+ audios: List[Union[AudioInput, Tuple[np.ndarray, int]]], # More specific type hint
107
+ return_tensors: Optional[Union[str, TensorType]] = None,
108
  ):
109
+ returned_input_audio_embeds = []
110
+ returned_audio_embed_sizes = []
111
+ audio_frames_list = [] # Stores num_mel_frames * feat_stride for each audio item
112
+
113
+ for audio_input_item in audios:
114
+ if not isinstance(audio_input_item, tuple) or len(audio_input_item) != 2:
115
+ raise ValueError(
116
+ "Each item in 'audios' must be a tuple (waveform: np.ndarray, sample_rate: int)."
117
+ )
118
+ audio_data, sample_rate = audio_input_item
119
 
120
+ if isinstance(audio_data, list): # Convert list to ndarray
121
+ audio_data = np.array(audio_data, dtype=np.float32)
122
+ if not isinstance(audio_data, np.ndarray):
123
+ raise TypeError(f"Waveform data must be a numpy array, got {type(audio_data)}")
124
 
125
+ audio_embeds_np = self._extract_features(audio_data, sample_rate) # log_fbank
126
 
127
+ num_mel_frames = audio_embeds_np.shape[0]
128
+ current_audio_frames = num_mel_frames * self.feat_stride # Phi4M logic
129
 
130
+ audio_embed_size = self._compute_audio_embed_size(current_audio_frames)
131
 
132
+ returned_input_audio_embeds.append(torch.from_numpy(audio_embeds_np))
133
+ returned_audio_embed_sizes.append(torch.tensor(audio_embed_size).long())
134
+ audio_frames_list.append(current_audio_frames)
135
 
136
+ padded_input_audio_embeds = pad_sequence(
137
+ returned_input_audio_embeds, batch_first=True, padding_value=self.padding_value
138
+ )
139
+ stacked_audio_embed_sizes = torch.stack(returned_audio_embed_sizes, dim=0)
140
+
141
+ tensor_audio_frames_list = torch.tensor(audio_frames_list, dtype=torch.long)
142
+
143
+ max_audio_frames = 0
144
+ if len(audios) > 0 and tensor_audio_frames_list.numel() > 0:
145
+ max_audio_frames = tensor_audio_frames_list.max().item()
146
+
147
+ returned_audio_attention_mask = None
148
+ if max_audio_frames > 0: # Create mask only if there are frames
149
+ if len(audios) > 1:
150
+ returned_audio_attention_mask = torch.arange(0, max_audio_frames,
151
+ device=tensor_audio_frames_list.device).unsqueeze(
152
+ 0) < tensor_audio_frames_list.unsqueeze(1)
153
+ elif len(audios) == 1: # For batch size 1
154
+ returned_audio_attention_mask = torch.ones(1, max_audio_frames, dtype=torch.bool,
155
+ device=tensor_audio_frames_list.device)
156
+
157
+ data = {
158
+ "input_audio_embeds": padded_input_audio_embeds,
159
+ "audio_embed_sizes": stacked_audio_embed_sizes,
160
+ }
161
+ if returned_audio_attention_mask is not None:
162
+ data["audio_attention_mask"] = returned_audio_attention_mask
163
 
164
+ return BatchFeature(data=data, tensor_type=return_tensors)
165
 
166
+ def _extract_spectrogram(self, wav: np.ndarray, fs: int) -> np.ndarray:
167
+ if wav.ndim > 1:
168
+ wav = np.squeeze(wav)
169
+ if len(wav.shape) == 2: # stereo to mono
170
+ wav = wav.mean(axis=1).astype(np.float32) # Ensure float32 after mean
171
+
172
+ wav = wav.astype(np.float32) # Ensure wav is float32
173
+
174
+ # Phi4M Resampling logic
175
+ if fs > self.sampling_rate: # self.sampling_rate is 16000
176
+ wav = scipy.signal.resample_poly(wav, self.sampling_rate, fs)
177
+ fs = self.sampling_rate
178
+ elif 8000 < fs < self.sampling_rate:
179
+ wav = scipy.signal.resample_poly(wav, 8000, fs) # Resample to 8000 first
180
+ fs = 8000
181
+ elif fs < 8000 and fs > 0:
182
+ logger.warning(f"Sample rate {fs} is less than 8000Hz. Resampling to 8000Hz.")
183
+ wav = scipy.signal.resample_poly(wav, 8000, fs)
184
+ fs = 8000
185
+ elif fs <= 0:
186
+ raise RuntimeError(f"Unsupported sample rate {fs}")
187
+
188
+ if fs == 8000:
189
+ if self._eightk_method == "resample":
190
+ wav = scipy.signal.resample_poly(wav, self.sampling_rate, 8000) # Resample 8k to 16k
191
+ fs = self.sampling_rate
192
+ # If "fillzero", parameters for 8k will be used, and spectrum padded later.
193
+ elif fs != self.sampling_rate: # Should be 16000 if not 8000 and _eightk_method != "resample"
194
+ raise RuntimeError(
195
+ f"Audio sample rate {fs} not supported after initial processing. Expected {self.sampling_rate} or 8000.")
196
+
197
+ preemphasis_coeff = 0.97
198
+
199
+ if fs == 8000:
200
+ n_fft, win_length, hop_length, fft_window = 256, 200, 80, self._hamming200
201
+ elif fs == 16000:
202
+ n_fft, win_length, hop_length, fft_window = 512, 400, 160, self._hamming400
203
+ else:
204
+ raise RuntimeError(f"Inconsistent fs {fs} for parameter selection.")
205
 
206
+ if len(wav) < win_length:
207
+ wav = np.pad(wav, (0, win_length - len(wav)), 'constant', constant_values=(0.0,))
208
 
209
+ num_frames = (wav.shape[0] - win_length) // hop_length + 1
210
+ if num_frames <= 0:
211
+ return np.zeros((0, n_fft // 2 + 1), dtype=np.float32)
212
 
213
+ y_frames = np.array(
214
+ [wav[i * hop_length: i * hop_length + win_length] for i in range(num_frames)],
215
+ dtype=np.float32,
216
+ )
217
 
218
+ _y_frames_rolled = np.roll(y_frames, 1, axis=1)
219
+ _y_frames_rolled[:, 0] = _y_frames_rolled[:, 1] # Phi4M specific handling
220
+ y_frames_preemphasized = (y_frames - preemphasis_coeff * _y_frames_rolled) * 32768.0
 
221
 
222
+ S = np.fft.rfft(fft_window * y_frames_preemphasized, n=n_fft, axis=1).astype(np.complex64)
 
223
 
224
+ if fs == 8000 and self._eightk_method == "fillzero":
225
+ # Pad spectrum to match 16kHz feature dimension (n_fft=512 -> 257 bins)
226
+ # Current S has (256 // 2) + 1 = 129 bins
227
+ target_bins = (512 // 2) + 1
228
+ pad_width = target_bins - S.shape[1]
229
+ # Phi4M: S = np.concatenate((S[:, 0:-1], padarray), axis=1) # Nyquist bin gets set to zero
230
+ # This means take all but last bin from 8k spectrum, then pad.
231
+ S_core = S[:, :-1]
232
+ padarray = np.zeros((S_core.shape[0], target_bins - S_core.shape[1]), dtype=S.dtype)
233
+ S = np.concatenate((S_core, padarray), axis=1)
234
 
235
+ spec = np.abs(S).astype(np.float32)
236
+ spec = np.abs(S).astype(np.float32)
+ return spec
237
 
238
+ def _extract_features(self, wav: np.ndarray, fs: int) -> np.ndarray:
239
+ spec = self._extract_spectrogram(wav, fs)
240
+ if spec.shape[0] == 0:
241
+ return np.zeros((0, self.feature_size), dtype=np.float32)
242
 
243
+ spec_power = spec ** 2
244
+ fbank_power = np.clip(spec_power.dot(self._mel), 1.0, None) # Clip at 1.0 before log (Phi4M)
245
+ log_fbank = np.log(fbank_power).astype(np.float32)
246
+ return log_fbank
247
 
248
+ def _compute_audio_embed_size(self, audio_frames: int) -> int:
249
+ # Phi4M's logic for compressed size
250
+ integer = audio_frames // self.compression_rate
251
+ remainder = audio_frames % self.compression_rate
252
+ result = integer if remainder == 0 else integer + 1
253
 
254
+ integer = result // self.qformer_compression_rate
255
+ remainder = result % self.qformer_compression_rate
256
+ result = integer if remainder == 0 else integer + 1
257
+ return result
 
258
 
 
259
 
260
+ # --- End of Refactored Audio Feature Extractor ---
 
 
261
 
262
 
263
  class Gemma3ImagesKwargs(ImagesKwargs):
 
290
  def __init__(
291
  self,
292
  image_processor=None,
293
+ audio_processor=None, # User can pass an instance of RefactoredGemma3... here
294
  tokenizer=None,
295
  chat_template=None,
296
  image_seq_length: int = 256,
 
313
  self.image_token = getattr(self.tokenizer, "image_token", "<image>")
314
  self.eoi_token = getattr(self.tokenizer, "eoi_token", "")
315
 
316
+ self.audio_token_str_from_user_code = "<audio_soft_token>" # Example
317
+ # Ensure this token is actually in the tokenizer vocab as a special token
318
  self.audio_token_id = self.tokenizer.convert_tokens_to_ids(self.audio_token_str_from_user_code)
319
  if hasattr(self.tokenizer, "unk_token_id") and self.audio_token_id == self.tokenizer.unk_token_id:
320
  logger.warning(
 
330
  self.image_token = "<image>"
331
  self.eoi_token = ""
332
  self.audio_token_str_from_user_code = "<audio_soft_token>"
333
+ self.audio_token_id = -1 # Placeholder if tokenizer is missing
334
  self.full_image_sequence = ""
335
 
336
+ # These attributes are specific to Gemma3OmniProcessor for its internal _compute_audio_embed_size
337
+ self.prompt_audio_compression_rate = kwargs.pop("prompt_audio_compression_rate", DEFAULT_COMPRESSION_RATE)
338
+ self.prompt_audio_qformer_rate = kwargs.pop("prompt_audio_qformer_rate", DEFAULT_QFORMER_RATE)
339
+ # self.prompt_audio_feat_stride = kwargs.pop("prompt_audio_feat_stride", DEFAULT_FEAT_STRIDE) # Not used by its _compute_audio_embed_size
340
+
341
  self.audio_placeholder_token = kwargs.pop("audio_placeholder_token", "<|audio_placeholder|>")
342
 
343
  def _merge_kwargs(self, KwargsClassWithDefaults, tokenizer_init_kwargs, **kwargs_from_call):
 
352
  if modality_key_in_call in final_kwargs:
353
  if isinstance(modality_kwargs_in_call, dict):
354
  final_kwargs[modality_key_in_call].update(modality_kwargs_in_call)
355
+ elif isinstance(modality_kwargs_in_call, dict): # New modality not in defaults
356
  final_kwargs[modality_key_in_call] = modality_kwargs_in_call.copy()
357
 
358
+ if self.tokenizer: # Ensure tokenizer exists before accessing its attributes
359
  for modality_key in final_kwargs:
360
  modality_dict = final_kwargs[modality_key]
361
+ if isinstance(modality_dict, dict): # Check if it's a dictionary
362
+ for key_in_mod_dict in list(modality_dict.keys()): # Iterate over keys
363
  if key_in_mod_dict in tokenizer_init_kwargs:
364
  value = (
365
  getattr(self.tokenizer, key_in_mod_dict)
 
368
  )
369
  modality_dict[key_in_mod_dict] = value
370
 
371
+ if "text_kwargs" not in final_kwargs: final_kwargs["text_kwargs"] = {} # Ensure text_kwargs exists
 
372
  final_kwargs["text_kwargs"]["truncation"] = final_kwargs["text_kwargs"].get("truncation", False)
373
  final_kwargs["text_kwargs"]["max_length"] = final_kwargs["text_kwargs"].get("max_length", DEFAULT_MAX_LENGTH)
374
 
375
  return final_kwargs
376
 
377
  def _compute_audio_embed_size(self, audio_mel_frames: int) -> int:
378
+ # This method is part of Gemma3OmniProcessor.
379
+ # It calculates a number of soft tokens based on its own compression rates.
380
+ # Note: `audio_mel_frames` here is the number of raw Mel frames from the feature extractor's perspective
381
+ # if the attention mask sum is directly used before feat_stride scaling by the processor.
382
+ # However, if using the Refactored processor, audio_attention_mask.sum() will yield
383
+ # num_mel_frames * feat_stride. This method should then correctly compress that value.
384
+
385
+ # Using prompt_audio_compression_rate and prompt_audio_qformer_rate
386
+ # which are attributes of this Gemma3OmniProcessor class.
387
+
388
+ # First compression
389
+ # audio_mel_frames here should ideally be num_actual_mel_frames * feat_stride_of_the_audio_processor
390
+ # if trying to match the number of tokens from a Phi4M-style processor.
391
+ # The refactored audio processor does this scaling internally before its own _compute_audio_embed_size.
392
+ # If actual_mel_frames_per_sample (from sum of attention_mask) *is* already scaled by feat_stride
393
+ # (as it would be if using the refactored processor's attention_mask), then this calculation is correct.
394
+
395
+ integer = audio_mel_frames // self.prompt_audio_compression_rate
396
+ remainder = audio_mel_frames % self.prompt_audio_compression_rate
397
+ result = integer if remainder == 0 else integer + 1
398
+
399
+ # Second compression
400
+ integer = result // self.prompt_audio_qformer_rate
401
+ remainder = result % self.prompt_audio_qformer_rate
402
+ result = integer if remainder == 0 else integer + 1
403
+ return result
404
 
405
  def __call__(
406
  self,
407
  text: Union[str, List[str]] = None,
408
  images: Optional[Any] = None,
409
  audios: Optional[Union[AudioInput, List[AudioInput]]] = None,
410
+ sampling_rate: Optional[int] = None, # sampling_rate for raw audio arrays
411
  return_tensors: Optional[Union[str, TensorType]] = None,
412
  **kwargs: Any
413
  ) -> BatchFeature:
414
  if text is None and images is None and audios is None:
415
  raise ValueError("Provide at least one of `text`, `images`, or `audios`.")
416
 
417
+ final_rt = return_tensors # Store original return_tensors
418
+ # Properly merge kwargs for text, images, audio
419
  merged_call_kwargs = self._merge_kwargs(
420
+ Gemma3ProcessorKwargs, # The class defining _defaults
421
+ self.tokenizer.init_kwargs if hasattr(self.tokenizer, 'init_kwargs') else {}, # Tokenizer defaults
422
+ **kwargs # User-provided kwargs from the call
423
  )
424
 
425
+ # Determine final return_tensors, prioritizing call > text_kwargs > default
426
+ if final_rt is None: # If not specified in call
427
  final_rt = merged_call_kwargs.get("text_kwargs", {}).pop("return_tensors", TensorType.PYTORCH)
428
+ else: # If specified in call, remove from text_kwargs to avoid conflict
429
  merged_call_kwargs.get("text_kwargs", {}).pop("return_tensors", None)
430
 
431
+ if text is None: # If no text, create empty strings based on other inputs
432
  num_samples = 0
433
  if images is not None:
434
  _images_list = images if isinstance(images, list) and (
435
  not images or not isinstance(images[0], (int, float))) else [images]
436
  num_samples = len(_images_list)
437
  elif audios is not None:
438
+ _audios_list = audios if isinstance(audios, list) and not (
439
+ isinstance(audios[0], tuple) and isinstance(audios[0][0], (int, float))) else [
440
+ audios] # check if audios is list of items or list of (wave,sr)
441
  num_samples = len(_audios_list)
442
+ text = [""] * num_samples if num_samples > 0 else [""] # Default to one empty string if no inputs
443
 
444
+ if isinstance(text, str): text = [text] # Ensure text is a list
445
  if not (isinstance(text, list) and all(isinstance(t, str) for t in text)):
446
  raise ValueError("Input `text` must be a string or a list of strings.")
447
 
448
  image_features_dict = {}
449
  if images is not None:
450
  if self.image_processor is None: raise ValueError("Images provided but self.image_processor is None.")
451
+ # Ensure images are correctly batched
452
+ batched_images = make_nested_list_of_images(images) # handles various image input types
453
+
454
+ _img_kwargs = merged_call_kwargs.get("images_kwargs", {})
455
  _img_proc_output = self.image_processor(batched_images, return_tensors=None,
456
+ **_img_kwargs) # Pass None to handle tensors later
457
  image_features_dict = _img_proc_output.data if isinstance(_img_proc_output,
458
  BatchFeature) else _img_proc_output
459
 
460
+ if len(text) == 1 and text[0] == "" and len(
461
+ batched_images) > 0: # If text is default empty and images exist
462
+ text = [" ".join([self.boi_token] * len(img_batch)) for img_batch in batched_images]
463
+ elif len(batched_images) != len(text): # If text was provided, ensure consistency
464
+ raise ValueError(
465
+ f"Inconsistent batch: {len(batched_images)} image groups, {len(text)} texts. Ensure one text prompt per image group."
466
+ )
467
 
468
  num_crops_popped = image_features_dict.pop("num_crops", None)
469
  if num_crops_popped is not None:
470
  num_crops_all = to_py_obj(num_crops_popped)
471
  temp_text_img, current_crop_idx_offset = [], 0
472
  for batch_idx, (prompt, current_imgs_in_batch) in enumerate(zip(text, batched_images)):
473
+ crops_for_this_batch_sample = [] # Number of *additional* crops for each original image
474
+ if num_crops_all: # If num_crops_all is not None or empty
475
+ for _ in current_imgs_in_batch: # For each original image in the current batch sample
476
  if current_crop_idx_offset < len(num_crops_all):
477
+ # num_crops_all contains total items (original + crops) for each image
478
+ # We need number of *additional* crops. Assuming num_crops_all[i] >= 1
479
+ crops_for_this_batch_sample.append(max(0, num_crops_all[current_crop_idx_offset] - 1))
480
+ current_crop_idx_offset += 1
481
  else:
482
+ crops_for_this_batch_sample.append(0) # Should not happen if num_crops_all is correct
483
+
484
+ image_placeholders_in_prompt = [m.start() for m in re.finditer(re.escape(self.boi_token), prompt)]
485
  processed_prompt = prompt
486
+
487
+ # Iterate backwards to preserve indices for replacement
488
+ iter_count = min(len(crops_for_this_batch_sample), len(image_placeholders_in_prompt))
489
+ for i_placeholder_idx in range(iter_count - 1, -1, -1):
490
+ num_additional_crops_for_this_image = crops_for_this_batch_sample[i_placeholder_idx]
491
+ original_token_idx_in_prompt = image_placeholders_in_prompt[i_placeholder_idx]
492
+
493
+ if num_additional_crops_for_this_image > 0:
494
+ # Create replacement text: original image placeholder + placeholders for additional crops
495
+ replacement_text = self.boi_token + "".join(
496
+ [self.boi_token] * num_additional_crops_for_this_image)
497
+ # Replace the single original boi_token with the new sequence
498
+ processed_prompt = (
499
+ processed_prompt[:original_token_idx_in_prompt] +
500
+ replacement_text +
501
+ processed_prompt[original_token_idx_in_prompt + len(self.boi_token):]
502
+ )
503
  temp_text_img.append(processed_prompt)
504
  text = temp_text_img
505
+ # Replace all BOI tokens with the full image sequence (BOI + IMAGE*N + EOI)
506
+ # This step assumes that if additional crops were handled, self.boi_token still marks each image.
507
+ text = [p.replace(self.boi_token, self.full_image_sequence) for p in text]
508
 
509
  audio_features_dict = {}
510
  if audios is not None:
511
  if self.audio_processor is None: raise ValueError("Audios provided but self.audio_processor is None.")
512
+
513
  audio_call_kwargs = merged_call_kwargs.get("audio_kwargs", {})
514
+ # Pass sampling_rate from __call__ to audio_processor if provided (for raw arrays)
515
  if sampling_rate is not None: audio_call_kwargs["sampling_rate"] = sampling_rate
516
 
517
+ # The audio_processor (e.g., RefactoredGemma3...) will return its model_input_names
518
+ # e.g., {"input_audio_embeds", "audio_embed_sizes", "audio_attention_mask"}
519
  _audio_proc_output = self.audio_processor(audios=audios, return_tensors=None, **audio_call_kwargs)
520
  audio_features_dict = _audio_proc_output.data
521
 
522
+ new_text_with_audio = []
523
+
524
+ # Determine the number of actual audio items processed by the audio_processor
525
+ # This should match len(text) if batching is consistent.
526
+ # The 'audio_attention_mask' or 'input_audio_embeds' can indicate this.
527
+ num_audio_samples_processed = audio_features_dict[self.audio_processor.model_input_names[0]].shape[0]
528
+
529
+ if num_audio_samples_processed != len(text):
530
+ raise ValueError(
531
+ f"Inconsistent batch for audio/text: {num_audio_samples_processed} audio samples processed, {len(text)} text prompts."
532
+ )
533
+
534
+ # If using Gemma3AudioFeatureExtractor,
535
+ # "audio_embed_sizes" is already computed correctly (num compressed tokens).
536
+ # The processor's own _compute_audio_embed_size is called to determine how many
537
+ # self.audio_token_str_from_user_code to insert. Ideally, this matches.
538
+
539
+ # Get the number of frames that the processor's _compute_audio_embed_size expects.
540
+ # If the audio_processor is RefactoredGemma3..., its attention_mask is over (num_mel_frames * feat_stride).
541
+ # So, sum of that mask gives the input for this processor's _compute_audio_embed_size.
542
+ frames_for_embed_size_calc = to_py_obj(audio_features_dict[self.audio_processor.model_input_names[2]].sum(
543
+ axis=-1)) # sum of audio_attention_mask
544
 
545
  for i, prompt in enumerate(text):
546
+ # num_soft_tokens should be the final number of audio tokens to insert in the text.
547
+ # This is calculated by the Gemma3OmniProcessor's own method.
548
+ num_soft_tokens = self._compute_audio_embed_size(frames_for_embed_size_calc[i])
549
+
550
  audio_token_sequence_str = self.audio_token_str_from_user_code * num_soft_tokens
551
 
552
  if self.audio_placeholder_token in prompt:
553
+ prompt = prompt.replace(self.audio_placeholder_token, audio_token_sequence_str,
554
+ 1) # Replace only first
555
  else:
556
+ prompt += audio_token_sequence_str # Append if no placeholder
557
  new_text_with_audio.append(prompt)
558
  text = new_text_with_audio
559
 
560
  text_tokenizer_kwargs = merged_call_kwargs.get("text_kwargs", {})
561
  text_features_dict = self.tokenizer(text=text, return_tensors=None,
562
+ **text_tokenizer_kwargs) # Pass None for tensors
563
 
564
+ # Create token_type_ids
565
  input_ids_list_of_lists = text_features_dict["input_ids"]
566
+ # Ensure it's a list of lists
567
  if not isinstance(input_ids_list_of_lists, list) or not (
568
  input_ids_list_of_lists and isinstance(input_ids_list_of_lists[0], list)):
569
  if isinstance(input_ids_list_of_lists, (torch.Tensor, np.ndarray)):
570
+ input_ids_list_of_lists = to_py_obj(input_ids_list_of_lists) # to nested python lists
571
  elif isinstance(input_ids_list_of_lists, list) and (
572
  not input_ids_list_of_lists or isinstance(input_ids_list_of_lists[0], int)):
573
+ input_ids_list_of_lists = [input_ids_list_of_lists] # wrap single list
574
 
575
  token_type_ids_list = []
576
  for ids_sample in input_ids_list_of_lists:
577
+ types = [0] * len(ids_sample) # 0 for text
578
  for j, token_id_val in enumerate(ids_sample):
579
  if self.image_token_id is not None and token_id_val == self.image_token_id:
580
+ types[j] = 1 # 1 for image
581
+ elif self.audio_token_id != -1 and token_id_val == self.audio_token_id: # Check if audio_token_id is valid
582
+ types[j] = 2 # 2 for audio
583
  token_type_ids_list.append(types)
584
  text_features_dict["token_type_ids"] = token_type_ids_list
585
 
 
587
  if image_features_dict: final_batch_data.update(image_features_dict)
588
  if audio_features_dict: final_batch_data.update(audio_features_dict)
589
 
590
+ # Convert all data to tensors if final_rt is specified
591
  return BatchFeature(data=final_batch_data, tensor_type=final_rt)
592
 
593
  def batch_decode(self, *args, **kwargs):
 
600
  def model_input_names(self) -> List[str]:
601
  input_names = set()
602
  if hasattr(self, 'tokenizer') and self.tokenizer is not None:
603
+ # Make sure model_input_names is a list/set before +
604
+ tokenizer_inputs = self.tokenizer.model_input_names
605
+ if isinstance(tokenizer_inputs, (list, set)):
606
+ input_names.update(tokenizer_inputs)
607
+ else: # Fallback if it's a single string
608
+ input_names.add(str(tokenizer_inputs))
609
+ input_names.add("token_type_ids")
610
 
611
  if hasattr(self, 'image_processor') and self.image_processor is not None:
612
+ # Similar check for image_processor
613
+ image_inputs = self.image_processor.model_input_names
614
+ if isinstance(image_inputs, (list, set)):
615
+ input_names.update(image_inputs)
616
+ else:
617
+ input_names.add(str(image_inputs))
618
+
619
+ if hasattr(self, 'audio_processor') and self.audio_processor is not None:
620
+ # Use model_input_names from the instantiated audio_processor
621
+ # This will correctly reflect the names from RefactoredGemma3... if it's used.
622
+ audio_inputs = self.audio_processor.model_input_names
623
+ if isinstance(audio_inputs, (list, set)):
624
+ input_names.update(audio_inputs)
625
+ else:
626
+ input_names.add(str(audio_inputs))
627
 
628
  return list(input_names)
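
A minimal usage sketch of the refactored audio path (not part of the commit), assuming the file is importable as processing_gemma3_omni and using illustrative rate values; per the new __call__, each audio item is a (waveform, sample_rate) tuple and the output carries input_audio_embeds, audio_embed_sizes and audio_attention_mask:

import numpy as np
from processing_gemma3_omni import Gemma3AudioFeatureExtractor

extractor = Gemma3AudioFeatureExtractor(
    audio_compression_rate=8,   # illustrative assumption, not fixed by this commit
    audio_downsample_rate=1,    # illustrative assumption
    audio_feat_stride=1,        # illustrative assumption
)

wave = np.zeros(16000, dtype=np.float32)               # 1 s of 16 kHz audio
features = extractor(audios=[(wave, 16000)], return_tensors="pt")

print(features["input_audio_embeds"].shape)    # (batch, mel_frames, 80)
print(features["audio_embed_sizes"])           # soft-token count per audio item
print(features["audio_attention_mask"].shape)  # (batch, mel_frames * feat_stride)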