voidful committed
Commit 0173a9f · verified · 1 parent: 9faac02

Update processing_gemma3_omni.py

Files changed (1)
  1. processing_gemma3_omni.py +136 -138
processing_gemma3_omni.py CHANGED
@@ -7,11 +7,12 @@ import scipy.signal
7
  import torch
8
  from torch.nn.utils.rnn import pad_sequence
9
  # Using the original AudioInput for minimal change from your provided code
10
- from transformers.audio_utils import AudioInput # type: ignore
11
  from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
12
  from transformers.feature_extraction_utils import BatchFeature
13
  from transformers.image_utils import make_nested_list_of_images
14
- from transformers.processing_utils import ProcessorMixin, ProcessingKwargs, ImagesKwargs # Removed Unpack as it's not standard
15
  from transformers.utils import TensorType, to_py_obj, logging
16
 
17
  # Constants
@@ -26,7 +27,7 @@ DEFAULT_FEAT_STRIDE = 4
26
  IMAGE_TOKEN_PATTERN = r"<\|image_\d+\|>"
27
  AUDIO_TOKEN_PATTERN = r"<\|audio_\d+\|>"
28
  DEFAULT_MAX_LENGTH = 16384
29
- LOG_MEL_CLIP_EPSILON = 1e-5 # Epsilon for log mel clipping
30
 
31
  logger = logging.get_logger(__name__)
32
 
@@ -34,19 +35,18 @@ logger = logging.get_logger(__name__)
34
  def create_mel_filterbank(sampling_rate: int, n_fft: int, n_mels: int, fmin: float = 0.0,
35
  fmax: Optional[float] = None) -> np.ndarray:
36
  """Create Mel filterbank for audio processing. (User's version)"""
37
- fmax = fmax or sampling_rate / 2.0 # Ensure float division
38
 
39
  # User's Mel scale formula
40
  def hz_to_mel(f: float) -> float:
41
  return 1127.0 * math.log(1 + f / 700.0)
42
-
43
- def mel_to_hz(mel: float) -> float: # Added for completeness if needed
44
- return 700.0 * (math.exp(mel / 1127.0) - 1)
45
 
46
 
47
  mel_points = np.linspace(hz_to_mel(fmin), hz_to_mel(fmax), n_mels + 2)
48
  # freq_points = 700.0 * (np.exp(mel_points / 1127.0) - 1) # Original
49
- freq_points = mel_to_hz(mel_points) # Using the inverse function
50
 
51
  # Clip freq_points to be within [0, sampling_rate/2]
52
  freq_points = np.clip(freq_points, 0, sampling_rate / 2.0)
@@ -55,12 +55,11 @@ def create_mel_filterbank(sampling_rate: int, n_fft: int, n_mels: int, fmin: flo
55
  # Ensure bins are within valid range for rfft output indices
56
  bins = np.clip(bins, 0, n_fft // 2)
57
 
58
-
59
  filterbank = np.zeros((n_mels, n_fft // 2 + 1), dtype=np.float32)
60
- for m_idx in range(n_mels): # Loop from 0 to n_mels-1 to fill filterbank[m_idx]
61
  # Bins for (m_idx)-th filter are bins[m_idx], bins[m_idx+1], bins[m_idx+2]
62
  left, center, right = bins[m_idx], bins[m_idx + 1], bins[m_idx + 2]
63
-
64
  # Original logic for applying triangular filter
65
  # Ensure no division by zero if points coincide
66
  if center > left:
@@ -69,9 +68,8 @@ def create_mel_filterbank(sampling_rate: int, n_fft: int, n_mels: int, fmin: flo
69
  filterbank[m_idx, center:right] = (right - np.arange(center, right)) / (right - center)
70
  # If left=center or center=right, the corresponding slope is zero, which is implicitly handled.
71
  # Ensure peak is 1.0 if center is a valid point within a slope.
72
- if left <= center < right and center > left : # If center forms a peak of a valid triangle part
73
- filterbank[m_idx, center] = 1.0
74
-
75
 
76
  return filterbank
77
 
@@ -84,14 +82,14 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
84
  compression_rate: int = DEFAULT_COMPRESSION_RATE,
85
  qformer_rate: int = DEFAULT_QFORMER_RATE,
86
  feat_stride: int = DEFAULT_FEAT_STRIDE,
87
- sampling_rate: int = DEFAULT_SAMPLING_RATE, # Target sampling rate
88
  n_fft: int = DEFAULT_N_FFT,
89
  win_length: Optional[int] = None,
90
  hop_length: Optional[int] = None,
91
  n_mels: int = DEFAULT_N_MELS,
92
- f_min: float = 0.0, # Added for mel filterbank control
93
- f_max: Optional[float] = None, # Added for mel filterbank control
94
- padding_value: float = 0.0, # Explicitly define for clarity
95
  **kwargs
96
  ):
97
  _win_length = win_length if win_length is not None else n_fft
@@ -100,7 +98,7 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
100
  # feature_size is n_mels for the superclass
101
  super().__init__(
102
  feature_size=n_mels,
103
- sampling_rate=sampling_rate, # This sets self.sampling_rate
104
  padding_value=padding_value,
105
  **kwargs
106
  )
@@ -115,32 +113,32 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
115
  self.hop_length = _hop_length
116
  self.n_mels = n_mels
117
  self.f_min = f_min
118
- self.f_max = f_max # Will be sampling_rate/2 if None in create_mel_filterbank call
119
 
120
  if self.win_length > self.n_fft:
121
  logger.warning(
122
  f"win_length ({self.win_length}) is greater than n_fft ({self.n_fft}). "
123
  "Window will be applied, then data will be zero-padded/truncated to n_fft by np.fft.rfft."
124
  )
125
- self.window = np.hamming(self.win_length).astype(np.float32) # Or scipy.signal.get_window("hann", self.win_length)
126
  self.mel_filterbank = create_mel_filterbank(
127
  self.sampling_rate, self.n_fft, self.n_mels, fmin=self.f_min, fmax=self.f_max
128
- ).T # Transpose for dot product: (n_fft // 2 + 1, n_mels)
129
-
130
 
131
  def __call__(
132
  self,
133
- audios: Union[AudioInput, List[AudioInput]], # Accept single or list
134
- sampling_rate: Optional[int] = None, # To specify SR if audios are raw arrays
135
  return_tensors: Union[TensorType, str, None] = TensorType.PYTORCH
136
  ) -> BatchFeature:
137
-
138
  if not isinstance(audios, list):
139
  audios = [audios]
140
 
141
  processed_mels: List[torch.Tensor] = []
142
  actual_mel_lengths: List[int] = []
143
-
144
  # Kept from user's code - their purpose might be for token calculation downstream
145
  sizes_for_embed_length: List[torch.Tensor] = []
146
  frames_scaled_by_feat_stride: List[int] = []
@@ -151,7 +149,7 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
151
 
152
  if isinstance(audio_item, tuple) and len(audio_item) == 2 and isinstance(audio_item[1], int):
153
  current_wav, source_sr = audio_item
154
- current_wav = np.asarray(current_wav, dtype=np.float32) # Ensure float32 numpy array
155
  elif isinstance(audio_item, (np.ndarray, list)):
156
  current_wav = np.asarray(audio_item, dtype=np.float32)
157
  if sampling_rate is None:
@@ -170,13 +168,13 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
170
  f"Unsupported audio input type: {type(audio_item)}. "
171
  "Expected np.ndarray, list of floats, or Tuple[np.ndarray, int]."
172
  )
173
-
174
  processed_wav_array = self._preprocess_audio(current_wav, source_sr)
175
- mel_spectrogram = self._compute_log_mel_spectrogram(processed_wav_array) # Shape: (T_mel, N_Mels)
176
-
177
- feature_tensor = torch.from_numpy(mel_spectrogram) # Already float32
178
  processed_mels.append(feature_tensor)
179
- actual_mel_lengths.append(feature_tensor.shape[0]) # T_mel for this item
180
 
181
  # User's original logic for 'sizes' and 'frames'
182
  sizes_for_embed_length.append(torch.tensor(self._calculate_embed_length(feature_tensor.shape[0])))
@@ -188,16 +186,16 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
188
 
189
  # Create attention mask corresponding to the actual lengths of mel spectrograms
190
  max_t_mel_in_batch = audio_embeds.shape[1]
191
- current_device = audio_embeds.device # Get device from padded tensor if using PyTorch tensors earlier
192
-
193
  # Create attention mask directly based on actual_mel_lengths
194
  attention_mask = torch.zeros(len(audios), max_t_mel_in_batch, dtype=torch.bool, device=current_device)
195
  for i, length in enumerate(actual_mel_lengths):
196
  attention_mask[i, :length] = True
197
-
198
  output_data = {
199
  "audio_values": audio_embeds,
200
- "audio_attention_mask": attention_mask # Correctly shaped mask for audio_values
201
  }
202
 
203
  # Include user's 'sizes' if they are needed downstream
@@ -211,7 +209,7 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
211
  # Ensure wav is float32
212
  if wav.dtype not in [np.float32, np.float64]:
213
  if np.issubdtype(wav.dtype, np.integer):
214
- max_val = np.iinfo(wav.dtype).max if wav.size > 0 else 1.0 # Avoid error on empty array
215
  wav = wav.astype(np.float32) / max_val
216
  else:
217
  wav = wav.astype(np.float32)
@@ -219,20 +217,20 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
219
  wav = wav.astype(np.float32)
220
 
221
  if wav.ndim > 1:
222
- wav = wav.mean(axis=0) # Convert to mono
223
-
224
  if source_sr != self.sampling_rate:
225
  logger.info(f"Resampling audio from {source_sr} Hz to {self.sampling_rate} Hz.")
226
  # Calculate integer up/down factors for resample_poly
227
  common_divisor = math.gcd(self.sampling_rate, source_sr)
228
  up_factor = self.sampling_rate // common_divisor
229
  down_factor = source_sr // common_divisor
230
- if up_factor != down_factor : # Only if actual resampling is needed
231
  wav = scipy.signal.resample_poly(wav, up=up_factor, down=down_factor)
232
-
233
  # Normalize amplitude to roughly [-1, 1]
234
  max_abs_val = np.abs(wav).max()
235
- if max_abs_val > 1e-7: # Avoid division by zero or tiny numbers
236
  wav = wav / max_abs_val
237
  return wav
238
 
@@ -245,10 +243,10 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
245
  # Calculate number of frames
246
  # This calculation ensures at least one frame if len(wav) == self.win_length
247
  if len(wav) >= self.win_length:
248
- num_frames = 1 + (len(wav) - self.win_length) // self.hop_length
249
- else: # Should be covered by padding, but as safeguard
250
- num_frames = 0
251
-
252
  if num_frames <= 0:
253
  logger.warning(f"Audio is too short (length {len(wav)}) to produce any frames "
254
  f"with win_length {self.win_length} and hop_length {self.hop_length}. "
@@ -263,21 +261,21 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
263
  strides=(strides * self.hop_length, strides),
264
  writeable=False
265
  )
266
- frames_data = frames_view.copy() # Important: copy after as_strided if modifying
267
-
268
- frames_data *= self.window # Apply window in-place on the copy
269
 
270
  # Compute STFT (rfft for real inputs)
271
  # n_fft determines zero-padding or truncation for FFT input from each frame
272
  spectrum = np.fft.rfft(frames_data, n=self.n_fft, axis=-1).astype(np.complex64)
273
- power = np.abs(spectrum)**2
274
-
275
- mel_spectrogram = np.dot(power, self.mel_filterbank) # (num_frames, n_mels)
276
-
277
  # Clip and take log
278
- mel_spectrogram = np.clip(mel_spectrogram, LOG_MEL_CLIP_EPSILON, None) # Use defined epsilon
279
  log_mel_spectrogram = np.log(mel_spectrogram)
280
-
281
  return log_mel_spectrogram.astype(np.float32)
282
 
283
  def _calculate_embed_length(self, frame_count: int) -> int:
@@ -286,7 +284,7 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
286
  return math.ceil(compressed / self.qformer_rate)
287
 
288
 
289
- class Gemma3ImagesKwargs(ImagesKwargs): # User's definition
290
  do_pan_and_scan: Optional[bool]
291
  pan_and_scan_min_crop_size: Optional[int]
292
  pan_and_scan_max_num_crops: Optional[int]
@@ -294,11 +292,11 @@ class Gemma3ImagesKwargs(ImagesKwargs): # User's definition
294
  do_convert_rgb: Optional[bool]
295
 
296
 
297
- class Gemma3ProcessorKwargs(ProcessingKwargs, total=False): # User's definition
298
  images_kwargs: Dict[str, Any]
299
  audio_kwargs: Dict[str, Any]
300
  # Added text_kwargs as it's commonly part of such structures
301
- text_kwargs: Optional[Dict[str, Any]] = None
302
  _defaults = {
303
  "text_kwargs": {"padding": False, "truncation": False, "max_length": DEFAULT_MAX_LENGTH},
304
  "images_kwargs": {},
@@ -308,30 +306,30 @@ class Gemma3ProcessorKwargs(ProcessingKwargs, total=False): # User's definition
308
 
309
  class Gemma3OmniProcessor(ProcessorMixin):
310
  attributes = ["image_processor", "audio_processor", "tokenizer"]
311
- valid_kwargs = ["chat_template", "image_seq_length"] # From user's code
312
 
313
  # --- FIXED CLASS ATTRIBUTES ---
314
- image_processor_class = "AutoImageProcessor" # As in user's original code
315
- audio_processor_class = Gemma3AudioFeatureExtractor # Corrected to custom class
316
- tokenizer_class = "AutoTokenizer" # As in user's original code
317
 
318
  def __init__(
319
  self,
320
- image_processor=None, # Allow None, superclass or from_pretrained handles loading via _class
321
- audio_processor=None, # Allow None or instance
322
- tokenizer=None, # Allow None or instance
323
  chat_template=None,
324
  image_seq_length: int = 256,
325
- **kwargs
326
  ):
327
  # The ProcessorMixin's __init__ will handle instantiating these if they are None,
328
  # using the respective *_class attributes.
329
  # If specific instances are passed, they will be used.
330
-
331
  # Retaining user's specific logic for setting attributes if needed,
332
  # though much of this might be handled by super() or better placed after super()
333
  self.image_seq_length = image_seq_length
334
-
335
  # These tokenizer-dependent attributes should be set *after* super().__init__
336
  # ensures self.tokenizer is populated, or if tokenizer is passed directly.
337
  # If tokenizer is None and loaded by super(), these need to be set post-super().
@@ -340,53 +338,53 @@ class Gemma3OmniProcessor(ProcessorMixin):
340
  # This is a basic placeholder; HF's from_pretrained mechanism is more robust for loading
341
  # For now, we'll assume if tokenizer is None, super() handles it or it's an error later.
342
  pass
343
- else: # Tokenizer was provided
344
- self.image_token_id = getattr(tokenizer, "image_token_id", None) # More robust with getattr
345
- self.boi_token = getattr(tokenizer, "boi_token", "<|image|>") # Defaulting if not present
346
  self.image_token = getattr(tokenizer, "image_token", "<|image|>")
347
- self.eoi_token = getattr(tokenizer, "eoi_token", "") # Added eoi_token as it was used
348
 
349
- self.audio_token = "<audio_soft_token>" # User's definition
350
  # self.expected_audio_token_id = 262143 # User's reference
351
  # The existence of this token should be ensured when the tokenizer is prepared/saved.
352
- self.audio_token_id = tokenizer.convert_tokens_to_ids(self.audio_token)
353
  # if self.audio_token_id != self.expected_audio_token_id: # User's warning
354
  # logger.warning(...)
355
  if self.audio_token_id == tokenizer.unk_token_id:
356
- logger.warning(f"Audio token '{self.audio_token}' not found in tokenizer, maps to UNK. Ensure it's added.")
357
-
358
 
359
  self.full_image_sequence = f"\n\n{self.boi_token}{''.join([self.image_token] * image_seq_length)}{self.eoi_token if hasattr(tokenizer, 'eoi_token') else ''}\n\n"
360
 
361
-
362
  # These seem specific to this processor's logic for determining audio token sequence length
363
  # It's better to initialize them here.
364
  self.audio_prompt_compression_rate = kwargs.pop("audio_prompt_compression_rate", 8)
365
  self.audio_prompt_qformer_rate = kwargs.pop("audio_prompt_qformer_rate", 1)
366
  self.audio_prompt_feat_stride = kwargs.pop("audio_prompt_feat_stride", 1)
367
 
368
-
369
  super().__init__(
370
  image_processor=image_processor,
371
  audio_processor=audio_processor,
372
  tokenizer=tokenizer,
373
  chat_template=chat_template,
374
- **kwargs # Pass remaining kwargs to super
375
  )
376
-
377
  # If tokenizer was loaded by super(), set tokenizer-dependent attributes now
378
  if not hasattr(self, 'image_token_id') and self.tokenizer is not None:
379
- self.image_token_id = getattr(self.tokenizer, "image_token_id", self.tokenizer.unk_token_id if hasattr(self.tokenizer, "unk_token_id") else None)
380
  self.boi_token = getattr(self.tokenizer, "boi_token", "<|image|>")
381
  self.image_token = getattr(self.tokenizer, "image_token", "<|image|>")
382
  self.eoi_token = getattr(self.tokenizer, "eoi_token", "")
383
  self.audio_token = "<audio_soft_token>"
384
  self.audio_token_id = self.tokenizer.convert_tokens_to_ids(self.audio_token)
385
  if self.audio_token_id == self.tokenizer.unk_token_id:
386
- logger.warning(f"Audio token '{self.audio_token}' not found in tokenizer (post-super), maps to UNK. Ensure it's added.")
 
387
  self.full_image_sequence = f"\n\n{self.boi_token}{''.join([self.image_token] * self.image_seq_length)}{self.eoi_token}\n\n"
388
 
389
-
390
  def _merge_kwargs(self, ModelProcessorKwargs, tokenizer_init_kwargs, **kwargs_from_call):
391
  # User's original _merge_kwargs logic
392
  default_kwargs = {}
@@ -400,17 +398,16 @@ class Gemma3OmniProcessor(ProcessorMixin):
400
 
401
  for modality_key_in_call, modality_kwargs_in_call in kwargs_from_call.items():
402
  if modality_key_in_call in default_kwargs:
403
- if isinstance(modality_kwargs_in_call, dict):
404
  default_kwargs[modality_key_in_call].update(modality_kwargs_in_call)
405
- elif isinstance(modality_kwargs_in_call, dict): # New modality not in defaults
406
- default_kwargs[modality_key_in_call] = modality_kwargs_in_call.copy()
407
-
408
 
409
  # Update defaults with tokenizer init kwargs (original logic)
410
- for modality_key in default_kwargs: # Iterate over current keys in default_kwargs
411
  modality_dict = default_kwargs[modality_key]
412
- if isinstance(modality_dict, dict): # Ensure it's a dict before trying to access keys
413
- for key_in_mod_dict in list(modality_dict.keys()): # Iterate over copy of keys
414
  if key_in_mod_dict in tokenizer_init_kwargs:
415
  value = (
416
  getattr(self.tokenizer, key_in_mod_dict)
@@ -418,13 +415,13 @@ class Gemma3OmniProcessor(ProcessorMixin):
418
  else tokenizer_init_kwargs[key_in_mod_dict]
419
  )
420
  modality_dict[key_in_mod_dict] = value
421
-
422
  # Ensure text_kwargs processing (original logic)
423
- if "text_kwargs" not in default_kwargs: # Ensure text_kwargs exists
424
  default_kwargs["text_kwargs"] = {}
425
  default_kwargs["text_kwargs"]["truncation"] = default_kwargs["text_kwargs"].get("truncation", False)
426
- default_kwargs["text_kwargs"]["max_length"] = default_kwargs["text_kwargs"].get("max_length", DEFAULT_MAX_LENGTH)
427
-
428
 
429
  return default_kwargs
430
 
@@ -436,14 +433,14 @@ class Gemma3OmniProcessor(ProcessorMixin):
436
  def __call__(
437
  self,
438
  images=None,
439
- text:Union[str, List[str]]=None, # text is optional but often primary
440
  # videos=None, # Removed 'videos' as it's not handled
441
  audios: Optional[Union[AudioInput, List[AudioInput]]] = None,
442
- sampling_rate: Optional[int] = None, # For audio_processor if audios are raw arrays
443
  return_tensors: Optional[Union[str, TensorType]] = None,
444
- **kwargs: Any # Replaced Unpack for broader compatibility here
445
  ) -> BatchFeature:
446
- if text is None and images is None and audios is None: # Added audios to check
447
  raise ValueError("Provide at least one of `text`, `images`, or `audios`.")
448
 
449
  # Determine final return_tensors strategy
@@ -452,11 +449,11 @@ class Gemma3OmniProcessor(ProcessorMixin):
452
  # This call to _merge_kwargs primarily populates kwargs for each modality if passed in __call__
453
  # e.g. if user calls proc(..., text_kwargs={...})
454
  merged_call_kwargs = self._merge_kwargs(
455
- Gemma3ProcessorKwargs,
456
  self.tokenizer.init_kwargs if hasattr(self.tokenizer, "init_kwargs") else {},
457
  **kwargs
458
  )
459
-
460
  # If return_tensors wasn't passed to __call__, try to get it from merged text_kwargs
461
  # and remove it from there to avoid passing it twice to tokenizer.
462
  # Default to PYTORCH if still None.
@@ -465,17 +462,17 @@ class Gemma3OmniProcessor(ProcessorMixin):
465
  else:
466
  merged_call_kwargs.get("text_kwargs", {}).pop("return_tensors", None)
467
 
468
-
469
  # Standardize text input
470
- if text is None: # If no text given, create dummy text based on other modalities
471
  num_samples = 0
472
  if images is not None:
473
- _images_list = images if isinstance(images, list) and (not images or not isinstance(images[0], (int,float))) else [images]
474
  num_samples = len(_images_list)
475
  elif audios is not None:
476
  _audios_list = audios if isinstance(audios, list) else [audios]
477
  num_samples = len(_audios_list)
478
- text = [""] * num_samples if num_samples > 0 else [""] # Fallback for safety
479
 
480
  if isinstance(text, str):
481
  text = [text]
@@ -485,19 +482,20 @@ class Gemma3OmniProcessor(ProcessorMixin):
485
  # --- Image Processing ---
486
  image_features_dict = {}
487
  if images is not None and self.image_processor is not None:
488
- batched_images = make_nested_list_of_images(images) # HF utility
489
  # Assuming image_processor returns a dict or BatchFeature. If BatchFeature, get .data
490
- _img_proc_output = self.image_processor(batched_images, return_tensors=None, **merged_call_kwargs.get("images_kwargs", {}))
491
- image_features_dict = _img_proc_output.data if isinstance(_img_proc_output, BatchFeature) else _img_proc_output
492
-
 
493
 
494
- if len(batched_images) != len(text): # Validate batch consistency
495
  raise ValueError(f"Inconsistent batch sizes: {len(batched_images)} images, {len(text)} texts")
496
 
497
  # User's original image token replacement logic (complex, depends on num_crops etc from image_processor output)
498
  # This part needs to be carefully adapted based on actual image_processor output structure
499
  # For now, a simplified placeholder for the concept:
500
- if "num_crops" in image_features_dict: # Example check
501
  num_crops_list = to_py_obj(image_features_dict.pop("num_crops"))
502
  # ... user's original logic for text modification with self.full_image_sequence ...
503
  # This was: text = [prompt.replace(self.boi_token, self.full_image_sequence) for prompt in text]
@@ -505,8 +503,8 @@ class Gemma3OmniProcessor(ProcessorMixin):
505
  # For simplicity, assuming one image sequence per text for now if an image is present.
506
  temp_text = []
507
  for i, prompt in enumerate(text):
508
- if i < len(batched_images): # if this text sample has corresponding images
509
- # Replace first boi_token or append if not found
510
  if self.boi_token in prompt:
511
  temp_text.append(prompt.replace(self.boi_token, self.full_image_sequence, 1))
512
  else:
@@ -515,14 +513,13 @@ class Gemma3OmniProcessor(ProcessorMixin):
515
  temp_text.append(prompt)
516
  text = temp_text
517
 
518
-
519
  # --- Audio Processing ---
520
  audio_features_dict = {}
521
  if audios is not None and self.audio_processor is not None:
522
  audio_call_kwargs = merged_call_kwargs.get("audio_kwargs", {})
523
  if sampling_rate is not None:
524
- audio_call_kwargs["sampling_rate"] = sampling_rate
525
-
526
  # audio_processor.__call__ returns BatchFeature, get its .data attribute for the dict
527
  _audio_proc_output = self.audio_processor(audios=audios, return_tensors=None, **audio_call_kwargs)
528
  audio_features_dict = _audio_proc_output.data
@@ -533,21 +530,22 @@ class Gemma3OmniProcessor(ProcessorMixin):
533
  actual_mel_frames_per_sample = to_py_obj(audio_features_dict["audio_attention_mask"].sum(axis=-1))
534
 
535
  if len(actual_mel_frames_per_sample) != len(text):
536
- raise ValueError(f"Inconsistent batch sizes for audio and text: {len(actual_mel_frames_per_sample)} audio samples, {len(text)} texts.")
 
537
 
538
  for i, prompt in enumerate(text):
539
  num_soft_tokens = self._compute_audio_embed_size(actual_mel_frames_per_sample[i])
540
- audio_token_sequence_str = self.audio_soft_token_str * num_soft_tokens # Repeat soft token string
541
-
542
  # Replace a placeholder or append
543
- placeholder = getattr(self, "audio_placeholder_token", "<|audio|>") # Use defined placeholder
544
  if placeholder in prompt:
545
  prompt_with_audio = prompt.replace(placeholder, audio_token_sequence_str, 1)
546
- else:
547
- prompt_with_audio = prompt + audio_token_sequence_str
548
  new_text_with_audio_tokens.append(prompt_with_audio)
549
  text = new_text_with_audio_tokens
550
-
551
  # --- Text Tokenization ---
552
  text_tokenizer_kwargs = merged_call_kwargs.get("text_kwargs", {})
553
  # Tokenize the (potentially modified) text, request lists/np arrays
@@ -562,28 +560,28 @@ class Gemma3OmniProcessor(ProcessorMixin):
562
  if isinstance(input_ids_list_of_lists, (torch.Tensor, np.ndarray)):
563
  input_ids_list_of_lists = to_py_obj(input_ids_list_of_lists)
564
  elif isinstance(input_ids_list_of_lists, list) and \
565
- (not input_ids_list_of_lists or isinstance(input_ids_list_of_lists[0], int)):
566
- input_ids_list_of_lists = [input_ids_list_of_lists] # Batch of 1
567
 
568
  mm_token_type_ids_list = []
569
  for ids_sample in input_ids_list_of_lists:
570
- type_ids_sample = [0] * len(ids_sample) # Default type 0 (text)
571
  for idx, token_id_val in enumerate(ids_sample):
572
  if self.image_token_id is not None and token_id_val == self.image_token_id:
573
- type_ids_sample[idx] = 1 # Image token type
574
- elif token_id_val == self.audio_token_id: # Compare with ID of <audio_soft_token>
575
- type_ids_sample[idx] = 2 # Audio token type
576
  mm_token_type_ids_list.append(type_ids_sample)
577
  text_features_dict["token_type_ids"] = mm_token_type_ids_list
578
-
579
  # Combine all features
580
  final_batch_data = {**text_features_dict}
581
- if image_features_dict:
582
  final_batch_data.update(image_features_dict)
583
- if audio_features_dict:
584
  final_batch_data.update(audio_features_dict)
585
-
586
- return BatchFeature(data=final_batch_data, tensor_type=final_rt) # Use determined final_rt
587
 
588
  def batch_decode(self, *args, **kwargs):
589
  return self.tokenizer.batch_decode(*args, **kwargs)
@@ -595,11 +593,11 @@ class Gemma3OmniProcessor(ProcessorMixin):
595
  def model_input_names(self):
596
  tokenizer_inputs = self.tokenizer.model_input_names + ["token_type_ids"]
597
  image_processor_inputs = []
598
- if self.image_processor is not None: # Check if image_processor exists
599
- image_processor_inputs = self.image_processor.model_input_names
600
-
601
  audio_processor_inputs = []
602
- if self.audio_processor is not None: # Check if audio_processor exists
603
  # These are the keys Gemma3AudioFeatureExtractor puts in its output BatchFeature.data
604
  audio_processor_inputs = ["audio_values", "audio_attention_mask"]
605
  # "audio_values_sizes" was in user's original Gemma3AudioFeatureExtractor output,
 
7
  import torch
8
  from torch.nn.utils.rnn import pad_sequence
9
  # Using the original AudioInput for minimal change from your provided code
10
+ from transformers.audio_utils import AudioInput # type: ignore
11
  from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
12
  from transformers.feature_extraction_utils import BatchFeature
13
  from transformers.image_utils import make_nested_list_of_images
14
+ from transformers.processing_utils import ProcessorMixin, ProcessingKwargs, \
15
+ ImagesKwargs # Removed Unpack as it's not standard
16
  from transformers.utils import TensorType, to_py_obj, logging
17
 
18
  # Constants
 
27
  IMAGE_TOKEN_PATTERN = r"<\|image_\d+\|>"
28
  AUDIO_TOKEN_PATTERN = r"<\|audio_\d+\|>"
29
  DEFAULT_MAX_LENGTH = 16384
30
+ LOG_MEL_CLIP_EPSILON = 1e-5 # Epsilon for log mel clipping
31
 
32
  logger = logging.get_logger(__name__)
33
 
 
35
  def create_mel_filterbank(sampling_rate: int, n_fft: int, n_mels: int, fmin: float = 0.0,
36
  fmax: Optional[float] = None) -> np.ndarray:
37
  """Create Mel filterbank for audio processing. (User's version)"""
38
+ fmax = fmax or sampling_rate / 2.0 # Ensure float division
39
 
40
  # User's Mel scale formula
41
  def hz_to_mel(f: float) -> float:
42
  return 1127.0 * math.log(1 + f / 700.0)
43
 
44
+ def mel_to_hz(mel: float) -> float: # Added for completeness if needed
45
+ return 700.0 * (math.exp(mel / 1127.0) - 1)
46
 
47
  mel_points = np.linspace(hz_to_mel(fmin), hz_to_mel(fmax), n_mels + 2)
48
  # freq_points = 700.0 * (np.exp(mel_points / 1127.0) - 1) # Original
49
+ freq_points = mel_to_hz(mel_points) # Using the inverse function
50
 
51
  # Clip freq_points to be within [0, sampling_rate/2]
52
  freq_points = np.clip(freq_points, 0, sampling_rate / 2.0)
 
55
  # Ensure bins are within valid range for rfft output indices
56
  bins = np.clip(bins, 0, n_fft // 2)
57
 
 
58
  filterbank = np.zeros((n_mels, n_fft // 2 + 1), dtype=np.float32)
59
+ for m_idx in range(n_mels): # Loop from 0 to n_mels-1 to fill filterbank[m_idx]
60
  # Bins for (m_idx)-th filter are bins[m_idx], bins[m_idx+1], bins[m_idx+2]
61
  left, center, right = bins[m_idx], bins[m_idx + 1], bins[m_idx + 2]
62
+
63
  # Original logic for applying triangular filter
64
  # Ensure no division by zero if points coincide
65
  if center > left:
 
68
  filterbank[m_idx, center:right] = (right - np.arange(center, right)) / (right - center)
69
  # If left=center or center=right, the corresponding slope is zero, which is implicitly handled.
70
  # Ensure peak is 1.0 if center is a valid point within a slope.
71
+ if left <= center < right and center > left: # If center forms a peak of a valid triangle part
72
+ filterbank[m_idx, center] = 1.0
73
 
74
  return filterbank
75
 
 
82
  compression_rate: int = DEFAULT_COMPRESSION_RATE,
83
  qformer_rate: int = DEFAULT_QFORMER_RATE,
84
  feat_stride: int = DEFAULT_FEAT_STRIDE,
85
+ sampling_rate: int = DEFAULT_SAMPLING_RATE, # Target sampling rate
86
  n_fft: int = DEFAULT_N_FFT,
87
  win_length: Optional[int] = None,
88
  hop_length: Optional[int] = None,
89
  n_mels: int = DEFAULT_N_MELS,
90
+ f_min: float = 0.0, # Added for mel filterbank control
91
+ f_max: Optional[float] = None, # Added for mel filterbank control
92
+ padding_value: float = 0.0, # Explicitly define for clarity
93
  **kwargs
94
  ):
95
  _win_length = win_length if win_length is not None else n_fft
 
98
  # feature_size is n_mels for the superclass
99
  super().__init__(
100
  feature_size=n_mels,
101
+ sampling_rate=sampling_rate, # This sets self.sampling_rate
102
  padding_value=padding_value,
103
  **kwargs
104
  )
 
113
  self.hop_length = _hop_length
114
  self.n_mels = n_mels
115
  self.f_min = f_min
116
+ self.f_max = f_max # Will be sampling_rate/2 if None in create_mel_filterbank call
117
 
118
  if self.win_length > self.n_fft:
119
  logger.warning(
120
  f"win_length ({self.win_length}) is greater than n_fft ({self.n_fft}). "
121
  "Window will be applied, then data will be zero-padded/truncated to n_fft by np.fft.rfft."
122
  )
123
+ self.window = np.hamming(self.win_length).astype(
124
+ np.float32) # Or scipy.signal.get_window("hann", self.win_length)
125
  self.mel_filterbank = create_mel_filterbank(
126
  self.sampling_rate, self.n_fft, self.n_mels, fmin=self.f_min, fmax=self.f_max
127
+ ).T # Transpose for dot product: (n_fft // 2 + 1, n_mels)
128
 
129
  def __call__(
130
  self,
131
+ audios: Union[AudioInput, List[AudioInput]], # Accept single or list
132
+ sampling_rate: Optional[int] = None, # To specify SR if audios are raw arrays
133
  return_tensors: Union[TensorType, str, None] = TensorType.PYTORCH
134
  ) -> BatchFeature:
135
+
136
  if not isinstance(audios, list):
137
  audios = [audios]
138
 
139
  processed_mels: List[torch.Tensor] = []
140
  actual_mel_lengths: List[int] = []
141
+
142
  # Kept from user's code - their purpose might be for token calculation downstream
143
  sizes_for_embed_length: List[torch.Tensor] = []
144
  frames_scaled_by_feat_stride: List[int] = []
 
149
 
150
  if isinstance(audio_item, tuple) and len(audio_item) == 2 and isinstance(audio_item[1], int):
151
  current_wav, source_sr = audio_item
152
+ current_wav = np.asarray(current_wav, dtype=np.float32) # Ensure float32 numpy array
153
  elif isinstance(audio_item, (np.ndarray, list)):
154
  current_wav = np.asarray(audio_item, dtype=np.float32)
155
  if sampling_rate is None:
 
168
  f"Unsupported audio input type: {type(audio_item)}. "
169
  "Expected np.ndarray, list of floats, or Tuple[np.ndarray, int]."
170
  )
171
+
172
  processed_wav_array = self._preprocess_audio(current_wav, source_sr)
173
+ mel_spectrogram = self._compute_log_mel_spectrogram(processed_wav_array) # Shape: (T_mel, N_Mels)
174
+
175
+ feature_tensor = torch.from_numpy(mel_spectrogram) # Already float32
176
  processed_mels.append(feature_tensor)
177
+ actual_mel_lengths.append(feature_tensor.shape[0]) # T_mel for this item
178
 
179
  # User's original logic for 'sizes' and 'frames'
180
  sizes_for_embed_length.append(torch.tensor(self._calculate_embed_length(feature_tensor.shape[0])))
 
186
 
187
  # Create attention mask corresponding to the actual lengths of mel spectrograms
188
  max_t_mel_in_batch = audio_embeds.shape[1]
189
+ current_device = audio_embeds.device # Get device from padded tensor if using PyTorch tensors earlier
190
+
191
  # Create attention mask directly based on actual_mel_lengths
192
  attention_mask = torch.zeros(len(audios), max_t_mel_in_batch, dtype=torch.bool, device=current_device)
193
  for i, length in enumerate(actual_mel_lengths):
194
  attention_mask[i, :length] = True
195
+
196
  output_data = {
197
  "audio_values": audio_embeds,
198
+ "audio_attention_mask": attention_mask # Correctly shaped mask for audio_values
199
  }
200
 
201
  # Include user's 'sizes' if they are needed downstream
 
209
  # Ensure wav is float32
210
  if wav.dtype not in [np.float32, np.float64]:
211
  if np.issubdtype(wav.dtype, np.integer):
212
+ max_val = np.iinfo(wav.dtype).max if wav.size > 0 else 1.0 # Avoid error on empty array
213
  wav = wav.astype(np.float32) / max_val
214
  else:
215
  wav = wav.astype(np.float32)
 
217
  wav = wav.astype(np.float32)
218
 
219
  if wav.ndim > 1:
220
+ wav = wav.mean(axis=0) # Convert to mono
221
+
222
  if source_sr != self.sampling_rate:
223
  logger.info(f"Resampling audio from {source_sr} Hz to {self.sampling_rate} Hz.")
224
  # Calculate integer up/down factors for resample_poly
225
  common_divisor = math.gcd(self.sampling_rate, source_sr)
226
  up_factor = self.sampling_rate // common_divisor
227
  down_factor = source_sr // common_divisor
228
+ if up_factor != down_factor: # Only if actual resampling is needed
229
  wav = scipy.signal.resample_poly(wav, up=up_factor, down=down_factor)
230
+
231
  # Normalize amplitude to roughly [-1, 1]
232
  max_abs_val = np.abs(wav).max()
233
+ if max_abs_val > 1e-7: # Avoid division by zero or tiny numbers
234
  wav = wav / max_abs_val
235
  return wav
236
 
 
243
  # Calculate number of frames
244
  # This calculation ensures at least one frame if len(wav) == self.win_length
245
  if len(wav) >= self.win_length:
246
+ num_frames = 1 + (len(wav) - self.win_length) // self.hop_length
247
+ else: # Should be covered by padding, but as safeguard
248
+ num_frames = 0
249
+
250
  if num_frames <= 0:
251
  logger.warning(f"Audio is too short (length {len(wav)}) to produce any frames "
252
  f"with win_length {self.win_length} and hop_length {self.hop_length}. "
 
261
  strides=(strides * self.hop_length, strides),
262
  writeable=False
263
  )
264
+ frames_data = frames_view.copy() # Important: copy after as_strided if modifying
265
+
266
+ frames_data *= self.window # Apply window in-place on the copy
267
 
268
  # Compute STFT (rfft for real inputs)
269
  # n_fft determines zero-padding or truncation for FFT input from each frame
270
  spectrum = np.fft.rfft(frames_data, n=self.n_fft, axis=-1).astype(np.complex64)
271
+ power = np.abs(spectrum) ** 2
272
+
273
+ mel_spectrogram = np.dot(power, self.mel_filterbank) # (num_frames, n_mels)
274
+
275
  # Clip and take log
276
+ mel_spectrogram = np.clip(mel_spectrogram, LOG_MEL_CLIP_EPSILON, None) # Use defined epsilon
277
  log_mel_spectrogram = np.log(mel_spectrogram)
278
+
279
  return log_mel_spectrogram.astype(np.float32)
280
 
281
  def _calculate_embed_length(self, frame_count: int) -> int:
 
284
  return math.ceil(compressed / self.qformer_rate)
285
 
286
 
287
+ class Gemma3ImagesKwargs(ImagesKwargs): # User's definition
288
  do_pan_and_scan: Optional[bool]
289
  pan_and_scan_min_crop_size: Optional[int]
290
  pan_and_scan_max_num_crops: Optional[int]
 
292
  do_convert_rgb: Optional[bool]
293
 
294
 
295
+ class Gemma3ProcessorKwargs(ProcessingKwargs, total=False): # User's definition
296
  images_kwargs: Dict[str, Any]
297
  audio_kwargs: Dict[str, Any]
298
  # Added text_kwargs as it's commonly part of such structures
299
+ text_kwargs: Optional[Dict[str, Any]] = None
300
  _defaults = {
301
  "text_kwargs": {"padding": False, "truncation": False, "max_length": DEFAULT_MAX_LENGTH},
302
  "images_kwargs": {},
 
306
 
307
  class Gemma3OmniProcessor(ProcessorMixin):
308
  attributes = ["image_processor", "audio_processor", "tokenizer"]
309
+ valid_kwargs = ["chat_template", "image_seq_length"] # From user's code
310
 
311
  # --- FIXED CLASS ATTRIBUTES ---
312
+ image_processor_class = "AutoImageProcessor" # As in user's original code
313
+ audio_processor_class = "AutoFeatureExtractor"
314
+ tokenizer_class = "AutoTokenizer" # As in user's original code
315
 
316
  def __init__(
317
  self,
318
+ image_processor=None, # Allow None, superclass or from_pretrained handles loading via _class
319
+ audio_processor=None, # Allow None or instance
320
+ tokenizer=None, # Allow None or instance
321
  chat_template=None,
322
  image_seq_length: int = 256,
323
+ **kwargs
324
  ):
325
  # The ProcessorMixin's __init__ will handle instantiating these if they are None,
326
  # using the respective *_class attributes.
327
  # If specific instances are passed, they will be used.
328
+
329
  # Retaining user's specific logic for setting attributes if needed,
330
  # though much of this might be handled by super() or better placed after super()
331
  self.image_seq_length = image_seq_length
332
+
333
  # These tokenizer-dependent attributes should be set *after* super().__init__
334
  # ensures self.tokenizer is populated, or if tokenizer is passed directly.
335
  # If tokenizer is None and loaded by super(), these need to be set post-super().
 
338
  # This is a basic placeholder; HF's from_pretrained mechanism is more robust for loading
339
  # For now, we'll assume if tokenizer is None, super() handles it or it's an error later.
340
  pass
341
+ else: # Tokenizer was provided
342
+ self.image_token_id = getattr(tokenizer, "image_token_id", None) # More robust with getattr
343
+ self.boi_token = getattr(tokenizer, "boi_token", "<|image|>") # Defaulting if not present
344
  self.image_token = getattr(tokenizer, "image_token", "<|image|>")
345
+ self.eoi_token = getattr(tokenizer, "eoi_token", "") # Added eoi_token as it was used
346
 
347
+ self.audio_token = "<audio_soft_token>" # User's definition
348
  # self.expected_audio_token_id = 262143 # User's reference
349
  # The existence of this token should be ensured when the tokenizer is prepared/saved.
350
+ self.audio_token_id = tokenizer.convert_tokens_to_ids(self.audio_token)
351
  # if self.audio_token_id != self.expected_audio_token_id: # User's warning
352
  # logger.warning(...)
353
  if self.audio_token_id == tokenizer.unk_token_id:
354
+ logger.warning(
355
+ f"Audio token '{self.audio_token}' not found in tokenizer, maps to UNK. Ensure it's added.")
356
 
357
  self.full_image_sequence = f"\n\n{self.boi_token}{''.join([self.image_token] * image_seq_length)}{self.eoi_token if hasattr(tokenizer, 'eoi_token') else ''}\n\n"
358
 
 
359
  # These seem specific to this processor's logic for determining audio token sequence length
360
  # It's better to initialize them here.
361
  self.audio_prompt_compression_rate = kwargs.pop("audio_prompt_compression_rate", 8)
362
  self.audio_prompt_qformer_rate = kwargs.pop("audio_prompt_qformer_rate", 1)
363
  self.audio_prompt_feat_stride = kwargs.pop("audio_prompt_feat_stride", 1)
364
 
 
365
  super().__init__(
366
  image_processor=image_processor,
367
  audio_processor=audio_processor,
368
  tokenizer=tokenizer,
369
  chat_template=chat_template,
370
+ **kwargs # Pass remaining kwargs to super
371
  )
372
+
373
  # If tokenizer was loaded by super(), set tokenizer-dependent attributes now
374
  if not hasattr(self, 'image_token_id') and self.tokenizer is not None:
375
+ self.image_token_id = getattr(self.tokenizer, "image_token_id",
376
+ self.tokenizer.unk_token_id if hasattr(self.tokenizer,
377
+ "unk_token_id") else None)
378
  self.boi_token = getattr(self.tokenizer, "boi_token", "<|image|>")
379
  self.image_token = getattr(self.tokenizer, "image_token", "<|image|>")
380
  self.eoi_token = getattr(self.tokenizer, "eoi_token", "")
381
  self.audio_token = "<audio_soft_token>"
382
  self.audio_token_id = self.tokenizer.convert_tokens_to_ids(self.audio_token)
383
  if self.audio_token_id == self.tokenizer.unk_token_id:
384
+ logger.warning(
385
+ f"Audio token '{self.audio_token}' not found in tokenizer (post-super), maps to UNK. Ensure it's added.")
386
  self.full_image_sequence = f"\n\n{self.boi_token}{''.join([self.image_token] * self.image_seq_length)}{self.eoi_token}\n\n"
387
 
 
388
  def _merge_kwargs(self, ModelProcessorKwargs, tokenizer_init_kwargs, **kwargs_from_call):
389
  # User's original _merge_kwargs logic
390
  default_kwargs = {}
 
398
 
399
  for modality_key_in_call, modality_kwargs_in_call in kwargs_from_call.items():
400
  if modality_key_in_call in default_kwargs:
401
+ if isinstance(modality_kwargs_in_call, dict):
402
  default_kwargs[modality_key_in_call].update(modality_kwargs_in_call)
403
+ elif isinstance(modality_kwargs_in_call, dict): # New modality not in defaults
404
+ default_kwargs[modality_key_in_call] = modality_kwargs_in_call.copy()
 
405
 
406
  # Update defaults with tokenizer init kwargs (original logic)
407
+ for modality_key in default_kwargs: # Iterate over current keys in default_kwargs
408
  modality_dict = default_kwargs[modality_key]
409
+ if isinstance(modality_dict, dict): # Ensure it's a dict before trying to access keys
410
+ for key_in_mod_dict in list(modality_dict.keys()): # Iterate over copy of keys
411
  if key_in_mod_dict in tokenizer_init_kwargs:
412
  value = (
413
  getattr(self.tokenizer, key_in_mod_dict)
 
415
  else tokenizer_init_kwargs[key_in_mod_dict]
416
  )
417
  modality_dict[key_in_mod_dict] = value
418
+
419
  # Ensure text_kwargs processing (original logic)
420
+ if "text_kwargs" not in default_kwargs: # Ensure text_kwargs exists
421
  default_kwargs["text_kwargs"] = {}
422
  default_kwargs["text_kwargs"]["truncation"] = default_kwargs["text_kwargs"].get("truncation", False)
423
+ default_kwargs["text_kwargs"]["max_length"] = default_kwargs["text_kwargs"].get("max_length",
424
+ DEFAULT_MAX_LENGTH)
425
 
426
  return default_kwargs
427
 
 
433
  def __call__(
434
  self,
435
  images=None,
436
+ text: Union[str, List[str]] = None, # text is optional but often primary
437
  # videos=None, # Removed 'videos' as it's not handled
438
  audios: Optional[Union[AudioInput, List[AudioInput]]] = None,
439
+ sampling_rate: Optional[int] = None, # For audio_processor if audios are raw arrays
440
  return_tensors: Optional[Union[str, TensorType]] = None,
441
+ **kwargs: Any # Replaced Unpack for broader compatibility here
442
  ) -> BatchFeature:
443
+ if text is None and images is None and audios is None: # Added audios to check
444
  raise ValueError("Provide at least one of `text`, `images`, or `audios`.")
445
 
446
  # Determine final return_tensors strategy
 
449
  # This call to _merge_kwargs primarily populates kwargs for each modality if passed in __call__
450
  # e.g. if user calls proc(..., text_kwargs={...})
451
  merged_call_kwargs = self._merge_kwargs(
452
+ Gemma3ProcessorKwargs,
453
  self.tokenizer.init_kwargs if hasattr(self.tokenizer, "init_kwargs") else {},
454
  **kwargs
455
  )
456
+
457
  # If return_tensors wasn't passed to __call__, try to get it from merged text_kwargs
458
  # and remove it from there to avoid passing it twice to tokenizer.
459
  # Default to PYTORCH if still None.
 
462
  else:
463
  merged_call_kwargs.get("text_kwargs", {}).pop("return_tensors", None)
464
 
 
465
  # Standardize text input
466
+ if text is None: # If no text given, create dummy text based on other modalities
467
  num_samples = 0
468
  if images is not None:
469
+ _images_list = images if isinstance(images, list) and (
470
+ not images or not isinstance(images[0], (int, float))) else [images]
471
  num_samples = len(_images_list)
472
  elif audios is not None:
473
  _audios_list = audios if isinstance(audios, list) else [audios]
474
  num_samples = len(_audios_list)
475
+ text = [""] * num_samples if num_samples > 0 else [""] # Fallback for safety
476
 
477
  if isinstance(text, str):
478
  text = [text]
 
482
  # --- Image Processing ---
483
  image_features_dict = {}
484
  if images is not None and self.image_processor is not None:
485
+ batched_images = make_nested_list_of_images(images) # HF utility
486
  # Assuming image_processor returns a dict or BatchFeature. If BatchFeature, get .data
487
+ _img_proc_output = self.image_processor(batched_images, return_tensors=None,
488
+ **merged_call_kwargs.get("images_kwargs", {}))
489
+ image_features_dict = _img_proc_output.data if isinstance(_img_proc_output,
490
+ BatchFeature) else _img_proc_output
491
 
492
+ if len(batched_images) != len(text): # Validate batch consistency
493
  raise ValueError(f"Inconsistent batch sizes: {len(batched_images)} images, {len(text)} texts")
494
 
495
  # User's original image token replacement logic (complex, depends on num_crops etc from image_processor output)
496
  # This part needs to be carefully adapted based on actual image_processor output structure
497
  # For now, a simplified placeholder for the concept:
498
+ if "num_crops" in image_features_dict: # Example check
499
  num_crops_list = to_py_obj(image_features_dict.pop("num_crops"))
500
  # ... user's original logic for text modification with self.full_image_sequence ...
501
  # This was: text = [prompt.replace(self.boi_token, self.full_image_sequence) for prompt in text]
 
503
  # For simplicity, assuming one image sequence per text for now if an image is present.
504
  temp_text = []
505
  for i, prompt in enumerate(text):
506
+ if i < len(batched_images): # if this text sample has corresponding images
507
+ # Replace first boi_token or append if not found
508
  if self.boi_token in prompt:
509
  temp_text.append(prompt.replace(self.boi_token, self.full_image_sequence, 1))
510
  else:
 
513
  temp_text.append(prompt)
514
  text = temp_text
515
 
 
516
  # --- Audio Processing ---
517
  audio_features_dict = {}
518
  if audios is not None and self.audio_processor is not None:
519
  audio_call_kwargs = merged_call_kwargs.get("audio_kwargs", {})
520
  if sampling_rate is not None:
521
+ audio_call_kwargs["sampling_rate"] = sampling_rate
522
+
523
  # audio_processor.__call__ returns BatchFeature, get its .data attribute for the dict
524
  _audio_proc_output = self.audio_processor(audios=audios, return_tensors=None, **audio_call_kwargs)
525
  audio_features_dict = _audio_proc_output.data
 
530
  actual_mel_frames_per_sample = to_py_obj(audio_features_dict["audio_attention_mask"].sum(axis=-1))
531
 
532
  if len(actual_mel_frames_per_sample) != len(text):
533
+ raise ValueError(
534
+ f"Inconsistent batch sizes for audio and text: {len(actual_mel_frames_per_sample)} audio samples, {len(text)} texts.")
535
 
536
  for i, prompt in enumerate(text):
537
  num_soft_tokens = self._compute_audio_embed_size(actual_mel_frames_per_sample[i])
538
+ audio_token_sequence_str = self.audio_soft_token_str * num_soft_tokens # Repeat soft token string
539
+
540
  # Replace a placeholder or append
541
+ placeholder = getattr(self, "audio_placeholder_token", "<|audio|>") # Use defined placeholder
542
  if placeholder in prompt:
543
  prompt_with_audio = prompt.replace(placeholder, audio_token_sequence_str, 1)
544
+ else:
545
+ prompt_with_audio = prompt + audio_token_sequence_str
546
  new_text_with_audio_tokens.append(prompt_with_audio)
547
  text = new_text_with_audio_tokens
548
+
549
  # --- Text Tokenization ---
550
  text_tokenizer_kwargs = merged_call_kwargs.get("text_kwargs", {})
551
  # Tokenize the (potentially modified) text, request lists/np arrays
 
560
  if isinstance(input_ids_list_of_lists, (torch.Tensor, np.ndarray)):
561
  input_ids_list_of_lists = to_py_obj(input_ids_list_of_lists)
562
  elif isinstance(input_ids_list_of_lists, list) and \
563
+ (not input_ids_list_of_lists or isinstance(input_ids_list_of_lists[0], int)):
564
+ input_ids_list_of_lists = [input_ids_list_of_lists] # Batch of 1
565
 
566
  mm_token_type_ids_list = []
567
  for ids_sample in input_ids_list_of_lists:
568
+ type_ids_sample = [0] * len(ids_sample) # Default type 0 (text)
569
  for idx, token_id_val in enumerate(ids_sample):
570
  if self.image_token_id is not None and token_id_val == self.image_token_id:
571
+ type_ids_sample[idx] = 1 # Image token type
572
+ elif token_id_val == self.audio_token_id: # Compare with ID of <audio_soft_token>
573
+ type_ids_sample[idx] = 2 # Audio token type
574
  mm_token_type_ids_list.append(type_ids_sample)
575
  text_features_dict["token_type_ids"] = mm_token_type_ids_list
576
+
577
  # Combine all features
578
  final_batch_data = {**text_features_dict}
579
+ if image_features_dict:
580
  final_batch_data.update(image_features_dict)
581
+ if audio_features_dict:
582
  final_batch_data.update(audio_features_dict)
583
+
584
+ return BatchFeature(data=final_batch_data, tensor_type=final_rt) # Use determined final_rt
585
 
586
  def batch_decode(self, *args, **kwargs):
587
  return self.tokenizer.batch_decode(*args, **kwargs)
 
593
  def model_input_names(self):
594
  tokenizer_inputs = self.tokenizer.model_input_names + ["token_type_ids"]
595
  image_processor_inputs = []
596
+ if self.image_processor is not None: # Check if image_processor exists
597
+ image_processor_inputs = self.image_processor.model_input_names
598
+
599
  audio_processor_inputs = []
600
+ if self.audio_processor is not None: # Check if audio_processor exists
601
  # These are the keys Gemma3AudioFeatureExtractor puts in its output BatchFeature.data
602
  audio_processor_inputs = ["audio_values", "audio_attention_mask"]
603
  # "audio_values_sizes" was in user's original Gemma3AudioFeatureExtractor output,