Update processing_gemma3_omni.py
processing_gemma3_omni.py  CHANGED  (+406 -305)

Removed lines (old file), grouped by hunk:

@@ -1,30 +1,24 @@
- from typing import List, Optional, Union, Dict, Any
-
-
- # but for now, let's define a clear supported set.
- # from transformers.audio_utils import AudioInput as HfAudioInput, load_audio
- # For this fix, we define AudioInput locally for clarity on what's handled.
- AudioInput = Union[np.ndarray, List[float], Tuple[np.ndarray, int]]
-
- from transformers.
- # For AutoImageProcessor, AutoTokenizer if needed for default loading
- from transformers import AutoImageProcessor, AutoTokenizer
- # Constants
- DEFAULT_WIN_LENGTH = 400
- DEFAULT_HOP_LENGTH = 160
@@ -32,59 +26,56 @@ DEFAULT_FEAT_STRIDE = 4
- LOG_MEL_CLIP_EPSILON = 1e-5
- # create_mel_filterbank function (assuming it's correctly defined from previous response)
- # ... (create_mel_filterbank function from the previous corrected response) ...
-     """Create Mel filterbank for audio processing."""
-     fmax = fmax or sampling_rate / 2.0
-     def hz_to_mel(f: float) -> float:  # Using HTK formula (as in librosa default)
-         return 2595.0 * math.log10(1 + f / 700.0)
-     def mel_to_hz(mel: float) -> float:
-         return 700.0 * (10 ** (mel / 2595.0) - 1)
-     freq_points =
-     filterbank = np.zeros((n_mels, n_fft // 2 + 1), dtype=np.float32)
-     for m in range(n_mels):
-         left, center, right = bins[m], bins[m + 1], bins[m + 2]
-             filterbank[
-             filterbank[
-         if left == center and right > center:  # only falling slope
-             # Ensure it doesn't double-dip if already set
-             pass
-         elif right == center and left < center:  # only rising slope
-             pass
- # Gemma3AudioFeatureExtractor class (assuming it's correctly defined from previous response)
- # ... (Gemma3AudioFeatureExtractor class from the previous corrected response) ...
@@ -93,168 +84,221 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
-         sampling_rate: int = DEFAULT_SAMPLING_RATE,
-         f_min: float = 0.0,
-         f_max: Optional[float] = None,
-         padding_value: float = 0.0,
-         kwargs.pop("padding_value", None)
-             sampling_rate=sampling_rate,
-             padding_value=
-         self.win_length =
-         self.hop_length =
-         self.f_max = f_max
-         self.window = scipy.signal.get_window("hann", self.win_length)
-         ).T
-         audios: Union[AudioInput, List[AudioInput]],
-         sampling_rate: Optional[int] = None,
-             if isinstance(
-             elif isinstance(
-                     "sampling_rate must be provided if audio inputs are raw numpy arrays or lists."
-                     f"Unsupported audio input type: {type(
-             mel_spectrogram = self._compute_log_mel_spectrogram(
-             feature_tensor = torch.from_numpy(mel_spectrogram)
-             actual_mel_lengths.append(feature_tensor.shape[0])
-             "audio_values":
-             "audio_attention_mask":
-                 max_val = np.iinfo(wav.dtype).max
-             wav = wav.mean(axis=0)
-             logger.warning(
-             strides=(
@@ -264,230 +308,282 @@ class Gemma3DummyProcessorKwargs(ProcessingKwargs, total=False): # Dummy for te
-     valid_kwargs = ["chat_template", "image_seq_length"]
-     image_processor_class = "AutoImageProcessor"
-     audio_processor_class = "AutoFeatureExtractor"
-     tokenizer_class = "AutoTokenizer"
-         audio_processor
-         audio_prompt_qformer_rate: int = 1,
-         audio_prompt_feat_stride: int = 1,
-         audio_placeholder_token: str = "<|audio_placeholder|>",
-         audio_soft_token_str: str = "<audio_soft_token>",
-         **kwargs
-             **kwargs
-         self.image_seq_length = image_seq_length
-         self.image_token_id = getattr(self.tokenizer, "image_token_id",
-                                       self.tokenizer.unk_token_id if hasattr(self.tokenizer, "unk_token_id") else None)
-         self.boi_token = getattr(self.tokenizer, "boi_token", "<|image|>")
-         self.image_token = getattr(self.tokenizer, "image_token", "<|image|>")
-         self.eoi_token = getattr(self.tokenizer, "eoi_token", "")
-         self.audio_placeholder_token = audio_placeholder_token
-         self.audio_soft_token_str = audio_soft_token_str
-         if self.audio_soft_token_id == self.tokenizer.unk_token_id:  # Check if UNK
-             logger.warning(
-                 f"The audio soft token string '{self.audio_soft_token_str}' maps to UNK token (ID: {self.audio_soft_token_id}). "
-                 "Ensure it is added to the tokenizer's vocabulary as a special token."
-             )
-         self.
-         self.audio_prompt_feat_stride = audio_prompt_feat_stride
-     def _merge_kwargs(self, KwargsClassWithDefaults, tokenizer_init_kwargs, **kwargs_passed_to_call):
-         final_kwargs = {}
-         # Initialize with _defaults from the Kwargs class
-         # Ensure KwargsClassWithDefaults has a _defaults attribute
-         _defaults = getattr(KwargsClassWithDefaults, "_defaults", {})
-         for modality_key, default_modality_kwargs in _defaults.items():
-             final_kwargs[modality_key] = default_modality_kwargs.copy()
-         # Override with tokenizer's init_kwargs if they exist for a given key
-         for modality_key, modality_dict in final_kwargs.items():
-             for key in list(modality_dict.keys()):
-                 if key in tokenizer_init_kwargs:
-                     modality_dict[key] = tokenizer_init_kwargs[key]
-         # Override with kwargs passed directly to __call__
-         for modality_key_from_call, modality_dict_from_call in kwargs_passed_to_call.items():
-             if modality_key_from_call in final_kwargs and isinstance(modality_dict_from_call, dict):
-                 final_kwargs[modality_key_from_call].update(modality_dict_from_call)
-             # If a new modality_kwargs (e.g., "video_kwargs") is passed, add it
-             elif modality_key_from_call not in final_kwargs and isinstance(modality_dict_from_call, dict):
-                 final_kwargs[modality_key_from_call] = modality_dict_from_call.copy()
-         # Specific handling for text_kwargs
-         if "text_kwargs" not in final_kwargs:
-             final_kwargs["text_kwargs"] = {}  # Ensure it exists
-         final_kwargs["text_kwargs"]["truncation"] = final_kwargs["text_kwargs"].get("truncation", False)
-         final_kwargs["text_kwargs"]["max_length"] = final_kwargs["text_kwargs"].get("max_length", DEFAULT_MAX_LENGTH)
-         return final_kwargs
-     def _compute_audio_prompt_token_count(self, actual_mel_frames_count: int) -> int:
-         scaled_frames = actual_mel_frames_count * self.audio_prompt_feat_stride
-         compressed_once = math.ceil(scaled_frames / self.audio_prompt_compression_rate)
-         compressed_twice = math.ceil(compressed_once / self.audio_prompt_qformer_rate)
-         return compressed_twice
-         sampling_rate: Optional[int] = None,
-         **kwargs: Any
-         if text is None and images is None and audios is None:
-         # Priority: 1. Explicit return_tensors, 2. from text_kwargs in **kwargs, 3. Default (PT)
-             self.tokenizer.init_kwargs if hasattr(self.tokenizer,
-         else:
-             _images_list = images if isinstance(images, list) and (
-                 not images or not isinstance(images[0], (int, float))) else [images]
-         text = [""] * num_samples if num_samples > 0 else [""]
-             raise ValueError("Input
-             logger.info("Processing audio...")
-             if sampling_rate:
-             audio_sample_mel_lengths = to_py_obj(audio_features_dict["audio_attention_mask"].sum(axis=-1))
-                 num_soft_tokens = self.
-                 audio_token_sequence_str = self.audio_soft_token_str * num_soft_tokens
@@ -496,12 +592,17 @@ class Gemma3OmniProcessor(ProcessorMixin):
-     def model_input_names(self)
Added and context lines (new file), grouped by hunk:

@@ -1,30 +1,24 @@
  import re
+ from typing import List, Optional, Union, Dict, Any

  import math
  import numpy as np
  import scipy.signal
  import torch
  from torch.nn.utils.rnn import pad_sequence
+ # Using the original AudioInput for minimal change from your provided code
+ from transformers.audio_utils import AudioInput  # type: ignore
  from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
  from transformers.feature_extraction_utils import BatchFeature
+ from transformers.image_utils import make_nested_list_of_images
+ from transformers.processing_utils import ProcessorMixin, ProcessingKwargs, ImagesKwargs  # Removed Unpack as it's not standard
  from transformers.utils import TensorType, to_py_obj, logging

+ # Constants
  DEFAULT_SAMPLING_RATE = 16000
  DEFAULT_N_FFT = 512
+ DEFAULT_WIN_LENGTH = 400
+ DEFAULT_HOP_LENGTH = 160
  DEFAULT_N_MELS = 80
  DEFAULT_COMPRESSION_RATE = 4
  DEFAULT_QFORMER_RATE = 2
@@ -32,59 +26,56 @@ DEFAULT_FEAT_STRIDE = 4
  IMAGE_TOKEN_PATTERN = r"<\|image_\d+\|>"
  AUDIO_TOKEN_PATTERN = r"<\|audio_\d+\|>"
  DEFAULT_MAX_LENGTH = 16384
+ LOG_MEL_CLIP_EPSILON = 1e-5  # Epsilon for log mel clipping

  logger = logging.get_logger(__name__)


  def create_mel_filterbank(sampling_rate: int, n_fft: int, n_mels: int, fmin: float = 0.0,
                            fmax: Optional[float] = None) -> np.ndarray:
+     """Create Mel filterbank for audio processing. (User's version)"""
+     fmax = fmax or sampling_rate / 2.0  # Ensure float division
+
+     # User's Mel scale formula
+     def hz_to_mel(f: float) -> float:
+         return 1127.0 * math.log(1 + f / 700.0)
+
+     def mel_to_hz(mel: float) -> float:  # Added for completeness if needed
+         return 700.0 * (np.exp(mel / 1127.0) - 1)  # np.exp (not math.exp) so it also accepts the mel_points array below

      mel_points = np.linspace(hz_to_mel(fmin), hz_to_mel(fmax), n_mels + 2)
+     # freq_points = 700.0 * (np.exp(mel_points / 1127.0) - 1)  # Original
+     freq_points = mel_to_hz(mel_points)  # Using the inverse function

+     # Clip freq_points to be within [0, sampling_rate / 2]
      freq_points = np.clip(freq_points, 0, sampling_rate / 2.0)
+
+     bins = np.floor((n_fft + 1) * freq_points / sampling_rate).astype(int)
+     # Ensure bins are within the valid range of rfft output indices
      bins = np.clip(bins, 0, n_fft // 2)

+     filterbank = np.zeros((n_mels, n_fft // 2 + 1), dtype=np.float32)
+     for m_idx in range(n_mels):  # Loop from 0 to n_mels - 1 to fill filterbank[m_idx]
+         # Bins for the m_idx-th filter are bins[m_idx], bins[m_idx + 1], bins[m_idx + 2]
+         left, center, right = bins[m_idx], bins[m_idx + 1], bins[m_idx + 2]
+
+         # Original logic for applying the triangular filter
+         # Ensure no division by zero if points coincide
          if center > left:
+             filterbank[m_idx, left:center] = (np.arange(left, center) - left) / (center - left)
          if right > center:
+             filterbank[m_idx, center:right] = (right - np.arange(center, right)) / (right - center)
+         # If left == center or center == right, the corresponding slope is zero, which is implicitly handled.
+         # Ensure the peak is 1.0 when center is a valid point within a slope.
+         if left < center < right:  # center forms the peak of a valid triangle
+             filterbank[m_idx, center] = 1.0

      return filterbank

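Note that 1127 * ln(1 + f/700) is the same Mel scale as the HTK-style 2595 * log10(1 + f/700) used in the removed version, since 2595 / ln(10) is approximately 1127.01, so switching to natural log does not move the filters. A minimal sanity check of the returned filterbank, assuming only the defaults defined at the top of this file (16 kHz, n_fft 512, 80 mels); it is not part of the module:

    import numpy as np

    fb = create_mel_filterbank(sampling_rate=16000, n_fft=512, n_mels=80)
    assert fb.shape == (80, 512 // 2 + 1)          # (n_mels, n_fft // 2 + 1)
    assert np.all(fb >= 0.0) and fb.max() <= 1.0   # triangular filters peak at 1.0
    print(fb.shape, float(fb.max()))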
  class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
      model_input_names = ["audio_values", "audio_attention_mask"]

@@ -93,168 +84,221 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
          compression_rate: int = DEFAULT_COMPRESSION_RATE,
          qformer_rate: int = DEFAULT_QFORMER_RATE,
          feat_stride: int = DEFAULT_FEAT_STRIDE,
+         sampling_rate: int = DEFAULT_SAMPLING_RATE,  # Target sampling rate
          n_fft: int = DEFAULT_N_FFT,
          win_length: Optional[int] = None,
          hop_length: Optional[int] = None,
          n_mels: int = DEFAULT_N_MELS,
+         f_min: float = 0.0,  # Added for mel filterbank control
+         f_max: Optional[float] = None,  # Added for mel filterbank control
+         padding_value: float = 0.0,  # Explicitly define for clarity
          **kwargs
      ):
+         _win_length = win_length if win_length is not None else n_fft
+         _hop_length = hop_length if hop_length is not None else _win_length // 4

+         # feature_size is n_mels for the superclass
          super().__init__(
              feature_size=n_mels,
+             sampling_rate=sampling_rate,  # This sets self.sampling_rate
+             padding_value=padding_value,
              **kwargs
          )
+
          self.compression_rate = compression_rate
          self.qformer_rate = qformer_rate
          self.feat_stride = feat_stride
+         # self.sampling_rate is now set by super()
+
          self.n_fft = n_fft
+         self.win_length = _win_length
+         self.hop_length = _hop_length
          self.n_mels = n_mels
          self.f_min = f_min
+         self.f_max = f_max  # Will be sampling_rate / 2 if None in the create_mel_filterbank call

          if self.win_length > self.n_fft:
              logger.warning(
                  f"win_length ({self.win_length}) is greater than n_fft ({self.n_fft}). "
+                 "Window will be applied, then data will be zero-padded/truncated to n_fft by np.fft.rfft."
              )
+         self.window = np.hamming(self.win_length).astype(np.float32)  # Or scipy.signal.get_window("hann", self.win_length)
          self.mel_filterbank = create_mel_filterbank(
              self.sampling_rate, self.n_fft, self.n_mels, fmin=self.f_min, fmax=self.f_max
+         ).T  # Transpose for dot product: (n_fft // 2 + 1, n_mels)

      def __call__(
          self,
+         audios: Union[AudioInput, List[AudioInput]],  # Accept a single item or a list
+         sampling_rate: Optional[int] = None,  # To specify the SR if audios are raw arrays
          return_tensors: Union[TensorType, str, None] = TensorType.PYTORCH
      ) -> BatchFeature:
          if not isinstance(audios, list):
              audios = [audios]

+         processed_mels: List[torch.Tensor] = []
          actual_mel_lengths: List[int] = []
+
+         # Kept from user's code - their purpose might be for token calculation downstream
+         sizes_for_embed_length: List[torch.Tensor] = []
+         frames_scaled_by_feat_stride: List[int] = []

+         for audio_item in audios:
+             current_wav: np.ndarray
              source_sr: int

+             if isinstance(audio_item, tuple) and len(audio_item) == 2 and isinstance(audio_item[1], int):
+                 current_wav, source_sr = audio_item
+                 current_wav = np.asarray(current_wav, dtype=np.float32)  # Ensure a float32 numpy array
+             elif isinstance(audio_item, (np.ndarray, list)):
+                 current_wav = np.asarray(audio_item, dtype=np.float32)
                  if sampling_rate is None:
                      raise ValueError(
+                         "sampling_rate must be provided if audio inputs are raw numpy arrays or lists without sr."
                      )
                  source_sr = sampling_rate
+             # Add more robust loading for paths/bytes if transformers.audio_utils.load_audio is permissible
+             # Example:
+             # elif isinstance(audio_item, (str, bytes, Path)):  # Path needs to be imported from pathlib
+             #     current_wav, sr_dict = load_audio(audio_item)  # Uses librosa or soundfile
+             #     source_sr = sr_dict["sampling_rate"]
+             #     current_wav = current_wav.astype(np.float32)
              else:
                  raise TypeError(
+                     f"Unsupported audio input type: {type(audio_item)}. "
+                     "Expected np.ndarray, list of floats, or Tuple[np.ndarray, int]."
                  )
+
+             processed_wav_array = self._preprocess_audio(current_wav, source_sr)
+             mel_spectrogram = self._compute_log_mel_spectrogram(processed_wav_array)  # Shape: (T_mel, N_Mels)
+
+             feature_tensor = torch.from_numpy(mel_spectrogram)  # Already float32
+             processed_mels.append(feature_tensor)
+             actual_mel_lengths.append(feature_tensor.shape[0])  # T_mel for this item
+
+             # User's original logic for 'sizes' and 'frames'
+             sizes_for_embed_length.append(torch.tensor(self._calculate_embed_length(feature_tensor.shape[0])))
+             frames_scaled_by_feat_stride.append(feature_tensor.shape[0] * self.feat_stride)
+
+         # Pad the mel spectrograms to form a batch
+         audio_embeds = pad_sequence(processed_mels, batch_first=True, padding_value=self.padding_value)
+         # audio_embeds shape: (Batch, Max_T_mel, N_Mels)
+
+         # Create an attention mask corresponding to the actual lengths of the mel spectrograms
+         max_t_mel_in_batch = audio_embeds.shape[1]
+         current_device = audio_embeds.device  # Get the device from the padded tensor
+
+         attention_mask = torch.zeros(len(audios), max_t_mel_in_batch, dtype=torch.bool, device=current_device)
+         for i, length in enumerate(actual_mel_lengths):
+             attention_mask[i, :length] = True
+
          output_data = {
+             "audio_values": audio_embeds,
+             "audio_attention_mask": attention_mask  # Correctly shaped mask for audio_values
          }

+         # Include user's 'sizes' if they are needed downstream
+         if sizes_for_embed_length:
+             output_data["audio_values_sizes"] = torch.stack(sizes_for_embed_length)
+         # Note: 'frames_scaled_by_feat_stride' is a list of ints, handle conversion if needed in BatchFeature

          return BatchFeature(data=output_data, tensor_type=return_tensors)
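A minimal usage sketch for the extractor as defined above, with a synthetic one-second waveform; it assumes the class instantiates with its defaults (16 kHz, n_fft 512, 80 mels) and only illustrates the shapes of the keys placed in output_data:

    import numpy as np

    extractor = Gemma3AudioFeatureExtractor()            # defaults defined in __init__
    wav = np.random.randn(16000).astype(np.float32)      # 1 s of noise at 16 kHz
    batch = extractor([wav], sampling_rate=16000, return_tensors="pt")
    print(batch["audio_values"].shape)                   # (1, T_mel, 80)
    print(batch["audio_attention_mask"].shape)           # (1, T_mel)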
      def _preprocess_audio(self, wav: np.ndarray, source_sr: int) -> np.ndarray:
+         # Ensure wav is float32
          if wav.dtype not in [np.float32, np.float64]:
              if np.issubdtype(wav.dtype, np.integer):
+                 max_val = np.iinfo(wav.dtype).max if wav.size > 0 else 1.0  # Avoid error on an empty array
                  wav = wav.astype(np.float32) / max_val
              else:
                  wav = wav.astype(np.float32)
+         elif wav.dtype == np.float64:
+             wav = wav.astype(np.float32)

          if wav.ndim > 1:
+             wav = wav.mean(axis=0)  # Convert to mono
+
          if source_sr != self.sampling_rate:
+             logger.info(f"Resampling audio from {source_sr} Hz to {self.sampling_rate} Hz.")
+             # Calculate integer up/down factors for resample_poly
+             common_divisor = math.gcd(self.sampling_rate, source_sr)
+             up_factor = self.sampling_rate // common_divisor
+             down_factor = source_sr // common_divisor
+             if up_factor != down_factor:  # Only if actual resampling is needed
                  wav = scipy.signal.resample_poly(wav, up=up_factor, down=down_factor)
+
+         # Normalize amplitude to roughly [-1, 1]
+         max_abs_val = np.abs(wav).max()
+         if max_abs_val > 1e-7:  # Avoid division by zero or tiny numbers
+             wav = wav / max_abs_val
          return wav
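For the common 48 kHz to 16 kHz case the factors work out as follows; this is only a worked illustration of the gcd-based resampling above, not part of the module:

    import math

    source_sr, target_sr = 48000, 16000
    g = math.gcd(target_sr, source_sr)        # 16000
    up, down = target_sr // g, source_sr // g
    print(up, down)                           # 1 3 -> keep every third sample after the polyphase filter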
      def _compute_log_mel_spectrogram(self, wav: np.ndarray) -> np.ndarray:
          if len(wav) < self.win_length:
+             # Pad if the audio is shorter than one window
              padding = self.win_length - len(wav)
              wav = np.pad(wav, (0, padding), mode='constant', constant_values=0.0)

+         # Calculate the number of frames
+         # This calculation ensures at least one frame if len(wav) == self.win_length
+         if len(wav) >= self.win_length:
+             num_frames = 1 + (len(wav) - self.win_length) // self.hop_length
+         else:  # Should be covered by the padding above, but as a safeguard
+             num_frames = 0
+
          if num_frames <= 0:
+             logger.warning(f"Audio is too short (length {len(wav)}) to produce any frames "
+                            f"with win_length {self.win_length} and hop_length {self.hop_length}. "
+                            "Returning empty mel spectrogram.")
              return np.zeros((0, self.n_mels), dtype=np.float32)

+         # Framing using stride_tricks
+         strides = wav.strides[0]
+         frames_view = np.lib.stride_tricks.as_strided(
              wav,
              shape=(num_frames, self.win_length),
+             strides=(strides * self.hop_length, strides),
              writeable=False
          )
+         frames_data = frames_view.copy()  # Important: copy after as_strided if modifying
+
+         frames_data *= self.window  # Apply the window in-place on the copy

+         # Compute the STFT (rfft for real inputs)
+         # n_fft determines zero-padding or truncation of each frame's FFT input
+         spectrum = np.fft.rfft(frames_data, n=self.n_fft, axis=-1).astype(np.complex64)
+         power = np.abs(spectrum) ** 2
+
+         mel_spectrogram = np.dot(power, self.mel_filterbank)  # (num_frames, n_mels)
+
+         # Clip and take the log
+         mel_spectrogram = np.clip(mel_spectrogram, LOG_MEL_CLIP_EPSILON, None)  # Use the defined epsilon
          log_mel_spectrogram = np.log(mel_spectrogram)
+
          return log_mel_spectrogram.astype(np.float32)
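With the constructor defaults above (win_length = n_fft = 512 and hop_length = 512 // 4 = 128 at 16 kHz), one second of audio yields the following frame count; this only mirrors the num_frames formula above:

    win_length, hop_length, n_samples = 512, 128, 16000
    num_frames = 1 + (n_samples - win_length) // hop_length
    print(num_frames)   # 1 + 15488 // 128 = 122 frames, each reduced to 80 mel bins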
      def _calculate_embed_length(self, frame_count: int) -> int:
+         # User's original function
          compressed = math.ceil(frame_count / self.compression_rate)
          return math.ceil(compressed / self.qformer_rate)

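As a worked example with the file defaults (compression_rate = 4, qformer_rate = 2): 122 mel frames compress to ceil(122 / 4) = 31 and then ceil(31 / 2) = 16 embedding positions, the same result as ceil(122 / 8). A quick check of that arithmetic, purely for illustration:

    import math
    frames = 122
    assert math.ceil(math.ceil(frames / 4) / 2) == math.ceil(frames / 8) == 16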
+ class Gemma3ImagesKwargs(ImagesKwargs):  # User's definition
+     do_pan_and_scan: Optional[bool]
+     pan_and_scan_min_crop_size: Optional[int]
+     pan_and_scan_max_num_crops: Optional[int]
+     pan_and_scan_min_ratio_to_activate: Optional[float]
+     do_convert_rgb: Optional[bool]
+
+
+ class Gemma3ProcessorKwargs(ProcessingKwargs, total=False):  # User's definition
      images_kwargs: Dict[str, Any]
      audio_kwargs: Dict[str, Any]
+     # Added text_kwargs as it's commonly part of such structures
+     text_kwargs: Optional[Dict[str, Any]] = None
      _defaults = {
          "text_kwargs": {"padding": False, "truncation": False, "max_length": DEFAULT_MAX_LENGTH},
          "images_kwargs": {},
@@ -264,230 +308,282 @@
  class Gemma3OmniProcessor(ProcessorMixin):
      attributes = ["image_processor", "audio_processor", "tokenizer"]
+     valid_kwargs = ["chat_template", "image_seq_length"]  # From user's code

+     # --- FIXED CLASS ATTRIBUTES ---
+     image_processor_class = "AutoImageProcessor"  # As in user's original code
+     audio_processor_class = Gemma3AudioFeatureExtractor  # Corrected to the custom class
+     tokenizer_class = "AutoTokenizer"  # As in user's original code

      def __init__(
          self,
+         image_processor=None,  # Allow None; the superclass or from_pretrained handles loading via *_class
+         audio_processor=None,  # Allow None or an instance
+         tokenizer=None,  # Allow None or an instance
          chat_template=None,
          image_seq_length: int = 256,
+         **kwargs
      ):
+         # The ProcessorMixin's __init__ will handle instantiating these if they are None,
+         # using the respective *_class attributes.
+         # If specific instances are passed, they will be used.
+
+         # Retaining user's specific logic for setting attributes if needed,
+         # though much of this might be handled by super() or better placed after super()
+         self.image_seq_length = image_seq_length
+
+         # These tokenizer-dependent attributes should be set *after* super().__init__
+         # ensures self.tokenizer is populated, or if a tokenizer is passed directly.
+         # If tokenizer is None and loaded by super(), these need to be set post-super().
+         # Assuming the tokenizer is passed as an instantiated object for this snippet for now.
+         if tokenizer is None:
+             # This is a basic placeholder; HF's from_pretrained mechanism is more robust for loading.
+             # For now, assume that if tokenizer is None, super() handles it or it's an error later.
+             pass
+         else:  # Tokenizer was provided
+             self.image_token_id = getattr(tokenizer, "image_token_id", None)  # More robust with getattr
+             self.boi_token = getattr(tokenizer, "boi_token", "<|image|>")  # Defaulting if not present
+             self.image_token = getattr(tokenizer, "image_token", "<|image|>")
+             self.eoi_token = getattr(tokenizer, "eoi_token", "")  # Added eoi_token as it was used
+
+             self.audio_token = "<audio_soft_token>"  # User's definition
+             # self.expected_audio_token_id = 262143  # User's reference
+             # The existence of this token should be ensured when the tokenizer is prepared/saved.
+             self.audio_token_id = tokenizer.convert_tokens_to_ids(self.audio_token)
+             # if self.audio_token_id != self.expected_audio_token_id:  # User's warning
+             #     logger.warning(...)
+             if self.audio_token_id == tokenizer.unk_token_id:
+                 logger.warning(f"Audio token '{self.audio_token}' not found in tokenizer, maps to UNK. Ensure it's added.")
+
+             self.full_image_sequence = f"\n\n{self.boi_token}{''.join([self.image_token] * image_seq_length)}{self.eoi_token if hasattr(tokenizer, 'eoi_token') else ''}\n\n"
+
+         # These are specific to this processor's logic for determining the audio token sequence length,
+         # so initialize them here.
+         self.audio_prompt_compression_rate = kwargs.pop("audio_prompt_compression_rate", 8)
+         self.audio_prompt_qformer_rate = kwargs.pop("audio_prompt_qformer_rate", 1)
+         self.audio_prompt_feat_stride = kwargs.pop("audio_prompt_feat_stride", 1)

          super().__init__(
              image_processor=image_processor,
              audio_processor=audio_processor,
              tokenizer=tokenizer,
              chat_template=chat_template,
+             **kwargs  # Pass remaining kwargs to super
          )
+
+         # If the tokenizer was loaded by super(), set the tokenizer-dependent attributes now
+         if not hasattr(self, 'image_token_id') and self.tokenizer is not None:
+             self.image_token_id = getattr(self.tokenizer, "image_token_id", self.tokenizer.unk_token_id if hasattr(self.tokenizer, "unk_token_id") else None)
+             self.boi_token = getattr(self.tokenizer, "boi_token", "<|image|>")
+             self.image_token = getattr(self.tokenizer, "image_token", "<|image|>")
+             self.eoi_token = getattr(self.tokenizer, "eoi_token", "")
+             self.audio_token = "<audio_soft_token>"
+             self.audio_token_id = self.tokenizer.convert_tokens_to_ids(self.audio_token)
+             if self.audio_token_id == self.tokenizer.unk_token_id:
+                 logger.warning(f"Audio token '{self.audio_token}' not found in tokenizer (post-super), maps to UNK. Ensure it's added.")
+             self.full_image_sequence = f"\n\n{self.boi_token}{''.join([self.image_token] * self.image_seq_length)}{self.eoi_token}\n\n"

+     def _merge_kwargs(self, ModelProcessorKwargs, tokenizer_init_kwargs, **kwargs_from_call):
+         # User's original _merge_kwargs logic
+         default_kwargs = {}
+         # Ensure ModelProcessorKwargs._defaults exists and is a dict
+         _defaults_attr = getattr(ModelProcessorKwargs, "_defaults", {})
+         if not isinstance(_defaults_attr, dict):
+             _defaults_attr = {}
+
+         for modality in _defaults_attr:
+             default_kwargs[modality] = _defaults_attr.get(modality, {}).copy()
+
+         for modality_key_in_call, modality_kwargs_in_call in kwargs_from_call.items():
+             if modality_key_in_call in default_kwargs:
+                 if isinstance(modality_kwargs_in_call, dict):
+                     default_kwargs[modality_key_in_call].update(modality_kwargs_in_call)
+             elif isinstance(modality_kwargs_in_call, dict):  # New modality not in defaults
+                 default_kwargs[modality_key_in_call] = modality_kwargs_in_call.copy()
+
+         # Update defaults with tokenizer init kwargs (original logic)
+         for modality_key in default_kwargs:  # Iterate over current keys in default_kwargs
+             modality_dict = default_kwargs[modality_key]
+             if isinstance(modality_dict, dict):  # Ensure it's a dict before trying to access keys
+                 for key_in_mod_dict in list(modality_dict.keys()):  # Iterate over a copy of the keys
+                     if key_in_mod_dict in tokenizer_init_kwargs:
+                         value = (
+                             getattr(self.tokenizer, key_in_mod_dict)
+                             if hasattr(self.tokenizer, key_in_mod_dict)
+                             else tokenizer_init_kwargs[key_in_mod_dict]
+                         )
+                         modality_dict[key_in_mod_dict] = value
+
+         # Ensure text_kwargs processing (original logic)
+         if "text_kwargs" not in default_kwargs:  # Ensure text_kwargs exists
+             default_kwargs["text_kwargs"] = {}
+         default_kwargs["text_kwargs"]["truncation"] = default_kwargs["text_kwargs"].get("truncation", False)
+         default_kwargs["text_kwargs"]["max_length"] = default_kwargs["text_kwargs"].get("max_length", DEFAULT_MAX_LENGTH)

+         return default_kwargs
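A small sketch of how the merge above behaves when a caller overrides one text option. Here processor stands for an already-constructed Gemma3OmniProcessor (a hypothetical instance), and the expected result follows from the _defaults of Gemma3ProcessorKwargs:

    merged = processor._merge_kwargs(
        Gemma3ProcessorKwargs,
        tokenizer_init_kwargs={},
        text_kwargs={"padding": True},
    )
    # merged["text_kwargs"] -> {"padding": True, "truncation": False, "max_length": 16384}
    # merged["images_kwargs"] keeps its (empty) default.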
+     def _compute_audio_embed_size(self, audio_mel_frames: int) -> int:
+         # Using the processor's own rates for this calculation
+         result = math.ceil((audio_mel_frames * self.audio_prompt_feat_stride) / self.audio_prompt_compression_rate)
+         return math.ceil(result / self.audio_prompt_qformer_rate)

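With the defaults popped in __init__ (compression 8, qformer 1, feat stride 1) this evaluates to ceil(frames / 8), which for whole frame counts equals the extractor's ceil(ceil(frames / 4) / 2), so the number of soft tokens written into the prompt matches the per-sample sizes the feature extractor reports. A quick check of that identity, for illustration only:

    import math
    for frames in range(1, 2000):
        assert math.ceil(frames / 8) == math.ceil(math.ceil(frames / 4) / 2)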
      def __call__(
          self,
+         images=None,
+         text: Union[str, List[str]] = None,  # text is optional but often primary
+         # videos=None,  # Removed 'videos' as it's not handled
          audios: Optional[Union[AudioInput, List[AudioInput]]] = None,
+         sampling_rate: Optional[int] = None,  # For the audio_processor if audios are raw arrays
          return_tensors: Optional[Union[str, TensorType]] = None,
+         **kwargs: Any  # Replaced Unpack for broader compatibility here
      ) -> BatchFeature:
+         if text is None and images is None and audios is None:  # Added audios to the check
              raise ValueError("Provide at least one of `text`, `images`, or `audios`.")

          # Determine final return_tensors strategy
          final_rt = return_tensors
+         # Using Gemma3ProcessorKwargs as the class that holds the _defaults structure.
+         # This call to _merge_kwargs primarily populates kwargs for each modality if passed in __call__,
+         # e.g. if the user calls proc(..., text_kwargs={...})
          merged_call_kwargs = self._merge_kwargs(
+             Gemma3ProcessorKwargs,
+             self.tokenizer.init_kwargs if hasattr(self.tokenizer, "init_kwargs") else {},
              **kwargs
          )
+
+         # If return_tensors wasn't passed to __call__, try to get it from the merged text_kwargs
+         # and remove it from there to avoid passing it twice to the tokenizer.
+         # Default to PYTORCH if still None.
+         if final_rt is None:
              final_rt = merged_call_kwargs.get("text_kwargs", {}).pop("return_tensors", TensorType.PYTORCH)
+         else:
              merged_call_kwargs.get("text_kwargs", {}).pop("return_tensors", None)

+         # Standardize text input
+         if text is None:  # If no text is given, create dummy text based on the other modalities
              num_samples = 0
              if images is not None:
+                 _images_list = images if isinstance(images, list) and (not images or not isinstance(images[0], (int, float))) else [images]
                  num_samples = len(_images_list)
              elif audios is not None:
                  _audios_list = audios if isinstance(audios, list) else [audios]
                  num_samples = len(_audios_list)
+             text = [""] * num_samples if num_samples > 0 else [""]  # Fallback for safety

          if isinstance(text, str):
              text = [text]
+         elif not (isinstance(text, list) and all(isinstance(t, str) for t in text)):
+             raise ValueError("Input text must be a string or a list of strings")

+         # --- Image Processing ---
          image_features_dict = {}
          if images is not None and self.image_processor is not None:
+             batched_images = make_nested_list_of_images(images)  # HF utility
+             # Assuming the image_processor returns a dict or BatchFeature. If BatchFeature, take .data
+             _img_proc_output = self.image_processor(batched_images, return_tensors=None, **merged_call_kwargs.get("images_kwargs", {}))
+             image_features_dict = _img_proc_output.data if isinstance(_img_proc_output, BatchFeature) else _img_proc_output
+
+             if len(batched_images) != len(text):  # Validate batch consistency
+                 raise ValueError(f"Inconsistent batch sizes: {len(batched_images)} images, {len(text)} texts")
+
+             # User's original image token replacement logic (complex, depends on num_crops etc. from the image_processor output).
+             # This part needs to be carefully adapted based on the actual image_processor output structure.
+             # For now, a simplified placeholder for the concept:
+             if "num_crops" in image_features_dict:  # Example check
+                 num_crops_list = to_py_obj(image_features_dict.pop("num_crops"))
+                 # ... user's original logic for text modification with self.full_image_sequence ...
+                 # This was: text = [prompt.replace(self.boi_token, self.full_image_sequence) for prompt in text]
+                 # Needs adapting if there are multiple images/crops per text sample.
+                 # For simplicity, assuming one image sequence per text for now if an image is present.
+             temp_text = []
+             for i, prompt in enumerate(text):
+                 if i < len(batched_images):  # if this text sample has corresponding images
+                     # Replace the first boi_token, or append if not found
+                     if self.boi_token in prompt:
+                         temp_text.append(prompt.replace(self.boi_token, self.full_image_sequence, 1))
+                     else:
+                         temp_text.append(prompt + self.full_image_sequence)
+                 else:
+                     temp_text.append(prompt)
+             text = temp_text

+         # --- Audio Processing ---
          audio_features_dict = {}
          if audios is not None and self.audio_processor is not None:
              audio_call_kwargs = merged_call_kwargs.get("audio_kwargs", {})
+             if sampling_rate is not None:
+                 audio_call_kwargs["sampling_rate"] = sampling_rate
+
+             # audio_processor.__call__ returns a BatchFeature; take its .data dict
+             _audio_proc_output = self.audio_processor(audios=audios, return_tensors=None, **audio_call_kwargs)
+             audio_features_dict = _audio_proc_output.data

+             # Modify the text to include audio soft tokens based on the actual mel lengths
+             new_text_with_audio_tokens = []
+             # audio_attention_mask is (B, Max_T_mel)
+             actual_mel_frames_per_sample = to_py_obj(audio_features_dict["audio_attention_mask"].sum(axis=-1))

+             if len(actual_mel_frames_per_sample) != len(text):
+                 raise ValueError(f"Inconsistent batch sizes for audio and text: {len(actual_mel_frames_per_sample)} audio samples, {len(text)} texts.")

              for i, prompt in enumerate(text):
+                 num_soft_tokens = self._compute_audio_embed_size(actual_mel_frames_per_sample[i])
+                 audio_token_sequence_str = self.audio_token * num_soft_tokens  # Repeat the soft token string (self.audio_token is set in __init__)
+
+                 # Replace a placeholder or append
+                 placeholder = getattr(self, "audio_placeholder_token", "<|audio|>")  # Use the defined placeholder if present
+                 if placeholder in prompt:
+                     prompt_with_audio = prompt.replace(placeholder, audio_token_sequence_str, 1)
+                 else:
+                     prompt_with_audio = prompt + audio_token_sequence_str
+                 new_text_with_audio_tokens.append(prompt_with_audio)
+             text = new_text_with_audio_tokens

+         # --- Text Tokenization ---
+         text_tokenizer_kwargs = merged_call_kwargs.get("text_kwargs", {})
+         # Tokenize the (potentially modified) text, requesting lists/np arrays
+         text_features_dict = self.tokenizer(text=text, return_tensors=None, **text_tokenizer_kwargs)
+
+         # Create token_type_ids
+         input_ids_list_of_lists = text_features_dict["input_ids"]
+         # Ensure it's a list of lists
+         if not (isinstance(input_ids_list_of_lists, list) and
+                 input_ids_list_of_lists and
+                 isinstance(input_ids_list_of_lists[0], list)):
+             if isinstance(input_ids_list_of_lists, (torch.Tensor, np.ndarray)):
+                 input_ids_list_of_lists = to_py_obj(input_ids_list_of_lists)
+             elif isinstance(input_ids_list_of_lists, list) and \
+                  (not input_ids_list_of_lists or isinstance(input_ids_list_of_lists[0], int)):
+                 input_ids_list_of_lists = [input_ids_list_of_lists]  # Batch of 1
+
+         mm_token_type_ids_list = []
+         for ids_sample in input_ids_list_of_lists:
+             type_ids_sample = [0] * len(ids_sample)  # Default type 0 (text)
+             for idx, token_id_val in enumerate(ids_sample):
+                 if self.image_token_id is not None and token_id_val == self.image_token_id:
+                     type_ids_sample[idx] = 1  # Image token type
+                 elif token_id_val == self.audio_token_id:  # Compare with the ID of <audio_soft_token>
+                     type_ids_sample[idx] = 2  # Audio token type
+             mm_token_type_ids_list.append(type_ids_sample)
+         text_features_dict["token_type_ids"] = mm_token_type_ids_list
+
+         # Combine all features
+         final_batch_data = {**text_features_dict}
+         if image_features_dict:
+             final_batch_data.update(image_features_dict)
+         if audio_features_dict:
+             final_batch_data.update(audio_features_dict)
+
+         return BatchFeature(data=final_batch_data, tensor_type=final_rt)  # Use the determined final_rt

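An end-to-end sketch of calling the processor with text plus one raw waveform. It assumes image_processor and tokenizer were already built elsewhere (for example via their own from_pretrained calls) and that the tokenizer has the "<audio_soft_token>" special token; both names are placeholders, not defined in this file:

    import numpy as np

    processor = Gemma3OmniProcessor(
        image_processor=image_processor,      # assumed, e.g. an image processor instance
        audio_processor=Gemma3AudioFeatureExtractor(),
        tokenizer=tokenizer,                  # assumed tokenizer with "<audio_soft_token>" added
    )
    wav = np.zeros(16000, dtype=np.float32)   # 1 s of silence at 16 kHz
    batch = processor(
        text="Transcribe the audio.",         # soft tokens are appended since no placeholder is present
        audios=[wav],
        sampling_rate=16000,
        return_tensors="pt",
    )
    print(sorted(batch.keys()))               # input_ids, attention_mask, token_type_ids, audio_values, ...
    # batch["token_type_ids"]: 0 = text, 1 = image tokens, 2 = audio soft tokens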
      def batch_decode(self, *args, **kwargs):
          return self.tokenizer.batch_decode(*args, **kwargs)

@@ -496,12 +592,17 @@
          return self.tokenizer.decode(*args, **kwargs)

      @property
+     def model_input_names(self):
+         tokenizer_inputs = self.tokenizer.model_input_names + ["token_type_ids"]
+         image_processor_inputs = []
+         if self.image_processor is not None:  # Check that an image_processor exists
+             image_processor_inputs = self.image_processor.model_input_names
+
+         audio_processor_inputs = []
+         if self.audio_processor is not None:  # Check that an audio_processor exists
+             # These are the keys Gemma3AudioFeatureExtractor puts in its output BatchFeature.data
+             audio_processor_inputs = ["audio_values", "audio_attention_mask"]
+             # "audio_values_sizes" is also present in the extractor's output; add it here if the model consumes it.
+
+         return list(dict.fromkeys(tokenizer_inputs + image_processor_inputs + audio_processor_inputs))