voidful committed
Commit eaad0f5 · verified · 1 parent: 5fc5a97

Update processing_gemma3_omni.py

Files changed (1)
  1. processing_gemma3_omni.py +444 -214
processing_gemma3_omni.py CHANGED
@@ -6,11 +6,11 @@ import numpy as np
 import scipy.signal
 import torch
 from torch.nn.utils.rnn import pad_sequence
-from transformers.audio_utils import AudioInput
+from transformers.audio_utils import AudioInput  # type: ignore
 from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
 from transformers.feature_extraction_utils import BatchFeature
-from transformers.image_utils import make_nested_list_of_images
-from transformers.processing_utils import ProcessorMixin, ProcessingKwargs, ImagesKwargs, Unpack
+from transformers.image_utils import make_nested_list_of_images  # If image processing is used
+from transformers.processing_utils import ProcessorMixin, ProcessingKwargs, ImagesKwargs
 from transformers.utils import TensorType, to_py_obj, logging

 # Constants
@@ -19,12 +19,13 @@ DEFAULT_N_FFT = 512
 DEFAULT_WIN_LENGTH = 400
 DEFAULT_HOP_LENGTH = 160
 DEFAULT_N_MELS = 80
-DEFAULT_COMPRESSION_RATE = 4
-DEFAULT_QFORMER_RATE = 2
-DEFAULT_FEAT_STRIDE = 4
-IMAGE_TOKEN_PATTERN = r"<\|image_\d+\|>"
-AUDIO_TOKEN_PATTERN = r"<\|audio_\d+\|>"
-DEFAULT_MAX_LENGTH = 16384
+DEFAULT_COMPRESSION_RATE = 4  # For _calculate_embed_length
+DEFAULT_QFORMER_RATE = 2  # For _calculate_embed_length
+DEFAULT_FEAT_STRIDE = 4  # For _calculate_embed_length / 'frames'
+IMAGE_TOKEN_PATTERN = r"<\|image_\d+\|>"  # Not used in this file directly
+AUDIO_TOKEN_PATTERN = r"<\|audio_\d+\|>"  # Not used in this file directly
+DEFAULT_MAX_LENGTH = 16384  # For tokenizer default
+LOG_MEL_CLIP_EPSILON = 1e-5

 logger = logging.get_logger(__name__)

@@ -32,25 +33,48 @@ logger = logging.get_logger(__name__)
 def create_mel_filterbank(sampling_rate: int, n_fft: int, n_mels: int, fmin: float = 0.0,
                           fmax: Optional[float] = None) -> np.ndarray:
     """Create Mel filterbank for audio processing."""
-    fmax = fmax or sampling_rate / 2
+    fmax = fmax or sampling_rate / 2.0

-    def hz_to_mel(f: float) -> float:
+    def hz_to_mel(f: float) -> float:  # User's formula
         return 1127.0 * math.log(1 + f / 700.0)

+    if fmin >= fmax:
+        raise ValueError(f"fmin ({fmin}) must be smaller than fmax ({fmax}).")
+
     mel_points = np.linspace(hz_to_mel(fmin), hz_to_mel(fmax), n_mels + 2)
-    freq_points = 700.0 * (np.exp(mel_points / 1127.0) - 1)
-    bins = np.floor((n_fft + 1) * freq_points / sampling_rate).astype(int)
+    freq_points = 700.0 * (np.exp(mel_points / 1127.0) - 1)  # Inverse of user's hz_to_mel

-    filterbank = np.zeros((n_mels, n_fft // 2 + 1), dtype=np.float32)
-    for m in range(1, n_mels + 1):
-        left, center, right = bins[m - 1:m + 2]
-        filterbank[m - 1, left:center] = (np.arange(left, center) - left) / (center - left)
-        filterbank[m - 1, center:right] = (right - np.arange(center, right)) / (right - center)
+    freq_points = np.clip(freq_points, 0, sampling_rate / 2.0)
+    bins = np.floor((n_fft + 1) * freq_points / sampling_rate).astype(
+        int)  # (n_fft+1) or n_fft/2 ? Librosa uses n_fft//2 * hz / sr_nyquist
+    bins = np.clip(bins, 0, n_fft // 2)  # Max index for rfft output is n_fft//2

+    filterbank = np.zeros((n_mels, n_fft // 2 + 1), dtype=np.float32)
+    for m_idx in range(n_mels):
+        left, center, right = bins[m_idx], bins[m_idx + 1], bins[m_idx + 2]
+
+        if center > left:  # Rising slope
+            filterbank[m_idx, left:center + 1] = (np.arange(left, center + 1) - left) / (center - left)
+        if right > center:  # Falling slope
+            # Need to ensure the peak is 1 if center was part of rising slope
+            # If left==center, this part creates the full triangle (rising is skipped)
+            filterbank[m_idx, center:right + 1] = (right - np.arange(center, right + 1)) / (right - center)
+
+        # Ensure the peak at 'center' is 1.0 if it's a valid point.
+        # This handles cases where left=center or center=right if the slopes don't perfectly set it.
+        if left <= center <= right:
+            if filterbank.shape[1] > center:  # Check bounds for center index
+                if (center > left and filterbank[m_idx, center] < 1.0) or \
+                        (center < right and filterbank[m_idx, center] < 1.0) or \
+                        (left == center and center < right) or \
+                        (right == center and left < center):
+                    filterbank[m_idx, center] = 1.0
     return filterbank


 class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
+    model_input_names = ["audio_values", "audio_attention_mask"]
+
     def __init__(
         self,
         compression_rate: int = DEFAULT_COMPRESSION_RATE,
@@ -58,89 +82,191 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
         feat_stride: int = DEFAULT_FEAT_STRIDE,
         sampling_rate: int = DEFAULT_SAMPLING_RATE,
         n_fft: int = DEFAULT_N_FFT,
-        win_length: int = DEFAULT_WIN_LENGTH,
-        hop_length: int = DEFAULT_HOP_LENGTH,
+        win_length: Optional[int] = None,
+        hop_length: Optional[int] = None,
         n_mels: int = DEFAULT_N_MELS,
+        f_min: float = 0.0,
+        f_max: Optional[float] = None,
+        padding_value: float = 0.0,
         **kwargs
     ):
-        kwargs.pop("feature_size", None)
-        kwargs.pop("sampling_rate", None)
-        kwargs.pop("padding_value", None)
+        _win_length = win_length if win_length is not None else n_fft
+        _hop_length = hop_length if hop_length is not None else _win_length // 4

         super().__init__(
-            feature_size=n_mels,
-            sampling_rate=sampling_rate,
-            padding_value=0.0,
+            feature_size=n_mels,  # This is num_mel_bins
+            sampling_rate=sampling_rate,  # This is the target sampling rate for featurization
+            padding_value=padding_value,
             **kwargs
         )

         self.compression_rate = compression_rate
         self.qformer_rate = qformer_rate
         self.feat_stride = feat_stride
-        self.sampling_rate = sampling_rate
+        # self.sampling_rate is set by super() to the target rate

-        self.window = np.hamming(win_length).astype(np.float32)
-        self.mel_filterbank = create_mel_filterbank(sampling_rate, n_fft, n_mels).T
         self.n_fft = n_fft
-        self.hop_length = hop_length
-        self.win_length = win_length
+        self.win_length = _win_length
+        self.hop_length = _hop_length
+        self.n_mels = n_mels
+        self.f_min = f_min
+        self.f_max = f_max if f_max is not None else self.sampling_rate / 2.0
+
+        if self.win_length > self.n_fft:
+            logger.warning(
+                f"win_length ({self.win_length}) is greater than n_fft ({self.n_fft}). "
+                "Window will be applied, then data zero-padded/truncated to n_fft by np.fft.rfft."
+            )
+        self.window = np.hamming(self.win_length).astype(np.float32)
+        self.mel_filterbank = create_mel_filterbank(
+            self.sampling_rate, self.n_fft, self.n_mels, fmin=self.f_min, fmax=self.f_max
+        ).T

     def __call__(
         self,
-        audios: List[AudioInput],
+        audios: Union[AudioInput, List[AudioInput]],
+        sampling_rate: Optional[int] = None,  # SR of input raw audio arrays
         return_tensors: Union[TensorType, str, None] = TensorType.PYTORCH
     ) -> BatchFeature:
-        features, sizes, frames = [], [], []

-        for wav in audios:
-            processed_wav = self._preprocess_audio(wav, 22500)
-            mel_spectrogram = self._compute_log_mel_spectrogram(processed_wav)
-            feature_tensor = torch.tensor(mel_spectrogram, dtype=torch.float32)
-            features.append(feature_tensor)
-            sizes.append(torch.tensor(self._calculate_embed_length(feature_tensor.shape[0])))
-            frames.append(feature_tensor.shape[0] * self.feat_stride)
+        if not isinstance(audios, list):
+            audios = [audios]
+
+        processed_mels: List[torch.Tensor] = []
+        actual_mel_lengths: List[int] = []
+
+        # These lists are from your original code; their values might be used by Gemma3OmniProcessor later.
+        sizes_for_downstream_calc: List[torch.Tensor] = []
+        frames_scaled_for_downstream_calc: List[int] = []
+
+        for audio_item in audios:
+            current_wav_array: np.ndarray
+            source_sr: int  # Original sampling rate of the current_wav_array
+
+            if isinstance(audio_item, tuple) and len(audio_item) == 2 and isinstance(audio_item[1], int):
+                current_wav_array, source_sr = audio_item
+                current_wav_array = np.asarray(current_wav_array, dtype=np.float32)
+            elif isinstance(audio_item, (np.ndarray, list)):  # Raw waveform as array/list
+                current_wav_array = np.asarray(audio_item, dtype=np.float32)
+                if sampling_rate is None:
+                    raise ValueError(
+                        "sampling_rate argument must be provided to __call__ if 'audios' items "
+                        "are raw numpy arrays or lists (without embedded sampling rate info)."
+                    )
+                source_sr = sampling_rate
+            else:
+                # If you expect to load from paths/bytes, you'd use transformers.audio_utils.load_audio here
+                raise TypeError(
+                    f"Unsupported audio_item type: {type(audio_item)}. Expected np.ndarray, list of floats, "
+                    "or Tuple[np.ndarray, int (sampling_rate)]."
+                )
+
+            logger.debug(
+                f"Gemma3AudioFeatureExtractor: Processing audio item with original shape {current_wav_array.shape}, source_sr {source_sr}")
+
+            # 1. Preprocess: convert to mono, resample to self.sampling_rate, normalize
+            processed_wav_for_mel = self._preprocess_audio(current_wav_array, source_sr)
+
+            # 2. Compute Log-Mel Spectrogram: results in (NumFrames, self.n_mels)
+            mel_spectrogram_np = self._compute_log_mel_spectrogram(processed_wav_for_mel)
+            logger.debug(f"Gemma3AudioFeatureExtractor: Computed mel_spectrogram shape: {mel_spectrogram_np.shape}")
+
+            if not (mel_spectrogram_np.ndim == 2 and mel_spectrogram_np.shape[1] == self.n_mels):
+                # This check is important if _compute_log_mel_spectrogram could return variable shapes
+                logger.error(
+                    f"Mel spectrogram computation resulted in unexpected shape {mel_spectrogram_np.shape}. Expected (NumFrames, {self.n_mels})")
+                # Fallback to a zero-feature tensor of correct feature dimension but zero time, or handle error
+                # This indicates a problem in _compute_log_mel_spectrogram or very unusual input
+                # For now, let it proceed, but this would be an issue.
+                # If num_frames was 0, shape would be (0, n_mels), which is valid.

-        audio_embeds = pad_sequence(features, batch_first=True)
-        size_tensor = torch.stack(sizes)
+            feature_tensor = torch.from_numpy(mel_spectrogram_np)  # Already float32
+            processed_mels.append(feature_tensor)
+            actual_mel_lengths.append(feature_tensor.shape[0])  # Number of time frames

-        attention_mask = None
-        if len(audios) > 1:
-            frame_lengths = torch.tensor(frames)
-            attention_mask = torch.arange(frame_lengths.max()).unsqueeze(0) < frame_lengths.unsqueeze(1)
+            # Original logic for 'sizes' and 'frames' (kept for compatibility with your processor)
+            sizes_for_downstream_calc.append(torch.tensor(self._calculate_embed_length(feature_tensor.shape[0])))
+            frames_scaled_for_downstream_calc.append(feature_tensor.shape[0] * self.feat_stride)
+
+        # Pad the list of 2D Mel spectrograms to form a 3D batch
+        # Output shape: (Batch, MaxNumFrames, NumMels)
+        audio_values_batched = pad_sequence(processed_mels, batch_first=True, padding_value=self.padding_value)
+
+        # Create attention mask for the padded batch
+        max_t_mel_in_batch = audio_values_batched.shape[1]
+
+        attention_mask_batched = torch.zeros(len(audios), max_t_mel_in_batch, dtype=torch.bool)
+        for i, length in enumerate(actual_mel_lengths):
+            attention_mask_batched[i, :length] = True

         output_data = {
-            "audio_values": audio_embeds,
-            "audio_values_sizes": size_tensor
+            "audio_values": audio_values_batched,  # Expected by model as (B, T, F)
+            "audio_attention_mask": attention_mask_batched  # Mask for "audio_values"
         }
-        if attention_mask is not None:
-            output_data["audio_attention_mask"] = attention_mask

+        if sizes_for_downstream_calc:  # If these are used by the OmniProcessor
+            output_data["audio_values_sizes"] = torch.stack(sizes_for_downstream_calc)
+
+        logger.info(
+            f"Gemma3AudioFeatureExtractor: Final 'audio_values' batch shape: {output_data['audio_values'].shape}")
         return BatchFeature(data=output_data, tensor_type=return_tensors)

     def _preprocess_audio(self, wav: np.ndarray, source_sr: int) -> np.ndarray:
-        wav = torch.as_tensor(wav).float().numpy()
+        if wav.dtype not in [np.float32, np.float64]:
+            if np.issubdtype(wav.dtype, np.integer):
+                max_val = np.iinfo(wav.dtype).max if wav.size > 0 else 1.0
+                wav = wav.astype(np.float32) / max_val
+            else:
+                wav = wav.astype(np.float32)
+        elif wav.dtype == np.float64:
+            wav = wav.astype(np.float32)
+
         if wav.ndim > 1:
             wav = wav.mean(axis=0)
+
         if source_sr != self.sampling_rate:
-            wav = scipy.signal.resample_poly(wav, self.sampling_rate, source_sr)
-        return wav / max(np.abs(wav).max(), 1e-6)
+            # logger.info(f"Resampling audio from {source_sr} Hz to {self.sampling_rate} Hz.")
+            common_divisor = math.gcd(self.sampling_rate, source_sr)
+            up_factor = self.sampling_rate // common_divisor
+            down_factor = source_sr // common_divisor
+            if up_factor != down_factor:
+                wav = scipy.signal.resample_poly(wav, up=up_factor, down=down_factor)
+
+        max_abs_val = np.abs(wav).max()
+        if max_abs_val > 1e-7:  # Avoid division by zero/small numbers for silent/near-silent audio
+            wav = wav / max_abs_val
+        return wav

     def _compute_log_mel_spectrogram(self, wav: np.ndarray) -> np.ndarray:
-        frame_count = 1 + (len(wav) - self.win_length) // self.hop_length
-        strides = wav.strides[0]
-        frames = np.lib.stride_tricks.as_strided(
+        if len(wav) < self.win_length:
+            padding = self.win_length - len(wav)
+            wav = np.pad(wav, (0, padding), mode='constant', constant_values=0.0)
+
+        if len(wav) >= self.win_length:
+            num_frames = 1 + (len(wav) - self.win_length) // self.hop_length
+        else:
+            num_frames = 0  # Should be caught by the padding above, but defensive.
+
+        if num_frames <= 0:
+            # logger.warning(...)
+            return np.zeros((0, self.n_mels), dtype=np.float32)  # Return shape (0, N_Mels)
+
+        frames_view = np.lib.stride_tricks.as_strided(
             wav,
-            shape=(frame_count, self.win_length),
-            strides=(strides * self.hop_length, strides),
+            shape=(num_frames, self.win_length),
+            strides=(wav.strides[0] * self.hop_length, wav.strides[0]),
             writeable=False
-        ).copy()
-        frames *= self.window
+        )
+        frames_data = frames_view.copy()  # Ensure it's a copy before in-place modification
+        frames_data *= self.window

-        spectrum = np.fft.rfft(frames, n=self.n_fft).astype(np.complex64)
+        spectrum = np.fft.rfft(frames_data, n=self.n_fft, axis=-1).astype(np.complex64)
         power = np.abs(spectrum) ** 2
         mel_spectrogram = np.dot(power, self.mel_filterbank)
-        mel_spectrogram = np.clip(mel_spectrogram, 1.0, None)
-        return np.log(mel_spectrogram, dtype=np.float32)
+        mel_spectrogram = np.clip(mel_spectrogram, LOG_MEL_CLIP_EPSILON, None)
+        log_mel_spectrogram = np.log(mel_spectrogram)
+
+        return log_mel_spectrogram.astype(np.float32)

     def _calculate_embed_length(self, frame_count: int) -> int:
         compressed = math.ceil(frame_count / self.compression_rate)
@@ -156,8 +282,9 @@ class Gemma3ImagesKwargs(ImagesKwargs):


 class Gemma3ProcessorKwargs(ProcessingKwargs, total=False):
-    images_kwargs: Dict[str, Any]
-    audio_kwargs: Dict[str, Any]
+    images_kwargs: Optional[Dict[str, Any]] = None
+    audio_kwargs: Optional[Dict[str, Any]] = None
+    text_kwargs: Optional[Dict[str, Any]] = None
     _defaults = {
         "text_kwargs": {"padding": False, "truncation": False, "max_length": DEFAULT_MAX_LENGTH},
         "images_kwargs": {},
@@ -168,38 +295,23 @@ class Gemma3ProcessorKwargs(ProcessingKwargs, total=False):
 class Gemma3OmniProcessor(ProcessorMixin):
     attributes = ["image_processor", "audio_processor", "tokenizer"]
     valid_kwargs = ["chat_template", "image_seq_length"]
+
     image_processor_class = "AutoImageProcessor"
-    audio_processor_class = "AutoFeatureExtractor"
+    audio_processor_class = "AutoFeatureExtractor"  # CRITICAL: Must be string name of your custom class
     tokenizer_class = "AutoTokenizer"

     def __init__(
         self,
-        image_processor,
-        audio_processor,
-        tokenizer,
+        image_processor=None,
+        audio_processor=None,
+        tokenizer=None,
         chat_template=None,
         image_seq_length: int = 256,
         **kwargs
     ):
-        self.image_seq_length = image_seq_length
-        self.image_token_id = tokenizer.image_token_id
-        self.boi_token = tokenizer.boi_token
-        self.image_token = tokenizer.image_token
-        self.audio_token = "<audio_soft_token>"
-        self.expected_audio_token_id = 262143
-        self.full_image_sequence = f"\n\n{tokenizer.boi_token}{''.join([tokenizer.image_token] * image_seq_length)}{tokenizer.eoi_token}\n\n"
-
-        self.compression_rate = 8
-        self.qformer_compression_rate = 1
-        self.feat_stride = 1
-
-        self.audio_token_id = tokenizer.convert_tokens_to_ids(self.audio_token)
-        if self.audio_token_id != self.expected_audio_token_id:
-            logger.warning(
-                f"Assigned ID {self.audio_token_id} for '{self.audio_token}' does not match expected ID {self.expected_audio_token_id}. "
-                "Using assigned ID. Model embedding layer may need resizing."
-            )
-
+        # ProcessorMixin.__init__ handles instantiation of audio_processor, image_processor, tokenizer
+        # if they are None when passed to it, using the *_class attributes defined above.
+        # If actual instances are passed (e.g., from from_pretrained), they will be used.
         super().__init__(
             image_processor=image_processor,
             audio_processor=audio_processor,
@@ -208,136 +320,243 @@ class Gemma3OmniProcessor(ProcessorMixin):
             **kwargs
         )

-    def _merge_kwargs(self, ModelProcessorKwargs, tokenizer_init_kwargs, **kwargs):
-        default_kwargs = {}
-        for modality in ModelProcessorKwargs._defaults:
-            default_kwargs[modality] = ModelProcessorKwargs._defaults.get(modality, {}).copy()
-
-        # Update defaults with tokenizer init kwargs
-        for modality in default_kwargs:
-            modality_kwargs = default_kwargs[modality]
-            for key in modality_kwargs:
-                if key in tokenizer_init_kwargs:
-                    value = (
-                        getattr(self.tokenizer, key)
-                        if hasattr(self.tokenizer, key)
-                        else tokenizer_init_kwargs[key]
-                    )
-                    modality_kwargs[key] = value
-
-        # Update with user-provided kwargs
-        for modality in default_kwargs:
-            if modality in kwargs:
-                default_kwargs[modality].update(kwargs[modality])
-
-        # Ensure text_kwargs has truncation=False and large max_length
-        default_kwargs["text_kwargs"]["truncation"] = False
-        default_kwargs["text_kwargs"]["max_length"] = default_kwargs["text_kwargs"].get("max_length",
-                                                                                        DEFAULT_MAX_LENGTH)
-
-        return default_kwargs
-
-    def _compute_audio_embed_size(self, audio_frames: int) -> int:
-        result = math.ceil(audio_frames / self.compression_rate)
-        return math.ceil(result / self.qformer_compression_rate)
+        # These attributes depend on self.tokenizer being properly initialized by super()
+        self.image_seq_length = image_seq_length
+        if self.tokenizer is not None:
+            # Use getattr for robustness, providing defaults if attributes are missing
+            self.image_token_id = getattr(self.tokenizer, "image_token_id",
+                                          self.tokenizer.unk_token_id if hasattr(self.tokenizer,
+                                                                                 "unk_token_id") else None)
+            self.boi_token = getattr(self.tokenizer, "boi_token", "<image>")  # More common default
+            self.image_token = getattr(self.tokenizer, "image_token", "<image>")
+            self.eoi_token = getattr(self.tokenizer, "eoi_token", "")  # Default to empty if not present
+
+            # User's original attributes for audio tokens
+            self.audio_token_str_from_user_code = "<audio_soft_token>"
+            # self.expected_audio_token_id = 262143  # User's reference, keep commented for minimal change
+
+            self.audio_token_id = self.tokenizer.convert_tokens_to_ids(self.audio_token_str_from_user_code)
+            if hasattr(self.tokenizer, "unk_token_id") and self.audio_token_id == self.tokenizer.unk_token_id:
+                logger.warning(
+                    f"The audio token string '{self.audio_token_str_from_user_code}' maps to the UNK token. "
+                    "Please ensure it is added to the tokenizer's vocabulary as a special token."
+                )
+            self.full_image_sequence = f"\n\n{self.boi_token}{''.join([self.image_token] * image_seq_length)}{self.eoi_token}\n\n"
+        else:
+            # This state (tokenizer is None after super init) should ideally not occur if from_pretrained works.
+            logger.error(
+                "Gemma3OmniProcessor initialized, but self.tokenizer is None. Token-dependent attributes will use placeholders or defaults.")
+            self.image_token_id = None
+            self.boi_token = "<image>"
+            self.image_token = "<image>"
+            self.eoi_token = ""
+            self.audio_token_str_from_user_code = "<audio_soft_token>"
+            self.audio_token_id = -1  # Placeholder
+            self.full_image_sequence = ""
+
+        # These are parameters for this processor's logic for number of audio tokens in prompt
+        self.prompt_audio_compression_rate = kwargs.pop("audio_prompt_compression_rate", 8)
+        self.prompt_audio_qformer_rate = kwargs.pop("audio_prompt_qformer_rate", 1)
+        self.prompt_audio_feat_stride = kwargs.pop("audio_prompt_feat_stride", 1)
+        self.audio_placeholder_token = kwargs.pop("audio_placeholder_token", "<|audio_placeholder|>")
+
+    def _merge_kwargs(self, KwargsClassWithDefaults, tokenizer_init_kwargs, **kwargs_from_call):
+        # This method merges default kwargs, tokenizer init kwargs, and call-specific kwargs
+        final_kwargs = {}
+        _defaults = getattr(KwargsClassWithDefaults, "_defaults", {})
+        if not isinstance(_defaults, dict): _defaults = {}
+
+        for modality_key, default_modality_kwargs in _defaults.items():
+            final_kwargs[modality_key] = default_modality_kwargs.copy()
+
+        for modality_key_in_call, modality_kwargs_in_call in kwargs_from_call.items():
+            if modality_key_in_call in final_kwargs:  # e.g. "text_kwargs"
+                if isinstance(modality_kwargs_in_call, dict):
+                    final_kwargs[modality_key_in_call].update(modality_kwargs_in_call)
+            elif isinstance(modality_kwargs_in_call, dict):  # New modality not in _defaults (e.g. "video_kwargs")
+                final_kwargs[modality_key_in_call] = modality_kwargs_in_call.copy()
+
+        if self.tokenizer:  # Ensure tokenizer is available for its init_kwargs
+            for modality_key in final_kwargs:
+                modality_dict = final_kwargs[modality_key]
+                if isinstance(modality_dict, dict):
+                    for key_in_mod_dict in list(modality_dict.keys()):
+                        if key_in_mod_dict in tokenizer_init_kwargs:  # tokenizer_init_kwargs from self.tokenizer.init_kwargs
+                            value = (
+                                getattr(self.tokenizer, key_in_mod_dict)  # Check actual tokenizer attribute first
+                                if hasattr(self.tokenizer, key_in_mod_dict)
+                                else tokenizer_init_kwargs[key_in_mod_dict]
+                            )
+                            modality_dict[key_in_mod_dict] = value
+
+        if "text_kwargs" not in final_kwargs:
+            final_kwargs["text_kwargs"] = {}
+        # Ensure these text_kwargs have defaults if not set otherwise
+        final_kwargs["text_kwargs"]["truncation"] = final_kwargs["text_kwargs"].get("truncation", False)
+        final_kwargs["text_kwargs"]["max_length"] = final_kwargs["text_kwargs"].get("max_length", DEFAULT_MAX_LENGTH)
+
+        return final_kwargs
+
+    def _compute_audio_embed_size(self, audio_mel_frames: int) -> int:
+        scaled_frames = audio_mel_frames * self.prompt_audio_feat_stride
+        result = math.ceil(scaled_frames / self.prompt_audio_compression_rate)
+        return math.ceil(result / self.prompt_audio_qformer_rate)

     def __call__(
         self,
-        images=None,
-        text=None,
-        videos=None,
-        audio=None,
-        **kwargs: Unpack[Gemma3ProcessorKwargs]
+        text: Union[str, List[str]] = None,
+        images: Optional[Any] = None,
+        audios: Optional[Union[AudioInput, List[AudioInput]]] = None,
+        sampling_rate: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        **kwargs: Any
     ) -> BatchFeature:
-        if text is None and images is None:
-            raise ValueError("Provide at least one of `text` or `images`.")
-
-        output_kwargs = self._merge_kwargs(
-            Gemma3ProcessorKwargs,
-            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+        if text is None and images is None and audios is None:
+            raise ValueError("Provide at least one of `text`, `images`, or `audios`.")
+
+        # Determine final return_tensors strategy (explicit __call__ arg > from text_kwargs > default)
+        final_rt = return_tensors
+        # _merge_kwargs uses Gemma3ProcessorKwargs to structure the **kwargs from __call__
+        merged_call_kwargs = self._merge_kwargs(
+            Gemma3ProcessorKwargs,  # Class defining _defaults structure
+            self.tokenizer.init_kwargs if hasattr(self.tokenizer, 'init_kwargs') else {},
            **kwargs
        )

-        if isinstance(text, str):
-            text = [text]
-        elif not isinstance(text, list) or not all(isinstance(t, str) for t in text):
-            raise ValueError("Input text must be a string or list of strings")
-
-        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", "pt")
-        image_inputs = {}
+        if final_rt is None:  # If not passed directly to __call__
+            # Get from merged_call_kwargs (which would have picked it up from kwargs['text_kwargs'])
+            # and remove it to prevent passing twice to tokenizer
+            final_rt = merged_call_kwargs.get("text_kwargs", {}).pop("return_tensors", TensorType.PYTORCH)
+        else:  # If passed directly, ensure it's removed from text_kwargs to avoid conflict
+            merged_call_kwargs.get("text_kwargs", {}).pop("return_tensors", None)
+
+        if text is None:  # Default text if only other modalities are provided
+            num_samples = 0
+            if images is not None:
+                _images_list = images if isinstance(images, list) and (
+                    not images or not isinstance(images[0], (int, float))) else [images]
+                num_samples = len(_images_list)
+            elif audios is not None:
+                _audios_list = audios if isinstance(audios, list) else [audios]
+                num_samples = len(_audios_list)
+            text = [""] * num_samples if num_samples > 0 else [""]  # Create empty strings or one if no samples
+
+        if isinstance(text, str): text = [text]
+        if not (isinstance(text, list) and all(isinstance(t, str) for t in text)):
+            raise ValueError("Input `text` must be a string or a list of strings.")
+
+        # --- Image Processing (User's structure) ---
+        image_features_dict = {}
         if images is not None:
+            if self.image_processor is None: raise ValueError("Images provided but self.image_processor is None.")
             batched_images = make_nested_list_of_images(images)
-            image_inputs = self.image_processor(batched_images, **output_kwargs["images_kwargs"])
-
-            if not text:
-                text = [" ".join([self.boi_token] * len(images)) for images in batched_images]
-
-            if len(batched_images) != len(text):
-                raise ValueError(
-                    f"Inconsistent batch sizes: {len(batched_images)} images, {len(text)} texts"
-                )
-
-            num_crops = to_py_obj(image_inputs.pop("num_crops"))
-            batch_num_crops = [[num_crops.pop(0) for _ in range(len(images))] for images in batched_images]
-
-            for batch_idx, (prompt, images, crops) in enumerate(zip(text, batched_images, batch_num_crops)):
-                image_indexes = [m.start() for m in re.finditer(self.boi_token, prompt)]
-                if len(images) != len(image_indexes):
-                    raise ValueError(
-                        f"Prompt has {len(image_indexes)} image tokens but received {len(images)} images"
-                    )
-
-                for num, idx in reversed(list(zip(crops, image_indexes))):
-                    if num:
-                        formatted_image_text = (
-                            f"Here is the original image {self.boi_token} and here are some crops to help you see better "
-                            + " ".join([self.boi_token] * num)
-                        )
-                        prompt = prompt[:idx] + formatted_image_text + prompt[idx + len(self.boi_token):]
-                        text[batch_idx] = prompt
-
-            text = [prompt.replace(self.boi_token, self.full_image_sequence) for prompt in text]
-
-        audio_inputs = {}
-        if audio is not None:
-            audio_inputs = self.audio_processor(audio, return_tensors)
-            audio_embeds = audio_inputs['audio_values']
-            audio_frames = audio_embeds.shape[1] * self.feat_stride
-            audio_seq_length = self._compute_audio_embed_size(audio_frames)
-
-            audio_tokens = {
-                "boa_token": "<start_of_audio>",
-                "eoa_token": "<end_of_audio>",
-                "audio_token": "<audio_soft_token>",
-                "boa_token_id": 256001,
-                "eoa_token_id": 256002,
-                "audio_token_id": self.audio_token_id  # Use dynamic ID
-            }
-
-            audio_sequence = f"\n\n{audio_tokens['boa_token']}{''.join([audio_tokens['audio_token']] * audio_seq_length)}{audio_tokens['eoa_token']}\n\n"
-            text = [prompt.replace(audio_tokens['boa_token'], audio_sequence) for prompt in text]
-
-        text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"], return_tensors=return_tensors)
-
-        # Debug: Log text and token counts before validation
-        for i, (txt, ids) in enumerate(zip(text, text_inputs["input_ids"])):
-            audio_text_count = txt.count(self.audio_token)
-            audio_ids_count = list(ids).count(self.audio_token_id)
-            logger.debug(
-                f"Sample {i}: Audio tokens in text={audio_text_count}, in input_ids={audio_ids_count}, "
-                f"Text length={len(txt)}, Input IDs length={len(ids)}"
-            )
-
-        array_ids = text_inputs["input_ids"]
-        if return_tensors == "pt":
-            mm_token_type_ids = torch.zeros_like(array_ids)
-        else:
-            mm_token_type_ids = np.zeros_like(array_ids)
-        mm_token_type_ids[array_ids == self.image_token_id] = 1  # Image token type
-        mm_token_type_ids[array_ids == self.audio_token_id] = 2  # Audio token type
-        text_inputs["token_type_ids"] = mm_token_type_ids
-
-        return BatchFeature(data={**text_inputs, **image_inputs, **audio_inputs}, tensor_type=return_tensors)
+            _img_proc_output = self.image_processor(batched_images, return_tensors=None,
+                                                    **merged_call_kwargs.get("images_kwargs", {}))
+            image_features_dict = _img_proc_output.data if isinstance(_img_proc_output,
+                                                                      BatchFeature) else _img_proc_output
+
+            # Adjust text based on images (user's original logic)
+            if len(text) == 0 and len(batched_images) > 0: text = [" ".join([self.boi_token] * len(img_batch)) for
+                                                                   img_batch in batched_images]
+            if len(batched_images) != len(text): raise ValueError(
+                f"Inconsistent batch: {len(batched_images)} images, {len(text)} texts")
+
+            num_crops_popped = image_features_dict.pop("num_crops", None)
+            if num_crops_popped is not None:
+                num_crops_all = to_py_obj(num_crops_popped)
+                temp_text_img, current_crop_idx_offset = [], 0
+                for batch_idx, (prompt, current_imgs_in_batch) in enumerate(zip(text, batched_images)):
+                    crops_for_this_batch_sample = []
+                    if num_crops_all:
+                        for _ in current_imgs_in_batch:
+                            if current_crop_idx_offset < len(num_crops_all):
+                                crops_for_this_batch_sample.append(
+                                    num_crops_all[current_crop_idx_offset]); current_crop_idx_offset += 1
+                            else:
+                                crops_for_this_batch_sample.append(0)
+                    image_indexes = [m.start() for m in re.finditer(re.escape(self.boi_token), prompt)]
+                    processed_prompt = prompt
+                    iter_count = min(len(crops_for_this_batch_sample), len(image_indexes))
+                    for i_crop_idx in range(iter_count - 1, -1, -1):
+                        num_additional_crops = crops_for_this_batch_sample[i_crop_idx]
+                        original_token_idx = image_indexes[i_crop_idx]
+                        if num_additional_crops > 0:
+                            replacement_text = (
+                                f"Here is the original image {self.boi_token} and here are some crops to help you see better " + " ".join(
+                                    [self.boi_token] * num_additional_crops))
+                            processed_prompt = processed_prompt[
+                                               :original_token_idx] + replacement_text + processed_prompt[
+                                                                                         original_token_idx + len(
+                                                                                             self.boi_token):]
+                    temp_text_img.append(processed_prompt)
+                text = temp_text_img
+            text = [p.replace(self.boi_token, self.full_image_sequence) for p in text]
+
+        # --- Audio Processing ---
+        audio_features_dict = {}
+        if audios is not None:
+            if self.audio_processor is None: raise ValueError("Audios provided but self.audio_processor is None.")
+            audio_call_kwargs = merged_call_kwargs.get("audio_kwargs", {})
+            if sampling_rate is not None: audio_call_kwargs[
+                "sampling_rate"] = sampling_rate  # Pass SR to feature extractor
+
+            _audio_proc_output = self.audio_processor(audios=audios, return_tensors=None, **audio_call_kwargs)
+            audio_features_dict = _audio_proc_output.data
+            logger.info(
+                f"Gemma3OmniProcessor: 'audio_values' shape from Feature Extractor: {audio_features_dict['audio_values'].shape}")
+
+            new_text_with_audio, actual_mel_frames_per_sample = [], to_py_obj(
+                audio_features_dict["audio_attention_mask"].sum(axis=-1))
+            if len(actual_mel_frames_per_sample) != len(text): raise ValueError(
+                f"Inconsistent batch for audio/text: {len(actual_mel_frames_per_sample)} audio, {len(text)} text.")
+
+            for i, prompt in enumerate(text):
+                num_soft_tokens = self._compute_audio_embed_size(actual_mel_frames_per_sample[i])
+                audio_token_sequence_str = self.audio_token_str_from_user_code * num_soft_tokens  # e.g. "<audio_soft_token>" * N
+
+                # User's original boa_token for replacement was " ", which is risky. Using defined placeholder.
+                if self.audio_placeholder_token in prompt:
+                    prompt = prompt.replace(self.audio_placeholder_token, audio_token_sequence_str, 1)
+                else:
+                    prompt += audio_token_sequence_str
+                new_text_with_audio.append(prompt)
+            text = new_text_with_audio
+
+        # --- Text Tokenization ---
+        text_tokenizer_kwargs = merged_call_kwargs.get("text_kwargs", {})
+        text_features_dict = self.tokenizer(text=text, return_tensors=None,
+                                            **text_tokenizer_kwargs)  # Get lists/np.arrays
+
+        input_ids_list_of_lists = text_features_dict["input_ids"]
+        if not isinstance(input_ids_list_of_lists, list) or not (
+                input_ids_list_of_lists and isinstance(input_ids_list_of_lists[0], list)):
+            if isinstance(input_ids_list_of_lists, (torch.Tensor, np.ndarray)):
+                input_ids_list_of_lists = to_py_obj(input_ids_list_of_lists)
+            elif isinstance(input_ids_list_of_lists, list) and (
+                    not input_ids_list_of_lists or isinstance(input_ids_list_of_lists[0], int)):
+                input_ids_list_of_lists = [input_ids_list_of_lists]

+        token_type_ids_list = []
+        for ids_sample in input_ids_list_of_lists:
+            types = [0] * len(ids_sample)
+            for j, token_id_val in enumerate(ids_sample):
+                if self.image_token_id is not None and token_id_val == self.image_token_id:
+                    types[j] = 1
+                elif self.audio_token_id != -1 and token_id_val == self.audio_token_id:
+                    types[j] = 2
+            token_type_ids_list.append(types)
+        text_features_dict["token_type_ids"] = token_type_ids_list
+
+        # Ensure text_features_dict also has 'attention_mask' if tokenizer applied padding
+        # If tokenizer was called with padding=True/strategy, it would add 'attention_mask'
+        # If called with padding=False (default), 'attention_mask' might be missing or all 1s.
+        # BatchFeature will handle final tensor conversion and padding based on final_rt.
+
+        final_batch_data = {**text_features_dict}
+        if image_features_dict: final_batch_data.update(image_features_dict)
+        if audio_features_dict: final_batch_data.update(audio_features_dict)
+
+        return BatchFeature(data=final_batch_data, tensor_type=final_rt)

     def batch_decode(self, *args, **kwargs):
         return self.tokenizer.batch_decode(*args, **kwargs)
@@ -346,8 +565,19 @@ class Gemma3OmniProcessor(ProcessorMixin):
         return self.tokenizer.decode(*args, **kwargs)

     @property
-    def model_input_names(self):
-        tokenizer_inputs = self.tokenizer.model_input_names + ["token_type_ids"]
-        image_processor_inputs = self.image_processor.model_input_names
-        audio_processor_inputs = self.audio_processor.model_input_names
-        return list(dict.fromkeys(tokenizer_inputs + image_processor_inputs + audio_processor_inputs))
+    def model_input_names(self) -> List[str]:
+        input_names = set()
+        if hasattr(self, 'tokenizer') and self.tokenizer is not None:
+            input_names.update(self.tokenizer.model_input_names + ["token_type_ids"])
+
+        if hasattr(self, 'image_processor') and self.image_processor is not None:
+            input_names.update(self.image_processor.model_input_names)
+
+        if hasattr(self, 'audio_processor') and self.audio_processor is not None and \
+                hasattr(self.audio_processor, 'model_input_names'):
+            input_names.update(self.audio_processor.model_input_names)
+        elif hasattr(self,
+                     'audio_processor') and self.audio_processor is not None:  # Fallback if model_input_names not on custom audio_processor
+            input_names.update(["audio_values", "audio_attention_mask"])
+
+        return list(input_names)
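Below is a minimal usage sketch for the new audio path, not part of the commit. It assumes the updated file is importable as processing_gemma3_omni and that the transformers imports at the top of the file resolve in the installed version; the 440 Hz tone and the 16 kHz rate are illustrative only.

# Hypothetical smoke test for the new Gemma3AudioFeatureExtractor (not part of the commit).
import numpy as np
from processing_gemma3_omni import Gemma3AudioFeatureExtractor  # assumed module path

# One second of a 440 Hz tone at 16 kHz; any mono float32 waveform works.
sr = 16000
t = np.linspace(0.0, 1.0, sr, endpoint=False)
wav = 0.5 * np.sin(2 * np.pi * 440.0 * t).astype(np.float32)

extractor = Gemma3AudioFeatureExtractor(sampling_rate=16000)  # n_fft=512, n_mels=80 defaults
batch = extractor(audios=[(wav, sr)], return_tensors="pt")

# (batch, frames, n_mels) log-mel features plus a boolean frame mask and per-sample embed sizes
print(batch["audio_values"].shape)
print(batch["audio_attention_mask"].shape)
print(batch["audio_values_sizes"])

The full Gemma3OmniProcessor path additionally needs an image processor and a tokenizer that defines the <audio_soft_token> special token, so it is not exercised in this sketch.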