Update processing_gemma3_omni.py

processing_gemma3_omni.py CHANGED (+9 -41)
@@ -1,7 +1,6 @@
 import re
 from typing import List, Optional, Union, Dict, Any, Tuple # Added Tuple
 
-import math
 import numpy as np
 import scipy.signal
 import torch
@@ -19,15 +18,16 @@ DEFAULT_N_FFT = 512
 DEFAULT_WIN_LENGTH = 400
 DEFAULT_HOP_LENGTH = 160
 DEFAULT_N_MELS = 80
-DEFAULT_COMPRESSION_RATE = 4
-DEFAULT_QFORMER_RATE = 2
-DEFAULT_FEAT_STRIDE = 4
+DEFAULT_COMPRESSION_RATE = 4 # Used for default in __init__
+DEFAULT_QFORMER_RATE = 2 # Used for default in __init__ (as audio_downsample_rate)
+DEFAULT_FEAT_STRIDE = 4 # Used for default in __init__
 IMAGE_TOKEN_PATTERN = r"<\|image_\d+\|>"
 AUDIO_TOKEN_PATTERN = r"<\|audio_\d+\|>"
 DEFAULT_MAX_LENGTH = 16384
 
 logger = logging.get_logger(__name__)
 
+
 def speechlib_mel(sample_rate, n_fft, n_mels, fmin=None, fmax=None):
     """Create a Mel filter-bank the same as SpeechLib FbankFC.
     Args:
@@ -283,6 +283,7 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor): # MODIFIED CLASS N
         return log_fbank
 
     def _compute_audio_embed_size(self, audio_frames: int) -> int:
+        print("self.compression_rate", self.compression_rate)
         integer = audio_frames // self.compression_rate
         remainder = audio_frames % self.compression_rate
         result = integer if remainder == 0 else integer + 1
@@ -293,14 +294,6 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor): # MODIFIED CLASS N
         return result
 
 
-# The rest of your script (Gemma3ImagesKwargs, Gemma3ProcessorKwargs, Gemma3OmniProcessor) follows...
-# Make sure this Gemma3AudioFeatureExtractor class replaces the old one or
-# is correctly registered/named if your AutoProcessor setup relies on a specific name.
-
-
-# --- End of Refactored Audio Feature Extractor ---
-
-
 class Gemma3ImagesKwargs(ImagesKwargs):
     do_pan_and_scan: Optional[bool]
     pan_and_scan_min_crop_size: Optional[int]
@@ -416,23 +409,7 @@ class Gemma3OmniProcessor(ProcessorMixin):
         return final_kwargs
 
     def _compute_audio_embed_size(self, audio_mel_frames: int) -> int:
-
-        # It calculates a number of soft tokens based on its own compression rates.
-        # Note: `audio_mel_frames` here is the number of raw Mel frames from the feature extractor's perspective
-        # if the attention mask sum is directly used before feat_stride scaling by the processor.
-        # However, if using the Refactored processor, audio_attention_mask.sum() will yield
-        # num_mel_frames * feat_stride. This method should then correctly compress that value.
-
-        # Using prompt_audio_compression_rate and prompt_audio_qformer_rate
-        # which are attributes of this Gemma3OmniProcessor class.
-
-        # First compression
-        # audio_mel_frames here should ideally be num_actual_mel_frames * feat_stride_of_the_audio_processor
-        # if trying to match the number of tokens from a Phi4M-style processor.
-        # The refactored audio processor does this scaling internally before its own _compute_audio_embed_size.
-        # If actual_mel_frames_per_sample (from sum of attention_mask) *is* already scaled by feat_stride
-        # (as it would be if using the refactored processor's attention_mask), then this calculation is correct.
-
+        print("prompt_audio_compression_rate", self.prompt_audio_compression_rate)
         integer = audio_mel_frames // self.prompt_audio_compression_rate
         remainder = audio_mel_frames % self.prompt_audio_compression_rate
         result = integer if remainder == 0 else integer + 1
@@ -473,11 +450,11 @@
         num_samples = 0
         if images is not None:
             _images_list = images if isinstance(images, list) and (
-
+                not images or not isinstance(images[0], (int, float))) else [images]
             num_samples = len(_images_list)
         elif audios is not None:
             _audios_list = audios if isinstance(audios, list) and not (
-
+                isinstance(audios[0], tuple) and isinstance(audios[0][0], (int, float))) else [
                 audios] # check if audios is list of items or list of (wave,sr)
             num_samples = len(_audios_list)
         text = [""] * num_samples if num_samples > 0 else [""] # Default to one empty string if no inputs
@@ -571,15 +548,6 @@
             raise ValueError(
                 f"Inconsistent batch for audio/text: {num_audio_samples_processed} audio samples processed, {len(text)} text prompts."
             )
-
-        # If using Gemma3AudioFeatureExtractor,
-        # "audio_embed_sizes" is already computed correctly (num compressed tokens).
-        # The processor's own _compute_audio_embed_size is called to determine how many
-        # self.audio_token_str_from_user_code to insert. Ideally, this matches.
-
-        # Get the number of frames that the processor's _compute_audio_embed_size expects.
-        # If the audio_processor is RefactoredGemma3..., its attention_mask is over (num_mel_frames * feat_stride).
-        # So, sum of that mask gives the input for this processor's _compute_audio_embed_size.
         frames_for_embed_size_calc = to_py_obj(audio_features_dict[self.audio_processor.model_input_names[2]].sum(
             axis=-1)) # sum of audio_attention_mask
 
@@ -666,4 +634,4 @@
         else:
            input_names.add(str(audio_inputs))
 
-        return list(input_names)
+        return list(input_names)
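Note on the two _compute_audio_embed_size methods touched above: both are plain ceiling division of a frame count by a compression rate, which is why the removed "import math" is no longer needed. A minimal standalone sketch of that logic; the rates mirror the module defaults, but the frame counts and the feat_stride scaling step are illustrative values taken from the removed comments, not outputs of the actual model:

# Sketch: ceiling-division logic shared by both _compute_audio_embed_size methods.
def compute_audio_embed_size(audio_frames: int, compression_rate: int = 4) -> int:
    # Equivalent to math.ceil(audio_frames / compression_rate), written with // and %.
    integer = audio_frames // compression_rate
    remainder = audio_frames % compression_rate
    return integer if remainder == 0 else integer + 1

mel_frames = 100                 # hypothetical number of valid Mel frames for one clip
scaled_frames = mel_frames * 4   # feat_stride scaling described in the removed comments
print(compute_audio_embed_size(scaled_frames))       # 400 // 4 -> 100
print(compute_audio_embed_size(scaled_frames + 3))   # 403 -> rounded up -> 101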
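The frames_for_embed_size_calc line kept in the second-to-last hunk feeds that method by summing the audio attention mask over its last axis. A small sketch of what that sum produces for a padded batch; the mask values and shapes are dummies, the real mask comes from the audio feature extractor:

import numpy as np

# 1 marks a valid frame, 0 marks padding; summing over the last axis recovers the
# per-clip frame count that _compute_audio_embed_size then compresses.
audio_attention_mask = np.array([
    [1, 1, 1, 1, 1, 0, 0, 0],   # clip 0: 5 valid frames
    [1, 1, 1, 1, 1, 1, 1, 1],   # clip 1: 8 valid frames
])
frames_per_clip = audio_attention_mask.sum(axis=-1)
print(frames_per_clip.tolist())   # [5, 8]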
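The two rewritten continuation lines in the batching hunk normalize a single input into a one-element list before the batch size is counted. A standalone sketch of the audio branch; the helper name and dummy waveforms are illustrative and not part of the processor API:

import numpy as np

def normalize_audios(audios):
    # Mirrors the check from the diff: a bare (waveform, sample_rate) tuple is wrapped
    # into a one-element list, while a list of such tuples passes through unchanged.
    return audios if isinstance(audios, list) and not (
        isinstance(audios[0], tuple) and isinstance(audios[0][0], (int, float))
    ) else [audios]

single = (np.zeros(16000), 16000)                               # one (wave, sr) pair
batch = [(np.zeros(16000), 16000), (np.zeros(8000), 16000)]     # already a batch

print(len(normalize_audios(single)))   # 1
print(len(normalize_audios(batch)))    # 2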