ariG23498 HF Staff commited on
Commit
20169cb
·
1 Parent(s): a3e6d78
Files changed (2) hide show
  1. app.py +26 -16
  2. requirements.txt +2 -1
app.py CHANGED
@@ -13,27 +13,37 @@ model = AutoModelForImageTextToText.from_pretrained(
13
 
14
  @spaces.GPU
15
  def process_inputs(image, audio):
16
- # Prepare inputs for the model
17
- inputs = processor(
18
- images=image,
19
- audio=audio,
20
- return_tensors="pt"
21
- ).to(model.device, dtype=model.dtype)
 
 
22
 
23
- # Generate text output
 
 
 
 
 
 
 
 
 
24
  with torch.inference_mode:
25
- outputs = model.generate(
26
- **inputs,
27
- max_new_tokens=256
 
28
  )
29
-
30
- # Decode and return text
31
- text = processor.batch_decode(
32
- outputs,
33
  skip_special_tokens=True,
34
  clean_up_tokenization_spaces=True
35
- )[0]
36
- return text
37
 
38
  # Gradio interface
39
  iface = gr.Interface(
 
13
 
14
@spaces.GPU
def process_inputs(image, audio, max_tokens=256):
    """Run the image+audio chat model and return the generated text.

    Args:
        image: Image input as accepted by the processor's chat template
            (e.g. a PIL image or path) — assumed from the Gradio inputs;
            TODO confirm against the ``gr.Interface`` definition below.
        audio: Audio input as accepted by the processor's chat template.
        max_tokens: Maximum number of new tokens to generate (default 256,
            matching the pre-refactor behavior).

    Returns:
        The decoded model response for the single prompt, with special
        tokens stripped.
    """
    # Single-turn chat message carrying both modalities.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "audio", "audio": audio},
            ],
        },
    ]

    # NOTE: `processor` and `model` are module-level globals defined at the
    # top of app.py — this is a plain function, so the original `self.`
    # references were a NameError and have been removed.
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    # Remember the prompt length so we can slice it off the generated ids.
    input_len = inputs["input_ids"].shape[-1]

    inputs = inputs.to(model.device, dtype=model.dtype)

    # inference_mode must be *called* to obtain the context manager;
    # bare `torch.inference_mode` raised at runtime.
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            disable_compile=True,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    text = processor.batch_decode(
        outputs[:, input_len:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return text[0]
47
 
48
  # Gradio interface
49
  iface = gr.Interface(
requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
  spaces
2
  gradio
3
- transformers==4.53.0
 
 
1
  spaces
2
  gradio
3
+ transformers==4.53.0
4
+ timm==1.0.16