import cv2
import numpy as np
import torch
import gradio as gr
import supervision as sv
import spaces
from PIL import Image
from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor
BOX_ANNOTATOR = sv.BoxAnnotator()
LABEL_ANNOTATOR = sv.LabelAnnotator()
MASK_ANNOTATOR = sv.MaskAnnotator()
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_id = "google/paligemma2-3b-pt-448"
# Load the weights in bfloat16 so they match the bfloat16 inputs prepared in process_image.
model = (
    PaliGemmaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16)
    .eval()
    .to(DEVICE)
)
processor = PaliGemmaProcessor.from_pretrained(model_id)
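
# On ZeroGPU Spaces, the @spaces.GPU decorator attaches a GPU only for the duration of each call.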
@spaces.GPU
def process_image(input_image, input_text, class_names):
    # Class names are entered comma-separated in the UI; strip stray whitespace.
    class_list = [name.strip() for name in class_names.split(",")]
    # Keep an OpenCV-style BGR copy of the image for the supervision annotators.
    cv_image = cv2.cvtColor(np.array(input_image), cv2.COLOR_RGB2BGR)
    model_inputs = processor(text=input_text, images=input_image, return_tensors="pt").to(torch.bfloat16).to(model.device)
    input_len = model_inputs["input_ids"].shape[-1]
    with torch.inference_mode():
        generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)
        generation = generation[0][input_len:]
        result = processor.decode(generation, skip_special_tokens=True)
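
    # For a detect prompt, the raw output is roughly of the form
    # "<loc0123><loc0456><loc0789><loc1011> person ; <loc...> dog",
    # i.e. four <locXXXX> tokens (box coordinates on a 0-1023 grid) followed by a label.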
    detections = sv.Detections.from_lmm(
        sv.LMM.PALIGEMMA,
        result,
        resolution_wh=(input_image.width, input_image.height),
        classes=class_list
    )
    annotated_image = BOX_ANNOTATOR.annotate(scene=cv_image.copy(), detections=detections)
    annotated_image = LABEL_ANNOTATOR.annotate(scene=annotated_image, detections=detections)
    annotated_image = MASK_ANNOTATOR.annotate(scene=annotated_image, detections=detections)
    # Convert back from BGR to RGB and return a PIL image to Gradio.
    annotated_image = cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB)
    annotated_image = Image.fromarray(annotated_image)
    return annotated_image, result
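
# Quick local sanity check, outside the Gradio UI (hypothetical file names):
#     image = Image.open("example.jpg")
#     annotated, raw = process_image(image, "detect person ; dog", "person,dog")
#     annotated.save("annotated.jpg")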
app = gr.Interface(
    fn=process_image,
    inputs=[
        gr.Image(type="pil", label="Input Image"),
        gr.Textbox(lines=2, placeholder="Enter text here...", label="Prompt (for example: 'detect person ; dog')"),
        gr.Textbox(lines=1, placeholder="Enter class names separated by commas...", label="Class Names")
    ],
    outputs=[gr.Image(type="pil", label="Annotated Image"), gr.Textbox(label="Detection Result")],
    title="PaliGemma2 Image Detection with Supervision",
    description="Detect objects in an image using the PaliGemma2 model."
)
if __name__ == "__main__":
    app.launch()