import gradio as gr
import torch
from PIL import Image
import numpy as np
from transformers import CLIPProcessor, CLIPModel
from ultralytics import FastSAM
import supervision as sv
from huggingface_hub import hf_hub_download

# Load CLIP model
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Download and load FastSAM model
model_path = hf_hub_download("Jiawei-Yang/FastSAM-x", filename="FastSAM-x.pt")
fast_sam = FastSAM(model_path)

def process_image_clip(image, text_input):
    if image is None:
        return "Please upload an image first."

    # Score the user's concept against a generic contrast prompt; with a single
    # label the softmax would always return 100%, so a second (neutral) prompt
    # is needed to make the probability meaningful.
    inputs = processor(
        images=image,
        text=[text_input, "something else"],
        return_tensors="pt",
        padding=True
    )

    # Get model predictions (no gradients needed at inference time)
    with torch.no_grad():
        outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image
    probs = logits_per_image.softmax(dim=1)
    confidence = float(probs[0][0])

    return f"Confidence that the image contains '{text_input}': {confidence:.2%}"

def process_image_fastsam(image):
    if image is None:
        return None

    # Convert PIL image to numpy array
    image_np = np.array(image)

    # Run FastSAM inference
    results = fast_sam(image_np, device='cpu', retina_masks=True, imgsz=1024, conf=0.4, iou=0.9)

    # Get detections
    detections = sv.Detections.from_ultralytics(results[0])

    # Create annotators
    box_annotator = sv.BoxAnnotator()
    mask_annotator = sv.MaskAnnotator()

    # Annotate image: masks first, then boxes on top
    annotated_image = mask_annotator.annotate(scene=image_np.copy(), detections=detections)
    annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections)

    return Image.fromarray(annotated_image)

# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("""
    # CLIP and FastSAM Demo
    This demo combines two powerful AI models:
    - **CLIP**: For zero-shot image classification
    - **FastSAM**: For automatic image segmentation

    Try uploading an image and use either of the tabs below!
    """)

    with gr.Tab("CLIP Zero-Shot Classification"):
        with gr.Row():
            image_input = gr.Image(type="pil", label="Input Image")
            text_input = gr.Textbox(
                label="What do you want to check in the image?",
                placeholder="e.g., 'a dog', 'sunset', 'people playing'"
            )
        output_text = gr.Textbox(label="Result")
        classify_btn = gr.Button("Classify")
        classify_btn.click(fn=process_image_clip, inputs=[image_input, text_input], outputs=output_text)

    with gr.Tab("FastSAM Segmentation"):
        with gr.Row():
            image_input_sam = gr.Image(type="pil", label="Input Image")
            image_output = gr.Image(type="pil", label="Segmentation Result")
        segment_btn = gr.Button("Segment")
        segment_btn.click(fn=process_image_fastsam, inputs=[image_input_sam], outputs=image_output)

    gr.Markdown("""
    ### How to use:
    1. **CLIP Classification**: Upload an image and enter text to check if that concept exists in the image
    2. **FastSAM Segmentation**: Upload an image to get automatic segmentation with bounding boxes and masks
    """)

demo.launch()