Upload gradcam_clip_large-2.py

gradcam_clip_large-2.py (ADDED, +345 -0)
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torchvision.transforms.functional import to_pil_image
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import os
import numpy as np
import warnings
from transformers import AutoProcessor, CLIPModel
import cv2
import re
from huggingface_hub import hf_hub_download
import io

warnings.filterwarnings("ignore", category=UserWarning)

class ImageDataset(Dataset):
    def __init__(self, image, transform=None, face_only=True, dataset_name=None):
        # Modified to accept a single PIL image instead of a list of paths
        self.image = image
        self.transform = transform
        self.face_only = face_only
        self.dataset_name = dataset_name
        # Load the OpenCV Haar-cascade face detector
        self.face_detector = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

    def __len__(self):
        return 1  # Only one image

    def detect_face(self, image_np):
        """Detect a face in the image and return the face region."""
        gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
        faces = self.face_detector.detectMultiScale(gray, 1.1, 5)

        # If no face is detected, use the whole image
        if len(faces) == 0:
            print("No face detected, using whole image")
            h, w = image_np.shape[:2]
            return (0, 0, w, h), image_np

        # Choose the largest face by area
        if len(faces) > 1:
            areas = [w * h for (x, y, w, h) in faces]
            largest_idx = np.argmax(areas)
            x, y, w, h = faces[largest_idx]
        else:
            x, y, w, h = faces[0]

        # Add 5% padding around the face on each side
        padding_x = int(w * 0.05)
        padding_y = int(h * 0.05)

        # Clamp the padded box to the image bounds
        x1 = max(0, x - padding_x)
        y1 = max(0, y - padding_y)
        x2 = min(image_np.shape[1], x + w + padding_x)
        y2 = min(image_np.shape[0], y + h + padding_y)

        # Extract the face region
        face_img = image_np[y1:y2, x1:x2]

        return (x1, y1, x2 - x1, y2 - y1), face_img

    def __getitem__(self, idx):
        # Use the single image provided
        image_np = np.array(self.image)
        label = 0  # Default label; overridden by the prediction in app.py

        # Keep the original image for visualization
        original_image = self.image.copy()

        # Detect the face if requested
        if self.face_only:
            face_box, face_img_np = self.detect_face(image_np)
            face_img = Image.fromarray(face_img_np)

            # Apply the transform to the face crop
            if self.transform:
                face_tensor = self.transform(face_img)
            else:
                face_tensor = transforms.ToTensor()(face_img)

            return face_tensor, label, "uploaded_image", original_image, face_box, self.dataset_name
        else:
            # Process the whole image
            if self.transform:
                image_tensor = self.transform(self.image)
            else:
                image_tensor = transforms.ToTensor()(self.image)

            return image_tensor, label, "uploaded_image", original_image, None, self.dataset_name

class GradCAM:
    def __init__(self, model, target_layer):
        self.model = model
        self.target_layer = target_layer
        self.gradients = None
        self.activations = None
        self._register_hooks()

    def _register_hooks(self):
        def forward_hook(module, input, output):
            if isinstance(output, tuple):
                self.activations = output[0]
            else:
                self.activations = output

        def backward_hook(module, grad_in, grad_out):
            if isinstance(grad_out, tuple):
                self.gradients = grad_out[0]
            else:
                self.gradients = grad_out

        layer = dict([*self.model.named_modules()])[self.target_layer]
        layer.register_forward_hook(forward_hook)
        # Note: register_backward_hook is deprecated in recent PyTorch;
        # register_full_backward_hook is the current equivalent.
        layer.register_backward_hook(backward_hook)

    def generate(self, input_tensor, class_idx):
        self.model.zero_grad()

        try:
            # Run only the vision tower so gradients flow through the hooked layer
            vision_outputs = self.model.vision_model(pixel_values=input_tensor)

            # Pooled image features: [batch, hidden_dim]
            features = vision_outputs.pooler_output

            # Seed the backward pass with a one-hot gradient at class_idx
            one_hot = torch.zeros_like(features)
            one_hot[0, class_idx] = 1

            # Manually backpropagate from the pooled features
            features.backward(gradient=one_hot)

            # Fall back to a flat CAM if the hooks never captured anything
            if self.gradients is None or self.activations is None:
                print("Warning: Gradients or activations are None. Using fallback CAM.")
                return np.ones((14, 14), dtype=np.float32) * 0.5

            if len(self.gradients.shape) == 4:  # Convolutional layout: [B, C, H, W]
                gradients = self.gradients.cpu().detach().numpy()
                activations = self.activations.cpu().detach().numpy()

                # Classic Grad-CAM: channel weights are spatially averaged gradients
                weights = np.mean(gradients, axis=(2, 3))
                cam = np.zeros(activations.shape[2:], dtype=np.float32)

                for i, w in enumerate(weights[0]):
                    cam += w * activations[0, i, :, :]
            else:
                # Transformer layout
                gradients = self.gradients.cpu().detach().numpy()
                activations = self.activations.cpu().detach().numpy()

                if len(activations.shape) == 3:  # [batch, sequence_length, hidden_dim]
                    seq_len = activations.shape[1]

                    # A 224px CLIP ViT-B/16 has 196 patch tokens (14x14) + 1 class
                    # token = 197; the ViT-L/14 used here yields 257 tokens and
                    # takes the general branch below.
                    if seq_len == 197:
                        # Drop the class token and grid the patch tokens
                        patch_tokens = activations[0, 1:, :]
                        # Mean absolute activation across the hidden dimension
                        token_importance = np.mean(np.abs(patch_tokens), axis=1)
                        cam = token_importance.reshape(14, 14)
                    else:
                        # Fall back to the nearest square grid
                        side_len = int(np.sqrt(seq_len))
                        # Mean absolute activation across features as importance
                        token_importance = np.mean(np.abs(activations[0]), axis=1)
                        cam = np.zeros((side_len, side_len))
                        # Fill the grid with as many token scores as fit
                        flat_cam = cam.flatten()
                        n = min(len(token_importance), len(flat_cam))
                        flat_cam[:n] = token_importance[:n]
                        cam = flat_cam.reshape(side_len, side_len)
                else:
                    print("Using fallback CAM shape (14x14)")
                    cam = np.ones((14, 14), dtype=np.float32) * 0.5

            # Guard against an empty CAM
            if cam is None or cam.size == 0:
                print("Warning: Generated CAM is empty. Using fallback.")
                cam = np.ones((14, 14), dtype=np.float32) * 0.5

            # ReLU, then normalize to [0, 1]
            cam = np.maximum(cam, 0)
            if np.max(cam) > 0:
                cam = cam / np.max(cam)

            return cam

        except Exception as e:
            print(f"Error in GradCAM.generate: {str(e)}")
            return np.ones((14, 14), dtype=np.float32) * 0.5

def overlay_cam_on_image(image, cam, face_box=None, alpha=0.5):
    """Blend a jet-colormapped CAM onto the image; if a face box is given,
    the CAM is confined to the face region."""
    if face_box is not None:
        x, y, w, h = face_box
        # Full-image CAM, zero everywhere except the face region
        img_np = np.array(image)
        full_h, full_w = img_np.shape[:2]
        full_cam = np.zeros((full_h, full_w), dtype=np.float32)

        # Resize the CAM to the face region and paste it at the face position
        face_cam = cv2.resize(cam, (w, h))
        full_cam[y:y + h, x:x + w] = face_cam

        cam_resized = Image.fromarray((full_cam * 255).astype(np.uint8))
    else:
        cam_resized = Image.fromarray((cam * 255).astype(np.uint8)).resize(image.size, Image.BILINEAR)

    # Apply the colormap and blend; convert to RGB so blend modes match
    cam_colormap = plt.cm.jet(np.array(cam_resized) / 255.0)[:, :, :3]
    cam_colormap = (cam_colormap * 255).astype(np.uint8)
    blended = Image.blend(image.convert("RGB"), Image.fromarray(cam_colormap), alpha=alpha)
    return blended

def save_comparison(image, cam, overlay, face_box=None):
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    # Original image, with the detected face box outlined
    axes[0].imshow(image)
    axes[0].set_title("Original")
    if face_box is not None:
        x, y, w, h = face_box
        rect = plt.Rectangle((x, y), w, h, edgecolor='lime', linewidth=2, fill=False)
        axes[0].add_patch(rect)
    axes[0].axis("off")

    # CAM
    if face_box is not None:
        # Build a full-image CAM that highlights only the face
        img_np = np.array(image)
        h, w = img_np.shape[:2]
        full_cam = np.zeros((h, w))

        x, y, fw, fh = face_box
        # Resize the CAM to the face size and place it at the face position
        face_cam = cv2.resize(cam, (fw, fh))
        full_cam[y:y + fh, x:x + fw] = face_cam
        axes[1].imshow(full_cam, cmap="jet")
    else:
        axes[1].imshow(cam, cmap="jet")
    axes[1].set_title("CAM")
    axes[1].axis("off")

    # Overlay
    axes[2].imshow(overlay)
    axes[2].set_title("Overlay")
    axes[2].axis("off")

    plt.tight_layout()

    # Render the figure to a PIL Image for Streamlit display
    buf = io.BytesIO()
    plt.savefig(buf, format="png", bbox_inches="tight")
    plt.close()
    buf.seek(0)
    return Image.open(buf)

def load_clip_model():
    # Modified to load the fine-tuned checkpoint from the Hugging Face Hub
    model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
    processor = AutoProcessor.from_pretrained("openai/clip-vit-large-patch14")

    checkpoint_path = hf_hub_download(repo_id="drg31/model", filename="model.pth")
    checkpoint = torch.load(checkpoint_path, map_location='cpu')

    # Keep only checkpoint entries whose names and shapes match the base model
    model_dict = model.state_dict()
    checkpoint = {k: v for k, v in checkpoint.items() if k in model_dict and model_dict[k].shape == v.shape}

    model_dict.update(checkpoint)
    model.load_state_dict(model_dict)

    model.eval()
    return model, processor

def get_target_layer_clip(model):
    # Hook the final encoder block of the ViT-L/14 vision tower (layers 0-23);
    # its activations come out in the [batch, seq_len, hidden_dim] format that
    # GradCAM.generate expects.
    return "vision_model.encoder.layers.23"

def process_images(dataloader, model, cam_extractor, device, pred_class):
    # Modified to process a single image and return results for Streamlit
    for batch in dataloader:
        input_tensor, label, img_paths, original_images, face_boxes, dataset_names = batch
        original_image = original_images[0]
        face_box = face_boxes[0]

        print("Processing uploaded image...")

        # Move the input and model to the target device
        input_tensor = input_tensor.to(device)
        model = model.to(device)

        try:
            # Forward pass (the output itself is unused; GradCAM.generate
            # re-runs the forward internally with the hooks attached)
            output = model.vision_model(pixel_values=input_tensor).pooler_output
            class_idx = pred_class  # Predicted class supplied by app.py
            cam = cam_extractor.generate(input_tensor, class_idx)

            # Build the colormapped CAM image
            if face_box is not None:
                x, y, w, h = face_box
                img_np = np.array(original_image)
                h_full, w_full = img_np.shape[:2]
                full_cam = np.zeros((h_full, w_full))
                face_cam = cv2.resize(cam, (w, h))
                full_cam[y:y + h, x:x + w] = face_cam
                cam_img = Image.fromarray((plt.cm.jet(full_cam)[:, :, :3] * 255).astype(np.uint8))
            else:
                cam_resized = Image.fromarray((cam * 255).astype(np.uint8)).resize(original_image.size, Image.BILINEAR)
                cam_colormap = plt.cm.jet(np.array(cam_resized) / 255.0)[:, :, :3]
                cam_colormap = (cam_colormap * 255).astype(np.uint8)
                cam_img = Image.fromarray(cam_colormap)

            # Blend the CAM over the original image
            overlay = overlay_cam_on_image(original_image, cam, face_box)

            # Side-by-side original / CAM / overlay figure
            comparison = save_comparison(original_image, cam, overlay, face_box)

            return cam, cam_img, overlay, comparison

        except Exception as e:
            print(f"Error processing image: {str(e)}")
            import traceback
            traceback.print_exc()
            # Return flat fallback outputs so the app can still render something
            default_cam = np.ones((14, 14), dtype=np.float32) * 0.5
            cam_resized = Image.fromarray((default_cam * 255).astype(np.uint8)).resize(original_image.size, Image.BILINEAR)
            cam_colormap = plt.cm.jet(np.array(cam_resized) / 255.0)[:, :, :3]
            cam_colormap = (cam_colormap * 255).astype(np.uint8)
            cam_img = Image.fromarray(cam_colormap)
            overlay = overlay_cam_on_image(original_image, default_cam, face_box)
            comparison = save_comparison(original_image, default_cam, overlay, face_box)
            return default_cam, cam_img, overlay, comparison
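
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the committed app): wires the pieces
# above together for one image. The input path, pred_class value, and the
# collate function are assumptions -- the real app.py supplies the uploaded
# image and the classifier's predicted class, and may batch differently.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    def collate_single(batch):
        # default_collate cannot batch PIL images or face-box tuples, so pass
        # them through untouched and only stack the input tensors
        tensors, labels, paths, images, boxes, names = zip(*batch)
        return torch.stack(tensors), labels, paths, images, boxes, names

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # processor is unused here; the transform below replicates CLIP preprocessing
    model, processor = load_clip_model()

    # Standard OpenAI CLIP normalization constants
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.48145466, 0.4578275, 0.40821073),
                             std=(0.26862954, 0.26130258, 0.27577711)),
    ])

    image = Image.open("example.jpg").convert("RGB")  # hypothetical input
    dataset = ImageDataset(image, transform=transform, face_only=True)
    dataloader = DataLoader(dataset, batch_size=1, collate_fn=collate_single)

    cam_extractor = GradCAM(model, get_target_layer_clip(model))
    # pred_class=0 is a placeholder; app.py passes the classifier's prediction
    cam, cam_img, overlay, comparison = process_images(dataloader, model, cam_extractor, device, pred_class=0)
    comparison.save("gradcam_comparison.png")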