import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torchvision.transforms.functional import to_pil_image
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import os
import numpy as np
import warnings
from transformers import AutoProcessor, CLIPModel
import cv2
import re
from huggingface_hub import hf_hub_download
import io

warnings.filterwarnings("ignore", category=UserWarning)

class ImageDataset(Dataset):
    """Wraps a single uploaded PIL image so it can be served through a DataLoader."""

    def __init__(self, image, transform=None, face_only=True, dataset_name=None):
        self.image = image
        self.transform = transform
        self.face_only = face_only
        self.dataset_name = dataset_name

        # OpenCV's bundled Haar cascade for frontal face detection
        self.face_detector = cv2.CascadeClassifier(
            cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

    def __len__(self):
        # The dataset always holds exactly one image
        return 1

    def detect_face(self, image_np):
        """Detect a face in the image and return its bounding box and cropped region."""
        gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
        faces = self.face_detector.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)

        # Fall back to the whole image when no face is found
        if len(faces) == 0:
            print("No face detected, using whole image")
            h, w = image_np.shape[:2]
            return (0, 0, w, h), image_np

        # If several faces are detected, keep the largest one
        if len(faces) > 1:
            areas = [w * h for (x, y, w, h) in faces]
            largest_idx = np.argmax(areas)
            x, y, w, h = faces[largest_idx]
        else:
            x, y, w, h = faces[0]

        # Pad the box by 5% on each side, clamped to the image bounds
        padding_x = int(w * 0.05)
        padding_y = int(h * 0.05)
        x1 = max(0, x - padding_x)
        y1 = max(0, y - padding_y)
        x2 = min(image_np.shape[1], x + w + padding_x)
        y2 = min(image_np.shape[0], y + h + padding_y)

        face_img = image_np[y1:y2, x1:x2]
        return (x1, y1, x2 - x1, y2 - y1), face_img

    def __getitem__(self, idx):
        image_np = np.array(self.image)
        label = 0  # placeholder; the actual prediction is supplied by the caller
        original_image = self.image.copy()

        if self.face_only:
            face_box, face_img_np = self.detect_face(image_np)
            face_img = Image.fromarray(face_img_np)

            if self.transform:
                face_tensor = self.transform(face_img)
            else:
                face_tensor = transforms.ToTensor()(face_img)

            return face_tensor, label, "uploaded_image", original_image, face_box, self.dataset_name
        else:
            if self.transform:
                image_tensor = self.transform(self.image)
            else:
                image_tensor = transforms.ToTensor()(self.image)

            return image_tensor, label, "uploaded_image", original_image, None, self.dataset_name

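
# ImageDataset yields PIL images, tuples, and possibly None, which torch's
# default_collate cannot batch, while process_images() below indexes every
# batch field with [0]. A collate that transposes the batch into per-field
# tuples satisfies both. This is a minimal sketch; the name `collate_single`
# is illustrative, not part of the original source.
def collate_single(batch):
    # Transpose [(tensor, label, path, image, box, name), ...] into per-field
    # tuples, stacking only the image tensors into a 4-D batch.
    tensors, labels, paths, images, boxes, names = zip(*batch)
    return torch.stack(tensors), labels, paths, images, boxes, names
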
class GradCAM:
    """Grad-CAM for the CLIP vision tower: captures activations and gradients
    at a target layer and combines them into a class activation map."""

    def __init__(self, model, target_layer):
        self.model = model
        self.target_layer = target_layer
        self.gradients = None
        self.activations = None
        self._register_hooks()

    def _register_hooks(self):
        def forward_hook(module, input, output):
            # Transformer blocks return tuples; the hidden states come first
            if isinstance(output, tuple):
                self.activations = output[0]
            else:
                self.activations = output

        def backward_hook(module, grad_in, grad_out):
            if isinstance(grad_out, tuple):
                self.gradients = grad_out[0]
            else:
                self.gradients = grad_out

        layer = dict(self.model.named_modules())[self.target_layer]
        layer.register_forward_hook(forward_hook)
        # register_backward_hook is deprecated; use the full variant instead
        layer.register_full_backward_hook(backward_hook)

    def generate(self, input_tensor, class_idx):
        self.model.zero_grad()

        try:
            # Forward pass through the vision tower only
            vision_outputs = self.model.vision_model(pixel_values=input_tensor)
            features = vision_outputs.pooler_output

            # Backpropagate from the selected feature dimension
            one_hot = torch.zeros_like(features)
            one_hot[0, class_idx] = 1
            features.backward(gradient=one_hot)

            if self.gradients is None or self.activations is None:
                print("Warning: Gradients or activations are None. Using fallback CAM.")
                return np.ones((14, 14), dtype=np.float32) * 0.5

            if len(self.gradients.shape) == 4:
                # Convolutional case: weight each channel by its mean gradient
                gradients = self.gradients.cpu().detach().numpy()
                activations = self.activations.cpu().detach().numpy()

                weights = np.mean(gradients, axis=(2, 3))
                cam = np.zeros(activations.shape[2:], dtype=np.float32)
                for i, w in enumerate(weights[0]):
                    cam += w * activations[0, i, :, :]
            else:
                # Transformer case: activations are (batch, seq_len, hidden)
                gradients = self.gradients.cpu().detach().numpy()
                activations = self.activations.cpu().detach().numpy()

                if len(activations.shape) == 3:
                    seq_len = activations.shape[1]
                    side_len = int(np.sqrt(seq_len - 1))

                    if side_len * side_len == seq_len - 1:
                        # seq_len is a square grid of patches plus a CLS token
                        # (197 -> 14x14 for ViT-B/16, 257 -> 16x16 for ViT-L/14):
                        # drop the CLS token and reshape patch importance to a grid
                        patch_tokens = activations[0, 1:, :]
                        token_importance = np.mean(np.abs(patch_tokens), axis=1)
                        cam = token_importance.reshape(side_len, side_len)
                    else:
                        # Unexpected sequence length: fit as many tokens as possible
                        side_len = int(np.sqrt(seq_len))
                        token_importance = np.mean(np.abs(activations[0]), axis=1)
                        flat_cam = np.zeros(side_len * side_len)
                        n = min(len(token_importance), len(flat_cam))
                        flat_cam[:n] = token_importance[:n]
                        cam = flat_cam.reshape(side_len, side_len)
                else:
                    print("Using fallback CAM shape (14x14)")
                    cam = np.ones((14, 14), dtype=np.float32) * 0.5

            if cam is None or cam.size == 0:
                print("Warning: Generated CAM is empty. Using fallback.")
                cam = np.ones((14, 14), dtype=np.float32) * 0.5

            # ReLU, then normalize to [0, 1]
            cam = np.maximum(cam, 0)
            if np.max(cam) > 0:
                cam = cam / np.max(cam)

            return cam

        except Exception as e:
            print(f"Error in GradCAM.generate: {str(e)}")
            return np.ones((14, 14), dtype=np.float32) * 0.5

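
# A quick smoke test of GradCAM in isolation (an illustrative sketch, not part
# of the original source; the random tensor stands in for a preprocessed
# 224x224 image batch). For openai/clip-vit-large-patch14 the target layer
# sees 257 tokens (16x16 patches plus CLS), so the CLS-aware reshape above
# yields a 16x16 map:
#
#   model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
#   cam_extractor = GradCAM(model, "vision_model.encoder.layers.23")
#   cam = cam_extractor.generate(torch.randn(1, 3, 224, 224), class_idx=0)
#   assert cam.shape == (16, 16)
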
def overlay_cam_on_image(image, cam, face_box=None, alpha=0.5):
    """Blend the CAM heatmap over the original image, confined to the face box if given."""
    if face_box is not None:
        x, y, w, h = face_box
        img_np = np.array(image)
        full_h, full_w = img_np.shape[:2]
        full_cam = np.zeros((full_h, full_w), dtype=np.float32)

        # Resize the CAM to the face box and paste it into a full-size canvas
        face_cam = cv2.resize(cam, (w, h))
        full_cam[y:y + h, x:x + w] = face_cam

        cam_colormap = plt.cm.jet(full_cam)[:, :, :3]
        cam_colormap = (cam_colormap * 255).astype(np.uint8)
    else:
        cam_resized = Image.fromarray((cam * 255).astype(np.uint8)).resize(image.size, Image.BILINEAR)
        cam_colormap = plt.cm.jet(np.array(cam_resized) / 255.0)[:, :, :3]
        cam_colormap = (cam_colormap * 255).astype(np.uint8)

    # Image.blend requires matching modes and sizes, so force RGB on the original
    blended = Image.blend(image.convert("RGB"), Image.fromarray(cam_colormap), alpha=alpha)
    return blended

def save_comparison(image, cam, overlay, face_box=None):
    """Render a side-by-side figure: original (with face box), raw CAM, and overlay."""
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    # Original image, with the detected face outlined if available
    axes[0].imshow(image)
    axes[0].set_title("Original")
    if face_box is not None:
        x, y, w, h = face_box
        rect = plt.Rectangle((x, y), w, h, edgecolor='lime', linewidth=2, fill=False)
        axes[0].add_patch(rect)
    axes[0].axis("off")

    # Raw CAM, pasted into a full-size canvas when it only covers the face
    if face_box is not None:
        img_np = np.array(image)
        h, w = img_np.shape[:2]
        full_cam = np.zeros((h, w))
        x, y, fw, fh = face_box
        face_cam = cv2.resize(cam, (fw, fh))
        full_cam[y:y + fh, x:x + fw] = face_cam
        axes[1].imshow(full_cam, cmap="jet")
    else:
        axes[1].imshow(cam, cmap="jet")
    axes[1].set_title("CAM")
    axes[1].axis("off")

    # Heatmap blended over the original
    axes[2].imshow(overlay)
    axes[2].set_title("Overlay")
    axes[2].axis("off")

    plt.tight_layout()

    # Return the figure as a PIL image instead of writing it to disk
    buf = io.BytesIO()
    plt.savefig(buf, format="png", bbox_inches="tight")
    plt.close(fig)
    buf.seek(0)
    return Image.open(buf)

def load_clip_model():
    """Load CLIP ViT-L/14 and overlay the fine-tuned weights from the Hub checkpoint."""
    model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
    processor = AutoProcessor.from_pretrained("openai/clip-vit-large-patch14")

    checkpoint_path = hf_hub_download(repo_id="drg31/model", filename="model.pth")
    checkpoint = torch.load(checkpoint_path, map_location='cpu')

    # Keep only checkpoint entries whose names and shapes match the base model
    model_dict = model.state_dict()
    checkpoint = {k: v for k, v in checkpoint.items()
                  if k in model_dict and model_dict[k].shape == v.shape}
    model_dict.update(checkpoint)
    model.load_state_dict(model_dict)

    model.eval()
    return model, processor

def get_target_layer_clip(model):
    # Last encoder block of the 24-layer ViT-L/14 vision tower (layers 0-23)
    return "vision_model.encoder.layers.23"

def process_images(dataloader, model, cam_extractor, device, pred_class):
    """Run Grad-CAM on the single image in the dataloader and return the raw CAM,
    a heatmap image, the blended overlay, and a comparison figure."""
    for batch in dataloader:
        input_tensor, label, img_paths, original_images, face_boxes, dataset_names = batch
        original_image = original_images[0]
        face_box = face_boxes[0]

        print("Processing uploaded image...")

        input_tensor = input_tensor.to(device)
        model = model.to(device)

        try:
            # generate() runs its own forward pass through the vision tower
            cam = cam_extractor.generate(input_tensor, pred_class)

            # Build a full-size heatmap image from the CAM
            if face_box is not None:
                x, y, w, h = face_box
                img_np = np.array(original_image)
                h_full, w_full = img_np.shape[:2]
                full_cam = np.zeros((h_full, w_full))
                face_cam = cv2.resize(cam, (w, h))
                full_cam[y:y + h, x:x + w] = face_cam
                cam_img = Image.fromarray((plt.cm.jet(full_cam)[:, :, :3] * 255).astype(np.uint8))
            else:
                cam_resized = Image.fromarray((cam * 255).astype(np.uint8)).resize(original_image.size, Image.BILINEAR)
                cam_colormap = plt.cm.jet(np.array(cam_resized) / 255.0)[:, :, :3]
                cam_colormap = (cam_colormap * 255).astype(np.uint8)
                cam_img = Image.fromarray(cam_colormap)

            overlay = overlay_cam_on_image(original_image, cam, face_box)
            comparison = save_comparison(original_image, cam, overlay, face_box)
            return cam, cam_img, overlay, comparison

        except Exception as e:
            print(f"Error processing image: {str(e)}")
            import traceback
            traceback.print_exc()

            # Fall back to a flat CAM so the caller always receives usable outputs
            default_cam = np.ones((14, 14), dtype=np.float32) * 0.5
            cam_resized = Image.fromarray((default_cam * 255).astype(np.uint8)).resize(original_image.size, Image.BILINEAR)
            cam_colormap = plt.cm.jet(np.array(cam_resized) / 255.0)[:, :, :3]
            cam_colormap = (cam_colormap * 255).astype(np.uint8)
            cam_img = Image.fromarray(cam_colormap)
            overlay = overlay_cam_on_image(original_image, default_cam, face_box)
            comparison = save_comparison(original_image, default_cam, overlay, face_box)
            return default_cam, cam_img, overlay, comparison
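

# End-to-end usage sketch (an assumption, not part of the original source):
# the normalization constants are the standard CLIP image statistics, the
# image path is a placeholder, `collate_single` is the illustrative helper
# defined above, and pred_class=0 stands in for the classifier's prediction.
if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model, processor = load_clip_model()

    # Match CLIP's expected input: 224x224, normalized with its mean/std
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                             std=[0.26862954, 0.26130258, 0.27577711]),
    ])

    image = Image.open("example.jpg").convert("RGB")
    dataset = ImageDataset(image, transform=transform, face_only=True)
    dataloader = DataLoader(dataset, batch_size=1, collate_fn=collate_single)

    cam_extractor = GradCAM(model, get_target_layer_clip(model))
    cam, cam_img, overlay, comparison = process_images(
        dataloader, model, cam_extractor, device, pred_class=0)
    comparison.save("comparison.png")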