# Hugging Face Spaces page header captured with the source (Space status: Sleeping).
import streamlit as st | |
import torch | |
import torch.nn as nn | |
from torch.utils.data import DataLoader | |
from torchvision import transforms | |
from transformers import CLIPModel, BlipProcessor, BlipForConditionalGeneration | |
from transformers.models.clip import CLIPModel | |
from PIL import Image | |
import numpy as np | |
import io | |
import base64 | |
import cv2 | |
import matplotlib.pyplot as plt | |
from peft import PeftModel | |
from unsloth import FastVisionModel | |
import os | |
import tempfile | |
import warnings | |
# Silence UserWarning-category warnings for the rest of the script.
warnings.filterwarnings("ignore", category=UserWarning)

# App title and description
st.set_page_config(
    page_title="Deepfake Analyzer",
    layout="wide",
    # NOTE(review): the icon was mis-encoded ("π") in this copy of the file;
    # the magnifying glass is a best-guess reconstruction - confirm.
    page_icon="🔍",
)

# Main title and description
st.title("Deepfake Image Analyser")
st.markdown("Analyse images for deepfake manipulation")
# Check for GPU availability
def check_gpu():
    """Report CUDA availability in the sidebar.

    Writes a success message (device name and total memory in GB) or a
    warning to the Streamlit sidebar.

    Returns:
        bool: True when ``torch.cuda.is_available()`` is true, else False.
    """
    if not torch.cuda.is_available():
        st.sidebar.warning("⚠️ No GPU detected. Analysis will be slower.")
        return False

    gpu_info = torch.cuda.get_device_properties(0)
    total_gb = gpu_info.total_memory / (1024**3)
    # The leading emoji were mis-encoded in this copy of the file; restored here.
    st.sidebar.success(f"✅ GPU available: {gpu_info.name} ({total_gb:.2f} GB)")
    return True
# Sidebar components
st.sidebar.title("About")
st.sidebar.markdown(
    """
This tool detects deepfakes using four AI models:
- **CLIP**: Initial Real/Fake classification
- **GradCAM**: Highlights suspicious regions
- **BLIP**: Describes image content
- **Llama 3.2**: Explains potential manipulations
### Quick Start
1. **Load Models** - Start with CLIP, add others as needed
2. **Upload Image** - View classification and heat map
3. **Analyze** - Get explanations and ask questions
*GPU recommended for better performance*
"""
)

# Generation settings are fixed (not exposed in the UI); main() passes them
# to the LLM call.
temperature = 0.7
max_tokens = 500

# Optional free-form instructions appended to the LLM prompt. The value stays
# an empty string unless the toggle is switched on.
use_custom_instructions = st.sidebar.toggle(
    "Enable Custom Instructions",
    value=False,
    help="Toggle to enable/disable custom instructions",
)
custom_instruction = ""
if use_custom_instructions:
    custom_instruction = st.sidebar.text_area(
        "Custom Instructions (Advanced)",
        value="Specify your preferred style of explanation (e.g., 'Provide technical, detailed explanations' or 'Use simple, non-technical language'). You can also specify what aspects of the image to focus on.",
        help="Add specific instructions for the analysis",
    )
# ----- GradCAM Implementation -----
class ImageDataset(torch.utils.data.Dataset):
    """One-item dataset wrapping a single uploaded image.

    Each item is the tuple
    ``(tensor, label, "uploaded_image", original_image, face_box, dataset_name)``
    where ``label`` is always 0 and ``face_box`` is ``None`` when
    ``face_only`` is false.
    """

    def __init__(self, image, transform=None, face_only=True, dataset_name=None):
        self.image = image
        self.transform = transform
        self.face_only = face_only
        self.dataset_name = dataset_name
        # Haar cascade shipped with OpenCV, used for frontal-face detection.
        cascade_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
        self.face_detector = cv2.CascadeClassifier(cascade_path)

    def __len__(self):
        # The dataset always holds exactly one image.
        return 1

    def detect_face(self, image_np):
        """Locate the largest face and return ``((x, y, w, h), cropped_array)``.

        When no face is found the box covers the whole image and the input
        array is returned unchanged.
        """
        gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
        faces = self.face_detector.detectMultiScale(gray, 1.1, 5)

        if len(faces) == 0:
            st.info("No face detected, using whole image for analysis")
            height, width = image_np.shape[:2]
            return (0, 0, width, height), image_np

        # Largest detection by area; ties resolve to the first one found.
        x, y, w, h = max(faces, key=lambda box: box[2] * box[3])

        # 5% padding on every side, clipped to the image bounds.
        pad_x = int(w * 0.05)
        pad_y = int(h * 0.05)
        left = max(0, x - pad_x)
        top = max(0, y - pad_y)
        right = min(image_np.shape[1], x + w + pad_x)
        bottom = min(image_np.shape[0], y + h + pad_y)

        return (left, top, right - left, bottom - top), image_np[top:bottom, left:right]

    def __getitem__(self, idx):
        pixels = np.array(self.image)
        # Untouched copy handed back for visualisation.
        snapshot = self.image.copy()
        # Fall back to a plain ToTensor when no transform was supplied.
        to_tensor = self.transform or transforms.ToTensor()

        if not self.face_only:
            return to_tensor(self.image), 0, "uploaded_image", snapshot, None, self.dataset_name

        face_box, face_pixels = self.detect_face(pixels)
        face_tensor = to_tensor(Image.fromarray(face_pixels))
        return face_tensor, 0, "uploaded_image", snapshot, face_box, self.dataset_name
class GradCAM:
    """Activation heat-map extractor for a CLIP-style vision transformer.

    Hooks are attached to ``target_layer`` (a key of
    ``model.named_modules()``) to capture its output and the gradient that
    reaches that output. ``generate`` removes the hooks when it finishes, so
    repeated use on a long-lived model does not accumulate hooks; they are
    re-attached automatically on the next ``generate`` call.

    NOTE(review): the captured gradients are only checked for presence; the
    map itself is built from activation magnitudes, as in the original code.
    """

    def __init__(self, model, target_layer):
        self.model = model
        self.target_layer = target_layer
        self.gradients = None
        self.activations = None
        self._hook_handles = []
        self._register_hooks()

    def _register_hooks(self):
        """Attach the forward/backward capture hooks and remember the handles."""

        def forward_hook(module, inputs, output):
            self.activations = output[0] if isinstance(output, tuple) else output

        def backward_hook(module, grad_in, grad_out):
            self.gradients = grad_out[0] if isinstance(grad_out, tuple) else grad_out

        layer = dict(self.model.named_modules())[self.target_layer]
        self._hook_handles.append(layer.register_forward_hook(forward_hook))
        # register_backward_hook is deprecated; prefer the full variant and
        # fall back only on PyTorch builds that lack it.
        register_backward = getattr(layer, "register_full_backward_hook", None)
        if register_backward is None:
            register_backward = layer.register_backward_hook
        self._hook_handles.append(register_backward(backward_hook))

    def remove_hooks(self):
        """Detach every hook this instance registered on the model."""
        for handle in self._hook_handles:
            handle.remove()
        self._hook_handles = []

    @staticmethod
    def _tokens_to_grid(tokens):
        """Turn per-token activations ``[sequence_length, hidden_dim]`` into a 2-D map.

        When the sequence is one class token followed by a perfect square of
        patch tokens (197 -> 14x14, 257 -> 16x16), the class token is dropped
        and the patches are laid out row-major on their true grid. Otherwise
        all tokens are packed into the largest square that fits, zero-padded.
        """
        seq_len = tokens.shape[0]
        num_patches = seq_len - 1
        side = int(round(np.sqrt(num_patches))) if num_patches > 0 else 0
        if side > 0 and side * side == num_patches:
            importance = np.mean(np.abs(tokens[1:]), axis=1)
            return importance.reshape(side, side)

        side = int(np.sqrt(seq_len))
        importance = np.mean(np.abs(tokens), axis=1)
        flat = np.zeros(side * side)
        count = min(len(importance), len(flat))
        flat[:count] = importance[:count]
        return flat.reshape(side, side)

    def generate(self, input_tensor, class_idx):
        """Run the vision tower on ``input_tensor`` and return a heat map.

        Args:
            input_tensor: batch passed as ``pixel_values`` to
                ``model.vision_model``.
            class_idx: index into the pooled feature vector that seeds the
                backward pass.

        Returns:
            numpy.ndarray: 2-D map with non-negative values, scaled so the
            maximum is 1 when any value is positive. A constant 14x14 map of
            0.5 is returned when hooks captured nothing or an error occurred
            (the error is reported through Streamlit).
        """
        self.model.zero_grad()
        try:
            if not self._hook_handles:
                self._register_hooks()
            # Drop captures from an earlier call so a failed pass is detected.
            self.gradients = None
            self.activations = None

            # Use only the vision part of the model for gradient calculation
            vision_outputs = self.model.vision_model(pixel_values=input_tensor)
            features = vision_outputs.pooler_output

            # Seed the backward pass with a one-hot gradient on the features.
            one_hot = torch.zeros_like(features)
            one_hot[0, class_idx] = 1
            features.backward(gradient=one_hot)

            if self.gradients is None or self.activations is None:
                st.warning("Warning: Gradients or activations are None. Using fallback CAM.")
                return np.ones((14, 14), dtype=np.float32) * 0.5

            activations = self.activations.detach().cpu().numpy()
            if activations.ndim == 3:  # [batch, sequence_length, hidden_dim]
                cam = self._tokens_to_grid(activations[0])
            else:
                st.info("Using fallback CAM shape (14x14)")
                cam = np.ones((14, 14), dtype=np.float32) * 0.5

            # Ensure we have valid values
            cam = np.maximum(cam, 0)
            peak = np.max(cam)
            if peak > 0:
                cam = cam / peak
            return cam
        except Exception as e:
            st.error(f"Error in GradCAM.generate: {str(e)}")
            return np.ones((14, 14), dtype=np.float32) * 0.5
        finally:
            # Leave the (possibly cached) model clean: no hooks and no
            # parameter gradients left over from this pass.
            self.remove_hooks()
            self.model.zero_grad()
def overlay_cam_on_image(image, cam, face_box=None, alpha=0.5):
    """Blend a jet-coloured CAM over ``image`` and return the result as a PIL image.

    With ``face_box`` given as ``(x, y, w, h)`` the CAM is resized to that
    rectangle and the rest of the map is zero; otherwise the CAM is stretched
    over the whole image. ``alpha`` is the weight of the heat map in the blend.
    """
    img_np = np.array(image)
    img_h, img_w = img_np.shape[:2]

    if face_box is None:
        heat = cv2.resize(cam, (img_w, img_h))
        colored = plt.cm.jet(heat)[:, :, :3]
    else:
        x, y, w, h = face_box
        canvas = np.zeros((img_h, img_w), dtype=np.float32)
        canvas[y:y + h, x:x + w] = cv2.resize(cam, (w, h))
        # Quantise to 8 bits before colouring (the original did this through
        # a PIL round trip, which yields the same uint8 values).
        quantized = (canvas * 255).astype(np.uint8)
        colored = plt.cm.jet(quantized / 255.0)[:, :, :3]

    cam_colormap = (colored * 255).astype(np.uint8)

    # Blend in float space, then convert back to 8-bit.
    base = img_np.astype(float) / 255.0
    heat_rgb = cam_colormap.astype(float) / 255.0
    mixed = base * (1 - alpha) + heat_rgb * alpha
    return Image.fromarray((mixed * 255).astype(np.uint8))
def save_comparison(image, cam, overlay, face_box=None):
    """Create a side-by-side comparison of the original, CAM, and overlay.

    Args:
        image: original PIL image.
        cam: 2-D heat map; resized to the face box or to the whole image.
        overlay: image shown in the third panel.
        face_box: optional ``(x, y, w, h)``; when given, the box is outlined
            on the original and the CAM panel is zero outside it.

    Returns:
        PIL.Image.Image: the rendered three-panel figure (PNG-backed).
    """
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    try:
        # Original Image
        axes[0].imshow(image)
        axes[0].set_title("Original")
        if face_box is not None:
            x, y, w, h = face_box
            rect = plt.Rectangle((x, y), w, h, edgecolor='lime', linewidth=2, fill=False)
            axes[0].add_patch(rect)
        axes[0].axis("off")

        # CAM
        if face_box is not None:
            img_np = np.array(image)
            img_h, img_w = img_np.shape[:2]
            full_cam = np.zeros((img_h, img_w))
            x, y, fw, fh = face_box
            full_cam[y:y + fh, x:x + fw] = cv2.resize(cam, (fw, fh))
            axes[1].imshow(full_cam, cmap="jet")
        else:
            cam_resized = cv2.resize(cam, (image.width, image.height))
            axes[1].imshow(cam_resized, cmap="jet")
        axes[1].set_title("CAM")
        axes[1].axis("off")

        # Overlay
        axes[2].imshow(overlay)
        axes[2].set_title("Overlay")
        axes[2].axis("off")

        fig.tight_layout()

        # Convert plot to PIL Image for Streamlit display
        buf = io.BytesIO()
        fig.savefig(buf, format="png", bbox_inches="tight")
    finally:
        # Close this specific figure even when drawing fails, so figures do
        # not pile up in a long-running app process.
        plt.close(fig)

    buf.seek(0)
    return Image.open(buf)
# Function to load GradCAM CLIP model
def load_clip_model():
    """Load CLIP ViT-L/14 and attach a 2-way linear head; return None on failure.

    NOTE(review): the head is freshly initialised here (normal weights, zero
    bias) and no trained head weights are loaded in this function - confirm
    whether that is intended.
    """
    with st.spinner("Loading CLIP model for GradCAM..."):
        try:
            clip = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")

            # Simple classification head on the 1024-d pooled features.
            head = nn.Linear(1024, 2)
            head.weight.data.normal_(mean=0.0, std=0.02)
            head.bias.data.zero_()
            clip.classification_head = head

            clip.eval()
        except Exception as e:
            st.error(f"Error loading CLIP model: {str(e)}")
            return None
        return clip
def get_target_layer_clip(model):
    """Get the target layer for GradCAM.

    Returns the module name of the last vision encoder layer, derived from
    ``model.vision_model.encoder.layers`` so it stays valid for CLIP variants
    with a different depth. Falls back to the ViT-L/14 value
    (``"vision_model.encoder.layers.23"``) when that attribute path is
    missing or empty.
    """
    default_layer = "vision_model.encoder.layers.23"
    try:
        num_layers = len(model.vision_model.encoder.layers)
    except (AttributeError, TypeError):
        return default_layer
    if num_layers < 1:
        return default_layer
    return f"vision_model.encoder.layers.{num_layers - 1}"
def process_image_with_gradcam(image, model, device, pred_class):
    """Run the face-cropped image through GradCAM and build the visualisations.

    Returns ``(cam, overlay, comparison, face_box)``. If CAM extraction or
    rendering fails, the error is shown in the UI and a constant 14x14 map of
    0.5 is used instead.
    """
    # Resize to 224x224 and normalise with the mean/std constants below.
    preprocess = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]),
    ])
    dataset = ImageDataset(image, transform=preprocess, face_only=True)

    def custom_collate(batch):
        # Only tensors and labels are stacked; PIL images, boxes and names
        # stay as plain Python lists.
        tensors, labels, paths, images, face_boxes, dataset_names = map(list, zip(*batch))
        return torch.stack(tensors), torch.tensor(labels), paths, images, face_boxes, dataset_names

    loader = DataLoader(dataset, batch_size=1, shuffle=False, collate_fn=custom_collate)

    # The dataset holds exactly one item, so the first batch is the only one.
    input_tensor, _, _, original_images, face_boxes, _ = next(iter(loader))
    original_image = original_images[0]
    face_box = face_boxes[0]

    input_tensor = input_tensor.to(device)
    model = model.to(device)

    try:
        cam_extractor = GradCAM(model, get_target_layer_clip(model))
        cam = cam_extractor.generate(input_tensor, pred_class)
        overlay = overlay_cam_on_image(original_image, cam, face_box)
        comparison = save_comparison(original_image, cam, overlay, face_box)
    except Exception as e:
        st.error(f"Error processing image with GradCAM: {str(e)}")
        # Fall back to a flat map so the UI still has something to show.
        cam = np.ones((14, 14), dtype=np.float32) * 0.5
        overlay = overlay_cam_on_image(original_image, cam, face_box)
        comparison = save_comparison(original_image, cam, overlay, face_box)

    return cam, overlay, comparison, face_box
# ----- BLIP Image Captioning -----
# Function to load BLIP captioning models
def load_blip_models():
    """Load the stock and the fine-tuned BLIP captioners.

    Returns ``(original_processor, original_model, finetuned_processor,
    finetuned_model)``, or four ``None`` values when loading fails (the error
    is shown in the UI).
    """
    base_id = "Salesforce/blip-image-captioning-large"
    finetuned_id = "saakshigupta/deepfake-blip-large"

    with st.spinner("Loading BLIP captioning models..."):
        try:
            # Stock BLIP: general image captioning.
            original_processor = BlipProcessor.from_pretrained(base_id)
            original_model = BlipForConditionalGeneration.from_pretrained(base_id)

            # Fine-tuned BLIP: used on the GradCAM overlay.
            finetuned_processor = BlipProcessor.from_pretrained(finetuned_id)
            finetuned_model = BlipForConditionalGeneration.from_pretrained(finetuned_id)
        except Exception as e:
            st.error(f"Error loading BLIP models: {str(e)}")
            return None, None, None, None

    return original_processor, original_model, finetuned_processor, finetuned_model
# Function to generate image caption using BLIP's VQA approach for GradCAM
# (heading shown to the user, marker searched for in the generated caption)
_ACTIVATION_SECTIONS = (
    ("High activation", "high activation :"),
    ("Moderate activation", "moderate activation :"),
    ("Low activation", "low activation :"),
)


def _format_activation_sections(caption):
    """Split ``caption`` on the activation markers and format it as Markdown.

    A section runs from its marker to the next occurrence of any full marker.
    Cutting on the full marker (not on the bare words "moderate"/"low") keeps
    text such as "lower lip", "below" or "yellow" intact. Returns an empty
    string when no marker with non-empty text is present.
    """
    markers = [marker for _, marker in _ACTIVATION_SECTIONS]
    blocks = []
    for title, marker in _ACTIVATION_SECTIONS:
        _, found, tail = caption.partition(marker)
        if not found:
            continue
        stops = [pos for pos in (tail.find(m) for m in markers) if pos != -1]
        text = tail[:min(stops, default=len(tail))].strip()
        if text:
            blocks.append(f"**{title}**:\n{text}")
    return "\n\n".join(blocks)


def generate_gradcam_caption(image, processor, model, max_length=60):
    """
    Generate a detailed analysis of GradCAM visualization using the fine-tuned BLIP model

    Args:
        image: the GradCAM overlay image passed to ``processor``.
        processor: BLIP processor (callable; also provides ``decode``).
        model: BLIP generation model; moved to CUDA when available.
        max_length: maximum caption length passed to ``model.generate``.

    Returns:
        str: Markdown with High/Moderate/Low activation sections. When the
        caption contains none of the markers, the raw caption is returned so
        the UI is not left blank. On error, a fixed error message.
    """
    try:
        inputs = processor(image, return_tensors="pt")

        # Check for available GPU and move model and inputs
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = model.to(device)
        inputs = {k: v.to(device) if hasattr(v, 'to') else v for k, v in inputs.items()}

        with torch.no_grad():
            output = model.generate(**inputs, max_length=max_length, num_beams=5)

        caption = processor.decode(output[0], skip_special_tokens=True)

        formatted_text = _format_activation_sections(caption)
        return formatted_text if formatted_text else caption.strip()
    except Exception as e:
        st.error(f"Error analyzing GradCAM: {str(e)}")
        return "Error analyzing GradCAM visualization"
# Function to generate caption for original image
def generate_image_caption(image, processor, model, max_length=75, num_beams=5):
    """Caption ``image`` with BLIP and wrap the caption in a fixed Markdown template.

    Only the "Appearance" line comes from the model; every other line of the
    template is static text. Returns "Error generating caption" on failure
    (the error is also shown in the UI).
    """
    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = model.to(device)

        # Unconditional captioning: the processor receives no text prompt.
        inputs = processor(image, return_tensors="pt").to(device)

        with torch.no_grad():
            token_ids = model.generate(**inputs, max_length=max_length, num_beams=num_beams)

        caption = processor.decode(token_ids[0], skip_special_tokens=True)

        sections = [
            "**Subject**: The image shows a person in a photograph.",
            f"**Appearance**: {caption}",
            "**Background**: The background appears to be a controlled environment.",
            "**Lighting**: The lighting appears to be professional with even illumination.",
            "**Colors**: The image contains natural skin tones and colors typical of photography.",
            "**Notable Elements**: The facial features and expression are the central focus of the image.",
        ]
        return "\n".join(sections).strip()
    except Exception as e:
        st.error(f"Error generating caption: {str(e)}")
        return "Error generating caption"
# ----- Fine-tuned Vision LLM -----
# Function to fix cross-attention masks
def fix_cross_attention_mask(inputs):
    """Replace a ``cross_attention_mask`` that has a zero-sized dimension.

    The replacement is an all-ones mask of shape
    ``(batch, seq_len, 6404, num_tiles)`` on the same device. ``inputs`` is
    modified in place and returned; it is returned untouched when the key is
    missing or no dimension is zero.
    """
    if 'cross_attention_mask' not in inputs:
        return inputs

    mask = inputs['cross_attention_mask']
    if 0 not in mask.shape:
        return inputs

    batch_size, seq_len, _, num_tiles = mask.shape
    visual_features = 6404  # Critical dimension
    inputs['cross_attention_mask'] = torch.ones(
        (batch_size, seq_len, visual_features, num_tiles),
        device=mask.device,
    )
    return inputs
# Load model function
def load_llm_model():
    """Load the 4-bit Llama 3.2 vision base model plus the deepfake-explainer adapter.

    Returns ``(model, tokenizer)``, or ``(None, None)`` when loading fails
    (the error is shown in the UI).
    """
    with st.spinner("Loading LLM vision model... This may take a few minutes. Please be patient..."):
        try:
            # Called for its sidebar message; the return value is not needed.
            check_gpu()

            # Base model and tokenizer via Unsloth.
            model, tokenizer = FastVisionModel.from_pretrained(
                "unsloth/llama-3.2-11b-vision-instruct",
                load_in_4bit=True,
            )

            # Attach the adapter on top of the base model.
            model = PeftModel.from_pretrained(model, "saakshigupta/deepfake-explainer-1")

            # Set to inference mode
            FastVisionModel.for_inference(model)
        except Exception as e:
            st.error(f"Error loading model: {str(e)}")
            return None, None

    return model, tokenizer
# Analyze image function
def analyze_image_with_llm(image, gradcam_overlay, face_box, pred_label, confidence, question, model, tokenizer, temperature=0.7, max_tokens=500, custom_instruction=""):
    """Ask the vision LLM about the image and its GradCAM overlay.

    Args:
        image: original image (first image in the chat message).
        gradcam_overlay: GradCAM overlay (second image in the chat message).
        face_box: accepted for interface compatibility; not used here.
        pred_label: classifier label inserted into the prompt.
        confidence: classifier confidence, formatted with two decimals.
        question: user question placed at the start of the prompt.
        model: generation model (provides ``device`` and ``generate``).
        tokenizer: processor used for the chat template, encoding and decoding.
        temperature, max_tokens: forwarded to ``model.generate``.
        custom_instruction: appended to the prompt when not blank.

    Returns:
        str: the model's answer. Only the newly generated tokens are decoded,
        so neither the prompt nor chat-template residue (e.g. role headers)
        leaks into the answer. If the prompt length cannot be determined the
        previous behaviour (split the decoded text on the prompt) is used.
    """
    # Create a prompt that includes GradCAM information
    full_prompt = f"{question}\n\nThe image has been processed with GradCAM and classified as {pred_label} with confidence {confidence:.2f}. Focus on the highlighted regions in red/yellow which show the areas the detection model found suspicious."
    if custom_instruction.strip():
        full_prompt = f"{full_prompt}\n\n{custom_instruction}"

    # Format the message to include both the original image and the GradCAM visualization
    messages = [
        {"role": "user", "content": [
            {"type": "image", "image": image},  # Original image
            {"type": "image", "image": gradcam_overlay},  # GradCAM overlay
            {"type": "text", "text": full_prompt},
        ]}
    ]

    # Apply chat template
    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

    # Process with image
    inputs = tokenizer(
        [image, gradcam_overlay],  # Send both images
        input_text,
        add_special_tokens=False,
        return_tensors="pt",
    ).to(model.device)

    # Fix cross-attention mask if needed
    inputs = fix_cross_attention_mask(inputs)

    # Generate response
    # NOTE(review): temperature/top_p normally only matter when sampling is
    # enabled; whether it is depends on the model's generation config - confirm.
    with st.spinner("Generating detailed analysis... (this may take 15-30 seconds)"):
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                use_cache=True,
                temperature=temperature,
                top_p=0.9,
            )

    # Preferred path: decode only the tokens produced after the prompt.
    try:
        prompt_len = inputs["input_ids"].shape[-1]
    except (KeyError, AttributeError, TypeError):
        prompt_len = None
    if prompt_len is not None and len(output_ids[0]) > prompt_len:
        return tokenizer.decode(output_ids[0][prompt_len:], skip_special_tokens=True).strip()

    # Fallback: decode everything and cut at the prompt text when present.
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    if full_prompt in response:
        return response.split(full_prompt)[-1].strip()
    return response
# Main app
def _rerun():
    """Restart the script run.

    Uses ``st.rerun`` when this Streamlit version provides it and falls back
    to the older ``st.experimental_rerun`` otherwise.
    """
    rerun = getattr(st, "rerun", None)
    if rerun is None:
        rerun = st.experimental_rerun
    rerun()


def _init_session_state():
    """Create the session-state slots for models and chat history if missing."""
    state = st.session_state
    if 'clip_model_loaded' not in state:
        state.clip_model_loaded = False
        state.clip_model = None
    if 'llm_model_loaded' not in state:
        state.llm_model_loaded = False
        state.llm_model = None
        state.tokenizer = None
    if 'blip_model_loaded' not in state:
        state.blip_model_loaded = False
        state.original_processor = None
        state.original_model = None
        state.finetuned_processor = None
        state.finetuned_model = None
    if 'chat_history' not in state:
        state.chat_history = []


def _render_model_loading():
    """Stage 1: one load button (or a ready message) per model family."""
    state = st.session_state
    with st.expander("Stage 1: Model Loading", expanded=True):
        st.write("Please load the models using the buttons below:")
        clip_col, blip_col, llm_col = st.columns(3)

        with clip_col:
            if state.clip_model_loaded:
                st.success("✅ CLIP model loaded and ready!")
            elif st.button("📥 Load CLIP Model for Detection", type="primary"):
                model = load_clip_model()
                if model is not None:
                    state.clip_model = model
                    state.clip_model_loaded = True
                    st.success("✅ CLIP model loaded successfully!")
                else:
                    st.error("❌ Failed to load CLIP model.")

        with blip_col:
            if state.blip_model_loaded:
                st.success("✅ BLIP captioning models loaded and ready!")
            elif st.button("📥 Load BLIP for Captioning", type="primary"):
                original_processor, original_model, finetuned_processor, finetuned_model = load_blip_models()
                if all([original_processor, original_model, finetuned_processor, finetuned_model]):
                    state.original_processor = original_processor
                    state.original_model = original_model
                    state.finetuned_processor = finetuned_processor
                    state.finetuned_model = finetuned_model
                    state.blip_model_loaded = True
                    st.success("✅ BLIP captioning models loaded successfully!")
                else:
                    st.error("❌ Failed to load BLIP models.")

        with llm_col:
            if state.llm_model_loaded:
                st.success("✅ Vision LLM loaded and ready!")
            elif st.button("📥 Load Vision LLM for Analysis", type="primary"):
                model, tokenizer = load_llm_model()
                if model is not None and tokenizer is not None:
                    state.llm_model = model
                    state.tokenizer = tokenizer
                    state.llm_model_loaded = True
                    st.success("✅ Vision LLM loaded successfully!")
                else:
                    st.error("❌ Failed to load Vision LLM.")


def _render_upload_and_detection():
    """Stage 2: upload an image, classify it with CLIP and show the GradCAM views.

    Results needed by later stages are stored in ``st.session_state``
    (``current_image``, ``current_overlay``, ``current_face_box``,
    ``current_pred_label``, ``current_confidence`` and the two captions).
    """
    state = st.session_state
    with st.expander("Stage 2: Image Upload & Initial Detection", expanded=True):
        st.subheader("Upload an Image")
        uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
        if uploaded_file is None:
            return

        try:
            image = Image.open(uploaded_file).convert("RGB")

            # Display the image with a controlled width
            col1, col2 = st.columns([1, 2])
            with col1:
                st.image(image, caption="Uploaded Image", width=300)

            # Caption is stored now and displayed in the summary section.
            if state.blip_model_loaded:
                with st.spinner("Generating image description..."):
                    caption = generate_image_caption(
                        image,
                        state.original_processor,
                        state.original_model,
                    )
                    state.image_caption = caption

            if not state.clip_model_loaded:
                st.warning("⚠️ Please load the CLIP model first to perform initial detection.")
                return

            with st.spinner("Analyzing image with CLIP model..."):
                # Preprocess image for CLIP
                transform = transforms.Compose([
                    transforms.Resize((224, 224)),
                    transforms.ToTensor(),
                    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]),
                ])
                dataset = ImageDataset(image, transform=transform, face_only=True)
                tensor, _, _, _, face_box, _ = dataset[0]
                tensor = tensor.unsqueeze(0)

                device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
                model = state.clip_model.to(device)
                tensor = tensor.to(device)

                # Forward pass
                with torch.no_grad():
                    outputs = model.vision_model(pixel_values=tensor).pooler_output
                    logits = model.classification_head(outputs)
                    probs = torch.softmax(logits, dim=1)[0]
                    pred_class = torch.argmax(probs).item()
                    confidence = probs[pred_class].item()
                    pred_label = "Fake" if pred_class == 1 else "Real"

            with col2:
                st.markdown("### Detection Result")
                st.markdown(f"**Classification:** {pred_label} (Confidence: {confidence:.2%})")

                st.subheader("GradCAM Visualization")
                cam, overlay, comparison, detected_face_box = process_image_with_gradcam(
                    image, model, device, pred_class
                )
                st.image(comparison, caption="Original | CAM | Overlay", width=700)

                # Caption is stored now and displayed in the summary section.
                if state.blip_model_loaded:
                    with st.spinner("Analyzing GradCAM visualization..."):
                        gradcam_caption = generate_gradcam_caption(
                            overlay,
                            state.finetuned_processor,
                            state.finetuned_model,
                        )
                        state.gradcam_caption = gradcam_caption

            # Save results in session state for LLM analysis
            state.current_image = image
            state.current_overlay = overlay
            state.current_face_box = detected_face_box
            state.current_pred_label = pred_label
            state.current_confidence = confidence
            st.success("✅ Initial detection and GradCAM visualization complete!")
        except Exception as e:
            st.error(f"Error processing image: {str(e)}")
            import traceback
            st.error(traceback.format_exc())  # This will show the full error traceback


def _render_analysis_summary():
    """Show the stored image, overlay and captions once at least one caption exists."""
    state = st.session_state
    if 'current_image' not in state:
        return
    if 'image_caption' not in state and 'gradcam_caption' not in state:
        return

    with st.expander("Image Analysis Summary", expanded=True):
        col1, col2 = st.columns([1, 2])
        with col1:
            st.image(state.current_image, caption="Original Image", width=300)
            if 'current_overlay' in state:
                st.image(state.current_overlay, caption="GradCAM Visualization", width=300)
        with col2:
            if 'image_caption' in state:
                st.markdown("### Image Description")
                st.markdown(state.image_caption)
                st.markdown("---")
            if 'gradcam_caption' in state:
                st.markdown("### GradCAM Analysis")
                st.markdown(state.gradcam_caption)
                st.markdown("---")


def _show_llm_result(result):
    """Render one LLM answer, in two columns when it has both explanation styles."""
    if "Technical" in result and "Non-Technical" in result:
        try:
            parts = result.split("Non-Technical")
            technical = parts[0]
            non_technical = "Non-Technical" + parts[1]
            tech_col, simple_col = st.columns(2)
            with tech_col:
                st.subheader("Technical Analysis")
                st.markdown(technical)
            with simple_col:
                st.subheader("Simple Explanation")
                st.markdown(non_technical)
        except Exception:
            # Fallback if splitting fails
            st.subheader("Analysis Result")
            st.markdown(result)
    else:
        st.subheader("Analysis Result")
        st.markdown(result)


def _render_llm_stage():
    """Stage 3: chat-style questions answered by the vision LLM."""
    state = st.session_state
    with st.expander("Stage 3: Detailed Analysis with Vision LLM", expanded=False):
        if 'current_image' not in state:
            st.warning("⚠️ Please upload an image and complete the initial detection first.")
            return
        if not state.llm_model_loaded:
            st.warning("⚠️ Please load the Vision LLM to perform detailed analysis.")
            return

        st.subheader("Detailed Deepfake Analysis")

        # Display chat history
        for i, (question, answer) in enumerate(state.chat_history):
            st.markdown(f"**Question {i+1}:** {question}")
            st.markdown(f"**Answer:** {answer}")
            st.markdown("---")

        # Include both captions in the prompt if available
        caption_text = ""
        if 'image_caption' in state:
            caption_text += f"\n\nImage Description:\n{state.image_caption}"
        if 'gradcam_caption' in state:
            caption_text += f"\n\nGradCAM Analysis:\n{state.gradcam_caption}"

        # Default question, pre-filled only for the first question of a chat.
        default_question = f"This image has been classified as {state.current_pred_label}. Analyze the key features that led to this classification, focusing on the highlighted areas in the GradCAM visualization. Provide both a technical explanation for experts and a simple explanation for non-technical users."
        new_question = st.text_area("Ask a question about the image:", value=default_question if not state.chat_history else "", height=100)

        col1, col2 = st.columns([3, 1])
        with col1:
            # NOTE(review): the button icon was mis-encoded ("π") in this copy
            # of the file; the magnifying glass is a best guess - confirm.
            analyze_button = st.button("🔍 Send Question", type="primary")
        with col2:
            clear_button = st.button("🗑️ Clear Chat History")

        if clear_button:
            state.chat_history = []
            _rerun()

        if not (analyze_button and new_question):
            return

        try:
            # Caption context is attached to the first question only.
            full_question = new_question if state.chat_history else new_question + caption_text
            result = analyze_image_with_llm(
                state.current_image,
                state.current_overlay,
                state.current_face_box,
                state.current_pred_label,
                state.current_confidence,
                full_question,
                state.llm_model,
                state.tokenizer,
                temperature=temperature,
                max_tokens=max_tokens,
                custom_instruction=custom_instruction,
            )
            state.chat_history.append((new_question, result))
            st.success("✅ Analysis complete!")
            _show_llm_result(result)
        except Exception as e:
            st.error(f"Error during LLM analysis: {str(e)}")
        else:
            # Rerun to update the chat history display
            _rerun()


def main():
    """Render the app: model loading, detection, summary, LLM analysis, footer."""
    _init_session_state()
    _render_model_loading()
    _render_upload_and_detection()
    # Image Analysis Summary section - AFTER Stage 2
    _render_analysis_summary()
    # LLM Analysis section - AFTER Image Analysis Summary
    _render_llm_stage()
    # Footer
    st.markdown("---")


if __name__ == "__main__":
    main()