Update app.py
app.py CHANGED
@@ -424,9 +424,9 @@ def process_image_with_gradcam(image, model, device, pred_class):

 # ----- BLIP Image Captioning -----

-# Define
-ORIGINAL_IMAGE_PROMPT = "" #
-GRADCAM_IMAGE_PROMPT = "
+# Define conditional prompts for BLIP
+ORIGINAL_IMAGE_PROMPT = "an image of" # For the original image
+GRADCAM_IMAGE_PROMPT = "a heatmap showing" # For the GradCAM visualization

 # Function to load BLIP captioning model
 @st.cache_resource
@@ -440,71 +440,65 @@ def load_blip_model():
         st.error(f"Error loading BLIP model: {str(e)}")
         return None, None

-# Function to generate image caption
-def generate_image_caption(image, processor, model, is_gradcam=False, max_length=
+# Function to generate image caption using BLIP
+def generate_image_caption(image, processor, model, is_gradcam=False, max_length=75, num_beams=5):
     """
-    Generate a caption for the input image using BLIP model
+    Generate a caption for the input image using BLIP model's conditional captioning
     """
     try:
         # Select the appropriate prompt based on image type
-
-
-        # Preprocess the image
-        inputs = processor(image, text=prompt, return_tensors="pt")
+        conditional_prompt = GRADCAM_IMAGE_PROMPT if is_gradcam else ORIGINAL_IMAGE_PROMPT

         # Check for available GPU
         device = "cuda" if torch.cuda.is_available() else "cpu"
         model = model.to(device)
-        inputs = {k: v.to(device) for k, v in inputs.items()}

-        #
+        # Get conditional caption
+        conditional_inputs = processor(image, conditional_prompt, return_tensors="pt").to(device)
         with torch.no_grad():
-
-
-        # Decode the caption
-        raw_caption = processor.decode(output[0], skip_special_tokens=True)
+            conditional_output = model.generate(**conditional_inputs, max_length=max_length, num_beams=num_beams)
+        conditional_caption = processor.decode(conditional_output[0], skip_special_tokens=True)

-        #
+        # Remove the prompt from the beginning if it appears
+        if conditional_prompt in conditional_caption:
+            conditional_caption = conditional_caption.replace(conditional_prompt, "").strip()
+
+        # Format the caption based on image type
         if is_gradcam:
-
+            full_info = format_gradcam_caption(conditional_caption)
         else:
-
+            full_info = format_image_caption(conditional_caption)

-        return
+        return full_info
     except Exception as e:
         st.error(f"Error generating caption: {str(e)}")
         return "Error generating caption"

-def format_image_caption(
-    """Format
-
-    # Try to extract some basic information from the raw caption
-    appearance_info = raw_caption # Use the full caption by default
+def format_image_caption(caption):
+    """Format caption into a structured description with headings"""

-    # Basic structure for image caption with extracted information
     structured_caption = f"""
-    **Subject**: The image shows a person in a
+    **Subject**: The image shows a person in a photograph.

-    **Appearance**: {
+    **Appearance**: {caption}

     **Background**: The background appears to be a controlled environment.

     **Lighting**: The lighting appears to be professional with even illumination.

-    **Colors**: The image contains natural skin tones and colors typical of
+    **Colors**: The image contains natural skin tones and colors typical of photography.

     **Notable Elements**: The facial features and expression are the central focus of the image.
     """
     return structured_caption.strip()

-def format_gradcam_caption(
-    """Format
+def format_gradcam_caption(caption):
+    """Format GradCAM caption with proper structure"""

-    # Basic structure for GradCAM analysis
     structured_caption = f"""
     **Main Focus Area**: The heatmap is primarily focused on the facial region of the person.

-    **High Activation Regions**: The red/yellow areas highlight important features that the model is focusing on. {
+    **High Activation Regions**: The red/yellow areas highlight important features that the model is focusing on. {caption}

     **Medium Activation Regions**: The green/cyan areas correspond to regions of medium importance in the detection process, typically including parts of the face and surrounding areas.

@@ -778,6 +772,8 @@ def main():
                 st.warning("⚠️ Please load the CLIP model first to perform initial detection.")
             except Exception as e:
                 st.error(f"Error processing image: {str(e)}")
+                import traceback
+                st.error(traceback.format_exc()) # This will show the full error traceback

     # LLM Analysis section
     with st.expander("Stage 3: Detailed Analysis with Vision LLM", expanded=False):