saakshigupta committed · verified
Commit 55a09e9 · 1 Parent(s): f0a1db6

Update app.py

Files changed (1)
  1. app.py +56 -93
app.py CHANGED
@@ -432,8 +432,25 @@ def load_blip_model():
         st.error(f"Error loading BLIP model: {str(e)}")
         return None, None
 
+# Define custom prompts for original and GradCAM images
+ORIGINAL_IMAGE_PROMPT = """Generate a detailed description of this image with the following structure:
+Subject: [Describe the person/main subject]
+Appearance: [Describe clothing, hair, facial features]
+Pose: [Describe the person's pose and expression]
+Background: [Describe the environment and setting]
+Lighting: [Describe lighting conditions and shadows]
+Colors: [Note dominant colors and color palette]
+Notable Elements: [Any distinctive objects or visual elements]"""
+
+GRADCAM_IMAGE_PROMPT = """Describe the GradCAM visualization overlay with the following structure:
+Main Focus Area: [Identify the primary region highlighted]
+High Activation Regions: [Describe red/yellow areas and corresponding image features]
+Medium Activation Regions: [Describe green/cyan areas and corresponding image features]
+Low Activation Regions: [Describe blue/dark blue areas and corresponding image features]
+Activation Pattern: [Describe the overall pattern of the heatmap]"""
+
 # Function to generate image caption
-def generate_image_caption(image, processor, model, max_length=50, num_beams=5):
+def generate_image_caption(image, processor, model, is_gradcam=False, max_length=75, num_beams=5):
     """
     Generate a caption for the input image using BLIP model
 
@@ -441,6 +458,7 @@ def generate_image_caption(image, processor, model, max_length=50, num_beams=5):
         image (PIL.Image): Input image
         processor: BLIP processor
         model: BLIP model
+        is_gradcam (bool): Whether the image is a GradCAM visualization
        max_length (int): Maximum length of the caption
         num_beams (int): Number of beams for beam search
 
@@ -448,8 +466,11 @@ def generate_image_caption(image, processor, model, max_length=50, num_beams=5):
         str: Generated caption
     """
     try:
-        # Preprocess the image
-        inputs = processor(image, return_tensors="pt")
+        # Select the appropriate prompt based on image type
+        prompt = GRADCAM_IMAGE_PROMPT if is_gradcam else ORIGINAL_IMAGE_PROMPT
+
+        # Preprocess the image with the prompt
+        inputs = processor(image, text=prompt, return_tensors="pt")
 
         # Check for available GPU
         device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -462,6 +483,11 @@ def generate_image_caption(image, processor, model, max_length=50, num_beams=5):
 
         # Decode the caption
         caption = processor.decode(output[0], skip_special_tokens=True)
+
+        # If the caption contains the prompt, remove it
+        if prompt in caption:
+            caption = caption.replace(prompt, "").strip()
+
         return caption
     except Exception as e:
         st.error(f"Error generating caption: {str(e)}")
@@ -636,16 +662,21 @@ def main():
             image = Image.open(uploaded_file).convert("RGB")
             st.image(image, caption="Uploaded Image", use_column_width=True)
 
-            # Generate image caption if BLIP model is loaded
+            # Generate detailed caption for original image if BLIP model is loaded
             if st.session_state.blip_model_loaded:
-                with st.spinner("Generating image caption..."):
+                with st.spinner("Generating detailed image description..."):
                     caption = generate_image_caption(
                         image,
                         st.session_state.blip_processor,
-                        st.session_state.blip_model
+                        st.session_state.blip_model,
+                        is_gradcam=False
                     )
                     st.session_state.image_caption = caption
-                    st.success(f"📝 Image Caption: **{caption}**")
+                    st.success("📝 Image Description Generated")
+
+                    # Format the caption nicely
+                    st.markdown("### Image Description:")
+                    st.markdown(caption)
 
             # Detect with CLIP model if loaded
             if st.session_state.clip_model_loaded:
@@ -694,6 +725,23 @@ def main():
                 # Display GradCAM results
                 st.image(comparison, caption="Original | CAM | Overlay", use_column_width=True)
 
+                # Generate caption for GradCAM overlay image if BLIP model is loaded
+                if st.session_state.blip_model_loaded:
+                    with st.spinner("Analyzing GradCAM visualization..."):
+                        gradcam_caption = generate_image_caption(
+                            overlay,
+                            st.session_state.blip_processor,
+                            st.session_state.blip_model,
+                            is_gradcam=True,
+                            max_length=100  # Longer for detailed analysis
+                        )
+                        st.session_state.gradcam_caption = gradcam_caption
+                        st.success("✅ GradCAM analysis complete")
+
+                        # Format the GradCAM caption nicely
+                        st.markdown("### GradCAM Analysis:")
+                        st.markdown(gradcam_caption)
+
                 # Save results in session state for LLM analysis
                 st.session_state.current_image = image
                 st.session_state.current_overlay = overlay
@@ -701,89 +749,4 @@ def main():
                 st.session_state.current_pred_label = pred_label
                 st.session_state.current_confidence = confidence
 
-                st.success("✅ Initial detection and GradCAM visualization complete!")
-            else:
-                st.warning("⚠️ Please load the CLIP model first to perform initial detection.")
-
-    # LLM Analysis section
-    with st.expander("Stage 3: Detailed Analysis with Vision LLM", expanded=False):
-        if hasattr(st.session_state, 'current_image') and st.session_state.llm_model_loaded:
-            st.subheader("Detailed Deepfake Analysis")
-
-            # Include caption in the prompt if available
-            caption_text = ""
-            if hasattr(st.session_state, 'image_caption'):
-                caption_text = f"\n\nImage caption: {st.session_state.image_caption}"
-
-            # Default question with option to customize
-            default_question = f"This image has been classified as {st.session_state.current_pred_label}.{caption_text} Analyze the key features that led to this classification, focusing on the highlighted areas in the GradCAM visualization. Provide both a technical explanation for experts and a simple explanation for non-technical users."
-            question = st.text_area("Question/Prompt:", value=default_question, height=100)
-
-            # Analyze button
-            if st.button("🔍 Perform Detailed Analysis", type="primary"):
-                result = analyze_image_with_llm(
-                    st.session_state.current_image,
-                    st.session_state.current_overlay,
-                    st.session_state.current_face_box,
-                    st.session_state.current_pred_label,
-                    st.session_state.current_confidence,
-                    question,
-                    st.session_state.llm_model,
-                    st.session_state.tokenizer,
-                    temperature=temperature,
-                    max_tokens=max_tokens,
-                    custom_instruction=custom_instruction
-                )
-
-                # Display results
-                st.success("✅ Analysis complete!")
-
-                # Check if the result contains both technical and non-technical explanations
-                if "Technical" in result and "Non-Technical" in result:
-                    # Split the result into technical and non-technical sections
-                    parts = result.split("Non-Technical")
-                    technical = parts[0]
-                    non_technical = "Non-Technical" + parts[1]
-
-                    # Display in two columns
-                    col1, col2 = st.columns(2)
-                    with col1:
-                        st.subheader("Technical Analysis")
-                        st.markdown(technical)
-
-                    with col2:
-                        st.subheader("Simple Explanation")
-                        st.markdown(non_technical)
-                else:
-                    # Just display the whole result
-                    st.subheader("Analysis Result")
-                    st.markdown(result)
-        elif not hasattr(st.session_state, 'current_image'):
-            st.warning("⚠️ Please upload an image and complete the initial detection first.")
-        else:
-            st.warning("⚠️ Please load the Vision LLM to perform detailed analysis.")
-
-    # Summary section with caption
-    if hasattr(st.session_state, 'current_image') and hasattr(st.session_state, 'image_caption'):
-        with st.expander("Image Caption Summary", expanded=True):
-            st.subheader("Generated Image Description")
-
-            # Display image and caption
-            col1, col2 = st.columns([1, 2])
-            with col1:
-                st.image(st.session_state.current_image, use_column_width=True)
-            with col2:
-                st.markdown("### BLIP Caption:")
-                st.markdown(f"**{st.session_state.image_caption}**")
-
-            # Display detection result if available
-            if hasattr(st.session_state, 'current_pred_label'):
-                st.markdown("### Detection Result:")
-                st.markdown(f"Classification: **{st.session_state.current_pred_label}** (Confidence: {st.session_state.current_confidence:.2%})")
-
-    # Footer
-    st.markdown("---")
-    st.caption("Advanced Deepfake Image Analyzer with BLIP Captioning")
-
-if __name__ == "__main__":
-    main()
+                st.success("✅ Initial detection and GradCAM visualization complete!")
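Note: the heart of this change is prompt-conditioned captioning. In BLIP's conditional mode, the text passed to the processor becomes a prefix that the caption decoder continues, which is how the structured prompts above steer the description. Below is a minimal sketch of the same call pattern outside Streamlit; the checkpoint name and image path are assumptions, since the commit does not show which BLIP checkpoint the app loads.

```python
# Minimal sketch of BLIP conditional captioning, mirroring the calls
# generate_image_caption makes in this commit. The checkpoint and the
# image path are assumptions -- the app's model loader is not shown here.
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

checkpoint = "Salesforce/blip-image-captioning-large"  # assumed checkpoint
processor = BlipProcessor.from_pretrained(checkpoint)
model = BlipForConditionalGeneration.from_pretrained(checkpoint)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

image = Image.open("photo.jpg").convert("RGB")  # hypothetical input
prompt = "Subject:"  # conditional text acts as a decoder prefix

inputs = processor(image, text=prompt, return_tensors="pt").to(device)
output = model.generate(**inputs, max_length=75, num_beams=5)
caption = processor.decode(output[0], skip_special_tokens=True)
print(caption)  # continuation begins with the prompt, e.g. "subject: a man ..."
```

Because the decoder continues the prefix, long structured prompts are often echoed at the start of the decoded string; that is exactly what the `if prompt in caption` cleanup added in this commit guards against.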
 
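One caveat on that cleanup step: `prompt in caption` requires an exact substring match, but the stock BLIP checkpoints decode through an uncased tokenizer, so the echoed prompt typically comes back lowercased and re-spaced, and the exact match can miss. A sketch of a more forgiving variant (`strip_prompt` is a hypothetical helper, not part of this commit):

```python
def strip_prompt(caption: str, prompt: str) -> str:
    """Remove an echoed conditioning prompt from a decoded BLIP caption.

    Hypothetical helper, not part of this commit: compares
    whitespace-normalized, lowercased text so the match survives
    the tokenizer's decoding normalization.
    """
    norm_caption = " ".join(caption.split())
    norm_prompt = " ".join(prompt.split())
    if norm_caption.lower().startswith(norm_prompt.lower()):
        return norm_caption[len(norm_prompt):].strip()
    # Fall back to the commit's exact-match removal (no-op if absent)
    return caption.replace(prompt, "").strip()
```

Either way, stripping leftover prompt text before the caption is stored in session state keeps the Stage 2 display and the downstream LLM prompt clean.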