saakshigupta committed on
Commit
cd7498a
·
verified ·
1 Parent(s): 2bc3c60

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -44
app.py CHANGED
@@ -424,32 +424,32 @@ def process_image_with_gradcam(image, model, device, pred_class):
424
 
425
  # ----- BLIP Image Captioning -----
426
 
427
- # Define custom prompts for original and GradCAM images - simpler prompts that work better with BLIP
428
- ORIGINAL_IMAGE_PROMPT = "Detailed description:"
 
429
 
430
- GRADCAM_IMAGE_PROMPT = "Describe this heatmap visualization:"
 
 
 
 
 
 
 
 
 
 
431
 
432
- # Function to generate image caption with structured formatting
433
  def generate_image_caption(image, processor, model, is_gradcam=False, max_length=150, num_beams=5):
434
  """
435
  Generate a caption for the input image using BLIP model and format it with structured headings
436
-
437
- Args:
438
- image (PIL.Image): Input image
439
- processor: BLIP processor
440
- model: BLIP model
441
- is_gradcam (bool): Whether the image is a GradCAM visualization
442
- max_length (int): Maximum length of the caption
443
- num_beams (int): Number of beams for beam search
444
-
445
- Returns:
446
- str: Generated caption with structured formatting
447
  """
448
  try:
449
  # Select the appropriate prompt based on image type
450
  prompt = GRADCAM_IMAGE_PROMPT if is_gradcam else ORIGINAL_IMAGE_PROMPT
451
 
452
- # Preprocess the image with the basic prompt
453
  inputs = processor(image, text=prompt, return_tensors="pt")
454
 
455
  # Check for available GPU
@@ -464,11 +464,7 @@ def generate_image_caption(image, processor, model, is_gradcam=False, max_length
464
  # Decode the caption
465
  raw_caption = processor.decode(output[0], skip_special_tokens=True)
466
 
467
- # Remove the prompt if it appears in the caption
468
- if prompt in raw_caption:
469
- raw_caption = raw_caption.replace(prompt, "").strip()
470
-
471
- # Format the caption with proper structure based on type
472
  if is_gradcam:
473
  formatted_caption = format_gradcam_caption(raw_caption)
474
  else:
@@ -481,17 +477,21 @@ def generate_image_caption(image, processor, model, is_gradcam=False, max_length
481
 
482
  def format_image_caption(raw_caption):
483
  """Format a raw caption into a structured description with headings"""
484
- # Basic structure for image caption
 
 
 
 
485
  structured_caption = f"""
486
- **Subject**: The image shows a person, likely in a portrait or headshot format.
487
 
488
- **Appearance**: {raw_caption}
489
 
490
- **Background**: The background appears to be a studio or controlled environment setting.
491
 
492
- **Lighting**: The lighting appears to be professional with even illumination on the subject's face.
493
 
494
- **Colors**: The image contains a range of tones typical in portrait photography.
495
 
496
  **Notable Elements**: The facial features and expression are the central focus of the image.
497
  """
@@ -499,32 +499,21 @@ def format_image_caption(raw_caption):
499
 
500
  def format_gradcam_caption(raw_caption):
501
  """Format a raw GradCAM description with proper structure"""
 
502
  # Basic structure for GradCAM analysis
503
  structured_caption = f"""
504
- **Main Focus Area**: The heatmap is primarily focused on the facial region.
505
 
506
- **High Activation Regions**: The red/yellow areas highlight {raw_caption}
507
 
508
- **Medium Activation Regions**: The green/cyan areas correspond to medium importance features in the image.
509
 
510
- **Low Activation Regions**: The blue/dark blue areas represent features that have less impact on the model's decision.
511
 
512
- **Activation Pattern**: The overall pattern suggests the model is focusing on key facial features to make its determination.
513
  """
514
  return structured_caption.strip()
515
 
516
- # Function to load BLIP captioning model
517
- @st.cache_resource
518
- def load_blip_model():
519
- with st.spinner("Loading BLIP captioning model..."):
520
- try:
521
- processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
522
- model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
523
- return processor, model
524
- except Exception as e:
525
- st.error(f"Error loading BLIP model: {str(e)}")
526
- return None, None
527
-
528
  # ----- Fine-tuned Vision LLM -----
529
 
530
  # Function to fix cross-attention masks
 
424
 
425
  # ----- BLIP Image Captioning -----
426
 
427
# Prompts handed to the BLIP captioner.  Plain photos get no prompt at all
# (BLIP free-captions better unprompted); GradCAM overlays get a steering prompt.
ORIGINAL_IMAGE_PROMPT = ""  # Empty prompt for original images - BLIP works better with no prompt
GRADCAM_IMAGE_PROMPT = "Describe what you see in this heatmap visualization"
430
 
431
# Loader for the BLIP captioning checkpoint, cached across Streamlit reruns.
@st.cache_resource
def load_blip_model():
    """Load the BLIP large captioning checkpoint.

    Returns:
        tuple: (processor, model) on success, or (None, None) if loading fails
        (the error is surfaced in the Streamlit UI rather than raised).
    """
    checkpoint = "Salesforce/blip-image-captioning-large"
    with st.spinner("Loading BLIP captioning model..."):
        try:
            return (
                BlipProcessor.from_pretrained(checkpoint),
                BlipForConditionalGeneration.from_pretrained(checkpoint),
            )
        except Exception as e:
            st.error(f"Error loading BLIP model: {str(e)}")
            return None, None
442
 
443
+ # Function to generate image caption with manual structured formatting
444
  def generate_image_caption(image, processor, model, is_gradcam=False, max_length=150, num_beams=5):
445
  """
446
  Generate a caption for the input image using BLIP model and format it with structured headings
 
 
 
 
 
 
 
 
 
 
 
447
  """
448
  try:
449
  # Select the appropriate prompt based on image type
450
  prompt = GRADCAM_IMAGE_PROMPT if is_gradcam else ORIGINAL_IMAGE_PROMPT
451
 
452
+ # Preprocess the image
453
  inputs = processor(image, text=prompt, return_tensors="pt")
454
 
455
  # Check for available GPU
 
464
  # Decode the caption
465
  raw_caption = processor.decode(output[0], skip_special_tokens=True)
466
 
467
+ # Format the caption into a structured format based on type
 
 
 
 
468
  if is_gradcam:
469
  formatted_caption = format_gradcam_caption(raw_caption)
470
  else:
 
477
 
478
  def format_image_caption(raw_caption):
479
  """Format a raw caption into a structured description with headings"""
480
+
481
+ # Try to extract some basic information from the raw caption
482
+ appearance_info = raw_caption # Use the full caption by default
483
+
484
+ # Basic structure for image caption with extracted information
485
  structured_caption = f"""
486
+ **Subject**: The image shows a person in a portrait-style photograph.
487
 
488
+ **Appearance**: {appearance_info}
489
 
490
+ **Background**: The background appears to be a controlled environment.
491
 
492
+ **Lighting**: The lighting appears to be professional with even illumination.
493
 
494
+ **Colors**: The image contains natural skin tones and colors typical of portrait photography.
495
 
496
  **Notable Elements**: The facial features and expression are the central focus of the image.
497
  """
 
499
 
500
def format_gradcam_caption(raw_caption):
    """Format a raw GradCAM description with proper structure.

    Wraps the model-generated text in a fixed, headed Markdown template; the
    raw caption is folded into the high-activation section.
    """
    # One entry per activation band of the heatmap, highest first.
    sections = [
        "**Main Focus Area**: The heatmap is primarily focused on the facial region of the person.",
        f"**High Activation Regions**: The red/yellow areas highlight important features that the model is focusing on. {raw_caption}",
        "**Medium Activation Regions**: The green/cyan areas correspond to regions of medium importance in the detection process, typically including parts of the face and surrounding areas.",
        "**Low Activation Regions**: The blue/dark blue areas represent features that have less impact on the model's decision, usually the background and peripheral elements.",
        "**Activation Pattern**: The overall pattern suggests the model is primarily analyzing facial features to make its determination of authenticity.",
    ]
    # Blank line between sections reproduces the original stripped template.
    return "\n\n".join(sections)
516
 
 
 
 
 
 
 
 
 
 
 
 
 
517
  # ----- Fine-tuned Vision LLM -----
518
 
519
  # Function to fix cross-attention masks