Austing Dong committed
Commit b019cc7 · 1 Parent(s): 63b5fc2
app.py CHANGED
@@ -9,6 +9,7 @@ from demo.modified_attn import ModifiedLlamaAttention, ModifiedGemmaAttention
from questions.mini_VLAT import mini_VLAT_questions
from questions.VLAT_old import VLAT_old_questions
from questions.VLAT import VLAT_questions
+from questions.New_test import new_test_questions
import numpy as np
import matplotlib.pyplot as plt
import gc
@@ -244,6 +245,10 @@ def test_change(test_selector):
        return gr.Dataset(
            samples=VLAT_questions,
        )
+    elif test_selector == "New_test":
+        return gr.Dataset(
+            samples=new_test_questions,
+        )
    else:
        return gr.Dataset(
            samples=VLAT_old_questions,
@@ -265,7 +270,7 @@ with gr.Blocks() as demo:

        with gr.Column():
            model_selector = gr.Dropdown(choices=["ChartGemma-3B", "Janus-Pro-1B", "Janus-Pro-7B", "LLaVA-1.5-7B"], value="ChartGemma-3B", label="model")
-            test_selector = gr.Dropdown(choices=["mini-VLAT", "VLAT", "VLAT-old"], value="mini-VLAT", label="test")
+            test_selector = gr.Dropdown(choices=["mini-VLAT", "VLAT", "VLAT-old", "New_test"], value="mini-VLAT", label="test")
            chart_type = gr.Textbox(label="Chart Type", value="Any")
            und_seed_input = gr.Number(label="Seed", precision=0, value=42)
            top_p = gr.Slider(minimum=0, maximum=1, value=0.95, step=0.05, label="top_p")
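Note (editor's aside, not part of the commit): a minimal sketch of how the new "New_test" choice reaches test_change. The .change() wiring and the placeholder question lists below are assumptions for illustration; only the dropdown choices and the dispatch-by-test-name pattern mirror this commit.

import gradio as gr

# Hypothetical stand-in question banks; each sample row is [chart type, question, image path].
mini_VLAT_questions = [["BarChart", "Which student has the highest score in midterm?", "images/mini_VLAT/BarChart.png"]]
new_test_questions = [["AreaChart", "At which month the price of tea is the highest?", "images/New_test/AreaChart.png"]]

def test_change(test_selector):
    # Same dispatch pattern as the commit: return an updated Dataset for the selected test.
    if test_selector == "New_test":
        return gr.Dataset(samples=new_test_questions)
    return gr.Dataset(samples=mini_VLAT_questions)

with gr.Blocks() as demo:
    test_selector = gr.Dropdown(choices=["mini-VLAT", "New_test"], value="mini-VLAT", label="test")
    examples = gr.Dataset(
        components=[gr.Textbox(visible=False), gr.Textbox(visible=False), gr.Textbox(visible=False)],
        samples=mini_VLAT_questions,
    )
    test_selector.change(test_change, inputs=test_selector, outputs=examples)

# demo.launch()  # uncomment to try the dropdown locally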
evaluate/evaluate.py CHANGED
@@ -1,15 +1,22 @@
import os
import torch
import base64
+import json
import numpy as np
from PIL import Image
from openai import OpenAI
from demo.model_utils import *
-from evaluate.questions import questions
+from pydantic import BaseModel
+
+questions = json.load(open("evaluate/new_test.json", "r"))
+judge_client = OpenAI(api_key=os.environ["GEMINI_HCI_API_KEY"],
+                      base_url="https://generativelanguage.googleapis.com/v1beta/openai/")
+
+class Judge_Result(BaseModel):
+    result: int

def set_seed(model_seed = 70):
    torch.manual_seed(model_seed)
-    # np.random.seed(model_seed)
    torch.cuda.manual_seed(model_seed) if torch.cuda.is_available() else None

def clean():
@@ -26,7 +33,23 @@ def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

+def llm_judge(answer, options, correct_answer):
+    completion = judge_client.beta.chat.completions.parse(
+        model="gemini-2.5-pro-preview-03-25",
+        messages=[
+            { "role": "system", "content": "You are a judge that evaluates the correctness of answers to questions. The answer might not be the letter of correct option. You need to judge correctness of the answer, comparing to the options and correct option. Return the correctness in 1: Correct or 0: Incorrect." },
+            { "role": "user", "content": f":Options: {options}\nAnswer:{answer},Correct Option: {correct_answer}" },
+        ],
+        response_format=Judge_Result
+    )
+    answer = completion.choices[0].message.content
+    print(f"Judge Answer: {answer}")
+    return json.loads(answer)["result"]
+
+
def evaluate(model_type, num_eval = 10):
+    sum_correct = np.zeros(len(questions))
+    RESULTS_ROOT = "./evaluate/results"
    for eval_idx in range(num_eval):
        clean()
        set_seed(np.random.randint(0, 1000))
@@ -53,13 +76,16 @@ def evaluate(model_type, num_eval = 10):
                            base_url="https://generativelanguage.googleapis.com/v1beta/openai/")

        for question_idx, question in enumerate(questions):
-            chart_type = question[0]
-            q = question[1]
-            img_path = question[2]
+            chart_type = question["type"]
+            q = question["question"]
+            img_path = question["img_path"]
+            options = question.get("options", None)
+            correct_answer = question.get("correct_answer", None)
+
            image = np.array(Image.open(img_path).convert("RGB"))


-
+            input_text = f"Options: {options}\nQuestion: {q}\n"
            if model_type.split('-')[0] == "GPT":
                base64_image = encode_image(img_path)
                completion = client.chat.completions.create(
@@ -68,7 +94,7 @@
                    {
                        "role": "user",
                        "content": [
-                            { "type": "text", "text": f"{q}" },
+                            { "type": "text", "text": f"{input_text}" },
                            {
                                "type": "image_url",
                                "image_url": {
@@ -89,7 +115,7 @@
                    {
                        "role": "user",
                        "content": [
-                            { "type": "text", "text": f"{q}" },
+                            { "type": "text", "text": f"{input_text}" },
                            {
                                "type": "image_url",
                                "image_url": {
@@ -103,7 +129,8 @@
                answer = completion.choices[0].message.content

            else:
-                prepare_inputs = model_utils.prepare_inputs(q, image)
+
+                prepare_inputs = model_utils.prepare_inputs(input_text, image)
                temperature = 0.1
                top_p = 0.95

@@ -116,19 +143,30 @@
                sequences = outputs.sequences.cpu().tolist()
                answer = tokenizer.decode(sequences[0], skip_special_tokens=True)

-            RESULTS_ROOT = "./evaluate/results"
+            # Judge the answer
+            result_judge = llm_judge(answer, options, correct_answer)
+            sum_correct[question_idx] += 1 if result_judge else 0
+            print(f"Model: {model_type}, Question: {question_idx + 1}, Answer: {answer}, Correct: {result_judge}")
+
+            # Save the results
            FILES_ROOT = f"{RESULTS_ROOT}/{model_type}/{eval_idx}"
            os.makedirs(FILES_ROOT, exist_ok=True)

            with open(f"{FILES_ROOT}/Q{question_idx + 1}-{chart_type}.txt", "w") as f:
                f.write(answer)
                f.close()
+    accuracy = sum_correct / num_eval
+    print(f"Model: {model_type}, Accuracy: {accuracy}")
+    with open(f"{RESULTS_ROOT}/{model_type}/accuracy.txt", "w") as f:
+        for question_idx, question in enumerate(questions):
+            chart_type = question["type"]
+            f.write(f"Chart Type: {chart_type}, Accuracy: {accuracy[question_idx]}\n")
+        f.close()


-if __name__ == '__main__':
-
-    # models = ["ChartGemma", "Janus-Pro-1B", "Janus-Pro-7B", "LLaVA-1.5-7B", "GPT-4o", "Gemini-2.0-flash"]
-    models = ["Janus-Pro-7B"]
+if __name__ == '__main__':

+    # models = ["Janus-Pro-1B", "ChartGemma", "GPT-4o", "Gemini-2.0-flash", "Janus-Pro-7B", "LLaVA-1.5-7B"]
+    models = ["LLaVA-1.5-7B", "Gemini-2.0-flash", "Janus-Pro-7B", ]
+
    for model_type in models:
        evaluate(model_type=model_type, num_eval=10)
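Note (editor's aside, not part of the commit): a self-contained sketch of the new judging path, isolated from the full evaluation loop. It needs GEMINI_HCI_API_KEY set and network access; the sample model answer is invented for illustration, while the model name, client setup, and structured-output parsing follow the diff above.

import os
import json
from openai import OpenAI
from pydantic import BaseModel

class Judge_Result(BaseModel):
    result: int  # 1 = correct, 0 = incorrect

# Same Gemini OpenAI-compatible endpoint as in evaluate.py.
judge_client = OpenAI(
    api_key=os.environ["GEMINI_HCI_API_KEY"],
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
)

def llm_judge(answer, options, correct_answer):
    # Ask the judge model for a structured {"result": 0|1} verdict on a free-form answer.
    completion = judge_client.beta.chat.completions.parse(
        model="gemini-2.5-pro-preview-03-25",
        messages=[
            {"role": "system", "content": "Judge whether the answer matches the correct option. Reply with result=1 for correct, 0 for incorrect."},
            {"role": "user", "content": f"Options: {options}\nAnswer: {answer}\nCorrect Option: {correct_answer}"},
        ],
        response_format=Judge_Result,
    )
    return json.loads(completion.choices[0].message.content)["result"]

if __name__ == "__main__":
    options = [{"A": "Bob"}, {"B": "Elora"}, {"C": "Kevin"}, {"D": "David"}]
    # Hypothetical free-form model answer; the judge should map it to option "D".
    print(llm_judge("The student with the highest midterm score is David.", options, "D"))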
evaluate/new_test.json ADDED
@@ -0,0 +1,236 @@
+[
+    {
+        "type": "Area Chart",
+        "img_path": "images/New_test/AreaChart.png",
+        "question": "At which month the price of tea is the highest?",
+        "options": [
+            {
+                "A": "1"
+            },
+            {
+                "B": "3"
+            },
+            {
+                "C": "4"
+            },
+            {
+                "D": "7"
+            }
+        ],
+        "correct_answer": "C"
+    },
+    {
+        "type": "Bar Chart",
+        "img_path": "images/New_test/BarChart.png",
+        "question": "Which student has the highest score in midterm?",
+        "options": [
+            {
+                "A": "Bob"
+            },
+            {
+                "B": "Elora"
+            },
+            {
+                "C": "Kevin"
+            },
+            {
+                "D": "David"
+            }
+        ],
+        "correct_answer": "D"
+    },
+    {
+        "type": "Bubble Chart",
+        "img_path": "images/New_test/BubbleChart.png",
+        "question": "What is the number of employees of the company that has lowest annual income?",
+        "options": [
+            {
+                "A": "5"
+            },
+            {
+                "B": "20"
+            },
+            {
+                "C": "85"
+            },
+            {
+                "D": "60"
+            }
+        ],
+        "correct_answer": "C"
+    },
+    {
+        "type": "Choropleth Map",
+        "img_path": "images/New_test/Choropleth.png",
+        "question": "Is the GDP of CA higher than NV?",
+        "options": [
+            {
+                "A": "True"
+            },
+            {
+                "B": "False"
+            }
+        ],
+        "correct_answer": "A"
+    },
+    {
+        "type": "Histogram",
+        "img_path": "images/New_test/Histogram.png",
+        "question": "Which range of distance of trip people prefers the most?",
+        "options": [
+            {
+                "A": "10-20km"
+            },
+            {
+                "B": "30-40km"
+            },
+            {
+                "C": "50-60km"
+            },
+            {
+                "D": "70-80km"
+            }
+        ],
+        "correct_answer": "C"
+    },
+    {
+        "type": "Line Chart",
+        "img_path": "images/New_test/LineChart.png",
+        "question": "What is the blood sugar level three hours after a meal",
+        "options": [
+            {
+                "A": "18"
+            },
+            {
+                "B": "70"
+            },
+            {
+                "C": "90"
+            },
+            {
+                "D": "50"
+            }
+        ],
+        "correct_answer": "3"
+    },
+    {
+        "type": "Pie Chart",
+        "img_path": "images/New_test/PieChart.png",
+        "question": "Which stock has the smallest holdings in this portfolio?",
+        "options": [
+            {
+                "A": "AAPL"
+            },
+            {
+                "B": "NVDA"
+            },
+            {
+                "C": "TSLA"
+            },
+            {
+                "D": "GOOG"
+            }
+        ],
+        "correct_answer": "B"
+    },
+    {
+        "type": "Scatter Chart",
+        "img_path": "images/New_test/Scatterplot.png",
+        "question": "What is the weight of the individual that has highest height?",
+        "options": [
+            {
+                "A": "96"
+            },
+            {
+                "B": "72"
+            },
+            {
+                "C": "55"
+            },
+            {
+                "D": "25"
+            }
+        ],
+        "correct_answer": "A"
+    },
+    {
+        "type": "Stacked Area Chart",
+        "img_path": "images/New_test/StackedArea.png",
+        "question": "What was the ratio of boys named 'Justin' to boys named 'Kevin' in the 3rd month in the USA?",
+        "options": [
+            {
+                "A": "1:1"
+            },
+            {
+                "B": "1:2"
+            },
+            {
+                "C": "2:1"
+            },
+            {
+                "D": "4:1"
+            }
+        ],
+        "correct_answer": "C"
+    },
+    {
+        "type": "Stacked Bar Chart",
+        "img_path": "images/New_test/StackedBar.png",
+        "question": "What is the price of headphone in Japan?",
+        "options": [
+            {
+                "A": "10"
+            },
+            {
+                "B": "30"
+            },
+            {
+                "C": "50"
+            },
+            {
+                "D": "70"
+            }
+        ],
+        "correct_answer": "B"
+    },
+    {
+        "type": "100% Stacked Bar Chart",
+        "img_path": "images/New_test/Stacked100.png",
+        "question": "Which country has the largest proportion of mouse price?",
+        "options": [
+            {
+                "A": "Canada"
+            },
+            {
+                "B": "China"
+            },
+            {
+                "C": "Japan"
+            },
+            {
+                "D": "Korea"
+            }
+        ],
+        "correct_answer": "A"
+    },
+    {
+        "type": "Treemap",
+        "img_path": "images/New_test/TreeMap.png",
+        "question": "Which country has the largest population?",
+        "options": [
+            {
+                "A": "China"
+            },
+            {
+                "B": " USA"
+            },
+            {
+                "C": " Canada"
+            },
+            {
+                "D": " UK"
+            }
+        ],
+        "correct_answer": "A"
+    }
+]
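Note (editor's aside, not part of the commit): a small validation sketch can catch schema slips in this file, such as the Line Chart entry whose "correct_answer" is "3" rather than one of the option letters A-D. The Question model below is a hypothetical helper, assuming pydantic v2.

import json
from typing import Dict, List
from pydantic import BaseModel, model_validator

class Question(BaseModel):
    type: str
    img_path: str
    question: str
    options: List[Dict[str, str]]
    correct_answer: str

    @model_validator(mode="after")
    def answer_is_an_option(self):
        # The correct answer should be one of the option letters.
        letters = {k for opt in self.options for k in opt}
        if self.correct_answer not in letters:
            raise ValueError(f"correct_answer {self.correct_answer!r} is not among {sorted(letters)}")
        return self

questions = json.load(open("evaluate/new_test.json", "r"))
for q in questions:
    try:
        Question(**q)
    except ValueError as err:
        print(f"{q['type']}: {err}")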
questions/New_test.py ADDED
@@ -0,0 +1,73 @@
+new_test_questions=[
+    [
+        "LineChart",
+        "What is the blood sugar level three hours after a meal",
+        "images/New_test/LineChart.png"
+    ],
+
+    [
+        "BarChart",
+        "Which student has the highest score in midterm?",
+        "images/New_test/BarChart.png"
+    ],
+
+    [
+        "StackedBar",
+        "What is the price of headphone in Japan?",
+        "images/New_test/StackedBar.png"
+    ],
+
+    [
+        "100%StackedBar",
+        "Which country has the largest proportion of mouse price?",
+        "images/New_test/Stacked100.png"
+    ],
+
+    [
+        "PieChart",
+        "Which stock has the smallest holdings in this portfolio?",
+        "images/New_test/PieChart.png"
+    ],
+
+    [
+        "Histogram",
+        "Which range of distance of trip people prefers the most?",
+        "images/New_test/Histogram.png"
+    ],
+
+    [
+        "Scatterplot",
+        "What is the weight of the individual that has highest height?",
+        "images/New_test/Scatterplot.png"
+    ],
+
+    [
+        "AreaChart",
+        "At which month the price of tea is the highest?",
+        "images/New_test/AreaChart.png"
+    ],
+
+    [
+        "StackedArea",
+        "What was the ratio of boys named 'Justin' to boys named 'Kevin' in the 3rd month in the USA?",
+        "images/New_test/StackedArea.png"
+    ],
+
+    [
+        "BubbleChart",
+        "What is the number of employees of the company that has lowest annual income?",
+        "images/New_test/BubbleChart.png"
+    ],
+
+    [
+        "Choropleth",
+        "Is the GDP of CA higher than NV?",
+        "images/New_test/Choropleth.png"
+    ],
+
+    [
+        "TreeMap",
+        "Which country has the largest population?",
+        "images/New_test/TreeMap.png"
+    ]
+]
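Note (editor's aside, not part of the commit): these entries reuse the [chart type, question, image path] triple layout of the other question banks that app.py feeds to gr.Dataset, while the richer records (options, correct answer) live only in evaluate/new_test.json. A hedged sketch of cross-checking the two sources by image path, which is an assumption for illustration rather than something the commit does:

import json

from questions.New_test import new_test_questions  # triples: [chart type, question, image path]

json_questions = json.load(open("evaluate/new_test.json", "r"))
by_img = {q["img_path"]: q for q in json_questions}

for chart, question, img_path in new_test_questions:
    record = by_img.get(img_path)
    if record is None:
        print(f"{chart}: no record in new_test.json for {img_path}")
    elif record["question"].strip() != question.strip():
        print(f"{chart}: question text differs between New_test.py and new_test.json")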
saliency_map/__init__.py DELETED
@@ -1,12 +0,0 @@
-
-import sys
-
-if sys.version_info >= (3, 10):
-    print("Python version is above 3.10, patching the collections module.")
-    # Monkey patch collections
-    import collections
-    import collections.abc
-
-    for type_name in collections.abc.__all__:
-        setattr(collections, type_name, getattr(collections.abc, type_name))
-
saliency_map/cam.py DELETED
@@ -1,75 +0,0 @@
-import torch
-import cv2
-from PIL import Image
-import numpy as np
-
-class MultimodalGradCAM:
-    def __init__(self, model, processor):
-        self.model = model
-        self.processor = processor
-        self.activations = {}
-        self.gradients = {}
-
-        # Register hooks
-        self._register_hooks()
-
-    def _register_hooks(self):
-        # Hook the last vision transformer layer
-        def forward_hook(module, input, output):
-            self.activations['vision'] = output.last_hidden_state
-
-        def backward_hook(module, grad_input, grad_output):
-            self.gradients['vision'] = grad_output[0]
-
-        vision_encoder = self.model.get_vision_encoder()
-        vision_encoder.layers[-1].register_forward_hook(forward_hook)
-        vision_encoder.layers[-1].register_backward_hook(backward_hook)
-
-    def generate_saliency(self, image, question):
-        # Preprocess inputs
-        inputs = self.processor(
-            text=question,
-            images=image,
-            return_tensors="pt",
-            padding=True
-        )
-
-        # Forward pass
-        outputs = self.model(**inputs)
-        answer_ids = outputs.logits.argmax(dim=-1)
-
-        # Get target token (use last token for answer)
-        target_token_id = answer_ids[0, -1].item()
-        target = outputs.logits[0, -1, target_token_id]
-
-        # Backward pass
-        self.model.zero_grad()
-        target.backward()
-
-        # Process activations and gradients
-        activations = self.activations['vision'].detach()
-        gradients = self.gradients['vision'].detach()
-
-        # Grad-CAM calculation
-        weights = gradients.mean(dim=[1, 2], keepdim=True)  # Global average pooling
-        cam = (weights * activations).sum(dim=-1, keepdims=True)
-        cam = torch.relu(cam)
-
-        # Reshape and normalize
-        cam = cam.squeeze().cpu().numpy()
-        cam = (cam - cam.min()) / (cam.max() - cam.min() + 1e-8)
-
-        return cam
-
-    def visualize(self, image, cam):
-        # Resize CAM to original image size
-        img_size = image.size[::-1]  # (width, height) -> (height, width)
-        cam = cv2.resize(cam, img_size)
-
-        # Convert to heatmap
-        heatmap = cv2.applyColorMap(np.uint8(255 * cam), cv2.COLORMAP_JET)
-        heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)
-
-        # Superimpose on original image
-        superimposed = np.array(image) * 0.4 + heatmap * 0.6
-        return Image.fromarray(np.uint8(superimposed))