Syzygianinfern0 committed on
Commit
8a7a9dd
·
1 Parent(s): 31d38b7

Update everything from iccv submission

Browse files
neus_v/automaton/video_automaton.py CHANGED
@@ -18,6 +18,7 @@ class VideoAutomaton:
18
  self.previous_states: list[VideoState] = []
19
  self.states: list[VideoState] = []
20
  self.transitions = []
 
21
  self.include_initial_state = include_initial_state
22
 
23
  def set_up(self, proposition_set: list[str]) -> None:
@@ -68,6 +69,7 @@ class VideoAutomaton:
68
  # Build transitions from previous states to current states
69
  if self.previous_states:
70
  for prev_state in self.previous_states:
 
71
  for cur_state in current_states:
72
  transition = (
73
  prev_state.state_index,
@@ -75,6 +77,7 @@ class VideoAutomaton:
75
  cur_state.probability,
76
  )
77
  self.transitions.append(transition)
 
78
 
79
  self.previous_states = current_states if current_states else self.previous_states
80
  self.frame_index_in_automaton += 1
@@ -101,14 +104,26 @@ class VideoAutomaton:
101
  (prev_state.state_index, prev_state.state_index, 1.0) for prev_state in self.previous_states
102
  )
103
 
 
 
 
 
 
 
 
 
 
104
  def _get_probability_of_propositions(self, frame: VideoFrame) -> None:
105
  """Update the probability of propositions."""
106
  for i, prop in enumerate(self.proposition_set):
107
- prop = prop.replace("_", " ")
108
  if frame.object_of_interest.get(prop):
109
  probability = frame.object_of_interest[prop].get_probability()
110
  else:
111
- probability = 0.0
 
 
 
 
112
  self.probability_of_propositions[i].append(round(probability, 2))
113
 
114
  def _create_label_combinations(self, num_props: int) -> list[str]:
 
18
  self.previous_states: list[VideoState] = []
19
  self.states: list[VideoState] = []
20
  self.transitions = []
21
+ self.transition_map = {}
22
  self.include_initial_state = include_initial_state
23
 
24
  def set_up(self, proposition_set: list[str]) -> None:
 
69
  # Build transitions from previous states to current states
70
  if self.previous_states:
71
  for prev_state in self.previous_states:
72
+ self.transition_map[prev_state.state_index] = []
73
  for cur_state in current_states:
74
  transition = (
75
  prev_state.state_index,
 
77
  cur_state.probability,
78
  )
79
  self.transitions.append(transition)
80
+ self.transition_map[prev_state.state_index].append(cur_state.state_index)
81
 
82
  self.previous_states = current_states if current_states else self.previous_states
83
  self.frame_index_in_automaton += 1
 
104
  (prev_state.state_index, prev_state.state_index, 1.0) for prev_state in self.previous_states
105
  )
106
 
107
def get_frame_to_state_index(self) -> dict[int, list[int]]:
    """Group the automaton's state indices by frame index.

    Returns:
        A dict keyed by ``state.frame_index``. Each value lists the
        ``state_index`` of every entry of ``self.states`` carrying that
        frame index, in the order the states are stored.
    """
    frame_to_states: dict[int, list[int]] = {}
    for video_state in self.states:
        # setdefault creates the bucket the first time a frame index shows up.
        frame_to_states.setdefault(video_state.frame_index, []).append(video_state.state_index)
    return frame_to_states
115
+
116
def _get_probability_of_propositions(self, frame: VideoFrame) -> None:
    """Record this frame's probability for every proposition.

    Each proposition in ``self.proposition_set`` is looked up in
    ``frame.object_of_interest`` under its own name first and, if that
    yields nothing truthy, under the name with underscores replaced by
    spaces. When neither lookup succeeds the probability is 0.0. The value,
    rounded to two decimals, is appended to the list at the same position
    in ``self.probability_of_propositions``.
    """
    detections = frame.object_of_interest
    for position, name in enumerate(self.proposition_set):
        # Exact key wins; the space-separated spelling is only a fallback.
        detected = detections.get(name) or detections.get(name.replace("_", " "))
        probability = detected.get_probability() if detected else 0.0
        self.probability_of_propositions[position].append(round(probability, 2))
128
 
129
  def _create_label_combinations(self, num_props: int) -> list[str]:
neus_v/calibration/__init__.py ADDED
File without changes
neus_v/calibration/vlm.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib.pyplot as plt
2
+ import numpy as np
3
+
4
+
5
def smooth_mapping(
    confidence: float,
    true_threshold=0.95,
    false_threshold=0.40,
    target_conf=0.60,
    target_prob=0.78,
    p_min=0.01,
    p_max=0.99,
    steepness_factor=0.7,  # 0-1 range, lower = less steep
) -> float:
    """Map a confidence score to a probability along a rescaled sigmoid.

    Scores at or below ``false_threshold`` map to ``p_min`` and scores at or
    above ``true_threshold`` map to ``p_max``. In between, a sigmoid is fitted
    through the normalised target point and then rescaled so that it meets
    ``p_min`` / ``p_max`` exactly at the two thresholds. Because of that final
    rescaling, ``target_conf`` lands close to, not exactly on, ``target_prob``.

    Args:
        confidence: Input confidence score.
        true_threshold: Upper threshold (default 0.95).
        false_threshold: Lower threshold (default 0.40).
        target_conf: Confidence of the calibration target point (default 0.60).
        target_prob: Probability of the calibration target point (default 0.78).
        p_min: Probability returned at/below ``false_threshold`` (default 0.01).
        p_max: Probability returned at/above ``true_threshold`` (default 0.99).
        steepness_factor: Scales the sigmoid slope (0-1, lower = less steep).

    Returns:
        The mapped probability. Between the thresholds it is a builtin
        ``float``; at the thresholds ``p_min`` / ``p_max`` are returned as given.

    Raises:
        ValueError: If ``target_prob`` does not lie strictly between ``p_min``
            and ``p_max`` (no sigmoid can pass through such a point).
    """
    if confidence <= false_threshold:
        return p_min

    if confidence >= true_threshold:
        return p_max

    # Here false_threshold < confidence < true_threshold, so the span is positive.
    threshold_span = true_threshold - false_threshold
    x_scaled = (confidence - false_threshold) / threshold_span

    # Straight line between the two end points. This is what the rescaled
    # sigmoid tends to as its slope goes to zero, and it is used whenever the
    # sigmoid parameters cannot be computed.
    linear_fallback = float(p_min + x_scaled * (p_max - p_min))

    # Normalise the target point into the unit square.
    x_norm = (target_conf - false_threshold) / threshold_span
    y_norm = (target_prob - p_min) / (p_max - p_min)
    if not 0 < y_norm < 1:
        raise ValueError("target_prob must lie strictly between p_min and p_max")

    # For a sigmoid f(x) = 1 / (1 + e^(-k (x - x0))): pick k so that the curve
    # with midpoint x0 passes through (x_norm, y_norm).
    x0 = 0.30  # initial midpoint guess in normalised coordinates
    offset_from_midpoint = x_norm - x0
    target_logit = -np.log(1 / y_norm - 1)

    # Degenerate calibrations would divide by zero below (and used to yield
    # NaN): target on the midpoint, target probability of exactly one half,
    # or a zero steepness factor.
    if offset_from_midpoint == 0 or target_logit == 0 or steepness_factor == 0:
        return linear_fallback

    base_k = target_logit / offset_from_midpoint
    k = base_k * steepness_factor

    # With the slope scaled, move the midpoint so the curve still passes
    # through the target: y = 1/(1+e^(-k(x-x0)))  =>  x0 = x + ln(1/y-1)/k
    adjusted_x0 = x_norm - target_logit / k

    # Very steep curves saturate exp(); the resulting inf is handled fine by
    # the division, so silence only the overflow warning.
    with np.errstate(over="ignore"):
        sigmoid_value = 1 / (1 + np.exp(-k * (x_scaled - adjusted_x0)))
        # Sigmoid values at the two thresholds, used to stretch the curve so it
        # reaches p_min and p_max exactly there.
        min_val = 1 / (1 + np.exp(-k * (0 - adjusted_x0)))
        max_val = 1 / (1 + np.exp(-k * (1 - adjusted_x0)))

    output_span = max_val - min_val
    if output_span == 0 or not np.isfinite(output_span):
        # The sigmoid is numerically flat over the whole range.
        return linear_fallback

    normalized = (sigmoid_value - min_val) / output_span

    return float(p_min + normalized * (p_max - p_min))
66
+
67
+
68
def main():
    """Plot the default mapping curve to ``calibration.png`` and print samples."""
    confidences = np.linspace(0, 1, 1000)
    probabilities = [smooth_mapping(score) for score in confidences]

    plt.figure(figsize=(10, 6))
    plt.plot(confidences, probabilities, "r-", label="Mapping Function", linewidth=2)

    # Dashed vertical markers at the two thresholds.
    threshold_markers = (
        (0.40, "b", "False Threshold (0.40)"),
        (0.95, "g", "True Threshold (0.95)"),
    )
    for position, colour, caption in threshold_markers:
        plt.axvline(x=position, color=colour, linestyle="--", label=caption)

    # Dotted horizontal markers at the probability limits.
    for level, colour in ((0.01, "r"), (0.99, "g")):
        plt.axhline(y=level, color=colour, linestyle=":", alpha=0.5)

    plt.grid(True, alpha=0.3)
    plt.xlabel("Confidence Score")
    plt.ylabel("Mapped Probability")
    plt.title("Confidence to Probability Mapping (Matching Red Line)")
    plt.legend()
    plt.ylim(-0.05, 1.05)
    plt.savefig("calibration.png")

    # A few spot checks printed to stdout.
    for val in (0.4, 0.425, 0.6, 0.85, 0.9):
        mapped = smooth_mapping(val)
        print(f"Confidence {val:.3f} → Probability {mapped:.3f}")


if __name__ == "__main__":
    main()
neus_v/model_checking/stormpy.py CHANGED
@@ -174,7 +174,8 @@ class StormModelChecker:
174
 
175
  # Check probabilities
176
  for state, prob_sum in outgoing_probs.items():
177
- if not math.isclose(prob_sum, 1.0, rel_tol=1e-9):
 
178
  logging.warning(f"State {state} has outgoing probability sum of {prob_sum}, not 1.0")
179
 
180
  # ... (existing logging code) ...
 
174
 
175
  # Check probabilities
176
  for state, prob_sum in outgoing_probs.items():
177
+ # if not math.isclose(prob_sum, 1.0, rel_tol=1e-9):
178
+ if not math.isclose(prob_sum, 1.0, abs_tol=1e-2):
179
  logging.warning(f"State {state} has outgoing probability sum of {prob_sum}, not 1.0")
180
 
181
  # ... (existing logging code) ...
neus_v/smooth_scoring.py CHANGED
@@ -14,7 +14,8 @@ class DataTransformer:
14
 
15
 
16
  def smooth_confidence_scores(target_data, prior_distribution=None):
17
- if prior_distribution is None:
18
- prior_distribution = target_data
19
- transformer = DataTransformer(prior_distribution)
20
- return transformer.mapping_function(target_data)
 
 
14
 
15
 
16
def smooth_confidence_scores(target_data, prior_distribution=None):
    """Return the first element of ``target_data`` as-is.

    The distribution-based smoothing that previously ran here (a
    ``DataTransformer`` built from ``prior_distribution``, falling back to
    ``target_data``, whose ``mapping_function`` was applied to the data) is
    disabled. ``prior_distribution`` is therefore accepted but ignored.
    """
    first_score = target_data[0]
    return first_score
neus_v/utils.py CHANGED
@@ -1,5 +1,8 @@
1
  import gc
 
 
2
 
 
3
  import torch
4
 
5
 
@@ -8,3 +11,18 @@ def clear_gpu_memory():
8
  if torch.cuda.is_available():
9
  torch.cuda.ipc_collect()
10
  gc.collect()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gc
2
+ import os
3
+ import random
4
 
5
+ import numpy as np
6
  import torch
7
 
8
 
 
11
  if torch.cuda.is_available():
12
  torch.cuda.ipc_collect()
13
  gc.collect()
14
+
15
+
16
def set_seed(seed: int = 42) -> None:
    """Seed Python, NumPy and PyTorch RNGs and pin the cuDNN settings.

    Also exports ``PYTHONHASHSEED`` into ``os.environ`` and prints the seed
    that was applied.

    Args:
        seed: Value handed to every seeding call.
    """
    # Same seed for each generator the project relies on.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    # The cuDNN backend needs both switches set.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # Fixed value for the hash seed.
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")
neus_v/veval/eval.py CHANGED
@@ -6,7 +6,7 @@ from joblib import Parallel, delayed
6
 
7
  from neus_v.automaton.video_automaton import VideoAutomaton
8
  from neus_v.model_checking.stormpy import StormModelChecker
9
- from neus_v.veval.parse import parse_tl_formula
10
  from neus_v.video.frame import VideoFrame
11
  from neus_v.video.read_video import read_video
12
 
@@ -35,6 +35,7 @@ def evaluate_video(
35
  tl_spec: str,
36
  parallel_inference: bool = False,
37
  threshold: float = 0.1,
 
38
  ) -> dict:
39
  """Evaluate a video using the given vision language model."""
40
  output_log = {
@@ -52,6 +53,7 @@ def evaluate_video(
52
 
53
  # TODO: if there's F in the tl_spec
54
  ltl_formula = parse_tl_formula(tl_spec)
 
55
 
56
  video_automaton = VideoAutomaton(include_initial_state=True)
57
 
@@ -97,10 +99,12 @@ def evaluate_video(
97
  return video_frame, object_of_interest
98
 
99
  if parallel_inference:
 
100
  results = Parallel(n_jobs=len(all_frames))(
101
  delayed(process_frame)(frame_img, i) for i, frame_img in enumerate(all_frames)
102
  )
103
  else:
 
104
  results = [process_frame(frame_img, i) for i, frame_img in enumerate(all_frames)]
105
 
106
  for video_frame, object_of_interest in results:
@@ -163,6 +167,7 @@ def evaluate_video_with_sequence_of_images(
163
 
164
  # TODO: if there's F in the tl_spec
165
  ltl_formula = parse_tl_formula(tl_spec)
 
166
 
167
  video_automaton = VideoAutomaton(include_initial_state=True)
168
 
@@ -191,7 +196,7 @@ def evaluate_video_with_sequence_of_images(
191
  detected_object = vision_language_model.detect(
192
  seq_of_frames=sequence_of_frames,
193
  scene_description=proposition,
194
- confidence_as_token_probability=confidence_as_token_probability,
195
  threshold=threshold,
196
  )
197
  object_of_interest[proposition] = detected_object
 
6
 
7
  from neus_v.automaton.video_automaton import VideoAutomaton
8
  from neus_v.model_checking.stormpy import StormModelChecker
9
+ from neus_v.veval.parse import parse_tl_formula, parse_until_to_next_frame
10
  from neus_v.video.frame import VideoFrame
11
  from neus_v.video.read_video import read_video
12
 
 
35
  tl_spec: str,
36
  parallel_inference: bool = False,
37
  threshold: float = 0.1,
38
+ num_of_frame_in_sequence: int = 1,
39
  ) -> dict:
40
  """Evaluate a video using the given vision language model."""
41
  output_log = {
 
53
 
54
  # TODO: if there's F in the tl_spec
55
  ltl_formula = parse_tl_formula(tl_spec)
56
+ ltl_formula = parse_until_to_next_frame(ltl_formula)
57
 
58
  video_automaton = VideoAutomaton(include_initial_state=True)
59
 
 
99
  return video_frame, object_of_interest
100
 
101
  if parallel_inference:
102
+ frame_windows = create_frame_windows(frames=all_frames, window_size=num_of_frame_in_sequence)
103
  results = Parallel(n_jobs=len(all_frames))(
104
  delayed(process_frame)(frame_img, i) for i, frame_img in enumerate(all_frames)
105
  )
106
  else:
107
+ frame_windows = create_frame_windows(frames=all_frames, window_size=num_of_frame_in_sequence)
108
  results = [process_frame(frame_img, i) for i, frame_img in enumerate(all_frames)]
109
 
110
  for video_frame, object_of_interest in results:
 
167
 
168
  # TODO: if there's F in the tl_spec
169
  ltl_formula = parse_tl_formula(tl_spec)
170
+ ltl_formula = parse_until_to_next_frame(ltl_formula)
171
 
172
  video_automaton = VideoAutomaton(include_initial_state=True)
173
 
 
196
  detected_object = vision_language_model.detect(
197
  seq_of_frames=sequence_of_frames,
198
  scene_description=proposition,
199
+ # confidence_as_token_probability=confidence_as_token_probability,
200
  threshold=threshold,
201
  )
202
  object_of_interest[proposition] = detected_object
neus_v/veval/parse.py CHANGED
@@ -27,3 +27,8 @@ def parse_proposition_set(proposition_set: list[str]) -> list[str]:
27
  def parse_tl_specification(tl_spec: str) -> str:
28
  """Parse the tl specification."""
29
  return tl_spec.replace("-", "_")
 
 
 
 
 
 
27
def parse_tl_specification(tl_spec: str) -> str:
    """Return ``tl_spec`` with every hyphen replaced by an underscore."""
    hyphen_to_underscore = str.maketrans("-", "_")
    return tl_spec.translate(hyphen_to_underscore)
30
+
31
+
32
def parse_until_to_next_frame(tl_spec: str) -> str:
    """Rewrite every standalone ``U`` operator in ``tl_spec`` as ``& F``.

    Only a ``U`` that stands on its own is treated as the operator, i.e. one
    not directly attached to a letter, digit or underscore. A capital ``U``
    inside a proposition name (``"Umbrella"``, ``"USB_cable"``) is left
    untouched; a blanket string replace would corrupt such names.

    Args:
        tl_spec: Temporal-logic formula text.

    Returns:
        The formula with each standalone ``U`` replaced by ``& F``.
    """
    import re  # local import: only this function needs it

    # \b keeps the match to a U token bounded by non-word characters
    # (spaces, quotes, parentheses) or the ends of the string.
    return re.sub(r"\bU\b", "& F", tl_spec)
neus_v/vlm/vllm_client.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+
3
+ import cv2
4
+ import numpy as np
5
+ from openai import OpenAI
6
+
7
+ from neus_v.calibration.vlm import smooth_mapping
8
+ from neus_v.vlm.obj import DetectedObject
9
+
10
+
11
class VLLMClient:
    """Yes/No scene detection through an OpenAI-compatible chat endpoint.

    Each call to :meth:`detect` sends a sequence of frames plus a question to
    the configured model, reads the top log-probabilities of the single
    generated token, and turns the relative weight of "Yes" versus "No" into
    a confidence carried by a ``DetectedObject``.
    """

    def __init__(
        self,
        api_key="EMPTY",
        api_base="http://localhost:8000/v1",
        model="OpenGVLab/InternVL2_5-8B",
    ):
        """Create the API client and store the model name and system prompt.

        Args:
            api_key: Key passed to the ``OpenAI`` client.
            api_base: Base URL passed to the ``OpenAI`` client.
            model: Model identifier sent with every request.
        """
        self.client = OpenAI(api_key=api_key, base_url=api_base)
        self.model = model
        self.system_message = (
            "You must only return a Yes or No, and not both, to any question asked.\n"
            "You must not include any other symbols, information, text, justification in your answer or repeat Yes or No multiple times.\n"
            "For example, if the question is 'Is there a cat present in the Image?', the answer must only be 'Yes' or 'No'."
        )

    def _encode_frame(self, frame):
        """Return ``frame`` JPEG-encoded and base64-encoded as a UTF-8 string.

        Raises:
            ValueError: If ``cv2.imencode`` reports failure.
        """
        ret, buffer = cv2.imencode(".jpg", frame)
        if not ret:
            raise ValueError("Could not encode frame")
        return base64.b64encode(buffer).decode("utf-8")

    @staticmethod
    def _yes_no_probabilities(top_logprobs):
        """Sum the probability mass of the "Yes" and "No" token variants.

        Token text is compared after stripping whitespace, so variants such as
        ``"Yes"`` and ``" Yes"`` count towards the same answer. Their
        probabilities are added together; letting a later variant overwrite an
        earlier one would discard part of the mass.

        Args:
            top_logprobs: Iterable of objects exposing ``token`` and
                ``logprob`` attributes.

        Returns:
            Tuple ``(yes_prob, no_prob)``; an answer with no matching token
            contributes 0.0.
        """
        token_prob_map = {}
        for top_logprob in top_logprobs:
            token_text = top_logprob.token.strip()
            token_prob_map[token_text] = token_prob_map.get(token_text, 0.0) + np.exp(top_logprob.logprob)
        return token_prob_map.get("Yes", 0.0), token_prob_map.get("No", 0.0)

    def detect(self, seq_of_frames, scene_description, threshold=None, **kwargs):
        """Ask the model whether the frames depict ``scene_description``.

        Args:
            seq_of_frames: Frames to send; each is passed to ``_encode_frame``.
            scene_description: Text inserted into the question.
            threshold: When truthy, the confidence is passed through
                ``smooth_mapping`` with this value as ``false_threshold``.
            **kwargs: Accepted and ignored.

        Returns:
            A ``DetectedObject`` whose ``confidence`` and ``probability`` are
            both the (optionally mapped) confidence rounded to 3 decimals.

        Raises:
            ValueError: If neither "Yes" nor "No" appears among the returned
                top log-probabilities, or a frame cannot be encoded.
        """
        # Encode each frame.
        encoded_images = [self._encode_frame(frame) for frame in seq_of_frames]

        # Build the user message: a text prompt plus one image for each frame.
        user_content = [
            {
                "type": "text",
                "text": f"Does the sequence of these images depict '{scene_description}'",
            }
        ]
        user_content.extend(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
            }
            for encoded in encoded_images
        )

        # One generated token is enough: only its log-probabilities are used.
        chat_response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": self.system_message},
                {"role": "user", "content": user_content},
            ],
            max_tokens=1,
            temperature=0.0,
            logprobs=True,
            top_logprobs=20,
        )

        # Top candidates for the first (and only) generated token.
        top_logprobs_list = chat_response.choices[0].logprobs.content[0].top_logprobs
        yes_prob, no_prob = self._yes_no_probabilities(top_logprobs_list)

        # Normalized probability for "Yes": p_yes / (p_yes + p_no)
        if yes_prob + no_prob > 0:
            confidence = yes_prob / (yes_prob + no_prob)
        else:
            raise ValueError("No probabilities for 'Yes' or 'No' found in the response.")

        if threshold:
            confidence = smooth_mapping(confidence=confidence, false_threshold=threshold)
            # NOTE(review): this compares the *mapped* confidence with the
            # threshold; setting no_prob to 1.0 makes the is_detected check
            # below come out False. Confirm this is the intended rule.
            if confidence < threshold:
                no_prob = 1.0

        return DetectedObject(
            name=scene_description,
            model_name=self.model,
            confidence=round(confidence, 3),
            probability=round(confidence, 3),
            number_of_detection=1,
            is_detected=yes_prob > no_prob,  # TODO: Check if this is correct
        )