Commit 8a7a9dd
Parent(s): 31d38b7

Update everything from iccv submission
Files changed:
- neus_v/automaton/video_automaton.py +17 -2
- neus_v/calibration/__init__.py +0 -0
- neus_v/calibration/vlm.py +99 -0
- neus_v/model_checking/stormpy.py +2 -1
- neus_v/smooth_scoring.py +5 -4
- neus_v/utils.py +18 -0
- neus_v/veval/eval.py +7 -2
- neus_v/veval/parse.py +5 -0
- neus_v/vlm/vllm_client.py +98 -0
neus_v/automaton/video_automaton.py
CHANGED

@@ -18,6 +18,7 @@ class VideoAutomaton:
         self.previous_states: list[VideoState] = []
         self.states: list[VideoState] = []
         self.transitions = []
+        self.transition_map = {}
         self.include_initial_state = include_initial_state

     def set_up(self, proposition_set: list[str]) -> None:
@@ -68,6 +69,7 @@
         # Build transitions from previous states to current states
         if self.previous_states:
             for prev_state in self.previous_states:
+                self.transition_map[prev_state.state_index] = []
                 for cur_state in current_states:
                     transition = (
                         prev_state.state_index,
@@ -75,6 +77,7 @@
                         cur_state.probability,
                     )
                     self.transitions.append(transition)
+                    self.transition_map[prev_state.state_index].append(cur_state.state_index)

         self.previous_states = current_states if current_states else self.previous_states
         self.frame_index_in_automaton += 1
@@ -101,14 +104,26 @@
             (prev_state.state_index, prev_state.state_index, 1.0) for prev_state in self.previous_states
         )

+    def get_frame_to_state_index(self) -> dict[int, list[int]]:
+        """Get frame to state index mapping."""
+        data = {}
+        for state in self.states:
+            if state.frame_index not in data:
+                data[state.frame_index] = []
+            data[state.frame_index].append(state.state_index)
+        return data
+
     def _get_probability_of_propositions(self, frame: VideoFrame) -> None:
         """Update the probability of propositions."""
         for i, prop in enumerate(self.proposition_set):
-            prop = prop.replace("_", " ")
             if frame.object_of_interest.get(prop):
                 probability = frame.object_of_interest[prop].get_probability()
             else:
-                probability = 0.0
+                prop = prop.replace("_", " ")
+                if frame.object_of_interest.get(prop):
+                    probability = frame.object_of_interest[prop].get_probability()
+                else:
+                    probability = 0.0
             self.probability_of_propositions[i].append(round(probability, 2))

     def _create_label_combinations(self, num_props: int) -> list[str]:
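The new transition_map records, for each source state, the indices of its successor states, and get_frame_to_state_index inverts the state list into a frame-to-states lookup. A minimal sketch of that bucketing, using plain (frame_index, state_index) tuples in place of VideoState objects:

states = [(0, 0), (0, 1), (1, 2), (1, 3)]  # two frames, two states per frame
data = {}
for frame_index, state_index in states:
    data.setdefault(frame_index, []).append(state_index)
print(data)  # {0: [0, 1], 1: [2, 3]}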
neus_v/calibration/__init__.py
ADDED

File without changes (empty file)
neus_v/calibration/vlm.py
ADDED

@@ -0,0 +1,99 @@
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+def smooth_mapping(
+    confidence: float,
+    true_threshold=0.95,
+    false_threshold=0.40,
+    target_conf=0.60,
+    target_prob=0.78,
+    p_min=0.01,
+    p_max=0.99,
+    steepness_factor=0.7,  # New parameter: 0-1 range, lower = less steep
+) -> float:
+    """Map confidence to probability using a sigmoid function with adjustable steepness.
+
+    Args:
+        confidence: Input confidence score
+        true_threshold: Upper threshold (0.95)
+        false_threshold: Lower threshold (0.40)
+        target_conf: Target confidence point (0.60)
+        target_prob: Target probability value (0.78)
+        p_min: Minimum probability (0.01)
+        p_max: Maximum probability (0.99)
+        steepness_factor: Controls curve steepness (0-1, lower = less steep)
+    """
+    if confidence <= false_threshold:
+        return p_min
+
+    if confidence >= true_threshold:
+        return p_max
+
+    # Calculate parameters to ensure target_conf maps to target_prob
+    # For a sigmoid function: f(x) = L / (1 + e^(-k(x-x0)))
+
+    # First, normalize the target point
+    x_norm = (target_conf - false_threshold) / (true_threshold - false_threshold)
+    y_norm = (target_prob - p_min) / (p_max - p_min)
+
+    # Find x0 (midpoint) and k (steepness) to satisfy our target point
+    x0 = 0.30  # Midpoint of normalized range
+
+    # Calculate base k value to hit the target point
+    base_k = -np.log(1 / y_norm - 1) / (x_norm - x0)
+
+    # Apply steepness factor (lower = less steep)
+    k = base_k * steepness_factor
+
+    # With reduced steepness, we need to adjust x0 to still hit the target point
+    # Solve for new x0: y = 1/(1+e^(-k(x-x0))) => x0 = x + ln(1/y-1)/k
+    adjusted_x0 = x_norm + np.log(1 / y_norm - 1) / k
+
+    # Apply the sigmoid with our calculated parameters
+    x_scaled = (confidence - false_threshold) / (true_threshold - false_threshold)
+    sigmoid_value = 1 / (1 + np.exp(-k * (x_scaled - adjusted_x0)))
+
+    # Ensure we still hit exactly p_min and p_max at the thresholds
+    # by rescaling the output slightly
+    min_val = 1 / (1 + np.exp(-k * (0 - adjusted_x0)))
+    max_val = 1 / (1 + np.exp(-k * (1 - adjusted_x0)))
+
+    # Normalize the output
+    normalized = (sigmoid_value - min_val) / (max_val - min_val)
+
+    return p_min + normalized * (p_max - p_min)
+
+
+def main():
+    # Visualize the function
+    x = np.linspace(0, 1, 1000)
+    y = [smooth_mapping(xi) for xi in x]
+
+    plt.figure(figsize=(10, 6))
+    plt.plot(x, y, "r-", label="Mapping Function", linewidth=2)
+
+    # Add vertical lines for thresholds
+    plt.axvline(x=0.40, color="b", linestyle="--", label="False Threshold (0.40)")
+    plt.axvline(x=0.95, color="g", linestyle="--", label="True Threshold (0.95)")
+
+    # Add horizontal lines for probability limits
+    plt.axhline(y=0.01, color="r", linestyle=":", alpha=0.5)
+    plt.axhline(y=0.99, color="g", linestyle=":", alpha=0.5)
+
+    plt.grid(True, alpha=0.3)
+    plt.xlabel("Confidence Score")
+    plt.ylabel("Mapped Probability")
+    plt.title("Confidence to Probability Mapping (Matching Red Line)")
+    plt.legend()
+    plt.ylim(-0.05, 1.05)
+    plt.savefig("calibration.png")
+
+    # Print some example values
+    test_values = [0.4, 0.425, 0.6, 0.85, 0.9]
+    for val in test_values:
+        print(f"Confidence {val:.3f} → Probability {smooth_mapping(val):.3f}")
+
+
+if __name__ == "__main__":
+    main()
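With the defaults, the mapping clamps below false_threshold and above true_threshold and anchors the interior curve near (0.60, 0.78). A quick sanity check (not part of the commit; note that the final min/max renormalization nudges interior values slightly off the nominal anchor):

from neus_v.calibration.vlm import smooth_mapping

print(smooth_mapping(0.30))  # 0.01: at or below false_threshold -> p_min
print(smooth_mapping(0.99))  # 0.99: at or above true_threshold -> p_max
print(smooth_mapping(0.60))  # ~0.776: anchor point, just under the nominal 0.78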
neus_v/model_checking/stormpy.py
CHANGED

@@ -174,7 +174,8 @@ class StormModelChecker:

         # Check probabilities
         for state, prob_sum in outgoing_probs.items():
-            if not math.isclose(prob_sum, 1.0, rel_tol=1e-9):
+            # if not math.isclose(prob_sum, 1.0, rel_tol=1e-9):
+            if not math.isclose(prob_sum, 1.0, abs_tol=1e-2):
                 logging.warning(f"State {state} has outgoing probability sum of {prob_sum}, not 1.0")

         # ... (existing logging code) ...
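The relaxed tolerance reflects that VideoAutomaton rounds each transition probability to two decimals, so a state's outgoing sum can legitimately drift a little off 1.0. A minimal illustration of why the old strict check warned spuriously:

import math

prob_sum = 0.995  # a row sum after per-edge rounding to 2 decimals
math.isclose(prob_sum, 1.0, rel_tol=1e-9)  # False: strict check flags harmless drift
math.isclose(prob_sum, 1.0, abs_tol=1e-2)  # True: tolerates the rounding error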
neus_v/smooth_scoring.py
CHANGED

@@ -14,7 +14,8 @@ class DataTransformer:


 def smooth_confidence_scores(target_data, prior_distribution=None):
-    if prior_distribution is None:
-        prior_distribution = target_data
-    transformer = DataTransformer(prior_distribution)
-    return transformer.mapping_function(target_data)
+    # if prior_distribution is None:
+    #     prior_distribution = target_data
+    # transformer = DataTransformer(prior_distribution)
+    # return transformer.mapping_function(target_data)
+    return target_data[0]
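With the transformer bypassed, the function now returns the first score untouched. A one-line sketch of the new behavior, assuming target_data is a sequence of confidence scores:

smooth_confidence_scores([0.9, 0.8, 0.7])  # -> 0.9; the remaining scores are ignored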
neus_v/utils.py
CHANGED

@@ -1,5 +1,8 @@
 import gc
+import os
+import random

+import numpy as np
 import torch


@@ -8,3 +11,18 @@ def clear_gpu_memory():
     if torch.cuda.is_available():
         torch.cuda.ipc_collect()
     gc.collect()
+
+
+def set_seed(seed: int = 42) -> None:
+    np.random.seed(seed)
+    random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+
+    # When running on the CuDNN backend, two further options must be set
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+    # Set a fixed value for the hash seed
+    os.environ["PYTHONHASHSEED"] = str(seed)
+    print(f"Random seed set as {seed}")
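Typical usage is a single call at program start, before models or data loaders are constructed. Two caveats worth noting: torch.cuda.manual_seed seeds only the current device (torch.cuda.manual_seed_all covers multi-GPU), and PYTHONHASHSEED set from inside a running interpreter only affects subprocesses launched afterwards:

from neus_v.utils import set_seed

set_seed(42)  # call once at startup, before building models or dataloaders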
neus_v/veval/eval.py
CHANGED

@@ -6,7 +6,7 @@ from joblib import Parallel, delayed

 from neus_v.automaton.video_automaton import VideoAutomaton
 from neus_v.model_checking.stormpy import StormModelChecker
-from neus_v.veval.parse import parse_tl_formula
+from neus_v.veval.parse import parse_tl_formula, parse_until_to_next_frame
 from neus_v.video.frame import VideoFrame
 from neus_v.video.read_video import read_video

@@ -35,6 +35,7 @@ def evaluate_video(
     tl_spec: str,
     parallel_inference: bool = False,
     threshold: float = 0.1,
+    num_of_frame_in_sequence: int = 1,
 ) -> dict:
     """Evaluate a video using the given vision language model."""
     output_log = {
@@ -52,6 +53,7 @@

     # TODO: if there's F in the tl_spec
     ltl_formula = parse_tl_formula(tl_spec)
+    ltl_formula = parse_until_to_next_frame(ltl_formula)

     video_automaton = VideoAutomaton(include_initial_state=True)

@@ -97,10 +99,12 @@
         return video_frame, object_of_interest

     if parallel_inference:
+        frame_windows = create_frame_windows(frames=all_frames, window_size=num_of_frame_in_sequence)
         results = Parallel(n_jobs=len(all_frames))(
             delayed(process_frame)(frame_img, i) for i, frame_img in enumerate(all_frames)
         )
     else:
+        frame_windows = create_frame_windows(frames=all_frames, window_size=num_of_frame_in_sequence)
         results = [process_frame(frame_img, i) for i, frame_img in enumerate(all_frames)]

     for video_frame, object_of_interest in results:
@@ -163,6 +167,7 @@ def evaluate_video_with_sequence_of_images(

     # TODO: if there's F in the tl_spec
     ltl_formula = parse_tl_formula(tl_spec)
+    ltl_formula = parse_until_to_next_frame(ltl_formula)

     video_automaton = VideoAutomaton(include_initial_state=True)

@@ -191,7 +196,7 @@
         detected_object = vision_language_model.detect(
             seq_of_frames=sequence_of_frames,
             scene_description=proposition,
-            confidence_as_token_probability=confidence_as_token_probability,
+            # confidence_as_token_probability=confidence_as_token_probability,
             threshold=threshold,
         )
         object_of_interest[proposition] = detected_object
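create_frame_windows is called here but not defined in this diff, and the resulting frame_windows is not yet consumed in the hunks shown. A hypothetical sketch of what such a helper could look like, assuming non-overlapping windows:

def create_frame_windows(frames, window_size):
    # Hypothetical: group consecutive frames into windows of window_size,
    # keeping a shorter tail window when the count doesn't divide evenly.
    return [frames[i : i + window_size] for i in range(0, len(frames), window_size)]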
neus_v/veval/parse.py
CHANGED

@@ -27,3 +27,8 @@ def parse_proposition_set(proposition_set: list[str]) -> list[str]:
 def parse_tl_specification(tl_spec: str) -> str:
     """Parse the tl specification."""
     return tl_spec.replace("-", "_")
+
+
+def parse_until_to_next_frame(tl_spec: str) -> str:
+    """Rewrite each until (U) operator as a conjunction with eventually (& F)."""
+    return tl_spec.replace("U", "& F")
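The rewrite is a plain string substitution, so "p U q" becomes "p & F q", approximating until by "p holds now and q holds eventually". Because it fires on every capital U in the specification, proposition names are presumably kept lowercase:

parse_until_to_next_frame('"prop1" U "prop2"')  # -> '"prop1" & F "prop2"'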
neus_v/vlm/vllm_client.py
ADDED

@@ -0,0 +1,98 @@
+import base64
+
+import cv2
+import numpy as np
+from openai import OpenAI
+
+from neus_v.calibration.vlm import smooth_mapping
+from neus_v.vlm.obj import DetectedObject
+
+
+class VLLMClient:
+    def __init__(
+        self,
+        api_key="EMPTY",
+        api_base="http://localhost:8000/v1",
+        model="OpenGVLab/InternVL2_5-8B",
+    ):
+        self.client = OpenAI(api_key=api_key, base_url=api_base)
+        self.model = model
+        self.system_message = (
+            "You must only return a Yes or No, and not both, to any question asked.\n"
+            "You must not include any other symbols, information, text, justification in your answer or repeat Yes or No multiple times.\n"
+            "For example, if the question is 'Is there a cat present in the Image?', the answer must only be 'Yes' or 'No'."
+        )
+
+    # def _encode_frame(self, frame):
+    #     return base64.b64encode(frame.tobytes()).decode("utf-8")
+    def _encode_frame(self, frame):
+        # Encode a uint8 numpy array (image) as a JPEG and then base64 encode it.
+        ret, buffer = cv2.imencode(".jpg", frame)
+        if not ret:
+            raise ValueError("Could not encode frame")
+        return base64.b64encode(buffer).decode("utf-8")
+
+    def detect(self, seq_of_frames, scene_description, threshold=None, **kwargs):
+        # Encode each frame.
+        encoded_images = [self._encode_frame(frame) for frame in seq_of_frames]
+
+        # Build the user message: a text prompt plus one image for each frame.
+        user_content = [
+            {
+                "type": "text",
+                "text": f"Does the sequence of these images depict '{scene_description}'",
+            }
+        ]
+        for encoded in encoded_images:
+            user_content.append(
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
+                }
+            )
+
+        # Create a chat completion request.
+        chat_response = self.client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "system", "content": self.system_message},
+                {"role": "user", "content": user_content},
+            ],
+            max_tokens=1,
+            temperature=0.0,
+            logprobs=True,
+            top_logprobs=20,
+        )
+
+        # Retrieve the list of TopLogprob objects.
+        top_logprobs_list = chat_response.choices[0].logprobs.content[0].top_logprobs
+
+        # Build a mapping from token text (stripped) to its probability.
+        token_prob_map = {}
+        for top_logprob in top_logprobs_list:
+            token_text = top_logprob.token.strip()
+            token_prob_map[token_text] = np.exp(top_logprob.logprob)
+
+        # Extract probabilities for "Yes" and "No"
+        yes_prob = token_prob_map.get("Yes", 0.0)
+        no_prob = token_prob_map.get("No", 0.0)
+
+        # Compute the normalized probability for "Yes": p_yes / (p_yes + p_no)
+        if yes_prob + no_prob > 0:
+            confidence = yes_prob / (yes_prob + no_prob)
+        else:
+            raise ValueError("No probabilities for 'Yes' or 'No' found in the response.")
+
+        if threshold:
+            confidence = smooth_mapping(confidence=confidence, false_threshold=threshold)
+            if confidence < threshold:
+                no_prob = 1.0
+
+        return DetectedObject(
+            name=scene_description,
+            model_name=self.model,
+            confidence=round(confidence, 3),
+            probability=round(confidence, 3),
+            number_of_detection=1,
+            is_detected=yes_prob > no_prob,  # TODO: Check if this is correct
+        )
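A hypothetical usage sketch, assuming a vLLM OpenAI-compatible server is already running locally (e.g. started with "vllm serve OpenGVLab/InternVL2_5-8B") and that frame files exist at the paths shown:

import cv2

from neus_v.vlm.vllm_client import VLLMClient

client = VLLMClient()  # defaults target http://localhost:8000/v1
frames = [cv2.imread(f"frame_{i}.jpg") for i in range(3)]  # hypothetical frame paths
obj = client.detect(
    seq_of_frames=frames,
    scene_description="a car turning left",
    threshold=0.1,
)
print(obj.confidence, obj.is_detected)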