Spaces:

Syzygianinfern0
/

NeuS-V

Sleeping

App Files Files Community

Syzygianinfern0 committed on Apr 6

Commit

8a7a9dd

1 Parent(s): 31d38b7

Update everything from iccv submission

Browse files

Files changed (9) hide show

neus_v/automaton/video_automaton.py +17 -2
neus_v/calibration/__init__.py +0 -0
neus_v/calibration/vlm.py +99 -0
neus_v/model_checking/stormpy.py +2 -1
neus_v/smooth_scoring.py +5 -4
neus_v/utils.py +18 -0
neus_v/veval/eval.py +7 -2
neus_v/veval/parse.py +5 -0
neus_v/vlm/vllm_client.py +98 -0

neus_v/automaton/video_automaton.py CHANGED Viewed

@@ -18,6 +18,7 @@ class VideoAutomaton:
         self.previous_states: list[VideoState] = []
         self.states: list[VideoState] = []
         self.transitions = []
         self.include_initial_state = include_initial_state
     def set_up(self, proposition_set: list[str]) -> None:
@@ -68,6 +69,7 @@ class VideoAutomaton:
         # Build transitions from previous states to current states
         if self.previous_states:
             for prev_state in self.previous_states:
                 for cur_state in current_states:
                     transition = (
                         prev_state.state_index,
@@ -75,6 +77,7 @@ class VideoAutomaton:
                         cur_state.probability,
                     )
                     self.transitions.append(transition)
         self.previous_states = current_states if current_states else self.previous_states
         self.frame_index_in_automaton += 1
@@ -101,14 +104,26 @@ class VideoAutomaton:
                 (prev_state.state_index, prev_state.state_index, 1.0) for prev_state in self.previous_states
             )
     def _get_probability_of_propositions(self, frame: VideoFrame) -> None:
         """Update the probability of propositions."""
         for i, prop in enumerate(self.proposition_set):
-            prop = prop.replace("_", " ")
             if frame.object_of_interest.get(prop):
                 probability = frame.object_of_interest[prop].get_probability()
             else:
-                probability = 0.0
             self.probability_of_propositions[i].append(round(probability, 2))
     def _create_label_combinations(self, num_props: int) -> list[str]:

         self.previous_states: list[VideoState] = []
         self.states: list[VideoState] = []
         self.transitions = []
+        self.transition_map = {}
         self.include_initial_state = include_initial_state
     def set_up(self, proposition_set: list[str]) -> None:
         # Build transitions from previous states to current states
         if self.previous_states:
             for prev_state in self.previous_states:
+                self.transition_map[prev_state.state_index] = []
                 for cur_state in current_states:
                     transition = (
                         prev_state.state_index,
                         cur_state.probability,
                     )
                     self.transitions.append(transition)
+                    self.transition_map[prev_state.state_index].append(cur_state.state_index)
         self.previous_states = current_states if current_states else self.previous_states
         self.frame_index_in_automaton += 1
                 (prev_state.state_index, prev_state.state_index, 1.0) for prev_state in self.previous_states
             )
+    def get_frame_to_state_index(self) -> dict[int, list[int]]:
+        """Get frame to state index mapping."""
+        data = {}
+        for state in self.states:
+            if state.frame_index not in data:
+                data[state.frame_index] = []
+            data[state.frame_index].append(state.state_index)
+        return data
     def _get_probability_of_propositions(self, frame: VideoFrame) -> None:
         """Update the probability of propositions."""
         for i, prop in enumerate(self.proposition_set):
             if frame.object_of_interest.get(prop):
                 probability = frame.object_of_interest[prop].get_probability()
             else:
+                prop = prop.replace("_", " ")
+                if frame.object_of_interest.get(prop):
+                    probability = frame.object_of_interest[prop].get_probability()
+                else:
+                    probability = 0.0
             self.probability_of_propositions[i].append(round(probability, 2))
     def _create_label_combinations(self, num_props: int) -> list[str]:

neus_v/calibration/__init__.py ADDED Viewed

File without changes

neus_v/calibration/vlm.py ADDED Viewed

	@@ -0,0 +1,99 @@

+import matplotlib.pyplot as plt
+import numpy as np
def smooth_mapping(
    confidence: float,
    true_threshold=0.95,
    false_threshold=0.40,
    target_conf=0.60,
    target_prob=0.78,
    p_min=0.01,
    p_max=0.99,
    steepness_factor=0.7,
) -> float:
    """Map a confidence score to a probability along a rescaled sigmoid.

    Confidences at or below ``false_threshold`` map to ``p_min`` and those at
    or above ``true_threshold`` map to ``p_max``. In between, a sigmoid is
    fitted through the normalized target point and then rescaled so that it
    meets ``p_min`` / ``p_max`` exactly at the two thresholds.

    Args:
        confidence: Input confidence score.
        true_threshold: Upper threshold (default 0.95).
        false_threshold: Lower threshold (default 0.40).
        target_conf: Confidence of the target point (default 0.60).
        target_prob: Probability of the target point (default 0.78).
        p_min: Minimum returned probability (default 0.01).
        p_max: Maximum returned probability (default 0.99).
        steepness_factor: Multiplier on the fitted steepness
            (default 0.7; lower = less steep).

    Returns:
        The mapped probability as a plain Python ``float``.

    Raises:
        ValueError: If the parameters do not define a usable sigmoid (the
            normalized target probability is not strictly between 0 and 1, or
            the resulting steepness is zero or not finite). Without this check
            such parameters would yield NaN or a step function.
    """
    if confidence <= false_threshold:
        return p_min
    if confidence >= true_threshold:
        return p_max

    # Normalize the target point into the unit square spanned by the
    # thresholds (x) and the probability limits (y).
    x_norm = (target_conf - false_threshold) / (true_threshold - false_threshold)
    y_norm = (target_prob - p_min) / (p_max - p_min)
    if not 0 < y_norm < 1:
        raise ValueError(
            f"target_prob={target_prob} must lie strictly between p_min={p_min} and p_max={p_max}"
        )

    # Initial sigmoid centre on the normalized x axis. This is a fixed tuning
    # constant, not the midpoint of the range; it is only used to derive the
    # base steepness below.
    x0 = 0.30

    # For f(x) = 1 / (1 + e^(-k (x - x0))): pick k so f(x_norm) == y_norm.
    logit_term = np.log(1 / y_norm - 1)
    base_k = -logit_term / (x_norm - x0)
    k = base_k * steepness_factor
    if not np.isfinite(k) or k == 0:
        raise ValueError(
            "Parameters do not define a sigmoid: steepness is zero or not finite "
            f"(normalized target_conf={x_norm}, steepness_factor={steepness_factor})"
        )

    # After scaling k, shift the centre so the curve still passes through the
    # target point: y = 1/(1+e^(-k(x-x0)))  =>  x0 = x + ln(1/y - 1) / k.
    adjusted_x0 = x_norm + logit_term / k

    x_scaled = (confidence - false_threshold) / (true_threshold - false_threshold)
    sigmoid_value = 1 / (1 + np.exp(-k * (x_scaled - adjusted_x0)))

    # Rescale so the curve hits exactly p_min / p_max at the two thresholds.
    min_val = 1 / (1 + np.exp(-k * (0 - adjusted_x0)))
    max_val = 1 / (1 + np.exp(-k * (1 - adjusted_x0)))
    normalized = (sigmoid_value - min_val) / (max_val - min_val)

    # float() keeps the return type identical to the early-return branches.
    return float(p_min + normalized * (p_max - p_min))
def main(output_path="calibration.png"):
    """Plot the default confidence-to-probability mapping and print samples.

    The curve of ``smooth_mapping`` with its default parameters is drawn over
    confidences in [0, 1], with guide lines at the default thresholds
    (0.40 / 0.95) and probability limits (0.01 / 0.99).

    Args:
        output_path: File the figure is saved to.
    """
    # Visualize the function
    x = np.linspace(0, 1, 1000)
    y = [smooth_mapping(xi) for xi in x]

    fig = plt.figure(figsize=(10, 6))
    plt.plot(x, y, "r-", label="Mapping Function", linewidth=2)

    # Add vertical lines for thresholds
    plt.axvline(x=0.40, color="b", linestyle="--", label="False Threshold (0.40)")
    plt.axvline(x=0.95, color="g", linestyle="--", label="True Threshold (0.95)")

    # Add horizontal lines for probability limits
    plt.axhline(y=0.01, color="r", linestyle=":", alpha=0.5)
    plt.axhline(y=0.99, color="g", linestyle=":", alpha=0.5)

    plt.grid(True, alpha=0.3)
    plt.xlabel("Confidence Score")
    plt.ylabel("Mapped Probability")
    plt.title("Confidence to Probability Mapping (Matching Red Line)")
    plt.legend()
    plt.ylim(-0.05, 1.05)
    plt.savefig(output_path)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

    # Print some example values
    test_values = [0.4, 0.425, 0.6, 0.85, 0.9]
    for val in test_values:
        print(f"Confidence {val:.3f} → Probability {smooth_mapping(val):.3f}")
+if __name__ == "__main__":
+    main()

neus_v/model_checking/stormpy.py CHANGED Viewed

@@ -174,7 +174,8 @@ class StormModelChecker:
             # Check probabilities
             for state, prob_sum in outgoing_probs.items():
-                if not math.isclose(prob_sum, 1.0, rel_tol=1e-9):
                     logging.warning(f"State {state} has outgoing probability sum of {prob_sum}, not 1.0")
             # ... (existing logging code) ...

             # Check probabilities
             for state, prob_sum in outgoing_probs.items():
+                # if not math.isclose(prob_sum, 1.0, rel_tol=1e-9):
+                if not math.isclose(prob_sum, 1.0, abs_tol=1e-2):
                     logging.warning(f"State {state} has outgoing probability sum of {prob_sum}, not 1.0")
             # ... (existing logging code) ...

neus_v/smooth_scoring.py CHANGED Viewed

@@ -14,7 +14,8 @@ class DataTransformer:
 def smooth_confidence_scores(target_data, prior_distribution=None):
-    if prior_distribution is None:
-        prior_distribution = target_data
-    transformer = DataTransformer(prior_distribution)
-    return transformer.mapping_function(target_data)

 def smooth_confidence_scores(target_data, prior_distribution=None):
+    # if prior_distribution is None:
+    #     prior_distribution = target_data
+    # transformer = DataTransformer(prior_distribution)
+    # return transformer.mapping_function(target_data)
+    return target_data[0]

neus_v/utils.py CHANGED Viewed

@@ -1,5 +1,8 @@
 import gc
 import torch
@@ -8,3 +11,18 @@ def clear_gpu_memory():
     if torch.cuda.is_available():
         torch.cuda.ipc_collect()
     gc.collect()

 import gc
+import os
+import random
+import numpy as np
 import torch
     if torch.cuda.is_available():
         torch.cuda.ipc_collect()
     gc.collect()
def set_seed(seed: int = 42) -> None:
    """Seed the Python, NumPy and PyTorch random number generators.

    Also switches cuDNN to deterministic mode, disables its benchmark
    auto-tuner, and exports ``PYTHONHASHSEED``.

    Args:
        seed: Seed value applied to every generator.
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    # Seed every GPU, not only the current device; ignored when CUDA is absent.
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed. The running interpreter has already
    # fixed its hash seed at startup, so this only affects child processes.
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")

neus_v/veval/eval.py CHANGED Viewed

@@ -6,7 +6,7 @@ from joblib import Parallel, delayed
 from neus_v.automaton.video_automaton import VideoAutomaton
 from neus_v.model_checking.stormpy import StormModelChecker
-from neus_v.veval.parse import parse_tl_formula
 from neus_v.video.frame import VideoFrame
 from neus_v.video.read_video import read_video
@@ -35,6 +35,7 @@ def evaluate_video(
     tl_spec: str,
     parallel_inference: bool = False,
     threshold: float = 0.1,
 ) -> dict:
     """Evaluate a video using the given vision language model."""
     output_log = {
@@ -52,6 +53,7 @@ def evaluate_video(
     # TODO: if there's F in the tl_spec
     ltl_formula = parse_tl_formula(tl_spec)
     video_automaton = VideoAutomaton(include_initial_state=True)
@@ -97,10 +99,12 @@ def evaluate_video(
                 return video_frame, object_of_interest
             if parallel_inference:
                 results = Parallel(n_jobs=len(all_frames))(
                     delayed(process_frame)(frame_img, i) for i, frame_img in enumerate(all_frames)
                 )
             else:
                 results = [process_frame(frame_img, i) for i, frame_img in enumerate(all_frames)]
             for video_frame, object_of_interest in results:
@@ -163,6 +167,7 @@ def evaluate_video_with_sequence_of_images(
     # TODO: if there's F in the tl_spec
     ltl_formula = parse_tl_formula(tl_spec)
     video_automaton = VideoAutomaton(include_initial_state=True)
@@ -191,7 +196,7 @@ def evaluate_video_with_sequence_of_images(
                     detected_object = vision_language_model.detect(
                         seq_of_frames=sequence_of_frames,
                         scene_description=proposition,
-                        confidence_as_token_probability=confidence_as_token_probability,
                         threshold=threshold,
                     )
                     object_of_interest[proposition] = detected_object

 from neus_v.automaton.video_automaton import VideoAutomaton
 from neus_v.model_checking.stormpy import StormModelChecker
+from neus_v.veval.parse import parse_tl_formula, parse_until_to_next_frame
 from neus_v.video.frame import VideoFrame
 from neus_v.video.read_video import read_video
     tl_spec: str,
     parallel_inference: bool = False,
     threshold: float = 0.1,
+    num_of_frame_in_sequence: int = 1,
 ) -> dict:
     """Evaluate a video using the given vision language model."""
     output_log = {
     # TODO: if there's F in the tl_spec
     ltl_formula = parse_tl_formula(tl_spec)
+    ltl_formula = parse_until_to_next_frame(ltl_formula)
     video_automaton = VideoAutomaton(include_initial_state=True)
                 return video_frame, object_of_interest
             if parallel_inference:
+                frame_windows = create_frame_windows(frames=all_frames, window_size=num_of_frame_in_sequence)
                 results = Parallel(n_jobs=len(all_frames))(
                     delayed(process_frame)(frame_img, i) for i, frame_img in enumerate(all_frames)
                 )
             else:
+                frame_windows = create_frame_windows(frames=all_frames, window_size=num_of_frame_in_sequence)
                 results = [process_frame(frame_img, i) for i, frame_img in enumerate(all_frames)]
             for video_frame, object_of_interest in results:
     # TODO: if there's F in the tl_spec
     ltl_formula = parse_tl_formula(tl_spec)
+    ltl_formula = parse_until_to_next_frame(ltl_formula)
     video_automaton = VideoAutomaton(include_initial_state=True)
                     detected_object = vision_language_model.detect(
                         seq_of_frames=sequence_of_frames,
                         scene_description=proposition,
+                        # confidence_as_token_probability=confidence_as_token_probability,
                         threshold=threshold,
                     )
                     object_of_interest[proposition] = detected_object

neus_v/veval/parse.py CHANGED Viewed

@@ -27,3 +27,8 @@ def parse_proposition_set(proposition_set: list[str]) -> list[str]:
 def parse_tl_specification(tl_spec: str) -> str:
     """Parse the tl specification."""
     return tl_spec.replace("-", "_")

 def parse_tl_specification(tl_spec: str) -> str:
     """Parse the tl specification."""
     return tl_spec.replace("-", "_")
def parse_until_to_next_frame(tl_spec: str) -> str:
    """Rewrite each standalone until operator ``U`` in *tl_spec* as ``& F``.

    Only a ``U`` that stands alone as a token is replaced. A capital ``U``
    inside a longer identifier (for example a proposition named
    ``"Umbrella"``) is left untouched.

    Args:
        tl_spec: Temporal logic specification string.

    Returns:
        The specification with every standalone ``U`` replaced by ``& F``.
    """
    import re

    # \b keeps the match from firing inside proposition names.
    return re.sub(r"\bU\b", "& F", tl_spec)

neus_v/vlm/vllm_client.py ADDED Viewed

	@@ -0,0 +1,98 @@

+import base64
+import cv2
+import numpy as np
+from openai import OpenAI
+from neus_v.calibration.vlm import smooth_mapping
+from neus_v.vlm.obj import DetectedObject
class VLLMClient:
    """Yes/No scene detector backed by an OpenAI-compatible chat endpoint.

    Frames are sent as base64 JPEG data URLs together with a Yes/No question.
    The confidence is derived from the log-probabilities of the first
    generated token.
    """

    def __init__(
        self,
        api_key="EMPTY",
        api_base="http://localhost:8000/v1",
        model="OpenGVLab/InternVL2_5-8B",
    ):
        """Create the API client and store the model name and system prompt.

        Args:
            api_key: API key passed to the ``OpenAI`` client.
            api_base: Base URL of the chat completion endpoint.
            model: Model identifier sent with every request.
        """
        self.client = OpenAI(api_key=api_key, base_url=api_base)
        self.model = model
        self.system_message = (
            "You must only return a Yes or No, and not both, to any question asked.\n"
            "You must not include any other symbols, information, text, justification in your answer or repeat Yes or No multiple times.\n"
            "For example, if the question is 'Is there a cat present in the Image?', the answer must only be 'Yes' or 'No'."
        )

    def _encode_frame(self, frame):
        """Encode an image array as a base64 string of its JPEG bytes.

        Raises:
            ValueError: If ``cv2.imencode`` reports failure.
        """
        ret, buffer = cv2.imencode(".jpg", frame)
        if not ret:
            raise ValueError("Could not encode frame")
        return base64.b64encode(buffer).decode("utf-8")

    @staticmethod
    def _yes_no_probabilities(top_logprobs):
        """Sum the probabilities of the "Yes" and "No" candidate tokens.

        Token text is stripped before matching, so variants such as ``"Yes"``
        and ``" Yes"`` both count toward the same answer. Their probabilities
        are added rather than letting a later, lower-ranked variant overwrite
        an earlier one.

        Args:
            top_logprobs: Iterable of objects exposing ``token`` and
                ``logprob`` attributes.

        Returns:
            Tuple ``(yes_prob, no_prob)`` of plain floats.
        """
        totals = {"Yes": 0.0, "No": 0.0}
        for candidate in top_logprobs:
            answer = candidate.token.strip()
            if answer in totals:
                totals[answer] += float(np.exp(candidate.logprob))
        return totals["Yes"], totals["No"]

    def detect(self, seq_of_frames, scene_description, threshold=None, **kwargs):
        """Ask the model whether the frames depict *scene_description*.

        Args:
            seq_of_frames: Iterable of image arrays accepted by
                ``_encode_frame``.
            scene_description: Text inserted into the question and used as the
                name of the returned object.
            threshold: When truthy, the confidence is calibrated with
                ``smooth_mapping`` (using ``threshold`` as its
                ``false_threshold``), and a calibrated confidence below
                ``threshold`` forces ``is_detected`` to ``False``.
            **kwargs: Accepted and ignored.

        Returns:
            A ``DetectedObject`` whose ``confidence`` and ``probability`` are
            both the (possibly calibrated) confidence rounded to 3 decimals.

        Raises:
            ValueError: If the response carries no token log-probabilities, or
                neither "Yes" nor "No" appears among the top candidates.
        """
        # Build the user message: a text prompt plus one image for each frame.
        user_content = [
            {
                "type": "text",
                "text": f"Does the sequence of these images depict '{scene_description}'",
            }
        ]
        user_content.extend(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{self._encode_frame(frame)}"},
            }
            for frame in seq_of_frames
        )

        # A single generated token is enough: only its distribution is used.
        chat_response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": self.system_message},
                {"role": "user", "content": user_content},
            ],
            max_tokens=1,
            temperature=0.0,
            logprobs=True,
            top_logprobs=20,
        )

        logprobs = chat_response.choices[0].logprobs
        if logprobs is None or not logprobs.content:
            raise ValueError("Response does not contain token log-probabilities.")

        yes_prob, no_prob = self._yes_no_probabilities(logprobs.content[0].top_logprobs)

        # Normalized probability for "Yes": p_yes / (p_yes + p_no)
        if yes_prob + no_prob <= 0:
            raise ValueError("No probabilities for 'Yes' or 'No' found in the response.")
        confidence = yes_prob / (yes_prob + no_prob)

        # NOTE(review): detection is decided on the raw token probabilities,
        # while the threshold below is compared with the calibrated confidence.
        # Confirm this mix is intended.
        is_detected = yes_prob > no_prob
        if threshold:
            confidence = smooth_mapping(confidence=confidence, false_threshold=threshold)
            if confidence < threshold:
                is_detected = False

        rounded_confidence = round(float(confidence), 3)
        return DetectedObject(
            name=scene_description,
            model_name=self.model,
            confidence=rounded_confidence,
            probability=rounded_confidence,
            number_of_detection=1,
            is_detected=is_detected,
        )