Syzygianinfern0 committed
Commit 8d3e73e · 0 Parent(s)

Add refactored codebase
.gitignore ADDED
@@ -0,0 +1,168 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
README.md ADDED
File without changes
evaluate.py ADDED
@@ -0,0 +1,98 @@
1
+ import pickle
2
+ import warnings
3
+ from pathlib import Path
4
+
5
+ import gradio as gr
6
+
7
+ from neus_v.smooth_scoring import smooth_confidence_scores
8
+ from neus_v.utils import clear_gpu_memory
9
+ from neus_v.veval.eval import evaluate_video_with_sequence_of_images
10
+ from neus_v.veval.parse import parse_proposition_set, parse_tl_specification
11
+ from neus_v.vlm.internvl import InternVL
12
+
13
+ # Suppress specific warnings
14
+ warnings.filterwarnings(
15
+ "ignore", category=DeprecationWarning, message="Conversion of an array with ndim > 0 to a scalar is deprecated"
16
+ )
17
+
18
+ # Paths and parameters
19
+ WEIGHT_PATH = Path("/opt/mars/mnt/model_weights")
20
+ pickle_path = WEIGHT_PATH / "distributions.pkl"
21
+ num_of_frame_in_sequence = 3
22
+ model = "InternVL2-8B"
23
+ device = 7
24
+ # Load the vision-language model
25
+ vision_language_model = InternVL(model_name=model, device=device)
26
+ # Load distributions
27
+ with open(pickle_path, "rb") as f:
28
+ distributions = pickle.load(f)
29
+ all_dimension_data = distributions.get(model).get("all_dimension")
30
+
31
+
32
+ # TODO: Make paths better for public release
33
+ def process_video(video_path, propositions, tl):
34
+ """Process the video and compute the score_on_all."""
35
+ proposition_set = parse_proposition_set(propositions.split(","))
36
+ tl_spec = parse_tl_specification(tl)
37
+ threshold = 0.349
38
+
39
+ try:
40
+ result = evaluate_video_with_sequence_of_images(
41
+ vision_language_model=vision_language_model,
42
+ confidence_as_token_probability=True,
43
+ video_path=video_path,
44
+ proposition_set=proposition_set,
45
+ tl_spec=tl_spec,
46
+ parallel_inference=False,
47
+ num_of_frame_in_sequence=num_of_frame_in_sequence,
48
+ threshold=threshold,
49
+ )
50
+ probability = result.get("probability")
51
+ score_on_all = float(
52
+ smooth_confidence_scores(
53
+ target_data=[probability],
54
+ prior_distribution=all_dimension_data,
55
+ )
56
+ )
57
+ clear_gpu_memory()
58
+ return score_on_all
59
+
60
+ except Exception as e:
61
+ clear_gpu_memory()
62
+ return f"Error: {str(e)}"
63
+
64
+
65
+ # Gradio interface
66
+ def demo_interface(video, propositions, tl):
67
+ """Wrapper for the Gradio interface."""
68
+ return process_video(video, propositions, tl)
69
+
70
+
71
+ def main():
72
+ # Example data from the original script
73
+ example_video_path_1 = "/opt/mars/mnt/dataset/teaser/A_storm_bursts_in_with_intermittent_lightning_and_causes_flooding_and_large_waves_crash_in.mp4"
74
+ example_video_path_2 = "/opt/mars/mnt/dataset/teaser/The ocean waves gently lapping at the shore, until a storm bursts in, and then lightning flashes across the sky.mp4"
75
+ example_propositions = "waves lapping,ocean shore,storm bursts in,lightning on the sky"
76
+ example_tl = '("waves_lapping" & "ocean_shore") U ("storm_bursts_in" U "lightning_on_the_sky")'
77
+
78
+ demo = gr.Interface(
79
+ fn=demo_interface,
80
+ inputs=[
81
+ gr.Video(label="Upload Video"),
82
+ gr.Textbox(label="List of Propositions (comma-separated)"),
83
+ gr.Textbox(label="Temporal Logic Specification"),
84
+ ],
85
+ outputs=gr.Textbox(label="Score on All"),
86
+ title="Video Evaluation with Temporal Logic",
87
+ description="Upload a video and provide propositions and temporal logic to evaluate the score_on_all.",
88
+ examples=[
89
+ [example_video_path_1, example_propositions, example_tl],
90
+ [example_video_path_2, example_propositions, example_tl],
91
+ ],
92
+ )
93
+
94
+ demo.launch(allowed_paths=["/opt/mars/mnt/dataset/teaser"])
95
+
96
+
97
+ if __name__ == "__main__":
98
+ main()
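A minimal sketch of driving process_video directly, outside the Gradio UI (the video path below is a placeholder; importing evaluate loads the VLM and the prior distributions at module level):

from evaluate import process_video

score = process_video(
    video_path="/path/to/video.mp4",  # placeholder path
    propositions="waves lapping,ocean shore,storm bursts in,lightning on the sky",
    tl='("waves_lapping" & "ocean_shore") U ("storm_bursts_in" U "lightning_on_the_sky")',
)
print(score)  # smoothed score in [0, 1], or an "Error: ..." string on failure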
install.sh ADDED
@@ -0,0 +1,34 @@
1
+ #!/bin/bash
2
+ # These are the commands that I used to install the necessary packages for the project
3
+ conda install pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch -c nvidia
4
+ pip install gradio
5
+ pip install transformers
6
+ pip install decord
7
+ pip install opencv-python
8
+ pip install joblib
9
+
10
+ # Storm
11
+ sudo apt install libboost-all-dev m4
12
+
13
+ mkdir build
14
+ cd build
15
+ wget https://github.com/moves-rwth/storm/archive/stable.zip
16
+ unzip stable.zip
17
+ cd storm-stable
18
+ mkdir build
19
+ cd build
20
+ cmake ..
21
+
22
+ # Carl
23
+ cd FILLEMEUP
24
+ git clone https://github.com/moves-rwth/carl-storm
25
+ cd carl-storm
26
+ mkdir build
27
+ cd build
28
+ cmake ..
29
+ make lib_carl
30
+
31
+
32
+
33
+ pip install pycarl
34
+ pip install stormpy
neus_v/automaton/__init__.py ADDED
File without changes
neus_v/automaton/video_automaton.py ADDED
@@ -0,0 +1,133 @@
1
+ from neus_v.model_checking.proposition import process_proposition_set
2
+ from neus_v.model_checking.video_state import VideoState
3
+ from neus_v.video.frame import VideoFrame
4
+
5
+
6
+ class VideoAutomaton:
7
+ """Represents a Markov Automaton for video state modeling."""
8
+
9
+ def __init__(self, include_initial_state: bool = False) -> None:
10
+ """Initialize the MarkovAutomaton.
11
+
12
+ Args:
13
+ include_initial_state (bool, optional): Whether to include
14
+ the initial state. Defaults to False.
15
+ (The proposition set is supplied separately via set_up(),
16
+ not through this constructor.)
17
+ """
18
+ self.previous_states: list[VideoState] = []
19
+ self.states: list[VideoState] = []
20
+ self.transitions = []
21
+ self.include_initial_state = include_initial_state
22
+
23
+ def set_up(self, proposition_set: list[str]) -> None:
24
+ """Set up the MarkovAutomaton."""
25
+ self.proposition_set = process_proposition_set(proposition_set)
26
+ self.label_combinations = self._create_label_combinations(len(proposition_set))
27
+ self.probability_of_propositions = [[] for _ in range(len(proposition_set))]
28
+ self.frame_index_in_automaton = 0
29
+
30
+ if self.include_initial_state:
31
+ initial_state = VideoState(
32
+ state_index=0,
33
+ frame_index=-1,
34
+ label="init",
35
+ proposition_set=proposition_set,
36
+ )
37
+ self.previous_states = [initial_state]
38
+ self.states = [initial_state]
39
+ self._current_state = initial_state
40
+
41
+ def reset(self) -> None:
42
+ """Reset automaton."""
43
+ self.__init__(self.include_initial_state)
44
+ self.set_up(self.proposition_set)
45
+
46
+ def add_frame(self, frame: VideoFrame) -> None:
47
+ """Add frame to automaton."""
48
+ self._get_probability_of_propositions(frame)
49
+ current_states = []
50
+ for prop_comb in self.label_combinations:
51
+ # iterate through all possible combinations of T and F
52
+ self._current_state = VideoState(
53
+ state_index=len(self.states),
54
+ frame_index=self.frame_index_in_automaton,
55
+ label=prop_comb,
56
+ proposition_set=self.proposition_set,
57
+ )
58
+ # TODO: Make a method for update and compute probability
59
+ self._current_state.update(
60
+ frame_index=self.frame_index_in_automaton,
61
+ target_label=prop_comb,
62
+ )
63
+ self._current_state.compute_probability(probabilities=self.probability_of_propositions)
64
+ if self._current_state.probability > 0:
65
+ self.states.append(self._current_state)
66
+ current_states.append(self._current_state)
67
+
68
+ # Build transitions from previous states to current states
69
+ if self.previous_states:
70
+ for prev_state in self.previous_states:
71
+ for cur_state in current_states:
72
+ transition = (
73
+ prev_state.state_index,
74
+ cur_state.state_index,
75
+ cur_state.probability,
76
+ )
77
+ self.transitions.append(transition)
78
+
79
+ self.previous_states = current_states if current_states else self.previous_states
80
+ self.frame_index_in_automaton += 1
81
+
82
+ def add_terminal_state(self, add_with_terminal_label: bool = False) -> None:
83
+ """Add terminal state to the automaton."""
84
+ if add_with_terminal_label:
85
+ terminal_state_index = len(self.states)
86
+ terminal_state = VideoState(
87
+ state_index=terminal_state_index,
88
+ frame_index=self.frame_index_in_automaton,
89
+ label="terminal",
90
+ proposition_set=self.proposition_set,
91
+ )
92
+ self.states.append(terminal_state)
93
+ self._current_state = terminal_state
94
+
95
+ self.transitions.extend(
96
+ (prev_state.state_index, terminal_state_index, 1.0) for prev_state in self.previous_states
97
+ )
98
+ self.transitions.append((terminal_state_index, terminal_state_index, 1.0))
99
+ else:
100
+ self.transitions.extend(
101
+ (prev_state.state_index, prev_state.state_index, 1.0) for prev_state in self.previous_states
102
+ )
103
+
104
+ def _get_probability_of_propositions(self, frame: VideoFrame) -> None:
105
+ """Update the probability of propositions."""
106
+ for i, prop in enumerate(self.proposition_set):
107
+ prop = prop.replace("_", " ")
108
+ if frame.object_of_interest.get(prop):
109
+ probability = frame.object_of_interest[prop].get_probability()
110
+ else:
111
+ probability = 0.0
112
+ self.probability_of_propositions[i].append(round(probability, 2))
113
+
114
+ def _create_label_combinations(self, num_props: int) -> list[str]:
115
+ """Create all possible combinations of T and F for the number of propositions.
116
+
117
+ Args:
118
+ num_props (int): Number of propositions.
119
+
120
+ Returns:
121
+ list[str]: List of all possible combinations of T and F.
122
+ """ # noqa: E501
123
+ label_list = []
124
+
125
+ def add_labels(num_props: int, label: str, label_list: list[str]) -> None:
126
+ if len(label) == num_props:
127
+ label_list.append(label)
128
+ return
129
+ add_labels(num_props, label + "T", label_list)
130
+ add_labels(num_props, label + "F", label_list)
131
+
132
+ add_labels(num_props, "", label_list)
133
+ return label_list
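As a concrete illustration of the label enumeration above, two propositions yield the four truth assignments in T-first order (proposition names here are placeholders):

from neus_v.automaton.video_automaton import VideoAutomaton

automaton = VideoAutomaton()
automaton.set_up(proposition_set=["storm bursts in", "lightning on the sky"])
print(automaton.label_combinations)  # ['TT', 'TF', 'FT', 'FF']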
neus_v/model_checking/__init__.py ADDED
File without changes
neus_v/model_checking/proposition.py ADDED
@@ -0,0 +1,6 @@
1
+ def process_proposition_set(proposition_set: list[str]) -> list:
2
+ """Process proposition set."""
3
+ new_set = []
4
+ for proposition in proposition_set:
5
+ new_set.append(proposition.replace(" ", "_"))
6
+ return new_set
neus_v/model_checking/stormpy.py ADDED
@@ -0,0 +1,234 @@
1
+ import logging
2
+ import math
3
+
4
+ import numpy as np
5
+ import stormpy
6
+ import stormpy.examples.files
7
+ from stormpy.core import ExplicitQualitativeCheckResult
8
+
9
+ from neus_v.model_checking.proposition import process_proposition_set
10
+ from neus_v.model_checking.video_state import VideoState
11
+
12
+
13
+ class StormModelChecker:
14
+ """Model Checker using Stormpy for verifying properties."""
15
+
16
+ def __init__(
17
+ self,
18
+ proposition_set: list[str],
19
+ ltl_formula: str,
20
+ ) -> None:
21
+ """Initialize the StormModelChecker.
22
+
23
+ Args:
24
+ proposition_set: List of propositions.
25
+ ltl_formula: LTL formula to check.
26
+ (Verbose output and result filtering are configured per call,
27
+ e.g. via use_filter in check_automaton, not here.)
28
+ """
29
+ self.proposition_set = process_proposition_set(proposition_set)
30
+ self.ltl_formula = ltl_formula
31
+
32
+ def create_model(
33
+ self,
34
+ transitions: list[tuple[int, int, float]],
35
+ states: list[VideoState],
36
+ model_type: str = "sparse_ma",
37
+ ) -> any:
38
+ """Create model.
39
+
40
+ Args:
41
+ transitions (list[tuple[int, int, float]]): List of transitions.
42
+ states (list[VideoState]): List of states.
43
+ model_type (str): Type of model to create ("sparse_ma" or "dtmc").
44
+ verbose (bool): Whether to print verbose output.
45
+ """
46
+ state_labeling = self._build_label_func(states, self.proposition_set)
47
+ if model_type in ["sparse_ma", "mdp"]:
48
+ transition_matrix = self._build_trans_matrix(
49
+ transitions=transitions,
50
+ states=states,
51
+ model_type="nondeterministic",
52
+ )
53
+ else:
54
+ transition_matrix = self._build_trans_matrix(
55
+ transitions=transitions,
56
+ states=states,
57
+ model_type="deterministic",
58
+ )
59
+ components = stormpy.SparseModelComponents(
60
+ transition_matrix=transition_matrix,
61
+ state_labeling=state_labeling,
62
+ )
63
+ if model_type == "sparse_ma":
64
+ markovian_states = stormpy.BitVector(len(states), list(range(len(states))))
65
+ components.markovian_states = markovian_states
66
+ components.exit_rates = [1.0 for _ in range(len(states))]
67
+ model = stormpy.SparseMA(components)
68
+ elif model_type == "dtmc":
69
+ model = stormpy.storage.SparseDtmc(components)
70
+ elif model_type == "mdp":
71
+ model = stormpy.storage.SparseMdp(components)
72
+ else:
73
+ msg = f"Unsupported model type: {model_type}"
74
+ raise ValueError(msg)
75
+ return model
76
+
77
+ def check_automaton(
78
+ self,
79
+ transitions: list[tuple[int, int, float]],
80
+ states: list[VideoState],
81
+ model_type: str = "sparse_ma",
82
+ use_filter: bool = False,
83
+ ) -> any:
84
+ """Check automaton.
85
+
86
+ Args:
87
+ transitions: List of transitions.
88
+ states: List of states.
89
+ verbose: Enable verbose output.
90
+ use_filter: Apply filtering to results.
91
+ """
92
+ model = self.create_model(
93
+ transitions=transitions,
94
+ states=states,
95
+ model_type=model_type,
96
+ )
97
+ # Check the model
98
+ # Initialize Prism Program
99
+ path = stormpy.examples.files.prism_dtmc_die # prism_mdp_maze
100
+ prism_program = stormpy.parse_prism_program(path)
101
+
102
+ # Define Properties
103
+ properties = stormpy.parse_properties(self.ltl_formula, prism_program)
104
+
105
+ # Get Result and Filter it
106
+ result = stormpy.model_checking(model, properties[0])
107
+
108
+ if use_filter:
109
+ # The final result will only consider paths starting from the initial states of the automaton. # noqa: E501
110
+ filtered_result = stormpy.create_filter_initial_states_sparse(model)
111
+ result.filter(filtered_result)
112
+ return result
113
+
114
+ def qualitative_result_eval(self, verification_result: ExplicitQualitativeCheckResult) -> bool:
115
+ if isinstance(verification_result, ExplicitQualitativeCheckResult):
116
+ # string result is "true" when is absolutely true
117
+ # but it returns "true, false" when we have some true and false
118
+ verification_result_str = str(verification_result)
119
+ string_result = verification_result_str.split("{")[-1].split("}")[0]
120
+ if len(string_result) == 4:
121
+ # "true" alone means satisfied; any other four-character value is not
122
+ result = string_result[0] == "t"
123
+ elif len(string_result) > 5:
124
+ # "true, false" -> some true and some false
125
+ result = True
126
+ else:
127
+ result = False
128
+ return result
129
+ msg = "Model Checking is not qualitative"
130
+ raise ValueError(msg)
131
+
132
+ def _build_trans_matrix(
133
+ self,
134
+ transitions: list[tuple[int, int, float]],
135
+ states: list[VideoState],
136
+ model_type: str = "nondeterministic",
137
+ ) -> stormpy.storage.SparseMatrix:
138
+ """Build transition matrix.
139
+
140
+ Args:
141
+ transitions: List of transitions.
142
+ states: List of states.
143
+ model_type: Type of model ("nondeterministic" or "deterministic").
144
+ """
145
+ if model_type not in ["nondeterministic", "deterministic"]:
146
+ msg = "Invalid model_type. Must be 'nondeterministic' or 'deterministic'" # noqa: E501
147
+ raise ValueError(msg)
148
+
149
+ if model_type == "nondeterministic":
150
+ matrix = np.zeros((len(states), len(states)))
151
+ for t in transitions:
152
+ matrix[int(t[0]), int(t[1])] = float(t[2])
153
+ trans_matrix = stormpy.build_sparse_matrix(matrix, list(range(len(states))))
154
+
155
+ elif model_type == "deterministic":
156
+ num_states = len(states)
157
+ builder = stormpy.SparseMatrixBuilder(
158
+ rows=num_states,
159
+ columns=num_states,
160
+ entries=len(transitions),
161
+ force_dimensions=False,
162
+ )
163
+ states_with_transitions = set(src for src, _, _ in transitions)
164
+ outgoing_probs = {i: 0.0 for i in range(num_states)}
165
+
166
+ for src, dest, prob in transitions:
167
+ builder.add_next_value(src, dest, prob)
168
+ outgoing_probs[src] += prob
169
+
170
+ for state in range(num_states):
171
+ if state not in states_with_transitions:
172
+ builder.add_next_value(state, state, 1.0)
173
+ outgoing_probs[state] = 1.0
174
+
175
+ # Check probabilities
176
+ for state, prob_sum in outgoing_probs.items():
177
+ if not math.isclose(prob_sum, 1.0, rel_tol=1e-9):
178
+ logging.warning(f"State {state} has outgoing probability sum of {prob_sum}, not 1.0")
179
+
180
+ # ... (existing logging code) ...
181
+ trans_matrix = builder.build()
182
+ return trans_matrix
183
+
184
+ def _build_label_func(
185
+ self,
186
+ states: list[VideoState],
187
+ props: list[str],
188
+ model_type: str = "nondeterministic",
189
+ ) -> stormpy.storage.StateLabeling:
190
+ """Build label function.
191
+
192
+ Args:
193
+ states (list[State]): List of states.
194
+ props (list[str]): List of propositions.
195
+ model_type (str): Type of model
196
+ ("nondeterministic" or "deterministic").
197
+
198
+ Returns:
199
+ stormpy.storage.StateLabeling: State labeling.
200
+ """
201
+ state_labeling = stormpy.storage.StateLabeling(len(states))
202
+ state_labeling.add_label("init")
203
+ state_labeling.add_label("terminal")
204
+ for label in props:
205
+ state_labeling.add_label(label)
206
+
207
+ if model_type == "nondeterministic":
208
+ for state in states:
209
+ for label in state.descriptive_label:
210
+ state_labeling.add_label_to_state(label, state.state_index)
211
+ else:
212
+ for i, state in enumerate(states):
213
+ for prop in state.props:
214
+ if prop in props:
215
+ state_labeling.add_label_to_state(prop, i)
216
+ return state_labeling
217
+
218
+ def validate_tl_specification(self, ltl_formula: str) -> bool:
219
+ """Validate LTL specification.
220
+
221
+ Args:
222
+ ltl_formula: LTL formula to validate.
223
+ """
224
+ path = stormpy.examples.files.prism_dtmc_die # prism_mdp_maze
225
+ prism_program = stormpy.parse_prism_program(path)
226
+ # Define Properties
227
+ try:
228
+ stormpy.parse_properties(ltl_formula, prism_program)
229
+ except Exception as e:
230
+ msg = f"Error validating LTL specification: {e}"
231
+ logging.exception(msg)
232
+ return False
233
+ else:
234
+ return True
neus_v/model_checking/video_state.py ADDED
@@ -0,0 +1,80 @@
1
+ class VideoState:
2
+ """Video state class."""
3
+
4
+ def __init__(
5
+ self,
6
+ state_index: int,
7
+ frame_index: int,
8
+ label: str,
9
+ proposition_set: list[str],
10
+ probability: float = 1.0,
11
+ ) -> None:
12
+ """State class.
13
+
14
+ Args:
15
+ state_index (int): state_index.
16
+ frame_index (int): Frame index.
17
+ label (str): Label set. Label is a string with characters T or F
18
+ indicating True or False
19
+ proposition_set (list[str]): Proposition set.
20
+ probability (float): Probability of the state.
21
+ """
22
+ self.state_index = state_index
23
+ self.frame_index = frame_index
24
+ self.proposition_set = proposition_set
25
+ self.label = label # "init", "terminal", TTT, TFT, FTT, etc.
26
+ self.descriptive_label = self._get_descriptive_label(label=label)
27
+ self.probability = probability
28
+
29
+ def __repr__(self) -> str:
30
+ """Representation of state."""
31
+ return f"{self.state_index} {self.descriptive_label} {self.frame_index} {self.probability}" # noqa: E501
32
+
33
+ def __str__(self) -> str:
34
+ """String of state."""
35
+ return f"{self.__repr__()}"
36
+
37
+ def _get_descriptive_label(self, label: str) -> list:
38
+ """Get descriptive label.
39
+
40
+ Args:
41
+ label (str): Label.
42
+ """
43
+ labels = []
44
+ if label == "init":
45
+ labels.append("init")
46
+ elif label == "terminal":
47
+ labels.append("terminal")
48
+ else:
49
+ for i in range(len(self.proposition_set)):
50
+ if label[i] == "T":
51
+ labels.append(self.proposition_set[i])
52
+ return labels
53
+
54
+ def update(self, frame_index: int, target_label: str) -> None:
55
+ """Update state to the new state.
56
+
57
+ Args:
58
+ frame_index (int): Frame index.
59
+ target_label (str): Target label for the new state.
60
+ """
61
+ self.frame_index = frame_index
62
+ self.label = target_label # TTT, TFT, FTT, etc.
63
+ self.descriptive_label = self._get_descriptive_label(label=target_label)
64
+ self.probability = 1.0
65
+
66
+ def compute_probability(self, probabilities: list[list[float]]) -> None:
67
+ """Compute probability of the state given the probabilities of the propositions.
68
+
69
+ Args:
70
+ probabilities (list): list of probabilities of the propositions
71
+ e.g. two propositions with three frames
72
+ -> [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]].
73
+ """ # noqa: E501
74
+ probability = 1.0
75
+ for i in range(len(self.label)):
76
+ if self.label[i] == "T":
77
+ probability *= probabilities[i][self.frame_index]
78
+ else:
79
+ probability *= 1 - probabilities[i][self.frame_index]
80
+ self.probability = round(probability, 3)
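A worked example of compute_probability: with two propositions whose frame-0 probabilities are 0.8 and 0.3, a frame-0 state labelled "TF" scores 0.8 * (1 - 0.3) = 0.56 (proposition names are placeholders):

from neus_v.model_checking.video_state import VideoState

state = VideoState(state_index=1, frame_index=0, label="TF", proposition_set=["storm", "lightning"])
state.compute_probability(probabilities=[[0.8], [0.3]])
print(state.probability)  # 0.56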
neus_v/smooth_scoring.py ADDED
@@ -0,0 +1,20 @@
1
+ import numpy as np
2
+
3
+
4
+ class DataTransformer:
5
+ def __init__(self, data):
6
+ self.data = np.asarray(data)
7
+ self.sorted_data = np.sort(self.data)
8
+ self.n = len(self.sorted_data)
9
+ self.ecdf = np.arange(1, self.n + 1) / self.n
10
+
11
+ def mapping_function(self, x):
12
+ x = np.asarray(x)
13
+ return np.interp(x, self.sorted_data, self.ecdf, left=0, right=1)
14
+
15
+
16
+ def smooth_confidence_scores(target_data, prior_distribution=None):
17
+ if prior_distribution is None:
18
+ prior_distribution = target_data
19
+ transformer = DataTransformer(prior_distribution)
20
+ return transformer.mapping_function(target_data)
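The smoothing is an empirical-CDF mapping: each raw score is replaced by the (interpolated) fraction of the prior distribution lying at or below it. A small example with made-up prior scores:

from neus_v.smooth_scoring import smooth_confidence_scores

prior = [0.2, 0.4, 0.6, 0.8]  # hypothetical prior scores for one model
print(smooth_confidence_scores([0.5], prior_distribution=prior))  # [0.625]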
neus_v/utils.py ADDED
@@ -0,0 +1,10 @@
1
+ import gc
2
+
3
+ import torch
4
+
5
+
6
+ def clear_gpu_memory():
7
+ torch.cuda.empty_cache()
8
+ if torch.cuda.is_available():
9
+ torch.cuda.ipc_collect()
10
+ gc.collect()
neus_v/veval/__init__.py ADDED
File without changes
neus_v/veval/eval.py ADDED
@@ -0,0 +1,250 @@
1
+ import sys
2
+ from pathlib import Path
3
+
4
+ import numpy as np
5
+ from joblib import Parallel, delayed
6
+
7
+ from neus_v.automaton.video_automaton import VideoAutomaton
8
+ from neus_v.model_checking.stormpy import StormModelChecker
9
+ from neus_v.veval.parse import parse_tl_formula
10
+ from neus_v.video.frame import VideoFrame
11
+ from neus_v.video.read_video import read_video
12
+
13
+
14
+ def create_frame_windows(frames: list, window_size: int) -> list[list]:
15
+ """Create non-overlapping windows of frames, with remainder in last window.
16
+
17
+ Args:
18
+ frames: List of frames
19
+ window_size: Size of each window
20
+
21
+ Returns:
22
+ List of frame windows
23
+ """
24
+ windows = []
25
+ for i in range(0, len(frames), window_size):
26
+ windows.append(frames[i : i + window_size])
27
+ return windows
28
+
29
+
30
+ def evaluate_video(
31
+ vision_language_model,
32
+ confidence_as_token_probability: bool,
33
+ video_path: Path | str,
34
+ proposition_set: list,
35
+ tl_spec: str,
36
+ parallel_inference: bool = False,
37
+ threshold: float = 0.1,
38
+ ) -> dict:
39
+ """Evaluate a video using the given vision language model."""
40
+ output_log = {
41
+ "specification": None,
42
+ "propositions": None,
43
+ "probability": None,
44
+ "min_probability": None,
45
+ "max_probability": None,
46
+ "propositions_avg_probability": {},
47
+ }
48
+
49
+ if isinstance(video_path, str):
50
+ video_path = Path(video_path)
51
+ video = read_video(video_path=video_path)
52
+
53
+ # TODO: if there's F in the tl_spec
54
+ ltl_formula = parse_tl_formula(tl_spec)
55
+
56
+ video_automaton = VideoAutomaton(include_initial_state=True)
57
+
58
+ video_automaton.set_up(proposition_set=proposition_set)
59
+ model_checker = StormModelChecker(
60
+ proposition_set=proposition_set,
61
+ ltl_formula=ltl_formula,
62
+ )
63
+
64
+ proposition_probability_record = {}
65
+ for proposition in proposition_set:
66
+ proposition_probability_record[proposition] = []
67
+ if model_checker.validate_tl_specification(ltl_formula):
68
+ frame_count = 0
69
+ all_frames: list[np.ndarray] = video.get_all_frames_of_video(
70
+ return_format="ndarray",
71
+ desired_interval_in_sec=1,
72
+ )
73
+ try:
74
+ # for frame_img in all_frames:
75
+
76
+ def process_frame(frame_img: np.ndarray, frame_count: int):
77
+ sys.stdout.write(f"\rProcessing frame: {frame_count+1}/{len(all_frames)} ")
78
+ sys.stdout.flush()
79
+ object_of_interest = {}
80
+ for proposition in proposition_set:
81
+ detected_object = vision_language_model.detect(
82
+ frame_img=frame_img,
83
+ scene_description=proposition,
84
+ confidence_as_token_probability=confidence_as_token_probability,
85
+ threshold=threshold,
86
+ )
87
+ object_of_interest[proposition] = detected_object
88
+ # proposition_probability_record.get(proposition).append(
89
+ # detected_object.probability
90
+ # )
91
+ video_frame = VideoFrame(
92
+ frame_idx=frame_count,
93
+ timestamp=None,
94
+ frame_image=frame_img,
95
+ object_of_interest=object_of_interest,
96
+ )
97
+ return video_frame, object_of_interest
98
+
99
+ if parallel_inference:
100
+ results = Parallel(n_jobs=len(all_frames))(
101
+ delayed(process_frame)(frame_img, i) for i, frame_img in enumerate(all_frames)
102
+ )
103
+ else:
104
+ results = [process_frame(frame_img, i) for i, frame_img in enumerate(all_frames)]
105
+
106
+ for video_frame, object_of_interest in results:
107
+ video_automaton.add_frame(frame=video_frame)
108
+ for proposition, detected_object in object_of_interest.items():
109
+ proposition_probability_record[proposition].append(detected_object.probability)
110
+
111
+ video_automaton.add_terminal_state(add_with_terminal_label=True)
112
+ sys.stdout.write("\n") # Move to the next line after processing all frames
113
+ result = model_checker.check_automaton(
114
+ states=video_automaton.states,
115
+ transitions=video_automaton.transitions,
116
+ model_type="dtmc",
117
+ use_filter=True,
118
+ )
119
+ output_log["specification"] = tl_spec
120
+ output_log["propositions"] = proposition_set
121
+ output_log["probability"] = round(float(str(result)), 6)
122
+ output_log["min_probability"] = round(float(str(result.min)), 6)
123
+ output_log["max_probability"] = round(float(str(result.max)), 6)
124
+ for (
125
+ proposition,
126
+ probabilities,
127
+ ) in proposition_probability_record.items():
128
+ avg_probability = sum(probabilities) / len(probabilities)
129
+ output_log["propositions_avg_probability"][proposition] = round(avg_probability, 3)
130
+ except Exception as e: # noqa: BLE001
131
+ # print(f"\nError processing frame {frame_count}: {e}")
132
+ import traceback
133
+
134
+ print(f"\nError processing frame {frame_count}: {e}")
135
+ traceback.print_exc()
136
+
137
+ return output_log
138
+
139
+
140
+ def evaluate_video_with_sequence_of_images(
141
+ vision_language_model,
142
+ confidence_as_token_probability: bool,
143
+ video_path: Path | str,
144
+ proposition_set: list,
145
+ tl_spec: str,
146
+ parallel_inference: bool = False,
147
+ num_of_frame_in_sequence: int = 3,
148
+ threshold: float = 0.1,
149
+ ) -> dict:
150
+ """Evaluate a video using the given vision language model."""
151
+ output_log = {
152
+ "specification": None,
153
+ "propositions": None,
154
+ "probability": None,
155
+ "min_probability": None,
156
+ "max_probability": None,
157
+ "propositions_avg_probability": {},
158
+ }
159
+
160
+ if isinstance(video_path, str):
161
+ video_path = Path(video_path)
162
+ video = read_video(video_path=video_path)
163
+
164
+ # TODO: if there's F in the tl_spec
165
+ ltl_formula = parse_tl_formula(tl_spec)
166
+
167
+ video_automaton = VideoAutomaton(include_initial_state=True)
168
+
169
+ video_automaton.set_up(proposition_set=proposition_set)
170
+ model_checker = StormModelChecker(
171
+ proposition_set=proposition_set,
172
+ ltl_formula=ltl_formula,
173
+ )
174
+
175
+ proposition_probability_record = {}
176
+ for proposition in proposition_set:
177
+ proposition_probability_record[proposition] = []
178
+ if model_checker.validate_tl_specification(ltl_formula):
179
+ frame_count = 0
180
+ all_frames: list[np.ndarray] = video.get_all_frames_of_video(
181
+ return_format="ndarray",
182
+ desired_interval_in_sec=0.5,
183
+ )
184
+ try:
185
+ # for frame_img in all_frames:
186
+ def process_frame(sequence_of_frames: list[np.ndarray], frame_count: int):
187
+ sys.stdout.write(f"\rProcessing frame window: {frame_count+1}/{len(frame_windows)} ")
188
+ sys.stdout.flush()
189
+ object_of_interest = {}
190
+ for proposition in proposition_set:
191
+ detected_object = vision_language_model.detect(
192
+ seq_of_frames=sequence_of_frames,
193
+ scene_description=proposition,
194
+ confidence_as_token_probability=confidence_as_token_probability,
195
+ threshold=threshold,
196
+ )
197
+ object_of_interest[proposition] = detected_object
198
+ # proposition_probability_record.get(proposition).append(
199
+ # detected_object.probability
200
+ # )
201
+ print(f"{proposition}: {detected_object.probability}")
202
+ video_frame = VideoFrame(
203
+ frame_idx=frame_count,
204
+ timestamp=None,
205
+ frame_image=sequence_of_frames,
206
+ object_of_interest=object_of_interest,
207
+ )
208
+ return video_frame, object_of_interest
209
+
210
+ if parallel_inference:
211
+ frame_windows = create_frame_windows(frames=all_frames, window_size=num_of_frame_in_sequence)
212
+ results = Parallel(n_jobs=len(frame_windows))(
213
+ delayed(process_frame)(frame_img, i) for i, frame_img in enumerate(frame_windows)
214
+ )
215
+ else:
216
+ frame_windows = create_frame_windows(frames=all_frames, window_size=num_of_frame_in_sequence)
217
+ results = [process_frame(sequence_of_frames, i) for i, sequence_of_frames in enumerate(frame_windows)]
218
+
219
+ for video_frame, object_of_interest in results:
220
+ video_automaton.add_frame(frame=video_frame)
221
+ for proposition, detected_object in object_of_interest.items():
222
+ proposition_probability_record[proposition].append(detected_object.probability)
223
+
224
+ video_automaton.add_terminal_state(add_with_terminal_label=True)
225
+ sys.stdout.write("\n") # Move to the next line after processing all frames
226
+ result = model_checker.check_automaton(
227
+ states=video_automaton.states,
228
+ transitions=video_automaton.transitions,
229
+ model_type="dtmc",
230
+ use_filter=True,
231
+ )
232
+ output_log["specification"] = tl_spec
233
+ output_log["propositions"] = proposition_set
234
+ output_log["probability"] = round(float(str(result)), 6)
235
+ output_log["min_probability"] = round(float(str(result.min)), 6)
236
+ output_log["max_probability"] = round(float(str(result.max)), 6)
237
+ for (
238
+ proposition,
239
+ probabilities,
240
+ ) in proposition_probability_record.items():
241
+ avg_probability = sum(probabilities) / len(probabilities)
242
+ output_log["propositions_avg_probability"][proposition] = round(avg_probability, 3)
243
+ except Exception as e: # noqa: BLE001
244
+ # print(f"\nError processing frame {frame_count}: {e}")
245
+ import traceback
246
+
247
+ print(f"\nError processing frame {frame_count}: {e}")
248
+ traceback.print_exc()
249
+
250
+ return output_log
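create_frame_windows, defined at the top of this file, keeps any leftover frames in a final, shorter window, for example:

from neus_v.veval.eval import create_frame_windows

print(create_frame_windows(frames=list(range(7)), window_size=3))
# [[0, 1, 2], [3, 4, 5], [6]]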
neus_v/veval/parse.py ADDED
@@ -0,0 +1,29 @@
1
+ def parse_tl_formula(tl_spec: str) -> str:
2
+ """Validate the tl specification."""
3
+ if 'G "' in tl_spec:
4
+ tl_spec = tl_spec.replace('G "', 'F "')
5
+ tl_spec = tl_spec.replace("-", "_")
6
+ if tl_spec[0] == "\n":
7
+ tl_spec = tl_spec[1:]
8
+
9
+ if tl_spec[0] in ["F"]:
10
+ return f"P=? [{tl_spec}]"
11
+
12
+ if tl_spec[0] in ["G"]:
13
+ tl_spec = tl_spec[1:]
14
+ return f"P=? [F {tl_spec}]"
15
+
16
+ # if any(op in tl_spec for op in ["F", "G", "U"]):
17
+ # return f"P=? [F ({tl_spec})]"
18
+
19
+ return f"P=? [F {tl_spec}]"
20
+
21
+
22
+ def parse_proposition_set(proposition_set: list[str]) -> list[str]:
23
+ """Parse the proposition set."""
24
+ return [prop.replace("-", "_") for prop in proposition_set]
25
+
26
+
27
+ def parse_tl_specification(tl_spec: str) -> str:
28
+ """Parse the tl specification."""
29
+ return tl_spec.replace("-", "_")
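parse_tl_formula wraps a bare temporal-logic formula into a Storm P=? reachability query (and rewrites a leading G as eventual satisfaction). For example, with the specification used in evaluate.py:

from neus_v.veval.parse import parse_tl_formula

spec = '("waves_lapping" & "ocean_shore") U ("storm_bursts_in" U "lightning_on_the_sky")'
print(parse_tl_formula(spec))
# P=? [F ("waves_lapping" & "ocean_shore") U ("storm_bursts_in" U "lightning_on_the_sky")]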
neus_v/video/__init__.py ADDED
File without changes
neus_v/video/frame.py ADDED
@@ -0,0 +1,81 @@
1
+ import dataclasses
2
+ from typing import TYPE_CHECKING
3
+
4
+ import cv2
5
+
6
+ if TYPE_CHECKING:
7
+ import numpy as np
8
+
9
+
10
+ @dataclasses.dataclass
11
+ class VideoFrame:
12
+ """Frame class."""
13
+
14
+ frame_idx: int
15
+ timestamp: int | None = None
16
+ frame_image: np.ndarray | None = None
17
+ annotated_image: dict[str, np.ndarray] = dataclasses.field(default_factory=dict)
18
+ detected_object_set: dict | None = None
19
+ object_of_interest: dict | None = None
20
+ activity_of_interest: dict | None = None
21
+
22
+ def save_frame_img(self, save_path: str) -> None:
23
+ """Save frame image."""
24
+ if self.frame_image is not None:
25
+ cv2.imwrite(
26
+ save_path,
27
+ self.frame_image,
28
+ )
29
+
30
+ def is_any_object_detected(self) -> bool:
31
+ """Check if object is detected."""
32
+ return len(self.detected_object_set.objects) > 0
33
+
34
+ @property
35
+ def list_of_detected_object_of_interest(self) -> list:
36
+ """Get detected object."""
37
+ detected_obj = []
38
+ for obj_name, obj_value in self.object_of_interest.items():
39
+ if obj_value.is_detected:
40
+ detected_obj.append(obj_name)
41
+ return detected_obj
42
+
43
+ @property
44
+ def detected_object_dict(self) -> dict:
45
+ """Get detected object info as dict."""
46
+ detected_obj = {}
47
+ for obj_name, obj_value in self.object_of_interest.items():
48
+ if obj_value.is_detected:
49
+ detected_obj[obj_name] = {}
50
+ detected_obj[obj_name]["total_number_of_detection"] = obj_value.number_of_detection
51
+ detected_obj[obj_name]["maximum_probability"] = max(obj_value.probability_of_all_obj)
52
+ detected_obj[obj_name]["minimum_probability"] = min(obj_value.probability_of_all_obj)
53
+ detected_obj[obj_name]["maximum_confidence"] = max(obj_value.confidence_of_all_obj)
54
+ detected_obj[obj_name]["minimum_confidence"] = min(obj_value.confidence_of_all_obj)
55
+
56
+ return detected_obj
57
+
58
+ def detected_bboxes(self, probability_threshold: bool = False) -> list:
59
+ """Get detected object.
60
+
61
+ Args:
62
+ probability_threshold (bool): If True, only keep boxes whose
63
+ probability is greater than zero. Defaults to False.
64
+
65
+ Returns:
66
+ list: Bounding boxes.
67
+ """
68
+ bboxes = []
69
+
70
+ for _, obj_value in self.object_of_interest.items():
71
+ if obj_value.is_detected:
72
+ if probability_threshold:
73
+ for obj_prob in obj_value.probability_of_all_obj:
74
+ if obj_prob > 0:
75
+ bboxes += obj_value.bounding_box_of_all_obj
76
+ else:
77
+ bboxes += obj_value.bounding_box_of_all_obj
78
+
79
+ return bboxes
80
+
81
+
neus_v/video/read_video.py ADDED
@@ -0,0 +1,45 @@
1
+ from pathlib import Path
2
+ from typing import TYPE_CHECKING
3
+
4
+ from neus_v.video.video import Video, VideoFormat
5
+
6
+ if TYPE_CHECKING:
7
+ import numpy as np
8
+
9
+
10
+ def read_video(
11
+ video_path: str | Path | None = None,
12
+ sequence_of_image: list[np.ndarray] | None = None,
13
+ ) -> Video:
14
+ """Read video from video_path or sequence of images.
15
+
16
+ Args:
17
+ video_path (str | Path | None): Path to video file. Defaults to None.
18
+ sequence_of_image (list[np.ndarray] | None): Sequence of images
19
+ as numpy arrays. Defaults to None.
20
+
21
+ Returns:
22
+ Video: Video object.
23
+
24
+ Raises:
25
+ ValueError: If neither or both video_path and
26
+ sequence_of_image are provided.
27
+ """
28
+ if (video_path is None) == (sequence_of_image is None):
29
+ msg = "Exactly one of video_path or sequence_of_image must be provided."
30
+ raise ValueError(msg)
31
+ if video_path:
32
+ if isinstance(video_path, str):
33
+ video_path = Path(video_path)
34
+
35
+ if video_path.suffix == ".mp4":
36
+ read_format = VideoFormat.MP4
37
+
38
+ if sequence_of_image:
39
+ read_format = VideoFormat.LIST_OF_ARRAY
40
+
41
+ return Video(
42
+ video_path=video_path,
43
+ sequence_of_image=sequence_of_image,
44
+ read_format=read_format,
45
+ )
neus_v/video/video.py ADDED
@@ -0,0 +1,304 @@
1
+ import enum
2
+ import logging
3
+ import uuid
4
+ from dataclasses import dataclass, field
5
+ from typing import TYPE_CHECKING
6
+
7
+ import cv2
8
+ from PIL import Image
9
+
10
+ if TYPE_CHECKING:
11
+ from pathlib import Path
12
+
13
+ import numpy as np
14
+
15
+
16
+ class VideoFormat(enum.Enum):
17
+ """Status Enum for the CV API."""
18
+
19
+ MP4 = "mp4"
20
+ LIST_OF_ARRAY = "list_of_array"
21
+
22
+
23
+ @dataclass
24
+ class VideoInfo:
25
+ """Represents information about a video file."""
26
+
27
+ format: VideoFormat
28
+ frame_width: int
29
+ frame_height: int
30
+ original_frame_count: int
31
+ video_id: uuid.UUID = field(default_factory=uuid.uuid4)
32
+ video_path: str | None = None
33
+ processed_fps: float | None = None
34
+ processed_frame_count: int = 1
35
+ original_fps: float | None = None
36
+
37
+
38
+ class Video:
39
+ """vflow's Video Object."""
40
+
41
+ def __init__(
42
+ self,
43
+ read_format: VideoFormat,
44
+ video_path: str | Path | None = None,
45
+ sequence_of_image: list[np.ndarray] | None = None,
46
+ ) -> None:
47
+ """Video Frame Processor.
48
+
49
+ Args:
50
+ video_path (str | Path): Path to video file.
51
+ read_format (VideoFormat): Format to read the video in.
52
+ sequence_of_image (list[np.ndarray] | None): List of image arrays
53
+ for processing.
54
+ """
55
+ self._video_path = video_path
56
+ self._read_format = read_format
57
+ self.video_info = None
58
+ if sequence_of_image:
59
+ self.all_frames = sequence_of_image
60
+ if isinstance(sequence_of_image[0], list):
61
+ self.all_frames = sequence_of_image[0]
62
+ self.import_video(str(video_path))
63
+ self.current_frame_index = 0
64
+ self.video_ended = False
65
+
66
+ def __str__(self) -> str:
67
+ """Return a concise string representation of the Video object."""
68
+ return str(self.video_info)
69
+
70
+ def __repr__(self) -> str:
71
+ """Return a detailed string representation of the Video object."""
72
+ return repr(self.video_info)
73
+
74
+ def import_video(self, video_path: str | None) -> None:
75
+ """Read video from video_path.
76
+
77
+ Args:
78
+ video_path (str): Path to video file.
79
+ """
80
+ logging.info(f"Video format: {self._read_format}")
81
+ if self._read_format == VideoFormat.MP4:
82
+ self._cap = cv2.VideoCapture(video_path)
83
+ ret, _ = self._cap.read()
84
+ if not ret:
85
+ logging.error("Video path is invalid.")
86
+ self.video_info = VideoInfo(
87
+ video_path=str(self._video_path),
88
+ format=self._read_format,
89
+ frame_width=int(self._cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
90
+ frame_height=int(self._cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
91
+ original_fps=self._cap.get(cv2.CAP_PROP_FPS),
92
+ original_frame_count=int(self._cap.get(cv2.CAP_PROP_FRAME_COUNT)),
93
+ )
94
+ elif self._read_format == VideoFormat.LIST_OF_ARRAY:
95
+ self.video_info = VideoInfo(
96
+ format=self._read_format,
97
+ frame_width=int(self.all_frames[0].shape[0]),
98
+ frame_height=int(self.all_frames[0].shape[1]),
99
+ original_frame_count=len(self.all_frames),
100
+ )
101
+
102
+ def _resize_frame_by_scale(self, frame_img: np.ndarray, frame_scale: int) -> np.ndarray:
103
+ """Resize frame image.
104
+
105
+ Args:
106
+ frame_img (np.ndarray): Frame image.
107
+ frame_scale (int): Scale of frame.
108
+
109
+ Returns:
110
+ np.ndarray: Resized frame image.
111
+ """
112
+ return cv2.resize(
113
+ frame_img,
114
+ (
115
+ int(self.video_info.frame_width / frame_scale),
116
+ int(self.video_info.frame_height / frame_scale),
117
+ ),
118
+ )
119
+
120
+ def get_all_frames_of_video(
121
+ self,
122
+ return_format: str = "ndarray",
123
+ frame_scale: int | None = None,
124
+ desired_fps: int | None = None,
125
+ desired_interval_in_sec: int | None = None,
126
+ ) -> list:
127
+ """Get video frames by frame_scale and second_per_frame.
128
+
129
+ Args:
130
+ return_format (str, optional): Return format. Defaults to "ndarray".
131
+ Options: [cv2, ndarray]
132
+ frame_scale (int | None, optional): Frame scale. Defaults to None.
133
+ desired_fps (int | None, optional): Desired FPS. Defaults to None.
134
+ desired_interval_in_sec (int | None, optional): Interval between frames in seconds.
135
+ If provided, frames will be extracted at this interval. Defaults to None.
136
+ """ # noqa: E501
137
+ if self._read_format == VideoFormat.LIST_OF_ARRAY:
138
+ resize_func = lambda img: self.process_frame_image( # noqa: E731
139
+ frame_img=img,
140
+ frame_scale=frame_scale,
141
+ return_format=return_format,
142
+ )
143
+ all_frames = list(map(resize_func, self.all_frames))
144
+ self.processed_frame_count = len(all_frames)
145
+ return all_frames
146
+
147
+ all_frames = []
148
+
149
+ if self._read_format == VideoFormat.MP4 and desired_fps is None and desired_interval_in_sec is None:
150
+ msg = (
151
+ "Either desired_fps "
152
+ "or desired_interval_in_sec must be provided."
153
+ )
154
+ raise ValueError(msg)
155
+
156
+ if self._read_format == VideoFormat.MP4:
157
+ frame_step = self.get_frame_step(
158
+ desired_fps=desired_fps,
159
+ desired_interval_in_sec=desired_interval_in_sec,
160
+ )
161
+
162
+ for real_frame_idx in range(0, int(self.video_info.original_frame_count), int(frame_step)):
163
+ self._cap.set(cv2.CAP_PROP_POS_FRAMES, real_frame_idx)
164
+ ret, frame_img = self._cap.read()
165
+ if not ret:
166
+ break
167
+ frame_img = cv2.cvtColor(frame_img, cv2.COLOR_BGR2RGB)
168
+ frame_img = self.process_frame_image(
169
+ frame_img=frame_img,
170
+ frame_scale=frame_scale,
171
+ return_format=return_format,
172
+ )
173
+ all_frames.append(frame_img)
174
+ self._cap.release()
175
+ # cv2.destroyAllWindows()
176
+ self.processed_frame_count = len(all_frames)
177
+ return all_frames
178
+
179
+ def get_next_frame(
180
+ self,
181
+ return_format: str = "ndarray",
182
+ frame_scale: int | None = None,
183
+ desired_fps: int | None = None,
184
+ desired_interval_in_sec: int | None = None,
185
+ ) -> np.ndarray | None:
186
+ """Get the next video frame based on frame step.
187
+
188
+ Args:
189
+ return_format (str, optional): Return format. Defaults to "ndarray".
190
+ - [cv2, ndarray, pil]
191
+ frame_scale (int | None, optional): Frame scale. Defaults to None.
192
+ desired_fps (int | None, optional): Desired FPS. Defaults to None.
193
+ desired_interval_in_sec (int | None, optional): Desired interval.
194
+ Defaults to None.
195
+
196
+ Returns:
197
+ np.ndarray | None: The next frame as an ndarray, or None if no more
198
+ frames are available or the video ended.
199
+ """
200
+ if self._read_format == VideoFormat.MP4 and desired_fps is None and desired_interval_in_sec is None:
201
+ msg = (
202
+ "Either desired_fps or "
203
+ "desired_interval_in_sec must be provided."
204
+ )
205
+ raise ValueError(msg)
206
+
207
+ if self.video_ended:
208
+ logging.info("No frame available.")
209
+ return None # No more frames to process
210
+
211
+ if self._read_format == VideoFormat.MP4:
212
+ frame_step = self.get_frame_step(
213
+ desired_fps=desired_fps,
214
+ desired_interval_in_sec=desired_interval_in_sec,
215
+ )
216
+ # Skip to the next frame based on frame_step
217
+ self._cap.set(cv2.CAP_PROP_POS_FRAMES, self.current_frame_index)
218
+
219
+ ret, frame_img = self._cap.read()
220
+
221
+ if not ret:
222
+ self.video_ended = True
223
+ return None # No more frames or error occurred
224
+
225
+ # Update the current frame index for the next call
226
+ self.current_frame_index += frame_step
227
+
228
+ frame_img = cv2.cvtColor(frame_img, cv2.COLOR_BGR2RGB)
229
+
230
+ if self._read_format == VideoFormat.LIST_OF_ARRAY:
231
+ if self.current_frame_index < len(self.all_frames):
232
+ frame_img = self.all_frames[self.current_frame_index]
233
+ self.current_frame_index += 1
234
+ else:
235
+ # No more frames available.
236
+ self.video_ended = True
237
+ return None
238
+
239
+ self.video_info.processed_frame_count += 1
240
+
241
+ return self.process_frame_image(
242
+ frame_img=frame_img,
243
+ frame_scale=frame_scale,
244
+ return_format=return_format,
245
+ )
246
+
247
+ def process_frame_image(
248
+ self,
249
+ frame_img: np.ndarray,
250
+ return_format: str = "ndarray",
251
+ frame_scale: int | None = None,
252
+ ) -> np.ndarray:
253
+ """Process a single frame image.
254
+
255
+ Args:
256
+ frame_img (np.ndarray): Input frame image.
257
+ return_format (str, optional): Desired return format.
258
+ Defaults to "ndarray".
259
+ frame_scale (int | None, optional): Scale factor for resizing.
260
+ Defaults to None.
261
+
262
+ Returns:
263
+ np.ndarray: Processed frame image.
264
+ """
265
+ if frame_scale is not None:
266
+ frame_img = self._resize_frame_by_scale(frame_img, frame_scale)
267
+ if return_format == "pil":
268
+ frame_img = Image.fromarray(frame_img).convert("RGB")
269
+ return frame_img
270
+
271
+ def get_frame_step(
272
+ self,
273
+ desired_interval_in_sec: int | None = None,
274
+ desired_fps: int | None = None,
275
+ ) -> int:
276
+ """Calculate the frame step based on desired interval or FPS.
277
+
278
+ Args:
279
+ desired_interval_in_sec (int | None): Desired interval between frames in seconds.
280
+ desired_fps (int | None): Desired frames per second.
281
+
282
+ Returns:
283
+ int: Calculated frame step.
284
+ """ # noqa: E501
285
+ if desired_fps is not None:
286
+ frame_step = int(round(self.video_info.original_fps / desired_fps))
287
+ processed_fps = desired_fps
288
+ if desired_interval_in_sec is not None:
289
+ frame_step = int(round(self.video_info.original_fps * desired_interval_in_sec))
290
+ processed_fps = round(1 / desired_interval_in_sec, 2)
291
+ self.video_info.processed_fps = processed_fps
292
+
293
+ return frame_step
294
+
295
+ def insert_annotation_to_current_frame(self, annotations: list[str]) -> None:
296
+ """Insert annotations to the current frame.
297
+
298
+ Args:
299
+ annotations (list[str]): List of annotations.
300
+ """
301
+
302
+ def get_video_info(self) -> VideoInfo:
303
+ """Return the VideoInfo object containing video information."""
304
+ return self.video_info
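get_frame_step converts the requested sampling rate into a stride over the source frames. A sketch, assuming a 30 fps MP4 at a hypothetical path:

from neus_v.video.read_video import read_video

video = read_video(video_path="clip.mp4")  # hypothetical 30 fps clip
step = video.get_frame_step(desired_interval_in_sec=0.5)
print(step, video.video_info.processed_fps)  # 15 2.0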
neus_v/vlm/__init__.py ADDED
File without changes
neus_v/vlm/internvl.py ADDED
@@ -0,0 +1,373 @@
1
+ import gc
2
+ import logging
3
+
4
+ import numpy as np
5
+ import torch
6
+ from PIL import Image
7
+ from torch.nn.functional import softmax
8
+ from transformers import AutoModel, AutoTokenizer
9
+
10
+ from neus_v.vlm.internvl_utils import (
11
+ assign_device_map,
12
+ load_image,
13
+ load_video_from_file,
14
+ load_video_from_seq_of_frames,
15
+ split_model,
16
+ )
17
+ from neus_v.vlm.obj import DetectedObject
18
+
19
+ MODEL_PATH = {
20
+ "InternVL2-40B": "HuggingFace Model",
21
+ "InternVL2-8B": "HuggingFace Model",
22
+ "InternVL2-2B": "HuggingFace Model",
23
+ }
24
+
25
+
26
+ class InternVL:
27
+ """InternVL's Vision Language Model."""
28
+
29
+ def __init__(
30
+ self,
31
+ model_name: str = "InternVL2-8B",
32
+ multi_gpus: bool = False,
33
+ device: int = 0,
34
+ ) -> None:
35
+ """Initialize the InternVL."""
36
+ logging.info(
37
+ (
38
+ "You are using the model based on HuggingFace API. "
39
+ "The model will be downloaded to the HuggingFace cache dir."
40
+ )
41
+ )
42
+ self.model_name = model_name
43
+ self._path = f"OpenGVLab/{model_name}"
44
+ self._num_gpus = torch.cuda.device_count()
45
+ self.device = device
46
+ if multi_gpus:
47
+ device_map = split_model(model_name)
48
+ else:
49
+ device_map = assign_device_map(model_name=model_name, manual_gpu_id=device)
50
+ self.model = AutoModel.from_pretrained(
51
+ self._path,
52
+ torch_dtype=torch.bfloat16,
53
+ low_cpu_mem_usage=True,
54
+ use_flash_attn=True,
55
+ trust_remote_code=True,
56
+ device_map=device_map,
57
+ ).eval()
58
+ self.model.apply(self.move_tensors_to_gpu)
59
+ self.tokenizer = AutoTokenizer.from_pretrained(self._path, trust_remote_code=True, use_fast=False)
60
+
61
+ def reset_model(self) -> None:
62
+ """Reset the model to its initial state using pretrained weights."""
63
+ self.model = AutoModel.from_pretrained(
64
+ self._path,
65
+ torch_dtype=torch.bfloat16,
66
+ low_cpu_mem_usage=True,
67
+ use_flash_attn=True,
68
+ trust_remote_code=True,
69
+ ).eval()
70
+ self.model.apply(self.move_tensors_to_gpu)
71
+
72
+ def clear_gpu_memory(self) -> None:
73
+ """Clear CUDA cache and run garbage collection to free GPU memory."""
74
+ torch.cuda.empty_cache()
75
+ if torch.cuda.is_available():
76
+ torch.cuda.ipc_collect()
77
+ gc.collect() # Run garbage collector
78
+
79
+ def move_tensors_to_gpu(
80
+ self,
81
+ module: torch.nn.Module,
82
+ ) -> None:
83
+ """Move all tensors in the module to GPU if they are on the CPU."""
84
+ for name, tensor in module.named_buffers():
85
+ if isinstance(tensor, torch.Tensor) and tensor.device.type == "cpu":
86
+ module.register_buffer(
87
+ name,
88
+ tensor.cuda(self.device),
89
+ persistent=False,
90
+ )
91
+ for _, param in module.named_parameters():
92
+ if param.device.type == "cpu":
93
+ param.data = param.data.cuda(self.device)
94
+
95
+ def infer_with_image(
96
+ self,
97
+ language: str,
98
+ image: np.ndarray | None = None,
99
+ image_path: str | None = None,
100
+ max_new_tokens: int = 1024,
101
+ do_sample: bool = True,
102
+ ) -> str:
103
+ """Perform image inference with given video inputs."""
104
+ assert ( # noqa: S101
105
+ image is not None or image_path is not None
106
+ ), "One of 'image' or 'image_path' must be defined."
107
+ if image_path:
108
+ image = Image.open(image_path).convert("RGB")
109
+ else:
110
+ image = Image.fromarray(image)
111
+ # set the max number of tiles in `max_num`
112
+ pixel_values = load_image(image, max_num=12).to(torch.bfloat16).cuda(self.device)
113
+ generation_config = {
114
+ "max_new_tokens": max_new_tokens,
115
+ "do_sample": do_sample,
116
+ }
117
+ image_prefix = "<image>\n"
118
+ language = image_prefix + language
119
+ return self.model.chat(self.tokenizer, pixel_values, language, generation_config)
120
+
121
+ def infer_with_video(
122
+ self,
123
+ language: str,
124
+ seq_of_frames: list[np.ndarray] | None = None,
125
+ video_path: str | None = None,
126
+ max_new_tokens: int = 1024,
127
+ do_sample: bool = True,
128
+ ) -> tuple[str, list]:
129
+ """Perform video inference with the given video inputs and return the response together with the chat history."""
130
+ assert ( # noqa: S101
131
+ seq_of_frames is not None or video_path is not None
132
+ ), "One of 'seq_of_frames' or 'video_path' must be defined."
133
+ generation_config = {
134
+ "max_new_tokens": max_new_tokens,
135
+ "do_sample": do_sample,
136
+ }
137
+ if video_path:
138
+ pixel_values, num_patches_list = load_video_from_file(video_path)
139
+ else:
140
+ pixel_values, num_patches_list = load_video_from_seq_of_frames(seq_of_frames=seq_of_frames)
141
+ video_prefix = "".join([f"Frame{i+1}: <image>\n" for i in range(len(num_patches_list))])
142
+ language = video_prefix + language
143
+ return self.model.chat(
144
+ self.tokenizer,
145
+ pixel_values,
146
+ language,
147
+ generation_config,
148
+ num_patches_list=num_patches_list,
149
+ history=None,
150
+ return_history=True,
151
+ )
152
+
153
+ def detect(
154
+ self,
155
+ scene_description: str,
156
+ frame_img: np.ndarray | None = None,
157
+ seq_of_frames: list[np.ndarray] | None = None,
158
+ video_path: str | None = None,
159
+ threshold: float = 0.349,
160
+ confidence_as_token_probability: bool = True,
161
+ ) -> DetectedObject:
162
+ """Detect objects in the given frame image.
163
+
164
+ Args:
165
+ scene_description (str): Description of the scene.
166
+ frame_img (np.ndarray | None): The image frame to process.
167
+ seq_of_frames (list[np.ndarray] | None):
168
+ List of video frames to process.
169
+ video_path (str | None): Path to video file to process.
170
+ threshold (float): Detection threshold.
171
+ confidence_as_token_probability (bool):
172
+ Whether to use token probabilities for confidence.
173
+
174
+ Returns:
175
+ DetectedObject: Detected objects with their details.
176
+ """
177
+ if confidence_as_token_probability:
178
+ parsing_rule = [
179
+ "You must only return a Yes or No, and not both, to any question asked. " # noqa: E501
180
+ "You must not include any other symbols, information, text, justification in your answer or repeat Yes or No multiple times.", # noqa: E501
181
+ "For example, if the question is 'Is there a cat present in the Image?', the answer must only be 'Yes' or 'No'.", # noqa: E501
182
+ ]
183
+ parsing_rule = "\n".join(parsing_rule)
184
+ prompt = rf"Is there a {scene_description} present in the image? " f"[PARSING RULE]\n:{parsing_rule}"
185
+
186
+ if seq_of_frames or video_path:
187
+ response, confidence = self.infer_with_video_confidence(
188
+ language=prompt,
189
+ seq_of_frames=seq_of_frames,
190
+ video_path=video_path,
191
+ )
192
+ else:
193
+ response, confidence = self.infer_with_image_confidence(language=prompt, image=frame_img)
194
+ # TODO: Add a check for the response to be Yes or NO or clean up response better # noqa: E501
195
+ if "yes" in response.lower():
196
+ detected = True
197
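+ # Reject low-confidence "Yes" answers: below the threshold the detection is discarded.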
+ if confidence <= threshold:
198
+ confidence = 0.0
199
+ detected = False
200
+
201
+ else:
202
+ detected = False
203
+ confidence = 0.0
204
+
205
+ return DetectedObject(
206
+ name=scene_description,
207
+ model_name=self.model_name,
208
+ confidence=round(confidence, 3),
209
+ probability=round(confidence, 3),
210
+ number_of_detection=1,
211
+ is_detected=detected,
212
+ )
213
+
214
+ def infer_with_image_confidence(
215
+ self,
216
+ language: str,
217
+ image: np.ndarray | None = None,
218
+ image_path: str | None = None,
219
+ max_new_tokens: int = 1024,
220
+ do_sample: bool = True,
221
+ ) -> tuple[str, float]:
222
+ """Perform image inference and return response with confidence score.
223
+
224
+ Args:
225
+ language (str): The input prompt or question.
226
+ image (np.ndarray | None): The input image as a numpy array.
227
+ image_path (str | None): Path to the input image file.
228
+ max_new_tokens (int): Maximum number of new tokens to generate.
229
+ do_sample (bool): Whether to use sampling for generation.
230
+
231
+ Returns:
232
+ tuple[str, float]: Generated response and confidence score.
233
+ """
234
+ if image_path:
235
+ image = Image.open(image_path).convert("RGB")
236
+ else:
237
+ image = Image.fromarray(image)
238
+ # set the max number of tiles in `max_num`
239
+ pixel_values = load_image(image, max_num=12).to(torch.bfloat16).cuda(self.device)
240
+ generation_config = {
241
+ "max_new_tokens": max_new_tokens,
242
+ "do_sample": do_sample,
243
+ }
244
+ image_prefix = "<image>\n"
245
+ language = image_prefix + language
246
+
247
+ return self.chat_with_confidence(self.tokenizer, pixel_values, language, generation_config)
248
+
249
+ def chat_with_confidence( # noqa: PLR0913
250
+ self,
251
+ tokenizer: AutoTokenizer,
252
+ pixel_values: torch.Tensor,
253
+ question: str,
254
+ generation_config: dict,
255
+ num_patches_list: list[int] | None = None,
256
+ IMG_START_TOKEN: str = "<img>", # noqa: N803, S107
257
+ IMG_END_TOKEN: str = "</img>", # noqa: N803, S107
258
+ IMG_CONTEXT_TOKEN: str = "<IMG_CONTEXT>", # noqa: N803, S107
259
+ verbose: bool = False,
260
+ ) -> tuple[str, float]:
261
+ """Generate a response with confidence score for the given input.
262
+
263
+ Args:
264
+ tokenizer: The tokenizer to use.
265
+ pixel_values: Image tensor input.
266
+ question: The input question or prompt.
267
+ generation_config: Configuration for text generation.
268
+ num_patches_list: List of number of patches for video frames.
269
+ IMG_START_TOKEN: Token to mark the start of an image.
270
+ IMG_END_TOKEN: Token to mark the end of an image.
271
+ IMG_CONTEXT_TOKEN: Token for image context.
272
+ verbose: Whether to print verbose output.
273
+
274
+ Returns:
275
+ A tuple containing the generated response and its confidence score.
276
+ """
277
+ if num_patches_list is None:
278
+ num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else []
279
+
280
+ assert pixel_values is None or len(pixel_values) == sum(num_patches_list) # noqa: S101
281
+
282
+ img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
283
+ self.model.img_context_token_id = img_context_token_id
284
+
285
+ template = self.model.conv_template
286
+ template.system_message = self.model.system_message
287
+ eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
288
+
289
+ template.append_message(template.roles[0], question)
290
+ template.append_message(template.roles[1], None)
291
+ query = template.get_prompt()
292
+
293
+ if verbose and pixel_values is not None:
294
+ image_bs = pixel_values.shape[0]
295
+ print(f"dynamic ViT batch size: {image_bs}") # noqa: T201
296
+
297
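+ # Expand each "<image>" placeholder into IMG_START + (num_image_token * num_patches) IMG_CONTEXT tokens + IMG_END.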
+ for num_patches in num_patches_list:
298
+ context_tokens = IMG_CONTEXT_TOKEN * self.model.num_image_token * num_patches
299
+ image_tokens = IMG_START_TOKEN + context_tokens + IMG_END_TOKEN
300
+ query = query.replace("<image>", image_tokens, 1)
301
+
302
+ model_inputs = tokenizer(query, return_tensors="pt")
303
+ input_ids = model_inputs["input_ids"].cuda(self.device)
304
+ attention_mask = model_inputs["attention_mask"].cuda(self.device)
305
+ generation_config["eos_token_id"] = eos_token_id
306
+ generation_config["return_dict_in_generate"] = True
307
+ generation_config["output_scores"] = True
308
+ generation_config["output_logits"] = True
309
+ generation_output = self.model.generate(
310
+ pixel_values=pixel_values,
311
+ input_ids=input_ids,
312
+ attention_mask=attention_mask,
313
+ **generation_config,
314
+ )
315
+ response = tokenizer.batch_decode(generation_output.sequences, skip_special_tokens=True)[0]
316
+ response = response.split(template.sep)[0].strip()
317
+
318
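+ # Confidence is the product of the softmax probabilities of every generated non-EOS token.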
+ logits_to_compute = np.where(generation_output.sequences[0].detach().cpu().numpy() != eos_token_id)[0]
319
+ confidence = 1.0
320
+ for logit in logits_to_compute:
321
+ token = generation_output.sequences[0, logit].item()
322
+ prob = softmax(generation_output.logits[logit], dim=-1)[0, token]
323
+ confidence = prob.item() * confidence
324
+ self.clear_gpu_memory()
325
+ return response, confidence
326
+
327
+ def infer_with_video_confidence(
328
+ self,
329
+ language: str,
330
+ seq_of_frames: list[np.ndarray] | None = None,
331
+ video_path: str | None = None,
332
+ max_new_tokens: int = 1024,
333
+ do_sample: bool = True,
334
+ ) -> tuple[str, float]:
335
+ """Perform video inference and return response with confidence score.
336
+
337
+ Args:
338
+ language (str): The input prompt or question.
339
+ seq_of_frames (list[np.ndarray] | None):
340
+ List of video frames as numpy arrays.
341
+ video_path (str | None): Path to the input video file.
342
+ max_new_tokens (int): Maximum number of new tokens to generate.
343
+ do_sample (bool): Whether to use sampling for generation.
344
+
345
+ Returns:
346
+ tuple[str, float]: Generated response and confidence score.
347
+ """
348
+ assert ( # noqa: S101
349
+ seq_of_frames is not None or video_path is not None
350
+ ), "One of 'seq_of_frames' or 'video_path' must be defined."
351
+
352
+ generation_config = {
353
+ "max_new_tokens": max_new_tokens,
354
+ "do_sample": do_sample,
355
+ }
356
+
357
+ if video_path:
358
+ pixel_values, num_patches_list = load_video_from_file(video_path, device=self.device)
359
+ else:
360
+ pixel_values, num_patches_list = load_video_from_seq_of_frames(
361
+ seq_of_frames=seq_of_frames, device=self.device
362
+ )
363
+
364
+ video_prefix = "".join([f"Frame{i+1}: <image>\n" for i in range(len(num_patches_list))])
365
+ language = video_prefix + language
366
+
367
+ return self.chat_with_confidence(
368
+ self.tokenizer,
369
+ pixel_values,
370
+ language,
371
+ generation_config,
372
+ num_patches_list=num_patches_list,
373
+ )
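+
+ # Illustrative usage sketch: assumes a CUDA GPU, downloadable OpenGVLab/InternVL2-8B weights,
+ # and a placeholder video path.
+ # vlm = InternVL(model_name="InternVL2-8B", device=0)
+ # result = vlm.detect(scene_description="a red car", video_path="demo.mp4")
+ # print(result.is_detected, result.probability)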
neus_v/vlm/internvl_utils.py ADDED
@@ -0,0 +1,241 @@
1
+ import math
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torchvision.transforms as T
6
+ from decord import VideoReader, cpu
7
+ from PIL import Image
8
+ from torchvision.transforms.functional import InterpolationMode
9
+
10
+ from neus_v.video.read_video import read_video
11
+
12
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
13
+ IMAGENET_STD = (0.229, 0.224, 0.225)
14
+
15
+
16
+ def build_transform(input_size: int) -> T.Compose:
17
+ """Builds a transformation pipeline for the given input size."""
18
+ mean, std = IMAGENET_MEAN, IMAGENET_STD
19
+ return T.Compose(
20
+ [
21
+ T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
22
+ T.Resize(
23
+ (input_size, input_size),
24
+ interpolation=InterpolationMode.BICUBIC,
25
+ ),
26
+ T.ToTensor(),
27
+ T.Normalize(mean=mean, std=std),
28
+ ]
29
+ )
30
+
31
+
32
+ def assign_device_map(model_name, manual_gpu_id=0):
33
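+ """Place every module of the model (vision tower, projector, and all LM layers) on a single, manually chosen GPU."""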
+ device_map = {}
34
+ world_size = torch.cuda.device_count()
35
+ num_layers = {
36
+ "InternVL2-1B": 24,
37
+ "InternVL2-2B": 24,
38
+ "InternVL2-4B": 32,
39
+ "InternVL2-8B": 32,
40
+ "InternVL2-26B": 48,
41
+ "InternVL2-40B": 60,
42
+ "InternVL2-Llama3-76B": 80,
43
+ }[model_name]
44
+ for layer_idx in range(num_layers):
45
+ device_map[f"language_model.model.layers.{layer_idx}"] = manual_gpu_id
46
+
47
+ device_map["vision_model"] = manual_gpu_id
48
+ device_map["mlp1"] = manual_gpu_id
49
+ device_map["language_model.model.tok_embeddings"] = manual_gpu_id
50
+ device_map["language_model.model.embed_tokens"] = manual_gpu_id
51
+ device_map["language_model.output"] = manual_gpu_id
52
+ device_map["language_model.model.norm"] = manual_gpu_id
53
+ device_map["language_model.lm_head"] = manual_gpu_id
54
+ device_map[f"language_model.model.layers.{num_layers - 1}"] = manual_gpu_id
55
+
56
+ return device_map
57
+
58
+
59
+ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
60
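+ """Return the (width, height) tile ratio whose aspect ratio is closest to the input image's, breaking ties by covered area."""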
+ best_ratio_diff = float("inf")
61
+ best_ratio = (1, 1)
62
+ area = width * height
63
+ for ratio in target_ratios:
64
+ target_aspect_ratio = ratio[0] / ratio[1]
65
+ ratio_diff = abs(aspect_ratio - target_aspect_ratio)
66
+ if ratio_diff < best_ratio_diff:
67
+ best_ratio_diff = ratio_diff
68
+ best_ratio = ratio
69
+ elif ratio_diff == best_ratio_diff:
70
+ if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
71
+ best_ratio = ratio
72
+ return best_ratio
73
+
74
+
75
+ def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
76
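+ """Resize the image to the closest supported tile grid and split it into image_size x image_size crops, optionally appending a thumbnail."""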
+ # Convert numpy array to PIL Image if needed
77
+ if isinstance(image, np.ndarray):
78
+ image = Image.fromarray(image)
79
+
80
+ orig_width, orig_height = image.size
81
+ aspect_ratio = orig_width / orig_height
82
+
83
+ # calculate the existing image aspect ratio
84
+ target_ratios = set(
85
+ (i, j)
86
+ for n in range(min_num, max_num + 1)
87
+ for i in range(1, n + 1)
88
+ for j in range(1, n + 1)
89
+ if i * j <= max_num and i * j >= min_num
90
+ )
91
+ target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
92
+
93
+ # find the closest aspect ratio to the target
94
+ target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_width, orig_height, image_size)
95
+
96
+ # calculate the target width and height
97
+ target_width = image_size * target_aspect_ratio[0]
98
+ target_height = image_size * target_aspect_ratio[1]
99
+ blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
100
+
101
+ # resize the image
102
+ resized_img = image.resize((target_width, target_height))
103
+ processed_images = []
104
+ for i in range(blocks):
105
+ box = (
106
+ (i % (target_width // image_size)) * image_size,
107
+ (i // (target_width // image_size)) * image_size,
108
+ ((i % (target_width // image_size)) + 1) * image_size,
109
+ ((i // (target_width // image_size)) + 1) * image_size,
110
+ )
111
+ # split the image
112
+ split_img = resized_img.crop(box)
113
+ processed_images.append(split_img)
114
+ assert len(processed_images) == blocks
115
+ if use_thumbnail and len(processed_images) != 1:
116
+ thumbnail_img = image.resize((image_size, image_size))
117
+ processed_images.append(thumbnail_img)
118
+ return processed_images
119
+
120
+
121
+ def load_image(image, input_size=448, max_num=12):
122
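+ """Tile a single image with dynamic_preprocess and return the stacked, transformed pixel tensor."""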
+ transform = build_transform(input_size=input_size)
123
+ images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
124
+ pixel_values = [transform(image) for image in images]
125
+ pixel_values = torch.stack(pixel_values)
126
+ return pixel_values
127
+
128
+
129
+ def split_model(model_name):
130
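+ """Spread the language-model layers over all visible GPUs, keeping the vision encoder and embeddings on GPU 0 (counted as half a GPU)."""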
+ device_map = {}
131
+ world_size = torch.cuda.device_count()
132
+ num_layers = {
133
+ "InternVL2-1B": 24,
134
+ "InternVL2-2B": 24,
135
+ "InternVL2-4B": 32,
136
+ "InternVL2-8B": 32,
137
+ "InternVL2-26B": 48,
138
+ "InternVL2-40B": 60,
139
+ "InternVL2-Llama3-76B": 80,
140
+ }[model_name]
141
+ # Since the first GPU will be used for ViT, treat it as half a GPU.
142
+ num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
143
+ num_layers_per_gpu = [num_layers_per_gpu] * world_size
144
+ num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
145
+ layer_cnt = 0
146
+ for i, num_layer in enumerate(num_layers_per_gpu):
147
+ for j in range(num_layer):
148
+ device_map[f"language_model.model.layers.{layer_cnt}"] = i
149
+ layer_cnt += 1
150
+ device_map["vision_model"] = 0
151
+ device_map["mlp1"] = 0
152
+ device_map["language_model.model.tok_embeddings"] = 0
153
+ device_map["language_model.model.embed_tokens"] = 0
154
+ device_map["language_model.output"] = 0
155
+ device_map["language_model.model.norm"] = 0
156
+ device_map["language_model.lm_head"] = 0
157
+ device_map[f"language_model.model.layers.{num_layers - 1}"] = 0
158
+
159
+ return device_map
160
+
161
+
162
+ def move_tensors_to_gpu(module):
163
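+ """Move any CPU-resident buffers and parameters of the module onto the default CUDA device."""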
+ for name, tensor in module.named_buffers():
164
+ if isinstance(tensor, torch.Tensor) and tensor.device.type == "cpu":
165
+ module.register_buffer(name, tensor.cuda(), persistent=False)
166
+ for _, param in module.named_parameters():
167
+ if param.device.type == "cpu":
168
+ param.data = param.data.cuda()
169
+
170
+
171
+ # video multi-round conversation (视频多轮对话)
172
+ def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
173
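+ """Sample num_segments frame indices, one from the middle of each equal-length segment within the given time bound."""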
+ if bound:
174
+ start, end = bound[0], bound[1]
175
+ else:
176
+ start, end = -100000, 100000
177
+ start_idx = max(first_idx, round(start * fps))
178
+ end_idx = min(round(end * fps), max_frame)
179
+ seg_size = float(end_idx - start_idx) / num_segments
180
+ frame_indices = np.array(
181
+ [int(start_idx + (seg_size / 2) + np.round(seg_size * idx)) for idx in range(num_segments)]
182
+ )
183
+ return frame_indices
184
+
185
+
186
+ def load_video_from_file(
187
+ video_path: str, input_size=448, max_num=1, device="cuda", dtype=torch.bfloat16 # Add dtype parameter
188
+ ):
189
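+ """Read a video file at roughly one frame per second, tile each frame, and return the concatenated pixel tensor plus per-frame patch counts."""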
+ video = read_video(video_path)
190
+ pixel_values_list, num_patches_list = [], []
191
+ transform = build_transform(input_size=input_size)
192
+ while True:
193
+ img: Image.Image = video.get_next_frame(
194
+ return_format="pil",
195
+ desired_interval_in_sec=1,
196
+ )
197
+ if img is None:
198
+ break # No more frames or end of video
199
+ img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
200
+ pixel_values = [transform(tile) for tile in img]
201
+ pixel_values = torch.stack(pixel_values)
202
+ num_patches_list.append(pixel_values.shape[0])
203
+ pixel_values_list.append(pixel_values.to(dtype=dtype, device=device))
204
+ return torch.cat(pixel_values_list), num_patches_list
205
+
206
+
207
+ def load_video_from_seq_of_frames(
208
+ seq_of_frames: list[np.ndarray],
209
+ input_size=448,
210
+ max_num=1,
211
+ device="cuda",
212
+ dtype=torch.bfloat16, # Add dtype parameter
213
+ ):
214
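+ """Tile an in-memory sequence of frames and return the concatenated pixel tensor (cast to dtype/device) plus per-frame patch counts."""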
+ pixel_values_list, num_patches_list = [], []
215
+ transform = build_transform(input_size=input_size)
216
+ for img in seq_of_frames:
217
+ img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
218
+ pixel_values = [transform(tile) for tile in img]
219
+ pixel_values = torch.stack(pixel_values).to(dtype=dtype, device=device) # Convert to bfloat16
220
+ num_patches_list.append(pixel_values.shape[0])
221
+ pixel_values_list.append(pixel_values)
222
+ return torch.cat(pixel_values_list), num_patches_list
223
+
224
+
225
+ def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
226
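+ """Uniformly sample num_segments frames with decord, tile them, and return bfloat16 pixel tensors plus per-frame patch counts."""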
+ vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
227
+ max_frame = len(vr) - 1
228
+ fps = float(vr.get_avg_fps())
229
+
230
+ pixel_values_list, num_patches_list = [], []
231
+ transform = build_transform(input_size=input_size)
232
+ frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
233
+ for frame_index in frame_indices:
234
+ img = Image.fromarray(vr[frame_index].asnumpy()).convert("RGB")
235
+ img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
236
+ pixel_values = [transform(tile) for tile in img]
237
+ pixel_values = torch.stack(pixel_values)
238
+ num_patches_list.append(pixel_values.shape[0])
239
+ pixel_values_list.append(pixel_values.to(torch.bfloat16))
240
+ pixel_values = torch.cat(pixel_values_list)
241
+ return pixel_values, num_patches_list
neus_v/vlm/obj.py ADDED
@@ -0,0 +1,45 @@
1
+ import dataclasses
2
+ import enum
3
+ import logging
4
+ from typing import Any
5
+
6
+
7
+ class Status(enum.Enum):
8
+ """Status Enum for the CV API."""
9
+
10
+ UNKNOWN = 0
11
+ SUCCESS = 1
12
+ RUNNING = 2
13
+ FAILURE = 3
14
+ INVALID = 4
15
+
16
+
17
+ @dataclasses.dataclass
+ class DetectedObject:
18
+ """Detected Object class."""
19
+
20
+ name: str | None
21
+ confidence: float = 0.0
22
+ probability: float = 0.0
23
+ confidence_of_all_obj: list[float] | None = dataclasses.field(default_factory=list)
24
+ probability_of_all_obj: list[float] | None = dataclasses.field(default_factory=list)
25
+ all_obj_detected: list[Any] | None = None
26
+ number_of_detection: int = 0
27
+ is_detected: bool | Status = Status.UNKNOWN
28
+ model_name: str | None = None
29
+ bounding_box_of_all_obj: list[Any] | None = None
30
+
31
+ def __post_init__(self) -> None:
32
+ """Post init."""
33
+ if self.confidence_of_all_obj is not None and len(self.confidence_of_all_obj) > 0:
34
+ self.confidence = max(self.confidence_of_all_obj)
35
+ if self.probability_of_all_obj and len(self.probability_of_all_obj) > 0:
36
+ self.probability = max(self.probability_of_all_obj)
37
+
38
+ def get_probability(self) -> float:
39
+ """Get probability."""
40
+ if self.probability > 0:
41
+ return self.probability
42
+ if self.confidence > 0 and self.probability == 0:
43
+ logging.info("Probability is not set, using confidence: %f", self.confidence)
44
+ return self.confidence
45
+ return self.probability
setup.py ADDED
@@ -0,0 +1,7 @@
1
+ from setuptools import find_packages, setup
2
+
3
+ setup(
4
+ name="NeuS-V",
5
+ version="0.1",
6
+ packages=find_packages(),
7
+ )