Commit 8d3e73e: Add refactored codebase
Files changed:
- .gitignore +168 -0
- README.md +0 -0
- evaluate.py +98 -0
- install.sh +34 -0
- neus_v/automaton/__init__.py +0 -0
- neus_v/automaton/video_automaton.py +133 -0
- neus_v/model_checking/__init__.py +0 -0
- neus_v/model_checking/proposition.py +6 -0
- neus_v/model_checking/stormpy.py +234 -0
- neus_v/model_checking/video_state.py +80 -0
- neus_v/smooth_scoring.py +20 -0
- neus_v/utils.py +10 -0
- neus_v/veval/__init__.py +0 -0
- neus_v/veval/eval.py +250 -0
- neus_v/veval/parse.py +29 -0
- neus_v/video/__init__.py +0 -0
- neus_v/video/frame.py +81 -0
- neus_v/video/read_video.py +45 -0
- neus_v/video/video.py +304 -0
- neus_v/vlm/__init__.py +0 -0
- neus_v/vlm/internvl.py +373 -0
- neus_v/vlm/internvl_utils.py +241 -0
- neus_v/vlm/obj.py +45 -0
- setup.py +7 -0
.gitignore
ADDED
@@ -0,0 +1,168 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
README.md
ADDED
File without changes
evaluate.py
ADDED
@@ -0,0 +1,98 @@
import pickle
import warnings
from pathlib import Path

import gradio as gr

from neus_v.smooth_scoring import smooth_confidence_scores
from neus_v.utils import clear_gpu_memory
from neus_v.veval.eval import evaluate_video_with_sequence_of_images
from neus_v.veval.parse import parse_proposition_set, parse_tl_specification
from neus_v.vlm.internvl import InternVL

# Suppress specific warnings
warnings.filterwarnings(
    "ignore", category=DeprecationWarning, message="Conversion of an array with ndim > 0 to a scalar is deprecated"
)

# Paths and parameters
WEIGHT_PATH = Path("/opt/mars/mnt/model_weights")
pickle_path = WEIGHT_PATH / "distributions.pkl"
num_of_frame_in_sequence = 3
model = "InternVL2-8B"
device = 7
# Load the vision-language model
vision_language_model = InternVL(model_name=model, device=device)
# Load distributions
with open(pickle_path, "rb") as f:
    distributions = pickle.load(f)
all_dimension_data = distributions.get(model).get("all_dimension")


# TODO: Make paths better for public release
def process_video(video_path, propositions, tl):
    """Process the video and compute the score_on_all."""
    proposition_set = parse_proposition_set(propositions.split(","))
    tl_spec = parse_tl_specification(tl)
    threshold = 0.349

    try:
        result = evaluate_video_with_sequence_of_images(
            vision_language_model=vision_language_model,
            confidence_as_token_probability=True,
            video_path=video_path,
            proposition_set=proposition_set,
            tl_spec=tl_spec,
            parallel_inference=False,
            num_of_frame_in_sequence=num_of_frame_in_sequence,
            threshold=threshold,
        )
        probability = result.get("probability")
        score_on_all = float(
            smooth_confidence_scores(
                target_data=[probability],
                prior_distribution=all_dimension_data,
            )
        )
        clear_gpu_memory()
        return score_on_all

    except Exception as e:
        clear_gpu_memory()
        return f"Error: {str(e)}"


# Gradio interface
def demo_interface(video, propositions, tl):
    """Wrapper for the Gradio interface."""
    return process_video(video, propositions, tl)


def main():
    # Example data from the original script
    example_video_path_1 = "/opt/mars/mnt/dataset/teaser/A_storm_bursts_in_with_intermittent_lightning_and_causes_flooding_and_large_waves_crash_in.mp4"
    example_video_path_2 = "/opt/mars/mnt/dataset/teaser/The ocean waves gently lapping at the shore, until a storm bursts in, and then lightning flashes across the sky.mp4"
    example_propositions = "waves lapping,ocean shore,storm bursts in,lightning on the sky"
    example_tl = '("waves_lapping" & "ocean_shore") U ("storm_bursts_in" U "lightning_on_the_sky")'

    demo = gr.Interface(
        fn=demo_interface,
        inputs=[
            gr.Video(label="Upload Video"),
            gr.Textbox(label="List of Propositions (comma-separated)"),
            gr.Textbox(label="Temporal Logic Specification"),
        ],
        outputs=gr.Textbox(label="Score on All"),
        title="Video Evaluation with Temporal Logic",
        description="Upload a video and provide propositions and temporal logic to evaluate the score_on_all.",
        examples=[
            [example_video_path_1, example_propositions, example_tl],
            [example_video_path_2, example_propositions, example_tl],
        ],
    )

    demo.launch(allowed_paths=["/opt/mars/mnt/dataset/teaser"])


if __name__ == "__main__":
    main()
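The Gradio app above wraps process_video, but the same entry point can be called directly. A minimal sketch with hypothetical inputs (the video path below is a placeholder); it assumes the model weights and distribution pickle loaded at module import are actually present, otherwise the function simply returns an error string:

score = process_video(
    video_path="some_video.mp4",  # hypothetical local file, not shipped with this commit
    propositions="waves lapping,ocean shore,storm bursts in,lightning on the sky",
    tl='("waves_lapping" & "ocean_shore") U ("storm_bursts_in" U "lightning_on_the_sky")',
)
print(score)  # a smoothed score in [0, 1], or an "Error: ..." string on failure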
install.sh
ADDED
@@ -0,0 +1,34 @@
#!/bin/bash
# These are the commands that I used to install the necessary packages for the project
conda install pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch -c nvidia
pip install gradio
pip install transformers
pip install decord
pip install opencv-python
pip install joblib

# Storm
sudo apt install libboost-all-dev m4

mkdir build
cd build
wget https://github.com/moves-rwth/storm/archive/stable.zip
unzip stable.zip
cd storm-stable
mkdir build
cd build
cmake ..

# Carl
cd FILLEMEUP
git clone https://github.com/moves-rwth/carl-storm
cd carl-storm
mkdir build
cd build
cmake ..
make lib_carl


pip install pycarl
pip install stormpy
neus_v/automaton/__init__.py
ADDED
File without changes
neus_v/automaton/video_automaton.py
ADDED
@@ -0,0 +1,133 @@
from neus_v.model_checking.proposition import process_proposition_set
from neus_v.model_checking.video_state import VideoState
from neus_v.video.frame import VideoFrame


class VideoAutomaton:
    """Represents a Markov Automaton for video state modeling."""

    def __init__(self, include_initial_state: bool = False) -> None:
        """Initialize the MarkovAutomaton.

        Args:
            include_initial_state (bool, optional): Whether to include
                the initial state. Defaults to False.
            proposition_set (list[str] | None, optional): List of propositions.
                Defaults to None.
        """
        self.previous_states: list[VideoState] = []
        self.states: list[VideoState] = []
        self.transitions = []
        self.include_initial_state = include_initial_state

    def set_up(self, proposition_set: list[str]) -> None:
        """Set up the MarkovAutomaton."""
        self.proposition_set = process_proposition_set(proposition_set)
        self.label_combinations = self._create_label_combinations(len(proposition_set))
        self.probability_of_propositions = [[] for _ in range(len(proposition_set))]
        self.frame_index_in_automaton = 0

        if self.include_initial_state:
            initial_state = VideoState(
                state_index=0,
                frame_index=-1,
                label="init",
                proposition_set=proposition_set,
            )
            self.previous_states = [initial_state]
            self.states = [initial_state]
            self._current_state = initial_state

    def reset(self) -> None:
        """Reset automaton."""
        self.__init__(self.include_initial_state)
        self.set_up(self.proposition_set)

    def add_frame(self, frame: VideoFrame) -> None:
        """Add frame to automaton."""
        self._get_probability_of_propositions(frame)
        current_states = []
        for prop_comb in self.label_combinations:
            # iterate through all possible combinations of T and F
            self._current_state = VideoState(
                state_index=len(self.states),
                frame_index=self.frame_index_in_automaton,
                label=prop_comb,
                proposition_set=self.proposition_set,
            )
            # TODO: Make a method for update and compute probability
            self._current_state.update(
                frame_index=self.frame_index_in_automaton,
                target_label=prop_comb,
            )
            self._current_state.compute_probability(probabilities=self.probability_of_propositions)
            if self._current_state.probability > 0:
                self.states.append(self._current_state)
                current_states.append(self._current_state)

        # Build transitions from previous states to current states
        if self.previous_states:
            for prev_state in self.previous_states:
                for cur_state in current_states:
                    transition = (
                        prev_state.state_index,
                        cur_state.state_index,
                        cur_state.probability,
                    )
                    self.transitions.append(transition)

        self.previous_states = current_states if current_states else self.previous_states
        self.frame_index_in_automaton += 1

    def add_terminal_state(self, add_with_terminal_label: bool = False) -> None:
        """Add terminal state to the automaton."""
        if add_with_terminal_label:
            terminal_state_index = len(self.states)
            terminal_state = VideoState(
                state_index=terminal_state_index,
                frame_index=self.frame_index_in_automaton,
                label="terminal",
                proposition_set=self.proposition_set,
            )
            self.states.append(terminal_state)
            self._current_state = terminal_state

            self.transitions.extend(
                (prev_state.state_index, terminal_state_index, 1.0) for prev_state in self.previous_states
            )
            self.transitions.append((terminal_state_index, terminal_state_index, 1.0))
        else:
            self.transitions.extend(
                (prev_state.state_index, prev_state.state_index, 1.0) for prev_state in self.previous_states
            )

    def _get_probability_of_propositions(self, frame: VideoFrame) -> None:
        """Update the probability of propositions."""
        for i, prop in enumerate(self.proposition_set):
            prop = prop.replace("_", " ")
            if frame.object_of_interest.get(prop):
                probability = frame.object_of_interest[prop].get_probability()
            else:
                probability = 0.0
            self.probability_of_propositions[i].append(round(probability, 2))

    def _create_label_combinations(self, num_props: int) -> list[str]:
        """Create all possible combinations of T and F for the number of propositions.

        Args:
            num_props (int): Number of propositions.

        Returns:
            list[str]: List of all possible combinations of T and F.
        """  # noqa: E501
        label_list = []

        def add_labels(num_props: int, label: str, label_list: list[str]) -> None:
            if len(label) == num_props:
                label_list.append(label)
                return
            add_labels(num_props, label + "T", label_list)
            add_labels(num_props, label + "F", label_list)

        add_labels(num_props, "", label_list)
        return label_list
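The automaton above expands every processed frame (or frame window) into one state per T/F labeling of the propositions, weighting each state by the product of the per-proposition probabilities or their complements. A standalone sketch of that expansion, mirroring add_frame plus VideoState.compute_probability without importing the package:

from itertools import product


def expand_frame(prop_probs: list[float]) -> dict[str, float]:
    """Return {label: weight} for every T/F combination of the propositions."""
    states = {}
    for combo in product("TF", repeat=len(prop_probs)):
        label = "".join(combo)
        weight = 1.0
        for ch, p in zip(label, prop_probs):
            weight *= p if ch == "T" else 1 - p  # T uses p, F uses its complement
        states[label] = round(weight, 3)
    return states


# Two propositions scored 0.9 and 0.2 in one frame:
print(expand_frame([0.9, 0.2]))  # {'TT': 0.18, 'TF': 0.72, 'FT': 0.02, 'FF': 0.08}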
neus_v/model_checking/__init__.py
ADDED
File without changes
neus_v/model_checking/proposition.py
ADDED
@@ -0,0 +1,6 @@
def process_proposition_set(proposition_set: list[str]) -> list:
    """Process proposition set."""
    new_set = []
    for proposition in proposition_set:
        new_set.append(proposition.replace(" ", "_"))
    return new_set
neus_v/model_checking/stormpy.py
ADDED
@@ -0,0 +1,234 @@
import logging
import math

import numpy as np
import stormpy
import stormpy.examples.files
from stormpy.core import ExplicitQualitativeCheckResult

from neus_v.model_checking.proposition import process_proposition_set
from neus_v.model_checking.video_state import VideoState


class StormModelChecker:
    """Model Checker using Stormpy for verifying properties."""

    def __init__(
        self,
        proposition_set: list[str],
        ltl_formula: str,
    ) -> None:
        """Initialize the StormModelChecker.

        Args:
            proposition_set: List of propositions.
            ltl_formula: LTL formula to check.
            verbose: Enable verbose output.
            is_filter: Apply filtering to results.
        """
        self.proposition_set = process_proposition_set(proposition_set)
        self.ltl_formula = ltl_formula

    def create_model(
        self,
        transitions: list[tuple[int, int, float]],
        states: list[VideoState],
        model_type: str = "sparse_ma",
    ) -> any:
        """Create model.

        Args:
            transitions (list[tuple[int, int, float]]): List of transitions.
            states (list[VideoState]): List of states.
            model_type (str): Type of model to create ("sparse_ma" or "dtmc").
            verbose (bool): Whether to print verbose output.
        """
        state_labeling = self._build_label_func(states, self.proposition_set)
        if model_type in ["sparse_ma", "mdp"]:
            transition_matrix = self._build_trans_matrix(
                transitions=transitions,
                states=states,
                model_type="nondeterministic",
            )
        else:
            transition_matrix = self._build_trans_matrix(
                transitions=transitions,
                states=states,
                model_type="deterministic",
            )
        components = stormpy.SparseModelComponents(
            transition_matrix=transition_matrix,
            state_labeling=state_labeling,
        )
        if model_type == "sparse_ma":
            markovian_states = stormpy.BitVector(len(states), list(range(len(states))))
            components.markovian_states = markovian_states
            components.exit_rates = [1.0 for _ in range(len(states))]
            model = stormpy.SparseMA(components)
        elif model_type == "dtmc":
            model = stormpy.storage.SparseDtmc(components)
        elif model_type == "mdp":
            model = stormpy.storage.SparseMdp(components)
        else:
            msg = f"Unsupported model type: {model_type}"
            raise ValueError(msg)
        return model

    def check_automaton(
        self,
        transitions: list[tuple[int, int, float]],
        states: list[VideoState],
        model_type: str = "sparse_ma",
        use_filter: bool = False,
    ) -> any:
        """Check automaton.

        Args:
            transitions: List of transitions.
            states: List of states.
            verbose: Enable verbose output.
            use_filter: Apply filtering to results.
        """
        model = self.create_model(
            transitions=transitions,
            states=states,
            model_type=model_type,
        )
        # Check the model
        # Initialize Prism Program
        path = stormpy.examples.files.prism_dtmc_die  # prism_mdp_maze
        prism_program = stormpy.parse_prism_program(path)

        # Define Properties
        properties = stormpy.parse_properties(self.ltl_formula, prism_program)

        # Get Result and Filter it
        result = stormpy.model_checking(model, properties[0])

        if use_filter:
            # The final result will only consider paths starting from the initial states of the automaton.  # noqa: E501
            filtered_result = stormpy.create_filter_initial_states_sparse(model)
            result.filter(filtered_result)
        return result

    def qualitative_result_eval(self, verification_result: ExplicitQualitativeCheckResult) -> bool:
        if isinstance(verification_result, ExplicitQualitativeCheckResult):
            # string result is "true" when is absolutely true
            # but it returns "true, false" when we have some true and false
            verification_result_str = str(verification_result)
            string_result = verification_result_str.split("{")[-1].split("}")[0]
            if len(string_result) == 4:
                if string_result[0] == "t":  # 0,6
                    result = True
            elif len(string_result) > 5:
                # "true, false" -> some true and some false
                result = True
            else:
                result = False
            return result
        msg = "Model Checking is not qualitative"
        raise ValueError(msg)

    def _build_trans_matrix(
        self,
        transitions: list[tuple[int, int, float]],
        states: list[VideoState],
        model_type: str = "nondeterministic",
    ) -> stormpy.storage.SparseMatrix:
        """Build transition matrix.

        Args:
            transitions: List of transitions.
            states: List of states.
            model_type: Type of model ("nondeterministic" or "deterministic").
        """
        if model_type not in ["nondeterministic", "deterministic"]:
            msg = "Invalid model_type. Must be 'nondeterministic' or 'deterministic'"  # noqa: E501
            raise ValueError(msg)

        if model_type == "nondeterministic":
            matrix = np.zeros((len(states), len(states)))
            for t in transitions:
                matrix[int(t[0]), int(t[1])] = float(t[2])
            trans_matrix = stormpy.build_sparse_matrix(matrix, list(range(len(states))))

        elif model_type == "deterministic":
            num_states = len(states)
            builder = stormpy.SparseMatrixBuilder(
                rows=num_states,
                columns=num_states,
                entries=len(transitions),
                force_dimensions=False,
            )
            states_with_transitions = set(src for src, _, _ in transitions)
            outgoing_probs = {i: 0.0 for i in range(num_states)}

            for src, dest, prob in transitions:
                builder.add_next_value(src, dest, prob)
                outgoing_probs[src] += prob

            for state in range(num_states):
                if state not in states_with_transitions:
                    builder.add_next_value(state, state, 1.0)
                    outgoing_probs[state] = 1.0

            # Check probabilities
            for state, prob_sum in outgoing_probs.items():
                if not math.isclose(prob_sum, 1.0, rel_tol=1e-9):
                    logging.warning(f"State {state} has outgoing probability sum of {prob_sum}, not 1.0")

            # ... (existing logging code) ...
            trans_matrix = builder.build()
        return trans_matrix

    def _build_label_func(
        self,
        states: list[VideoState],
        props: list[str],
        model_type: str = "nondeterministic",
    ) -> stormpy.storage.StateLabeling:
        """Build label function.

        Args:
            states (list[State]): List of states.
            props (list[str]): List of propositions.
            model_type (str): Type of model
                ("nondeterministic" or "deterministic").

        Returns:
            stormpy.storage.StateLabeling: State labeling.
        """
        state_labeling = stormpy.storage.StateLabeling(len(states))
        state_labeling.add_label("init")
        state_labeling.add_label("terminal")
        for label in props:
            state_labeling.add_label(label)

        if model_type == "nondeterministic":
            for state in states:
                for label in state.descriptive_label:
                    state_labeling.add_label_to_state(label, state.state_index)
        else:
            for i, state in enumerate(states):
                for prop in state.props:
                    if prop in props:
                        state_labeling.add_label_to_state(prop, i)
        return state_labeling

    def validate_tl_specification(self, ltl_formula: str) -> bool:
        """Validate LTL specification.

        Args:
            ltl_formula: LTL formula to validate.
        """
        path = stormpy.examples.files.prism_dtmc_die  # prism_mdp_maze
        prism_program = stormpy.parse_prism_program(path)
        # Define Properties
        try:
            stormpy.parse_properties(ltl_formula, prism_program)
        except Exception as e:
            msg = f"Error validating LTL specification: {e}"
            logging.exception(msg)
            return False
        else:
            return True
neus_v/model_checking/video_state.py
ADDED
@@ -0,0 +1,80 @@
class VideoState:
    """Video state class."""

    def __init__(
        self,
        state_index: int,
        frame_index: int,
        label: str,
        proposition_set: list[str],
        probability: float = 1.0,
    ) -> None:
        """State class.

        Args:
            state_index (int): state_index.
            frame_index (int): Frame index.
            label (str): Label set. Label is a string with characters T or F
                indicating True or False
            proposition_set (list[str]): Proposition set.
            probability (float): Probability of the state.
        """
        self.state_index = state_index
        self.frame_index = frame_index
        self.proposition_set = proposition_set
        self.label = label  # "init", "terminal", TTT, TFT, FTT, etc.
        self.descriptive_label = self._get_descriptive_label(label=label)
        self.probability = probability

    def __repr__(self) -> str:
        """Representation of state."""
        return f"{self.state_index} {self.descriptive_label} {self.frame_index} {self.probability}"  # noqa: E501

    def __str__(self) -> str:
        """String of state."""
        return f"{self.__repr__()}"

    def _get_descriptive_label(self, label: str) -> list:
        """Get descriptive label.

        Args:
            label (str): Label.
        """
        labels = []
        if label == "init":
            labels.append("init")
        elif label == "terminal":
            labels.append("terminal")
        else:
            for i in range(len(self.proposition_set)):
                if label[i] == "T":
                    labels.append(self.proposition_set[i])
        return labels

    def update(self, frame_index: int, target_label: str) -> None:
        """Update state to the new state.

        Args:
            frame_index (int): Frame index.
            target_label (str): Target label for the new state.
        """
        self.frame_index = frame_index
        self.label = target_label  # TTT, TFT, FTT, etc.
        self.descriptive_label = self._get_descriptive_label(label=target_label)
        self.probability = 1.0

    def compute_probability(self, probabilities: list[list[float]]) -> None:
        """Compute probability of the state given the probabilities of the propositions.

        Args:
            probabilities (list): list of probabilities of the propositions
                e.g. two propositions with three frames
                -> [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]].
        """  # noqa: E501
        probability = 1.0
        for i in range(len(self.label)):
            if self.label[i] == "T":
                probability *= probabilities[i][self.frame_index]
            else:
                probability *= 1 - probabilities[i][self.frame_index]
        self.probability = round(probability, 3)
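A quick worked example of the weighting above, assuming the neus_v package laid out in this commit is importable: with label "TF" over two propositions, the state weight is p1 * (1 - p2).

from neus_v.model_checking.video_state import VideoState

state = VideoState(state_index=1, frame_index=0, label="TF", proposition_set=["storm", "lightning"])
state.compute_probability(probabilities=[[0.9], [0.2]])
print(state.descriptive_label, state.probability)  # ['storm'] 0.72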
neus_v/smooth_scoring.py
ADDED
@@ -0,0 +1,20 @@
import numpy as np


class DataTransformer:
    def __init__(self, data):
        self.data = np.asarray(data)
        self.sorted_data = np.sort(self.data)
        self.n = len(self.sorted_data)
        self.ecdf = np.arange(1, self.n + 1) / self.n

    def mapping_function(self, x):
        x = np.asarray(x)
        return np.interp(x, self.sorted_data, self.ecdf, left=0, right=1)


def smooth_confidence_scores(target_data, prior_distribution=None):
    if prior_distribution is None:
        prior_distribution = target_data
    transformer = DataTransformer(prior_distribution)
    return transformer.mapping_function(target_data)
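smooth_confidence_scores maps raw scores through the empirical CDF of a prior score distribution, so each output is the fraction of prior scores at or below the input, clamped to 0 or 1 outside the prior's range. A self-contained sketch with made-up prior values:

import numpy as np

prior = [0.12, 0.25, 0.33, 0.41, 0.58, 0.77, 0.90]  # hypothetical prior scores
raw_scores = [0.05, 0.41, 0.95]

sorted_prior = np.sort(prior)
ecdf = np.arange(1, len(sorted_prior) + 1) / len(sorted_prior)
smoothed = np.interp(raw_scores, sorted_prior, ecdf, left=0, right=1)
print(smoothed)  # roughly [0.0, 0.571, 1.0]; 0.41 is the 4th of 7 prior values -> 4/7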
neus_v/utils.py
ADDED
@@ -0,0 +1,10 @@
import gc

import torch


def clear_gpu_memory():
    torch.cuda.empty_cache()
    if torch.cuda.is_available():
        torch.cuda.ipc_collect()
    gc.collect()
neus_v/veval/__init__.py
ADDED
File without changes
neus_v/veval/eval.py
ADDED
@@ -0,0 +1,250 @@
import sys
from pathlib import Path

import numpy as np
from joblib import Parallel, delayed

from neus_v.automaton.video_automaton import VideoAutomaton
from neus_v.model_checking.stormpy import StormModelChecker
from neus_v.veval.parse import parse_tl_formula
from neus_v.video.frame import VideoFrame
from neus_v.video.read_video import read_video


def create_frame_windows(frames: list, window_size: int) -> list[list]:
    """Create non-overlapping windows of frames, with remainder in last window.

    Args:
        frames: List of frames
        window_size: Size of each window

    Returns:
        List of frame windows
    """
    windows = []
    for i in range(0, len(frames), window_size):
        windows.append(frames[i : i + window_size])
    return windows


def evaluate_video(
    vision_language_model,
    confidence_as_token_probability: bool,
    video_path: Path | str,
    proposition_set: list,
    tl_spec: str,
    parallel_inference: bool = False,
    threshold: float = 0.1,
) -> dict:
    """Evaluate a video using the given vision language model."""
    output_log = {
        "specification": None,
        "propositions": None,
        "probability": None,
        "min_probability": None,
        "max_probability": None,
        "propositions_avg_probability": {},
    }

    if isinstance(video_path, str):
        video_path = Path(video_path)
    video = read_video(video_path=video_path)

    # TODO: if there's F in the tl_spec
    ltl_formula = parse_tl_formula(tl_spec)

    video_automaton = VideoAutomaton(include_initial_state=True)

    video_automaton.set_up(proposition_set=proposition_set)
    model_checker = StormModelChecker(
        proposition_set=proposition_set,
        ltl_formula=ltl_formula,
    )

    proposition_probability_record = {}
    for proposition in proposition_set:
        proposition_probability_record[proposition] = []
    if model_checker.validate_tl_specification(ltl_formula):
        frame_count = 0
        all_frames: list[np.ndarray] = video.get_all_frames_of_video(
            return_format="ndarray",
            desired_interval_in_sec=1,
        )
        try:
            # for frame_img in all_frames:

            def process_frame(frame_img: np.ndarray, frame_count: int):
                sys.stdout.write(f"\rProcessing frame: {frame_count+1}/{len(all_frames)} ")
                sys.stdout.flush()
                object_of_interest = {}
                for proposition in proposition_set:
                    detected_object = vision_language_model.detect(
                        frame_img=frame_img,
                        scene_description=proposition,
                        confidence_as_token_probability=confidence_as_token_probability,
                        threshold=threshold,
                    )
                    object_of_interest[proposition] = detected_object
                    # proposition_probability_record.get(proposition).append(
                    #     detected_object.probability
                    # )
                video_frame = VideoFrame(
                    frame_idx=frame_count,
                    timestamp=None,
                    frame_image=frame_img,
                    object_of_interest=object_of_interest,
                )
                return video_frame, object_of_interest

            if parallel_inference:
                results = Parallel(n_jobs=len(all_frames))(
                    delayed(process_frame)(frame_img, i) for i, frame_img in enumerate(all_frames)
                )
            else:
                results = [process_frame(frame_img, i) for i, frame_img in enumerate(all_frames)]

            for video_frame, object_of_interest in results:
                video_automaton.add_frame(frame=video_frame)
                for proposition, detected_object in object_of_interest.items():
                    proposition_probability_record[proposition].append(detected_object.probability)

            video_automaton.add_terminal_state(add_with_terminal_label=True)
            sys.stdout.write("\n")  # Move to the next line after processing all frames
            result = model_checker.check_automaton(
                states=video_automaton.states,
                transitions=video_automaton.transitions,
                model_type="dtmc",
                use_filter=True,
            )
            output_log["specification"] = tl_spec
            output_log["propositions"] = proposition_set
            output_log["probability"] = round(float(str(result)), 6)
            output_log["min_probability"] = round(float(str(result.min)), 6)
            output_log["max_probability"] = round(float(str(result.max)), 6)
            for (
                proposition,
                probabilities,
            ) in proposition_probability_record.items():
                avg_probability = sum(probabilities) / len(probabilities)
                output_log["propositions_avg_probability"][proposition] = round(avg_probability, 3)
        except Exception as e:  # noqa: BLE001
            # print(f"\nError processing frame {frame_count}: {e}")
            import traceback

            print(f"\nError processing frame {frame_count}: {e}")
            traceback.print_exc()

    return output_log


def evaluate_video_with_sequence_of_images(
    vision_language_model,
    confidence_as_token_probability: bool,
    video_path: Path | str,
    proposition_set: list,
    tl_spec: str,
    parallel_inference: bool = False,
    num_of_frame_in_sequence: int = 3,
    threshold: float = 0.1,
) -> dict:
    """Evaluate a video using the given vision language model."""
    output_log = {
        "specification": None,
        "propositions": None,
        "probability": None,
        "min_probability": None,
        "max_probability": None,
        "propositions_avg_probability": {},
    }

    if isinstance(video_path, str):
        video_path = Path(video_path)
    video = read_video(video_path=video_path)

    # TODO: if there's F in the tl_spec
    ltl_formula = parse_tl_formula(tl_spec)

    video_automaton = VideoAutomaton(include_initial_state=True)

    video_automaton.set_up(proposition_set=proposition_set)
    model_checker = StormModelChecker(
        proposition_set=proposition_set,
        ltl_formula=ltl_formula,
    )

    proposition_probability_record = {}
    for proposition in proposition_set:
        proposition_probability_record[proposition] = []
    if model_checker.validate_tl_specification(ltl_formula):
        frame_count = 0
        all_frames: list[np.ndarray] = video.get_all_frames_of_video(
            return_format="ndarray",
            desired_interval_in_sec=0.5,
        )
        try:
            # for frame_img in all_frames:
            def process_frame(sequence_of_frames: list[np.ndarray], frame_count: int):
                sys.stdout.write(f"\rProcessing frame window: {frame_count+1}/{len(frame_windows)} ")
                sys.stdout.flush()
                object_of_interest = {}
                for proposition in proposition_set:
                    detected_object = vision_language_model.detect(
                        seq_of_frames=sequence_of_frames,
                        scene_description=proposition,
                        confidence_as_token_probability=confidence_as_token_probability,
                        threshold=threshold,
                    )
                    object_of_interest[proposition] = detected_object
                    # proposition_probability_record.get(proposition).append(
                    #     detected_object.probability
                    # )
                    print(f"{proposition}: {detected_object.probability}")
                video_frame = VideoFrame(
                    frame_idx=frame_count,
                    timestamp=None,
                    frame_image=sequence_of_frames,
                    object_of_interest=object_of_interest,
                )
                return video_frame, object_of_interest

            if parallel_inference:
                frame_windows = create_frame_windows(frames=all_frames, window_size=num_of_frame_in_sequence)
                results = Parallel(n_jobs=len(frame_windows))(
                    delayed(process_frame)(frame_img, i) for i, frame_img in enumerate(frame_windows)
                )
            else:
                frame_windows = create_frame_windows(frames=all_frames, window_size=num_of_frame_in_sequence)
                results = [process_frame(sequence_of_frames, i) for i, sequence_of_frames in enumerate(frame_windows)]

            for video_frame, object_of_interest in results:
                video_automaton.add_frame(frame=video_frame)
                for proposition, detected_object in object_of_interest.items():
                    proposition_probability_record[proposition].append(detected_object.probability)

            video_automaton.add_terminal_state(add_with_terminal_label=True)
            sys.stdout.write("\n")  # Move to the next line after processing all frames
            result = model_checker.check_automaton(
                states=video_automaton.states,
                transitions=video_automaton.transitions,
                model_type="dtmc",
                use_filter=True,
            )
            output_log["specification"] = tl_spec
            output_log["propositions"] = proposition_set
            output_log["probability"] = round(float(str(result)), 6)
            output_log["min_probability"] = round(float(str(result.min)), 6)
            output_log["max_probability"] = round(float(str(result.max)), 6)
            for (
                proposition,
                probabilities,
            ) in proposition_probability_record.items():
                avg_probability = sum(probabilities) / len(probabilities)
                output_log["propositions_avg_probability"][proposition] = round(avg_probability, 3)
        except Exception as e:  # noqa: BLE001
            # print(f"\nError processing frame {frame_count}: {e}")
            import traceback

            print(f"\nError processing frame {frame_count}: {e}")
            traceback.print_exc()

    return output_log
neus_v/veval/parse.py
ADDED
@@ -0,0 +1,29 @@
def parse_tl_formula(tl_spec: str) -> str:
    """Validate the tl specification."""
    if 'G "' in tl_spec:
        tl_spec = tl_spec.replace('G "', 'F "')
    tl_spec = tl_spec.replace("-", "_")
    if tl_spec[0] == "\n":
        tl_spec = tl_spec[1:]

    if tl_spec[0] in ["F"]:
        return f"P=? [{tl_spec}]"

    if tl_spec[0] in ["G"]:
        tl_spec = tl_spec[1:]
        return f"P=? [F {tl_spec}]"

    # if any(op in tl_spec for op in ["F", "G", "U"]):
    #     return f"P=? [F ({tl_spec})]"

    return f"P=? [F {tl_spec}]"


def parse_proposition_set(proposition_set: list[str]) -> list[str]:
    """Parse the proposition set."""
    return [prop.replace("-", "_") for prop in proposition_set]


def parse_tl_specification(tl_spec: str) -> str:
    """Parse the tl specification."""
    return tl_spec.replace("-", "_")
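A small illustration of how these parsers rewrite their inputs, assuming the package is importable; the spec is the example used in evaluate.py:

from neus_v.veval.parse import parse_proposition_set, parse_tl_formula

spec = '("waves_lapping" & "ocean_shore") U ("storm_bursts_in" U "lightning_on_the_sky")'
print(parse_tl_formula(spec))
# P=? [F ("waves_lapping" & "ocean_shore") U ("storm_bursts_in" U "lightning_on_the_sky")]
print(parse_proposition_set(["storm-bursts-in"]))  # ['storm_bursts_in']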
neus_v/video/__init__.py
ADDED
File without changes
neus_v/video/frame.py
ADDED
@@ -0,0 +1,81 @@
import dataclasses
from typing import TYPE_CHECKING

import cv2

if TYPE_CHECKING:
    import numpy as np


@dataclasses.dataclass
class VideoFrame:
    """Frame class."""

    frame_idx: int
    timestamp: int | None = None
    frame_image: np.ndarray | None = None
    annotated_image: dict[str, np.ndarray] = dataclasses.field(default_factory=dict)
    detected_object_set: dict | None = None
    object_of_interest: dict | None = None
    activity_of_interest: dict | None = None

    def save_frame_img(self, save_path: str) -> None:
        """Save frame image."""
        if self.frame_image is not None:
            cv2.imwrite(
                save_path,
                self.frame_image,
            )

    def is_any_object_detected(self) -> bool:
        """Check if object is detected."""
        return len(self.detected_object_set.objects) > 0

    @property
    def list_of_detected_object_of_interest(self) -> list:
        """Get detected object."""
        detected_obj = []
        for obj_name, obj_value in self.object_of_interest.items():
            if obj_value.is_detected:
                detected_obj.append(obj_name)
        return detected_obj

    @property
    def detected_object_dict(self) -> dict:
        """Get detected object info as dict."""
        detected_obj = {}
        for obj_name, obj_value in self.object_of_interest.items():
            if obj_value.is_detected:
                detected_obj[obj_name] = {}
                detected_obj[obj_name]["total_number_of_detection"] = obj_value.number_of_detection
                detected_obj[obj_name]["maximum_probability"] = max(obj_value.probability_of_all_obj)
                detected_obj[obj_name]["minimum_probability"] = min(obj_value.probability_of_all_obj)
                detected_obj[obj_name]["maximum_confidence"] = max(obj_value.confidence_of_all_obj)
                detected_obj[obj_name]["minimum_confidence"] = min(obj_value.confidence_of_all_obj)

        return detected_obj

    def detected_bboxes(self, probability_threshold: bool = False) -> list:
        """Get detected object.

        Args:
            probability_threshold (float | None): Probability threshold.
                Defaults to None.

        Returns:
            list: Bounding boxes.
        """
        bboxes = []

        for _, obj_value in self.object_of_interest.items():
            if obj_value.is_detected:
                if probability_threshold:
                    for obj_prob in obj_value.probability_of_all_obj:
                        if obj_prob > 0:
                            bboxes += obj_value.bounding_box_of_all_obj
                else:
                    bboxes += obj_value.bounding_box_of_all_obj

        return bboxes
neus_v/video/read_video.py
ADDED
@@ -0,0 +1,45 @@
from pathlib import Path
from typing import TYPE_CHECKING

from neus_v.video.video import Video, VideoFormat

if TYPE_CHECKING:
    import numpy as np


def read_video(
    video_path: str | Path | None = None,
    sequence_of_image: list[np.ndarray] | None = None,
) -> Video:
    """Read video from video_path or sequence of images.

    Args:
        video_path (str | Path | None): Path to video file. Defaults to None.
        sequence_of_image (list[np.ndarray] | None): Sequence of images
            as numpy arrays. Defaults to None.

    Returns:
        Video: Video object.

    Raises:
        ValueError: If neither or both video_path and
            sequence_of_image are provided.
    """
    if (video_path is None) == (sequence_of_image is None):
        msg = "Exactly one of video_path or sequence_of_image must be provided."
        raise ValueError(msg)
    if video_path:
        if isinstance(video_path, str):
            video_path = Path(video_path)

        if video_path.suffix == ".mp4":
            read_format = VideoFormat.MP4

    if sequence_of_image:
        read_format = VideoFormat.LIST_OF_ARRAY

    return Video(
        video_path=video_path,
        sequence_of_image=sequence_of_image,
        read_format=read_format,
    )
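A brief sketch of read_video's contract, with a hypothetical file name: exactly one of video_path or sequence_of_image must be given, otherwise a ValueError is raised.

from neus_v.video.read_video import read_video

video = read_video(video_path="clip.mp4")  # hypothetical .mp4 path -> VideoFormat.MP4
try:
    read_video()  # neither argument provided
except ValueError as err:
    print(err)  # Exactly one of video_path or sequence_of_image must be provided.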
neus_v/video/video.py
ADDED
@@ -0,0 +1,304 @@
1 |
+
import enum
|
2 |
+
import logging
|
3 |
+
import uuid
|
4 |
+
from dataclasses import dataclass, field
|
5 |
+
from typing import TYPE_CHECKING
|
6 |
+
|
7 |
+
import cv2
|
8 |
+
from PIL import Image
|
9 |
+
|
10 |
+
if TYPE_CHECKING:
|
11 |
+
from pathlib import Path
|
12 |
+
|
13 |
+
import numpy as np
|
14 |
+
|
15 |
+
|
16 |
+
class VideoFormat(enum.Enum):
|
17 |
+
"""Status Enum for the CV API."""
|
18 |
+
|
19 |
+
MP4 = "mp4"
|
20 |
+
LIST_OF_ARRAY = "list_of_array"
|
21 |
+
|
22 |
+
|
23 |
+
@dataclass
|
24 |
+
class VideoInfo:
|
25 |
+
"""Represents information about a video file."""
|
26 |
+
|
27 |
+
format: VideoFormat
|
28 |
+
frame_width: int
|
29 |
+
frame_height: int
|
30 |
+
original_frame_count: int
|
31 |
+
video_id: uuid.UUID = field(default_factory=uuid.uuid4)
|
32 |
+
video_path: str | None = None
|
33 |
+
processed_fps: float | None = None
|
34 |
+
processed_frame_count: int = 1
|
35 |
+
original_fps: float | None = None
|
36 |
+
|
37 |
+
|
38 |
+
class Video:
|
39 |
+
"""vflow's Video Object."""
|
40 |
+
|
41 |
+
def __init__(
|
42 |
+
self,
|
43 |
+
read_format: VideoFormat,
|
44 |
+
video_path: str | Path | None = None,
|
45 |
+
sequence_of_image: list[np.ndarray] | None = None,
|
46 |
+
) -> None:
|
47 |
+
"""Video Frame Processor.
|
48 |
+
|
49 |
+
Args:
|
50 |
+
video_path (str | Path): Path to video file.
|
51 |
+
read_format (VideoFormat): Format to read the video in.
|
52 |
+
sequence_of_image (list[np.ndarray] | None): List of image arrays
|
53 |
+
for processing.
|
54 |
+
"""
|
55 |
+
self._video_path = video_path
|
56 |
+
self._read_format = read_format
|
57 |
+
self.video_info = None
|
58 |
+
if sequence_of_image:
|
59 |
+
self.all_frames = sequence_of_image
|
60 |
+
if isinstance(sequence_of_image[0], list):
|
61 |
+
self.all_frames = sequence_of_image[0]
|
62 |
+
self.import_video(str(video_path))
|
63 |
+
self.current_frame_index = 0
|
64 |
+
self.video_ended = False
|
65 |
+
|
66 |
+
def __str__(self) -> str:
|
67 |
+
"""Return a concise string representation of the Video object."""
|
68 |
+
return str(self.video_info)
|
69 |
+
|
70 |
+
def __repr__(self) -> str:
|
71 |
+
"""Return a detailed string representation of the Video object."""
|
72 |
+
return repr(self.video_info)
|
73 |
+
|
74 |
+
def import_video(self, video_path: str | None) -> None:
|
75 |
+
"""Read video from video_path.
|
76 |
+
|
77 |
+
Args:
|
78 |
+
video_path (str): Path to video file.
|
79 |
+
"""
|
80 |
+
logging.info(f"Video format: {self._read_format}")
|
81 |
+
if self._read_format == VideoFormat.MP4:
|
82 |
+
self._cap = cv2.VideoCapture(video_path)
|
83 |
+
ret, _ = self._cap.read()
|
84 |
+
if not ret:
|
85 |
+
logging.error("Video path is invalid.")
|
86 |
+
self.video_info = VideoInfo(
|
87 |
+
video_path=str(self._video_path),
|
88 |
+
format=self._read_format,
|
89 |
+
frame_width=int(self._cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
|
90 |
+
frame_height=int(self._cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
|
91 |
+
original_fps=self._cap.get(cv2.CAP_PROP_FPS),
|
92 |
+
original_frame_count=int(self._cap.get(cv2.CAP_PROP_FRAME_COUNT)),
|
93 |
+
)
|
94 |
+
elif self._read_format.LIST_OF_ARRAY:
|
95 |
+
self.video_info = VideoInfo(
|
96 |
+
format=self._read_format,
|
97 |
+
frame_width=int(self.all_frames[0].shape[0]),
|
98 |
+
frame_height=int(self.all_frames[0].shape[1]),
|
99 |
+
original_frame_count=len(self.all_frames),
|
100 |
+
)
|
101 |
+
|
102 |
+
def _resize_frame_by_scale(self, frame_img: np.ndarray, frame_scale: int) -> np.ndarray:
|
103 |
+
"""Resize frame image.
|
104 |
+
|
105 |
+
Args:
|
106 |
+
frame_img (np.ndarray): Frame image.
|
107 |
+
frame_scale (int): Scale of frame.
|
108 |
+
|
109 |
+
Returns:
|
110 |
+
np.ndarray: Resized frame image.
|
111 |
+
"""
|
112 |
+
return cv2.resize(
|
113 |
+
frame_img,
|
114 |
+
(
|
115 |
+
int(self.video_info.frame_width / frame_scale),
|
116 |
+
int(self.video_info.frame_height / frame_scale),
|
117 |
+
),
|
118 |
+
)
|
119 |
+
|
120 |
+
def get_all_frames_of_video(
|
121 |
+
self,
|
122 |
+
return_format: str = "ndarray",
|
123 |
+
frame_scale: int | None = None,
|
124 |
+
desired_fps: int | None = None,
|
125 |
+
desired_interval_in_sec: int | None = None,
|
126 |
+
) -> list:
|
127 |
+
"""Get video frames by frame_scale and second_per_frame.
|
128 |
+
|
129 |
+
Args:
|
130 |
+
return_format (str, optional): Return format. Defaults to "cv2".
|
131 |
+
Options: [cv2, ndarray]
|
132 |
+
frame_scale (int | None, optional): Frame scale. Defaults to None.
|
133 |
+
desired_fps (int | None, optional): Desired FPS. Defaults to None.
|
134 |
+
desired_interval_in_sec (int | None, optional): Interval between frames in seconds.
|
135 |
+
If provided, frames will be extracted at this interval. Defaults to None.
|
136 |
+
""" # noqa: E501
|
137 |
+
if self._read_format == VideoFormat.LIST_OF_ARRAY:
|
138 |
+
resize_func = lambda img: self.process_frame_image( # noqa: E731
|
139 |
+
frame_img=img,
|
140 |
+
frame_scale=frame_scale,
|
141 |
+
return_format=return_format,
|
142 |
+
)
|
143 |
+
all_frames = list(map(resize_func, self.all_frames))
|
144 |
+
self.processed_frame_count = len(all_frames)
|
145 |
+
return all_frames
|
146 |
+
|
147 |
+
all_frames = []
|
148 |
+
|
149 |
+
if self._read_format == VideoFormat.MP4 and desired_fps is None and desired_interval_in_sec is None:
|
150 |
+
msg = (
|
151 |
+
"Either desired_fps",
|
152 |
+
"or desired_interval_in_sec must be provided.",
|
153 |
+
)
|
154 |
+
raise ValueError(msg)
|
155 |
+
|
156 |
+
if self._read_format == VideoFormat.MP4:
|
157 |
+
frame_step = self.get_frame_step(
|
158 |
+
desired_fps=desired_fps,
|
159 |
+
desired_interval_in_sec=desired_interval_in_sec,
|
160 |
+
)
|
161 |
+
|
162 |
+
for real_frame_idx in range(0, int(self.video_info.original_frame_count), int(frame_step)):
|
163 |
+
self._cap.set(cv2.CAP_PROP_POS_FRAMES, real_frame_idx)
|
164 |
+
ret, frame_img = self._cap.read()
|
165 |
+
if not ret:
|
166 |
+
break
|
167 |
+
frame_img = cv2.cvtColor(frame_img, cv2.COLOR_BGR2RGB)
|
168 |
+
frame_img = self.process_frame_image(
|
169 |
+
frame_img=frame_img,
|
170 |
+
frame_scale=frame_scale,
|
171 |
+
return_format=return_format,
|
172 |
+
)
|
173 |
+
all_frames.append(frame_img)
|
174 |
+
self._cap.release()
|
175 |
+
# cv2.destroyAllWindows()
|
176 |
+
self.processed_frame_count = len(all_frames)
|
177 |
+
return all_frames
|
178 |
+
|
179 |
+
def get_next_frame(
|
180 |
+
self,
|
181 |
+
return_format: str = "ndarray",
|
182 |
+
frame_scale: int | None = None,
|
183 |
+
desired_fps: int | None = None,
|
184 |
+
desired_interval_in_sec: int | None = None,
|
185 |
+
) -> np.ndarray | None:
|
186 |
+
"""Get the next video frame based on frame step.
|
187 |
+
|
188 |
+
Args:
|
189 |
+
return_format (str, optional): Return format. Defaults to "ndarray".
|
190 |
+
- [cv2, ndarray, pil]
|
191 |
+
frame_scale (int | None, optional): Frame scale. Defaults to None.
|
192 |
+
desired_fps (int | None, optional): Desired FPS. Defaults to None.
|
193 |
+
desired_interval_in_sec (int | None, optional): Desired interval.
|
194 |
+
Defaults to None.
|
195 |
+
|
196 |
+
Returns:
|
197 |
+
np.ndarray | None: The next frame as an ndarray, or None if no more
|
198 |
+
frames are available or the video ended.
|
199 |
+
"""
|
200 |
+
if self._read_format == VideoFormat.MP4 and desired_fps is None and desired_interval_in_sec is None:
|
201 |
+
msg = (
|
202 |
+
"Either desired_fps or",
|
203 |
+
"desired_interval_in_sec must be provided.",
|
204 |
+
)
|
205 |
+
raise ValueError(msg)
|
206 |
+
|
207 |
+
if self.video_ended:
|
208 |
+
logging.info("No frame available.")
|
209 |
+
return None # No more frames to process
|
210 |
+
|
211 |
+
if self._read_format == VideoFormat.MP4:
|
212 |
+
frame_step = self.get_frame_step(
|
213 |
+
desired_fps=desired_fps,
|
214 |
+
desired_interval_in_sec=desired_interval_in_sec,
|
215 |
+
)
|
216 |
+
# Skip to the next frame based on frame_step
|
217 |
+
self._cap.set(cv2.CAP_PROP_POS_FRAMES, self.current_frame_index)
|
218 |
+
|
219 |
+
ret, frame_img = self._cap.read()
|
220 |
+
|
221 |
+
if not ret:
|
222 |
+
self.video_ended = True
|
223 |
+
return None # No more frames or error occurred
|
224 |
+
|
225 |
+
# Update the current frame index for the next call
|
226 |
+
self.current_frame_index += frame_step
|
227 |
+
|
228 |
+
frame_img = cv2.cvtColor(frame_img, cv2.COLOR_BGR2RGB)
|
229 |
+
|
230 |
+
if self._read_format == VideoFormat.LIST_OF_ARRAY:
|
231 |
+
if self.current_frame_index < len(self.all_frames):
|
232 |
+
frame_img = self.all_frames[self.current_frame_index]
|
233 |
+
self.current_frame_index += 1
|
234 |
+
else:
|
235 |
+
# No more frames available.
|
236 |
+
self.video_ended = True
|
237 |
+
return None
|
238 |
+
|
239 |
+
self.video_info.processed_frame_count += 1
|
240 |
+
|
241 |
+
return self.process_frame_image(
|
242 |
+
frame_img=frame_img,
|
243 |
+
frame_scale=frame_scale,
|
244 |
+
return_format=return_format,
|
245 |
+
)
|
246 |
+
|
247 |
+
def process_frame_image(
|
248 |
+
self,
|
249 |
+
frame_img: np.ndarray,
|
250 |
+
return_format: str = "ndarray",
|
251 |
+
frame_scale: int | None = None,
|
252 |
+
) -> np.ndarray:
|
253 |
+
"""Process a single frame image.
|
254 |
+
|
255 |
+
Args:
|
256 |
+
frame_img (np.ndarray): Input frame image.
|
257 |
+
return_format (str, optional): Desired return format.
|
258 |
+
Defaults to "ndarray".
|
259 |
+
frame_scale (int | None, optional): Scale factor for resizing.
|
260 |
+
Defaults to None.
|
261 |
+
|
262 |
+
Returns:
|
263 |
+
np.ndarray: Processed frame image.
|
264 |
+
"""
|
265 |
+
if frame_scale is not None:
|
266 |
+
frame_img = self._resize_frame_by_scale(frame_img, frame_scale)
|
267 |
+
if return_format == "pil":
|
268 |
+
frame_img = Image.fromarray(frame_img).convert("RGB")
|
269 |
+
return frame_img
|
270 |
+
|
271 |
+
def get_frame_step(
|
272 |
+
self,
|
273 |
+
desired_interval_in_sec: int | None = None,
|
274 |
+
desired_fps: int | None = None,
|
275 |
+
) -> int:
|
276 |
+
"""Calculate the frame step based on desired interval or FPS.
|
277 |
+
|
278 |
+
Args:
|
279 |
+
desired_interval_in_sec (int | None): Desired interval between frames in seconds.
|
280 |
+
desired_fps (int | None): Desired frames per second.
|
281 |
+
|
282 |
+
Returns:
|
283 |
+
int: Calculated frame step.
|
284 |
+
""" # noqa: E501
|
285 |
+
if desired_fps is not None:
|
286 |
+
frame_step = int(round(self.video_info.original_fps / desired_fps))
|
287 |
+
processed_fps = desired_fps
|
288 |
+
if desired_interval_in_sec is not None:
|
289 |
+
frame_step = int(round(self.video_info.original_fps * desired_interval_in_sec))
|
290 |
+
processed_fps = round(1 / desired_interval_in_sec, 2)
|
291 |
+
self.video_info.processed_fps = processed_fps
|
292 |
+
|
293 |
+
return frame_step
|
294 |
+
|
295 |
+
def insert_annotation_to_current_frame(self, annotations: list[str]) -> None:
|
296 |
+
"""Insert annotations to the current frame.
|
297 |
+
|
298 |
+
Args:
|
299 |
+
annotations (list[str]): List of annotations.
|
300 |
+
"""
|
301 |
+
|
302 |
+
def get_video_info(self) -> VideoInfo:
|
303 |
+
"""Return the VideoInfo object containing video information."""
|
304 |
+
return self.video_info
|
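A minimal usage sketch for the Video class above (not part of this commit): it drives the object through read_video, the same entry point internvl_utils.load_video_from_file uses; the file path is hypothetical.

from neus_v.video.read_video import read_video

video = read_video("demo.mp4")  # hypothetical path
print(video.get_video_info())
# With a 30 fps source, desired_interval_in_sec=1 gives frame_step == 30,
# i.e. roughly one extracted frame per second of video.
frames = video.get_all_frames_of_video(
    return_format="ndarray",
    desired_interval_in_sec=1,
)
print(f"extracted {len(frames)} frames")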
neus_v/vlm/__init__.py
ADDED
File without changes
|
neus_v/vlm/internvl.py
ADDED
@@ -0,0 +1,373 @@
|
1 |
+
import gc
|
2 |
+
import logging
|
3 |
+
|
4 |
+
import numpy as np
|
5 |
+
import torch
|
6 |
+
from PIL import Image
|
7 |
+
from torch.nn.functional import softmax
|
8 |
+
from transformers import AutoModel, AutoTokenizer
|
9 |
+
|
10 |
+
from neus_v.vlm.internvl_utils import (
|
11 |
+
assign_device_map,
|
12 |
+
load_image,
|
13 |
+
load_video_from_file,
|
14 |
+
load_video_from_seq_of_frames,
|
15 |
+
split_model,
|
16 |
+
)
|
17 |
+
from neus_v.vlm.obj import DetectedObject
|
18 |
+
|
19 |
+
MODEL_PATH = {
|
20 |
+
"InternVL2-40B": "HuggingFace Model",
|
21 |
+
"InternVL2-8B": "HuggingFace Model",
|
22 |
+
"InternVL2-2B": "HuggingFace Model",
|
23 |
+
}
|
24 |
+
|
25 |
+
|
26 |
+
class InternVL:
|
27 |
+
"""InternVL's Vision Language Model."""
|
28 |
+
|
29 |
+
def __init__(
|
30 |
+
self,
|
31 |
+
model_name: str = "InternVL2-8B",
|
32 |
+
multi_gpus: bool = False,
|
33 |
+
device: int = 0,
|
34 |
+
) -> None:
|
35 |
+
"""Initialization the InternVL."""
|
36 |
+
logging.info(
|
37 |
+
(
|
38 |
+
"You are using the model based on HuggingFace API.",
|
39 |
+
"The model will be downloaded to the HuggingFace cache dir.",
|
40 |
+
)
|
41 |
+
)
|
42 |
+
self.model_name = model_name
|
43 |
+
self._path = f"OpenGVLab/{model_name}"
|
44 |
+
self._num_gpus = torch.cuda.device_count()
|
45 |
+
self.device = device
|
46 |
+
if multi_gpus:
|
47 |
+
device_map = split_model(model_name)
|
48 |
+
else:
|
49 |
+
device_map = assign_device_map(model_name=model_name, manual_gpu_id=device)
|
50 |
+
self.model = AutoModel.from_pretrained(
|
51 |
+
self._path,
|
52 |
+
torch_dtype=torch.bfloat16,
|
53 |
+
low_cpu_mem_usage=True,
|
54 |
+
use_flash_attn=True,
|
55 |
+
trust_remote_code=True,
|
56 |
+
device_map=device_map,
|
57 |
+
).eval()
|
58 |
+
self.model.apply(self.move_tensors_to_gpu)
|
59 |
+
self.tokenizer = AutoTokenizer.from_pretrained(self._path, trust_remote_code=True, use_fast=False)
|
60 |
+
|
61 |
+
def reset_model(self) -> None:
|
62 |
+
"""Reset the model to its initial state using pretrained weights."""
|
63 |
+
self.model = AutoModel.from_pretrained(
|
64 |
+
self._path,
|
65 |
+
torch_dtype=torch.bfloat16,
|
66 |
+
low_cpu_mem_usage=True,
|
67 |
+
use_flash_attn=True,
|
68 |
+
trust_remote_code=True,
|
69 |
+
).eval()
|
70 |
+
self.model.apply(self.move_tensors_to_gpu)
|
71 |
+
|
72 |
+
def clear_gpu_memory(self) -> None:
|
73 |
+
"""Clear CUDA cache and run garbage collection to free GPU memory."""
|
74 |
+
torch.cuda.empty_cache()
|
75 |
+
if torch.cuda.is_available():
|
76 |
+
torch.cuda.ipc_collect()
|
77 |
+
gc.collect() # Run garbage collector
|
78 |
+
|
79 |
+
def move_tensors_to_gpu(
|
80 |
+
self,
|
81 |
+
module: torch.nn.Module,
|
82 |
+
) -> None:
|
83 |
+
"""Move all tensors in the module to GPU if they are on the CPU."""
|
84 |
+
for name, tensor in module.named_buffers():
|
85 |
+
if isinstance(tensor, torch.Tensor) and tensor.device.type == "cpu":
|
86 |
+
module.register_buffer(
|
87 |
+
name,
|
88 |
+
tensor.cuda(self.device),
|
89 |
+
persistent=False,
|
90 |
+
)
|
91 |
+
for _, param in module.named_parameters():
|
92 |
+
if param.device.type == "cpu":
|
93 |
+
param.data = param.data.cuda(self.device)
|
94 |
+
|
95 |
+
def infer_with_image(
|
96 |
+
self,
|
97 |
+
language: str,
|
98 |
+
image: np.ndarray | None = None,
|
99 |
+
image_path: str | None = None,
|
100 |
+
max_new_tokens: int = 1024,
|
101 |
+
do_sample: bool = True,
|
102 |
+
) -> str:
|
103 |
+
"""Perform image inference with given video inputs."""
|
104 |
+
assert ( # noqa: S101
|
105 |
+
image is not None or image_path is not None
|
106 |
+
), "One of 'image' or 'image_path' must be defined."
|
107 |
+
if image_path:
|
108 |
+
image = Image.open(image_path).convert("RGB")
|
109 |
+
else:
|
110 |
+
image = Image.fromarray(image)
|
111 |
+
# set the max number of tiles in `max_num`
|
112 |
+
pixel_values = load_image(image, max_num=12).to(torch.bfloat16).cuda(self.device)
|
113 |
+
generation_config = {
|
114 |
+
"max_new_tokens": max_new_tokens,
|
115 |
+
"do_sample": do_sample,
|
116 |
+
}
|
117 |
+
image_prefix = "<image>\n"
|
118 |
+
language = image_prefix + language
|
119 |
+
return self.model.chat(self.tokenizer, pixel_values, language, generation_config)
|
120 |
+
|
121 |
+
def infer_with_video(
|
122 |
+
self,
|
123 |
+
language: str,
|
124 |
+
seq_of_frames: list[np.ndarray] | None = None,
|
125 |
+
video_path: str | None = None,
|
126 |
+
max_new_tokens: int = 1024,
|
127 |
+
do_sample: bool = True,
|
128 |
+
) -> str:
|
129 |
+
"""Perform image inference with given video inputs."""
|
130 |
+
assert ( # noqa: S101
|
131 |
+
seq_of_frames is not None or video_path is not None
|
132 |
+
), "One of 'seq_of_frames' or 'video_path' must be defined."
|
133 |
+
generation_config = {
|
134 |
+
"max_new_tokens": max_new_tokens,
|
135 |
+
"do_sample": do_sample,
|
136 |
+
}
|
137 |
+
if video_path:
|
138 |
+
pixel_values, num_patches_list = load_video_from_file(video_path)
|
139 |
+
else:
|
140 |
+
pixel_values, num_patches_list = load_video_from_seq_of_frames(seq_of_frames=seq_of_frames)
|
141 |
+
video_prefix = "".join([f"Frame{i+1}: <image>\n" for i in range(len(num_patches_list))])
|
142 |
+
language = video_prefix + language
|
143 |
+
return self.model.chat(
|
144 |
+
self.tokenizer,
|
145 |
+
pixel_values,
|
146 |
+
language,
|
147 |
+
generation_config,
|
148 |
+
num_patches_list=num_patches_list,
|
149 |
+
history=None,
|
150 |
+
return_history=True,
|
151 |
+
)
|
152 |
+
|
153 |
+
def detect(
|
154 |
+
self,
|
155 |
+
scene_description: str,
|
156 |
+
frame_img: np.ndarray | None = None,
|
157 |
+
seq_of_frames: list[np.ndarray] | None = None,
|
158 |
+
video_path: str | None = None,
|
159 |
+
threshold: float = 0.349,
|
160 |
+
confidence_as_token_probability: bool = True,
|
161 |
+
) -> DetectedObject:
|
162 |
+
"""Detect objects in the given frame image.
|
163 |
+
|
164 |
+
Args:
|
165 |
+
frame_img (np.ndarray): The image frame to process.
|
166 |
+
scene_description (str): Description of the scene.
|
167 |
+
seq_of_frames (list[np.ndarray] | None):
|
168 |
+
List of video frames to process.
|
169 |
+
video_path (str | None): Path to video file to process.
|
170 |
+
threshold (float): Detection threshold.
|
171 |
+
confidence_as_token_probability (bool):
|
172 |
+
Whether to use token probabilities for confidence.
|
173 |
+
|
174 |
+
Returns:
|
175 |
+
DetectedObject: Detected objects with their details.
|
176 |
+
"""
|
177 |
+
if confidence_as_token_probability:
|
178 |
+
parsing_rule = [
|
179 |
+
"You must only return a Yes or No, and not both, to any question asked. " # noqa: E501
|
180 |
+
"You must not include any other symbols, information, text, justification in your answer or repeat Yes or No multiple times.", # noqa: E501
|
181 |
+
"For example, if the question is 'Is there a cat present in the Image?', the answer must only be 'Yes' or 'No'.", # noqa: E501
|
182 |
+
]
|
183 |
+
parsing_rule = "\n".join(parsing_rule)
|
184 |
+
prompt = rf"Is there a {scene_description} present in the image? " f"[PARSING RULE]\n:{parsing_rule}"
|
185 |
+
|
186 |
+
if seq_of_frames or video_path:
|
187 |
+
response, confidence = self.infer_with_video_confidence(
|
188 |
+
language=prompt,
|
189 |
+
seq_of_frames=seq_of_frames,
|
190 |
+
video_path=video_path,
|
191 |
+
)
|
192 |
+
else:
|
193 |
+
response, confidence = self.infer_with_image_confidence(language=prompt, image=frame_img)
|
194 |
+
# TODO: Add a check for the response to be Yes or NO or clean up response better # noqa: E501
|
195 |
+
if "yes" in response.lower():
|
196 |
+
detected = True
|
197 |
+
if confidence <= threshold:
|
198 |
+
confidence = 0.0
|
199 |
+
detected = False
|
200 |
+
|
201 |
+
else:
|
202 |
+
detected = False
|
203 |
+
confidence = 0.0
|
204 |
+
|
205 |
+
return DetectedObject(
|
206 |
+
name=scene_description,
|
207 |
+
model_name=self.model_name,
|
208 |
+
confidence=round(confidence, 3),
|
209 |
+
probability=round(confidence, 3),
|
210 |
+
number_of_detection=1,
|
211 |
+
is_detected=detected,
|
212 |
+
)
|
213 |
+
|
214 |
+
def infer_with_image_confidence(
|
215 |
+
self,
|
216 |
+
language: str,
|
217 |
+
image: np.ndarray | None = None,
|
218 |
+
image_path: str | None = None,
|
219 |
+
max_new_tokens: int = 1024,
|
220 |
+
do_sample: bool = True,
|
221 |
+
) -> tuple[str, float]:
|
222 |
+
"""Perform image inference and return response with confidence score.
|
223 |
+
|
224 |
+
Args:
|
225 |
+
language (str): The input prompt or question.
|
226 |
+
image (np.ndarray | None): The input image as a numpy array.
|
227 |
+
image_path (str | None): Path to the input image file.
|
228 |
+
max_new_tokens (int): Maximum number of new tokens to generate.
|
229 |
+
do_sample (bool): Whether to use sampling for generation.
|
230 |
+
|
231 |
+
Returns:
|
232 |
+
tuple[str, float]: Generated response and confidence score.
|
233 |
+
"""
|
234 |
+
if image_path:
|
235 |
+
image = Image.open(image_path).convert("RGB")
|
236 |
+
else:
|
237 |
+
image = Image.fromarray(image)
|
238 |
+
# set the max number of tiles in `max_num`
|
239 |
+
pixel_values = load_image(image, max_num=12).to(torch.bfloat16).cuda(self.device)
|
240 |
+
generation_config = {
|
241 |
+
"max_new_tokens": max_new_tokens,
|
242 |
+
"do_sample": do_sample,
|
243 |
+
}
|
244 |
+
image_prefix = "<image>\n"
|
245 |
+
language = image_prefix + language
|
246 |
+
|
247 |
+
return self.chat_with_confidence(self.tokenizer, pixel_values, language, generation_config)
|
248 |
+
|
249 |
+
def chat_with_confidence( # noqa: PLR0913
|
250 |
+
self,
|
251 |
+
tokenizer: AutoTokenizer,
|
252 |
+
pixel_values: torch.Tensor,
|
253 |
+
question: str,
|
254 |
+
generation_config: dict,
|
255 |
+
num_patches_list: list[int] | None = None,
|
256 |
+
IMG_START_TOKEN: str = "<img>", # noqa: N803, S107
|
257 |
+
IMG_END_TOKEN: str = "</img>", # noqa: N803, S107
|
258 |
+
IMG_CONTEXT_TOKEN: str = "<IMG_CONTEXT>", # noqa: N803, S107
|
259 |
+
verbose: bool = False,
|
260 |
+
) -> tuple[str, float]:
|
261 |
+
"""Generate a response with confidence score for the given input.
|
262 |
+
|
263 |
+
Args:
|
264 |
+
tokenizer: The tokenizer to use.
|
265 |
+
pixel_values: Image tensor input.
|
266 |
+
question: The input question or prompt.
|
267 |
+
generation_config: Configuration for text generation.
|
268 |
+
num_patches_list: List of number of patches for video frames.
|
269 |
+
IMG_START_TOKEN: Token to mark the start of an image.
|
270 |
+
IMG_END_TOKEN: Token to mark the end of an image.
|
271 |
+
IMG_CONTEXT_TOKEN: Token for image context.
|
272 |
+
verbose: Whether to print verbose output.
|
273 |
+
|
274 |
+
Returns:
|
275 |
+
A tuple containing the generated response and its confidence score.
|
276 |
+
"""
|
277 |
+
if num_patches_list is None:
|
278 |
+
num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else []
|
279 |
+
|
280 |
+
assert pixel_values is None or len(pixel_values) == sum(num_patches_list) # noqa: S101
|
281 |
+
|
282 |
+
img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
|
283 |
+
self.model.img_context_token_id = img_context_token_id
|
284 |
+
|
285 |
+
template = self.model.conv_template
|
286 |
+
template.system_message = self.model.system_message
|
287 |
+
eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
|
288 |
+
|
289 |
+
template.append_message(template.roles[0], question)
|
290 |
+
template.append_message(template.roles[1], None)
|
291 |
+
query = template.get_prompt()
|
292 |
+
|
293 |
+
if verbose and pixel_values is not None:
|
294 |
+
image_bs = pixel_values.shape[0]
|
295 |
+
print(f"dynamic ViT batch size: {image_bs}") # noqa: T201
|
296 |
+
|
297 |
+
for num_patches in num_patches_list:
|
298 |
+
context_tokens = IMG_CONTEXT_TOKEN * self.model.num_image_token * num_patches
|
299 |
+
image_tokens = IMG_START_TOKEN + context_tokens + IMG_END_TOKEN
|
300 |
+
query = query.replace("<image>", image_tokens, 1)
|
301 |
+
|
302 |
+
model_inputs = tokenizer(query, return_tensors="pt")
|
303 |
+
input_ids = model_inputs["input_ids"].cuda(self.device)
|
304 |
+
attention_mask = model_inputs["attention_mask"].cuda(self.device)
|
305 |
+
generation_config["eos_token_id"] = eos_token_id
|
306 |
+
generation_config["return_dict_in_generate"] = True
|
307 |
+
generation_config["output_scores"] = True
|
308 |
+
generation_config["output_logits"] = True
|
309 |
+
generation_output = self.model.generate(
|
310 |
+
pixel_values=pixel_values,
|
311 |
+
input_ids=input_ids,
|
312 |
+
attention_mask=attention_mask,
|
313 |
+
**generation_config,
|
314 |
+
)
|
315 |
+
response = tokenizer.batch_decode(generation_output.sequences, skip_special_tokens=True)[0]
|
316 |
+
response = response.split(template.sep)[0].strip()
|
317 |
+
|
318 |
+
logits_to_compute = np.where(generation_output.sequences[0].detach().cpu().numpy() != eos_token_id)[0]
|
319 |
+
confidence = 1.0
|
320 |
+
for logit in logits_to_compute:
|
321 |
+
token = generation_output.sequences[0, logit].item()
|
322 |
+
prob = softmax(generation_output.logits[logit])[0, token]
|
323 |
+
confidence = prob.item() * confidence
|
324 |
+
self.clear_gpu_memory()
|
325 |
+
return response, confidence
|
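# Explanatory note (not in the original commit): the confidence returned above
# is the product of the softmax probabilities of every generated non-EOS token,
# so it shrinks as answers get longer; this is one reason detect() constrains
# the model to a bare Yes/No answer.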
326 |
+
|
327 |
+
def infer_with_video_confidence(
|
328 |
+
self,
|
329 |
+
language: str,
|
330 |
+
seq_of_frames: list[np.ndarray] | None = None,
|
331 |
+
video_path: str | None = None,
|
332 |
+
max_new_tokens: int = 1024,
|
333 |
+
do_sample: bool = True,
|
334 |
+
) -> tuple[str, float]:
|
335 |
+
"""Perform video inference and return response with confidence score.
|
336 |
+
|
337 |
+
Args:
|
338 |
+
language (str): The input prompt or question.
|
339 |
+
seq_of_frames (list[np.ndarray] | None):
|
340 |
+
List of video frames as numpy arrays.
|
341 |
+
video_path (str | None): Path to the input video file.
|
342 |
+
max_new_tokens (int): Maximum number of new tokens to generate.
|
343 |
+
do_sample (bool): Whether to use sampling for generation.
|
344 |
+
|
345 |
+
Returns:
|
346 |
+
tuple[str, float]: Generated response and confidence score.
|
347 |
+
"""
|
348 |
+
assert ( # noqa: S101
|
349 |
+
seq_of_frames is not None or video_path is not None
|
350 |
+
), "One of 'seq_of_frames' or 'video_path' must be defined."
|
351 |
+
|
352 |
+
generation_config = {
|
353 |
+
"max_new_tokens": max_new_tokens,
|
354 |
+
"do_sample": do_sample,
|
355 |
+
}
|
356 |
+
|
357 |
+
if video_path:
|
358 |
+
pixel_values, num_patches_list = load_video_from_file(video_path, device=self.device)
|
359 |
+
else:
|
360 |
+
pixel_values, num_patches_list = load_video_from_seq_of_frames(
|
361 |
+
seq_of_frames=seq_of_frames, device=self.device
|
362 |
+
)
|
363 |
+
|
364 |
+
video_prefix = "".join([f"Frame{i+1}: <image>\n" for i in range(len(num_patches_list))])
|
365 |
+
language = video_prefix + language
|
366 |
+
|
367 |
+
return self.chat_with_confidence(
|
368 |
+
self.tokenizer,
|
369 |
+
pixel_values,
|
370 |
+
language,
|
371 |
+
generation_config,
|
372 |
+
num_patches_list=num_patches_list,
|
373 |
+
)
|
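A minimal usage sketch for the InternVL wrapper above (not part of this commit); the model name, device, and placeholder frames are assumptions for illustration only.

import numpy as np
from neus_v.vlm.internvl import InternVL

vlm = InternVL(model_name="InternVL2-8B", device=0)
frames = [np.zeros((448, 448, 3), dtype=np.uint8)]  # placeholder frames
obj = vlm.detect(
    scene_description="a red car",
    seq_of_frames=frames,
    threshold=0.349,
)
print(obj.is_detected, obj.get_probability())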
neus_v/vlm/internvl_utils.py
ADDED
@@ -0,0 +1,241 @@
|
1 |
+
import math
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
import torch
|
5 |
+
import torchvision.transforms as T
|
6 |
+
from decord import VideoReader, cpu
|
7 |
+
from PIL import Image
|
8 |
+
from torchvision.transforms.functional import InterpolationMode
|
9 |
+
|
10 |
+
from neus_v.video.read_video import read_video
|
11 |
+
|
12 |
+
IMAGENET_MEAN = (0.485, 0.456, 0.406)
|
13 |
+
IMAGENET_STD = (0.229, 0.224, 0.225)
|
14 |
+
|
15 |
+
|
16 |
+
def build_transform(input_size: int) -> T.Compose:
|
17 |
+
"""Builds a transformation pipeline for the given input size."""
|
18 |
+
mean, std = IMAGENET_MEAN, IMAGENET_STD
|
19 |
+
return T.Compose(
|
20 |
+
[
|
21 |
+
T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
|
22 |
+
T.Resize(
|
23 |
+
(input_size, input_size),
|
24 |
+
interpolation=InterpolationMode.BICUBIC,
|
25 |
+
),
|
26 |
+
T.ToTensor(),
|
27 |
+
T.Normalize(mean=mean, std=std),
|
28 |
+
]
|
29 |
+
)
|
30 |
+
|
31 |
+
|
32 |
+
def assign_device_map(model_name, manual_gpu_id=0):
|
33 |
+
device_map = {}
|
34 |
+
world_size = torch.cuda.device_count()
|
35 |
+
num_layers = {
|
36 |
+
"InternVL2-1B": 24,
|
37 |
+
"InternVL2-2B": 24,
|
38 |
+
"InternVL2-4B": 32,
|
39 |
+
"InternVL2-8B": 32,
|
40 |
+
"InternVL2-26B": 48,
|
41 |
+
"InternVL2-40B": 60,
|
42 |
+
"InternVL2-Llama3-76B": 80,
|
43 |
+
}[model_name]
|
44 |
+
for layer_idx in range(num_layers):
|
45 |
+
device_map[f"language_model.model.layers.{layer_idx}"] = manual_gpu_id
|
46 |
+
|
47 |
+
device_map["vision_model"] = manual_gpu_id
|
48 |
+
device_map["mlp1"] = manual_gpu_id
|
49 |
+
device_map["language_model.model.tok_embeddings"] = manual_gpu_id
|
50 |
+
device_map["language_model.model.embed_tokens"] = manual_gpu_id
|
51 |
+
device_map["language_model.output"] = manual_gpu_id
|
52 |
+
device_map["language_model.model.norm"] = manual_gpu_id
|
53 |
+
device_map["language_model.lm_head"] = manual_gpu_id
|
54 |
+
device_map[f"language_model.model.layers.{num_layers - 1}"] = manual_gpu_id
|
55 |
+
|
56 |
+
return device_map
|
57 |
+
|
58 |
+
|
59 |
+
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
|
60 |
+
best_ratio_diff = float("inf")
|
61 |
+
best_ratio = (1, 1)
|
62 |
+
area = width * height
|
63 |
+
for ratio in target_ratios:
|
64 |
+
target_aspect_ratio = ratio[0] / ratio[1]
|
65 |
+
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
|
66 |
+
if ratio_diff < best_ratio_diff:
|
67 |
+
best_ratio_diff = ratio_diff
|
68 |
+
best_ratio = ratio
|
69 |
+
elif ratio_diff == best_ratio_diff:
|
70 |
+
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
|
71 |
+
best_ratio = ratio
|
72 |
+
return best_ratio
|
73 |
+
|
74 |
+
|
75 |
+
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
|
76 |
+
# Convert numpy array to PIL Image if needed
|
77 |
+
if isinstance(image, np.ndarray):
|
78 |
+
image = Image.fromarray(image)
|
79 |
+
|
80 |
+
orig_width, orig_height = image.size
|
81 |
+
aspect_ratio = orig_width / orig_height
|
82 |
+
|
83 |
+
# calculate the existing image aspect ratio
|
84 |
+
target_ratios = set(
|
85 |
+
(i, j)
|
86 |
+
for n in range(min_num, max_num + 1)
|
87 |
+
for i in range(1, n + 1)
|
88 |
+
for j in range(1, n + 1)
|
89 |
+
if i * j <= max_num and i * j >= min_num
|
90 |
+
)
|
91 |
+
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
|
92 |
+
|
93 |
+
# find the closest aspect ratio to the target
|
94 |
+
target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_width, orig_height, image_size)
|
95 |
+
|
96 |
+
# calculate the target width and height
|
97 |
+
target_width = image_size * target_aspect_ratio[0]
|
98 |
+
target_height = image_size * target_aspect_ratio[1]
|
99 |
+
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
|
100 |
+
|
101 |
+
# resize the image
|
102 |
+
resized_img = image.resize((target_width, target_height))
|
103 |
+
processed_images = []
|
104 |
+
for i in range(blocks):
|
105 |
+
box = (
|
106 |
+
(i % (target_width // image_size)) * image_size,
|
107 |
+
(i // (target_width // image_size)) * image_size,
|
108 |
+
((i % (target_width // image_size)) + 1) * image_size,
|
109 |
+
((i // (target_width // image_size)) + 1) * image_size,
|
110 |
+
)
|
111 |
+
# split the image
|
112 |
+
split_img = resized_img.crop(box)
|
113 |
+
processed_images.append(split_img)
|
114 |
+
assert len(processed_images) == blocks
|
115 |
+
if use_thumbnail and len(processed_images) != 1:
|
116 |
+
thumbnail_img = image.resize((image_size, image_size))
|
117 |
+
processed_images.append(thumbnail_img)
|
118 |
+
return processed_images
|
119 |
+
|
120 |
+
|
121 |
+
def load_image(image, input_size=448, max_num=12):
|
122 |
+
transform = build_transform(input_size=input_size)
|
123 |
+
images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
|
124 |
+
pixel_values = [transform(image) for image in images]
|
125 |
+
pixel_values = torch.stack(pixel_values)
|
126 |
+
return pixel_values
|
127 |
+
|
128 |
+
|
129 |
+
def split_model(model_name):
|
130 |
+
device_map = {}
|
131 |
+
world_size = torch.cuda.device_count()
|
132 |
+
num_layers = {
|
133 |
+
"InternVL2-1B": 24,
|
134 |
+
"InternVL2-2B": 24,
|
135 |
+
"InternVL2-4B": 32,
|
136 |
+
"InternVL2-8B": 32,
|
137 |
+
"InternVL2-26B": 48,
|
138 |
+
"InternVL2-40B": 60,
|
139 |
+
"InternVL2-Llama3-76B": 80,
|
140 |
+
}[model_name]
|
141 |
+
# Since the first GPU will be used for ViT, treat it as half a GPU.
|
142 |
+
num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
|
143 |
+
num_layers_per_gpu = [num_layers_per_gpu] * world_size
|
144 |
+
num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
|
145 |
+
layer_cnt = 0
|
146 |
+
for i, num_layer in enumerate(num_layers_per_gpu):
|
147 |
+
for j in range(num_layer):
|
148 |
+
device_map[f"language_model.model.layers.{layer_cnt}"] = i
|
149 |
+
layer_cnt += 1
|
150 |
+
device_map["vision_model"] = 0
|
151 |
+
device_map["mlp1"] = 0
|
152 |
+
device_map["language_model.model.tok_embeddings"] = 0
|
153 |
+
device_map["language_model.model.embed_tokens"] = 0
|
154 |
+
device_map["language_model.output"] = 0
|
155 |
+
device_map["language_model.model.norm"] = 0
|
156 |
+
device_map["language_model.lm_head"] = 0
|
157 |
+
device_map[f"language_model.model.layers.{num_layers - 1}"] = 0
|
158 |
+
|
159 |
+
return device_map
|
160 |
+
|
161 |
+
|
162 |
+
def move_tensors_to_gpu(module):
|
163 |
+
for name, tensor in module.named_buffers():
|
164 |
+
if isinstance(tensor, torch.Tensor) and tensor.device.type == "cpu":
|
165 |
+
module.register_buffer(name, tensor.cuda(), persistent=False)
|
166 |
+
for _, param in module.named_parameters():
|
167 |
+
if param.device.type == "cpu":
|
168 |
+
param.data = param.data.cuda()
|
169 |
+
|
170 |
+
|
171 |
+
# video multi-round conversation
|
172 |
+
def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
|
173 |
+
if bound:
|
174 |
+
start, end = bound[0], bound[1]
|
175 |
+
else:
|
176 |
+
start, end = -100000, 100000
|
177 |
+
start_idx = max(first_idx, round(start * fps))
|
178 |
+
end_idx = min(round(end * fps), max_frame)
|
179 |
+
seg_size = float(end_idx - start_idx) / num_segments
|
180 |
+
frame_indices = np.array(
|
181 |
+
[int(start_idx + (seg_size / 2) + np.round(seg_size * idx)) for idx in range(num_segments)]
|
182 |
+
)
|
183 |
+
return frame_indices
|
184 |
+
|
185 |
+
|
186 |
+
def load_video_from_file(
|
187 |
+
video_path: str, input_size=448, max_num=1, device="cuda", dtype=torch.bfloat16 # Add dtype parameter
|
188 |
+
):
|
189 |
+
video = read_video(video_path)
|
190 |
+
pixel_values_list, num_patches_list = [], []
|
191 |
+
transform = build_transform(input_size=input_size)
|
192 |
+
while True:
|
193 |
+
img: np.ndarray = video.get_next_frame(
|
194 |
+
return_format="pil",
|
195 |
+
desired_interval_in_sec=1,
|
196 |
+
)
|
197 |
+
if img is None:
|
198 |
+
break # No more frames or end of video
|
199 |
+
img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
|
200 |
+
pixel_values = [transform(tile) for tile in img]
|
201 |
+
pixel_values = torch.stack(pixel_values)
|
202 |
+
num_patches_list.append(pixel_values.shape[0])
|
203 |
+
pixel_values_list.append(pixel_values.to(dtype=dtype, device=device))
|
204 |
+
return torch.cat(pixel_values_list), num_patches_list
|
205 |
+
|
206 |
+
|
207 |
+
def load_video_from_seq_of_frames(
|
208 |
+
seq_of_frames: list[np.ndarray],
|
209 |
+
input_size=448,
|
210 |
+
max_num=1,
|
211 |
+
device="cuda",
|
212 |
+
dtype=torch.bfloat16, # Add dtype parameter
|
213 |
+
):
|
214 |
+
pixel_values_list, num_patches_list = [], []
|
215 |
+
transform = build_transform(input_size=input_size)
|
216 |
+
for img in seq_of_frames:
|
217 |
+
img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
|
218 |
+
pixel_values = [transform(tile) for tile in img]
|
219 |
+
pixel_values = torch.stack(pixel_values).to(dtype=dtype, device=device) # Convert to bfloat16
|
220 |
+
num_patches_list.append(pixel_values.shape[0])
|
221 |
+
pixel_values_list.append(pixel_values)
|
222 |
+
return torch.cat(pixel_values_list), num_patches_list
|
223 |
+
|
224 |
+
|
225 |
+
def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
|
226 |
+
vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
|
227 |
+
max_frame = len(vr) - 1
|
228 |
+
fps = float(vr.get_avg_fps())
|
229 |
+
|
230 |
+
pixel_values_list, num_patches_list = [], []
|
231 |
+
transform = build_transform(input_size=input_size)
|
232 |
+
frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
|
233 |
+
for frame_index in frame_indices:
|
234 |
+
img = Image.fromarray(vr[frame_index].asnumpy()).convert("RGB")
|
235 |
+
img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
|
236 |
+
pixel_values = [transform(tile) for tile in img]
|
237 |
+
pixel_values = torch.stack(pixel_values)
|
238 |
+
num_patches_list.append(pixel_values.shape[0])
|
239 |
+
pixel_values_list.append(pixel_values.to(torch.bfloat16))
|
240 |
+
pixel_values = torch.cat(pixel_values_list)
|
241 |
+
return pixel_values, num_patches_list
|
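A small shape-check sketch for the preprocessing helpers above (not part of this commit); the frames are synthetic, and device="cpu" with float32 are assumptions for a quick local test.

import numpy as np
import torch

from neus_v.vlm.internvl_utils import load_video_from_seq_of_frames

frames = [np.zeros((720, 1280, 3), dtype=np.uint8) for _ in range(4)]
pixel_values, num_patches_list = load_video_from_seq_of_frames(
    seq_of_frames=frames,
    input_size=448,
    max_num=1,  # one 448x448 tile per frame; no thumbnail is added when only one tile exists
    device="cpu",
    dtype=torch.float32,
)
print(pixel_values.shape)  # torch.Size([4, 3, 448, 448]), i.e. (sum(num_patches_list), 3, 448, 448)
print(num_patches_list)    # [1, 1, 1, 1]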
neus_v/vlm/obj.py
ADDED
@@ -0,0 +1,45 @@
|
1 |
+
import dataclasses
|
2 |
+
import enum
|
3 |
+
import logging
|
4 |
+
from typing import Any
|
5 |
+
|
6 |
+
|
7 |
+
class Status(enum.Enum):
|
8 |
+
"""Status Enum for the CV API."""
|
9 |
+
|
10 |
+
UNKNOWN = 0
|
11 |
+
SUCCESS = 1
|
12 |
+
RUNNING = 2
|
13 |
+
FAILURE = 3
|
14 |
+
INVALID = 4
|
15 |
+
|
16 |
+
|
17 |
+
@dataclasses.dataclass
class DetectedObject:
|
18 |
+
"""Detected Object class."""
|
19 |
+
|
20 |
+
name: str | None
|
21 |
+
confidence: float = 0.0
|
22 |
+
probability: float = 0.0
|
23 |
+
confidence_of_all_obj: list[float] | None = dataclasses.field(default_factory=list)
|
24 |
+
probability_of_all_obj: list[float] | None = dataclasses.field(default_factory=list)
|
25 |
+
all_obj_detected: list[Any] | None = None
|
26 |
+
number_of_detection: int = 0
|
27 |
+
is_detected: bool | Status = Status.UNKNOWN
|
28 |
+
model_name: str | None = None
|
29 |
+
bounding_box_of_all_obj: list[Any] | None = None
|
30 |
+
|
31 |
+
def __post_init__(self) -> None:
|
32 |
+
"""Post init."""
|
33 |
+
if self.confidence_of_all_obj is not None and len(self.confidence_of_all_obj) > 0:
|
34 |
+
self.confidence = max(self.confidence_of_all_obj)
|
35 |
+
if self.probability_of_all_obj and len(self.probability_of_all_obj) > 0:
|
36 |
+
self.probability = max(self.probability_of_all_obj)
|
37 |
+
|
38 |
+
def get_probability(self) -> float:
|
39 |
+
"""Get probability."""
|
40 |
+
if self.probability > 0:
|
41 |
+
return self.probability
|
42 |
+
if self.confidence > 0 and self.probability == 0:
|
43 |
+
logging.info("Probability is not set, using confidence: %f", self.confidence)
|
44 |
+
return self.confidence
|
45 |
+
return self.probability
|
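A tiny sketch (not part of this commit) mirroring how InternVL.detect constructs its result object:

from neus_v.vlm.obj import DetectedObject

det = DetectedObject(
    name="a red car",
    model_name="InternVL2-8B",
    confidence=0.91,
    probability=0.91,
    number_of_detection=1,
    is_detected=True,
)
print(det.get_probability())  # 0.91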
setup.py
ADDED
@@ -0,0 +1,7 @@
|
1 |
+
from setuptools import find_packages, setup
|
2 |
+
|
3 |
+
setup(
|
4 |
+
name="NeuS-V",
|
5 |
+
version="0.1",
|
6 |
+
packages=find_packages(),
|
7 |
+
)
|