Update README.md
README.md
CHANGED
@@ -16,15 +16,271 @@ VideoMind is a multi-modal agent framework that enhances video reasoning by emul

## 🔖 Model Details

- **Model type:** Multi-modal Large Language Model
- **Language(s):** English
- **License:** BSD-3-Clause

## 🚀 Quick Start

### Install the environment

1. Clone the repository from GitHub.

```shell
git clone git@github.com:yeliudev/VideoMind.git
cd VideoMind
```

2. Initialize the conda environment.

```shell
conda create -n videomind python=3.11 -y
conda activate videomind
```

3. Install dependencies.

```shell
pip install -r requirements.txt
```

For NPU users, please modify [Lines 18-25](https://github.com/yeliudev/VideoMind/blob/main/requirements.txt#L18:L25) of `requirements.txt`.
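
To quickly check that the environment is ready, you can try importing the core dependencies from the repository root. This is an optional sanity check, not part of the official setup; it assumes the `videomind` conda environment is active and that the repository's `videomind` package is importable from the current directory.

```python
# Optional sanity check (assumes it is run from the VideoMind repository root
# with the `videomind` conda environment activated).
import torch

import videomind  # the in-repo package used by the inference demo below

print('torch', torch.__version__, '| CUDA available:', torch.cuda.is_available())
print('videomind imports OK')  # on NPU setups the CUDA check above reports False
```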

### Quick Inference Demo

The script below showcases how to perform inference with VideoMind's different roles. Please refer to our [GitHub Repository](https://github.com/yeliudev/VideoMind) for more details about this model.

```python
import torch

from videomind.constants import GROUNDER_PROMPT, PLANNER_PROMPT, VERIFIER_PROMPT
from videomind.dataset.utils import process_vision_info
from videomind.model.builder import build_model
from videomind.utils.io import get_duration
from videomind.utils.parser import parse_span

MODEL_PATH = 'yeliudev/VideoMind-7B'

video_path = '<path-to-video>'
question = '<question>'
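
# NOTE: the checkpoint loaded by build_model() already carries the *grounder*
# adapter, so only the *planner* and *verifier* adapters are attached below;
# the *answerer* reuses the base model with all adapters disabled (see the end
# of this script)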

# initialize role *grounder*
model, processor = build_model(MODEL_PATH)
device = next(model.parameters()).device

# initialize role *planner*
model.load_adapter(f'{MODEL_PATH}/planner', adapter_name='planner')

# initialize role *verifier*
model.load_adapter(f'{MODEL_PATH}/verifier', adapter_name='verifier')

# ==================== Planner ====================

messages = [{
    'role':
    'user',
    'content': [{
        'type': 'video',
        'video': video_path,
        'min_pixels': 36 * 28 * 28,
        'max_pixels': 64 * 28 * 28,
        'max_frames': 100,
        'fps': 1.0
    }, {
        'type': 'text',
        'text': PLANNER_PROMPT.format(question)
    }]
}]

# preprocess inputs
text = processor.apply_chat_template(messages, add_generation_prompt=True)
images, videos = process_vision_info(messages)
data = processor(text=[text], images=images, videos=videos, return_tensors='pt').to(device)

# switch adapter to *planner*
model.base_model.disable_adapter_layers()
model.base_model.enable_adapter_layers()
model.set_adapter('planner')

# run inference
output_ids = model.generate(**data, do_sample=False, temperature=None, top_p=None, top_k=None, max_new_tokens=256)

# decode output ids
output_ids = output_ids[0, data.input_ids.size(1):-1]
response = processor.decode(output_ids, clean_up_tokenization_spaces=False)

print(f'Planner Response: {response}')
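
# NOTE: the planner's response is plain text describing how to route the query
# (i.e. which roles to invoke); for simplicity, this demo runs every role below
# regardless of the plan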

# ==================== Grounder ====================

messages = [{
    'role':
    'user',
    'content': [{
        'type': 'video',
        'video': video_path,
        'min_pixels': 36 * 28 * 28,
        'max_pixels': 64 * 28 * 28,
        'max_frames': 150,
        'fps': 1.0
    }, {
        'type': 'text',
        'text': GROUNDER_PROMPT.format(question)
    }]
}]

# preprocess inputs
text = processor.apply_chat_template(messages, add_generation_prompt=True)
images, videos = process_vision_info(messages)
data = processor(text=[text], images=images, videos=videos, return_tensors='pt').to(device)

# switch adapter to *grounder*
model.base_model.disable_adapter_layers()
model.base_model.enable_adapter_layers()
model.set_adapter('grounder')

# run inference
output_ids = model.generate(**data, do_sample=False, temperature=None, top_p=None, top_k=None, max_new_tokens=256)

# decode output ids
output_ids = output_ids[0, data.input_ids.size(1):-1]
response = processor.decode(output_ids, clean_up_tokenization_spaces=False)

print(f'Grounder Response: {response}')

duration = get_duration(video_path)
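
# post-process the grounder's regressed spans: `model.reg` holds the candidate
# moments, with start/end in the first two columns (normalized to [0, 1], hence
# the scaling by `duration`) and a confidence score in the last column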

# 1. extract timestamps and confidences
blob = model.reg[0].cpu().float()
pred, conf = blob[:, :2] * duration, blob[:, -1].tolist()

# 2. clamp timestamps
pred = pred.clamp(min=0, max=duration)

# 3. sort timestamps
inds = (pred[:, 1] - pred[:, 0] < 0).nonzero()[:, 0]
pred[inds] = pred[inds].roll(1)

# 4. convert timestamps to list
pred = pred.tolist()

print(f'Grounder Regressed Timestamps: {pred}')

# ==================== Verifier ====================

# using top-5 predictions
probs = []
for cand in pred[:5]:
    s0, e0 = parse_span(cand, duration, 2)
    offset = (e0 - s0) / 2
    s1, e1 = parse_span([s0 - offset, e0 + offset], duration)

    # percentage of s0, e0 within s1, e1
    s = (s0 - s1) / (e1 - s1)
    e = (e0 - s1) / (e1 - s1)

    messages = [{
        'role':
        'user',
        'content': [{
            'type': 'video',
            'video': video_path,
            'video_start': s1,
            'video_end': e1,
            'min_pixels': 36 * 28 * 28,
            'max_pixels': 64 * 28 * 28,
            'max_frames': 64,
            'fps': 2.0
        }, {
            'type': 'text',
            'text': VERIFIER_PROMPT.format(question)
        }]
    }]

    text = processor.apply_chat_template(messages, add_generation_prompt=True)
    images, videos = process_vision_info(messages)
    data = processor(text=[text], images=images, videos=videos, return_tensors='pt')

    # ===== insert segment start/end tokens =====
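    # video_grid_thw is (temporal, height, width) in patch units; each frame
    # yields height * width / 4 visual tokens after 2x2 token merging, so
    # `window` is the number of tokens per frame, used to map the normalized
    # span (s, e) onto token positions for the segment start/end tokens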
    video_grid_thw = data['video_grid_thw'][0]
    num_frames, window = int(video_grid_thw[0]), int(video_grid_thw[1] * video_grid_thw[2] / 4)
    assert num_frames * window * 4 == data['pixel_values_videos'].size(0)

    pos_s, pos_e = round(s * num_frames), round(e * num_frames)
    pos_s, pos_e = min(max(0, pos_s), num_frames), min(max(0, pos_e), num_frames)
    assert pos_s <= pos_e, (num_frames, s, e)

    base_idx = torch.nonzero(data['input_ids'][0] == model.config.vision_start_token_id).item()
    pos_s, pos_e = pos_s * window + base_idx + 1, pos_e * window + base_idx + 2

    input_ids = data['input_ids'][0].tolist()
    input_ids.insert(pos_s, model.config.seg_s_token_id)
    input_ids.insert(pos_e, model.config.seg_e_token_id)
    data['input_ids'] = torch.LongTensor([input_ids])
    data['attention_mask'] = torch.ones_like(data['input_ids'])
    # ===========================================

    data = data.to(device)

    # switch adapter to *verifier*
    model.base_model.disable_adapter_layers()
    model.base_model.enable_adapter_layers()
    model.set_adapter('verifier')

    # run inference
    with torch.inference_mode():
        logits = model(**data).logits[0, -1].softmax(dim=-1)

    # NOTE: magic numbers here
    # In Qwen2-VL vocab: 9454 -> Yes, 2753 -> No
    score = (logits[9454] - logits[2753]).sigmoid().item()
    probs.append(score)

# sort predictions by verifier's confidence
ranks = torch.Tensor(probs).argsort(descending=True).tolist()

pred = [pred[idx] for idx in ranks]
conf = [conf[idx] for idx in ranks]

print(f'Verifier Re-ranked Timestamps: {pred}')

# ==================== Answerer ====================

# select the best candidate moment
s, e = parse_span(pred[0], duration, 32)

messages = [{
    'role':
    'user',
    'content': [{
        'type': 'video',
        'video': video_path,
        'video_start': s,
        'video_end': e,
        'min_pixels': 128 * 28 * 28,
        'max_pixels': 256 * 28 * 28,
        'max_frames': 32,
        'fps': 2.0
    }, {
        'type': 'text',
        'text': question
    }]
}]

text = processor.apply_chat_template(messages, add_generation_prompt=True)
images, videos = process_vision_info(messages)
data = processor(text=[text], images=images, videos=videos, return_tensors='pt').to(device)

# remove all adapters as *answerer* is the base model itself
with model.disable_adapter():
    output_ids = model.generate(**data, do_sample=False, temperature=None, top_p=None, top_k=None, max_new_tokens=256)

# decode output ids
output_ids = output_ids[0, data.input_ids.size(1):-1]
response = processor.decode(output_ids, clean_up_tokenization_spaces=False)

print(f'Answerer Response: {response}')
```
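
The preprocess, adapter-switch, generate, and decode steps above are identical across roles, so they can be factored into a small helper when embedding VideoMind in your own code. The function below is only an illustrative sketch built from the same calls used in the demo (`run_role` is not part of the VideoMind API); the grounder's timestamps must still be read from `model.reg` and post-processed as shown above.

```python
# Illustrative helper (not part of the VideoMind API): wraps the repeated
# preprocess -> adapter switch -> generate -> decode pattern from the demo.
from videomind.dataset.utils import process_vision_info


def run_role(model, processor, messages, adapter=None, max_new_tokens=256):
    device = next(model.parameters()).device

    # preprocess inputs exactly as in the demo above
    text = processor.apply_chat_template(messages, add_generation_prompt=True)
    images, videos = process_vision_info(messages)
    data = processor(text=[text], images=images, videos=videos, return_tensors='pt').to(device)

    if adapter is None:
        # the *answerer* is the base model with all adapters disabled
        with model.disable_adapter():
            output_ids = model.generate(**data, do_sample=False, temperature=None, top_p=None, top_k=None, max_new_tokens=max_new_tokens)
    else:
        # switch to the requested role's adapter (planner / grounder / verifier)
        model.base_model.disable_adapter_layers()
        model.base_model.enable_adapter_layers()
        model.set_adapter(adapter)
        output_ids = model.generate(**data, do_sample=False, temperature=None, top_p=None, top_k=None, max_new_tokens=max_new_tokens)

    # decode only the newly generated tokens
    output_ids = output_ids[0, data.input_ids.size(1):-1]
    return processor.decode(output_ids, clean_up_tokenization_spaces=False)


# example usage with the planner messages built above:
# response = run_role(model, processor, messages, adapter='planner')
```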

## 📖 Citation