yeliudev committed on
Commit 18a6844 · verified · 1 Parent(s): d8871ad

Update README.md

Files changed (1)
  1. README.md +260 -4
README.md CHANGED
@@ -16,15 +16,271 @@ VideoMind is a multi-modal agent framework that enhances video reasoning by emul

## 🔖 Model Details

- **Model type:** Multi-modal Large Language Model
- **Language(s):** English
- **License:** BSD-3-Clause

## 🚀 Quick Start

### Install the environment

1. Clone the repository from GitHub.

```shell
git clone git@github.com:yeliudev/VideoMind.git
cd VideoMind
```

2. Initialize the conda environment.

```shell
conda create -n videomind python=3.11 -y
conda activate videomind
```

3. Install dependencies.

```shell
pip install -r requirements.txt
```

For NPU users, please modify [Lines 18-25](https://github.com/yeliudev/VideoMind/blob/main/requirements.txt#L18:L25) of `requirements.txt`.
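
The authoritative package list is in the linked lines, so treat the snippet below only as an illustrative sketch of the kind of change involved, assuming an Ascend NPU with the `torch-npu` plugin; the exact names and versions there may differ.

```shell
# Illustrative sketch only; the actual entries are in the linked lines of
# requirements.txt, and the package names/versions here are assumptions.
pip install torch-npu  # Ascend NPU plugin for PyTorch
```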

### Quick Inference Demo

The script below showcases how to perform inference with VideoMind's different roles. Please refer to our [GitHub Repository](https://github.com/yeliudev/VideoMind) for more details about this model.

```python
import torch

from videomind.constants import GROUNDER_PROMPT, PLANNER_PROMPT, VERIFIER_PROMPT
from videomind.dataset.utils import process_vision_info
from videomind.model.builder import build_model
from videomind.utils.io import get_duration
from videomind.utils.parser import parse_span

MODEL_PATH = 'yeliudev/VideoMind-7B'

video_path = '<path-to-video>'
question = '<question>'

# initialize role *grounder*
model, processor = build_model(MODEL_PATH)
device = next(model.parameters()).device

# initialize role *planner*
model.load_adapter(f'{MODEL_PATH}/planner', adapter_name='planner')

# initialize role *verifier*
model.load_adapter(f'{MODEL_PATH}/verifier', adapter_name='verifier')

# ==================== Planner ====================

messages = [{
    'role': 'user',
    'content': [{
        'type': 'video',
        'video': video_path,
        'min_pixels': 36 * 28 * 28,
        'max_pixels': 64 * 28 * 28,
        'max_frames': 100,
        'fps': 1.0
    }, {
        'type': 'text',
        'text': PLANNER_PROMPT.format(question)
    }]
}]

# preprocess inputs
text = processor.apply_chat_template(messages, add_generation_prompt=True)
images, videos = process_vision_info(messages)
data = processor(text=[text], images=images, videos=videos, return_tensors='pt').to(device)

# switch adapter to *planner*
model.base_model.disable_adapter_layers()
model.base_model.enable_adapter_layers()
model.set_adapter('planner')

# run inference
output_ids = model.generate(**data, do_sample=False, temperature=None, top_p=None, top_k=None, max_new_tokens=256)

# decode output ids
output_ids = output_ids[0, data.input_ids.size(1):-1]
response = processor.decode(output_ids, clean_up_tokenization_spaces=False)

print(f'Planner Response: {response}')

# ==================== Grounder ====================

messages = [{
    'role': 'user',
    'content': [{
        'type': 'video',
        'video': video_path,
        'min_pixels': 36 * 28 * 28,
        'max_pixels': 64 * 28 * 28,
        'max_frames': 150,
        'fps': 1.0
    }, {
        'type': 'text',
        'text': GROUNDER_PROMPT.format(question)
    }]
}]

# preprocess inputs
text = processor.apply_chat_template(messages, add_generation_prompt=True)
images, videos = process_vision_info(messages)
data = processor(text=[text], images=images, videos=videos, return_tensors='pt').to(device)

# switch adapter to *grounder*
model.base_model.disable_adapter_layers()
model.base_model.enable_adapter_layers()
model.set_adapter('grounder')

# run inference
output_ids = model.generate(**data, do_sample=False, temperature=None, top_p=None, top_k=None, max_new_tokens=256)

# decode output ids
output_ids = output_ids[0, data.input_ids.size(1):-1]
response = processor.decode(output_ids, clean_up_tokenization_spaces=False)

print(f'Grounder Response: {response}')

duration = get_duration(video_path)

# 1. extract timestamps and confidences
blob = model.reg[0].cpu().float()
pred, conf = blob[:, :2] * duration, blob[:, -1].tolist()

# 2. clamp timestamps
pred = pred.clamp(min=0, max=duration)

# 3. sort timestamps (swap spans where end < start)
inds = (pred[:, 1] - pred[:, 0] < 0).nonzero()[:, 0]
pred[inds] = pred[inds].roll(1)

# 4. convert timestamps to list
pred = pred.tolist()

print(f'Grounder Regressed Timestamps: {pred}')

# ==================== Verifier ====================

# using top-5 predictions
probs = []
for cand in pred[:5]:
    s0, e0 = parse_span(cand, duration, 2)
    # zoom out to roughly twice the candidate span so the verifier sees surrounding context
    offset = (e0 - s0) / 2
    s1, e1 = parse_span([s0 - offset, e0 + offset], duration)

    # percentage of s0, e0 within s1, e1
    s = (s0 - s1) / (e1 - s1)
    e = (e0 - s1) / (e1 - s1)

    messages = [{
        'role': 'user',
        'content': [{
            'type': 'video',
            'video': video_path,
            'video_start': s1,
            'video_end': e1,
            'min_pixels': 36 * 28 * 28,
            'max_pixels': 64 * 28 * 28,
            'max_frames': 64,
            'fps': 2.0
        }, {
            'type': 'text',
            'text': VERIFIER_PROMPT.format(question)
        }]
    }]

    text = processor.apply_chat_template(messages, add_generation_prompt=True)
    images, videos = process_vision_info(messages)
    data = processor(text=[text], images=images, videos=videos, return_tensors='pt')

    # ===== insert segment start/end tokens =====
    # video_grid_thw = (T, H, W) in patches; each temporal position yields
    # H * W / 4 visual tokens after the 2x2 spatial merge
    video_grid_thw = data['video_grid_thw'][0]
    num_frames, window = int(video_grid_thw[0]), int(video_grid_thw[1] * video_grid_thw[2] / 4)
    assert num_frames * window * 4 == data['pixel_values_videos'].size(0)

    # map the relative boundaries (s, e) to temporal positions
    pos_s, pos_e = round(s * num_frames), round(e * num_frames)
    pos_s, pos_e = min(max(0, pos_s), num_frames), min(max(0, pos_e), num_frames)
    assert pos_s <= pos_e, (num_frames, s, e)

    # locate the vision start token, then convert temporal positions to token indices
    base_idx = torch.nonzero(data['input_ids'][0] == model.config.vision_start_token_id).item()
    pos_s, pos_e = pos_s * window + base_idx + 1, pos_e * window + base_idx + 2

    input_ids = data['input_ids'][0].tolist()
    input_ids.insert(pos_s, model.config.seg_s_token_id)
    input_ids.insert(pos_e, model.config.seg_e_token_id)
    data['input_ids'] = torch.LongTensor([input_ids])
    data['attention_mask'] = torch.ones_like(data['input_ids'])
    # ===========================================

    data = data.to(device)

    # switch adapter to *verifier*
    model.base_model.disable_adapter_layers()
    model.base_model.enable_adapter_layers()
    model.set_adapter('verifier')

    # run inference
    with torch.inference_mode():
        logits = model(**data).logits[0, -1].softmax(dim=-1)

    # NOTE: magic numbers here
    # In Qwen2-VL vocab: 9454 -> Yes, 2753 -> No
    score = (logits[9454] - logits[2753]).sigmoid().item()
    probs.append(score)

# sort predictions by verifier's confidence
ranks = torch.Tensor(probs).argsort(descending=True).tolist()

pred = [pred[idx] for idx in ranks]
conf = [conf[idx] for idx in ranks]

print(f'Verifier Re-ranked Timestamps: {pred}')

# ==================== Answerer ====================

# select the best candidate moment
s, e = parse_span(pred[0], duration, 32)

messages = [{
    'role': 'user',
    'content': [{
        'type': 'video',
        'video': video_path,
        'video_start': s,
        'video_end': e,
        'min_pixels': 128 * 28 * 28,
        'max_pixels': 256 * 28 * 28,
        'max_frames': 32,
        'fps': 2.0
    }, {
        'type': 'text',
        'text': question
    }]
}]

text = processor.apply_chat_template(messages, add_generation_prompt=True)
images, videos = process_vision_info(messages)
data = processor(text=[text], images=images, videos=videos, return_tensors='pt').to(device)

# remove all adapters as *answerer* is the base model itself
with model.disable_adapter():
    output_ids = model.generate(**data, do_sample=False, temperature=None, top_p=None, top_k=None, max_new_tokens=256)

# decode output ids
output_ids = output_ids[0, data.input_ids.size(1):-1]
response = processor.decode(output_ids, clean_up_tokenization_spaces=False)

print(f'Answerer Response: {response}')
```
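
To try the demo end-to-end, one option is to save the script above as `demo.py` in the repository root (the filename is just an example), fill in `video_path` and `question`, and run it with the environment set up above:

```shell
# assumes the `videomind` conda environment from the steps above is active
python demo.py
```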

## 📖 Citation