MBZUAI
/

ViMUL

Safetensors

llava

Model card Files Files and versions

xet

Community

k-m-irfan committed on Jun 9

Commit

cd071c7

verified ·

1 Parent(s): d48ba4d

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

README.md +108 -3

README.md CHANGED Viewed

@@ -1,3 +1,108 @@
----
-license: cc-by-sa-4.0
----

+---
+license: cc-by-sa-4.0
+---
+# Model Card: ViMUL
+## Requires
+```bash
+git clone https://github.com/LLaVA-VL/LLaVA-NeXT.git
+pip install ./LLaVA-NeXT
+```
+## Inference
+Example video inference:
+```python
+import torch
+import numpy as np
+from llava.model.builder import load_pretrained_model
+from llava.mm_utils import process_anyres_image, tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
+from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
+from llava.conversation import conv_templates, SeparatorStyle
+from transformers import AutoConfig
+from decord import VideoReader, cpu
+def load_video(video_path, num_frames=32, force_sample=False):
+    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
+    total_frame_num = len(vr)
+    fps = round(vr.get_avg_fps())
+    frame_idx = [i for i in range(0, len(vr), fps)]
+    if len(frame_idx) > num_frames or force_sample:
+        uniform_sampled_frames = np.linspace(0, total_frame_num - 1, num_frames, dtype=int)
+        frame_idx = uniform_sampled_frames.tolist()
+    frames = vr.get_batch(frame_idx).asnumpy()
+    return frames
+def infer(
+    model_path,
+    video_path,
+    prompt,
+    model_base=None,
+    conv_mode=None,
+    num_frames=32,
+    force_sample=False,
+    load_8bit=False,
+    device="cuda"
+):
+    model_name = get_model_name_from_path(model_path)+"llava_qwen"
+    tokenizer, model, image_processor, context_len = load_pretrained_model(
+        model_path, model_base, model_name, load_8bit=load_8bit
+    )
+    frames = load_video(video_path, num_frames=num_frames, force_sample=force_sample)
+    video = image_processor.preprocess(frames, return_tensors="pt")["pixel_values"].half().to(device)
+    video = [video]
+    qs = DEFAULT_IMAGE_TOKEN + "\n" + prompt
+    conv = conv_templates[conv_mode].copy() if conv_mode else conv_templates["default"].copy()
+    conv.append_message(conv.roles[0], qs)
+    conv.append_message(conv.roles[1], None)
+    prompt_str = conv.get_prompt()
+    input_ids = tokenizer_image_token(prompt_str, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
+    if tokenizer.pad_token_id is None:
+        tokenizer.pad_token_id = tokenizer.eos_token_id
+    attention_masks = input_ids.ne(tokenizer.pad_token_id).long().to(device)
+    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
+    keywords = [stop_str]
+    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
+    with torch.inference_mode():
+        output_ids = model.generate(
+            inputs=input_ids,
+            images=video,
+            attention_mask=attention_masks,
+            modalities="video",
+            do_sample=False,
+            temperature=0.0,
+            max_new_tokens=1024,
+            top_p=0.1,
+            num_beams=1,
+            use_cache=True,
+            stopping_criteria=[stopping_criteria]
+        )
+    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
+    if outputs.endswith(stop_str):
+        outputs = outputs[:-len(stop_str)]
+    return outputs.strip()
+if __name__ == "__main__":
+    model_path = "MBZUAI/ViMUL"
+    video_path = "LLaVA-NeXT/playground/demo/xU25MMA2N4aVtYay.mp4"
+    prompt = "Describe what happens in the video."
+    conv_mode = "qwen_1_5"
+    output = infer(model_path, video_path, prompt, conv_mode=conv_mode)
+    print("\n")
+    print("="*40)
+    print("Output:", output)
+    print("="*40)
+```
+## Citation
+```
+```