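"""Gradio Space that analyzes an uploaded video with openbmb/MiniCPM-V-2_6-int4.

The app samples up to MAX_NUM_FRAMES frames from the video and asks the
model to describe them, optionally guided by a user-supplied prompt.
"""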
import os
import subprocess

import gradio as gr
import spaces
import torch
from decord import VideoReader, cpu
from PIL import Image
from transformers import AutoModel, AutoTokenizer

# Install flash-attn at startup; FLASH_ATTENTION_SKIP_CUDA_BUILD avoids
# compiling CUDA kernels in an environment with no visible GPU. os.environ
# is merged in so pip still sees PATH and the rest of the environment.
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': 'TRUE'},
    shell=True,
)

# Load the int4-quantized model and tokenizer (custom modeling code from the repo)
model_name = "openbmb/MiniCPM-V-2_6-int4"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, device_map="auto")
model.eval()

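# Cap on sampled frames; the MiniCPM-V-2_6 model card's video example uses 64
# (reduce this if you run out of GPU memory on long or high-resolution videos).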
MAX_NUM_FRAMES = 64
VIDEO_EXTENSIONS = {'.mp4', '.mkv', '.mov', '.avi', '.flv', '.wmv', '.webm', '.m4v'}

def get_file_extension(filename):
    return os.path.splitext(filename)[1].lower()

def is_video(filename):
    return get_file_extension(filename) in VIDEO_EXTENSIONS

def encode_video(video_path):
    """Uniformly sample up to MAX_NUM_FRAMES frames and return them as PIL Images."""
    def uniform_sample(l, n):
        # Pick n indices spread evenly across l, centered within each interval
        gap = len(l) / n
        idxs = [int(i * gap + gap / 2) for i in range(n)]
        return [l[i] for i in idxs]

    vr = VideoReader(video_path, ctx=cpu(0))
    total_frames = len(vr)
    if total_frames <= MAX_NUM_FRAMES:
        frame_idxs = list(range(total_frames))
    else:
        frame_idxs = uniform_sample(list(range(total_frames)), MAX_NUM_FRAMES)

    frames = vr.get_batch(frame_idxs).asnumpy()
    # model.chat expects a list of PIL Images rather than a raw numpy batch
    return [Image.fromarray(frame.astype('uint8')) for frame in frames]

@spaces.GPU
def analyze_video(video, prompt):
    # gr.Video passes the function a filepath string (or None if nothing was uploaded)
    if not video or not is_video(video):
        return "Please upload a valid video file."

    frames = encode_video(video)
    question = prompt.strip() if prompt and prompt.strip() else "Describe this video."

    # MiniCPM-V takes the sampled frames and the question together as one user turn
    msgs = [{'role': 'user', 'content': frames + [question]}]

    # Decoding params recommended for video in the model card:
    # no per-image ids and fewer slices to keep memory usage down
    with torch.no_grad():
        answer = model.chat(
            image=None,
            msgs=msgs,
            tokenizer=tokenizer,
            use_image_id=False,
            max_slice_nums=2,
        )

    return answer

# Create the Gradio interface using Blocks
with gr.Blocks(title="Video Analyzer using MiniCPM-V-2.6-int4") as iface:
    gr.Markdown("# Video Analyzer using MiniCPM-V-2.6-int4")
    gr.Markdown("Upload a video to get an analysis using the MiniCPM-V-2.6-int4 model.")
    gr.Markdown("This model uses 4-bit quantization for improved efficiency. [Learn more](https://huggingface.co/openbmb/MiniCPM-V-2_6-int4)")
    
    with gr.Row():
        video_input = gr.Video(label="Upload Video")
        prompt_input = gr.Textbox(label="Prompt (optional)", placeholder="Enter a prompt to guide the analysis...")
        analysis_output = gr.Textbox(label="Video Analysis")
    
    analyze_button = gr.Button("Analyze Video")
    analyze_button.click(fn=analyze_video, inputs=[video_input, prompt_input], outputs=analysis_output)

# Launch the interface
iface.launch()