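"""Gradio demo: caption video frames with three image-captioning models.

Every 20th frame of an uploaded video is captioned with ViT-GPT2
(nlpconnect/vit-gpt2-image-captioning), FuseCap (noamrot/FuseCap), and
BLIP-Large (Salesforce/blip-image-captioning-large); the captions are
returned as Markdown-style text grouped per model.
"""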
import cv2
import gradio as gr
import torch
from PIL import Image
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from transformers import BlipProcessor, BlipForConditionalGeneration

# Load Models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model 1: ViT-GPT2
model1 = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning").to(device)
feature_extractor1 = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer1 = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Model 2: FuseCap
processor2 = BlipProcessor.from_pretrained("noamrot/FuseCap")
model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap").to(device)

# Model 3: BLIP Large
processor3 = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model3 = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)


# Frame Extraction and Captioning Logic
def process_video(video_path):
    vidObj = cv2.VideoCapture(video_path)
    count = 0
    frame_captions = {"Model 1": [], "Model 2": [], "Model 3": []}

    # Read frames sequentially until the video ends
    while True:
        success, frame = vidObj.read()
        if not success:
            break

        # Process every 20th frame
        if count % 20 == 0:
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            # Model 1: ViT-GPT2
            pixel_values = feature_extractor1(images=[image], return_tensors="pt").pixel_values.to(device)
            output_ids = model1.generate(pixel_values, max_length=16, num_beams=4)
            caption1 = tokenizer1.decode(output_ids[0], skip_special_tokens=True)
            frame_captions["Model 1"].append(caption1)

            # Model 2: FuseCap
            inputs = processor2(image, "a picture of ", return_tensors="pt").to(device)
            out2 = model2.generate(**inputs, num_beams=3)
            caption2 = processor2.decode(out2[0], skip_special_tokens=True)
            frame_captions["Model 2"].append(caption2)

            # Model 3: BLIP Large
            inputs3 = processor3(image, return_tensors="pt").to(device)
            out3 = model3.generate(**inputs3)
            caption3 = processor3.decode(out3[0], skip_special_tokens=True)
            frame_captions["Model 3"].append(caption3)

        count += 1

    vidObj.release()
    return frame_captions


# Gradio Interface
def generate_captions(video):
    print("LOG1")
    captions = process_video(video)
    print("LOG PO")
    result = ""
    for model_name, model_captions in captions.items():
        result += f"### {model_name}\n"
        result += "\n".join(f"- {caption}" for caption in model_captions)
        result += "\n\n"
    print("LOG KONIEc")
    return result



with gr.Blocks() as demo:
    gr.Markdown("# Video Captioning with Multiple Models 🎥")
    gr.Markdown("Upload a video to generate captions for its frames using three different models.")
    video_input = gr.Video(label="Upload Video")
    output = gr.Textbox(label="Generated Captions", lines=20)
    submit_button = gr.Button("Generate Captions")

    submit_button.click(
        fn=generate_captions,
        inputs=video_input,
        outputs=output,
    )

if __name__ == "__main__":
    demo.launch()
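
# To try the demo locally, run this script with Python (it assumes torch,
# transformers, opencv-python, Pillow, and gradio are installed); Gradio
# prints a local URL to open in a browser.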