import cv2
import os
import gradio as gr
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch

# Load Models
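# All three captioning models run on the GPU when one is available, otherwise on the CPU.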
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model 1: ViT-GPT2
model1 = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning").to(device)
feature_extractor1 = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer1 = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Model 2: FuseCap
processor2 = BlipProcessor.from_pretrained("noamrot/FuseCap")
model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap").to(device)

# Model 3: BLIP Large
processor3 = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model3 = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)


# Frame Extraction and Captioning Logic
def process_video(video_path):
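    """Sample every 20th frame of the video and caption each sampled frame with all
    three models. Returns a dict mapping each model's name to its list of captions."""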
    vidObj = cv2.VideoCapture(video_path)
    count = 0
    success = True
    frame_captions = {"Model 1": [], "Model 2": [], "Model 3": []}

    while success:
        success, frame = vidObj.read()

        if not success:
            break

        # Process every 20th frame
        if count % 20 == 0:
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            # Model 1: ViT-GPT2
            pixel_values = feature_extractor1(images=[image], return_tensors="pt").pixel_values.to(device)
            output_ids = model1.generate(pixel_values, max_length=16, num_beams=4)
            caption1 = tokenizer1.decode(output_ids[0], skip_special_tokens=True)
            frame_captions["Model 1"].append(caption1)

            # Model 2: FuseCap
            inputs = processor2(image, "a picture of ", return_tensors="pt").to(device)
            out2 = model2.generate(**inputs, num_beams=3)
            caption2 = processor2.decode(out2[0], skip_special_tokens=True)
            frame_captions["Model 2"].append(caption2)

            # Model 3: BLIP Large
            inputs3 = processor3(image, return_tensors="pt").to(device)
            out3 = model3.generate(**inputs3)
            caption3 = processor3.decode(out3[0], skip_special_tokens=True)
            frame_captions["Model 3"].append(caption3)

        count += 1

    vidObj.release()
    return frame_captions


# Gradio Interface
def generate_captions(video):
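    """Gradio callback: caption the uploaded video with all three models and return
    the results as Markdown-formatted text, one section per model."""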
    # Gradio passes the uploaded video as a path to a temporary file
    print(f"Processing video: {video}")

    # Process video and get captions
    captions = process_video(video)

    # Clean up temporary file
    os.remove(video)

    # Format output for display
    result = ""
    for model_name, model_captions in captions.items():
        result += f"### {model_name}\n"
        result += "\n".join(f"- {caption}" for caption in model_captions)
        result += "\n\n"

    return result


# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# Video Captioning with Multiple Models 🎥")
    gr.Markdown("Upload a video to generate captions for its frames using three different models.")
    video_input = gr.Video(label="Upload Video")
    output = gr.Markdown(label="Generated Captions")
    submit_button = gr.Button("Generate Captions")

    submit_button.click(
        fn=generate_captions,
        inputs=video_input,
        outputs=output,
    )

if __name__ == "__main__":
    demo.launch()