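"""Gradio demo: caption video frames with three image-captioning models.

Every 20th frame of an uploaded video is captioned with ViT-GPT2
(nlpconnect/vit-gpt2-image-captioning), FuseCap (noamrot/FuseCap), and
BLIP-Large (Salesforce/blip-image-captioning-large); the captions are
returned as Markdown-style text grouped per model.
"""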
import cv2
import gradio as gr
import torch
from PIL import Image
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from transformers import BlipProcessor, BlipForConditionalGeneration

# Load Models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model 1: ViT-GPT2
model1 = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning").to(device)
feature_extractor1 = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer1 = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Model 2: FuseCap
processor2 = BlipProcessor.from_pretrained("noamrot/FuseCap")
model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap").to(device)

# Model 3: BLIP Large
processor3 = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model3 = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)


# Frame Extraction and Captioning Logic
def process_video(video_path):
    vidObj = cv2.VideoCapture(video_path)
    count = 0
    frame_captions = {"Model 1": [], "Model 2": [], "Model 3": []}

    # Read frames sequentially until the video ends
    while True:
        success, frame = vidObj.read()
        if not success:
            break

        # Process every 20th frame
        if count % 20 == 0:
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            # Model 1: ViT-GPT2
            pixel_values = feature_extractor1(images=[image], return_tensors="pt").pixel_values.to(device)
            output_ids = model1.generate(pixel_values, max_length=16, num_beams=4)
            caption1 = tokenizer1.decode(output_ids[0], skip_special_tokens=True)
            frame_captions["Model 1"].append(caption1)

            # Model 2: FuseCap
            inputs = processor2(image, "a picture of ", return_tensors="pt").to(device)
            out2 = model2.generate(**inputs, num_beams=3)
            caption2 = processor2.decode(out2[0], skip_special_tokens=True)
            frame_captions["Model 2"].append(caption2)

            # Model 3: BLIP Large
            inputs3 = processor3(image, return_tensors="pt").to(device)
            out3 = model3.generate(**inputs3)
            caption3 = processor3.decode(out3[0], skip_special_tokens=True)
            frame_captions["Model 3"].append(caption3)

        count += 1

    vidObj.release()
    return frame_captions


# Gradio Interface
def generate_captions(video):
    print("LOG1")
    captions = process_video(video)
    print("LOG PO")
    result = ""
    for model_name, model_captions in captions.items():
        result += f"### {model_name}\n"
        result += "\n".join(f"- {caption}" for caption in model_captions)
        result += "\n\n"
    print("LOG KONIEc")
    return result



with gr.Blocks() as demo:
    gr.Markdown("# Video Captioning with Multiple Models 🎥")
    gr.Markdown("Upload a video to generate captions for its frames using three different models.")
    video_input = gr.Video(label="Upload Video")
    output = gr.Textbox(label="Generated Captions", lines=20)
    submit_button = gr.Button("Generate Captions")

    submit_button.click(
        fn=generate_captions,
        inputs=video_input,
        outputs=output,
    )

if __name__ == "__main__":
    demo.launch()
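
# To try the demo locally, run this script with Python (it assumes torch,
# transformers, opencv-python, Pillow, and gradio are installed); Gradio
# prints a local URL to open in a browser.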