import cv2
import os
import gradio as gr
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch

# Load Models
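# All three captioning models run on the GPU when one is available, otherwise on the CPU.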
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model 1: ViT-GPT2
model1 = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning").to(device)
feature_extractor1 = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer1 = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Model 2: FuseCap
processor2 = BlipProcessor.from_pretrained("noamrot/FuseCap")
model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap").to(device)

# Model 3: BLIP Large
processor3 = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model3 = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)


# Frame Extraction and Captioning Logic
def process_video(video_path):
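    """Sample every 20th frame of the video and caption each sampled frame with all
    three models. Returns a dict mapping each model's name to its list of captions."""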
    vidObj = cv2.VideoCapture(video_path)
    count = 0
    success = True
    frame_captions = {"Model 1": [], "Model 2": [], "Model 3": []}

    while success:
        success, frame = vidObj.read()

        if not success:
            break

        # Process every 20th frame
        if count % 20 == 0:
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            # Model 1: ViT-GPT2
            pixel_values = feature_extractor1(images=[image], return_tensors="pt").pixel_values.to(device)
            output_ids = model1.generate(pixel_values, max_length=16, num_beams=4)
            caption1 = tokenizer1.decode(output_ids[0], skip_special_tokens=True)
            frame_captions["Model 1"].append(caption1)

            # Model 2: FuseCap
            inputs = processor2(image, "a picture of ", return_tensors="pt").to(device)
            out2 = model2.generate(**inputs, num_beams=3)
            caption2 = processor2.decode(out2[0], skip_special_tokens=True)
            frame_captions["Model 2"].append(caption2)

            # Model 3: BLIP Large
            inputs3 = processor3(image, return_tensors="pt").to(device)
            out3 = model3.generate(**inputs3)
            caption3 = processor3.decode(out3[0], skip_special_tokens=True)
            frame_captions["Model 3"].append(caption3)

        count += 1

    vidObj.release()
    return frame_captions


# Gradio Interface
def generate_captions(video):
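    """Gradio callback: caption the uploaded video with all three models and return
    the results as Markdown-formatted text, one section per model."""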
    # Gradio passes the uploaded video as a path to a temporary file
    print(f"Processing video: {video}")

    # Process video and get captions
    captions = process_video(video)

    # Clean up temporary file
    os.remove(video)

    # Format output for display
    result = ""
    for model_name, model_captions in captions.items():
        result += f"### {model_name}\n"
        result += "\n".join(f"- {caption}" for caption in model_captions)
        result += "\n\n"

    return result


# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# Video Captioning with Multiple Models 🎥")
    gr.Markdown("Upload a video to generate captions for its frames using three different models.")
    video_input = gr.Video(label="Upload Video")
    output = gr.Markdown(label="Generated Captions")
    submit_button = gr.Button("Generate Captions")

    submit_button.click(
        fn=generate_captions,
        inputs=video_input,
        outputs=output,
    )

if __name__ == "__main__":
    demo.launch()