import cv2
import gradio as gr
import torch
from PIL import Image
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from transformers import BlipProcessor, BlipForConditionalGeneration
# Load Models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
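# The three captioning models below are loaded once at startup and moved to the same device.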
# Model 1: ViT-GPT2
model1 = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning").to(device)
feature_extractor1 = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer1 = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
# Model 2: FuseCap
processor2 = BlipProcessor.from_pretrained("noamrot/FuseCap")
model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap").to(device)
# Model 3: BLIP Large
processor3 = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model3 = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)
# Frame Extraction and Captioning Logic
def process_video(video_path):
    """Extract every 20th frame from the video and caption it with all three models."""
    vidObj = cv2.VideoCapture(video_path)
    count = 0
    success = True
    frame_captions = {"Model 1": [], "Model 2": [], "Model 3": []}
    while success:
        success, frame = vidObj.read()
        if not success:
            break
        # Process every 20th frame
        if count % 20 == 0:
            # OpenCV yields BGR frames; convert to RGB before building the PIL image
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            # Model 1: ViT-GPT2
            pixel_values = feature_extractor1(images=[image], return_tensors="pt").pixel_values.to(device)
            output_ids = model1.generate(pixel_values, max_length=16, num_beams=4)
            caption1 = tokenizer1.decode(output_ids[0], skip_special_tokens=True)
            frame_captions["Model 1"].append(caption1)
            # Model 2: FuseCap (prompted captioning)
            inputs = processor2(image, "a picture of ", return_tensors="pt").to(device)
            out2 = model2.generate(**inputs, num_beams=3)
            caption2 = processor2.decode(out2[0], skip_special_tokens=True)
            frame_captions["Model 2"].append(caption2)
            # Model 3: BLIP Large
            inputs3 = processor3(image, return_tensors="pt").to(device)
            out3 = model3.generate(**inputs3)
            caption3 = processor3.decode(out3[0], skip_special_tokens=True)
            frame_captions["Model 3"].append(caption3)
        count += 1
    vidObj.release()
    return frame_captions
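# Example (standalone, without Gradio) of calling the extraction pipeline directly.
# The file name below is hypothetical:
#
#   caps = process_video("sample.mp4")
#   for name, lines in caps.items():
#       print(name, lines[:3])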
# Gradio Interface
def generate_captions(video):
    """Run all three models over the uploaded video and format the captions as markdown text."""
    captions = process_video(video)
    result = ""
    for model_name, model_captions in captions.items():
        result += f"### {model_name}\n"
        result += "\n".join(f"- {caption}" for caption in model_captions)
        result += "\n\n"
    return result
with gr.Blocks() as demo:
    gr.Markdown("# Video Captioning with Multiple Models 🎥")
    gr.Markdown("Upload a video to generate captions for its frames using three different models.")
    video_input = gr.Video(label="Upload Video")
    output = gr.Textbox(label="Generated Captions", lines=20)
    submit_button = gr.Button("Generate Captions")
    submit_button.click(
        fn=generate_captions,
        inputs=video_input,
        outputs=output,
    )

if __name__ == "__main__":
    demo.launch()