import gradio as gr
from PIL import Image
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import cv2
import numpy as np
import io
# Ensure GPU usage if available
device = "cuda" if torch.cuda.is_available() else "cpu"
# Initialize the model and tokenizer
model = AutoModelForCausalLM.from_pretrained("ManishThota/SparrowVQE",
                                             torch_dtype=torch.float16,
                                             device_map="auto",
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("ManishThota/SparrowVQE", trust_remote_code=True)
# def process_video(video_bytes):
#     """Extracts frames from the video, 1 per second."""
#     video = cv2.VideoCapture(io.BytesIO(video_bytes))
#     fps = video.get(cv2.CAP_PROP_FPS)
#     frames = []
#     success, frame = video.read()
#     while success:
#         frames.append(frame)
#         for _ in range(int(fps)):  # Skip fps frames
#             success, frame = video.read()
#     video.release()
#     return frames[:4]  # Return the first 4 frames
def video_to_frames(video_path):
    """Converts a video file into frames and stores them as PNG images in a list."""
    # List to hold frames encoded as PNG
    frames_png = []
    # Open the video file
    cap = cv2.VideoCapture(video_path)
    # Check if video opened successfully
    if not cap.isOpened():
        print("Error opening video file")
        return frames_png
    # Read until video is completed
    while cap.isOpened():
        # Capture frame-by-frame
        ret, frame = cap.read()
        # If frame is read correctly, ret is True
        if not ret:
            print("Can't receive frame (stream end?). Exiting ...")
            break
        # Convert the frame to PNG and store it
        is_success, buffer = cv2.imencode(".png", frame)
        if is_success:
            frames_png.append(np.array(buffer).tobytes())
    # When everything is done, release the video capture object
    cap.release()
    return frames_png
def predict_answer(image, video, question, max_tokens=100):
    """Answers a question about an uploaded image, or about each extracted frame of an uploaded video."""
    if image:
        # Process as an image
        image = image.convert("RGB")
        input_ids = tokenizer(question, return_tensors='pt').input_ids.to(device)
        image_tensor = model.image_preprocess(image)
        # Generate the answer
        output_ids = model.generate(
            input_ids,
            max_new_tokens=max_tokens,
            images=image_tensor,
            use_cache=True)[0]
        return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
    elif video:
        # Process as a video: answer the question once per extracted frame
        frames = video_to_frames(video)
        answers = []
        for frame in frames:
            # Each frame is stored as PNG bytes, so wrap it in a BytesIO buffer before opening
            frame = Image.open(io.BytesIO(frame)).convert("RGB")
            input_ids = tokenizer(question, return_tensors='pt').input_ids.to(device)
            image_tensor = model.image_preprocess(frame)
            # Generate the answer
            output_ids = model.generate(
                input_ids,
                max_new_tokens=max_tokens,
                images=image_tensor,
                use_cache=True)[0]
            answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
            answers.append(answer)
        return "\n".join(answers)
    else:
        return "Unsupported file type. Please upload an image or video."
def gradio_predict(image, video, question, max_tokens):
    answer = predict_answer(image, video, question, max_tokens)
    return answer
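# A minimal sketch of calling predict_answer directly for a quick local check,
# without launching the Gradio UI. The file path and question below are
# placeholders, not assets shipped with this Space.
# sample_image = Image.open("example_slide.png")
# print(predict_answer(sample_image, None, "Can you explain the slide?", max_tokens=100))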
# Define the Gradio interface
iface = gr.Interface(
    fn=gradio_predict,
    inputs=[gr.Image(type="pil", label="Upload or Drag an Image"),
            gr.Video(label="Upload or Drag a Video"),
            gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", scale=4),
            gr.Slider(2, 500, value=25, label="Token Count", info="Choose between 2 and 500")],
    outputs=gr.TextArea(label="Answer"),
    # examples=examples,
    title="Super Rapid Annotator - A multimodal vision tool for annotating videos with the LLaVA framework",
    # description="An interactive chat model that can answer questions about images in an academic context. \n We can input images, and the system will analyze them to provide information about their contents. I've utilized this capability by feeding in slides from PowerPoint presentations used in classes, along with the lecture content passed as text. Consequently, the model now mimics the behavior and responses of my professors: if I present any PowerPoint slide, it explains it just like my professor would, and it can be further personalized.",
)
# Launch the app
iface.queue().launch(debug=True)