Build

Paused

App Files Files Community

Build / app.py

ManishThota

Update app.py

feb8185 verified about 1 year ago

raw

history blame

4.98 kB

	import gradio as gr
	from PIL import Image
	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer
	import cv2
	import numpy as np
	import io


	# Prefer the GPU when one is present; otherwise fall back to the CPU.
	device = "cpu"
	if torch.cuda.is_available():
	    device = "cuda"

	# Checkpoint shared by the model and its tokenizer.
	_CHECKPOINT = "ManishThota/SparrowVQE"

	# Load the model in half precision and let device_map="auto" decide placement.
	# trust_remote_code=True executes Python code shipped with the checkpoint.
	# NOTE(review): float16 is requested even when `device` is "cpu" -- confirm
	# the model actually runs in half precision on CPU-only hosts.
	model = AutoModelForCausalLM.from_pretrained(
	    _CHECKPOINT,
	    torch_dtype=torch.float16,
	    device_map="auto",
	    trust_remote_code=True,
	)
	tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT, trust_remote_code=True)


	# def process_video(video_bytes):
	# """Extracts frames from the video, 1 per second."""
	# video = cv2.VideoCapture(io.BytesIO(video_bytes))
	# fps = video.get(cv2.CAP_PROP_FPS)
	# frames = []
	# success, frame = video.read()
	# while success:
	# frames.append(frame)
	# for _ in range(int(fps)): # Skip fps frames
	# success, frame = video.read()
	# video.release()
	# return frames[:4] # Return the first 4 frames

	def video_to_frames(video_path):
	    """Decode a video and return every frame as PNG-encoded bytes.

	    Args:
	        video_path: Source handed directly to ``cv2.VideoCapture``.

	    Returns:
	        A list of ``bytes`` objects, one per frame that was read and
	        PNG-encoded successfully, in read order. The list is empty when
	        the video cannot be opened.
	    """
	    # List to hold frames encoded as PNG
	    frames_png = []

	    # Open the video file
	    cap = cv2.VideoCapture(video_path)
	    try:
	        # Check if video opened successfully
	        if not cap.isOpened():
	            print("Error opening video file")
	            return frames_png

	        # Read until video is completed
	        while cap.isOpened():
	            # Capture frame-by-frame
	            ret, frame = cap.read()

	            # ret is False once no further frame can be read
	            if not ret:
	                print("Can't receive frame (stream end?). Exiting ...")
	                break

	            # Convert the frame to PNG and store it; frames that fail to
	            # encode are skipped rather than aborting the whole video.
	            is_success, buffer = cv2.imencode(".png", frame)
	            if is_success:
	                # The encoded buffer is already an array, so it can be
	                # serialised directly without an intermediate copy.
	                frames_png.append(buffer.tobytes())
	    finally:
	        # Always release the capture object, including on the early return
	        # above and when reading or encoding raises.
	        cap.release()

	    return frames_png

	def _generate_answer(pil_image, input_ids, max_tokens):
	    """Run the model on a single PIL image and return the decoded answer."""
	    image_tensor = model.image_preprocess(pil_image.convert("RGB"))

	    output_ids = model.generate(
	        input_ids,
	        max_new_tokens=max_tokens,
	        images=image_tensor,
	        use_cache=True)[0]

	    # Slice off the prompt tokens so only newly generated text is decoded.
	    return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()


	def predict_answer(image, video, question, max_tokens=100):
	    """Answer ``question`` about an uploaded image or, failing that, a video.

	    The image takes precedence when both inputs are supplied. For a video,
	    the question is asked once per decoded frame and the per-frame answers
	    are joined with newlines.

	    Args:
	        image: PIL image, or ``None`` when no image was supplied.
	        video: Video source passed to ``video_to_frames``; falsy when absent.
	        question: Question text inserted into the chat prompt.
	        max_tokens: Upper bound on newly generated tokens per answer.

	    Returns:
	        The answer text, or a short message when neither input is usable.
	    """
	    # NOTE(review): UI sliders may deliver the count as a float -- coerce so
	    # that max_new_tokens is always an integer.
	    max_tokens = int(max_tokens)

	    text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
	    input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)

	    if image is not None:
	        # Process as an image
	        return _generate_answer(image, input_ids, max_tokens)

	    if video:
	        # Process as a video
	        frames = video_to_frames(video)
	        if not frames:
	            return "Could not read any frames from the video."

	        answers = []
	        for frame in frames:
	            # Each frame is raw PNG bytes. Image.open treats a bytes argument
	            # as a file name, so the data must be wrapped in a file object.
	            with Image.open(io.BytesIO(frame)) as frame_image:
	                answers.append(_generate_answer(frame_image, input_ids, max_tokens))
	        return "\n".join(answers)

	    return "Unsupported file type. Please upload an image or video."




	def gradio_predict(image, video, question, max_tokens):
	    """Gradio callback: forward the four UI inputs to ``predict_answer``."""
	    return predict_answer(image, video, question, max_tokens)



	# Input widgets, listed in the positional order gradio_predict expects:
	# image, video, question, token count.
	_input_components = [
	    gr.Image(type="pil", label="Upload or Drag an Image"),
	    gr.Video(label="upload your video here"),
	    gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", scale=4),
	    gr.Slider(2, 500, value=25, label="Token Count", info="Choose between 2 and 500"),
	]

	# Single text area that displays the returned answer.
	_answer_component = gr.TextArea(label="Answer")

	# Assemble the Gradio interface around the prediction callback.
	iface = gr.Interface(
	    fn=gradio_predict,
	    inputs=_input_components,
	    outputs=_answer_component,
	    # examples=examples,
	    title="Super Rapid Annotator - Multimodal vision tool to annotate videos with LLaVA framework",
	    # description="An interactive chat model that can answer questions about images in an Academic context. \n We can input images, and the system will analyze them to provide information about their contents. I've utilized this capability by feeding slides from PowerPoint presentations used in classes and the lecture content passed as text. Consequently, the model now mimics the behavior and responses of my professors. So, if I present any PowerPoint slide, it explains it just like my professor would, further it can be personalized.",
	)

	# Enable request queuing, then start the server with debug output.
	iface.queue().launch(debug=True)