Spaces:

VinitT
/

StoryGeneraterFromImages

Sleeping

App Files Files Community

StoryGeneraterFromImages / app.py

VinitT

Update app.py

36d8cb0 verified 12 months ago

raw

history blame

3.21 kB

	import os
	import tempfile

	import cv2
	import streamlit as st
	import torch
	from PIL import Image
	from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

	# Checkpoint shared by the processor and the model; kept in one place so
	# the two can never drift apart.
	MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"


	@st.cache_resource
	def _load_model_resources(model_id=MODEL_ID):
	    """Load the processor and model once and reuse them across reruns.

	    Streamlit re-executes this script on every widget interaction, so
	    loading the checkpoint at module top level would repeat the load each
	    time. ``st.cache_resource`` keeps a single shared instance instead.

	    Args:
	        model_id: Hub identifier passed to both ``from_pretrained`` calls.

	    Returns:
	        A ``(processor, model, device)`` tuple, with the model already
	        moved to ``device``.
	    """
	    # Check if CUDA is available and set the device accordingly
	    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	    processor = AutoProcessor.from_pretrained(model_id)
	    model = Qwen2VLForConditionalGeneration.from_pretrained(model_id)
	    model.to(device)
	    return processor, model, device


	# Module-level names are preserved because the rest of the script reads them.
	processor, model, device = _load_model_resources()

	# Streamlit app


	def _read_first_video_frame(uploaded_video):
	    """Return the first frame of an uploaded video as a PIL image, or None.

	    The upload is spooled to a temporary file because ``cv2.VideoCapture``
	    is opened from a path here. The file is closed before OpenCV opens it
	    (so all bytes are on disk), and both the capture handle and the file
	    are released even if decoding fails.

	    Args:
	        uploaded_video: The object returned by ``st.file_uploader``.

	    Returns:
	        The first frame converted from BGR to RGB, or ``None`` when
	        OpenCV could not read a frame.
	    """
	    # Keep the original extension on the temp file.
	    # NOTE(review): assumed to help OpenCV pick the container format - confirm.
	    suffix = os.path.splitext(uploaded_video.name)[1]
	    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
	    try:
	        with tmp:
	            # getvalue() returns the whole buffer regardless of the current
	            # read position, unlike read().
	            tmp.write(uploaded_video.getvalue())
	        capture = cv2.VideoCapture(tmp.name)
	        try:
	            ok, frame = capture.read()
	        finally:
	            # Release the video capture object
	            capture.release()
	    finally:
	        # delete=False means nothing else removes this file.
	        os.unlink(tmp.name)

	    if not ok:
	        return None
	    # Convert the frame to an image
	    return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))


	def _answer_question(image, question, max_new_tokens=128):
	    """Run the model on one image and one question and return its reply.

	    Uses the module-level ``processor``, ``model`` and ``device``.

	    Args:
	        image: PIL image handed to the processor.
	        question: Text placed after the image in the user turn.
	        max_new_tokens: Generation budget passed to ``model.generate``.

	    Returns:
	        The decoded reply with the prompt tokens stripped.
	    """
	    messages = [
	        {
	            "role": "user",
	            "content": [
	                {"type": "image", "image": image},
	                {"type": "text", "text": question},
	            ],
	        }
	    ]

	    # Preparation for inference
	    prompt = processor.apply_chat_template(
	        messages, tokenize=False, add_generation_prompt=True
	    )

	    # Pass the image to the processor
	    inputs = processor(
	        text=[prompt],
	        images=[image],
	        padding=True,
	        return_tensors="pt",
	    )
	    inputs = inputs.to(device)  # Ensure inputs are on the same device as the model

	    # Inference: Generation of the output
	    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
	    # Each output row is sliced by the length of its input row so that
	    # only the newly generated tokens are decoded.
	    reply_ids = [
	        out_ids[len(in_ids):]
	        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
	    ]
	    decoded = processor.batch_decode(
	        reply_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
	    )
	    return decoded[0]


	st.title("Media Description Generator")

	uploaded_file = st.file_uploader(
	    "Choose an image or video...",
	    type=["jpg", "jpeg", "png", "mp4", "avi", "mov"],
	)

	if uploaded_file is not None:
	    # The MIME type looks like "image/png" or "video/mp4"; branch on the
	    # part before the slash.
	    file_type = uploaded_file.type.split('/')[0]

	    if file_type == 'image':
	        # Open the image
	        image = Image.open(uploaded_file)
	        st.image(image, caption='Uploaded Image.', use_column_width=True)

	    elif file_type == 'video':
	        # Extract the first frame
	        image = _read_first_video_frame(uploaded_file)
	        if image is None:
	            st.error("Failed to read the video file.")
	            st.stop()
	        st.image(image, caption='First Frame of Uploaded Video.', use_column_width=True)

	    else:
	        st.error("Unsupported file type.")
	        st.stop()

	    # Add a text input for the user to ask a question
	    user_question = st.text_input("Ask a question about the image or video:")

	    if user_question:
	        # Show the progress message only while inference is actually
	        # running, not as soon as a file is uploaded.
	        with st.spinner("Generating description..."):
	            answer = _answer_question(image, user_question)

	        st.write("Description:")
	        st.write(answer)