Spaces:

VinitT
/

StoryGeneraterFromImages

Sleeping

File size: 3,212 Bytes

fd19cdd
2f43b7c
fd19cdd
 
36d8cb0
 
fd19cdd
2f43b7c
fd19cdd
2f43b7c
fd19cdd
36d8cb0
 
 
 
fd19cdd
36d8cb0
fd19cdd
36d8cb0
fd19cdd
 
36d8cb0

import os
import tempfile

import cv2
import streamlit as st
import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

# Streamlit re-executes this whole script on every widget interaction, so the
# (expensive) processor/model load is wrapped in a cached loader: the weights
# are read and moved to the device once per server process and then reused.
@st.cache_resource
def _load_model_and_processor(model_id="Qwen/Qwen2-VL-2B-Instruct"):
    """Load the Qwen2-VL processor and model and place the model on a device.

    Args:
        model_id: Hugging Face model identifier passed to ``from_pretrained``.

    Returns:
        A ``(processor, model, device)`` tuple, where ``device`` is CUDA when
        ``torch.cuda.is_available()`` reports it and CPU otherwise, and
        ``model`` has already been moved to that device.
    """
    loaded_processor = AutoProcessor.from_pretrained(model_id)
    loaded_model = Qwen2VLForConditionalGeneration.from_pretrained(model_id)

    # Check if CUDA is available and set the device accordingly
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loaded_model.to(target_device)
    return loaded_processor, loaded_model, target_device


# Module-level names used by the rest of the script.
processor, model, device = _load_model_and_processor()

# Streamlit app: upload an image or a video, preview it (for a video, its first
# frame), then answer a free-form question about it with the Qwen2-VL model.
st.title("Media Description Generator")

uploaded_file = st.file_uploader("Choose an image or video...", type=["jpg", "jpeg", "png", "mp4", "avi", "mov"])

if uploaded_file is not None:
    # The MIME type looks like "image/png" or "video/mp4"; only the major
    # type is needed to choose the handling branch.
    file_type = uploaded_file.type.split('/')[0]

    if file_type == 'image':
        # Open the image
        image = Image.open(uploaded_file)
        st.image(image, caption='Uploaded Image.', use_column_width=True)
        st.write("Generating description...")

    elif file_type == 'video':
        # OpenCV reads videos from a path, so the upload is spilled to a
        # temporary file. The original extension is kept as the suffix so the
        # decoding backend can use it as a container hint.
        video_suffix = os.path.splitext(uploaded_file.name)[1]
        with tempfile.NamedTemporaryFile(delete=False, suffix=video_suffix) as tfile:
            # getvalue() returns the whole buffer regardless of the current
            # read position of the uploaded file object.
            tfile.write(uploaded_file.getvalue())
        # Leaving the `with` block flushes and closes the file, so OpenCV sees
        # the complete data (and can reopen the path on every platform).
        video_path = tfile.name

        try:
            # Open the video file
            cap = cv2.VideoCapture(video_path)
            try:
                # Extract the first frame
                ret, frame = cap.read()
            finally:
                # Release the video capture object even if reading fails.
                cap.release()
        finally:
            # The script reruns on every interaction; without this cleanup
            # each rerun would leave another copy of the video on disk.
            os.unlink(video_path)

        if not ret:
            st.error("Failed to read the video file.")
            st.stop()

        # Convert the frame (OpenCV uses BGR channel order) to an RGB image
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        st.image(image, caption='First Frame of Uploaded Video.', use_column_width=True)
        st.write("Generating description...")

    else:
        st.error("Unsupported file type.")
        st.stop()

    # Add a text input for the user to ask a question
    user_question = st.text_input("Ask a question about the image or video:")

    if user_question:
        # Single-turn chat message holding the image and the user's question.
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": image,
                    },
                    {"type": "text", "text": user_question},
                ],
            }
        ]

        # Preparation for inference: render the chat template to a prompt
        # string (not token ids) with the assistant generation prompt appended.
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # Pass the prompt and the image to the processor
        inputs = processor(
            text=[text],
            images=[image],
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(device)  # Ensure inputs are on the same device as the model

        # Inference: Generation of the output
        generated_ids = model.generate(**inputs, max_new_tokens=128)
        # Drop the leading prompt-length slice from each output sequence so
        # only the newly generated tokens are decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

        st.write("Description:")
        st.write(output_text[0])