"""Streamlit demo: download a YouTube video and analyze it with Qwen2-VL.

Run with `streamlit run app.py` (assuming this file is saved as app.py).
"""
import subprocess
import sys

import streamlit as st


@st.cache_resource
def install_dependencies():
    """Install runtime dependencies once per server process.

    Streamlit re-runs this script on every interaction, so the
    st.cache_resource decorator keeps pip from being invoked repeatedly.
    """
    try:
        # Install torch first: flash-attn builds against whatever torch
        # version is already present, so the two pins must be compatible.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "torch==2.0.1"])
        # Install flash-attn after torch
        subprocess.check_call([sys.executable, "-m", "pip", "install", "flash-attn==2.7.2.post1"])
        # Install the remaining dependencies
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])
    except subprocess.CalledProcessError as e:
        print(f"Error occurred while installing dependencies: {e}")
        sys.exit(1)


# Install dependencies before importing the heavy libraries: importing
# torch/transformers at the top of the file would fail on a fresh
# environment where nothing has been installed yet.
install_dependencies()

from yt_dlp import YoutubeDL
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch


# Title and Description
st.title("Video Analysis with Qwen2-VL")
st.markdown("""
This app downloads a YouTube video, samples frames from it, and analyzes them with the Qwen2-VL vision-language model.
""")

# User input for YouTube URL
url = st.text_input("Enter YouTube Video URL:", value="https://www.youtube.com/watch?v=MCWJNOfJoSM")

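# Everything below runs only after the button is clicked; Streamlit
# re-executes the whole script top-to-bottom on each interaction.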
if st.button("Analyze Video"):
    with st.spinner("Downloading video..."):
        ydl_opts = {
            # Prefer a progressive MP4 stream so the saved file matches its
            # .mp4 extension; fall back to the best available format.
            "format": "best[ext=mp4]/best",
            "outtmpl": "football.mp4",
        }
        try:
            with YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])
            st.success("Video downloaded successfully!")
        except Exception as e:
            st.error(f"Error downloading video: {e}")
            st.stop()

    with st.spinner("Loading model..."):
        MODEL_NAME = "Qwen/Qwen2-VL-7B-Instruct"

        @st.cache_resource
        def load_model(name: str):
            # Loading 7B-parameter weights is slow; caching keeps the model
            # resident across Streamlit re-runs and repeated button clicks.
            model = Qwen2VLForConditionalGeneration.from_pretrained(
                name,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                attn_implementation="flash_attention_2",
            )
            processor = AutoProcessor.from_pretrained(name)
            return model, processor

        try:
            model, processor = load_model(MODEL_NAME)
            st.success("Model loaded successfully!")
        except Exception as e:
            st.error(f"Error loading model: {e}")
            st.stop()

    # Process video and generate response
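    # fps=0.1 samples roughly one frame every 10 seconds, and max_pixels
    # caps the per-frame resolution; both keep GPU memory use manageable.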
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": "football.mp4",
                    "max_pixels": 1280 * 780,
                    "fps": 0.1,
                },
                {"type": "text", "text": "What's happening in the video? Who wins the penalty shootout?"},
            ],
        }
    ]

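    # Render the conversation into a prompt string, then let
    # process_vision_info read the video file and sample its frames.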
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Move the input tensors onto the same device as the model weights
    # rather than hard-coding "cuda".
    inputs = inputs.to(model.device)

    with st.spinner("Generating response..."):
        try:
            generated_ids = model.generate(**inputs, max_new_tokens=512)
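            # Drop the prompt tokens from each sequence so that only the
            # newly generated answer is decoded below.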
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]

            output_text = processor.batch_decode(
                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )
            st.success("Response generated!")
            st.text_area("Model Output:", value=output_text[0], height=200)
        except Exception as e:
            st.error(f"Error generating response: {e}")