import subprocess
import sys


def install_dependencies():
    try:
        # Install torch first
        subprocess.check_call([sys.executable, "-m", "pip", "install", "torch==2.0.1"])
        # Install flash-attn after torch, since its build expects torch to be present
        subprocess.check_call([sys.executable, "-m", "pip", "install", "flash-attn==2.7.2.post1"])
        # Install the remaining dependencies
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])
    except subprocess.CalledProcessError as e:
        print(f"Error occurred while installing dependencies: {e}")
        sys.exit(1)


# Install dependencies before importing them; importing torch or transformers
# first would fail on a fresh environment where they are not installed yet.
install_dependencies()

import streamlit as st
from yt_dlp import YoutubeDL
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

# Title and Description
st.title("Video Analysis with Qwen2-VL")
st.markdown("""
This app downloads a YouTube video, processes it, and analyzes it using the Qwen2-VL model.
""")
# User input for YouTube URL
url = st.text_input("Enter YouTube Video URL:", value="https://www.youtube.com/watch?v=MCWJNOfJoSM")

if st.button("Analyze Video"):
    with st.spinner("Downloading video..."):
        ydl_opts = {
            "format": "best",
            "outtmpl": "football.mp4",
        }
        try:
            with YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])
            st.success("Video downloaded successfully!")
        except Exception as e:
            st.error(f"Error downloading video: {e}")
            st.stop()
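
    # Note: "format": "best" picks the best single pre-merged file, which is not
    # guaranteed to be an MP4 even though outtmpl names it football.mp4. If a
    # real MP4 container is required (and ffmpeg is available), a stricter
    # option set would be, for example:
    #
    #     ydl_opts = {
    #         "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/mp4",
    #         "merge_output_format": "mp4",
    #         "outtmpl": "football.mp4",
    #     }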
with st.spinner("Loading model..."):
MODEL_NAME = "Qwen/Qwen2-VL-7B-Instruct"
try:
model = Qwen2VLForConditionalGeneration.from_pretrained(
MODEL_NAME,
torch_dtype=torch.bfloat16,
device_map="auto",
attn_implementation="flash_attention_2",
)
processor = AutoProcessor.from_pretrained(MODEL_NAME)
st.success("Model loaded successfully!")
except Exception as e:
st.error(f"Error loading model: {e}")
st.stop()
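
    # Streamlit reruns this script on every interaction, so the 7B checkpoint is
    # reloaded on each click. A minimal sketch of a cached loader (same calls as
    # above, just wrapped in st.cache_resource) would be:
    #
    #     @st.cache_resource
    #     def load_model(name):
    #         model = Qwen2VLForConditionalGeneration.from_pretrained(
    #             name,
    #             torch_dtype=torch.bfloat16,
    #             device_map="auto",
    #             attn_implementation="flash_attention_2",
    #         )
    #         return model, AutoProcessor.from_pretrained(name)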

    # Process video and generate response
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": "football.mp4",
                    "max_pixels": 1280 * 780,
                    "fps": 0.1,
                },
                {"type": "text", "text": "What's happening in the video? Who wins the penalty shootout?"},
            ],
        }
    ]
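
    # In the qwen_vl_utils message format, "max_pixels" caps the per-frame pixel
    # budget and "fps" is the frame-sampling rate, so fps=0.1 samples roughly
    # one frame every 10 seconds of video.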
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")
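
    # This hard-codes CUDA and will fail on CPU-only hardware. With
    # device_map="auto", a more portable variant is to follow the model:
    #
    #     inputs = inputs.to(model.device)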
with st.spinner("Generating response..."):
try:
generated_ids = model.generate(**inputs, max_new_tokens=512)
generated_ids_trimmed = [
out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
st.success("Response generated!")
st.text_area("Model Output:", value=output_text[0], height=200)
except Exception as e:
st.error(f"Error generating response: {e}")