import streamlit as st
from yt_dlp import YoutubeDL
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
import subprocess
import sys


def install_dependencies():
    try:
        # Install torch first
        subprocess.check_call([sys.executable, "-m", "pip", "install", "torch==2.0.1"])
        # Install flash-attn after torch
        subprocess.check_call([sys.executable, "-m", "pip", "install", "flash-attn==2.7.2.post1"])
        # Install other dependencies
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])
    except subprocess.CalledProcessError as e:
        print(f"Error occurred while installing dependencies: {e}")
        sys.exit(1)


# Call the function to install dependencies
install_dependencies()

# Title and Description
st.title("Video Analysis with Qwen2-VL")
st.markdown("""
This app downloads a YouTube video, processes it, and analyzes it using the Qwen2-VL model.
""")

# User input for YouTube URL
url = st.text_input("Enter YouTube Video URL:", value="https://www.youtube.com/watch?v=MCWJNOfJoSM")

if st.button("Analyze Video"):
    with st.spinner("Downloading video..."):
        ydl_opts = {
            "format": "best",
            "outtmpl": "football.mp4",
        }
        try:
            with YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])
            st.success("Video downloaded successfully!")
        except Exception as e:
            st.error(f"Error downloading video: {e}")
            st.stop()

    with st.spinner("Loading model..."):
        MODEL_NAME = "Qwen/Qwen2-VL-7B-Instruct"
        try:
            model = Qwen2VLForConditionalGeneration.from_pretrained(
                MODEL_NAME,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                attn_implementation="flash_attention_2",
            )
            processor = AutoProcessor.from_pretrained(MODEL_NAME)
            st.success("Model loaded successfully!")
        except Exception as e:
            st.error(f"Error loading model: {e}")
            st.stop()

    # Process video and generate response
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": "football.mp4",
                    "max_pixels": 1280 * 780,
                    "fps": 0.1,
                },
                {"type": "text", "text": "What's happening in the video? Who wins the penalty shootout?"},
            ],
        }
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    with st.spinner("Generating response..."):
        try:
            generated_ids = model.generate(**inputs, max_new_tokens=512)
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]
            output_text = processor.batch_decode(
                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )
            st.success("Response generated!")
            st.text_area("Model Output:", value=output_text[0], height=200)
        except Exception as e:
            st.error(f"Error generating response: {e}")