# Hugging Face Space: saadfarhad/Crowdanalyzer_v1
# Space status: Runtime error
# File size: 2,740 Bytes

import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer

# Hugging Face Hub repository ID of the model to load.
model_path = "OpenGVLab/InternVideo2_5_Chat_8B"

# Load the tokenizer and model with remote code enabled
# (trust_remote_code=True lets transformers run the modeling code shipped in
# the model repository).
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True)

# Convert to FP16 and move to the GPU only when CUDA is actually present.
# Calling .cuda() unconditionally raises on CPU-only hardware and takes the
# whole app down at import time; without a GPU the model stays on CPU in its
# loaded dtype.
# NOTE(review): the remote model code may itself require a GPU — confirm that
# CPU inference is supported before relying on this fallback.
if torch.cuda.is_available():
    model = model.half().cuda()

# Get the image processor from the vision tower when the model exposes one.
# Nothing in this file reads `image_processor`, so a model class without
# `get_vision_tower()` must not abort startup; the name is then bound to None.
_vision_tower_getter = getattr(model, "get_vision_tower", None)
image_processor = (
    getattr(_vision_tower_getter(), "image_processor", None)
    if callable(_vision_tower_getter)
    else None
)

# Evaluation settings
# Forwarded to model.chat() as its `max_num_frames` argument.
max_num_frames = 512

# Forwarded to model.chat() as its `generation_config` argument.
# Sampling is switched off (do_sample=False) and a single beam is used; the
# sampling-related values below are passed through unchanged.
generation_config = dict(
    do_sample=False,
    temperature=0.0,
    max_new_tokens=1024,
    top_p=0.1,
    num_beams=1,
)

# Single entry point used for both the first turn and follow-up turns of a conversation.
def chat_interface(video_path, user_prompt, chat_history):
    """
    Run one conversational turn against the loaded model.

    A ``chat_history`` of ``None`` is treated as an empty history, i.e. the
    start of a new conversation. The call is delegated to ``model.chat()``
    together with the module-level ``tokenizer``, ``max_num_frames`` and
    ``generation_config``.

    Parameters:
        video_path (str): The filepath of the uploaded video.
        user_prompt (str): The question asked by the user.
        chat_history (list): Previous turns of the conversation (an empty
            list, or None, for a new conversation).

    Returns:
        A tuple ``(output, history)``: the model's answer and the updated
        conversation history, exactly as returned by ``model.chat()``.
    """
    run_turn = model.chat
    history_so_far = [] if chat_history is None else chat_history

    # Collect the keyword arguments first so the call itself stays short.
    turn_kwargs = dict(
        video_path=video_path,
        tokenizer=tokenizer,
        user_prompt=user_prompt,
        chat_history=history_so_far,
        return_history=True,  # ask the model for (output, history)
        max_num_frames=max_num_frames,
        generation_config=generation_config,
    )
    reply, updated_history = run_turn(**turn_kwargs)
    return reply, updated_history

# Build the Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown("## InternVideo2_5_Chat_8B Chat Interface")
    with gr.Row():
        # gr.Video takes no `type` argument (that option belongs to components
        # such as gr.Image / gr.Audio / gr.File); recent Gradio releases reject
        # unknown keyword arguments with a TypeError while the UI is being
        # built. The Video component already hands the uploaded file's path
        # to the event handler.
        video_input = gr.Video(label="Upload Video")
        question_input = gr.Textbox(label="Enter your question", placeholder="Type your question here...")
    # Hidden per-session state that carries the conversation history between clicks.
    chat_state = gr.State([])
    output_text = gr.Textbox(label="Model Response")

    def process_chat(video, question, history):
        """
        Handle a click on "Send".

        Parameters:
            video: Path of the uploaded video, or None when nothing is uploaded.
            question (str): Text typed into the question box.
            history (list): Conversation history stored in ``chat_state``.

        Returns:
            A tuple of the model's response and the updated history, routed to
            ``output_text`` and ``chat_state`` respectively.

        Raises:
            gr.Error: If no video was uploaded or the question is blank, so
                the user sees a readable message instead of a failure from
                inside the model call.
        """
        if not video:
            raise gr.Error("Please upload a video before sending a question.")
        if not question or not question.strip():
            raise gr.Error("Please enter a question.")
        return chat_interface(video, question, history)

    def reset_history():
        """Return a fresh, empty conversation history."""
        return []

    # Earlier turns refer to the previously uploaded video, so start a new
    # conversation whenever the video input changes (upload, replace, clear).
    video_input.change(reset_history, inputs=None, outputs=chat_state)

    send_btn = gr.Button("Send")
    send_btn.click(
        process_chat,
        inputs=[video_input, question_input, chat_state],
        outputs=[output_text, chat_state]
    )

# Launch the app.
# The guard keeps the Gradio server from starting when this module is
# imported rather than executed as a script.
if __name__ == "__main__":
    demo.launch()