Spaces: Runtime error
import gradio as gr
import torch  # required by the model's remote code even though it is not called directly here
from transformers import AutoModel, AutoTokenizer
# Model ID on the Hugging Face Hub.
model_path = "OpenGVLab/InternVideo2_5_Chat_8B"

# Load the tokenizer and model with remote code enabled.
# .half() casts the weights to FP16; .cuda() moves them to the GPU, so a
# CUDA device is required (a CPU-only machine will raise an error here).
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda()

# Image processor from the vision tower. It is not referenced again in this
# demo; model.chat() below takes the raw video path and handles frames itself.
image_processor = model.get_vision_tower().image_processor
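# Rough sizing note (an estimate from the parameter count, not a measurement):
# 8B parameters in FP16 take about 16 GB of VRAM for the weights alone, before
# activations and the KV cache, so a 24 GB-class GPU is a sensible minimum.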
# Evaluation settings.
max_num_frames = 512
generation_config = {
    "do_sample": False,
    "temperature": 0.0,
    "max_new_tokens": 1024,
    "top_p": 0.1,
    "num_beams": 1,
}
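# With do_sample=False and num_beams=1 the model decodes greedily, so the
# temperature and top_p values above have no effect on generation; they are
# presumably carried over from the upstream example settings. Set
# do_sample=True to make them active (nucleus sampling), e.g.:
#   generation_config.update(do_sample=True, temperature=0.7, top_p=0.9)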
# Chat function covering both single-turn and multi-turn conversation.
def chat_interface(video_path, user_prompt, chat_history):
    """
    Run one chat turn with the model. An empty or None chat_history starts
    a new conversation.

    Parameters:
        video_path (str): Filepath of the uploaded video.
        user_prompt (str): The user's question.
        chat_history (list): Conversation history (empty list for a new conversation).

    Returns:
        tuple: The model's output (str) and the updated chat history (list).
    """
    if chat_history is None:
        chat_history = []
    # model.chat() returns the answer and the updated history.
    output, new_history = model.chat(
        video_path=video_path,
        tokenizer=tokenizer,
        user_prompt=user_prompt,
        chat_history=chat_history,
        return_history=True,
        max_num_frames=max_num_frames,
        generation_config=generation_config,
    )
    return output, new_history
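# Example (a sketch, assuming a local clip named "example.mp4" exists):
#   answer, history = chat_interface("example.mp4", "Describe this video in detail.", [])
#   follow_up, history = chat_interface("example.mp4", "What happens at the end?", history)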
# Build the Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown("## InternVideo2_5_Chat_8B Chat Interface")
    with gr.Row():
        # gr.Video returns a filepath by default in recent Gradio releases;
        # the old type="filepath" argument is no longer accepted there.
        video_input = gr.Video(label="Upload Video")
        question_input = gr.Textbox(label="Enter your question", placeholder="Type your question here...")
    # Hidden state that carries the conversation history between turns.
    chat_state = gr.State([])
    output_text = gr.Textbox(label="Model Response")
    send_btn = gr.Button("Send")
    # chat_interface already maps (video, question, history) -> (response, history),
    # so it is wired to the button directly.
    send_btn.click(
        chat_interface,
        inputs=[video_input, question_input, chat_state],
        outputs=[output_text, chat_state],
    )
# Launch the app.
if __name__ == "__main__":
    demo.launch()
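# Optional: long generations can exceed default request timeouts on Spaces;
# calling demo.queue() before demo.launch() is a common mitigation.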