# app.py
import gradio as gr

from src.model_loader import load_model
from src.video_utils import process_video_for_internvl3
from src.ar_prompts import generate_conversation_questions

# Loaded once at import time so every request reuses the same objects.
tokenizer, model = load_model()

# Turns with an index below this are still sent to the model (their answers
# become part of the chat history) but are not shown to the user.
_FIRST_VISIBLE_TURN = 2

# Upper bound on generated tokens per turn.
_MAX_NEW_TOKENS = 1024


def evaluate_ar_multi_turn(video) -> str:
    """Run the multi-turn AR evaluation dialogue on an uploaded video.

    The video is preprocessed with ``process_video_for_internvl3`` and the
    questions from ``generate_conversation_questions`` are asked one by one
    through ``model.chat``, threading the returned history into the next
    turn. Only answers from the third turn onward are returned.

    Args:
        video: Value supplied by the ``gr.Video`` input component
            (presumably a file path -- confirm against the Gradio version
            in use). ``None`` when nothing was uploaded.

    Returns:
        The visible answers joined by blank lines; an empty string if the
        conversation has fewer than three turns.

    Raises:
        gr.Error: If no video was provided.
    """
    if video is None:
        # Surface a readable message in the UI instead of letting the
        # preprocessing helper receive None.
        raise gr.Error("Please upload a video before submitting.")

    pixel_values, num_patches_list, image_prefix = process_video_for_internvl3(video)
    conversation = generate_conversation_questions(include_descriptions=True)

    history = None
    visible_outputs = []

    for turn, question in enumerate(conversation):
        # The frame placeholder prefix is only prepended on the first turn;
        # later turns rely on the accumulated history.
        prompt = image_prefix + question if turn == 0 else question
        output, history = model.chat(
            tokenizer,
            pixel_values,
            prompt,
            # A fresh dict per call, as in the original, in case chat()
            # mutates its generation config.
            generation_config={"max_new_tokens": _MAX_NEW_TOKENS},
            num_patches_list=num_patches_list,
            history=history,
            return_history=True,
        )

        # Keep only the answers from the evaluation and extension parts
        # (i.e. from the third turn onward).
        if turn >= _FIRST_VISIBLE_TURN:
            visible_outputs.append(output)

    # Join the collected answers into a single text for display.
    return "\n\n".join(visible_outputs)


demo = gr.Interface(
    fn=evaluate_ar_multi_turn,
    inputs=gr.Video(label="Upload your AR video"),
    outputs="text",
    title="InternVL3 AR Evaluation (Multi-turn)",
    description="Upload a short AR video clip. The model will sample frames and conduct a multi-turn dialogue to assess occlusion/rendering/placement/lighting.",
)
demo.launch()