# AR_Testing / app.py
import os
import gradio as gr
import torch
import math
import time
from PIL import Image
from decord import VideoReader, cpu
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
from transformers import (
    AutoModel,
    AutoTokenizer,
    AutoProcessor,
    AutoConfig
)
from huggingface_hub import snapshot_download
start_time = time.time()
# === Constants ===
MODEL_NAME = "OpenGVLab/InternVL3-14B"
CACHE_DIR = "/data/internvl3_model"
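# /data is the persistent-storage mount on Hugging Face Spaces; caching the
# snapshot there (assuming persistent storage is enabled for this Space)
# avoids re-downloading the checkpoint on every restart.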
# === Visual preprocessing ===
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
transform = Compose([
    Resize((448, 448)),
    ToTensor(),
    Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
])
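# These are the standard ImageNet statistics, which InternVL's official
# preprocessing also uses; the vision encoder operates on 448x448 tiles,
# hence the fixed resize.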
# === Model download and caching ===
if not os.path.exists(CACHE_DIR):
    print("⏬ First run: downloading model to persistent storage...")
    snapshot_download(repo_id=MODEL_NAME, local_dir=CACHE_DIR)
else:
    print("✅ Model already cached in persistent storage.")
# === GPU layer allocation (multi-GPU support) ===
def split_model(model_path):
    """Build a layer-to-GPU device map for multi-GPU inference."""
    device_map = {}
    world_size = torch.cuda.device_count()
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    num_layers = config.llm_config.num_hidden_layers
    # GPU 0 also hosts the vision tower, so give it roughly half a share of
    # LLM layers and split the rest evenly across the remaining GPUs.
    num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
    num_layers_per_gpu = [num_layers_per_gpu] * world_size
    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
    layer_cnt = 0
    for i, num_layer in enumerate(num_layers_per_gpu):
        for _ in range(num_layer):
            device_map[f'language_model.model.layers.{layer_cnt}'] = i
            layer_cnt += 1
    # Pin the vision encoder, embeddings, head, and the first/last LLM layers
    # to GPU 0 to avoid cross-device transfers at the model's entry and exit.
    device_map['vision_model'] = 0
    device_map['mlp1'] = 0
    device_map['language_model.model.tok_embeddings'] = 0
    device_map['language_model.model.embed_tokens'] = 0
    device_map['language_model.output'] = 0
    device_map['language_model.model.norm'] = 0
    device_map['language_model.model.rotary_emb'] = 0
    device_map['language_model.lm_head'] = 0
    device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
    return device_map
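# Worked example: with 2 GPUs and a 48-layer LLM, ceil(48 / 1.5) = 32, so the
# plan starts as [32, 32]; halving GPU 0's share gives [16, 32], which sums
# back to the full 48 layers.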
# === Load components (cached) ===
print("🚀 Loading tokenizer/processor/model from cache...")
tokenizer = AutoTokenizer.from_pretrained(CACHE_DIR, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(CACHE_DIR, trust_remote_code=True)  # currently unused below
device_map = split_model(CACHE_DIR)
model = AutoModel.from_pretrained(
    CACHE_DIR,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True,
    device_map=device_map
).eval()
print(f"✅ Model fully loaded. Time elapsed: {time.time() - start_time:.2f} sec.")
# === Video frame extraction ===
def extract_frames(video_path, num_frames=8):
    """Uniformly sample num_frames RGB frames and apply the 448x448 transform."""
    vr = VideoReader(video_path, ctx=cpu(0))
    total_frames = len(vr)
    frame_indices = torch.linspace(0, total_frames - 1, num_frames).int().tolist()
    images = []
    for idx in frame_indices:
        img = Image.fromarray(vr[idx].asnumpy()).convert("RGB")
        images.append(transform(img))
    return torch.stack(images)  # shape: (num_frames, 3, 448, 448)
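# Example (hypothetical file): extract_frames("clip.mp4") returns a float32
# tensor of shape (8, 3, 448, 448); it is cast to bfloat16 before inference.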
# === Main inference function ===
def evaluate_ar(video):
    # Recent Gradio versions pass the upload as a filepath string; older ones
    # pass a tempfile object with a .name attribute.
    video_path = video if isinstance(video, str) else video.name
    pixel_values = extract_frames(video_path).to(torch.bfloat16).cuda()
    num_patches = [1] * pixel_values.shape[0]
    # InternVL's chat() expects one <image> placeholder per entry in
    # num_patches_list, so prefix the question with one tag per frame.
    frame_prefix = "".join(f"Frame{i + 1}: <image>\n" for i in range(len(num_patches)))
    prompt = frame_prefix + "Evaluate the quality of AR occlusion and rendering in the uploaded video."
    output, _ = model.chat(
        tokenizer,
        pixel_values,
        prompt,
        generation_config=dict(max_new_tokens=512),
        num_patches_list=num_patches,
        history=None,
        return_history=True
    )
    return output
# === Gradio interface ===
gr.Interface(
    fn=evaluate_ar,
    inputs=gr.Video(label="Upload your AR video"),
    outputs="text",
    title="InternVL3 AR Evaluation (Single-turn)",
    description="Upload a short AR video clip. The model will sample frames and assess occlusion/rendering quality."
).launch()
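# Note: launch() blocks until the server exits, so any code placed after it
# only runs at shutdown; the load-time print therefore sits right after the
# model load above.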