import os
import gradio as gr
import torch
import math
import time
from PIL import Image
from decord import VideoReader, cpu
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
from transformers import (
    AutoModel,
    AutoTokenizer,
    AutoProcessor,
    AutoConfig
)
from huggingface_hub import snapshot_download

start_time = time.time()
# === Constants ===
MODEL_NAME = "OpenGVLab/InternVL3-14B"
CACHE_DIR = "/data/internvl3_model"
# === Vision preprocessing ===
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
transform = Compose([
    Resize((448, 448)),  # InternVL's vision encoder expects 448x448 inputs
    ToTensor(),
    Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
])
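# Illustrative example ("frame.jpg" is a hypothetical file): applying
# transform(Image.open("frame.jpg").convert("RGB")) yields a float tensor of
# shape (3, 448, 448), normalized with the ImageNet statistics above.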
# === Model download and caching ===
if not os.path.exists(CACHE_DIR):
    print("⏬ First run: downloading model to persistent storage...")
    snapshot_download(repo_id=MODEL_NAME, local_dir=CACHE_DIR)
else:
    print("✅ Found model in persistent cache.")
# === Layer-to-GPU assignment (multi-GPU support) ===
def split_model(model_path):
    device_map = {}
    world_size = torch.cuda.device_count()
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    num_layers = config.llm_config.num_hidden_layers
    # GPU 0 also hosts the vision tower, so count it as half a GPU when
    # distributing LLM layers.
    num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
    num_layers_per_gpu = [num_layers_per_gpu] * world_size
    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
    layer_cnt = 0
    for i, num_layer in enumerate(num_layers_per_gpu):
        for _ in range(num_layer):
            device_map[f'language_model.model.layers.{layer_cnt}'] = i
            layer_cnt += 1
    # Pin the vision tower, projector, embeddings, head, norm, and the final
    # LLM layer to GPU 0 so inputs and outputs stay on one device.
    device_map['vision_model'] = 0
    device_map['mlp1'] = 0
    device_map['language_model.model.tok_embeddings'] = 0
    device_map['language_model.model.embed_tokens'] = 0
    device_map['language_model.output'] = 0
    device_map['language_model.model.norm'] = 0
    device_map['language_model.model.rotary_emb'] = 0
    device_map['language_model.lm_head'] = 0
    device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
    return device_map
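# Worked example (assuming 2 GPUs and a 48-layer LLM): ceil(48 / 1.5) = 32
# layers per full GPU, halved to 16 for GPU 0. Layers 0-15 land on GPU 0 and
# layers 16-47 on GPU 1; the final layer (47) is then pinned back to GPU 0
# alongside the vision tower so generation finishes on the same device.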
# === Load components (from cache) ===
print("🚀 Loading tokenizer/processor/model from cache...")
tokenizer = AutoTokenizer.from_pretrained(CACHE_DIR, trust_remote_code=True)
# Loaded for completeness; the chat API below only needs the tokenizer.
processor = AutoProcessor.from_pretrained(CACHE_DIR, trust_remote_code=True)
device_map = split_model(CACHE_DIR)
model = AutoModel.from_pretrained(
    CACHE_DIR,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,  # requires the flash-attn package to be installed
    trust_remote_code=True,
    device_map=device_map
).eval()
print(f"✅ Model fully loaded. Time elapsed: {time.time() - start_time:.2f} sec.")
# === Video frame extraction ===
def extract_frames(video_path, num_frames=8):
    vr = VideoReader(video_path, ctx=cpu(0))
    total_frames = len(vr)
    # Sample num_frames indices evenly across the clip.
    frame_indices = torch.linspace(0, total_frames - 1, num_frames).long().tolist()
    images = []
    for idx in frame_indices:
        img = Image.fromarray(vr[idx].asnumpy()).convert("RGB")
        images.append(transform(img))
    return torch.stack(images)
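# Example ("clip.mp4" is a hypothetical file): extract_frames("clip.mp4")
# returns a (8, 3, 448, 448) float tensor, one row per sampled frame.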
# === Main inference function ===
def evaluate_ar(video):
    # Recent Gradio versions pass the uploaded video as a file path string;
    # older versions pass a file object with a .name attribute.
    video_path = video if isinstance(video, str) else video.name
    frames = extract_frames(video_path).to(torch.bfloat16).cuda()
    num_patches = [1] * frames.shape[0]
    # InternVL's chat API expects one <image> placeholder per frame in the prompt.
    frame_prefix = "".join(f"Frame{i + 1}: <image>\n" for i in range(len(num_patches)))
    prompt = frame_prefix + "Evaluate the quality of AR occlusion and rendering in the uploaded video."
    output, _ = model.chat(
        tokenizer,
        frames,
        prompt,
        generation_config=dict(max_new_tokens=512),
        num_patches_list=num_patches,
        history=None,
        return_history=True
    )
    return output
# === Gradio interface ===
gr.Interface(
    fn=evaluate_ar,
    inputs=gr.Video(label="Upload your AR video"),
    outputs="text",
    title="InternVL3 AR Evaluation (Single-turn)",
    description="Upload a short AR video clip. The model will sample frames and assess occlusion/rendering quality."
).launch()
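# Note: on Hugging Face Spaces, launch() picks up the host and port from the
# environment automatically, so no server_name/server_port arguments are needed.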