XiaoyiYangRIT committed
Commit aed9794 · 1 Parent(s): a6cd9f8

Update some files

Files changed (1): app.py (+35, -24)
app.py CHANGED
@@ -1,13 +1,26 @@
+import os
 import gradio as gr
 import torch
 import math
-import os
-from transformers import AutoTokenizer, AutoModel, AutoProcessor
-from huggingface_hub import snapshot_download
-from decord import VideoReader, cpu
+import time
 from PIL import Image
+from decord import VideoReader, cpu
 from torchvision.transforms import Compose, Resize, ToTensor, Normalize

+from transformers import (
+    AutoModel,
+    AutoTokenizer,
+    AutoProcessor,
+    AutoConfig
+)
+from huggingface_hub import snapshot_download
+
+start_time = time.time()
+
+# === Constants ===
+MODEL_NAME = "OpenGVLab/InternVL3-14B"
+CACHE_DIR = "/data/internvl3_model"
+
 # === Visual preprocessing ===
 IMAGENET_MEAN = (0.485, 0.456, 0.406)
 IMAGENET_STD = (0.229, 0.224, 0.225)
@@ -18,23 +31,15 @@ transform = Compose([
     Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
 ])

-# === Model loading ===
-PERSISTENT_DIR = "/data/internvl3_model"  # persistent path
-MODEL_NAME = "OpenGVLab/InternVL3-14B"
-
-# On the first run, download the model and cache it under /data
-if not os.path.exists(PERSISTENT_DIR):
+# === Model download and caching ===
+if not os.path.exists(CACHE_DIR):
     print("⏬ First run: downloading model to persistent storage...")
-    snapshot_download(repo_id=MODEL_NAME, local_dir=PERSISTENT_DIR, trust_remote_code=True)
+    snapshot_download(repo_id=MODEL_NAME, local_dir=CACHE_DIR)
 else:
     print("✅ Loaded model from persistent cache.")

-# Load the model (from local files)
-tokenizer = AutoTokenizer.from_pretrained(PERSISTENT_DIR, trust_remote_code=True)
-processor = AutoProcessor.from_pretrained(PERSISTENT_DIR, trust_remote_code=True)
-
+# === Per-layer GPU allocation (multi-GPU support) ===
 def split_model(model_path):
-    from transformers import AutoConfig
     device_map = {}
     world_size = torch.cuda.device_count()
     config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
@@ -58,10 +63,13 @@ def split_model(model_path):
     device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
     return device_map

-device_map = split_model(PERSISTENT_DIR)
-
+# === Load components (already cached) ===
+print("🚀 Loading tokenizer/processor/model from cache...")
+tokenizer = AutoTokenizer.from_pretrained(CACHE_DIR, trust_remote_code=True)
+processor = AutoProcessor.from_pretrained(CACHE_DIR, trust_remote_code=True)
+device_map = split_model(CACHE_DIR)
 model = AutoModel.from_pretrained(
-    PERSISTENT_DIR,
+    CACHE_DIR,
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     use_flash_attn=True,
@@ -69,7 +77,7 @@ model = AutoModel.from_pretrained(
     device_map=device_map
 ).eval()

-# === Video frame sampling ===
+# === Video frame extraction ===
 def extract_frames(video_path, num_frames=8):
     vr = VideoReader(video_path, ctx=cpu(0))
     total_frames = len(vr)
@@ -81,10 +89,10 @@ def extract_frames(video_path, num_frames=8):
         images.append(img_tensor)
     return torch.stack(images)

-# === Inference function ===
+# === Main inference function ===
 def evaluate_ar(video):
     frames = extract_frames(video.name).to(torch.bfloat16).cuda()
-    prompt = "Evaluate the quality of AR occlusion and rendering in the uploaded video."  # swap in a task-specific prompt
+    prompt = "Evaluate the quality of AR occlusion and rendering in the uploaded video."
     num_patches = [1] * frames.shape[0]
     output, _ = model.chat(
         tokenizer,
@@ -97,11 +105,14 @@
     )
     return output

-# === Gradio UI ===
+# === Gradio interface ===
 gr.Interface(
     fn=evaluate_ar,
     inputs=gr.Video(label="Upload your AR video"),
     outputs="text",
     title="InternVL3 AR Evaluation (Single-turn)",
-    description="Upload a video clip. The model will analyze AR occlusion and rendering quality."
+    description="Upload a short AR video clip. The model will sample frames and assess occlusion/rendering quality."
 ).launch()
+
+# (after the model has finished loading)
+print(f"✅ Model fully loaded. Time elapsed: {time.time() - start_time:.2f} sec.")