# src/model_loader.py
import os
import math
import torch
from transformers import AutoModel, AutoTokenizer, AutoConfig
from huggingface_hub import snapshot_download

MODEL_NAME = "OpenGVLab/InternVL3-14B"
CACHE_DIR = "/data/internvl3_model"
# === Automatically split model layers across multiple GPUs (the approach recommended for InternVL3) ===
def split_model(model_path):
    device_map = {}
    world_size = torch.cuda.device_count()
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    num_layers = config.llm_config.num_hidden_layers
    # GPU 0 also hosts the vision tower, so count it as half a GPU when
    # dividing up the LLM decoder layers.
    num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
    num_layers_per_gpu = [num_layers_per_gpu] * world_size
    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
    layer_cnt = 0
    for i, num_layer in enumerate(num_layers_per_gpu):
        for _ in range(num_layer):
            device_map[f'language_model.model.layers.{layer_cnt}'] = i
            layer_cnt += 1
    # Pin the non-layer components (and the last decoder layer) to GPU 0
    for key in [
        'vision_model', 'mlp1',
        'language_model.model.tok_embeddings',
        'language_model.model.embed_tokens',
        'language_model.output',
        'language_model.model.norm',
        'language_model.model.rotary_emb',
        'language_model.lm_head',
        f'language_model.model.layers.{num_layers - 1}',
    ]:
        device_map[key] = 0
    return device_map
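
# Hypothetical sanity check (not part of the loading path): because of the
# half-GPU weighting above, GPU 0 ends up with about half as many decoder
# layers as each remaining GPU. Exact counts depend on the checkpoint's
# llm_config, so treat the output below as illustrative only.
#
#   dm = split_model(CACHE_DIR)
#   print(sorted(set(dm.values())))  # e.g. [0, 1] on a 2-GPU machine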
# === Model loading ===
def load_model():
    if not os.path.exists(CACHE_DIR):
        print("⏬ First run: downloading model to persistent storage...")
        snapshot_download(repo_id=MODEL_NAME, local_dir=CACHE_DIR)
    else:
        print("✅ Found model in persistent cache.")
    device_map = split_model(CACHE_DIR)
    tokenizer = AutoTokenizer.from_pretrained(CACHE_DIR, trust_remote_code=True)
    model = AutoModel.from_pretrained(
        CACHE_DIR,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        use_flash_attn=False,  # set to True only if FlashAttention is confirmed installed
        trust_remote_code=True,
        device_map=device_map,
    ).eval()
    return tokenizer, model
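
# Minimal usage sketch (assumption: the InternVL3 remote code exposes the
# `model.chat(...)` helper described in the model card; passing None for
# pixel_values runs a pure-text turn, no image preprocessing needed).
if __name__ == "__main__":
    tokenizer, model = load_model()
    generation_config = dict(max_new_tokens=256, do_sample=False)
    response = model.chat(tokenizer, None, "Hello, who are you?", generation_config)
    print(response)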