XiaoyiYangRIT committed
Commit 3b1917e · 1 Parent(s): 29f26e8

Update some files

Files changed (1):
  1. app.py +57 -58
app.py CHANGED
@@ -1,68 +1,67 @@
-import os
-import gradio as gr
-import spaces
 import torch
-import tempfile
-import imageio
-from decord import VideoReader, cpu
-from transformers import pipeline
 from PIL import Image
 
-hf_token = os.environ.get("HUGGINGFACE_TOKEN")
-model_id = "google/gemma-3-27b-it"
-NUM_FRAMES = 8
-
-# Sample N frames from the video
-def sample_video_frames(video_path, num_frames=8):
-    vr = VideoReader(video_path, ctx=cpu(0))
-    total_frames = len(vr)
-    indices = [int(i) for i in torch.linspace(0, total_frames - 1, steps=num_frames)]
-
-    # Key point: force conversion to PIL.Image
-    frames = [Image.fromarray(vr[i].asnumpy()) for i in indices]
-    return frames
-
-# Inference function: load the model, sample video frames, run inference
-@spaces.GPU
-def analyze_video(video_file):
-    # video_file is a path string
-    frames = sample_video_frames(video_file)
-
-    # Construct the prompt
-    system_prompt = (
-        "You are a helpful AI assistant that analyzes AR effects in videos. "
-        "Evaluate the realism and placement of virtual objects in the provided video frames."
-    )
-    user_prompt = "Based on the frames, describe how well the AR objects blend into the real environment."
 
-    history = [
-        {
-            "role": "system",
-            "content": [{"type": "text", "text": system_prompt}]
-        },
-        {
-            "role": "user",
-            "content": [{"type": "text", "text": user_prompt}] + [{"type": "image", "image": frame} for frame in frames]
-        }
-    ]
 
-    pipe = pipeline(
-        "image-text-to-text",
-        model=model_id,
-        token=hf_token,
-        torch_dtype=torch.bfloat16,
-        model_kwargs={"device_map": "auto"}
-    )
 
-    result = pipe(text=history, max_new_tokens=512)
-    return result[0]["generated_text"][-1]["content"]
 
-# Gradio interface
 gr.Interface(
-    fn=analyze_video,
-    inputs=gr.Video(label="Upload an AR Video (.mp4 only)"),
-    outputs=gr.Textbox(label="Gemma Analysis Result"),
-    title="Gemma-3-27B Video Analysis (ZeroGPU)",
-    description="Uploads a video, extracts 8 frames, and uses Gemma-3-27B to analyze AR realism."
 ).launch()
 
+import math
 import torch
+from transformers import AutoTokenizer, AutoModel, AutoProcessor
+import gradio as gr
 from PIL import Image
 
+# === Distribute layers across multiple GPUs ===
+def split_model(model_path):
+    from transformers import AutoConfig
+    device_map = {}
+    world_size = torch.cuda.device_count()
+    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+    num_layers = config.llm_config.num_hidden_layers
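+    # GPU 0 is weighted as half a device here because it also hosts the vision tower and embeddings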
+    num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
+    num_layers_per_gpu = [num_layers_per_gpu] * world_size
+    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
+    layer_cnt = 0
+    for i, num_layer in enumerate(num_layers_per_gpu):
+        for _ in range(num_layer):
+            device_map[f'language_model.model.layers.{layer_cnt}'] = i
+            layer_cnt += 1
+    device_map['vision_model'] = 0
+    device_map['mlp1'] = 0
+    device_map['language_model.model.tok_embeddings'] = 0
+    device_map['language_model.model.embed_tokens'] = 0
+    device_map['language_model.output'] = 0
+    device_map['language_model.model.norm'] = 0
+    device_map['language_model.model.rotary_emb'] = 0
+    device_map['language_model.lm_head'] = 0
+    device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
+    return device_map
 
+# === Model path ===
+model_path = "OpenGVLab/InternVL3-14B"
+device_map = split_model(model_path)
 
+# === Load the model and processor ===
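+# use_flash_attn=True enables FlashAttention only when the flash-attn package is available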
+model = AutoModel.from_pretrained(
+    model_path,
+    torch_dtype=torch.bfloat16,
+    low_cpu_mem_usage=True,
+    use_flash_attn=True,
+    trust_remote_code=True,
+    device_map=device_map
+).eval()
 
+tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
 
+# === Inference function ===
+def infer(image: Image.Image, prompt: str):
+    inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda")
+    output = model.generate(**inputs, max_new_tokens=512)
+    answer = tokenizer.decode(output[0], skip_special_tokens=True)
+    return answer
 
+# === Gradio interface ===
 gr.Interface(
+    fn=infer,
+    inputs=[
+        gr.Image(type="pil", label="Upload Image"),
+        gr.Textbox(label="Your Prompt", placeholder="Ask a question about the image...")
+    ],
+    outputs="text",
+    title="InternVL3-14B Multimodal Demo",
+    description="Upload an image and ask a question. InternVL3-14B will answer using vision + language."
 ).launch()
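
Note: the sketch below (not part of the commit) walks through the arithmetic split_model performs. The GPU and layer counts are assumed for illustration; the real values come from torch.cuda.device_count() and config.llm_config.num_hidden_layers.

import math

# Hypothetical values: 2 GPUs, 48 transformer layers.
world_size = 2
num_layers = 48

# GPU 0 counts as half a device because it also carries vision_model, mlp1,
# the embeddings, and the output head.
per_gpu = math.ceil(num_layers / (world_size - 0.5))  # ceil(48 / 1.5) = 32
shares = [per_gpu] * world_size                       # [32, 32]
shares[0] = math.ceil(shares[0] * 0.5)                # GPU 0 keeps 16

print(shares)  # [16, 32]: layers 0-15 land on GPU 0, layers 16-47 on GPU 1
# (split_model then pins the final layer back to GPU 0 alongside lm_head.)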