LoufAn committed on
Commit
60e3950
·
1 Parent(s): 460ce6d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -14
app.py CHANGED
@@ -1,24 +1,66 @@
1
  import os
2
  import gradio as gr
3
  import spaces
4
- from transformers import AutoTokenizer, AutoModelForCausalLM, TextGenerationPipeline
 
 
 
 
5
 
6
# Model identifier passed to the from_pretrained calls of the removed version.
- model_id = "google/gemma-3-27b-it"
7
# Access token read from the HUGGINGFACE_TOKEN environment variable (None when unset).
  hf_token = os.environ.get("HUGGINGFACE_TOKEN")
 
 
 
 
 
 
 
 
 
 
 
8
 
9
- # Includes model loading + inference
# Removed version: the tokenizer and model are loaded inside the function on
# every call, then up to 100 new tokens are sampled at temperature 0.7 and the
# "generated_text" of the first result is returned.
10
  @spaces.GPU
11
- def generate(prompt):
12
- tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
13
- model = AutoModelForCausalLM.from_pretrained(model_id, token=hf_token, device_map="auto")
14
- pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer)
15
- output = pipeline(prompt, max_new_tokens=100, do_sample=True, temperature=0.7)
16
- return output[0]["generated_text"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- # Build the interface
# Removed version: a single text prompt in, the generated text out.
19
  gr.Interface(
20
- fn=generate,
21
- inputs=gr.Text(label="Enter your prompt"),
22
- outputs=gr.Textbox(label="Generated Text"),
23
- title="Gemma-3-27B Inference (ZeroGPU)"
 
24
  ).launch()
 
1
  import os
2
  import gradio as gr
3
  import spaces
4
+ import torch
5
+ import tempfile
6
+ import imageio
7
+ from decord import VideoReader, cpu
8
+ from transformers import pipeline
9
 
 
# Number of frames sampled from each uploaded video.
NUM_FRAMES = 8
# Model identifier handed to the transformers pipeline.
model_id = "google/gemma-3-27b-it"
# Access token taken from the environment; None when the variable is unset.
hf_token = os.getenv("HUGGINGFACE_TOKEN")
13
+
14
def sample_video_frames(video_path, num_frames=NUM_FRAMES):
    """Sample up to ``num_frames`` evenly spaced frames from a video file.

    Args:
        video_path: Path of the video file to decode.
        num_frames: Maximum number of frames to return. A video with fewer
            frames yields one entry per available frame instead of
            repeating frames to reach the requested count.

    Returns:
        A list of ``imageio.core.util.Array`` objects, one per sampled
        frame, ordered from the start of the video to its end.

    Raises:
        ValueError: If the video contains no frames.
    """
    reader = VideoReader(video_path, ctx=cpu(0))
    total_frames = len(reader)
    if total_frames == 0:
        # linspace(0, -1, ...) would otherwise produce meaningless indices.
        raise ValueError(f"No frames could be read from {video_path!r}")

    # Never request more samples than there are frames; doing so would only
    # return the same frame several times.
    steps = min(num_frames, total_frames)
    positions = torch.linspace(0, total_frames - 1, steps=steps).tolist()
    indices = [int(position) for position in positions]

    # NOTE(review): the frames are wrapped as imageio arrays, not PIL images.
    # Confirm that the consumer of these frames accepts array-like images.
    return [imageio.core.util.Array(reader[idx].asnumpy()) for idx in indices]
22
 
23
@spaces.GPU
def analyze_video(video_file):
    """Sample frames from an uploaded video and ask the model to assess them.

    Args:
        video_file: The uploaded video. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object exposing the path as ``.name``.

    Returns:
        The ``content`` field of the last message in the pipeline's
        ``generated_text`` output (presumably the assistant reply).

    Raises:
        gr.Error: If no video was provided.
    """
    if video_file is None:
        raise gr.Error("Please upload a video before running the analysis.")

    # A plain path has no ``.name`` attribute, so accept both a path and a
    # file-like wrapper.
    if isinstance(video_file, (str, os.PathLike)):
        video_path = video_file
    else:
        video_path = video_file.name

    # Sample image frames from the uploaded video.
    frames = sample_video_frames(video_path)

    # Single-turn prompt (adjust to the evaluation you need).
    system_prompt = (
        "You are a helpful AI assistant that analyzes AR effects in videos. "
        "Evaluate the realism and placement of virtual objects in the provided video frames."
    )
    user_prompt = "Based on the frames, describe how well the AR objects blend into the real environment."

    # Conversation history: the question text followed by one image entry
    # per sampled frame.
    user_content = [{"type": "text", "text": user_prompt}]
    user_content.extend({"type": "image", "image": frame} for frame in frames)
    history = [
        {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
        {"role": "user", "content": user_content},
    ]

    # NOTE(review): the pipeline is constructed on every call; confirm whether
    # it can be created once and reused in this deployment.
    pipe = pipeline(
        "image-text-to-text",
        model=model_id,
        token=hf_token,
        torch_dtype=torch.bfloat16,
        model_kwargs={"device_map": "auto"},
    )
    result = pipe(text=history, max_new_tokens=512)
    return result[0]["generated_text"][-1]["content"]
58
 
59
# Gradio interface: one video upload in, the model's text analysis out.
demo = gr.Interface(
    fn=analyze_video,
    inputs=gr.Video(label="Upload an AR Video (.mp4 only)"),
    outputs=gr.Textbox(label="Gemma Analysis Result"),
    title="Gemma-3-27B Video Analysis (ZeroGPU)",
    # Built from NUM_FRAMES so the text cannot drift from the sampler setting.
    description=(
        f"Uploads a video, extracts {NUM_FRAMES} frames, "
        "and uses Gemma-3-27B to analyze AR realism."
    ),
)
demo.launch()