# AR_Testing/app.py
import os

import gradio as gr
import spaces
import torch
from decord import VideoReader, cpu
from PIL import Image
from transformers import pipeline
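
# google/gemma-3-27b-it is a gated model, so a Hub token with access must be
# available (e.g. set HUGGINGFACE_TOKEN as a Space secret).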
hf_token = os.environ.get("HUGGINGFACE_TOKEN")
model_id = "google/gemma-3-27b-it"
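# How many frames to sample; a small, uniform sample keeps the multimodal
# prompt short while still covering the whole clip.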
NUM_FRAMES = 8
# Sample N frames, evenly spaced across the video.
def sample_video_frames(video_path, num_frames=NUM_FRAMES):
    vr = VideoReader(video_path, ctx=cpu(0))
    total_frames = len(vr)
    indices = [int(i) for i in torch.linspace(0, total_frames - 1, steps=num_frames)]
    # decord yields raw arrays; convert each frame to a PIL image, which the
    # image-text-to-text pipeline accepts directly.
    frames = [Image.fromarray(vr[i].asnumpy()) for i in indices]
    return frames
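
# Hypothetical usage: sample_video_frames("clip.mp4") -> list of 8 PIL images.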
# Inference: load the model, sample frames from the video, and generate.
@spaces.GPU
def analyze_video(video_file):
    # gr.Video hands the upload to the function as a filepath string.
    frames = sample_video_frames(video_file)
    # Single-turn prompt (swap in whatever evaluation criteria you need).
    system_prompt = (
        "You are a helpful AI assistant that analyzes AR effects in videos. "
        "Evaluate the realism and placement of virtual objects in the provided video frames."
    )
    user_prompt = "Based on the frames, describe how well the AR objects blend into the real environment."
    # Build the chat-style input: a system turn, then a user turn that pairs
    # the text prompt with the sampled frames.
    history = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_prompt}],
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": user_prompt}]
            + [{"type": "image", "image": frame} for frame in frames],
        },
    ]
    # Run inference through the pipeline. It is built inside the @spaces.GPU
    # function because ZeroGPU only attaches a GPU while this function runs;
    # the trade-off is that the model is reloaded on every request.
    pipe = pipeline(
        "image-text-to-text",
        model=model_id,
        token=hf_token,
        torch_dtype=torch.bfloat16,
        model_kwargs={"device_map": "auto"},
    )
    result = pipe(text=history, max_new_tokens=512)
    # The pipeline returns the whole transcript; the last message is the reply.
    return result[0]["generated_text"][-1]["content"]
# Gradio UI
gr.Interface(
    fn=analyze_video,
    inputs=gr.Video(label="Upload an AR Video (.mp4 only)"),
    outputs=gr.Textbox(label="Gemma Analysis Result"),
    title="Gemma-3-27B Video Analysis (ZeroGPU)",
    description="Uploads a video, extracts 8 frames, and uses Gemma-3-27B to analyze AR realism.",
).launch()