AR_Testing / app.py
import os
import gradio as gr
import spaces
import torch
from decord import VideoReader, cpu
from PIL import Image
from transformers import pipeline

hf_token = os.environ.get("HUGGINGFACE_TOKEN")
model_id = "google/gemma-3-27b-it"
NUM_FRAMES = 8
# Sample N frames evenly spaced across the video
def sample_video_frames(video_path, num_frames=NUM_FRAMES):
    vr = VideoReader(video_path, ctx=cpu(0))
    total_frames = len(vr)
    # Evenly spaced frame indices across the whole clip
    indices = [int(i) for i in torch.linspace(0, total_frames - 1, steps=num_frames)]
    frames = [vr[i].asnumpy() for i in indices]
    # Convert raw numpy frames to PIL images so the pipeline can consume them
    pil_frames = [Image.fromarray(frame) for frame in frames]
    return pil_frames
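
# Illustrative usage only (not part of the app flow): calling the sampler directly on a
# local clip. "example.mp4" is a hypothetical path used just for this sketch.
#   frames = sample_video_frames("example.mp4")
#   assert len(frames) == NUM_FRAMES  # NUM_FRAMES PIL images, evenly spaced across the clip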
# Inference function: load the model, sample video frames, and run inference
@spaces.GPU
def analyze_video(video_file):
    # video_file is a path string provided by the Gradio Video component
    frames = sample_video_frames(video_file)
    # Build the chat-style prompt: a system message plus a user message that
    # interleaves the text instruction with the sampled frames
    system_prompt = (
        "You are a helpful AI assistant that analyzes AR effects in videos. "
        "Evaluate the realism and placement of virtual objects in the provided video frames."
    )
    user_prompt = "Based on the frames, describe how well the AR objects blend into the real environment."

    history = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_prompt}]
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": user_prompt}]
                       + [{"type": "image", "image": frame} for frame in frames]
        }
    ]
    # Load the Gemma 3 image-text-to-text pipeline (ZeroGPU allocates the GPU per call)
    pipe = pipeline(
        "image-text-to-text",
        model=model_id,
        token=hf_token,
        torch_dtype=torch.bfloat16,
        model_kwargs={"device_map": "auto"}
    )

    result = pipe(text=history, max_new_tokens=512)
    # The pipeline returns the full chat history; the last message is the model's reply
    return result[0]["generated_text"][-1]["content"]
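
# For reference, the chat-style output that the indexing above assumes looks roughly like
# the following (structure only, values are illustrative):
#   [{"generated_text": [
#       {"role": "system", "content": "..."},
#       {"role": "user", "content": "..."},
#       {"role": "assistant", "content": "<model's analysis>"}   # <- returned above
#   ]}]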
# Gradio interface
gr.Interface(
    fn=analyze_video,
    inputs=gr.Video(label="Upload an AR Video (.mp4 only)"),
    outputs=gr.Textbox(label="Gemma Analysis Result"),
    title="Gemma-3-27B Video Analysis (ZeroGPU)",
    description="Upload a video; the app extracts 8 frames and uses Gemma-3-27B to analyze AR realism."
).launch()
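
# To run this script outside the Space, a HUGGINGFACE_TOKEN with access to the gated
# Gemma 3 weights is assumed to be set in the environment, e.g.:
#   HUGGINGFACE_TOKEN=hf_xxx python app.py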