Spaces:
Sleeping
Sleeping
File size: 2,212 Bytes
7010950 fe53d5f 60e3950 e5644ec 7010950 60e3950 7010950 60e3950 fe53d5f 60e3950 db7fd06 60e3950 5e97fbe 60e3950 7010950 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
import os
import gradio as gr
import spaces
import torch
import tempfile
import imageio
from decord import VideoReader, cpu
from transformers import pipeline
# Hugging Face access token taken from the environment; None when the
# HUGGINGFACE_TOKEN variable is not set.
hf_token = os.getenv("HUGGINGFACE_TOKEN")

# Hub identifier of the model handed to the inference pipeline.
model_id = "google/gemma-3-27b-it"

# Default number of frames sampled from each uploaded video.
NUM_FRAMES = 8
# Sample N frames from a video.
def sample_video_frames(video_path, num_frames=NUM_FRAMES):
    """Return ``num_frames`` frames spread evenly across a video.

    The first and last frame of the video are always included when
    ``num_frames`` is at least 2; a single requested frame is the first one.
    When the video has fewer frames than requested, some frames repeat.

    Args:
        video_path: Path of the video file; it is opened with decord on the CPU.
        num_frames: Number of frames to sample. Defaults to ``NUM_FRAMES``.

    Returns:
        A list of ``imageio.core.util.Array`` objects (one per sampled frame)
        in playback order. The list is empty when ``num_frames`` is 0.

    Raises:
        ValueError: If ``num_frames`` is negative or the video has no frames.
    """
    if num_frames < 0:
        raise ValueError(f"num_frames must not be negative, got {num_frames}")

    vr = VideoReader(video_path, ctx=cpu(0))
    total_frames = len(vr)
    if total_frames == 0:
        # Without this guard the index computation below would produce
        # negative positions and fail inside the reader instead.
        raise ValueError(f"video contains no frames: {video_path!r}")

    # Integer arithmetic keeps every index exact; it is the floor of the
    # evenly spaced position, with the end points pinned to 0 and the last frame.
    last_index = total_frames - 1
    if num_frames == 1:
        indices = [0]
    else:
        indices = [
            (step * last_index) // (num_frames - 1) for step in range(num_frames)
        ]

    return [imageio.core.util.Array(vr[index].asnumpy()) for index in indices]
# Inference entry point: sample video frames, build the prompt, run the model.
@spaces.GPU
def analyze_video(video_file):
    """Ask the model how well AR objects blend into an uploaded video.

    Args:
        video_file: The uploaded video, given either as a filesystem path
            (``str`` or ``os.PathLike``) or as an object that exposes the
            path through a ``name`` attribute.

    Returns:
        The ``content`` of the last message in the generated conversation,
        i.e. the model's reply.

    Raises:
        gr.Error: If no video was provided.
    """
    if video_file is None:
        raise gr.Error("Please upload a video before running the analysis.")

    # Gradio's Video component hands over a plain path string, while file-like
    # upload wrappers carry the path in ``.name``; support both. The path check
    # comes first because ``pathlib.Path.name`` is only the base name.
    if isinstance(video_file, (str, os.PathLike)):
        video_path = os.fspath(video_file)
    else:
        video_path = video_file.name

    # Sample image frames from the uploaded video.
    frames = sample_video_frames(video_path)

    # Single-turn prompt (adjust to whatever should be evaluated).
    system_prompt = (
        "You are a helpful AI assistant that analyzes AR effects in videos. "
        "Evaluate the realism and placement of virtual objects in the provided video frames."
    )
    user_prompt = "Based on the frames, describe how well the AR objects blend into the real environment."

    # NOTE(review): the pipeline (and with it the model weights) is rebuilt on
    # every call; consider constructing it once if the hosting setup allows.
    pipe = pipeline(
        "image-text-to-text",
        model=model_id,
        token=hf_token,
        torch_dtype=torch.bfloat16,
        model_kwargs={"device_map": "auto"},
    )

    # The pipeline documents image inputs as URLs, local paths or PIL images,
    # so the frames are written out as PNG files and referenced by path
    # instead of being passed as raw arrays. The directory must outlive the
    # pipeline call because the files are read during inference.
    with tempfile.TemporaryDirectory() as frame_dir:
        frame_paths = []
        for position, frame in enumerate(frames):
            frame_path = os.path.join(frame_dir, f"frame_{position:03d}.png")
            imageio.imwrite(frame_path, frame)
            frame_paths.append(frame_path)

        # Conversation history passed to the model (text plus images).
        history = [
            {
                "role": "system",
                "content": [{"type": "text", "text": system_prompt}],
            },
            {
                "role": "user",
                "content": [{"type": "text", "text": user_prompt}]
                + [{"type": "image", "image": path} for path in frame_paths],
            },
        ]

        # Run inference through the pipeline.
        result = pipe(text=history, max_new_tokens=512)

    return result[0]["generated_text"][-1]["content"]
# Gradio interface: one video input, one text output, served on launch.
demo = gr.Interface(
    fn=analyze_video,
    inputs=gr.Video(label="Upload an AR Video (.mp4 only)"),
    outputs=gr.Textbox(label="Gemma Analysis Result"),
    title="Gemma-3-27B Video Analysis (ZeroGPU)",
    # The frame count comes from NUM_FRAMES so the text cannot drift from the
    # value the sampler actually uses.
    description=(
        f"Uploads a video, extracts {NUM_FRAMES} frames, "
        "and uses Gemma-3-27B to analyze AR realism."
    ),
)
demo.launch()
|