import os
import gradio as gr
import spaces
import torch
from PIL import Image
from decord import VideoReader, cpu
from transformers import pipeline

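# Model configuration: the Hugging Face token (typically set as a Space secret)
# is needed to download the gated Gemma 3 checkpoint.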
hf_token = os.environ.get("HUGGINGFACE_TOKEN")
model_id = "google/gemma-3-27b-it"
NUM_FRAMES = 8

# Sample N frames evenly spaced across the video
def sample_video_frames(video_path, num_frames=NUM_FRAMES):
    vr = VideoReader(video_path, ctx=cpu(0))
    total_frames = len(vr)
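    # Pick num_frames evenly spaced frame indices across the clip.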
    indices = [int(i) for i in torch.linspace(0, total_frames - 1, steps=num_frames)]
    frames = [vr[i].asnumpy() for i in indices]
    # Convert the raw numpy frames to PIL images, which the pipeline's image loader expects.
    pil_frames = [Image.fromarray(frame) for frame in frames]
    return pil_frames

# Inference function: sample video frames, build the prompt, load the model, and run inference
@spaces.GPU
def analyze_video(video_file):
    # Sample image frames from the uploaded video. Newer Gradio versions pass the
    # video as a file path string; older ones pass a tempfile-like object.
    video_path = video_file if isinstance(video_file, str) else video_file.name
    frames = sample_video_frames(video_path)

    # Build a single-turn prompt (adjust the evaluation criteria as needed)
    system_prompt = (
        "You are a helpful AI assistant that analyzes AR effects in videos. "
        "Evaluate the realism and placement of virtual objects in the provided video frames."
    )
    user_prompt = "Based on the frames, describe how well the AR objects blend into the real environment."

    # Build the chat-style input history (system turn plus a user turn with the sampled frames)
    history = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_prompt}]
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": user_prompt}] + [{"type": "image", "image": frame} for frame in frames]
        }
    ]

    # Load the pipeline and run inference
    pipe = pipeline(
        "image-text-to-text",
        model=model_id,
        token=hf_token,
        torch_dtype=torch.bfloat16,
        model_kwargs={"device_map": "auto"}
    )
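    # The pipeline returns one result per input; "generated_text" holds the full chat,
    # so the last message is the model's reply.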
    result = pipe(text=history, max_new_tokens=512)
    return result[0]["generated_text"][-1]["content"]

# Gradio interface
gr.Interface(
    fn=analyze_video,
    inputs=gr.Video(label="Upload an AR Video (.mp4 only)"),
    outputs=gr.Textbox(label="Gemma Analysis Result"),
    title="Gemma-3-27B Video Analysis (ZeroGPU)",
    description="Upload a video; the app samples 8 frames and uses Gemma-3-27B to analyze AR realism."
).launch()