AR_Testing / app.py
import os
import gradio as gr
import spaces
import torch
from decord import VideoReader, cpu
from PIL import Image
from transformers import pipeline

hf_token = os.environ.get("HUGGINGFACE_TOKEN")
model_id = "google/gemma-3-27b-it"
NUM_FRAMES = 8
# Sample N frames evenly spaced across the video
def sample_video_frames(video_path, num_frames=NUM_FRAMES):
    vr = VideoReader(video_path, ctx=cpu(0))
    total_frames = len(vr)
    # Evenly spaced frame indices across the whole clip
    indices = [int(i) for i in torch.linspace(0, total_frames - 1, steps=num_frames)]
    frames = [vr[i].asnumpy() for i in indices]
    # Convert raw numpy frames to PIL images so the pipeline can consume them
    pil_frames = [Image.fromarray(frame) for frame in frames]
    return pil_frames
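
# Illustrative usage only (not part of the app flow): calling the sampler directly on a
# local clip. "example.mp4" is a hypothetical path used just for this sketch.
#   frames = sample_video_frames("example.mp4")
#   assert len(frames) == NUM_FRAMES  # NUM_FRAMES PIL images, evenly spaced across the clip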
# Inference function: load the model, sample video frames, and run inference
@spaces.GPU
def analyze_video(video_file):
    # video_file is a path string provided by the Gradio Video component
    frames = sample_video_frames(video_file)
    # Build the chat-style prompt: a system message plus a user message that
    # interleaves the text instruction with the sampled frames
    system_prompt = (
        "You are a helpful AI assistant that analyzes AR effects in videos. "
        "Evaluate the realism and placement of virtual objects in the provided video frames."
    )
    user_prompt = "Based on the frames, describe how well the AR objects blend into the real environment."

    history = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_prompt}]
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": user_prompt}]
                       + [{"type": "image", "image": frame} for frame in frames]
        }
    ]
    # Load the Gemma 3 image-text-to-text pipeline (ZeroGPU allocates the GPU per call)
    pipe = pipeline(
        "image-text-to-text",
        model=model_id,
        token=hf_token,
        torch_dtype=torch.bfloat16,
        model_kwargs={"device_map": "auto"}
    )

    result = pipe(text=history, max_new_tokens=512)
    # The pipeline returns the full chat history; the last message is the model's reply
    return result[0]["generated_text"][-1]["content"]
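
# For reference, the chat-style output that the indexing above assumes looks roughly like
# the following (structure only, values are illustrative):
#   [{"generated_text": [
#       {"role": "system", "content": "..."},
#       {"role": "user", "content": "..."},
#       {"role": "assistant", "content": "<model's analysis>"}   # <- returned above
#   ]}]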
# Gradio interface
gr.Interface(
    fn=analyze_video,
    inputs=gr.Video(label="Upload an AR Video (.mp4 only)"),
    outputs=gr.Textbox(label="Gemma Analysis Result"),
    title="Gemma-3-27B Video Analysis (ZeroGPU)",
    description="Upload a video; the app extracts 8 frames and uses Gemma-3-27B to analyze AR realism."
).launch()
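
# To run this script outside the Space, a HUGGINGFACE_TOKEN with access to the gated
# Gemma 3 weights is assumed to be set in the environment, e.g.:
#   HUGGINGFACE_TOKEN=hf_xxx python app.py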