Spaces:

LoufAn
/

AR_Testing

Sleeping

App Files Community

LoufAn committed 17 days ago

Commit

60e3950

1 Parent(s): 460ce6d

Update app.py

Browse files

Files changed (1) hide show

app.py +56 -14

app.py CHANGED Viewed

@@ -1,24 +1,66 @@
 import os
 import gradio as gr
 import spaces
-from transformers import AutoTokenizer, AutoModelForCausalLM, TextGenerationPipeline
-model_id = "google/gemma-3-27b-it"
 hf_token = os.environ.get("HUGGINGFACE_TOKEN")
-# 包含模型加载 + 推理
 @spaces.GPU
-def generate(prompt):
-    tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
-    model = AutoModelForCausalLM.from_pretrained(model_id, token=hf_token, device_map="auto")
-    pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer)
-    output = pipeline(prompt, max_new_tokens=100, do_sample=True, temperature=0.7)
-    return output[0]["generated_text"]
-# 构建界面
 gr.Interface(
-    fn=generate,
-    inputs=gr.Text(label="Enter your prompt"),
-    outputs=gr.Textbox(label="Generated Text"),
-    title="Gemma-3-27B Inference (ZeroGPU)"
 ).launch()

 import os
 import gradio as gr
 import spaces
+import torch
+import tempfile
+import imageio
+from decord import VideoReader, cpu
+from transformers import pipeline
 hf_token = os.environ.get("HUGGINGFACE_TOKEN")
+# Model identifier handed to the transformers pipeline as its `model` argument.
+model_id = "google/gemma-3-27b-it"
+# Default number of frames sampled from each uploaded video.
+NUM_FRAMES = 8
+def sample_video_frames(video_path, num_frames=NUM_FRAMES):
+    """Sample evenly spaced frames from a video file.
+
+    Args:
+        video_path: Path of the video, opened with decord's ``VideoReader``
+            using a CPU context.
+        num_frames: Number of frame positions to sample. Positions are spread
+            evenly from the first to the last frame and truncated to integers,
+            so a video shorter than ``num_frames`` yields repeated frames.
+
+    Returns:
+        A list of ``imageio.core.util.Array`` objects, one per sampled frame.
+
+    Raises:
+        ValueError: If the video reports zero frames.
+    """
+    reader = VideoReader(video_path, ctx=cpu(0))
+    total_frames = len(reader)
+    if total_frames == 0:
+        # Without this guard linspace(0, -1, ...) would produce meaningless indices.
+        raise ValueError(f"Video contains no frames: {video_path!r}")
+    # Truncate the evenly spaced float positions to integer frame indices.
+    positions = torch.linspace(0, total_frames - 1, steps=num_frames).long().tolist()
+    # NOTE(review): these wrap the decoded numpy data in imageio arrays, not PIL
+    # images — confirm the consumer of these frames accepts that type.
+    return [imageio.core.util.Array(reader[idx].asnumpy()) for idx in positions]
+# Inference entry point: sample frames from the upload, load the model, run generation.
 @spaces.GPU
+def analyze_video(video_file):
+    """Ask the model how well AR objects blend into an uploaded video.
+
+    Args:
+        video_file: The value Gradio passes for the ``gr.Video`` input. A plain
+            filepath string is accepted, as is an object exposing the path
+            through a ``.name`` attribute.
+
+    Returns:
+        The ``content`` of the last message in ``generated_text`` of the first
+        pipeline result.
+
+    Raises:
+        gr.Error: If no video was provided.
+    """
+    if video_file is None:
+        raise gr.Error("Please upload a video before running the analysis.")
+    # gr.Video hands the callback a filepath string, which has no `.name`;
+    # fall back to the value itself so both strings and file-like wrappers work.
+    video_path = getattr(video_file, "name", video_file)
+    # Sample image frames from the uploaded video.
+    frames = sample_video_frames(video_path)
+    # Single-turn prompt (adjust to whatever evaluation you need).
+    system_prompt = (
+        "You are a helpful AI assistant that analyzes AR effects in videos. "
+        "Evaluate the realism and placement of virtual objects in the provided video frames."
+    )
+    user_prompt = "Based on the frames, describe how well the AR objects blend into the real environment."
+    # Chat history: a system message plus one user message carrying the text
+    # prompt followed by one image entry per sampled frame.
+    history = [
+        {
+            "role": "system",
+            "content": [{"type": "text", "text": system_prompt}]
+        },
+        {
+            "role": "user",
+            "content": [{"type": "text", "text": user_prompt}] + [{"type": "image", "image": frame} for frame in frames]
+        }
+    ]
+    # NOTE(review): the pipeline is rebuilt on every call; presumably so that model
+    # loading happens inside the @spaces.GPU-decorated function — confirm.
+    pipe = pipeline(
+        "image-text-to-text",
+        model=model_id,
+        token=hf_token,
+        torch_dtype=torch.bfloat16,
+        model_kwargs={"device_map": "auto"}
+    )
+    result = pipe(text=history, max_new_tokens=512)
+    # Presumably the last message of the returned conversation is the model's reply.
+    return result[0]["generated_text"][-1]["content"]
+# Gradio interface: a video upload input routed to analyze_video, with a text output.
+# The interface is launched as soon as this module runs.
 gr.Interface(
+    fn=analyze_video,
+    inputs=gr.Video(label="Upload an AR Video (.mp4 only)"),
+    outputs=gr.Textbox(label="Gemma Analysis Result"),
+    title="Gemma-3-27B Video Analysis (ZeroGPU)",
+    description="Uploads a video, extracts 8 frames, and uses Gemma-3-27B to analyze AR realism."
 ).launch()