Omniscient / app.py
Andy Lee
feat: real hf studio gui
d6d949c
raw
history blame
7.65 kB
import gradio as gr
import json
import os
import time
from io import BytesIO
from PIL import Image
# 导入项目的核心逻辑和配置
from geo_bot import GeoBot, AGENT_PROMPT_TEMPLATE
from benchmark import MapGuesserBenchmark
from config import MODELS_CONFIG, DATA_PATHS
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI
# --- 全局设置 ---
# 从HF Secrets安全地读取API密钥
os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY", "")
os.environ["ANTHROPIC_API_KEY"] = os.environ.get("ANTHROPIC_API_KEY", "")
# os.environ['GOOGLE_API_KEY'] = os.environ.get("GOOGLE_API_KEY", "")
# 加载golden labels数据
try:
with open(DATA_PATHS["golden_labels"], "r", encoding="utf-8") as f:
GOLDEN_LABELS = json.load(f).get("samples", [])
except FileNotFoundError:
print(f"警告: 数据文件 '{DATA_PATHS['golden_labels']}' 未找到。")
GOLDEN_LABELS = []
# --- 核心处理函数 (使用yield实现流式更新) ---
def run_agent_process(
model_choice, steps_per_sample, sample_index, progress=gr.Progress(track_tqdm=True)
):
"""
这个函数是整个应用的引擎,它是一个生成器 (generator),会逐步yield更新。
"""
# 1. 初始化环境
yield {
status_text: "状态: 正在初始化浏览器和AI模型...",
image_output: None,
reasoning_output: "",
action_output: "",
result_output: "",
}
config = MODELS_CONFIG.get(model_choice)
model_class = globals()[config["class"]]
model_instance_name = config["model_name"]
bot = GeoBot(model=model_class, model_name=model_instance_name, headless=True)
# 2. 加载选定的样本位置
sample = GOLDEN_LABELS[sample_index]
ground_truth = {"lat": sample.get("lat"), "lng": sample.get("lng")}
if not bot.controller.load_location_from_data(sample):
yield {status_text: "错误: 加载地图位置失败。请重试。"}
return
bot.controller.setup_clean_environment()
history = []
final_guess = None
# 3. 开始多步探索循环
for step in range(steps_per_sample):
step_num = step + 1
yield {status_text: f"状态: 探索中... (第 {step_num}/{steps_per_sample} 步)"}
# a. 观察 (Observe)
bot.controller.label_arrows_on_screen()
screenshot_bytes = bot.controller.take_street_view_screenshot()
# b. 思考 (Think)
current_screenshot_b64 = bot.pil_to_base64(
Image.open(BytesIO(screenshot_bytes))
)
history.append({"image_b64": current_screenshot_b64, "action": "N/A"})
prompt = AGENT_PROMPT_TEMPLATE.format(
remaining_steps=steps_per_sample - step,
history_text="\n".join(
[f"Step {j + 1}: {h['action']}" for j, h in enumerate(history)]
),
available_actions=json.dumps(bot.controller.get_available_actions()),
)
message = bot._create_message_with_history(
prompt, [h["image_b64"] for h in history]
)
response = bot.model.invoke(message)
decision = bot._parse_agent_response(response)
if not decision:
decision = {
"action_details": {"action": "PAN_RIGHT"},
"reasoning": "Default recovery.",
}
action = decision.get("action_details", {}).get("action")
reasoning = decision.get("reasoning", "N/A")
history[-1]["action"] = action
# c. 更新UI
yield {
image_output: Image.open(BytesIO(screenshot_bytes)),
reasoning_output: f"**AI Reasoning:**\n\n{reasoning}",
action_output: f"**AI Action:** `{action}`",
}
# d. 强制在最后一步猜测
if step_num == steps_per_sample and action != "GUESS":
action = "GUESS"
yield {status_text: "状态: 已达最大步数,强制执行GUESS..."}
# e. 行动 (Act)
if action == "GUESS":
lat, lon = (
decision.get("action_details", {}).get("lat"),
decision.get("action_details", {}).get("lon"),
)
if lat is not None and lon is not None:
final_guess = (lat, lon)
break
elif action == "MOVE_FORWARD":
bot.controller.move("forward")
elif action == "MOVE_BACKWARD":
bot.controller.move("backward")
elif action == "PAN_LEFT":
bot.controller.pan_view("left")
elif action == "PAN_RIGHT":
bot.controller.pan_view("right")
time.sleep(1) # 步骤间稍作停顿
# 4. 循环结束,计算最终结果并更新UI
yield {status_text: "状态: 探索完成,正在计算最终结果..."}
if final_guess:
distance = bot.calculate_distance(ground_truth, final_guess)
result_text = f"""
### 📍 最终结果
- **真实位置:** `Lat: {ground_truth["lat"]:.4f}, Lon: {ground_truth["lng"]:.4f}`
- **Agent猜测:** `Lat: {final_guess[0]:.4f}, Lon: {final_guess[1]:.4f}`
- **距离误差:** `{distance:.1f} km`
"""
yield {result_output: result_text, status_text: "状态: 完成!"}
else:
yield {
result_output: "### 📍 最终结果\n\nAgent 未能做出有效猜测。",
status_text: "状态: 完成!",
}
bot.close() # 关闭浏览器
# --- Gradio UI 布局 ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
gr.Markdown("# 🗺️ 可视化 GeoBot 智能体")
gr.Markdown("选择配置并启动Agent,观察它如何通过探索来猜测自己的地理位置。")
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("## ⚙️ 控制面板")
model_choice = gr.Dropdown(
list(MODELS_CONFIG.keys()), label="选择AI模型", value="gpt-4o"
)
steps_per_sample = gr.Slider(
3, 20, value=10, step=1, label="每轮最大探索步数"
)
sample_index = gr.Dropdown(
[f"样本 {i}" for i in range(len(GOLDEN_LABELS))],
label="选择测试样本",
value="样本 0",
)
start_button = gr.Button("🚀 启动智能体", variant="primary")
status_text = gr.Markdown("状态: 等待启动")
result_output = gr.Markdown()
with gr.Column(scale=3):
gr.Markdown("## 🕵️ Agent探索过程")
image_output = gr.Image(label="Agent当前视角", height=600)
with gr.Row():
reasoning_output = gr.Markdown(label="AI 思考")
action_output = gr.Markdown(label="AI 行动")
# 将按钮点击事件连接到核心函数
# `lambda s: int(s.split(' ')[1])` 用于从"样本 0"中提取出数字0
start_button.click(
fn=run_agent_process,
inputs=[model_choice, steps_per_sample, sample_index],
outputs=[
status_text,
image_output,
reasoning_output,
action_output,
result_output,
],
# `js` 参数用于在点击按钮后禁用它,防止重复点击
js="""
(model_choice, steps_per_sample, sample_index) => {
return [
"状态: 初始化中...",
null,
"...",
"...",
""
];
}
""",
)
if __name__ == "__main__":
demo.launch()