# Source: Hugging Face Space "Chrisyichuan/Omniscient" (status: Running). File size: 7,648 bytes.

import gradio as gr
import json
import os
import time
from io import BytesIO
from PIL import Image

# 导入项目的核心逻辑和配置
from geo_bot import GeoBot, AGENT_PROMPT_TEMPLATE
from benchmark import MapGuesserBenchmark
from config import MODELS_CONFIG, DATA_PATHS
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI

# --- Global settings ---
# Mirror the API keys from the process environment (HF Secrets); a key that is
# not set ends up defined as an empty string.
for _api_key_name in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY"):
    os.environ.setdefault(_api_key_name, "")
# The Google key is currently not mirrored (line left disabled):
# os.environ['GOOGLE_API_KEY'] = os.environ.get("GOOGLE_API_KEY", "")

# Load the golden-label samples; fall back to an empty list (with a warning on
# stdout) when the data file does not exist.
try:
    labels_file = open(DATA_PATHS["golden_labels"], "r", encoding="utf-8")
except FileNotFoundError:
    print(f"警告: 数据文件 '{DATA_PATHS['golden_labels']}' 未找到。")
    GOLDEN_LABELS = []
else:
    with labels_file:
        GOLDEN_LABELS = json.load(labels_file).get("samples", [])


# --- Core processing function (streams UI updates via yield) ---
def run_agent_process(
    model_choice, steps_per_sample, sample_index, progress=gr.Progress(track_tqdm=True)
):
    """
    Run one GeoBot exploration episode and stream UI updates.

    This is a generator used as a Gradio event handler: every ``yield`` is a
    dict mapping a module-level UI component (``status_text``,
    ``image_output``, ``reasoning_output``, ``action_output``,
    ``result_output``) to its new value.

    Args:
        model_choice: Key into ``MODELS_CONFIG`` selecting the model.
        steps_per_sample: Maximum number of exploration steps; coerced to
            ``int`` before use.
        sample_index: Position of the sample in ``GOLDEN_LABELS``. Accepts an
            integer or a dropdown label such as ``"样本 3"``.
        progress: Gradio progress tracker; not used directly in this function.

    Yields:
        dict: Partial updates for the output components.
    """
    # 1. Reset every output component before any slow work starts.
    yield {
        status_text: "状态: 正在初始化浏览器和AI模型...",
        image_output: None,
        reasoning_output: "",
        action_output: "",
        result_output: "",
    }

    # Validate all inputs before a browser is launched, so a bad selection
    # produces a status message instead of a traceback or a leaked browser.
    config = MODELS_CONFIG.get(model_choice)
    if config is None:
        yield {status_text: f"错误: 未知的模型 '{model_choice}'。"}
        return

    sample_position = _parse_sample_index(sample_index)
    if sample_position is None:
        yield {status_text: "错误: 无效的测试样本，请重新选择。"}
        return
    try:
        sample = GOLDEN_LABELS[sample_position]
    except IndexError:
        # Also covers an empty GOLDEN_LABELS (data file missing at startup).
        yield {status_text: "错误: 无效的测试样本，请重新选择。"}
        return

    steps_per_sample = int(steps_per_sample)

    # The config stores the model class by name; it is looked up among the
    # classes imported at module level.
    model_class = globals()[config["class"]]
    bot = GeoBot(model=model_class, model_name=config["model_name"], headless=True)
    try:
        yield from _explore_and_report(bot, sample, steps_per_sample)
    finally:
        # Always release the browser: on success, on the early "load failed"
        # return, on exceptions, and when Gradio closes the generator.
        bot.close()


def _parse_sample_index(sample_index):
    """Return the integer sample position for an int or a label like "样本 3", else None."""
    if isinstance(sample_index, int):
        return sample_index
    if isinstance(sample_index, str):
        parts = sample_index.split()
        if not parts:
            return None
        try:
            # The label format is "<prefix> <number>"; the number is the last token.
            return int(parts[-1])
        except ValueError:
            return None
    return None


def _explore_and_report(bot, sample, steps_per_sample):
    """Yield UI updates for the observe/think/act loop and the final result of one sample."""
    # 2. Load the selected sample location.
    ground_truth = {"lat": sample.get("lat"), "lng": sample.get("lng")}

    if not bot.controller.load_location_from_data(sample):
        yield {status_text: "错误: 加载地图位置失败。请重试。"}
        return

    bot.controller.setup_clean_environment()

    history = []
    final_guess = None

    # 3. Multi-step exploration loop.
    for step in range(steps_per_sample):
        step_num = step + 1
        yield {status_text: f"状态: 探索中... (第 {step_num}/{steps_per_sample} 步)"}

        # a. Observe
        bot.controller.label_arrows_on_screen()
        screenshot_bytes = bot.controller.take_street_view_screenshot()

        # b. Think
        current_screenshot_b64 = bot.pil_to_base64(
            Image.open(BytesIO(screenshot_bytes))
        )
        # The action is filled in below once the model has decided.
        history.append({"image_b64": current_screenshot_b64, "action": "N/A"})

        prompt = AGENT_PROMPT_TEMPLATE.format(
            remaining_steps=steps_per_sample - step,
            history_text="\n".join(
                [f"Step {j + 1}: {h['action']}" for j, h in enumerate(history)]
            ),
            available_actions=json.dumps(bot.controller.get_available_actions()),
        )
        message = bot._create_message_with_history(
            prompt, [h["image_b64"] for h in history]
        )
        response = bot.model.invoke(message)
        decision = bot._parse_agent_response(response)

        if not decision:
            # No usable decision came back: fall back to a harmless pan.
            decision = {
                "action_details": {"action": "PAN_RIGHT"},
                "reasoning": "Default recovery.",
            }

        action = decision.get("action_details", {}).get("action")
        reasoning = decision.get("reasoning", "N/A")
        history[-1]["action"] = action

        # c. Update the UI with this step's view and decision.
        yield {
            image_output: Image.open(BytesIO(screenshot_bytes)),
            reasoning_output: f"**AI Reasoning:**\n\n{reasoning}",
            action_output: f"**AI Action:** `{action}`",
        }

        # d. Force a guess on the last step.
        if step_num == steps_per_sample and action != "GUESS":
            action = "GUESS"
            yield {status_text: "状态: 已达最大步数，强制执行GUESS..."}

        # e. Act
        if action == "GUESS":
            # Coordinates are read from the model's decision; a forced guess
            # only yields a result if the decision happened to carry them.
            lat, lon = (
                decision.get("action_details", {}).get("lat"),
                decision.get("action_details", {}).get("lon"),
            )
            if lat is not None and lon is not None:
                final_guess = (lat, lon)
            break
        elif action == "MOVE_FORWARD":
            bot.controller.move("forward")
        elif action == "MOVE_BACKWARD":
            bot.controller.move("backward")
        elif action == "PAN_LEFT":
            bot.controller.pan_view("left")
        elif action == "PAN_RIGHT":
            bot.controller.pan_view("right")

        time.sleep(1)  # brief pause between steps

    # 4. Loop finished: compute and show the final result.
    yield {status_text: "状态: 探索完成，正在计算最终结果..."}

    if final_guess:
        distance = bot.calculate_distance(ground_truth, final_guess)
        result_text = f"""
        ### 📍 最终结果
        - **真实位置:** `Lat: {ground_truth["lat"]:.4f}, Lon: {ground_truth["lng"]:.4f}`
        - **Agent猜测:** `Lat: {final_guess[0]:.4f}, Lon: {final_guess[1]:.4f}`
        - **距离误差:** `{distance:.1f} km`
        """
        yield {result_output: result_text, status_text: "状态: 完成！"}
    else:
        yield {
            result_output: "### 📍 最终结果\n\nAgent 未能做出有效猜测。",
            status_text: "状态: 完成！",
        }


# --- Gradio UI layout ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🗺️ 可视化 GeoBot 智能体")
    gr.Markdown("选择配置并启动Agent，观察它如何通过探索来猜测自己的地理位置。")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## ⚙️ 控制面板")
            # Prefer "gpt-4o" as the default, but only when it is actually
            # configured; otherwise fall back to the first configured model.
            _model_names = list(MODELS_CONFIG.keys())
            if "gpt-4o" in _model_names:
                _default_model = "gpt-4o"
            else:
                _default_model = _model_names[0] if _model_names else None
            model_choice = gr.Dropdown(
                _model_names, label="选择AI模型", value=_default_model
            )
            steps_per_sample = gr.Slider(
                3, 20, value=10, step=1, label="每轮最大探索步数"
            )
            # type="index" makes the handler receive the position of the
            # selected label (0 for "样本 0") rather than the label text, so it
            # can index GOLDEN_LABELS directly.
            _sample_labels = [f"样本 {i}" for i in range(len(GOLDEN_LABELS))]
            sample_index = gr.Dropdown(
                _sample_labels,
                label="选择测试样本",
                # No default when the golden-labels file failed to load.
                value=_sample_labels[0] if _sample_labels else None,
                type="index",
            )
            start_button = gr.Button("🚀 启动智能体", variant="primary")
            status_text = gr.Markdown("状态: 等待启动")
            result_output = gr.Markdown()

        with gr.Column(scale=3):
            gr.Markdown("## 🕵️ Agent探索过程")
            image_output = gr.Image(label="Agent当前视角", height=600)
            with gr.Row():
                reasoning_output = gr.Markdown(label="AI 思考")
                action_output = gr.Markdown(label="AI 行动")

    # Connect the button to the streaming handler.
    # No `js` hook is attached: when a Python `fn` is present, the value
    # returned by the hook is forwarded as the handler's inputs, so returning
    # output placeholders there would overwrite the model/steps/sample
    # selections. The handler's first yield already resets the outputs.
    start_button.click(
        fn=run_agent_process,
        inputs=[model_choice, steps_per_sample, sample_index],
        outputs=[
            status_text,
            image_output,
            reasoning_output,
            action_output,
            result_output,
        ],
    )

if __name__ == "__main__":
    demo.launch()