import glob
import json
import os
import re

import gradio as gr
import pandas as pd

# --- Constants and Configuration ---
# Set the path to your data directory.
# The script will change its working directory to this path if it exists.
ABS_DATA_PATH = "data"
if os.path.exists(ABS_DATA_PATH):
    os.chdir(ABS_DATA_PATH)

# After the chdir above, the data root is simply the current directory.
AITW_DATA_ROOT = "."
MODEL_DISPLAY_MAPPING = {
    "gpt": "OpenAI o1",
    "gemini": "Gemini 2.5 Pro",
    "qwen": "Qwen 2.5 VL 72B",
}
MODELS_IN_ORDER = ["gpt", "gemini", "qwen"]
MAX_CARDS_TO_DISPLAY = 50  # Let's create placeholders for up to 50 items.


# --- Data Loading Logic ---
def _resolve_correct_option(options, answer_index):
    """Return the option text at ``answer_index``, or "N/A" if it is unusable."""
    try:
        index = int(answer_index)
    except (TypeError, ValueError):
        # Missing (None) or non-numeric index: there is no answer to show.
        return "N/A"
    if 0 <= index < len(options):
        return options[index]
    return "N/A"


def load_and_prepare_data(data_root_path):
    """Load step data from JSON files and prepare it as a list of dictionaries.

    Every ``*.json`` file in the first model's directory
    (``<data_root_path>/<MODELS_IN_ORDER[0]>``) is read. Each file is expected
    to map episode ids to episode dicts holding ``"episode_goal"`` and a list
    of ``"steps"``; each step holds a ``"questions"`` dict (``"question"``,
    ``"options"``, ``"correct_answer_index"``) and a ``"screenshot_path"``.

    Args:
        data_root_path: Directory containing one sub-directory per model key.

    Returns:
        A list with one dict per step, with the keys ``"episode_goal"``,
        ``"question"``, ``"options"``, ``"correct_option"`` and
        ``"image_paths"`` (model key -> screenshot path under that model's
        directory). Returns an empty list when the primary model directory
        does not exist.
    """
    primary_model_dir = os.path.join(data_root_path, MODELS_IN_ORDER[0])
    if not os.path.isdir(primary_model_dir):
        print(f"Error: Primary model directory not found at '{primary_model_dir}'")
        return []

    all_steps = []
    # glob returns files in arbitrary order; sort so the cards are displayed
    # in the same order on every run and on every filesystem.
    json_files = sorted(glob.glob(os.path.join(primary_model_dir, "*.json")))
    for json_path in json_files:
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        for episode_id, episode_data in data.items():
            for step in episode_data.get("steps") or []:
                # "or" fallbacks tolerate keys that are present but null.
                question_block = step.get("questions") or {}
                options = question_block.get("options") or []
                correct_option_text = _resolve_correct_option(
                    options, question_block.get("correct_answer_index")
                )

                # Strip leading slashes so os.path.join treats the screenshot
                # path as relative to each model's directory.
                base_screenshot_path = (step.get("screenshot_path") or "").lstrip("/")
                image_paths = {
                    model_key: os.path.join(
                        data_root_path, model_key, base_screenshot_path
                    )
                    for model_key in MODELS_IN_ORDER
                }

                all_steps.append(
                    {
                        "episode_goal": episode_data.get("episode_goal", "N/A"),
                        "question": question_block.get("question", "N/A"),
                        "options": options,
                        "correct_option": correct_option_text,
                        "image_paths": image_paths,
                    }
                )
    return all_steps


# --- CSS for a
# better, full-width layout ---
app_css = """
.gradio-container { max-width: 95% !important; }
.comparison-card { border: 1px solid #E5E7EB; border-radius: 8px; padding: 1rem; margin-bottom: 1.5rem; box-shadow: 0 1px 3px 0 rgba(0,0,0,0.1), 0 1px 2px 0 rgba(0,0,0,0.06); }
.card-title { font-size: 1.1rem; font-weight: 600; color: #1F2937; border-bottom: 1px solid #F3F4F6; padding-bottom: 0.5rem; margin-bottom: 1rem; }
.info-column { min-width: 300px; }
.image-column .label-wrapper { display: none !important; }
.model-title { text-align: center; font-weight: 500; color: #4B5563; }
"""


def _format_step_markdown(step_data):
    """Render a step's question, numbered options and correct answer as Markdown."""
    # Number the options explicitly so each one gets its own list item.
    options_md = "\n".join(
        f"{number}. {option}"
        for number, option in enumerate(step_data["options"], start=1)
    )
    return (
        f"**Question:**\n\n{step_data['question']}\n\n"
        f"**Options:**\n\n{options_md}\n\n"
        f"**Correct Answer:**\n\n{step_data['correct_option']}\n"
    )


# --- Gradio Interface ---
with gr.Blocks(
    theme=gr.themes.Default(spacing_size=gr.themes.sizes.spacing_sm),
    css=app_css,
) as demo:
    gr.Markdown("# AITW Benchmark Visualizer")
    gr.Markdown(
        "Visual comparison of model outputs for the Android in the Wild (AITW) benchmark."
    )

    # --- Create Static Placeholders ---
    # We create a fixed number of hidden cards and later make them visible
    # with data, because event outputs must refer to existing components.
    placeholder_components = []
    for _ in range(MAX_CARDS_TO_DISPLAY):
        with gr.Group(visible=False) as card_group:
            card_title = gr.Markdown(elem_classes=["card-title"])
            with gr.Row():
                with gr.Column(scale=1, elem_classes=["info-column"]):
                    info_md = gr.Markdown()
                with gr.Column(scale=3):
                    with gr.Row():
                        image_outputs = {}
                        for model_key in MODELS_IN_ORDER:
                            with gr.Column(elem_classes=["image-column"]):
                                # Styled by the .model-title rule in app_css.
                                gr.Markdown(
                                    MODEL_DISPLAY_MAPPING[model_key],
                                    elem_classes=["model-title"],
                                )
                                image_outputs[model_key] = gr.Image(
                                    show_label=False,
                                    show_download_button=True,
                                    interactive=False,
                                    height=350,
                                    show_fullscreen_button=True,
                                )
        placeholder_components.append(
            {
                "card": card_group,
                "title": card_title,
                "info": info_md,
                "images": image_outputs,
            }
        )

    # --- Function to update the placeholders ---
    def load_and_update_ui():
        """Load the step data and return one update per placeholder component.

        The returned list is flat and follows the order of
        ``output_components_flat``: for every card, the group, the title, the
        info Markdown and then one image per entry of ``MODELS_IN_ORDER``.
        Cards without data are hidden and cleared.
        """
        print("Loading and preparing AITW data...")
        all_steps = load_and_prepare_data(AITW_DATA_ROOT)
        if not all_steps:
            gr.Warning(
                f"No data loaded. Please check that the '{AITW_DATA_ROOT}' directory is structured correctly."
            )
        else:
            print(f"Successfully loaded {len(all_steps)} steps. Updating UI...")

        # Create a flat list of updates for all components.
        updates = []
        steps_to_show = all_steps[:MAX_CARDS_TO_DISPLAY]
        for step_data in steps_to_show:
            # Card visibility and title.
            updates.append(gr.update(visible=True))
            updates.append(
                gr.update(value=f"### Main Goal: {step_data['episode_goal']}")
            )
            # Text info.
            updates.append(gr.update(value=_format_step_markdown(step_data)))
            # Images: only pass real files; a missing screenshot (or a path
            # that resolves to a directory) leaves the image slot empty.
            for model_key in MODELS_IN_ORDER:
                img_path = step_data["image_paths"].get(model_key)
                has_image = bool(img_path) and os.path.isfile(img_path)
                updates.append(gr.update(value=img_path if has_image else None))

        # Hide and clear the placeholder cards that have no data.
        for _ in range(MAX_CARDS_TO_DISPLAY - len(steps_to_show)):
            updates.append(gr.update(visible=False))  # Card group
            updates.append(gr.update(value=""))  # Title
            updates.append(gr.update(value=""))  # Info MD
            updates.extend(gr.update(value=None) for _ in MODELS_IN_ORDER)  # Images
        return updates

    # --- Flatten the list of placeholder components for the 'outputs' argument ---
    # The order here must match the order of updates built in load_and_update_ui.
    output_components_flat = []
    for comp_dict in placeholder_components:
        output_components_flat.append(comp_dict["card"])
        output_components_flat.append(comp_dict["title"])
        output_components_flat.append(comp_dict["info"])
        for model_key in MODELS_IN_ORDER:
            output_components_flat.append(comp_dict["images"][model_key])

    # --- Event Wiring ---
    demo.load(fn=load_and_update_ui, inputs=None, outputs=output_components_flat)

if __name__ == "__main__":
    demo.launch(share=True, debug=True)