File size: 5,572 Bytes
10e9b7d
3c4371f
4ca33ca
 
2f798b2
ceee7cf
068bf30
 
4ca33ca
 
 
ceee7cf
4ca33ca
 
 
ceee7cf
9c7bde9
068bf30
4ca33ca
 
ceee7cf
 
068bf30
4ca33ca
 
33eedd4
4ca33ca
 
 
 
 
 
 
 
 
 
31243f4
4ca33ca
31243f4
4ca33ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7e4a06b
4ca33ca
e80aab9
4ca33ca
 
 
 
 
 
 
 
 
 
 
e80aab9
4ca33ca
 
 
 
 
 
e80aab9
 
4ca33ca
e80aab9
4ca33ca
0ee0419
e514fd7
4ca33ca
 
 
 
 
 
 
 
 
068bf30
e514fd7
e80aab9
 
7e4a06b
e80aab9
4ca33ca
 
 
e80aab9
4ca33ca
 
 
 
e80aab9
4ca33ca
 
 
 
 
 
 
 
e80aab9
 
 
4ca33ca
 
 
3c4371f
ceee7cf
7d65c66
3c4371f
4ca33ca
 
3c4371f
4ca33ca
7d65c66
ceee7cf
4ca33ca
 
 
7d65c66
4ca33ca
 
 
 
 
068bf30
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import os
import pandas as pd
import gradio as gr
import logging
import time


# Import the new Settings, Evaluator, and Runner classes
from settings import Settings
from evaluator import Evaluator
from runner import Runner

# Configure root logging at INFO. force=True makes basicConfig remove any
# handlers already attached to the root logger before installing its own.
logging.basicConfig(level=logging.INFO, force=True)
logger = logging.getLogger(__name__)

# Module-level singletons used by the UI handlers below. The evaluator and
# the runner are both constructed from the same Settings instance.
settings = Settings()
evaluator = Evaluator(settings)
runner = Runner(settings)



# Status text returned by the handlers when no Hugging Face profile is present.
LOGIN_MESSAGE = "Please Login to Hugging Face with the button."
# Empty table returned together with login/error messages.
# NOTE(review): the columns presumably mirror what runner.run_agent returns — confirm.
EMPTY_RESULTS_TABLE = pd.DataFrame(columns=['task_id', 'question', 'answer'])
        
def _format_elapsed_time(elapsed_time):
    """Formats elapsed time into minutes and seconds."""
    minutes = int(elapsed_time // 60)
    seconds = elapsed_time % 60
    if minutes > 0:
        return f"Elapsed time: {minutes} minutes {seconds:.2f} seconds"
    else:
        return f"Elapsed time: {seconds:.2f} seconds"
    
def _run_agent_on_questions(questions_list: list, username: str) -> tuple[str, pd.DataFrame]:
    """Run the agent on a list of questions and report how long it took.

    Args:
        questions_list: Questions handed unchanged to ``runner.run_agent``.
        username: Passed through to ``runner.run_agent`` and included in the
            start-of-run log line.

    Returns:
        A ``(message, results)`` tuple: ``message`` is the completion text
        including the elapsed time, ``results`` is whatever
        ``runner.run_agent`` returned.

    Raises:
        Anything raised by ``runner.run_agent`` propagates to the caller.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way a time.time() difference can.
    started = time.perf_counter()
    logger.info(
        "Starting agent run for user: %s on %d questions.",
        username,
        len(questions_list),
    )

    # The runner handles the agent execution and saving of answers.
    question_answer_pairs_df = runner.run_agent(questions_list, username)

    elapsed_time_str = _format_elapsed_time(time.perf_counter() - started)
    message = f"Agent run complete. {elapsed_time_str}"
    logger.info(message)
    return message, question_answer_pairs_df
    
def run_one(profile: gr.OAuthProfile | None) -> tuple[str, pd.DataFrame]:
    """Run the agent on a single question obtained from the evaluator.

    Fetching the question and running the agent are guarded separately so
    the status message names the stage that actually failed; a failure while
    running the agent is no longer reported as a question-fetch error.

    Args:
        profile: The logged-in user's OAuth profile, or ``None`` when nobody
            is logged in.

    Returns:
        A ``(status, table)`` tuple. When not logged in or on any error, the
        status carries the explanation and the table is
        ``EMPTY_RESULTS_TABLE``.
    """
    if not profile:
        return LOGIN_MESSAGE, EMPTY_RESULTS_TABLE

    try:
        question = evaluator.get_one_question()
    except Exception as e:
        # UI boundary: report the failure in the status box instead of
        # raising; logger.exception keeps the traceback in the logs.
        logger.exception("Error getting one question: %s", e)
        return f"Error getting question: {e}", EMPTY_RESULTS_TABLE

    try:
        return _run_agent_on_questions([question], profile.username)
    except Exception as e:
        logger.exception("Error running agent on one question: %s", e)
        return f"Error running agent: {e}", EMPTY_RESULTS_TABLE

def run_all(profile: gr.OAuthProfile | None) -> tuple[str, pd.DataFrame]:
    """Run the agent on every question returned by the evaluator.

    Fetching the questions and running the agent are guarded separately so
    the status message names the stage that actually failed; a failure while
    running the agent is no longer reported as a question-fetch error.

    Args:
        profile: The logged-in user's OAuth profile, or ``None`` when nobody
            is logged in.

    Returns:
        A ``(status, table)`` tuple. When not logged in or on any error, the
        status carries the explanation and the table is
        ``EMPTY_RESULTS_TABLE``.
    """
    if not profile:
        return LOGIN_MESSAGE, EMPTY_RESULTS_TABLE

    try:
        questions = evaluator.get_questions()
    except Exception as e:
        # UI boundary: report the failure in the status box instead of
        # raising; logger.exception keeps the traceback in the logs.
        logger.exception("Error getting all questions: %s", e)
        return f"Error getting questions: {e}", EMPTY_RESULTS_TABLE

    try:
        return _run_agent_on_questions(questions, profile.username)
    except Exception as e:
        logger.exception("Error running agent on all questions: %s", e)
        return f"Error running agent: {e}", EMPTY_RESULTS_TABLE

def submit(profile: gr.OAuthProfile | None) -> str:
    """Submit the user's answers through the evaluator.

    Args:
        profile: The logged-in user's OAuth profile, or ``None`` when nobody
            is logged in.

    Returns:
        ``LOGIN_MESSAGE`` when there is no profile; otherwise whatever
        ``evaluator.submit_answers`` returns for the profile's username.
    """
    # Guard clause: nothing can be submitted without a logged-in user.
    if not profile:
        return LOGIN_MESSAGE
    return evaluator.submit_answers(profile.username)


# --- Build Gradio Interface using Blocks ---
# Components are declared in display order inside the Blocks context; `demo`
# is the object launched from the __main__ guard at the bottom of the file.
with gr.Blocks() as demo:
    gr.Markdown("# GAIA Agent Evaluation Runner")
    # User-facing instructions and disclaimers, rendered as Markdown.
    gr.Markdown(
        """
        **Instructions:**
        1.  Log in to your Hugging Face account using the button below. 
        2.  Click 'Get One Answer' to run the agent on a random question or 'Get All Answers' to run all. 
        3.  Click 'Submit Answers' to submit answers for evaluation. **Your HF username will be submitted for leaderboard tracking.**
        ---
        **Disclaimers:**
        * Running 'Get All Answers' can take significant time as the agent processes all 20 questions.
        * Agent logs are detailed (DEBUG level) and may appear interleaved due to parallel execution.
        * The 'Submit Answers' button uses the most recent agent answers cached locally for your username.
        * **API Keys Required:** Ensure `GEMINI_API_KEY` is set as a Space Secret (or environment variable if running locally).
        """
    )

    # Hugging Face login button. The handlers wired below each take a
    # `gr.OAuthProfile | None` parameter.
    # NOTE(review): presumably Gradio fills that parameter from the type hint — confirm against the Gradio OAuth docs.
    gr.LoginButton()

    # One button per action.
    run_one_button = gr.Button("Get One Answer")
    run_all_button = gr.Button("Get All Answers")
    submit_button = gr.Button("Submit Answers")

    # Shared outputs: a read-only status box and the results table.
    status_output = gr.Textbox(
        label="Run Status / Submission Result", lines=5, interactive=False)
    results_table = gr.DataFrame(
        label="Questions and Agent Answers", wrap=True)

    # Event wiring. No `inputs=` are passed. The run handlers return
    # (status, table); submit returns only the status string.
    run_one_button.click(
        fn=run_one, outputs=[status_output, results_table]
    )
    run_all_button.click(
        fn=run_all, outputs=[status_output, results_table]
    )
    submit_button.click(
        fn=submit, outputs=[status_output]
    )

if __name__ == "__main__":
    # Startup banner followed by purely informational environment checks;
    # nothing logged here changes how the app is launched.
    logger.info("\n" + "-"*30 + " App Starting " + "-"*30)

    # Check for SPACE_HOST and SPACE_ID at startup for information
    space_host_startup = os.getenv("SPACE_HOST")
    space_id_startup = os.getenv("SPACE_ID")

    if space_host_startup:
        logger.info(f"✅ SPACE_HOST found: {space_host_startup}")
        # SPACE_HOST may already be a full host name ending in ".hf.space"
        # (the Hugging Face Spaces docs show it that way). Append the suffix
        # only when it is missing, so the logged URL is never
        # "...hf.space.hf.space".
        if space_host_startup.endswith(".hf.space"):
            runtime_host = space_host_startup
        else:
            runtime_host = f"{space_host_startup}.hf.space"
        logger.info(f"   Runtime URL should be: https://{runtime_host}")
    else:
        logger.info("ℹ️  SPACE_HOST environment variable not found (running locally?).")

    if space_id_startup:
        logger.info(f"✅ SPACE_ID found: {space_id_startup}")
        logger.info(f"   Repo URL: https://huggingface.co/spaces/{space_id_startup}")
        logger.info(f"   Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
    else:
        logger.info("ℹ️  SPACE_ID environment variable not found. Repo URL cannot be determined.")

    # Closing rule, same width as the opening banner (30 + title + 30).
    logger.info("-"*(60 + len(" App Starting ")) + "\n")

    logger.info("Launching Gradio Interface for GAIA Agent Evaluation...")
    demo.launch(debug=True, share=False)