File size: 3,412 Bytes
10e9b7d
3bce169
4c200bf
188585a
f21f66c
42cbc8d
cfc7eb3
188585a
 
3bce169
42cbc8d
 
 
 
 
188585a
 
2c0ba2f
f21f66c
2c0ba2f
 
 
 
 
 
188585a
 
2c0ba2f
 
 
188585a
f21f66c
188585a
4c200bf
2c0ba2f
 
 
 
 
 
 
 
 
42cbc8d
2c0ba2f
 
 
 
188585a
2c0ba2f
188585a
 
2c0ba2f
 
 
 
 
 
 
 
 
 
 
188585a
2c0ba2f
4c200bf
2c0ba2f
f21f66c
2c0ba2f
 
 
 
 
 
 
 
188585a
2c0ba2f
 
 
 
 
 
188585a
4c200bf
2c0ba2f
 
 
 
4c200bf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import os
import requests
import pandas as pd
import gradio as gr

from agent import GaiaAgent

DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

class GAIALlamaAgent:
    """Callable adapter that forwards each question to a ``GaiaAgent``."""

    def __init__(self):
        """Create the wrapped ``GaiaAgent`` and keep it on ``self.agent``."""
        self.agent = GaiaAgent()

    def __call__(self, question: str, task_id: str) -> str:
        """Return whatever the wrapped agent produces for this question.

        Both the question text and its task id are handed to the agent, in
        that order.
        """
        answer = self.agent(question, task_id)
        return answer

def run_and_submit_all(profile: gr.OAuthProfile | None):
    """Run the agent on every evaluation question and submit the answers.

    Fetches the question list from ``{DEFAULT_API_URL}/questions``, calls a
    ``GAIALlamaAgent`` on each question, and POSTs the collected answers to
    ``{DEFAULT_API_URL}/submit``.

    Args:
        profile: Profile of the logged-in Hugging Face user, or ``None`` when
            nobody is logged in. Only ``profile.username`` is read.
            NOTE(review): the click handler passes no explicit inputs, so
            this argument is presumably injected by Gradio based on the type
            annotation -- keep the annotation unchanged.

    Returns:
        A ``(status, results)`` tuple. ``status`` is a human-readable message
        describing success or the failure that stopped the run. ``results``
        is a ``pandas.DataFrame`` with the columns "Task ID", "Question" and
        "Submitted Answer", or ``None`` when the run stopped before the agent
        was asked anything.
    """
    if not profile or not profile.username:
        return "Please Login to Hugging Face with the button.", None

    username = profile.username.strip()
    space_id = os.getenv("SPACE_ID")
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else ""
    questions_url = f"{DEFAULT_API_URL}/questions"
    submit_url = f"{DEFAULT_API_URL}/submit"

    # Broad catch on purpose: this is the UI boundary, and any failure
    # (network, HTTP status, invalid JSON) should become a status message.
    try:
        response = requests.get(questions_url, timeout=15)
        response.raise_for_status()
        questions_data = response.json()
    except Exception as e:
        return f"Error fetching questions: {e}", None

    # A JSON object (e.g. an error payload) would otherwise be iterated key
    # by key and crash on ``item.get``; an empty list has nothing to answer.
    if not isinstance(questions_data, list) or not questions_data:
        return "Fetched questions list is empty or invalid format.", None

    # Building the agent may fail (missing credentials, model loading, ...);
    # report it rather than letting the exception escape the handler.
    try:
        agent = GAIALlamaAgent()
    except Exception as e:
        return f"Error initializing agent: {e}", None

    results_log = []
    answers_payload = []

    for item in questions_data:
        if not isinstance(item, dict):
            continue
        task_id = item.get("task_id")
        question_text = item.get("question")
        if not task_id or question_text is None:
            continue
        # One failing question must not abort the run: record the error text
        # as that question's answer and carry on.
        try:
            submitted_answer = agent(question_text, task_id)
        except Exception as e:
            submitted_answer = f"[ERROR] {e}"
        answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
        results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})

    results_df = pd.DataFrame(results_log)

    # Do not POST an empty submission when every entry was skipped.
    if not answers_payload:
        return "Agent did not produce any answers to submit.", results_df

    submission_data = {"username": username, "agent_code": agent_code, "answers": answers_payload}

    try:
        response = requests.post(submit_url, json=submission_data, timeout=60)
        response.raise_for_status()
        result_data = response.json()
        final_status = (
            f"Submission Successful!\n"
            f"User: {result_data.get('username')}\n"
            f"Overall Score: {result_data.get('score', 'N/A')}% "
            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
            f"Message: {result_data.get('message', 'No message received.')}"
        )
        return final_status, results_df
    except Exception as e:
        # Still return the per-question log so the answers are not lost.
        return f"Submission Failed: {e}", results_df

# --- Gradio UI: instructions, login button, run button, and result widgets ---
with gr.Blocks() as demo:
    # Page heading followed by the usage instructions.
    gr.Markdown(
        "# Basic Agent Evaluation Runner"
    )
    gr.Markdown(
        """
    **Instructions:**
    1. Please clone this space and modify the agent logic.
    2. Log in to Hugging Face with the button.
    3. Click 'Run Evaluation & Submit All Answers' to run the full GAIA test.
    """
    )

    gr.LoginButton()

    # Trigger for the evaluation run, plus the two widgets that show its result.
    run_button = gr.Button(
        "Run Evaluation & Submit All Answers",
    )
    status_output = gr.Textbox(
        label="Run Status / Submission Result",
        lines=5,
        interactive=False,
    )
    results_table = gr.DataFrame(
        label="Questions and Agent Answers",
    )

    # The handler's two return values map onto the status box and the table.
    run_button.click(
        fn=run_and_submit_all,
        outputs=[status_output, results_table],
    )

if __name__ == "__main__":
    # Startup banner; the Space link is printed only when SPACE_ID is set.
    print("\n===== Application Startup =====")
    if space_id := os.getenv("SPACE_ID"):
        print(f"🔗 Space: https://huggingface.co/spaces/{space_id}")
    # Start the Gradio app with debug mode enabled.
    demo.launch(debug=True)