File size: 6,119 Bytes
6b4a7ef
 
 
 
 
f7cf33f
 
6b4a7ef
 
 
 
 
f7cf33f
6b4a7ef
 
 
 
 
 
 
 
 
f7cf33f
6b4a7ef
f7cf33f
 
6b4a7ef
 
 
f7cf33f
6b4a7ef
71c05d4
6b4a7ef
 
f7cf33f
6b4a7ef
f7cf33f
6b4a7ef
 
 
f7cf33f
6b4a7ef
 
f7cf33f
6b4a7ef
f7cf33f
6b4a7ef
 
 
f7cf33f
 
6b4a7ef
 
 
f7cf33f
6b4a7ef
 
 
 
 
f7cf33f
6b4a7ef
f7cf33f
6b4a7ef
f7cf33f
 
 
6b4a7ef
 
f7cf33f
6b4a7ef
f7cf33f
 
 
6b4a7ef
 
 
 
f7cf33f
6b4a7ef
 
f7cf33f
6b4a7ef
 
 
f7cf33f
6b4a7ef
f7cf33f
 
 
 
 
 
 
 
6b4a7ef
f7cf33f
6b4a7ef
 
f7cf33f
 
 
6b4a7ef
f7cf33f
 
 
6b4a7ef
f7cf33f
 
 
6b4a7ef
f7cf33f
6b4a7ef
 
 
f7cf33f
 
6b4a7ef
 
 
f7cf33f
 
 
 
 
 
6b4a7ef
 
f7cf33f
 
 
 
 
 
6b4a7ef
f7cf33f
6b4a7ef
 
f7cf33f
 
6b4a7ef
 
f7cf33f
6b4a7ef
 
 
f7cf33f
 
6b4a7ef
 
 
 
 
 
 
f7cf33f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import json
import time
from typing import Tuple

import gradio as gr
import pandas as pd
import requests
from tqdm import tqdm

from agent import GAIAExpertAgent

# Constants
# Base URL of the scoring service; EvaluationRunner derives its /questions,
# /submit and /results endpoint URLs from it.
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

class EvaluationRunner:
    """Drive one evaluation round against the scoring API.

    A round fetches the question list, runs the agent on every question and
    submits the collected answers.

    Attributes:
        api_url: Base URL the endpoint URLs are derived from.
        questions_url: Endpoint the questions are fetched from.
        submit_url: Endpoint the answers are posted to.
        results_url: ``{api_url}/results``; not used by this class itself.
        correct_answers: Correct-answer count taken from the last submission
            response when the service reports one, otherwise 0.
        total_questions: Number of questions returned by the last fetch.
    """

    def __init__(self, api_url=DEFAULT_API_URL):
        self.api_url = api_url
        self.questions_url = f"{api_url}/questions"
        self.submit_url = f"{api_url}/submit"
        self.results_url = f"{api_url}/results"
        self.correct_answers = 0
        self.total_questions = 0

    def run_evaluation(self, agent, username: str, agent_code: str) -> Tuple[str, pd.DataFrame]:
        """Run the full fetch -> answer -> submit pipeline.

        Args:
            agent: Callable invoked as ``agent(question_text, task_id)``.
            username: User name sent along with the answers.
            agent_code: Agent code reference sent along with the answers.

        Returns:
            ``(status, results)`` where ``status`` is a human-readable message
            and ``results`` is a DataFrame with one row per processed
            question (empty when nothing was processed).
        """
        questions_data = self._fetch_questions()
        if not isinstance(questions_data, list):
            # _fetch_questions reports failures as an error string.
            return questions_data, pd.DataFrame()

        results_log, answers_payload = self._run_agent_on_questions(agent, questions_data)
        if not answers_payload:
            return "No answers generated", pd.DataFrame()

        submission_result = self._submit_answers(username, agent_code, answers_payload)
        return submission_result, pd.DataFrame(results_log)

    def _fetch_questions(self):
        """Download the question list.

        Returns:
            The list of question items on success, otherwise an
            ``"Error: ..."`` string describing the failure.
        """
        try:
            response = requests.get(self.questions_url, timeout=30)
            response.raise_for_status()
            questions_data = response.json()
        except Exception as e:
            return f"Error: {str(e)}"

        # Anything but a list cannot be iterated as questions; report it as an
        # error string instead of handing the raw payload back as a "message".
        if not isinstance(questions_data, list):
            return f"Error: expected a list of questions, got {type(questions_data).__name__}"

        self.total_questions = len(questions_data)
        print(f"Fetched {self.total_questions} questions")
        return questions_data

    @staticmethod
    def _truncate(text, limit):
        """Return *text* cut to *limit* characters, with "..." appended if it was longer."""
        return text[:limit] + "..." if len(text) > limit else text

    @staticmethod
    def _extract_answer(json_response):
        """Parse the agent's JSON reply and return its ``final_answer`` as a string.

        A missing or null answer becomes ``""``; any other non-string value is
        converted with ``str`` so later slicing and length checks cannot fail.

        Raises:
            Exception: Whatever ``json.loads`` or the ``.get`` lookup raises
                for a malformed reply; the caller records it as an error.
        """
        answer = json.loads(json_response).get("final_answer", "")
        if answer is None:
            return ""
        return answer if isinstance(answer, str) else str(answer)

    def _run_agent_on_questions(self, agent, questions_data):
        """Ask the agent every question and collect its answers.

        Items that are not dicts, or that lack a ``task_id`` or ``question``,
        are skipped. A failure of the agent (or of parsing its reply) is
        recorded as an ``"ERROR: ..."`` answer for that task.

        Args:
            agent: Callable invoked as ``agent(question_text, task_id)`` and
                expected to return a JSON object string with a
                ``final_answer`` key.
            questions_data: List of question items.

        Returns:
            ``(results_log, answers_payload)``: display rows for the results
            table and the answer entries to submit. Each processed task
            contributes exactly one entry to both lists.
        """
        results_log = []
        answers_payload = []

        print(f"Processing {len(questions_data)} questions...")
        for item in tqdm(questions_data, desc="Questions"):
            if not isinstance(item, dict):
                continue

            task_id = item.get("task_id")
            question_text = item.get("question")
            if not task_id or not question_text:
                continue

            try:
                answer = self._extract_answer(agent(question_text, task_id))
                shown_answer = self._truncate(answer, 50)
            except Exception as e:
                # Error text is shown in full, not truncated.
                answer = shown_answer = f"ERROR: {str(e)}"

            # Append once, after the try block: a failure while formatting the
            # log row can no longer leave a second entry for the same task.
            answers_payload.append({"task_id": task_id, "submitted_answer": answer})
            results_log.append({
                "Task ID": task_id,
                "Question": self._truncate(str(question_text), 100),
                "Answer": shown_answer,
            })

        return results_log, answers_payload

    def _submit_answers(self, username: str, agent_code: str, answers_payload):
        """Post the collected answers to the submit endpoint.

        Args:
            username: User name; surrounding whitespace is stripped.
            agent_code: Agent code reference; surrounding whitespace is stripped.
            answers_payload: Answer entries built by ``_run_agent_on_questions``.

        Returns:
            The ``message`` field of the service reply (or a default success
            text), or ``"Submission failed: ..."`` when the request or the
            reply handling raised.
        """
        submission_data = {
            "username": username.strip(),
            "agent_code": agent_code.strip(),
            "answers": answers_payload
        }

        print("Submitting answers...")
        try:
            response = requests.post(
                self.submit_url,
                json=submission_data,
                headers={"Content-Type": "application/json"},
                timeout=60
            )
            response.raise_for_status()
            result = response.json()

            # NOTE(review): the scoring service is expected to report a
            # `correct_count` field in its reply - confirm against the API.
            # When it is absent or not an integer the counter is left as is.
            correct_count = result.get("correct_count")
            if isinstance(correct_count, int) and not isinstance(correct_count, bool):
                self.correct_answers = correct_count

            return result.get("message", "Answers submitted successfully")
        except Exception as e:
            return f"Submission failed: {str(e)}"


def run_evaluation(username: str, agent_code: str, model_name: str):
    """Create the agent, run one evaluation round and return values for the UI.

    Args:
        username: User name forwarded to the evaluation runner.
        agent_code: Agent code reference forwarded to the evaluation runner.
        model_name: Model identifier passed to ``GAIAExpertAgent``.

    Returns:
        ``(status, correct_answers, total_questions, results_table)`` in the
        order the Gradio outputs are wired.
    """
    print("Initializing GAIA Expert Agent...")
    expert = GAIAExpertAgent(model_name=model_name)

    print("Starting evaluation...")
    evaluator = EvaluationRunner()
    status, table = evaluator.run_evaluation(expert, username, agent_code)

    # Attach the question counters kept on the runner.
    n_total = evaluator.total_questions
    n_correct = getattr(evaluator, 'correct_answers', 0)

    return status, n_correct, n_total, table


def create_gradio_interface():
    """Assemble the Gradio Blocks UI for running an evaluation.

    The layout is a single row with a configuration column (username, agent
    code, model choice, run button) and a results column (status text, two
    counters and the table of processed questions). Clicking the button
    calls ``run_evaluation`` with the three configuration values.

    Returns:
        The constructed ``gr.Blocks`` app; launching it is left to the caller.
    """
    model_choices = [
        "google/flan-t5-small",
        "google/flan-t5-base",
        "google/flan-t5-large",
    ]

    with gr.Blocks(title="GAIA Expert Agent") as demo:
        gr.Markdown("# 🧠 GAIA Expert Agent Evaluation")

        with gr.Row():
            # Left column: inputs for the evaluation run.
            with gr.Column():
                gr.Markdown("### Configuration")
                username_box = gr.Textbox(
                    label="Hugging Face Username",
                    value="yoshizen",
                )
                agent_code_box = gr.Textbox(
                    label="Agent Code",
                    value="https://huggingface.co/spaces/yoshizen/FinalTest",
                )
                model_picker = gr.Dropdown(
                    label="Model",
                    choices=model_choices,
                    value="google/flan-t5-large",
                )
                start_button = gr.Button("🚀 Run Evaluation", variant="primary")

            # Right column: everything run_evaluation returns.
            with gr.Column():
                gr.Markdown("### Results")
                status_box = gr.Textbox(label="Submission Status")
                correct_box = gr.Number(label="Correct Answers")
                total_box = gr.Number(label="Total Questions")
                questions_table = gr.Dataframe(label="Processed Questions", interactive=False)

        # Output order must match the tuple returned by run_evaluation.
        click_inputs = [username_box, agent_code_box, model_picker]
        click_outputs = [status_box, correct_box, total_box, questions_table]
        start_button.click(fn=run_evaluation, inputs=click_inputs, outputs=click_outputs)

    return demo


# Script entry point: build the interface and serve it on all network
# interfaces (0.0.0.0) at port 7860.
if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860)