import os
import json
import time
from typing import List, Dict, Any, Optional, Union, Callable, Tuple
from urllib.parse import quote

import torch
import requests
import gradio as gr
import pandas as pd

from agent_gaia import GAIAExpertAgent as OptimizedGAIAAgent

# Constants
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
MAX_RETRIES = 3  # maximum number of submission attempts
RETRY_DELAY = 5  # seconds to wait between failed submission attempts


class EvaluationRunner:
    """Drive one evaluation: fetch questions, run the agent, submit answers."""

    def __init__(self, api_url=DEFAULT_API_URL):
        """Derive the endpoint URLs from *api_url* and reset the counters."""
        self.api_url = api_url
        self.questions_url = f"{api_url}/questions"
        self.submit_url = f"{api_url}/submit"
        self.results_url = f"{api_url}/results"
        self.correct_answers = 0
        self.total_questions = 0

    def run_evaluation(
        self,
        agent: Callable[[str, str], str],
        username: str,
        agent_code: str,
    ) -> tuple[str, Optional[pd.DataFrame]]:
        """Run the full fetch -> answer -> submit -> check pipeline.

        Args:
            agent: Callable invoked as ``agent(question_text, task_id)``; it
                must return a JSON string (see ``_run_agent_on_questions``).
            username: User name sent with the submission.
            agent_code: Value sent as ``agent_code`` in the submission.

        Returns:
            ``(status_message, results_table)``. The table is ``None`` when
            the questions could not be fetched.
        """
        # Fetch the questions; a string return value is an error message.
        questions_data = self._fetch_questions()
        if isinstance(questions_data, str):
            return questions_data, None

        # Run the agent on every question.
        results_log, answers_payload = self._run_agent_on_questions(
            agent, questions_data
        )
        if not answers_payload:
            return (
                "Agent did not produce any answers to submit.",
                pd.DataFrame(results_log),
            )

        # Submit the answers.
        submission_result = self._submit_answers(
            username, agent_code, answers_payload
        )

        # Query the results endpoint and print a summary.
        self._check_results(username)
        self.print_evaluation_summary(username)

        return submission_result, pd.DataFrame(results_log)

    def _fetch_questions(self) -> Union[List[Dict[str, Any]], str]:
        """Download the question list.

        Returns:
            The decoded list of questions, or an error message string when
            the request fails or the payload is not a non-empty list.
        """
        try:
            response = requests.get(self.questions_url, timeout=15)
            response.raise_for_status()
            questions_data = response.json()
            # Reject anything that is not a non-empty list: later code
            # iterates the payload and calls ``.get`` on every item.
            if not isinstance(questions_data, list) or not questions_data:
                return "Fetched questions list is empty or invalid format."

            self.total_questions = len(questions_data)
            print(f"Successfully fetched {self.total_questions} questions.")
            return questions_data
        except Exception as e:
            # Best effort: callers expect an error string, not an exception.
            return f"Error fetching questions: {e}"

    def _run_agent_on_questions(
        self,
        agent: Any,
        questions_data: List[Dict[str, Any]],
    ) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
        """Call the agent once per question and collect its answers.

        Items that are not dicts, or that lack a ``task_id`` or ``question``,
        are skipped. When the agent call or the decoding of its response
        fails, the error is recorded in the log only; nothing is added to
        the submission payload for that task.

        Returns:
            ``(results_log, answers_payload)``: display rows for every
            attempted task, and the answers to submit.
        """
        results_log = []
        answers_payload = []

        print(f"Running agent on {len(questions_data)} questions...")
        for item in questions_data:
            if not isinstance(item, dict):
                continue

            task_id = item.get("task_id")
            question_text = item.get("question")

            if not task_id or question_text is None:
                continue

            try:
                # The agent returns a JSON document; the answer is read from
                # its "final_answer" key.
                json_response = agent(question_text, task_id)
                response_obj = json.loads(json_response)
                submitted_answer = response_obj.get("final_answer", "")

                answers_payload.append({
                    "task_id": task_id,
                    "submitted_answer": submitted_answer,
                })

                results_log.append({
                    "Task ID": task_id,
                    "Question": question_text,
                    "Submitted Answer": submitted_answer,
                    "Full Response": json_response,
                })
            except Exception as e:
                results_log.append({
                    "Task ID": task_id,
                    "Question": question_text,
                    "Submitted Answer": f"AGENT ERROR: {e}",
                })

        return results_log, answers_payload

    def _submit_answers(
        self,
        username: str,
        agent_code: str,
        answers_payload: List[Dict[str, Any]],
    ) -> str:
        """POST the answers, retrying up to ``MAX_RETRIES`` times.

        Returns:
            The server's ``message`` field when the response is a JSON object
            containing one, a generic success message otherwise, or an error
            message once every attempt has failed.
        """
        submission_data = {
            "username": username.strip(),
            # The payload key is "agent_code" (not "agent_code_url").
            "agent_code": agent_code.strip(),
            "answers": answers_payload,
        }

        print(f"Submitting {len(answers_payload)} answers to: {self.submit_url}")
        print("Submission data:", json.dumps(submission_data, indent=2))

        for attempt in range(1, MAX_RETRIES + 1):
            try:
                response = requests.post(
                    self.submit_url,
                    json=submission_data,
                    headers={"Content-Type": "application/json"},
                    timeout=30,
                )
                response.raise_for_status()
            except Exception as e:
                print(f"Submission attempt {attempt} failed: {e}")
                # No point in waiting after the final attempt.
                if attempt < MAX_RETRIES:
                    time.sleep(RETRY_DELAY)
                continue

            # The POST succeeded. Decode the body outside the retry ``try``
            # so a malformed response can never trigger a second submission.
            try:
                result = response.json()
            except ValueError:
                return (
                    "Submission successful, but response was not JSON: "
                    f"{response.text}"
                )

            if isinstance(result, dict) and "message" in result:
                return result["message"]
            return "Evaluation submitted successfully"

        return "Error submitting answers after multiple attempts"

    def _check_results(self, username: str) -> None:
        """Query the results endpoint and store the reported score.

        ``self.correct_answers`` is updated only when the endpoint answers
        200 with a JSON object containing ``score``; any failure is printed
        and otherwise ignored.
        """
        try:
            # ``params`` URL-encodes the value; the name is stripped so the
            # lookup matches the name that was submitted.
            response = requests.get(
                self.results_url,
                params={"username": username.strip()},
                timeout=15,
            )
            if response.status_code == 200:
                data = response.json()
                if isinstance(data, dict) and "score" in data:
                    # NOTE(review): "score" is stored as the number of
                    # correct answers; confirm it is a count and not a
                    # percentage.
                    self.correct_answers = int(data["score"])
        except Exception as e:
            print(f"Error checking results: {e}")

    def get_correct_answers_count(self) -> int:
        """Return the score stored by the last results check."""
        return self.correct_answers

    def get_total_questions_count(self) -> int:
        """Return the number of questions fetched in the last run."""
        return self.total_questions

    def print_evaluation_summary(self, username: str) -> None:
        """Print the user name and the overall score to stdout."""
        print("\n===== EVALUATION SUMMARY =====")
        print(f"User: {username}")
        print(f"Overall Score: {self.correct_answers}/{self.total_questions}")
        print("=============================\n")


def run_evaluation(
    username: str,
    agent_code: str,
    model_name: str = "google/flan-t5-base",
    use_cache: bool = False,  # the answer cache is disabled by default
) -> Tuple[str, int, int, str, str, str]:
    """Gradio callback: build the agent, evaluate it and report the outcome.

    Returns:
        ``(result_message, correct_answers, total_questions, elapsed_time,
        results_url, cache_status)``, in the order of the UI output widgets.
    """
    start_time = time.time()

    # Instantiate the agent under the alias it is imported as above.
    # NOTE(review): assumes the constructor accepts ``model_name`` and
    # ``use_cache`` keyword arguments; confirm against agent_gaia.
    agent = OptimizedGAIAAgent(model_name=model_name, use_cache=use_cache)

    runner = EvaluationRunner(api_url=DEFAULT_API_URL)

    # The results table is not shown in this UI, so it is discarded.
    result, _results_log = runner.run_evaluation(agent, username, agent_code)

    elapsed_time = time.time() - start_time
    elapsed_time_str = f"{elapsed_time:.2f} seconds"

    # Percent-encode the (stripped) user name so the link stays valid.
    results_url = (
        f"{DEFAULT_API_URL}/results?username={quote(username.strip(), safe='')}"
    )

    cache_status = "Cache enabled and used" if use_cache else "Cache disabled"

    return (
        result,
        runner.get_correct_answers_count(),
        runner.get_total_questions_count(),
        elapsed_time_str,
        results_url,
        cache_status,
    )


def create_gradio_interface():
    """Build the Gradio Blocks UI and wire the button to ``run_evaluation``."""
    with gr.Blocks(title="GAIA Agent Evaluation") as demo:
        gr.Markdown("# GAIA Agent Evaluation")

        with gr.Row():
            # Left column: inputs.
            with gr.Column():
                username = gr.Textbox(label="Hugging Face Username")
                agent_code = gr.Textbox(
                    label="Agent Code",
                    lines=2,
                    placeholder="Your agent code here",
                )
                model_name = gr.Dropdown(
                    label="Model",
                    choices=[
                        "google/flan-t5-small",
                        "google/flan-t5-base",
                        "google/flan-t5-large",
                    ],
                    value="google/flan-t5-base",
                )
                use_cache = gr.Checkbox(label="Use Answer Cache", value=False)
                run_button = gr.Button("Run Evaluation & Submit All Answers")

            # Right column: outputs, in the order run_evaluation returns them.
            with gr.Column():
                result_text = gr.Textbox(label="Result", lines=2)
                correct_answers = gr.Number(label="Correct Answers")
                total_questions = gr.Number(label="Total Questions")
                elapsed_time = gr.Textbox(label="Elapsed Time")
                results_url = gr.Textbox(label="Results URL")
                cache_status = gr.Textbox(label="Cache Status")

        run_button.click(
            fn=run_evaluation,
            inputs=[username, agent_code, model_name, use_cache],
            outputs=[
                result_text,
                correct_answers,
                total_questions,
                elapsed_time,
                results_url,
                cache_status,
            ],
        )

    return demo


if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.launch(share=True)