FinalTest

Runtime error

File size: 6,119 Bytes

6b4a7ef
 
 
 
 
f7cf33f
 
6b4a7ef
 
 
 
 
f7cf33f
6b4a7ef
 
 
 
 
 
 
 
 
f7cf33f
6b4a7ef
f7cf33f
 
6b4a7ef
 
 
f7cf33f
6b4a7ef
71c05d4
6b4a7ef
 
f7cf33f
6b4a7ef
f7cf33f
6b4a7ef
 
 
f7cf33f
6b4a7ef
 
f7cf33f
6b4a7ef
f7cf33f
6b4a7ef
 
 
f7cf33f
 
6b4a7ef
 
 
f7cf33f
6b4a7ef
 
 
 
 
f7cf33f
6b4a7ef
f7cf33f
6b4a7ef
f7cf33f
 
 
6b4a7ef
 
f7cf33f
6b4a7ef
f7cf33f
 
 
6b4a7ef
 
 
 
f7cf33f
6b4a7ef
 
f7cf33f
6b4a7ef
 
 
f7cf33f
6b4a7ef
f7cf33f
 
 
 
 
 
 
 
6b4a7ef
f7cf33f
6b4a7ef
 
f7cf33f
 
 
6b4a7ef
f7cf33f
 
 
6b4a7ef
f7cf33f
 
 
6b4a7ef
f7cf33f
6b4a7ef
 
 
f7cf33f
 
6b4a7ef
 
 
f7cf33f
 
 
 
 
 
6b4a7ef
 
f7cf33f
 
 
 
 
 
6b4a7ef
f7cf33f
6b4a7ef
 
f7cf33f
 
6b4a7ef
 
f7cf33f
6b4a7ef
 
 
f7cf33f
 
6b4a7ef
 
 
 
 
 
 
f7cf33f

import json
import time
from typing import Tuple

import gradio as gr
import pandas as pd
import requests
from tqdm import tqdm

from agent import GAIAExpertAgent

# Constants
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"


class EvaluationRunner:
    """Fetch evaluation questions, run an agent on them and submit the answers.

    Attributes:
        api_url: Base URL of the scoring service.
        questions_url: Endpoint the questions are downloaded from.
        submit_url: Endpoint the answers are posted to.
        results_url: Results endpoint (stored but not used by this class).
        correct_answers: Number of correct answers reported by the last
            submission; stays 0 when the service does not report a count.
        total_questions: Number of questions returned by the last fetch.
    """

    # Maximum number of characters shown per cell in the results table.
    _QUESTION_PREVIEW_CHARS = 100
    _ANSWER_PREVIEW_CHARS = 50

    def __init__(self, api_url=DEFAULT_API_URL):
        self.api_url = api_url
        self.questions_url = f"{api_url}/questions"
        self.submit_url = f"{api_url}/submit"
        self.results_url = f"{api_url}/results"
        self.correct_answers = 0
        self.total_questions = 0

    def run_evaluation(self, agent, username: str, agent_code: str) -> Tuple[str, pd.DataFrame]:
        """Run the whole fetch -> answer -> submit pipeline.

        Args:
            agent: Callable invoked as ``agent(question_text, task_id)`` that
                returns a JSON string containing a ``final_answer`` key.
            username: Name sent with the submission.
            agent_code: Agent code reference sent with the submission.

        Returns:
            A ``(status_message, results_table)`` pair. The table is empty
            when the questions could not be fetched or no answer was produced.
        """
        questions_data = self._fetch_questions()
        if not isinstance(questions_data, list):
            # _fetch_questions returned an error message instead of questions.
            return questions_data, pd.DataFrame()

        results_log, answers_payload = self._run_agent_on_questions(agent, questions_data)
        if not answers_payload:
            return "No answers generated", pd.DataFrame()

        submission_result = self._submit_answers(username, agent_code, answers_payload)
        return submission_result, pd.DataFrame(results_log)

    def _fetch_questions(self):
        """Download the question list.

        Returns:
            The list of question items on success, otherwise an
            ``"Error: ..."`` string describing what went wrong.
        """
        try:
            response = requests.get(self.questions_url, timeout=30)
            response.raise_for_status()
            questions_data = response.json()
        except Exception as e:
            # UI boundary: any failure is reported as text instead of raised.
            return f"Error: {str(e)}"

        if not isinstance(questions_data, list):
            # The caller tells success from failure by "is it a list", so a
            # JSON object or scalar must not be passed through unchanged.
            return f"Error: expected a list of questions, got {type(questions_data).__name__}"

        self.total_questions = len(questions_data)
        print(f"Fetched {self.total_questions} questions")
        return questions_data

    def _run_agent_on_questions(self, agent, questions_data):
        """Ask the agent every question and collect the outcome.

        Items without a ``task_id`` or ``question`` are skipped.

        Returns:
            ``(results_log, answers_payload)``: the rows for the results table
            and the entries to submit, one of each per processed question.
        """
        results_log = []
        answers_payload = []

        print(f"Processing {len(questions_data)} questions...")
        for item in tqdm(questions_data, desc="Questions"):
            task_id = item.get("task_id")
            question_text = item.get("question")

            if not task_id or not question_text:
                continue

            payload_entry, log_entry = self._answer_question(agent, task_id, question_text)
            answers_payload.append(payload_entry)
            results_log.append(log_entry)

        return results_log, answers_payload

    def _answer_question(self, agent, task_id, question_text):
        """Run the agent on one question; return ``(payload_entry, log_entry)``.

        Exactly one payload entry is produced per question: either the agent's
        answer or an ``"ERROR: ..."`` string when the agent call or the parsing
        of its response fails.
        """
        try:
            answer = self._extract_answer(agent(question_text, task_id))
            shown_answer = self._preview(answer, self._ANSWER_PREVIEW_CHARS)
        except Exception as e:
            # Best effort per question: one failing question must not abort
            # the run, so the error text is submitted in place of an answer.
            answer = f"ERROR: {str(e)}"
            shown_answer = answer

        payload_entry = {"task_id": task_id, "submitted_answer": answer}
        log_entry = {
            "Task ID": task_id,
            "Question": self._preview(question_text, self._QUESTION_PREVIEW_CHARS),
            "Answer": shown_answer,
        }
        return payload_entry, log_entry

    @staticmethod
    def _extract_answer(json_response) -> str:
        """Parse the agent's JSON response and return ``final_answer`` as text.

        A missing or ``null`` answer becomes ``""``; non-string values are
        converted with ``str`` so they can be truncated and submitted.

        Raises:
            Whatever ``json.loads`` raises for invalid input, and
            ``AttributeError`` when the decoded JSON is not an object.
        """
        response_obj = json.loads(json_response)
        answer = response_obj.get("final_answer", "")
        if answer is None:
            return ""
        return answer if isinstance(answer, str) else str(answer)

    @staticmethod
    def _preview(text, limit: int) -> str:
        """Return ``text`` cut to ``limit`` characters, with "..." if shortened."""
        text = str(text)
        return text[:limit] + "..." if len(text) > limit else text

    def _submit_answers(self, username: str, agent_code: str, answers_payload):
        """Post the answers to the scoring service.

        Returns:
            The service's ``message`` field (or a default success message), or
            a ``"Submission failed: ..."`` string when the request fails.
        """
        submission_data = {
            "username": username.strip(),
            "agent_code": agent_code.strip(),
            "answers": answers_payload
        }

        print("Submitting answers...")
        try:
            response = requests.post(
                self.submit_url,
                json=submission_data,
                headers={"Content-Type": "application/json"},
                timeout=60
            )
            response.raise_for_status()
            result = response.json()
        except Exception as e:
            # UI boundary: any failure is reported as text instead of raised.
            return f"Submission failed: {str(e)}"

        return self._summarize_submission(result)

    def _summarize_submission(self, result) -> str:
        """Record the reported score, if any, and return the status message."""
        default_message = "Answers submitted successfully"
        if not isinstance(result, dict):
            # The HTTP call succeeded; an unexpected body is not a failure.
            return default_message

        # NOTE(review): assumes the scoring service reports the number of
        # correct answers as an integer `correct_count` - confirm against the
        # API. When the field is absent the counter is left unchanged.
        correct = result.get("correct_count")
        if isinstance(correct, int) and not isinstance(correct, bool):
            self.correct_answers = correct

        return result.get("message", default_message)


def run_evaluation(username: str, agent_code: str, model_name: str):
    """Create the agent, run one evaluation and return values for the UI.

    Args:
        username: Name passed on to the submission.
        agent_code: Agent code reference passed on to the submission.
        model_name: Model identifier handed to ``GAIAExpertAgent``.

    Returns:
        A 4-tuple ``(status, correct_answers, total_questions, results_df)``.
    """
    print("Initializing GAIA Expert Agent...")
    expert_agent = GAIAExpertAgent(model_name=model_name)

    print("Starting evaluation...")
    evaluation = EvaluationRunner()
    status, table = evaluation.run_evaluation(expert_agent, username, agent_code)

    # Attach the question counters kept by the runner; the correct-answer
    # count falls back to 0 when the runner does not expose one.
    return (
        status,
        getattr(evaluation, "correct_answers", 0),
        evaluation.total_questions,
        table,
    )


def create_gradio_interface():
    """Build and return the Gradio Blocks UI for running an evaluation.

    The layout is a single row with two columns: configuration inputs plus
    the run button on the left, and the result widgets on the right. Clicking
    the button calls ``run_evaluation`` with the three inputs and writes its
    four return values into the result widgets.
    """
    model_choices = [
        "google/flan-t5-small",
        "google/flan-t5-base",
        "google/flan-t5-large",
    ]

    with gr.Blocks(title="GAIA Expert Agent") as demo:
        gr.Markdown("# 🧠 GAIA Expert Agent Evaluation")

        with gr.Row():
            # Left column: configuration inputs and the trigger button.
            with gr.Column():
                gr.Markdown("### Configuration")
                username_box = gr.Textbox(
                    label="Hugging Face Username",
                    value="yoshizen",
                )
                agent_code_box = gr.Textbox(
                    label="Agent Code",
                    value="https://huggingface.co/spaces/yoshizen/FinalTest",
                )
                model_picker = gr.Dropdown(
                    label="Model",
                    choices=model_choices,
                    value="google/flan-t5-large",
                )
                launch_button = gr.Button("🚀 Run Evaluation", variant="primary")

            # Right column: widgets that receive the evaluation outcome.
            with gr.Column():
                gr.Markdown("### Results")
                status_box = gr.Textbox(label="Submission Status")
                correct_box = gr.Number(label="Correct Answers")
                total_box = gr.Number(label="Total Questions")
                processed_table = gr.Dataframe(
                    label="Processed Questions",
                    interactive=False,
                )

        # The order of `outputs` mirrors the tuple returned by run_evaluation.
        input_widgets = [username_box, agent_code_box, model_picker]
        output_widgets = [status_box, correct_box, total_box, processed_table]
        launch_button.click(
            fn=run_evaluation,
            inputs=input_widgets,
            outputs=output_widgets,
        )

    return demo


if __name__ == "__main__":
    # Script entry point: build the UI only when run directly, not on import.
    demo = create_gradio_interface()
    # Serve on address 0.0.0.0 (all network interfaces), port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)