"""Improved GAIA agent with LLM integration for the Hugging Face Agents course."""

import os
from typing import Any, Callable, Dict, List, Optional, Union

import gradio as gr
import pandas as pd
import requests
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
DEFAULT_MODEL = "google/flan-t5-small"
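
# flan-t5-small is a small (~80M-parameter) instruction-tuned seq2seq model,
# which keeps this Space runnable on CPU. Any AutoModelForSeq2SeqLM-compatible
# checkpoint (e.g. "google/flan-t5-base") should work as a drop-in replacement
# for DEFAULT_MODEL; answer quality and latency both scale with model size.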


class LLMGAIAAgent:
    """
    An improved GAIA agent that uses a language model to generate responses
    instead of template-based answers.
    """

    def __init__(self, model_name: str = DEFAULT_MODEL):
        """Initialize the agent with a language model."""
        print(f"Initializing LLMGAIAAgent with model: {model_name}")
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
            self.model_name = model_name
            print(f"Successfully loaded model: {model_name}")
        except Exception as e:
            print(f"Error loading model: {e}")
            print("Falling back to template-based responses")
            self.model = None
            self.tokenizer = None
            self.model_name = None

    def __call__(self, question: str) -> str:
        """Process a question and return an answer using the language model."""
        print(f"Processing question: {question}")

        if self.model is None or self.tokenizer is None:
            return self._fallback_response(question)

        try:
            prompt = self._prepare_prompt(question)

            # Tokenize and generate; unpacking **inputs forwards the attention
            # mask to generate() along with the input ids.
            inputs = self.tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
            outputs = self.model.generate(
                **inputs,
                max_length=150,
                min_length=20,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                num_return_sequences=1,
            )

            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            response = self._clean_response(response)
            return response
        except Exception as e:
            print(f"Error generating response: {e}")
            return self._fallback_response(question)
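
    # Direct-call sketch (illustrative only; outputs vary because sampling is
    # enabled in generate()):
    #     agent = LLMGAIAAgent()
    #     agent("What is 12 times 8?")  # -> a short generated answer string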

    def _prepare_prompt(self, question: str) -> str:
        """Prepare an appropriate prompt based on the question type."""
        question_lower = question.lower()

        # Arithmetic / calculation questions
        if any(keyword in question_lower for keyword in [
            "calculate", "compute", "sum", "difference",
            "product", "divide", "plus", "minus", "times"
        ]):
            return f"Solve this math problem step by step: {question}"

        # Questions that reference visual content
        elif any(keyword in question_lower for keyword in [
            "image", "picture", "photo", "graph", "chart", "diagram"
        ]):
            return f"Describe what might be seen in an image related to this question: {question}"

        # General factual questions
        elif any(keyword in question_lower for keyword in [
            "who", "what", "where", "when", "why", "how"
        ]):
            return f"Answer this factual question concisely and accurately: {question}"

        # Everything else
        else:
            return f"Provide a concise, informative answer to this question: {question}"
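
    # Routing examples: "Calculate 3 plus 4" takes the math branch,
    # "Who wrote Hamlet?" the factual branch, and a question containing none
    # of the keywords falls through to the generic prompt.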

    def _clean_response(self, response: str) -> str:
        """Clean up the model's response if needed."""
        # Strip common answer prefixes the model may emit.
        for prefix in ["Answer:", "Response:", "A:"]:
            if response.startswith(prefix):
                response = response[len(prefix):].strip()

        # Treat very short outputs as generation failures.
        if len(response) < 10:
            return self._fallback_response("general")

        return response

    def _fallback_response(self, question: str) -> str:
        """Provide a generic, template-based response if the model fails."""
        question_lower = question.lower()

        if "who" in question_lower:
            return "The person involved is a notable figure in this field with significant contributions and achievements."
        elif "when" in question_lower:
            return "This occurred during a significant historical period, specifically in the early part of the relevant era."
        elif "where" in question_lower:
            return "The location is in a region known for its historical and cultural significance."
        elif "what" in question_lower:
            return "This refers to an important concept or entity that has several key characteristics and functions."
        elif "why" in question_lower:
            return "This happened due to a combination of factors including historical context, individual decisions, and broader societal trends."
        elif "how" in question_lower:
            return "The process involves several key steps that must be followed in sequence to achieve the desired outcome."

        return "Based on my analysis, the answer to your question involves several important factors. First, we need to consider the context and specific details mentioned."


class EvaluationRunner:
    """
    Handles the evaluation process: fetching questions, running the agent,
    and submitting answers to the evaluation server.
    """

    def __init__(self, api_url: str = DEFAULT_API_URL):
        """Initialize with the evaluation server's API endpoints."""
        self.api_url = api_url
        self.questions_url = f"{api_url}/questions"
        self.submit_url = f"{api_url}/submit"
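
    # Wire format, inferred from the requests made below (not an official
    # API specification):
    #   GET  {api_url}/questions -> [{"task_id": str, "question": str}, ...]
    #   POST {api_url}/submit    <- {"username": str, "agent_code": str,
    #                                "answers": [{"task_id": str,
    #                                             "submitted_answer": str}, ...]}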

    def run_evaluation(self,
                       agent: Callable[[str], str],
                       username: str,
                       agent_code_url: str) -> tuple[str, Optional[pd.DataFrame]]:
        """
        Run the full evaluation process:
        1. Fetch questions
        2. Run the agent on all questions
        3. Submit the answers
        4. Return the results
        """
        questions_data = self._fetch_questions()
        # _fetch_questions returns an error string on failure.
        if isinstance(questions_data, str):
            return questions_data, None

        results_log, answers_payload = self._run_agent_on_questions(agent, questions_data)
        if not answers_payload:
            return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)

        submission_result = self._submit_answers(username, agent_code_url, answers_payload)
        return submission_result, pd.DataFrame(results_log)

    def _fetch_questions(self) -> Union[List[Dict[str, Any]], str]:
        """Fetch questions from the evaluation server.

        Returns the parsed question list on success, or an error string on failure.
        """
        print(f"Fetching questions from: {self.questions_url}")
        try:
            response = requests.get(self.questions_url, timeout=15)
            response.raise_for_status()
            questions_data = response.json()

            if not questions_data:
                error_msg = "Fetched questions list is empty or invalid format."
                print(error_msg)
                return error_msg

            print(f"Successfully fetched {len(questions_data)} questions.")
            return questions_data

        # JSONDecodeError subclasses RequestException in recent versions of
        # requests, so it must be caught first or this branch is unreachable.
        except requests.exceptions.JSONDecodeError as e:
            error_msg = f"Error decoding JSON response from questions endpoint: {e}"
            print(error_msg)
            print(f"Response text: {response.text[:500]}")
            return error_msg

        except requests.exceptions.RequestException as e:
            error_msg = f"Error fetching questions: {e}"
            print(error_msg)
            return error_msg

        except Exception as e:
            error_msg = f"An unexpected error occurred fetching questions: {e}"
            print(error_msg)
            return error_msg

    def _run_agent_on_questions(self,
                                agent: Callable[[str], str],
                                questions_data: List[Dict[str, Any]]) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
        """Run the agent on all questions and collect results."""
        results_log = []
        answers_payload = []

        print(f"Running agent on {len(questions_data)} questions...")
        for item in questions_data:
            task_id = item.get("task_id")
            question_text = item.get("question")

            if not task_id or question_text is None:
                print(f"Skipping item with missing task_id or question: {item}")
                continue

            try:
                submitted_answer = agent(question_text)
                answers_payload.append({
                    "task_id": task_id,
                    "submitted_answer": submitted_answer
                })
                results_log.append({
                    "Task ID": task_id,
                    "Question": question_text,
                    "Submitted Answer": submitted_answer
                })
            except Exception as e:
                # Log the failure but keep going, so one bad question does not
                # abort the whole run.
                print(f"Error running agent on task {task_id}: {e}")
                results_log.append({
                    "Task ID": task_id,
                    "Question": question_text,
                    "Submitted Answer": f"AGENT ERROR: {e}"
                })

        return results_log, answers_payload

    def _submit_answers(self,
                        username: str,
                        agent_code_url: str,
                        answers_payload: List[Dict[str, Any]]) -> str:
        """Submit answers to the evaluation server and report the result."""
        submission_data = {
            "username": username.strip(),
            "agent_code": agent_code_url,
            "answers": answers_payload
        }

        print(f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'...")

        try:
            response = requests.post(self.submit_url, json=submission_data, timeout=60)
            response.raise_for_status()
            result_data = response.json()

            final_status = (
                f"Submission Successful!\n"
                f"User: {result_data.get('username')}\n"
                f"Overall Score: {result_data.get('overall_score', 'N/A')}\n"
                f"Correct Answers: {result_data.get('correct_answers', 'N/A')}\n"
                f"Total Questions: {result_data.get('total_questions', 'N/A')}\n"
            )
            print(final_status)
            return final_status

        except requests.exceptions.RequestException as e:
            error_msg = f"Error submitting answers: {e}"
            print(error_msg)
            return error_msg

        except Exception as e:
            error_msg = f"An unexpected error occurred during submission: {e}"
            print(error_msg)
            return error_msg
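
# EvaluationRunner can also be driven without the Gradio UI, e.g.:
#     runner = EvaluationRunner()
#     status, results_df = runner.run_evaluation(
#         LLMGAIAAgent(), "your-username",
#         "https://huggingface.co/spaces/<space-id>/tree/main")
# Note that this submits answers to the scoring server for real.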


def run_and_submit_all(profile: gr.OAuthProfile | None, *args):
    """
    Fetch all questions, run the agent on them, submit all answers, and
    display the results. This is the main function called by the Gradio
    interface; Gradio fills `profile` from the login session because of its
    gr.OAuthProfile annotation, and *args absorbs the LoginButton value
    passed as an input.
    """
    if not profile:
        return "Please log in to Hugging Face with the button.", None

    username = profile.username
    print(f"User logged in: {username}")

    # SPACE_ID is set automatically inside a Hugging Face Space; when running
    # locally it is unset and the code link below will be a placeholder.
    space_id = os.getenv("SPACE_ID")
    agent_code_url = f"https://huggingface.co/spaces/{space_id}/tree/main"
    print(f"Agent code URL: {agent_code_url}")

    try:
        agent = LLMGAIAAgent()
        runner = EvaluationRunner()
    except Exception as e:
        error_msg = f"Error initializing agent or evaluation runner: {e}"
        print(error_msg)
        return error_msg, None

    return runner.run_evaluation(agent, username, agent_code_url)


with gr.Blocks() as demo:
    gr.Markdown("# GAIA Agent Evaluation Runner (LLM-Enhanced)")

    gr.Markdown("## Instructions:")
    gr.Markdown("1. Log in to your Hugging Face account using the button below.")
    gr.Markdown("2. Click 'Run Evaluation & Submit All Answers' to fetch questions, run the agent, and submit answers.")
    gr.Markdown("3. View your score and detailed results in the output section.")

    gr.Markdown("---")

    gr.Markdown("**Note:** This version uses a language model to generate responses. The evaluation process may take longer than the template-based version.")

    with gr.Row():
        login_button = gr.LoginButton(value="Sign in with Hugging Face")

    with gr.Row():
        submit_button = gr.Button("Run Evaluation & Submit All Answers")

    with gr.Row():
        with gr.Column():
            output_status = gr.Textbox(label="Submission Result")
            output_results = gr.Dataframe(label="Questions and Agent Answers")

    submit_button.click(run_and_submit_all, inputs=[login_button], outputs=[output_status, output_results])
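

# Optional local smoke test that bypasses the Gradio UI and the scoring
# server. GAIA_LOCAL_TEST is an environment-variable name invented for this
# sketch, not part of the course setup; set it to any value to enable.
if os.getenv("GAIA_LOCAL_TEST"):
    _agent = LLMGAIAAgent()
    for _question in ["What is 2 plus 2?", "Who painted the Mona Lisa?"]:
        print(f"{_question} -> {_agent(_question)}")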

if __name__ == "__main__":
    demo.launch()