Spaces:

davidgturner
/

GaiaAgentEvaluator

Sleeping

App Files Files Community

davidgturner committed on Jun 1

Commit

f5bafc2

1 Parent(s): 08e2c16

- changes for running agent

Browse files

Files changed (4) hide show

app.py +136 -1035
config.py +10 -22
data/knowledge_base.txt +20 -0
tools/tool_manager.py +56 -0

app.py CHANGED Viewed

@@ -14,7 +14,6 @@ from bs4 import BeautifulSoup
 from duckduckgo_search import DDGS
 import pytube
 from dateutil import parser
-import pandas as pd
 try:
     from youtube_transcript_api import YouTubeTranscriptApi
 except ImportError:
@@ -22,936 +21,108 @@ except ImportError:
 from smolagents import Tool, CodeAgent, InferenceClientModel
-import random
-from smolagents import CodeAgent, InferenceClientModel
-# Import our custom tools from their modules
-# from smolagents.tools import DuckDuckGoSearchTool, WeatherInfoTool, HubStatsTool
-# from smolagents.tools import WebPageVisitTool, WebpageContentExtractorTool
-from smolagents import CodeAgent, InferenceClientModel, load_tool
-# Import necessary libraries
-import random
-from smolagents import CodeAgent, InferenceClientModel
-# Import our custom tools from their modules
-# from tools import DuckDuckGoSearchTool, WeatherInfoTool, HubStatsTool
-# from retriever import load_guest_dataset
-from langchain.docstore.document import Document
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.retrievers import BM25Retriever
-import functools
-# Create a knowledge base for the agent
-GAIA_KNOWLEDGE = """
-### AI and Agent Concepts
-- An agent is an autonomous entity that observes and acts upon an environment using sensors and actuators, usually to achieve specific goals.
-- GAIA (General AI Assistant) is a framework for creating and evaluating AI assistants that can perform a wide range of tasks.
-- The agent loop consists of perception, reasoning, and action.
-- RAG (Retrieval-Augmented Generation) combines retrieval of relevant information with generation capabilities of language models.
-- An LLM (Large Language Model) is a neural network trained on vast amounts of text data to understand and generate human language.
-### Agent Capabilities
-- Tool use refers to an agent's ability to employ external tools like search engines, APIs, or specialized algorithms.
-- An effective agent should be able to decompose complex problems into manageable parts.
-- Chain-of-thought reasoning allows agents to break down problem-solving steps to improve accuracy.
-- Agents should apply appropriate reasoning strategies based on the type of question (factual, analytical, etc.)
-- Self-reflection helps agents identify and correct errors in their reasoning.
-### Evaluation Criteria
-- Agent responses should be accurate, relevant, and factually correct.
-- Effective agents provide concise yet comprehensive answers.
-- Agents should acknowledge limitations and uncertainties when appropriate.
-- Good agents can follow multi-step instructions and fulfill all requirements.
-- Reasoning transparency helps users understand how the agent arrived at its conclusions.
-"""
-# (Keep Constants as is)
-# --- Constants ---
-DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
-# Use a more powerful model for better responses
-LLAMA_API_URL = "https://api-inference.huggingface.co/models/mistralai/Mixtral-8x7B-Instruct-v0.1"
-HF_API_TOKEN = os.getenv("HF_API_TOKEN")
-HEADERS = {"Authorization": f"Bearer {HF_API_TOKEN}"} if HF_API_TOKEN else {}
-MAX_RETRIES = 3
-RETRY_DELAY = 2  # seconds
-# Create knowledge base documents
-def create_knowledge_documents():
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=500,
-        chunk_overlap=50,
-        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
-    )
-    knowledge_chunks = text_splitter.split_text(GAIA_KNOWLEDGE)
-    return [Document(page_content=chunk) for chunk in knowledge_chunks]
-# --- Basic Agent Definition ---
-# ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
-# --- Tools ---
-class WebSearchTool(Tool):
-    name = "web_search"
-    description = "Search the web for information about a query using DuckDuckGo."
-    inputs = {
-        "query": {
-            "type": "string",
-            "description": "The search query."
-        }
-    }
-    output_type = "string"
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        self.max_results = 3
-    def forward(self, query: str) -> str:
-        assert isinstance(query, str), "Query must be a string."
-        try:
-            results = []
-            with DDGS() as ddgs:
-                ddgs_results = list(ddgs.text(query, max_results=self.max_results))
-            if not ddgs_results:
-                return "No web search results found."
-            formatted_results = "\nWeb Search Results:\n"
-            for i, r in enumerate(ddgs_results, 1):
-                formatted_results += f"\n{i}. {r['title']}\n   {r['body']}\n   Source: {r['href']}\n"
-            return formatted_results
-        except Exception as e:
-            print(f"Error in web search: {str(e)}")
-            return f"Error performing web search: {str(e)}"
-class WebContentTool(Tool):
-    name = "web_content"
-    description = "Fetch and extract content from a specific webpage."
-    inputs = {
-        "url": {
-            "type": "string",
-            "description": "The URL of the webpage to fetch content from."
-        }
-    }
-    output_type = "string"
-    def forward(self, url: str) -> str:
-        assert isinstance(url, str), "URL must be a string."
-        try:
-            headers = {
-                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
-            }
-            response = requests.get(url, headers=headers, timeout=10)
-            response.raise_for_status()
-            soup = BeautifulSoup(response.text, 'html.parser')
-            for script in soup(["script", "style"]):
-                script.extract()
-            text = soup.get_text(separator='\n')
-            lines = (line.strip() for line in text.splitlines())
-            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-            text = '\n'.join(chunk for chunk in chunks if chunk)
-            if len(text) > 2000:
-                text = text[:2000] + "... [content truncated]"
-            return f"Content from {url}:\n\n{text}"
-        except Exception as e:
-            print(f"Error fetching web content: {str(e)}")
-            return f"Error fetching content from {url}: {str(e)}"
-class GaiaRetrieverTool(Tool):
-    name = "gaia_retriever"
-    description = "Semantic search for retrieving relevant information for GaiaAgent."
-    inputs = {
-        "query": {
-            "type": "string",
-            "description": "Query for semantic search."
-        }
-    }
-    output_type = "string"
-    def __init__(self, docs, **kwargs):
-        super().__init__(**kwargs)
-        self.retriever = BM25Retriever.from_documents(docs, k=3)
-        self.docs = docs  # Store docs for fallback
-    def forward(self, query: str) -> str:
-        assert isinstance(query, str), "Query must be a string."
-        try:
-            docs = self.retriever.invoke(query)
-            if not docs:
-                return "\nNo specific information found. Here's some general knowledge:\n" + "".join([
-                    f"\n- {self.docs[i].page_content}" for i in range(min(3, len(self.docs)))
-                ])
-            return "\nRetrieved Information:\n" + "".join([
-                f"\n- {doc.page_content}" for doc in docs
-            ])
-        except Exception as e:
-            print(f"Error in retriever: {str(e)}")
-            return f"Unable to retrieve specific information. The agent will rely on its general knowledge."
-# --- Agent ---
-class YoutubeVideoTool(Tool):
-    name = "youtube_video"
-    description = "Analyze YouTube videos to answer questions about their content."
-    inputs = {
-        "video_url": {
-            "type": "string",
-            "description": "The YouTube video URL"
-        }
-    }
-    output_type = "string"
-    def forward(self, video_url: str) -> str:
-        assert isinstance(video_url, str), "Video URL must be a string"
-        try:
-            # Extract video ID from URL
-            if "youtu.be" in video_url:
-                video_id = video_url.split("/")[-1].split("?")[0]
-            else:
-                video_id = re.search(r'v=([^&]+)', video_url).group(1)
-            # Get video info
-            yt = pytube.YouTube(video_url)
-            title = yt.title
-            author = yt.author
-            length = yt.length  # in seconds
-            description = yt.description
-            # Try to get transcript
-            transcript_text = ""
-            try:
-                transcript = YouTubeTranscriptApi.get_transcript(video_id)
-                transcript_text = "\n".join([f"{item['start']:.1f}s: {item['text']}" for item in transcript])
-            except Exception as e:
-                transcript_text = f"Could not retrieve transcript: {str(e)}"
-            result = f"""
-YouTube Video Analysis:
-Title: {title}
-Author: {author}
-Length: {length//60} minutes {length%60} seconds
-Description: {description[:500]}... [truncated]
-Transcript Excerpts:
-{transcript_text[:2000]}... [transcript truncated]
-"""
-            return result
-        except Exception as e:
-            print(f"Error analyzing YouTube video: {str(e)}")
-            return f"Error analyzing YouTube video {video_url}: {str(e)}"
-class WikipediaTool(Tool):
-    name = "wikipedia_search"
-    description = "Search Wikipedia for information about a topic."
-    inputs = {
-        "query": {
-            "type": "string",
-            "description": "The search query"
-        }
-    }
-    output_type = "string"
-    def forward(self, query: str) -> str:
-        assert isinstance(query, str), "Query must be a string"
-        try:
-            search_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={query}&format=json"
-            search_response = requests.get(search_url, timeout=10)
-            search_data = search_response.json()
-            if "query" not in search_data or "search" not in search_data["query"] or not search_data["query"]["search"]:
-                return f"No Wikipedia results found for {query}"
-            # Get the first result
-            first_result = search_data["query"]["search"][0]
-            page_id = first_result["pageid"]
-            # Get the page content
-            content_url = f"https://en.wikipedia.org/w/api.php?action=query&prop=extracts&exintro&explaintext&pageids={page_id}&format=json"
-            content_response = requests.get(content_url, timeout=10)
-            content_data = content_response.json()
-            extract = content_data["query"]["pages"][str(page_id)]["extract"]
-            title = content_data["query"]["pages"][str(page_id)]["title"]
-            return f"""Wikipedia: {title}
-{extract[:1500]}... [content truncated]
-Source: https://en.wikipedia.org/wiki/{title.replace(' ', '_')}
-"""
-        except Exception as e:
-            print(f"Error searching Wikipedia: {str(e)}")
-            return f"Error searching Wikipedia for {query}: {str(e)}"
-class GaiaAgent:
     def __init__(self):
-        print("GaiaAgent initialized.")
-        # Create knowledge base documents
-        self.knowledge_docs = create_knowledge_documents()
-        # Create our tools
-        self.retriever_tool = GaiaRetrieverTool(self.knowledge_docs)
-        self.web_search_tool = WebSearchTool()
-        self.web_content_tool = WebContentTool()
-        self.youtube_tool = YoutubeVideoTool()
-        self.wikipedia_tool = WikipediaTool()
-        # Initialize the Hugging Face model
-        self.model = InferenceClientModel()
-        # Initialize the web search tool
-        # self.search_tool = DuckDuckGoSearchTool()
-        # Initialize the weather tool
-        # self.weather_info_tool = WeatherInfoTool()
-        # Initialize the Hub stats tool
-        # self.hub_stats_tool = HubStatsTool()
-        # Load the guest dataset and initialize the guest info tool
-        # self.guest_info_tool = load_guest_dataset()
-        # Set up LLM API access
-        self.hf_api_url = LLAMA_API_URL
-        self.headers = HEADERS
-        # Set up caching for responses
-        self.cache = {}
-    def query_llm(self, prompt):
-        """Send a prompt to the LLM API and return the response."""
-        # Check cache first
-        if prompt in self.cache:
-            print("Using cached response")
-            return self.cache[prompt]
-        if not HF_API_TOKEN:
-            # Fallback to rule-based approach if no API token
-            return self.rule_based_answer(prompt)
-        payload = {
-            "inputs": prompt,
-            "parameters": {
-                "max_new_tokens": 512,
-                "temperature": 0.7,
-                "top_p": 0.9,
-                "do_sample": True
-            }
-        }
-        for attempt in range(MAX_RETRIES):
             try:
-                response = requests.post(self.hf_api_url, headers=self.headers, json=payload, timeout=30)
-                response.raise_for_status()
-                result = response.json()
-                # Extract the generated text from the response
-                if isinstance(result, list) and len(result) > 0:
-                    generated_text = result[0].get("generated_text", "")
-                    # Clean up the response to get just the answer
-                    clean_response = self.clean_response(generated_text, prompt)
-                    # Cache the response
-                    self.cache[prompt] = clean_response
-                    return clean_response
-                return "I couldn't generate a proper response."
             except Exception as e:
-                print(f"Attempt {attempt+1}/{MAX_RETRIES} failed: {str(e)}")
-                if attempt < MAX_RETRIES - 1:
-                    time.sleep(RETRY_DELAY)
-                else:
-                    # Fall back to rule-based method on failure
-                    return self.rule_based_answer(prompt)
-    def clean_response(self, response, prompt):
-        """Clean up the LLM response to extract the answer."""
-        # Remove the prompt from the beginning if it's included
-        if response.startswith(prompt):
-            response = response[len(prompt):]
-        # Try to find where the model's actual answer begins
-        markers = ["<answer>", "<response>", "Answer:", "Response:", "Assistant:"]
-        for marker in markers:
-            if marker.lower() in response.lower():
-                parts = response.lower().split(marker.lower(), 1)
-                if len(parts) > 1:
-                    response = parts[1].strip()
-        # Remove any closing tags if they exist
-        end_markers = ["</answer>", "</response>", "Human:", "User:"]
-        for marker in end_markers:
-            if marker.lower() in response.lower():
-                response = response.lower().split(marker.lower())[0].strip()
-        return response.strip()
-    def rule_based_answer(self, question):
-        """Fallback method using rule-based answers for common question types."""
-        question_lower = question.lower()
-        # Simple pattern matching for common question types
-        if "what is" in question_lower or "define" in question_lower:
-            if "agent" in question_lower:
-                return "An agent is an autonomous entity that observes and acts upon an environment using sensors and actuators, usually to achieve specific goals."
-            if "gaia" in question_lower:
-                return "GAIA (General AI Assistant) is a framework for creating and evaluating AI assistants that can perform a wide range of tasks."
-            if "llm" in question_lower or "large language model" in question_lower:
-                return "A Large Language Model (LLM) is a neural network trained on vast amounts of text data to understand and generate human language."
-            if "rag" in question_lower or "retrieval" in question_lower:
-                return "RAG (Retrieval-Augmented Generation) combines retrieval of relevant information with generation capabilities of language models."
-        if "how to" in question_lower:
-            return "To accomplish this task, you should first understand the requirements, then implement a solution step by step, and finally test your implementation."
-        if "example" in question_lower:
-            return "Here's an example implementation that demonstrates the concept in a practical manner."
-        if "evaluate" in question_lower or "criteria" in question_lower:
-            return "Evaluation criteria for agents typically include accuracy, relevance, factual correctness, conciseness, ability to follow instructions, and transparency in reasoning."
-        # More specific fallback answers instead of a generic one
-        if "tools" in question_lower:
-            return "Tools for AI agents include web search, content extraction, API connections, and various knowledge retrieval mechanisms."
-        if "chain" in question_lower:
-            return "Chain-of-thought reasoning allows AI agents to break down complex problems into sequential steps, improving accuracy and transparency."
-        if "purpose" in question_lower or "goal" in question_lower:
-            return "The purpose of AI agents is to assist users by answering questions, performing tasks, and providing helpful information while maintaining ethical standards."
-        # Default response for truly unmatched questions - more specific than before
-        return "This question relates to AI agent capabilities. While I don't have a specific pre-programmed answer, I can recommend reviewing literature on agent architectures, tool use in LLMs, and evaluation methods in AI systems."
-    def determine_tools_needed(self, question):
-        """Determine which tools should be used for a given question."""
-        question_lower = question.lower()
-        # Check for YouTube links
-        youtube_patterns = ["youtube.com", "youtu.be"]
-        needs_youtube = any(pattern in question_lower for pattern in youtube_patterns)
-        # Check if this is a reverse text question
-        is_reverse_text = question_lower != question_lower[::-1] and len(set(question_lower)) < 30
-        # Check for Wikipedia-related questions
-        wiki_patterns = ["wikipedia", "article", "published", "paper", "study", "research"]
-        needs_wikipedia = any(pattern in question_lower for pattern in wiki_patterns)
-        # Patterns that suggest the need for web search
-        web_search_patterns = [
-            "current", "latest", "recent", "news", "update", "today",
-            "statistics", "data", "facts", "information about", "published",
-            "what is happening", "how many", "where is", "when was", "who", "which",
-            "country", "city", "2023", "2022", "published", "album", "studio", "paper",
-            "olympics", "sport", "athlete", "player", "pitcher", "baseball", "competition",
-            "name", "first", "last", "actor", "played", "version", "language", "company"
-        ]
-        # Check if the question likely needs web search
-        needs_web_search = any(pattern in question_lower for pattern in web_search_patterns)
-                  # Check if question appears to be about GAIA, agents, or AI concepts
-        needs_knowledge_retrieval = any(term in question_lower for term in
-                                       ["agent", "gaia", "llm", "ai", "artificial intelligence",
-                                        "evaluation", "tool", "rag", "retrieval"])
-        # Determine which tools to use based on the analysis
-        return {
-            "use_youtube": needs_youtube,
-            "use_wikipedia": needs_wikipedia,
-            "is_reverse_text": is_reverse_text,
-            "use_web_search": needs_web_search,
-            "use_knowledge_retrieval": needs_knowledge_retrieval,
-            "use_webpage_visit": "example" in question_lower or "details" in question_lower or "explain" in question_lower or "link" in question_lower
-        }
-    def handle_special_questions(self, question, tool_selection):
-        """Handle specific question types that require special logic."""
-        question_lower = question.lower()
-        # Handle reverse text questions - generalized approach
-        if tool_selection.get("is_reverse_text", False):
-            # Check if this looks like a reverse text puzzle
-            if "rewsna" in question_lower:  # "answer" reversed
-                reversed_question = question[::-1]
-                print(f"Detected reverse text question, reversed: {reversed_question}")
-                # Use the LLM to answer the reversed question
-                reversed_prompt = self.format_prompt(reversed_question)
-                answer = self.query_llm(reversed_prompt)
-                return self.extract_final_answer(answer)
-        # Handle mathematical table analysis - look for patterns
-        if "table" in question_lower and ("commutative" in question_lower or "operation" in question_lower):
-            # Extract table data and analyze mathematically
-            return self.analyze_table(question)
-        # Handle grocery/botany questions - use categorization
-        if "grocery" in question_lower and "botany" in question_lower:
-            return self.analyze_botanical_categories(question)
-        # Handle file analysis questions - Excel, Python, Audio etc.
-        file_extensions = ["excel", "xlsx", "csv", "python", ".py", "mp3", "wav", "audio"]
-        if any(ext in question_lower for ext in file_extensions):
-            if "excel" in question_lower or "xlsx" in question_lower:
-                return self.analyze_excel_data(question)
-            elif "python" in question_lower or ".py" in question_lower:
-                return self.analyze_python_code(question)
-            elif any(audio in question_lower for audio in ["mp3", "wav", "audio", "voice memo"]):
-                return self.analyze_audio_content(question)
-        return None
-    def analyze_table(self, question):
-        """Analyze mathematical table for patterns - generalized approach."""
-        # Look for table data in the question and analyze commutativity
-        # This should extract table elements and check mathematical properties
-        if "commutative" in question.lower():
-            # Use regex to find table elements or parse structured data
-            # For now, use LLM to analyze the mathematical content
-            table_prompt = f"""Analyze the mathematical table in this question and determine the answer:
-{question}
-Look for patterns in commutativity, operations, or mathematical relationships.
-Provide only the direct answer requested."""
-            answer = self.query_llm(table_prompt)
-            return self.extract_final_answer(answer)
-        return None
-    def analyze_botanical_categories(self, question):
-        """Analyze botanical categories from grocery items - generalized approach."""
-        # Extract grocery items and categorize botanically
-        botanical_prompt = f"""Analyze the grocery items in this question from a botanical perspective:
-{question}
-Identify which items are true botanical vegetables (not fruits, seeds, or other plant parts).
-Provide the answer in the exact format requested."""
-        answer = self.query_llm(botanical_prompt)
-        return self.extract_final_answer(answer)
-    def analyze_excel_data(self, question):
-        """Analyze Excel spreadsheet data - generalized approach."""
-        # Parse Excel data mentioned in question and perform calculations
-        excel_prompt = f"""Analyze the Excel spreadsheet data in this question:
-{question}
-Perform the required calculations or data analysis as specified.
-Provide only the numeric or exact answer requested."""
-        answer = self.query_llm(excel_prompt)
-        return self.extract_final_answer(answer)
-    def analyze_audio_content(self, question):
-        """Analyze audio content from voice memos - generalized approach."""
-        # Parse audio content description and extract requested information
-        audio_prompt = f"""Analyze the audio content described in this question:
-{question}
-Extract the specific information requested (ingredients, page numbers, names, etc.).
-Provide the answer in the exact format requested."""
-        answer = self.query_llm(audio_prompt)
-        return self.extract_final_answer(answer)
-    def analyze_python_code(self, question):
-        """Analyze Python code for output - generalized approach."""
-        # Parse Python code in question and determine output
-        code_prompt = f"""Analyze the Python code in this question and determine its output:
-{question}
-Execute the code logic mentally and provide the exact numeric or text output that would result.
-Provide only the direct answer requested."""
-        answer = self.query_llm(code_prompt)
-        return self.extract_final_answer(answer)
-    def improved_determine_tools_needed(self, question):
-        """Enhanced tool selection with better pattern matching."""
-        question_lower = question.lower()
-        # YouTube detection - more comprehensive
-        youtube_patterns = ["youtube.com", "youtu.be", "video", "watch?v=", "channel"]
-        needs_youtube = any(pattern in question_lower for pattern in youtube_patterns)
-        # Reverse text detection - improved logic
-        is_reverse_text = ("rewsna" in question_lower or
-                          (question_lower != question_lower[::-1] and
-                           "ecnetnes" in question_lower or "sdrow" in question_lower))
-        # Wikipedia detection - expanded patterns
-        wiki_patterns = ["wikipedia", "article", "published", "featured article",
-                        "promoted", "nominated", "discography", "studio albums",
-                        "encyclopedia", "wiki", "featured content"]
-        needs_wikipedia = any(pattern in question_lower for pattern in wiki_patterns)
-        # Web search patterns - comprehensive list
-        web_search_patterns = [
-            # Time indicators
-            "current", "latest", "recent", "2023", "2022", "2021", "2020", "today",
-            # Question words
-            "how many", "where", "when", "who", "which", "what", "whose",
-            # Sports and competitions
-            "yankee", "walks", "athletes", "olympics", "competition", "pitcher", "baseball",
-            # Specific entities that need web lookup
-            "malko", "taishō tamai", "universe today", "nedoshivina",
-            "specimens", "polish-language", "actor", "played",
-            # Geographic and demographic
-            "country", "nationality", "first name", "award number", "city",
-            # Publications and research
-            "published", "paper", "study", "research", "journal", "author",
-            # Statistics and data
-            "statistics", "data", "facts", "information about", "number of"
-        ]
-        needs_web_search = any(pattern in question_lower for pattern in web_search_patterns)
-        # Knowledge retrieval for AI/agent questions
-        ai_patterns = ["agent", "gaia", "llm", "ai", "evaluation", "tool", "artificial intelligence"]
-        needs_knowledge = any(term in question_lower for term in ai_patterns)
-        # File analysis detection
-        file_patterns = ["excel", "xlsx", "csv", "python", ".py", "mp3", "wav", "audio", "voice memo"]
-        has_file_analysis = any(pattern in question_lower for pattern in file_patterns)
-        return {
-            "use_youtube": needs_youtube,
-            "use_wikipedia": needs_wikipedia,
-            "is_reverse_text": is_reverse_text,
-            "use_web_search": needs_web_search,
-            "use_knowledge_retrieval": needs_knowledge,
-            "use_webpage_visit": needs_web_search and ("link" in question_lower or "paper" in question_lower),
-            "has_file_analysis": has_file_analysis
         }
-    def __call__(self, question: str) -> str:
-        """Main agent execution method - completely refactored for generalizability."""
-        import re
-        print(f"GaiaAgent received question (raw): {question}")
-        try:
-            # Step 1: Analyze question and determine tool strategy
-            tool_selection = self.improved_determine_tools_needed(question)
-            print(f"Tool selection: {tool_selection}")
-            # Step 2: Try special handlers first
-            special_answer = self.handle_special_questions(question, tool_selection)
-            if special_answer:
-                print(f"Special handler returned: {special_answer}")
-                return special_answer
-            # Step 3: Gather information from tools
-            context_info = []
-            # YouTube analysis
-            if tool_selection["use_youtube"]:
-                youtube_urls = re.findall(r'(https?://(?:www\.)?(?:youtube\.com/watch\?v=|youtu\.be/)[\w-]+)', question)
-                if youtube_urls:
-                    try:
-                        youtube_info = self.youtube_tool.forward(youtube_urls[0])
-                        context_info.append(f"YouTube Analysis:\n{youtube_info}")
-                        print("Retrieved YouTube information")
-                              # YouTube content is now in context_info for LLM processing
-                    # No hardcoded answers - let LLM analyze the YouTube content
-                    except Exception as e:
-                        print(f"Error with YouTube tool: {e}")
-            # Wikipedia research
-            if tool_selection["use_wikipedia"]:
-                try:
-                    # Smart search term extraction
-                    search_query = question
-                    if "mercedes sosa" in question.lower():
-                        search_query = "Mercedes Sosa discography"
-                    elif "dinosaur" in question.lower() and "featured article" in question.lower():
-                        search_query = "dinosaur featured articles wikipedia"
-                    wikipedia_info = self.wikipedia_tool.forward(search_query)
-                    context_info.append(f"Wikipedia Research:\n{wikipedia_info}")
-                    print("Retrieved Wikipedia information")
-                      # Wikipedia content is now in context_info for LLM processing
-                    # No hardcoded answers - let LLM analyze the Wikipedia content
-                except Exception as e:
-                    print(f"Error with Wikipedia tool: {e}")
-            # Web search and analysis
-            if tool_selection["use_web_search"]:
-                try:
-                    web_info = self.web_search_tool.forward(question)
-                    context_info.append(f"Web Search Results:\n{web_info}")
-                    print("Retrieved web search results")
-                      # Web search content is now in context_info for LLM processing
-                    # No hardcoded answers - let LLM analyze the web search results
-                    # Follow up with webpage content if needed
-                    if tool_selection["use_webpage_visit"] and "http" in web_info.lower():
-                        url_match = re.search(r'Source: (https?://[^\s]+)', web_info)
-                        if url_match:
-                            try:
-                                webpage_content = self.web_content_tool.forward(url_match.group(1))
-                                context_info.append(f"Webpage Content:\n{webpage_content}")
-                                print("Retrieved detailed webpage content")
-                            except Exception as e:
-                                print(f"Error retrieving webpage content: {e}")
-                except Exception as e:
-                    print(f"Error with web search: {e}")
-            # Knowledge base retrieval
-            if tool_selection["use_knowledge_retrieval"]:
-                try:
-                    knowledge_info = self.retriever_tool.forward(question)
-                    context_info.append(f"Knowledge Base:\n{knowledge_info}")
-                    print("Retrieved knowledge base information")
-                except Exception as e:
-                    print(f"Error with knowledge retrieval: {e}")
-            # Step 4: Synthesize answer using LLM
-            if context_info:
-                all_context = "\n\n".join(context_info)
-                prompt = self.format_prompt(question, all_context)
-            else:
-                prompt = self.format_prompt(question)
-            # Query LLM for final answer
-            answer = self.query_llm(prompt)
-            # Step 5: Clean and validate answer
-            clean_answer = self.extract_final_answer(answer)
-            print(f"GaiaAgent returning answer: {clean_answer}")
-            return clean_answer
-        except Exception as e:
-            print(f"Error in GaiaAgent: {e}")
-            # Fallback to rule-based method
-            fallback_answer = self.rule_based_answer(question)
-            print(f"GaiaAgent returning fallback answer: {fallback_answer}")
-        return fallback_answer
-    def format_prompt(self, question, context=""):
-        """Format the question into a proper prompt for the LLM."""
-        if context:
-            return f"""You are a precise AI assistant that answers questions using available information. Your answer will be evaluated with exact string matching, so provide only the specific answer requested without additional text.
-Context Information:
-{context}
-Question: {question}
-Critical Instructions:
-- Provide ONLY the exact answer requested, nothing else
-- Do not include phrases like "The answer is", "Final answer", or "Based on the context"
-- For numerical answers, use the exact format requested (integers, decimals, etc.)
-- For lists, use the exact formatting specified in the question (commas, spaces, etc.)
-- For names, use proper capitalization as would appear in official sources
-- Be concise and precise - extra words will cause evaluation failure
-- If the question asks for multiple items, provide them in the exact format requested
-Direct Answer:"""
-        else:
-            return f"""You are a precise AI assistant that answers questions accurately. Your answer will be evaluated with exact string matching, so provide only the specific answer requested without additional text.
-Question: {question}
-Critical Instructions:
-- Provide ONLY the exact answer requested, nothing else
-- Do not include phrases like "The answer is", "Final answer", or explanations
-- For numerical answers, use the exact format that would be expected
-- For lists, use appropriate formatting (commas, spaces, etc.)
-- For names, use proper capitalization
-- Be concise and precise - extra words will cause evaluation failure
-- Answer based on your knowledge and reasoning
-Direct Answer:"""
-    def extract_final_answer(self, answer):
-        """Extract and clean the final answer for exact matching."""
-        # Remove common prefixes that might interfere with exact matching
-        prefixes_to_remove = [
-            "final answer:", "answer:", "the answer is:", "result:",
-            "solution:", "conclusion:", "final answer is:", "direct answer:",
-            "based on the context:", "according to:", "the result is:"
-        ]
-        clean_answer = answer.strip()
-        # Remove prefixes (case insensitive)
-        for prefix in prefixes_to_remove:
-            if clean_answer.lower().startswith(prefix.lower()):
-                clean_answer = clean_answer[len(prefix):].strip()
-        # Remove quotes if the entire answer is quoted
-        if clean_answer.startswith('"') and clean_answer.endswith('"'):
-            clean_answer = clean_answer[1:-1]
-        elif clean_answer.startswith("'") and clean_answer.endswith("'"):
-            clean_answer = clean_answer[1:-1]
-        # Remove trailing periods if they seem extraneous
-        if clean_answer.endswith('.') and not clean_answer.replace('.', '').isdigit():
-            # Don't remove decimal points from numbers
-            if not (clean_answer.count('.') == 1 and clean_answer.replace('.', '').isdigit()):
-                clean_answer = clean_answer[:-1]
-        # Clean up extra whitespace
-        clean_answer = ' '.join(clean_answer.split())
-        return clean_answer
-class BasicAgent:
-    def __init__(self):
-        print("BasicAgent initialized.")
-        # Initialize the Hugging Face API client
-        # https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
-        self.hf_api_url = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
-        self.hf_api_token = os.getenv("HF_API_TOKEN")
-        if not self.hf_api_token:
-            print("WARNING: HF_API_TOKEN not found. Using default fallback methods.")
-        self.headers = {"Authorization": f"Bearer {self.hf_api_token}"} if self.hf_api_token else {}
-        self.max_retries = 3
-        self.retry_delay = 2  # seconds
-    def query_llm(self, prompt):
-        """Send a prompt to the LLM API and return the response."""
-        if not self.hf_api_token:
-            # Fallback to a rule-based approach if no API token
-            return self.rule_based_answer(prompt)
-        payload = {
-            "inputs": prompt,
-            "parameters": {
-                "max_new_tokens": 512,
-                "temperature": 0.7,
-                "top_p": 0.9,
-                "do_sample": True
-            }
-        }
-        for attempt in range(self.max_retries):
-            try:
-                response = requests.post(self.hf_api_url, headers=self.headers, json=payload, timeout=30)
-                response.raise_for_status()
-                result = response.json()
-                # Extract the generated text from the response
-                if isinstance(result, list) and len(result) > 0:
-                    generated_text = result[0].get("generated_text", "")
-                    # Clean up the response to get just the answer
-                    return self.clean_response(generated_text, prompt)
-                return "I couldn't generate a proper response."
-            except Exception as e:
-                print(f"Attempt {attempt+1}/{self.max_retries} failed: {str(e)}")
-                if attempt < self.max_retries - 1:
-                    time.sleep(self.retry_delay)
-                else:
-                    # Fall back to rule-based method on failure
-                    return self.rule_based_answer(prompt)
-    def clean_response(self, response, prompt):
-        """Clean up the LLM response to extract the answer."""
-        # Remove the prompt from the beginning if it's included
-        if response.startswith(prompt):
-            response = response[len(prompt):]
-        # Try to find where the model's actual answer begins
-        markers = ["<answer>", "<response>", "Answer:", "Response:", "Assistant:"]
-        for marker in markers:
-            if marker.lower() in response.lower():
-                parts = response.lower().split(marker.lower(), 1)
-                if len(parts) > 1:
-                    response = parts[1].strip()
-        # Remove any closing tags if they exist
-        end_markers = ["</answer>", "</response>", "Human:", "User:"]
-        for marker in end_markers:
-            if marker.lower() in response.lower():
-                response = response.lower().split(marker.lower())[0].strip()
-        return response.strip()
-    def rule_based_answer(self, question):
-        """Fallback method using rule-based answers for common question types."""
-        question_lower = question.lower()
-        # Simple pattern matching for common question types
-        if "what is" in question_lower or "define" in question_lower:
-            if "agent" in question_lower:
-                return "An agent is an autonomous entity that observes and acts upon an environment using sensors and actuators, usually to achieve specific goals."
-            if "gaia" in question_lower:
-                return "GAIA (General AI Assistant) is a framework for creating and evaluating AI assistants that can perform a wide range of tasks."
-        if "how to" in question_lower:
-            return "To accomplish this task, you should first understand the requirements, then implement a solution step by step, and finally test your implementation."
-        if "example" in question_lower:
-            return "Here's an example implementation that demonstrates the concept in a practical manner."
-        # More specific fallback answers instead of a generic one
-        if "tools" in question_lower:
-            return "Tools for AI agents include web search, content extraction, API connections, and various knowledge retrieval mechanisms."
-        if "chain" in question_lower:
-            return "Chain-of-thought reasoning allows AI agents to break down complex problems into sequential steps, improving accuracy and transparency."
-        if "purpose" in question_lower or "goal" in question_lower:
-            return "The purpose of AI agents is to assist users by answering questions, performing tasks, and providing helpful information while maintaining ethical standards."
-        # Default response for truly unmatched questions - more specific than before
-        return "This question relates to AI agent capabilities. To provide a more precise answer, I would need additional information or context about the specific aspect of AI agents you're interested in."
-    def format_prompt(self, question):
-        """Format the question into a proper prompt for the LLM."""
-        return f"""You are an intelligent AI assistant. Please answer the following question accurately and concisely:
-Question: {question}
-Answer:"""
-    def __call__(self, question: str) -> str:
-        print(f"Agent received question: {question}...")
-        try:
-            # Format the question as a prompt
-            prompt = self.format_prompt(question)
-            # Query the LLM
-            answer = self.query_llm(prompt)
-            print(f"Agent returning answer: {answer}...")
-            return answer
-        except Exception as e:
-            print(f"Error in agent: {e}")
-            # Fallback to the rule-based method if anything goes wrong
-            fallback_answer = self.rule_based_answer(question)
-            print(f"Agent returning fallback answer: {fallback_answer}...")
-            return fallback_answer
-def load_guest_dataset():
-    """
-    Placeholder function to prevent errors. If actual guest data is needed,
-    this would be implemented properly.
-    """
-    class GuestInfoTool(Tool):
-        name = "guest_info"
-        description = "Get information about guests"
-        def forward(self, query):
-            return "Guest information not available in this version"
-    return GuestInfoTool()
-def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
-    Fetches all questions, runs the BasicAgent on them, submits all answers,
     and displays the results.
     """
     # --- Determine HF Space Runtime URL and Repo URL ---
     space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
     if profile:
-        username= f"{profile.username}"
         print(f"User logged in: {username}")
     else:
         print("User not logged in.")
@@ -959,57 +130,18 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
     api_url = DEFAULT_API_URL
     questions_url = f"{api_url}/questions"
-    submit_url = f"{api_url}/submit"    # 1. Instantiate Agent ( modify this part to create your agent)
-    try:
-        print("Initializing GaiaAgent...")
-        # Use GaiaAgent as the primary agent
-        agent = GaiaAgent()
-        # Skip the CodeAgent setup that's overriding our GaiaAgent
-        """
-        # Initialize the Hugging Face model
-        model = InferenceClientModel()
-        # Initialize the web search tool
-        #search_tool = DuckDuckGoSearchTool()
-        # Initialize the weather tool
-        #weather_info_tool = WeatherInfoTool()
-        # Initialize the Hub stats tool
-        #hub_stats_tool = HubStatsTool()
-        # Load the guest dataset and initialize the guest info tool
-        guest_info_tool = load_guest_dataset()
-        # Initialize the Hugging Face model
-        model = InferenceClientModel()
-        # Load the DuckDuckGo search tool dynamically
-        search_tool = load_tool(repo_id="smol-ai/duckduckgo-search", trust_remote_code=True)
-        agent = CodeAgent(
-            tools=[guest_info_tool, search_tool],
-            model=model,
-            add_base_tools=True,  # Add any additional base tools
-            planning_interval=3   # Enable planning every 3 steps
-        )
-        """
-        print("GaiaAgent initialization complete.")
     except Exception as e:
-        print(f"Error instantiating GaiaAgent: {e}")
-        print("Falling back to BasicAgent...")
-        try:
-            agent = BasicAgent()
-            print("BasicAgent initialization complete.")
-        except Exception as e:
-            print(f"Error instantiating BasicAgent: {e}")
-            return f"Error initializing agents: {e}", None
-    # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
-    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
-    print(agent_code)
     # 2. Fetch Questions
     print(f"Fetching questions from: {questions_url}")
@@ -1018,21 +150,17 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
         response.raise_for_status()
         questions_data = response.json()
         if not questions_data:
-             print("Fetched questions list is empty.")
-             return "Fetched questions list is empty or invalid format.", None
         print(f"Fetched {len(questions_data)} questions.")
     except requests.exceptions.RequestException as e:
         print(f"Error fetching questions: {e}")
         return f"Error fetching questions: {e}", None
-    except requests.exceptions.JSONDecodeError as e:
-         print(f"Error decoding JSON response from questions endpoint: {e}")
-         print(f"Response text: {response.text[:500]}")
-         return f"Error decoding server response for questions: {e}", None
     except Exception as e:
         print(f"An unexpected error occurred fetching questions: {e}")
         return f"An unexpected error occurred fetching questions: {e}", None
-    # 3. Run your Agent
     results_log = []
     answers_payload = []
     print(f"Running agent on {len(questions_data)} questions...")
@@ -1043,29 +171,54 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
             print(f"Skipping item with missing task_id or question: {item}")
             continue
         try:
-            submitted_answer = agent(question_text)
-            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
-            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
         except Exception as e:
-             print(f"Error running agent on task {task_id}: {e}")
-             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
     if not answers_payload:
         print("Agent did not produce any answers to submit.")
         return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
-    # 4. Prepare Submission
-    submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
-    status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
-    print(status_update)
-    # 5. Submit
-    print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
     try:
         response = requests.post(submit_url, json=submission_data, timeout=60)
         response.raise_for_status()
         result_data = response.json()
-        final_status = (
             f"Submission Successful!\n"
             f"User: {result_data.get('username')}\n"
             f"Overall Score: {result_data.get('score', 'N/A')}% "
@@ -1073,60 +226,28 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
             f"Message: {result_data.get('message', 'No message received.')}"
         )
         print("Submission successful.")
-        results_df = pd.DataFrame(results_log)
-        return final_status, results_df
-    except requests.exceptions.HTTPError as e:
-        error_detail = f"Server responded with status {e.response.status_code}."
-        try:
-            error_json = e.response.json()
-            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
-        except requests.exceptions.JSONDecodeError:
-            error_detail += f" Response: {e.response.text[:500]}"
-        status_message = f"Submission Failed: {error_detail}"
-        print(status_message)
-        results_df = pd.DataFrame(results_log)
-        return status_message, results_df
-    except requests.exceptions.Timeout:
-        status_message = "Submission Failed: The request timed out."
-        print(status_message)
-        results_df = pd.DataFrame(results_log)
-        return status_message, results_df
-    except requests.exceptions.RequestException as e:
-        status_message = f"Submission Failed: Network error - {e}"
-        print(status_message)
-        results_df = pd.DataFrame(results_log)
-        return status_message, results_df
     except Exception as e:
-        status_message = f"An unexpected error occurred during submission: {e}"
-        print(status_message)
-        results_df = pd.DataFrame(results_log)
-        return status_message, results_df
 # --- Build Gradio Interface using Blocks ---
 with gr.Blocks() as demo:
-    gr.Markdown("# Basic Agent Evaluation Runner")
     gr.Markdown(
         """
         **Instructions:**
-        1.  Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
-        2.  Log in to your Hugging Face account using the button below. This uses your HF username for submission.
-        3.  Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
-        ---
-        **Disclaimers:**
-        Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
-        This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
         """
     )
     gr.LoginButton()
     run_button = gr.Button("Run Evaluation & Submit All Answers")
     status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
-    # Removed max_rows=10 from DataFrame constructor
     results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
     run_button.click(
@@ -1135,25 +256,5 @@ with gr.Blocks() as demo:
     )
 if __name__ == "__main__":
-    print("\n" + "-"*30 + " App Starting " + "-"*30)
-    # Check for SPACE_HOST and SPACE_ID at startup for information
-    space_host_startup = os.getenv("SPACE_HOST")
-    space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup
-    if space_host_startup:
-        print(f"✅ SPACE_HOST found: {space_host_startup}")
-        print(f"   Runtime URL should be: https://{space_host_startup}.hf.space")
-    else:
-        print("ℹ️  SPACE_HOST environment variable not found (running locally?).")
-    if space_id_startup: # Print repo URLs if SPACE_ID is found
-        print(f"✅ SPACE_ID found: {space_id_startup}")
-        print(f"   Repo URL: https://huggingface.co/spaces/{space_id_startup}")
-        print(f"   Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
-    else:
-        print("ℹ️  SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
-    print("-"*(60 + len(" App Starting ")) + "\n")
-    print("Launching Gradio Interface for Basic Agent Evaluation...")
     demo.launch(debug=True, share=False)

 from duckduckgo_search import DDGS
 import pytube
 from dateutil import parser
 try:
     from youtube_transcript_api import YouTubeTranscriptApi
 except ImportError:
 from smolagents import Tool, CodeAgent, InferenceClientModel
+# Import internal modules
+from config import (
+    DEFAULT_API_URL, LLAMA_API_URL, HF_API_TOKEN, HEADERS,
+    MAX_RETRIES, RETRY_DELAY
+)
+from tools.tool_manager import ToolManager
+class GaiaToolCallingAgent:
+    """Tool-calling agent specifically designed for the GAIA system.
+    Gathers output from the tools supplied by ``ToolManager`` whose keyword
+    patterns match the query and returns that output as one block of text.
+    It does not synthesize a final answer itself; it only returns tool context.
+    """
     def __init__(self):
+        """Create the tool manager and set the ``name``/``description`` attributes."""
+        print("GaiaToolCallingAgent initialized.")
+        self.tool_manager = ToolManager()
+        self.name = "tool_agent"  # Add required name attribute for smolagents integration
+        self.description = "A specialized agent that uses various tools to answer questions"  # Required by smolagents
+    def run(self, query: str) -> str:
+        """Run every matching tool on ``query`` and return the combined results.
+        Each tool selected by ``_should_use_tool`` is called via ``tool.forward(query)``.
+        Truthy results are labelled with the tool name and joined with blank lines.
+        Returns an empty string when no tool produced a result.
+        """
+        tools = self.tool_manager.get_tools()
+        # For each tool, try to get relevant information
+        context_info = []
+        for tool in tools:
+            # Best-effort: a failure in one tool is printed and does not stop the others.
             try:
+                if self._should_use_tool(tool, query):
+                    result = tool.forward(query)
+                    if result:
+                        context_info.append(f"{tool.name} Results:\n{result}")
             except Exception as e:
+                print(f"Error using {tool.name}: {e}")
+        # Combine all context information
+        full_context = "\n\n".join(context_info) if context_info else ""
+        return full_context
+    def __call__(self, query: str) -> str:
+        """Make the agent callable so it can be used directly by CodeAgent.
+        Prints the query and delegates to ``run``.
+        """
+        print(f"Tool agent received query: {query}")
+        return self.run(query)
+    def _should_use_tool(self, tool: Tool, query: str) -> bool:
+        """Determine if a specific tool should be used for the query.
+        Returns True when any keyword registered for ``tool.name`` occurs as a
+        substring of the lower-cased query. Tools whose name has no entry in the
+        pattern table below are never selected.
+        """
+        query_lower = query.lower()
+        # Tool-specific patterns
+        # NOTE(review): matching is by substring, so short keywords also hit inside
+        # longer words ("ai" in "said", "how" in "show"); consider word-boundary
+        # matching if that over-triggering is not intended.
+        patterns = {
+            "web_search": ["current", "latest", "recent", "who", "what", "when", "where", "how"],
+            "web_content": ["content", "webpage", "website", "page"],
+            "youtube_video": ["youtube.com", "youtu.be"],
+            "wikipedia_search": ["wikipedia", "wiki", "article"],
+            "gaia_retriever": ["gaia", "agent", "ai", "artificial intelligence"]
         }
+        return any(pattern in query_lower for pattern in patterns.get(tool.name, []))
+def create_manager_agent() -> CodeAgent:
+    """Create and configure the main GAIA agent.
+    Builds a ``CodeAgent`` backed by a default-constructed ``InferenceClientModel``.
+    The manager has no tools of its own and a single managed agent, a fresh
+    ``GaiaToolCallingAgent``.
+    """
+    # Initialize the managed tool-calling agent
+    # NOTE(review): GaiaToolCallingAgent is a plain class with no base class shown
+    # here; confirm CodeAgent accepts it as an entry in managed_agents.
+    tool_agent = GaiaToolCallingAgent()
+    # Create the manager agent
+    manager_agent = CodeAgent(
+        model=InferenceClientModel(),
+        tools=[],  # No direct tools for manager
+        managed_agents=[tool_agent],
+        # Modules the manager's generated code is additionally allowed to import.
+        additional_authorized_imports=[
+            "json",
+            "pandas",
+            "numpy",
+            "re",
+            "requests",
+            "bs4"
+        ],
+        planning_interval=3,
+        verbosity_level=2,
+        max_steps=10
+    )
+    return manager_agent
+def create_agent():
+    """Create the GAIA agent system.
+    Returns the agent built by ``create_manager_agent``. If construction raises,
+    the error is printed and ``None`` is returned, so callers must check for it.
+    """
+    try:
+        print("Initializing GAIA agent system...")
+        return create_manager_agent()
+    except Exception as e:
+        # Deliberately swallowed: failure is reported to the caller as None.
+        print(f"Error creating GAIA agent: {e}")
+        return None
+def run_and_submit_all(profile: gr.OAuthProfile | None):
     """
+    Fetches all questions, runs the GAIA agent on them, submits all answers,
     and displays the results.
     """
     # --- Determine HF Space Runtime URL and Repo URL ---
     space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
     if profile:
+        username = f"{profile.username}"
         print(f"User logged in: {username}")
     else:
         print("User not logged in.")
+    # NOTE(review): username is only assigned in the logged-in branch above but is
+    # read when the submission payload is built; confirm the logged-out branch
+    # returns early (no return statement is visible in this view).
     api_url = DEFAULT_API_URL
     questions_url = f"{api_url}/questions"
+    submit_url = f"{api_url}/submit"
+    # 1. Initialize Agent
+    try:
+        print("Initializing GAIA agent system...")
+        agent = create_agent()
+        # create_agent() returns None when construction failed.
+        if not agent:
+            return "Error: Could not initialize agent.", None
+        print("GAIA agent initialization complete.")
     except Exception as e:
+        print(f"Error initializing agent: {e}")
+        return f"Error initializing agent: {e}", None
     # 2. Fetch Questions
     print(f"Fetching questions from: {questions_url}")
         response.raise_for_status()
         questions_data = response.json()
         if not questions_data:
+            print("Fetched questions list is empty.")
+            return "Fetched questions list is empty or invalid format.", None
         print(f"Fetched {len(questions_data)} questions.")
     except requests.exceptions.RequestException as e:
         print(f"Error fetching questions: {e}")
         return f"Error fetching questions: {e}", None
     except Exception as e:
         print(f"An unexpected error occurred fetching questions: {e}")
         return f"An unexpected error occurred fetching questions: {e}", None
+    # 3. Run Agent on Questions
     results_log = []
     answers_payload = []
     print(f"Running agent on {len(questions_data)} questions...")
             print(f"Skipping item with missing task_id or question: {item}")
             continue
         try:
+            # Run the agent and get the response
+            # (the name `response` is reused here for the agent output, not an HTTP response)
+            response = agent.run(f"Answer this question concisely: {question_text}")
+            # Clean up the response if needed
+            # A dict result contributes its "answer" entry; anything else is stringified.
+            if isinstance(response, dict):
+                submitted_answer = response.get("answer", str(response))
+            else:
+                submitted_answer = str(response)
+            # Add to submission payload
+            answers_payload.append({
+                "task_id": task_id,
+                "submitted_answer": submitted_answer
+            })
+            # Log the result
+            results_log.append({
+                "Task ID": task_id,
+                "Question": question_text,
+                "Submitted Answer": submitted_answer
+            })
         except Exception as e:
+            # A failed task is logged for display only; it is not added to the submission payload.
+            print(f"Error running agent on task {task_id}: {e}")
+            results_log.append({
+                "Task ID": task_id,
+                "Question": question_text,
+                "Submitted Answer": f"AGENT ERROR: {e}"
+            })
     if not answers_payload:
         print("Agent did not produce any answers to submit.")
+        # NOTE(review): this commit removes "import pandas as pd" from the top of
+        # app.py; confirm `pd` is still in scope, otherwise every pd.DataFrame call
+        # in this function raises NameError.
         return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
+    # 4. Prepare Submission
+    # NOTE(review): space_id comes from os.getenv and may be None, which would put
+    # the literal text "None" into this URL.
+    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
+    submission_data = {
+        "username": username.strip(),
+        "agent_code": agent_code,
+        "answers": answers_payload
+    }
+    # 5. Submit
+    print(f"Submitting {len(answers_payload)} answers to API...")
     try:
         response = requests.post(submit_url, json=submission_data, timeout=60)
         response.raise_for_status()
         result_data = response.json()
+        status_message = (
             f"Submission Successful!\n"
             f"User: {result_data.get('username')}\n"
             f"Overall Score: {result_data.get('score', 'N/A')}% "
             f"Message: {result_data.get('message', 'No message received.')}"
         )
         print("Submission successful.")
+        return status_message, pd.DataFrame(results_log)
     except Exception as e:
+        # Any submission failure is reported in the status text; the per-question log is still returned.
+        status_message = f"Submission Failed: {str(e)}"
+        print(f"Error during submission: {e}")
+        return status_message, pd.DataFrame(results_log)
 # --- Build Gradio Interface using Blocks ---
 with gr.Blocks() as demo:
+    gr.Markdown("# GAIA Agent Evaluation Runner")
     gr.Markdown(
         """
         **Instructions:**
+        1. Log in to your Hugging Face account using the button below.
+        2. Click 'Run Evaluation & Submit All Answers' to fetch questions, run the agent, and see the score.
+        The agent uses a managed tool-calling architecture and the smolagents framework for reliable answers.
         """
     )
     gr.LoginButton()
     run_button = gr.Button("Run Evaluation & Submit All Answers")
+    # Read-only widgets that display the run status text and the per-question results table.
     status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
     results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
+    # NOTE(review): the click handler arguments are not visible in this view.
     run_button.click(
     )
 if __name__ == "__main__":
+    print("\n" + "-"*30 + " GAIA Agent Starting " + "-"*30)
     demo.launch(debug=True, share=False)

config.py CHANGED Viewed

@@ -11,29 +11,17 @@ HEADERS = {"Authorization": f"Bearer {HF_API_TOKEN}"} if HF_API_TOKEN else {}
 MAX_RETRIES = 3
 RETRY_DELAY = 2  # seconds
-# --- Knowledge Base Content ---
-GAIA_KNOWLEDGE = """
-### AI and Agent Concepts
-- An agent is an autonomous entity that observes and acts upon an environment using sensors and actuators, usually to achieve specific goals.
-- GAIA (General AI Assistant) is a framework for creating and evaluating AI assistants that can perform a wide range of tasks.
-- The agent loop consists of perception, reasoning, and action.
-- RAG (Retrieval-Augmented Generation) combines retrieval of relevant information with generation capabilities of language models.
-- An LLM (Large Language Model) is a neural network trained on vast amounts of text data to understand and generate human language.
-### Agent Capabilities
-- Tool use refers to an agent's ability to employ external tools like search engines, APIs, or specialized algorithms.
-- An effective agent should be able to decompose complex problems into manageable parts.
-- Chain-of-thought reasoning allows agents to break down problem-solving steps to improve accuracy.
-- Agents should apply appropriate reasoning strategies based on the type of question (factual, analytical, etc.)
-- Self-reflection helps agents identify and correct errors in their reasoning.
-### Evaluation Criteria
-- Agent responses should be accurate, relevant, and factually correct.
-- Effective agents provide concise yet comprehensive answers.
-- Agents should acknowledge limitations and uncertainties when appropriate.
-- Good agents can follow multi-step instructions and fulfill all requirements.
-- Reasoning transparency helps users understand how the agent arrived at its conclusions.
-"""
 # --- Tool Pattern Matching ---
 YOUTUBE_PATTERNS = ["youtube.com", "youtu.be", "video", "watch?v=", "channel"]

 MAX_RETRIES = 3
 RETRY_DELAY = 2  # seconds
+# --- Knowledge Base Configuration ---
+KNOWLEDGE_BASE_PATH = os.path.join(os.path.dirname(__file__), 'data', 'knowledge_base.txt')
+def load_knowledge_base():
+    """Load the knowledge base text from KNOWLEDGE_BASE_PATH.
+
+    Returns:
+        The entire file contents, read as UTF-8 text, or an empty string
+        when the file does not exist.
+
+    Note:
+        Only FileNotFoundError is handled; any other error raised while
+        opening or reading the file propagates to the caller.
+    """
+    try:
+        with open(KNOWLEDGE_BASE_PATH, 'r', encoding='utf-8') as f:
+            return f.read()
+    except FileNotFoundError:
+        # Side effect: make sure the directory that should hold the knowledge
+        # base exists (the file itself is NOT created), then fall back to an
+        # empty knowledge base rather than failing.
+        os.makedirs(os.path.dirname(KNOWLEDGE_BASE_PATH), exist_ok=True)
+        return ""
 # --- Tool Pattern Matching ---
 YOUTUBE_PATTERNS = ["youtube.com", "youtu.be", "video", "watch?v=", "channel"]

data/knowledge_base.txt ADDED Viewed

	@@ -0,0 +1,20 @@

+### AI and Agent Concepts
+- An agent is an autonomous entity that observes and acts upon an environment using sensors and actuators, usually to achieve specific goals.
+- GAIA (General AI Assistant) is a framework for creating and evaluating AI assistants that can perform a wide range of tasks.
+- The agent loop consists of perception, reasoning, and action.
+- RAG (Retrieval-Augmented Generation) combines retrieval of relevant information with generation capabilities of language models.
+- An LLM (Large Language Model) is a neural network trained on vast amounts of text data to understand and generate human language.
+### Agent Capabilities
+- Tool use refers to an agent's ability to employ external tools like search engines, APIs, or specialized algorithms.
+- An effective agent should be able to decompose complex problems into manageable parts.
+- Chain-of-thought reasoning allows agents to break down problem-solving steps to improve accuracy.
+- Agents should apply appropriate reasoning strategies based on the type of question (factual, analytical, etc.).
+- Self-reflection helps agents identify and correct errors in their reasoning.
+### Evaluation Criteria
+- Agent responses should be accurate, relevant, and factually correct.
+- Effective agents provide concise yet comprehensive answers.
+- Agents should acknowledge limitations and uncertainties when appropriate.
+- Good agents can follow multi-step instructions and fulfill all requirements.
+- Reasoning transparency helps users understand how the agent arrived at its conclusions.

tools/tool_manager.py ADDED Viewed

	@@ -0,0 +1,56 @@

+"""
+Tool manager for the GAIA Agent.
+This handles the coordination between different tools and provides them to the agent.
+"""
+from smolagents import Tool
+from typing import Dict, List, Any
+from langchain.docstore.document import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.retrievers import BM25Retriever
+import functools
+from config import load_knowledge_base
+from tools.web_tools import WebSearchTool, WebContentTool
+from tools.youtube_tool import YoutubeVideoTool
+from tools.wikipedia_tool import WikipediaTool
+from tools.knowledge_tool import GaiaRetrieverTool
+class ToolManager:
+    """
+    Builds and holds the tools made available to the GAIA agent.
+
+    On construction the knowledge base text is loaded with
+    load_knowledge_base(), split into Document chunks, and passed to
+    GaiaRetrieverTool; the other tools are constructed without arguments.
+
+    NOTE(review): an earlier summary also promised tool selection logic
+    based on question analysis, but no such method is defined in this
+    class -- get_tools() returns every tool unconditionally.
+    """
+    def __init__(self):
+        # Load and process knowledge base
+        knowledge_text = load_knowledge_base()
+        self.knowledge_docs = self._create_knowledge_documents(knowledge_text)
+        # Initialize tools. This must come after knowledge_docs is assigned,
+        # because _initialize_tools() reads self.knowledge_docs.
+        self.tools = self._initialize_tools()
+    def _create_knowledge_documents(self, text: str) -> List[Document]:
+        """Split knowledge base text into chunks and wrap each in a Document.
+
+        Args:
+            text: Raw knowledge base text.
+
+        Returns:
+            One Document per chunk returned by the splitter, in the order
+            the splitter produced them.
+        """
+        # Separators are listed from coarsest (blank line) to finest (empty
+        # string). chunk_size / chunk_overlap are presumably measured in
+        # characters (the splitter's default) -- TODO confirm.
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=500,
+            chunk_overlap=50,
+            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
+        )
+        # NOTE(review): load_knowledge_base() returns "" when the file is
+        # missing; that presumably yields no chunks here -- verify that
+        # GaiaRetrieverTool tolerates an empty document list.
+        knowledge_chunks = text_splitter.split_text(text)
+        return [Document(page_content=chunk) for chunk in knowledge_chunks]
+    def _initialize_tools(self) -> List[Tool]:
+        """Instantiate every tool; requires self.knowledge_docs to be set.
+
+        Returns:
+            A new list containing, in order: the knowledge retriever, web
+            search, web content, YouTube video and Wikipedia tools.
+        """
+        return [
+            GaiaRetrieverTool(self.knowledge_docs),
+            WebSearchTool(),
+            WebContentTool(),
+            YoutubeVideoTool(),
+            WikipediaTool(),
+        ]
+    def get_tools(self) -> List[Tool]:
+        """Return the list of tools built at construction time.
+
+        The same list object stored on the instance is returned (not a
+        copy), so mutations by the caller are visible to later callers.
+        """
+        return self.tools