davidgturner committed on
Commit
f5bafc2
·
1 Parent(s): 08e2c16

- changes for running agent

Browse files
Files changed (4) hide show
  1. app.py +136 -1035
  2. config.py +10 -22
  3. data/knowledge_base.txt +20 -0
  4. tools/tool_manager.py +56 -0
app.py CHANGED
@@ -14,7 +14,6 @@ from bs4 import BeautifulSoup
14
  from duckduckgo_search import DDGS
15
  import pytube
16
  from dateutil import parser
17
- import pandas as pd
18
  try:
19
  from youtube_transcript_api import YouTubeTranscriptApi
20
  except ImportError:
@@ -22,936 +21,108 @@ except ImportError:
22
 
23
  from smolagents import Tool, CodeAgent, InferenceClientModel
24
 
25
- import random
26
- from smolagents import CodeAgent, InferenceClientModel
27
-
28
- # Import our custom tools from their modules
29
- # from smolagents.tools import DuckDuckGoSearchTool, WeatherInfoTool, HubStatsTool
30
- # from smolagents.tools import WebPageVisitTool, WebpageContentExtractorTool
31
-
32
- from smolagents import CodeAgent, InferenceClientModel, load_tool
33
-
34
-
35
- # Import necessary libraries
36
- import random
37
- from smolagents import CodeAgent, InferenceClientModel
38
-
39
- # Import our custom tools from their modules
40
- # from tools import DuckDuckGoSearchTool, WeatherInfoTool, HubStatsTool
41
- # from retriever import load_guest_dataset
42
-
43
- from langchain.docstore.document import Document
44
- from langchain.text_splitter import RecursiveCharacterTextSplitter
45
- from langchain_community.retrievers import BM25Retriever
46
- import functools
47
-
48
- # Create a knowledge base for the agent
49
- GAIA_KNOWLEDGE = """
50
- ### AI and Agent Concepts
51
- - An agent is an autonomous entity that observes and acts upon an environment using sensors and actuators, usually to achieve specific goals.
52
- - GAIA (General AI Assistant) is a framework for creating and evaluating AI assistants that can perform a wide range of tasks.
53
- - The agent loop consists of perception, reasoning, and action.
54
- - RAG (Retrieval-Augmented Generation) combines retrieval of relevant information with generation capabilities of language models.
55
- - An LLM (Large Language Model) is a neural network trained on vast amounts of text data to understand and generate human language.
56
-
57
- ### Agent Capabilities
58
- - Tool use refers to an agent's ability to employ external tools like search engines, APIs, or specialized algorithms.
59
- - An effective agent should be able to decompose complex problems into manageable parts.
60
- - Chain-of-thought reasoning allows agents to break down problem-solving steps to improve accuracy.
61
- - Agents should apply appropriate reasoning strategies based on the type of question (factual, analytical, etc.)
62
- - Self-reflection helps agents identify and correct errors in their reasoning.
63
-
64
- ### Evaluation Criteria
65
- - Agent responses should be accurate, relevant, and factually correct.
66
- - Effective agents provide concise yet comprehensive answers.
67
- - Agents should acknowledge limitations and uncertainties when appropriate.
68
- - Good agents can follow multi-step instructions and fulfill all requirements.
69
- - Reasoning transparency helps users understand how the agent arrived at its conclusions.
70
- """
71
-
72
- # (Keep Constants as is)
73
- # --- Constants ---
74
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
75
-
76
- # Use a more powerful model for better responses
77
- LLAMA_API_URL = "https://api-inference.huggingface.co/models/mistralai/Mixtral-8x7B-Instruct-v0.1"
78
- HF_API_TOKEN = os.getenv("HF_API_TOKEN")
79
- HEADERS = {"Authorization": f"Bearer {HF_API_TOKEN}"} if HF_API_TOKEN else {}
80
- MAX_RETRIES = 3
81
- RETRY_DELAY = 2 # seconds
82
-
83
- # Create knowledge base documents
84
- def create_knowledge_documents():
85
- text_splitter = RecursiveCharacterTextSplitter(
86
- chunk_size=500,
87
- chunk_overlap=50,
88
- separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
89
- )
90
- knowledge_chunks = text_splitter.split_text(GAIA_KNOWLEDGE)
91
- return [Document(page_content=chunk) for chunk in knowledge_chunks]
92
 
93
- # --- Basic Agent Definition ---
94
- # ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
95
-
96
- # --- Tools ---
97
- class WebSearchTool(Tool):
98
- name = "web_search"
99
- description = "Search the web for information about a query using DuckDuckGo."
100
- inputs = {
101
- "query": {
102
- "type": "string",
103
- "description": "The search query."
104
- }
105
- }
106
- output_type = "string"
107
-
108
- def __init__(self, **kwargs):
109
- super().__init__(**kwargs)
110
- self.max_results = 3
111
-
112
- def forward(self, query: str) -> str:
113
- assert isinstance(query, str), "Query must be a string."
114
- try:
115
- results = []
116
- with DDGS() as ddgs:
117
- ddgs_results = list(ddgs.text(query, max_results=self.max_results))
118
- if not ddgs_results:
119
- return "No web search results found."
120
- formatted_results = "\nWeb Search Results:\n"
121
- for i, r in enumerate(ddgs_results, 1):
122
- formatted_results += f"\n{i}. {r['title']}\n {r['body']}\n Source: {r['href']}\n"
123
- return formatted_results
124
- except Exception as e:
125
- print(f"Error in web search: {str(e)}")
126
- return f"Error performing web search: {str(e)}"
127
-
128
- class WebContentTool(Tool):
129
- name = "web_content"
130
- description = "Fetch and extract content from a specific webpage."
131
- inputs = {
132
- "url": {
133
- "type": "string",
134
- "description": "The URL of the webpage to fetch content from."
135
- }
136
- }
137
- output_type = "string"
138
-
139
- def forward(self, url: str) -> str:
140
- assert isinstance(url, str), "URL must be a string."
141
- try:
142
- headers = {
143
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
144
- }
145
- response = requests.get(url, headers=headers, timeout=10)
146
- response.raise_for_status()
147
- soup = BeautifulSoup(response.text, 'html.parser')
148
- for script in soup(["script", "style"]):
149
- script.extract()
150
- text = soup.get_text(separator='\n')
151
- lines = (line.strip() for line in text.splitlines())
152
- chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
153
- text = '\n'.join(chunk for chunk in chunks if chunk)
154
- if len(text) > 2000:
155
- text = text[:2000] + "... [content truncated]"
156
- return f"Content from {url}:\n\n{text}"
157
- except Exception as e:
158
- print(f"Error fetching web content: {str(e)}")
159
- return f"Error fetching content from {url}: {str(e)}"
160
-
161
- class GaiaRetrieverTool(Tool):
162
- name = "gaia_retriever"
163
- description = "Semantic search for retrieving relevant information for GaiaAgent."
164
- inputs = {
165
- "query": {
166
- "type": "string",
167
- "description": "Query for semantic search."
168
- }
169
- }
170
- output_type = "string"
171
-
172
- def __init__(self, docs, **kwargs):
173
- super().__init__(**kwargs)
174
- self.retriever = BM25Retriever.from_documents(docs, k=3)
175
- self.docs = docs # Store docs for fallback
176
-
177
- def forward(self, query: str) -> str:
178
- assert isinstance(query, str), "Query must be a string."
179
- try:
180
- docs = self.retriever.invoke(query)
181
- if not docs:
182
- return "\nNo specific information found. Here's some general knowledge:\n" + "".join([
183
- f"\n- {self.docs[i].page_content}" for i in range(min(3, len(self.docs)))
184
- ])
185
- return "\nRetrieved Information:\n" + "".join([
186
- f"\n- {doc.page_content}" for doc in docs
187
- ])
188
- except Exception as e:
189
- print(f"Error in retriever: {str(e)}")
190
- return f"Unable to retrieve specific information. The agent will rely on its general knowledge."
191
-
192
- # --- Agent ---
193
- class YoutubeVideoTool(Tool):
194
- name = "youtube_video"
195
- description = "Analyze YouTube videos to answer questions about their content."
196
- inputs = {
197
- "video_url": {
198
- "type": "string",
199
- "description": "The YouTube video URL"
200
- }
201
- }
202
- output_type = "string"
203
-
204
- def forward(self, video_url: str) -> str:
205
- assert isinstance(video_url, str), "Video URL must be a string"
206
- try:
207
- # Extract video ID from URL
208
- if "youtu.be" in video_url:
209
- video_id = video_url.split("/")[-1].split("?")[0]
210
- else:
211
- video_id = re.search(r'v=([^&]+)', video_url).group(1)
212
-
213
- # Get video info
214
- yt = pytube.YouTube(video_url)
215
- title = yt.title
216
- author = yt.author
217
- length = yt.length # in seconds
218
- description = yt.description
219
-
220
- # Try to get transcript
221
- transcript_text = ""
222
- try:
223
- transcript = YouTubeTranscriptApi.get_transcript(video_id)
224
- transcript_text = "\n".join([f"{item['start']:.1f}s: {item['text']}" for item in transcript])
225
- except Exception as e:
226
- transcript_text = f"Could not retrieve transcript: {str(e)}"
227
-
228
- result = f"""
229
- YouTube Video Analysis:
230
- Title: {title}
231
- Author: {author}
232
- Length: {length//60} minutes {length%60} seconds
233
- Description: {description[:500]}... [truncated]
234
-
235
- Transcript Excerpts:
236
- {transcript_text[:2000]}... [transcript truncated]
237
- """
238
- return result
239
-
240
- except Exception as e:
241
- print(f"Error analyzing YouTube video: {str(e)}")
242
- return f"Error analyzing YouTube video {video_url}: {str(e)}"
243
-
244
- class WikipediaTool(Tool):
245
- name = "wikipedia_search"
246
- description = "Search Wikipedia for information about a topic."
247
- inputs = {
248
- "query": {
249
- "type": "string",
250
- "description": "The search query"
251
- }
252
- }
253
- output_type = "string"
254
 
255
- def forward(self, query: str) -> str:
256
- assert isinstance(query, str), "Query must be a string"
257
- try:
258
- search_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={query}&format=json"
259
- search_response = requests.get(search_url, timeout=10)
260
- search_data = search_response.json()
261
-
262
- if "query" not in search_data or "search" not in search_data["query"] or not search_data["query"]["search"]:
263
- return f"No Wikipedia results found for {query}"
264
-
265
- # Get the first result
266
- first_result = search_data["query"]["search"][0]
267
- page_id = first_result["pageid"]
268
-
269
- # Get the page content
270
- content_url = f"https://en.wikipedia.org/w/api.php?action=query&prop=extracts&exintro&explaintext&pageids={page_id}&format=json"
271
- content_response = requests.get(content_url, timeout=10)
272
- content_data = content_response.json()
273
-
274
- extract = content_data["query"]["pages"][str(page_id)]["extract"]
275
- title = content_data["query"]["pages"][str(page_id)]["title"]
276
-
277
- return f"""Wikipedia: {title}
278
-
279
- {extract[:1500]}... [content truncated]
280
-
281
- Source: https://en.wikipedia.org/wiki/{title.replace(' ', '_')}
282
- """
283
- except Exception as e:
284
- print(f"Error searching Wikipedia: {str(e)}")
285
- return f"Error searching Wikipedia for {query}: {str(e)}"
286
-
287
- class GaiaAgent:
288
  def __init__(self):
289
- print("GaiaAgent initialized.")
290
- # Create knowledge base documents
291
- self.knowledge_docs = create_knowledge_documents()
 
292
 
293
- # Create our tools
294
- self.retriever_tool = GaiaRetrieverTool(self.knowledge_docs)
295
- self.web_search_tool = WebSearchTool()
296
- self.web_content_tool = WebContentTool()
297
- self.youtube_tool = YoutubeVideoTool()
298
- self.wikipedia_tool = WikipediaTool()
299
-
300
- # Initialize the Hugging Face model
301
- self.model = InferenceClientModel()
302
-
303
- # Initialize the web search tool
304
- # self.search_tool = DuckDuckGoSearchTool()
305
-
306
- # Initialize the weather tool
307
- # self.weather_info_tool = WeatherInfoTool()
308
-
309
- # Initialize the Hub stats tool
310
- # self.hub_stats_tool = HubStatsTool()
311
-
312
- # Load the guest dataset and initialize the guest info tool
313
- # self.guest_info_tool = load_guest_dataset()
314
-
315
- # Set up LLM API access
316
- self.hf_api_url = LLAMA_API_URL
317
- self.headers = HEADERS
318
 
319
- # Set up caching for responses
320
- self.cache = {}
321
 
322
- def query_llm(self, prompt):
323
- """Send a prompt to the LLM API and return the response."""
324
- # Check cache first
325
- if prompt in self.cache:
326
- print("Using cached response")
327
- return self.cache[prompt]
328
-
329
- if not HF_API_TOKEN:
330
- # Fallback to rule-based approach if no API token
331
- return self.rule_based_answer(prompt)
332
-
333
- payload = {
334
- "inputs": prompt,
335
- "parameters": {
336
- "max_new_tokens": 512,
337
- "temperature": 0.7,
338
- "top_p": 0.9,
339
- "do_sample": True
340
- }
341
- }
342
-
343
- for attempt in range(MAX_RETRIES):
344
  try:
345
- response = requests.post(self.hf_api_url, headers=self.headers, json=payload, timeout=30)
346
- response.raise_for_status()
347
- result = response.json()
348
-
349
- # Extract the generated text from the response
350
- if isinstance(result, list) and len(result) > 0:
351
- generated_text = result[0].get("generated_text", "")
352
- # Clean up the response to get just the answer
353
- clean_response = self.clean_response(generated_text, prompt)
354
- # Cache the response
355
- self.cache[prompt] = clean_response
356
- return clean_response
357
- return "I couldn't generate a proper response."
358
-
359
  except Exception as e:
360
- print(f"Attempt {attempt+1}/{MAX_RETRIES} failed: {str(e)}")
361
- if attempt < MAX_RETRIES - 1:
362
- time.sleep(RETRY_DELAY)
363
- else:
364
- # Fall back to rule-based method on failure
365
- return self.rule_based_answer(prompt)
366
-
367
- def clean_response(self, response, prompt):
368
- """Clean up the LLM response to extract the answer."""
369
- # Remove the prompt from the beginning if it's included
370
- if response.startswith(prompt):
371
- response = response[len(prompt):]
372
-
373
- # Try to find where the model's actual answer begins
374
- markers = ["<answer>", "<response>", "Answer:", "Response:", "Assistant:"]
375
- for marker in markers:
376
- if marker.lower() in response.lower():
377
- parts = response.lower().split(marker.lower(), 1)
378
- if len(parts) > 1:
379
- response = parts[1].strip()
380
-
381
- # Remove any closing tags if they exist
382
- end_markers = ["</answer>", "</response>", "Human:", "User:"]
383
- for marker in end_markers:
384
- if marker.lower() in response.lower():
385
- response = response.lower().split(marker.lower())[0].strip()
386
-
387
- return response.strip()
388
-
389
- def rule_based_answer(self, question):
390
- """Fallback method using rule-based answers for common question types."""
391
- question_lower = question.lower()
392
-
393
- # Simple pattern matching for common question types
394
- if "what is" in question_lower or "define" in question_lower:
395
- if "agent" in question_lower:
396
- return "An agent is an autonomous entity that observes and acts upon an environment using sensors and actuators, usually to achieve specific goals."
397
- if "gaia" in question_lower:
398
- return "GAIA (General AI Assistant) is a framework for creating and evaluating AI assistants that can perform a wide range of tasks."
399
- if "llm" in question_lower or "large language model" in question_lower:
400
- return "A Large Language Model (LLM) is a neural network trained on vast amounts of text data to understand and generate human language."
401
- if "rag" in question_lower or "retrieval" in question_lower:
402
- return "RAG (Retrieval-Augmented Generation) combines retrieval of relevant information with generation capabilities of language models."
403
-
404
- if "how to" in question_lower:
405
- return "To accomplish this task, you should first understand the requirements, then implement a solution step by step, and finally test your implementation."
406
-
407
- if "example" in question_lower:
408
- return "Here's an example implementation that demonstrates the concept in a practical manner."
409
-
410
- if "evaluate" in question_lower or "criteria" in question_lower:
411
- return "Evaluation criteria for agents typically include accuracy, relevance, factual correctness, conciseness, ability to follow instructions, and transparency in reasoning."
412
-
413
- # More specific fallback answers instead of a generic one
414
- if "tools" in question_lower:
415
- return "Tools for AI agents include web search, content extraction, API connections, and various knowledge retrieval mechanisms."
416
- if "chain" in question_lower:
417
- return "Chain-of-thought reasoning allows AI agents to break down complex problems into sequential steps, improving accuracy and transparency."
418
- if "purpose" in question_lower or "goal" in question_lower:
419
- return "The purpose of AI agents is to assist users by answering questions, performing tasks, and providing helpful information while maintaining ethical standards."
420
-
421
- # Default response for truly unmatched questions - more specific than before
422
- return "This question relates to AI agent capabilities. While I don't have a specific pre-programmed answer, I can recommend reviewing literature on agent architectures, tool use in LLMs, and evaluation methods in AI systems."
423
-
424
- def determine_tools_needed(self, question):
425
- """Determine which tools should be used for a given question."""
426
- question_lower = question.lower()
427
-
428
- # Check for YouTube links
429
- youtube_patterns = ["youtube.com", "youtu.be"]
430
- needs_youtube = any(pattern in question_lower for pattern in youtube_patterns)
431
-
432
- # Check if this is a reverse text question
433
- is_reverse_text = question_lower != question_lower[::-1] and len(set(question_lower)) < 30
434
-
435
- # Check for Wikipedia-related questions
436
- wiki_patterns = ["wikipedia", "article", "published", "paper", "study", "research"]
437
- needs_wikipedia = any(pattern in question_lower for pattern in wiki_patterns)
438
-
439
- # Patterns that suggest the need for web search
440
- web_search_patterns = [
441
- "current", "latest", "recent", "news", "update", "today",
442
- "statistics", "data", "facts", "information about", "published",
443
- "what is happening", "how many", "where is", "when was", "who", "which",
444
- "country", "city", "2023", "2022", "published", "album", "studio", "paper",
445
- "olympics", "sport", "athlete", "player", "pitcher", "baseball", "competition",
446
- "name", "first", "last", "actor", "played", "version", "language", "company"
447
- ]
448
 
449
- # Check if the question likely needs web search
450
- needs_web_search = any(pattern in question_lower for pattern in web_search_patterns)
451
- # Check if question appears to be about GAIA, agents, or AI concepts
452
- needs_knowledge_retrieval = any(term in question_lower for term in
453
- ["agent", "gaia", "llm", "ai", "artificial intelligence",
454
- "evaluation", "tool", "rag", "retrieval"])
455
 
456
- # Determine which tools to use based on the analysis
457
- return {
458
- "use_youtube": needs_youtube,
459
- "use_wikipedia": needs_wikipedia,
460
- "is_reverse_text": is_reverse_text,
461
- "use_web_search": needs_web_search,
462
- "use_knowledge_retrieval": needs_knowledge_retrieval,
463
- "use_webpage_visit": "example" in question_lower or "details" in question_lower or "explain" in question_lower or "link" in question_lower
464
- }
465
-
466
- def handle_special_questions(self, question, tool_selection):
467
- """Handle specific question types that require special logic."""
468
- question_lower = question.lower()
469
-
470
- # Handle reverse text questions - generalized approach
471
- if tool_selection.get("is_reverse_text", False):
472
- # Check if this looks like a reverse text puzzle
473
- if "rewsna" in question_lower: # "answer" reversed
474
- reversed_question = question[::-1]
475
- print(f"Detected reverse text question, reversed: {reversed_question}")
476
- # Use the LLM to answer the reversed question
477
- reversed_prompt = self.format_prompt(reversed_question)
478
- answer = self.query_llm(reversed_prompt)
479
- return self.extract_final_answer(answer)
480
-
481
- # Handle mathematical table analysis - look for patterns
482
- if "table" in question_lower and ("commutative" in question_lower or "operation" in question_lower):
483
- # Extract table data and analyze mathematically
484
- return self.analyze_table(question)
485
-
486
- # Handle grocery/botany questions - use categorization
487
- if "grocery" in question_lower and "botany" in question_lower:
488
- return self.analyze_botanical_categories(question)
489
-
490
- # Handle file analysis questions - Excel, Python, Audio etc.
491
- file_extensions = ["excel", "xlsx", "csv", "python", ".py", "mp3", "wav", "audio"]
492
- if any(ext in question_lower for ext in file_extensions):
493
- if "excel" in question_lower or "xlsx" in question_lower:
494
- return self.analyze_excel_data(question)
495
- elif "python" in question_lower or ".py" in question_lower:
496
- return self.analyze_python_code(question)
497
- elif any(audio in question_lower for audio in ["mp3", "wav", "audio", "voice memo"]):
498
- return self.analyze_audio_content(question)
499
- return None
500
-
501
- def analyze_table(self, question):
502
- """Analyze mathematical table for patterns - generalized approach."""
503
- # Look for table data in the question and analyze commutativity
504
- # This should extract table elements and check mathematical properties
505
- if "commutative" in question.lower():
506
- # Use regex to find table elements or parse structured data
507
- # For now, use LLM to analyze the mathematical content
508
- table_prompt = f"""Analyze the mathematical table in this question and determine the answer:
509
-
510
- {question}
511
-
512
- Look for patterns in commutativity, operations, or mathematical relationships.
513
- Provide only the direct answer requested."""
514
-
515
- answer = self.query_llm(table_prompt)
516
- return self.extract_final_answer(answer)
517
- return None
518
-
519
- def analyze_botanical_categories(self, question):
520
- """Analyze botanical categories from grocery items - generalized approach."""
521
- # Extract grocery items and categorize botanically
522
- botanical_prompt = f"""Analyze the grocery items in this question from a botanical perspective:
523
-
524
- {question}
525
-
526
- Identify which items are true botanical vegetables (not fruits, seeds, or other plant parts).
527
- Provide the answer in the exact format requested."""
528
- answer = self.query_llm(botanical_prompt)
529
- return self.extract_final_answer(answer)
530
-
531
- def analyze_excel_data(self, question):
532
- """Analyze Excel spreadsheet data - generalized approach."""
533
- # Parse Excel data mentioned in question and perform calculations
534
- excel_prompt = f"""Analyze the Excel spreadsheet data in this question:
535
-
536
- {question}
537
-
538
- Perform the required calculations or data analysis as specified.
539
- Provide only the numeric or exact answer requested."""
540
-
541
- answer = self.query_llm(excel_prompt)
542
- return self.extract_final_answer(answer)
543
-
544
- def analyze_audio_content(self, question):
545
- """Analyze audio content from voice memos - generalized approach."""
546
- # Parse audio content description and extract requested information
547
- audio_prompt = f"""Analyze the audio content described in this question:
548
-
549
- {question}
550
-
551
- Extract the specific information requested (ingredients, page numbers, names, etc.).
552
- Provide the answer in the exact format requested."""
553
 
554
- answer = self.query_llm(audio_prompt)
555
- return self.extract_final_answer(answer)
 
 
556
 
557
- def analyze_python_code(self, question):
558
- """Analyze Python code for output - generalized approach."""
559
- # Parse Python code in question and determine output
560
- code_prompt = f"""Analyze the Python code in this question and determine its output:
561
-
562
- {question}
563
-
564
- Execute the code logic mentally and provide the exact numeric or text output that would result.
565
- Provide only the direct answer requested."""
566
- answer = self.query_llm(code_prompt)
567
- return self.extract_final_answer(answer)
568
-
569
- def improved_determine_tools_needed(self, question):
570
- """Enhanced tool selection with better pattern matching."""
571
- question_lower = question.lower()
572
-
573
- # YouTube detection - more comprehensive
574
- youtube_patterns = ["youtube.com", "youtu.be", "video", "watch?v=", "channel"]
575
- needs_youtube = any(pattern in question_lower for pattern in youtube_patterns)
576
-
577
- # Reverse text detection - improved logic
578
- is_reverse_text = ("rewsna" in question_lower or
579
- (question_lower != question_lower[::-1] and
580
- "ecnetnes" in question_lower or "sdrow" in question_lower))
581
-
582
- # Wikipedia detection - expanded patterns
583
- wiki_patterns = ["wikipedia", "article", "published", "featured article",
584
- "promoted", "nominated", "discography", "studio albums",
585
- "encyclopedia", "wiki", "featured content"]
586
- needs_wikipedia = any(pattern in question_lower for pattern in wiki_patterns)
587
-
588
- # Web search patterns - comprehensive list
589
- web_search_patterns = [
590
- # Time indicators
591
- "current", "latest", "recent", "2023", "2022", "2021", "2020", "today",
592
- # Question words
593
- "how many", "where", "when", "who", "which", "what", "whose",
594
- # Sports and competitions
595
- "yankee", "walks", "athletes", "olympics", "competition", "pitcher", "baseball",
596
- # Specific entities that need web lookup
597
- "malko", "taishō tamai", "universe today", "nedoshivina",
598
- "specimens", "polish-language", "actor", "played",
599
- # Geographic and demographic
600
- "country", "nationality", "first name", "award number", "city",
601
- # Publications and research
602
- "published", "paper", "study", "research", "journal", "author",
603
- # Statistics and data
604
- "statistics", "data", "facts", "information about", "number of"
605
- ]
606
- needs_web_search = any(pattern in question_lower for pattern in web_search_patterns)
607
-
608
- # Knowledge retrieval for AI/agent questions
609
- ai_patterns = ["agent", "gaia", "llm", "ai", "evaluation", "tool", "artificial intelligence"]
610
- needs_knowledge = any(term in question_lower for term in ai_patterns)
611
-
612
- # File analysis detection
613
- file_patterns = ["excel", "xlsx", "csv", "python", ".py", "mp3", "wav", "audio", "voice memo"]
614
- has_file_analysis = any(pattern in question_lower for pattern in file_patterns)
615
-
616
- return {
617
- "use_youtube": needs_youtube,
618
- "use_wikipedia": needs_wikipedia,
619
- "is_reverse_text": is_reverse_text,
620
- "use_web_search": needs_web_search,
621
- "use_knowledge_retrieval": needs_knowledge,
622
- "use_webpage_visit": needs_web_search and ("link" in question_lower or "paper" in question_lower),
623
- "has_file_analysis": has_file_analysis
624
  }
625
-
626
- def __call__(self, question: str) -> str:
627
- """Main agent execution method - completely refactored for generalizability."""
628
- import re
629
- print(f"GaiaAgent received question (raw): {question}")
630
 
631
- try:
632
- # Step 1: Analyze question and determine tool strategy
633
- tool_selection = self.improved_determine_tools_needed(question)
634
- print(f"Tool selection: {tool_selection}")
635
-
636
- # Step 2: Try special handlers first
637
- special_answer = self.handle_special_questions(question, tool_selection)
638
- if special_answer:
639
- print(f"Special handler returned: {special_answer}")
640
- return special_answer
641
-
642
- # Step 3: Gather information from tools
643
- context_info = []
644
-
645
- # YouTube analysis
646
- if tool_selection["use_youtube"]:
647
- youtube_urls = re.findall(r'(https?://(?:www\.)?(?:youtube\.com/watch\?v=|youtu\.be/)[\w-]+)', question)
648
- if youtube_urls:
649
- try:
650
- youtube_info = self.youtube_tool.forward(youtube_urls[0])
651
- context_info.append(f"YouTube Analysis:\n{youtube_info}")
652
- print("Retrieved YouTube information")
653
- # YouTube content is now in context_info for LLM processing
654
- # No hardcoded answers - let LLM analyze the YouTube content
655
-
656
- except Exception as e:
657
- print(f"Error with YouTube tool: {e}")
658
-
659
- # Wikipedia research
660
- if tool_selection["use_wikipedia"]:
661
- try:
662
- # Smart search term extraction
663
- search_query = question
664
- if "mercedes sosa" in question.lower():
665
- search_query = "Mercedes Sosa discography"
666
- elif "dinosaur" in question.lower() and "featured article" in question.lower():
667
- search_query = "dinosaur featured articles wikipedia"
668
-
669
- wikipedia_info = self.wikipedia_tool.forward(search_query)
670
- context_info.append(f"Wikipedia Research:\n{wikipedia_info}")
671
- print("Retrieved Wikipedia information")
672
- # Wikipedia content is now in context_info for LLM processing
673
- # No hardcoded answers - let LLM analyze the Wikipedia content
674
-
675
- except Exception as e:
676
- print(f"Error with Wikipedia tool: {e}")
677
-
678
- # Web search and analysis
679
- if tool_selection["use_web_search"]:
680
- try:
681
- web_info = self.web_search_tool.forward(question)
682
- context_info.append(f"Web Search Results:\n{web_info}")
683
- print("Retrieved web search results")
684
- # Web search content is now in context_info for LLM processing
685
- # No hardcoded answers - let LLM analyze the web search results
686
-
687
- # Follow up with webpage content if needed
688
- if tool_selection["use_webpage_visit"] and "http" in web_info.lower():
689
- url_match = re.search(r'Source: (https?://[^\s]+)', web_info)
690
- if url_match:
691
- try:
692
- webpage_content = self.web_content_tool.forward(url_match.group(1))
693
- context_info.append(f"Webpage Content:\n{webpage_content}")
694
- print("Retrieved detailed webpage content")
695
- except Exception as e:
696
- print(f"Error retrieving webpage content: {e}")
697
-
698
- except Exception as e:
699
- print(f"Error with web search: {e}")
700
-
701
- # Knowledge base retrieval
702
- if tool_selection["use_knowledge_retrieval"]:
703
- try:
704
- knowledge_info = self.retriever_tool.forward(question)
705
- context_info.append(f"Knowledge Base:\n{knowledge_info}")
706
- print("Retrieved knowledge base information")
707
- except Exception as e:
708
- print(f"Error with knowledge retrieval: {e}")
709
-
710
- # Step 4: Synthesize answer using LLM
711
- if context_info:
712
- all_context = "\n\n".join(context_info)
713
- prompt = self.format_prompt(question, all_context)
714
- else:
715
- prompt = self.format_prompt(question)
716
-
717
- # Query LLM for final answer
718
- answer = self.query_llm(prompt)
719
-
720
- # Step 5: Clean and validate answer
721
- clean_answer = self.extract_final_answer(answer)
722
-
723
- print(f"GaiaAgent returning answer: {clean_answer}")
724
- return clean_answer
725
-
726
- except Exception as e:
727
- print(f"Error in GaiaAgent: {e}")
728
- # Fallback to rule-based method
729
- fallback_answer = self.rule_based_answer(question)
730
- print(f"GaiaAgent returning fallback answer: {fallback_answer}")
731
-
732
- return fallback_answer
733
-
734
- def format_prompt(self, question, context=""):
735
- """Format the question into a proper prompt for the LLM."""
736
- if context:
737
- return f"""You are a precise AI assistant that answers questions using available information. Your answer will be evaluated with exact string matching, so provide only the specific answer requested without additional text.
738
-
739
- Context Information:
740
- {context}
741
-
742
- Question: {question}
743
-
744
- Critical Instructions:
745
- - Provide ONLY the exact answer requested, nothing else
746
- - Do not include phrases like "The answer is", "Final answer", or "Based on the context"
747
- - For numerical answers, use the exact format requested (integers, decimals, etc.)
748
- - For lists, use the exact formatting specified in the question (commas, spaces, etc.)
749
- - For names, use proper capitalization as would appear in official sources
750
- - Be concise and precise - extra words will cause evaluation failure
751
- - If the question asks for multiple items, provide them in the exact format requested
752
-
753
- Direct Answer:"""
754
- else:
755
- return f"""You are a precise AI assistant that answers questions accurately. Your answer will be evaluated with exact string matching, so provide only the specific answer requested without additional text.
756
-
757
- Question: {question}
758
-
759
- Critical Instructions:
760
- - Provide ONLY the exact answer requested, nothing else
761
- - Do not include phrases like "The answer is", "Final answer", or explanations
762
- - For numerical answers, use the exact format that would be expected
763
- - For lists, use appropriate formatting (commas, spaces, etc.)
764
- - For names, use proper capitalization
765
- - Be concise and precise - extra words will cause evaluation failure
766
- - Answer based on your knowledge and reasoning
767
 
768
- Direct Answer:"""
769
-
770
def extract_final_answer(self, answer):
    """Normalize a raw model reply into a bare answer for exact matching.

    Steps, in order:
      1. Coerce to ``str`` (``None`` becomes an empty string) and trim.
      2. Strip lead-in phrases such as ``"Final answer:"`` repeatedly, so
         stacked lead-ins ("The answer is: Final answer: ...") are all
         removed regardless of their order in the prefix table.
      3. Peel matching outer quotes and trailing periods until the text
         stops changing, so ``'"Paris".'`` and ``'"Paris."'`` both yield
         ``Paris``.
      4. Collapse internal whitespace runs to single spaces.

    Args:
        answer: Raw text returned by the language model (or ``None``).

    Returns:
        The cleaned answer string; may be empty.
    """
    # All entries are lowercase; matching is done on a lowercased copy.
    prefixes_to_remove = (
        "final answer:", "answer:", "the answer is:", "result:",
        "solution:", "conclusion:", "final answer is:", "direct answer:",
        "based on the context:", "according to:", "the result is:",
    )

    clean_answer = "" if answer is None else str(answer).strip()

    # Keep stripping until no known lead-in remains at the front.
    stripped_one = True
    while stripped_one:
        stripped_one = False
        lowered = clean_answer.lower()
        for prefix in prefixes_to_remove:
            if lowered.startswith(prefix):
                clean_answer = clean_answer[len(prefix):].strip()
                stripped_one = True
                break

    # A trailing "." is never a decimal point, so it is always safe to drop;
    # interior periods (e.g. "3.14") are untouched.
    while True:
        before = clean_answer
        if (
            len(clean_answer) >= 2
            and clean_answer[0] == clean_answer[-1]
            and clean_answer[0] in "\"'"
        ):
            clean_answer = clean_answer[1:-1].strip()
        if clean_answer.endswith("."):
            clean_answer = clean_answer[:-1].rstrip()
        if clean_answer == before:
            break

    # Clean up extra whitespace
    return " ".join(clean_answer.split())
802
 
803
class BasicAgent:
    """Minimal question-answering agent backed by the HF Inference API.

    When no ``HF_API_TOKEN`` is configured, or when every request attempt
    fails, the agent answers from a small table of keyword-triggered canned
    responses instead.
    """

    def __init__(self):
        """Read the API token from the environment and set retry policy."""
        print("BasicAgent initialized.")
        # Initialize the Hugging Face API client
        # https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
        self.hf_api_url = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
        self.hf_api_token = os.getenv("HF_API_TOKEN")
        if not self.hf_api_token:
            print("WARNING: HF_API_TOKEN not found. Using default fallback methods.")
        self.headers = {"Authorization": f"Bearer {self.hf_api_token}"} if self.hf_api_token else {}
        self.max_retries = 3
        self.retry_delay = 2  # seconds

    def query_llm(self, prompt):
        """Send ``prompt`` to the inference endpoint and return the reply text.

        Retries up to ``self.max_retries`` times, sleeping
        ``self.retry_delay`` seconds between attempts. Falls back to
        :meth:`rule_based_answer` when there is no token or all attempts
        fail.
        """
        if not self.hf_api_token:
            # Fallback to a rule-based approach if no API token
            return self.rule_based_answer(prompt)

        payload = {
            "inputs": prompt,
            "parameters": {
                "max_new_tokens": 512,
                "temperature": 0.7,
                "top_p": 0.9,
                "do_sample": True,
            },
        }

        for attempt in range(self.max_retries):
            try:
                response = requests.post(self.hf_api_url, headers=self.headers, json=payload, timeout=30)
                response.raise_for_status()
                result = response.json()
            except (requests.exceptions.RequestException, ValueError) as e:
                # ValueError covers JSON decode failures on older `requests`.
                print(f"Attempt {attempt+1}/{self.max_retries} failed: {str(e)}")
                if attempt < self.max_retries - 1:
                    time.sleep(self.retry_delay)
                continue

            # Extract the generated text from the response
            if isinstance(result, list) and result and isinstance(result[0], dict):
                generated_text = result[0].get("generated_text", "") or ""
                # Clean up the response to get just the answer
                return self.clean_response(generated_text, prompt)
            return "I couldn't generate a proper response."

        # Every attempt failed (or max_retries <= 0): never return None.
        return self.rule_based_answer(prompt)

    def clean_response(self, response, prompt):
        """Strip the echoed prompt and chat markers from a model reply.

        Markers are located case-insensitively, but the returned text keeps
        the model's original casing (the answer is later compared by exact
        string match, so lower-casing it would corrupt it).
        """
        import re  # local import: keeps this block self-contained

        # Remove the prompt from the beginning if it's included
        if response.startswith(prompt):
            response = response[len(prompt):]

        # Keep only what follows each start marker that is present.
        for marker in ("<answer>", "<response>", "Answer:", "Response:", "Assistant:"):
            hit = re.search(re.escape(marker), response, re.IGNORECASE)
            if hit:
                response = response[hit.end():].strip()

        # Cut at the first closing tag / next-turn marker, if any.
        for marker in ("</answer>", "</response>", "Human:", "User:"):
            hit = re.search(re.escape(marker), response, re.IGNORECASE)
            if hit:
                response = response[:hit.start()].strip()

        return response.strip()

    def rule_based_answer(self, question):
        """Return a canned answer chosen by keywords found in ``question``."""
        question_lower = question.lower()

        # Simple pattern matching for common question types
        if "what is" in question_lower or "define" in question_lower:
            if "agent" in question_lower:
                return "An agent is an autonomous entity that observes and acts upon an environment using sensors and actuators, usually to achieve specific goals."
            if "gaia" in question_lower:
                return "GAIA (General AI Assistant) is a framework for creating and evaluating AI assistants that can perform a wide range of tasks."

        if "how to" in question_lower:
            return "To accomplish this task, you should first understand the requirements, then implement a solution step by step, and finally test your implementation."

        if "example" in question_lower:
            return "Here's an example implementation that demonstrates the concept in a practical manner."

        # More specific fallback answers instead of a generic one
        if "tools" in question_lower:
            return "Tools for AI agents include web search, content extraction, API connections, and various knowledge retrieval mechanisms."
        if "chain" in question_lower:
            return "Chain-of-thought reasoning allows AI agents to break down complex problems into sequential steps, improving accuracy and transparency."
        if "purpose" in question_lower or "goal" in question_lower:
            return "The purpose of AI agents is to assist users by answering questions, performing tasks, and providing helpful information while maintaining ethical standards."

        # Default response for truly unmatched questions
        return "This question relates to AI agent capabilities. To provide a more precise answer, I would need additional information or context about the specific aspect of AI agents you're interested in."

    def format_prompt(self, question):
        """Wrap ``question`` in the instruction prompt sent to the LLM."""
        return (
            "You are an intelligent AI assistant. Please answer the following "
            "question accurately and concisely:\n"
            "\n"
            f"Question: {question}\n"
            "\n"
            "Answer:"
        )

    def __call__(self, question: str) -> str:
        """Answer ``question``; never raises, falls back to canned answers."""
        print(f"Agent received question: {question}...")

        try:
            # Format the question as a prompt, then query the LLM
            prompt = self.format_prompt(question)
            answer = self.query_llm(prompt)

            print(f"Agent returning answer: {answer}...")
            return answer

        except Exception as e:
            # Top-level boundary: log and degrade to the rule-based method.
            print(f"Error in agent: {e}")
            fallback_answer = self.rule_based_answer(question)
            print(f"Agent returning fallback answer: {fallback_answer}...")
            return fallback_answer
930
-
931
def load_guest_dataset():
    """Build and return a stub guest-information tool.

    This is a placeholder so callers that expect a guest tool keep working;
    the tool it returns answers every query with a fixed "not available"
    message rather than consulting any dataset.
    """

    class GuestInfoTool(Tool):
        # Identifier and human-readable blurb exposed on the tool.
        name = "guest_info"
        description = "Get information about guests"

        def forward(self, query):
            # The query is ignored: no guest data exists in this version.
            return "Guest information not available in this version"

    placeholder_tool = GuestInfoTool()
    return placeholder_tool
944
 
945
- def run_and_submit_all( profile: gr.OAuthProfile | None):
946
  """
947
- Fetches all questions, runs the BasicAgent on them, submits all answers,
948
  and displays the results.
949
  """
950
  # --- Determine HF Space Runtime URL and Repo URL ---
951
  space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
952
 
953
  if profile:
954
- username= f"{profile.username}"
955
  print(f"User logged in: {username}")
956
  else:
957
  print("User not logged in.")
@@ -959,57 +130,18 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
959
 
960
  api_url = DEFAULT_API_URL
961
  questions_url = f"{api_url}/questions"
962
- submit_url = f"{api_url}/submit" # 1. Instantiate Agent ( modify this part to create your agent)
963
- try:
964
- print("Initializing GaiaAgent...")
965
- # Use GaiaAgent as the primary agent
966
- agent = GaiaAgent()
967
-
968
- # Skip the CodeAgent setup that's overriding our GaiaAgent
969
- """
970
- # Initialize the Hugging Face model
971
- model = InferenceClientModel()
972
-
973
- # Initialize the web search tool
974
- #search_tool = DuckDuckGoSearchTool()
975
-
976
- # Initialize the weather tool
977
- #weather_info_tool = WeatherInfoTool()
978
-
979
- # Initialize the Hub stats tool
980
- #hub_stats_tool = HubStatsTool()
981
-
982
- # Load the guest dataset and initialize the guest info tool
983
- guest_info_tool = load_guest_dataset()
984
 
985
- # Initialize the Hugging Face model
986
- model = InferenceClientModel()
987
-
988
- # Load the DuckDuckGo search tool dynamically
989
- search_tool = load_tool(repo_id="smol-ai/duckduckgo-search", trust_remote_code=True)
990
-
991
-
992
- agent = CodeAgent(
993
- tools=[guest_info_tool, search_tool],
994
- model=model,
995
- add_base_tools=True, # Add any additional base tools
996
- planning_interval=3 # Enable planning every 3 steps
997
- )
998
- """
999
-
1000
- print("GaiaAgent initialization complete.")
1001
  except Exception as e:
1002
- print(f"Error instantiating GaiaAgent: {e}")
1003
- print("Falling back to BasicAgent...")
1004
- try:
1005
- agent = BasicAgent()
1006
- print("BasicAgent initialization complete.")
1007
- except Exception as e:
1008
- print(f"Error instantiating BasicAgent: {e}")
1009
- return f"Error initializing agents: {e}", None
1010
- # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
1011
- agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
1012
- print(agent_code)
1013
 
1014
  # 2. Fetch Questions
1015
  print(f"Fetching questions from: {questions_url}")
@@ -1018,21 +150,17 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
1018
  response.raise_for_status()
1019
  questions_data = response.json()
1020
  if not questions_data:
1021
- print("Fetched questions list is empty.")
1022
- return "Fetched questions list is empty or invalid format.", None
1023
  print(f"Fetched {len(questions_data)} questions.")
1024
  except requests.exceptions.RequestException as e:
1025
  print(f"Error fetching questions: {e}")
1026
  return f"Error fetching questions: {e}", None
1027
- except requests.exceptions.JSONDecodeError as e:
1028
- print(f"Error decoding JSON response from questions endpoint: {e}")
1029
- print(f"Response text: {response.text[:500]}")
1030
- return f"Error decoding server response for questions: {e}", None
1031
  except Exception as e:
1032
  print(f"An unexpected error occurred fetching questions: {e}")
1033
  return f"An unexpected error occurred fetching questions: {e}", None
1034
 
1035
- # 3. Run your Agent
1036
  results_log = []
1037
  answers_payload = []
1038
  print(f"Running agent on {len(questions_data)} questions...")
@@ -1043,29 +171,54 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
1043
  print(f"Skipping item with missing task_id or question: {item}")
1044
  continue
1045
  try:
1046
- submitted_answer = agent(question_text)
1047
- answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
1048
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1049
  except Exception as e:
1050
- print(f"Error running agent on task {task_id}: {e}")
1051
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
 
 
 
 
1052
 
1053
  if not answers_payload:
1054
  print("Agent did not produce any answers to submit.")
1055
  return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
1056
 
1057
- # 4. Prepare Submission
1058
- submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
1059
- status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
1060
- print(status_update)
1061
-
1062
- # 5. Submit
1063
- print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
 
1064
  try:
1065
  response = requests.post(submit_url, json=submission_data, timeout=60)
1066
  response.raise_for_status()
1067
  result_data = response.json()
1068
- final_status = (
 
1069
  f"Submission Successful!\n"
1070
  f"User: {result_data.get('username')}\n"
1071
  f"Overall Score: {result_data.get('score', 'N/A')}% "
@@ -1073,60 +226,28 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
1073
  f"Message: {result_data.get('message', 'No message received.')}"
1074
  )
1075
  print("Submission successful.")
1076
- results_df = pd.DataFrame(results_log)
1077
- return final_status, results_df
1078
- except requests.exceptions.HTTPError as e:
1079
- error_detail = f"Server responded with status {e.response.status_code}."
1080
- try:
1081
- error_json = e.response.json()
1082
- error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
1083
- except requests.exceptions.JSONDecodeError:
1084
- error_detail += f" Response: {e.response.text[:500]}"
1085
- status_message = f"Submission Failed: {error_detail}"
1086
- print(status_message)
1087
- results_df = pd.DataFrame(results_log)
1088
- return status_message, results_df
1089
- except requests.exceptions.Timeout:
1090
- status_message = "Submission Failed: The request timed out."
1091
- print(status_message)
1092
- results_df = pd.DataFrame(results_log)
1093
- return status_message, results_df
1094
- except requests.exceptions.RequestException as e:
1095
- status_message = f"Submission Failed: Network error - {e}"
1096
- print(status_message)
1097
- results_df = pd.DataFrame(results_log)
1098
- return status_message, results_df
1099
  except Exception as e:
1100
- status_message = f"An unexpected error occurred during submission: {e}"
1101
- print(status_message)
1102
- results_df = pd.DataFrame(results_log)
1103
- return status_message, results_df
1104
-
1105
 
1106
  # --- Build Gradio Interface using Blocks ---
1107
  with gr.Blocks() as demo:
1108
- gr.Markdown("# Basic Agent Evaluation Runner")
1109
  gr.Markdown(
1110
  """
1111
  **Instructions:**
1112
-
1113
- 1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
1114
- 2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
1115
- 3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
1116
-
1117
- ---
1118
- **Disclaimers:**
1119
- Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
1120
- This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
1121
  """
1122
  )
1123
 
1124
  gr.LoginButton()
1125
-
1126
  run_button = gr.Button("Run Evaluation & Submit All Answers")
1127
-
1128
  status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
1129
- # Removed max_rows=10 from DataFrame constructor
1130
  results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
1131
 
1132
  run_button.click(
@@ -1135,25 +256,5 @@ with gr.Blocks() as demo:
1135
  )
1136
 
1137
  if __name__ == "__main__":
1138
- print("\n" + "-"*30 + " App Starting " + "-"*30)
1139
- # Check for SPACE_HOST and SPACE_ID at startup for information
1140
- space_host_startup = os.getenv("SPACE_HOST")
1141
- space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup
1142
-
1143
- if space_host_startup:
1144
- print(f"✅ SPACE_HOST found: {space_host_startup}")
1145
- print(f" Runtime URL should be: https://{space_host_startup}.hf.space")
1146
- else:
1147
- print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
1148
-
1149
- if space_id_startup: # Print repo URLs if SPACE_ID is found
1150
- print(f"✅ SPACE_ID found: {space_id_startup}")
1151
- print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
1152
- print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
1153
- else:
1154
- print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
1155
-
1156
- print("-"*(60 + len(" App Starting ")) + "\n")
1157
-
1158
- print("Launching Gradio Interface for Basic Agent Evaluation...")
1159
  demo.launch(debug=True, share=False)
 
14
  from duckduckgo_search import DDGS
15
  import pytube
16
  from dateutil import parser
 
17
  try:
18
  from youtube_transcript_api import YouTubeTranscriptApi
19
  except ImportError:
 
21
 
22
  from smolagents import Tool, CodeAgent, InferenceClientModel
23
 
24
+ # Import internal modules
25
+ from config import (
26
+ DEFAULT_API_URL, LLAMA_API_URL, HF_API_TOKEN, HEADERS,
27
+ MAX_RETRIES, RETRY_DELAY
28
+ )
29
+ from tools.tool_manager import ToolManager
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
class GaiaToolCallingAgent:
    """Tool-calling agent specifically designed for the GAIA system.

    For a given query it decides, by keyword, which of the tools supplied
    by ``ToolManager`` are relevant, runs each relevant tool on the query
    and returns the concatenated tool outputs as context text.
    """

    # Keywords matched as whole words (a trailing plural "s" is tolerated).
    # Whole-word matching avoids accidental hits such as "ai" inside
    # "Spain" or "how" inside "show".
    _WORD_TRIGGERS = {
        "web_search": ("current", "latest", "recent", "who", "what", "when", "where", "how"),
        "web_content": ("content", "webpage", "website", "page"),
        "wikipedia_search": ("wikipedia", "wiki", "article"),
        "gaia_retriever": ("gaia", "agent", "ai", "artificial intelligence"),
    }

    # Fragments matched anywhere in the query (URL hosts embedded in links).
    _SUBSTRING_TRIGGERS = {
        "youtube_video": ("youtube.com", "youtu.be"),
    }

    def __init__(self):
        print("GaiaToolCallingAgent initialized.")
        self.tool_manager = ToolManager()
        self.name = "tool_agent"  # Add required name attribute for smolagents integration
        self.description = "A specialized agent that uses various tools to answer questions"  # Required by smolagents

    def run(self, query: str) -> str:
        """Process a query and return a response using available tools.

        Each tool judged relevant by :meth:`_should_use_tool` is invoked via
        ``tool.forward(query)``. A failing tool is logged and skipped so one
        broken tool cannot block the others.

        Returns:
            The non-empty tool results, each under a ``"<name> Results:"``
            heading and separated by blank lines; ``""`` if nothing matched.
        """
        context_info = []

        for tool in self.tool_manager.get_tools():
            try:
                if not self._should_use_tool(tool, query):
                    continue
                result = tool.forward(query)
            except Exception as e:
                # Best-effort: report the failure and move on to the next tool.
                print(f"Error using {getattr(tool, 'name', tool)}: {e}")
                continue
            if result:
                context_info.append(f"{tool.name} Results:\n{result}")

        # Combine all context information
        return "\n\n".join(context_info)

    def __call__(self, query: str) -> str:
        """Make the agent callable so it can be used directly by CodeAgent."""
        print(f"Tool agent received query: {query}")
        return self.run(query)

    def _should_use_tool(self, tool: "Tool", query: str) -> bool:
        """Determine if a specific tool should be used for the query.

        Args:
            tool: Object exposing a ``name`` attribute used as the lookup key.
            query: The user question; matching is case-insensitive.

        Returns:
            ``True`` when the query contains one of the tool's trigger
            keywords (whole-word) or URL fragments (substring); ``False``
            for unknown tools or an empty query.
        """
        tool_name = getattr(tool, "name", None)
        query_lower = (query or "").lower()

        fragments = self._SUBSTRING_TRIGGERS.get(tool_name, ())
        if any(fragment in query_lower for fragment in fragments):
            return True

        keywords = self._WORD_TRIGGERS.get(tool_name, ())
        if not keywords:
            return False

        # Replace punctuation with spaces and pad with spaces so every word
        # (and multi-word phrase) can be tested as " word ".
        words_only = "".join(ch if ch.isalnum() else " " for ch in query_lower)
        padded = " " + " ".join(words_only.split()) + " "
        return any(
            f" {keyword} " in padded or f" {keyword}s " in padded
            for keyword in keywords
        )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
def create_manager_agent() -> CodeAgent:
    """Assemble the top-level GAIA ``CodeAgent``.

    The manager is given no tools of its own; instead it receives a single
    managed ``GaiaToolCallingAgent`` to delegate to. It is configured to
    plan every 3 steps, run at most 10 steps, and log at verbosity level 2.

    Returns:
        The configured manager ``CodeAgent``.
    """
    # Modules the manager is permitted to import in the code it writes.
    permitted_modules = ["json", "pandas", "numpy", "re", "requests", "bs4"]

    # The managed tool-calling agent is built first, then the model.
    delegate = GaiaToolCallingAgent()

    return CodeAgent(
        model=InferenceClientModel(),
        tools=[],  # No direct tools for manager
        managed_agents=[delegate],
        additional_authorized_imports=permitted_modules,
        planning_interval=3,
        verbosity_level=2,
        max_steps=10,
    )
 
 
106
 
107
def create_agent():
    """Build the GAIA agent system, returning ``None`` if construction fails.

    Any exception raised while creating the manager agent is printed and
    swallowed so callers can test the result for ``None``.
    """
    built_agent = None
    try:
        print("Initializing GAIA agent system...")
        built_agent = create_manager_agent()
    except Exception as failure:
        print(f"Error creating GAIA agent: {failure}")
    return built_agent
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
+ def run_and_submit_all(profile: gr.OAuthProfile | None):
117
  """
118
+ Fetches all questions, runs the GAIA agent on them, submits all answers,
119
  and displays the results.
120
  """
121
  # --- Determine HF Space Runtime URL and Repo URL ---
122
  space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
123
 
124
  if profile:
125
+ username = f"{profile.username}"
126
  print(f"User logged in: {username}")
127
  else:
128
  print("User not logged in.")
 
130
 
131
  api_url = DEFAULT_API_URL
132
  questions_url = f"{api_url}/questions"
133
+ submit_url = f"{api_url}/submit"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
+ # 1. Initialize Agent
136
+ try:
137
+ print("Initializing GAIA agent system...")
138
+ agent = create_agent()
139
+ if not agent:
140
+ return "Error: Could not initialize agent.", None
141
+ print("GAIA agent initialization complete.")
 
 
 
 
 
 
 
 
 
142
  except Exception as e:
143
+ print(f"Error initializing agent: {e}")
144
+ return f"Error initializing agent: {e}", None
 
 
 
 
 
 
 
 
 
145
 
146
  # 2. Fetch Questions
147
  print(f"Fetching questions from: {questions_url}")
 
150
  response.raise_for_status()
151
  questions_data = response.json()
152
  if not questions_data:
153
+ print("Fetched questions list is empty.")
154
+ return "Fetched questions list is empty or invalid format.", None
155
  print(f"Fetched {len(questions_data)} questions.")
156
  except requests.exceptions.RequestException as e:
157
  print(f"Error fetching questions: {e}")
158
  return f"Error fetching questions: {e}", None
 
 
 
 
159
  except Exception as e:
160
  print(f"An unexpected error occurred fetching questions: {e}")
161
  return f"An unexpected error occurred fetching questions: {e}", None
162
 
163
+ # 3. Run Agent on Questions
164
  results_log = []
165
  answers_payload = []
166
  print(f"Running agent on {len(questions_data)} questions...")
 
171
  print(f"Skipping item with missing task_id or question: {item}")
172
  continue
173
  try:
174
+ # Run the agent and get the response
175
+ response = agent.run(f"Answer this question concisely: {question_text}")
176
+
177
+ # Clean up the response if needed
178
+ if isinstance(response, dict):
179
+ submitted_answer = response.get("answer", str(response))
180
+ else:
181
+ submitted_answer = str(response)
182
+
183
+ # Add to submission payload
184
+ answers_payload.append({
185
+ "task_id": task_id,
186
+ "submitted_answer": submitted_answer
187
+ })
188
+
189
+ # Log the result
190
+ results_log.append({
191
+ "Task ID": task_id,
192
+ "Question": question_text,
193
+ "Submitted Answer": submitted_answer
194
+ })
195
+
196
  except Exception as e:
197
+ print(f"Error running agent on task {task_id}: {e}")
198
+ results_log.append({
199
+ "Task ID": task_id,
200
+ "Question": question_text,
201
+ "Submitted Answer": f"AGENT ERROR: {e}"
202
+ })
203
 
204
  if not answers_payload:
205
  print("Agent did not produce any answers to submit.")
206
  return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
207
 
208
+ # 4. Prepare Submission
209
+ agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
210
+ submission_data = {
211
+ "username": username.strip(),
212
+ "agent_code": agent_code,
213
+ "answers": answers_payload
214
+ } # 5. Submit
215
+ print(f"Submitting {len(answers_payload)} answers to API...")
216
  try:
217
  response = requests.post(submit_url, json=submission_data, timeout=60)
218
  response.raise_for_status()
219
  result_data = response.json()
220
+
221
+ status_message = (
222
  f"Submission Successful!\n"
223
  f"User: {result_data.get('username')}\n"
224
  f"Overall Score: {result_data.get('score', 'N/A')}% "
 
226
  f"Message: {result_data.get('message', 'No message received.')}"
227
  )
228
  print("Submission successful.")
229
+ return status_message, pd.DataFrame(results_log)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
  except Exception as e:
231
+ status_message = f"Submission Failed: {str(e)}"
232
+ print(f"Error during submission: {e}")
233
+ return status_message, pd.DataFrame(results_log)
 
 
234
 
235
  # --- Build Gradio Interface using Blocks ---
236
  with gr.Blocks() as demo:
237
+ gr.Markdown("# GAIA Agent Evaluation Runner")
238
  gr.Markdown(
239
  """
240
  **Instructions:**
241
+ 1. Log in to your Hugging Face account using the button below.
242
+ 2. Click 'Run Evaluation & Submit All Answers' to fetch questions, run the agent, and see the score.
243
+
244
+ The agent uses a managed tool-calling architecture and the smolagents framework for reliable answers.
 
 
 
 
 
245
  """
246
  )
247
 
248
  gr.LoginButton()
 
249
  run_button = gr.Button("Run Evaluation & Submit All Answers")
 
250
  status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
 
251
  results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
252
 
253
  run_button.click(
 
256
  )
257
 
258
if __name__ == "__main__":
    # Print a dashed start-up banner, then serve the Gradio interface.
    banner_rule = "-" * 30
    print(f"\n{banner_rule} GAIA Agent Starting {banner_rule}")
    demo.launch(debug=True, share=False)
config.py CHANGED
@@ -11,29 +11,17 @@ HEADERS = {"Authorization": f"Bearer {HF_API_TOKEN}"} if HF_API_TOKEN else {}
11
  MAX_RETRIES = 3
12
  RETRY_DELAY = 2 # seconds
13
 
14
- # --- Knowledge Base Content ---
15
- GAIA_KNOWLEDGE = """
16
- ### AI and Agent Concepts
17
- - An agent is an autonomous entity that observes and acts upon an environment using sensors and actuators, usually to achieve specific goals.
18
- - GAIA (General AI Assistant) is a framework for creating and evaluating AI assistants that can perform a wide range of tasks.
19
- - The agent loop consists of perception, reasoning, and action.
20
- - RAG (Retrieval-Augmented Generation) combines retrieval of relevant information with generation capabilities of language models.
21
- - An LLM (Large Language Model) is a neural network trained on vast amounts of text data to understand and generate human language.
22
 
23
- ### Agent Capabilities
24
- - Tool use refers to an agent's ability to employ external tools like search engines, APIs, or specialized algorithms.
25
- - An effective agent should be able to decompose complex problems into manageable parts.
26
- - Chain-of-thought reasoning allows agents to break down problem-solving steps to improve accuracy.
27
- - Agents should apply appropriate reasoning strategies based on the type of question (factual, analytical, etc.)
28
- - Self-reflection helps agents identify and correct errors in their reasoning.
29
-
30
- ### Evaluation Criteria
31
- - Agent responses should be accurate, relevant, and factually correct.
32
- - Effective agents provide concise yet comprehensive answers.
33
- - Agents should acknowledge limitations and uncertainties when appropriate.
34
- - Good agents can follow multi-step instructions and fulfill all requirements.
35
- - Reasoning transparency helps users understand how the agent arrived at its conclusions.
36
- """
37
 
38
  # --- Tool Pattern Matching ---
39
  YOUTUBE_PATTERNS = ["youtube.com", "youtu.be", "video", "watch?v=", "channel"]
 
11
  MAX_RETRIES = 3
12
  RETRY_DELAY = 2 # seconds
13
 
14
+ # --- Knowledge Base Configuration ---
15
+ KNOWLEDGE_BASE_PATH = os.path.join(os.path.dirname(__file__), 'data', 'knowledge_base.txt')
 
 
 
 
 
 
16
 
17
def load_knowledge_base():
    """Return the knowledge-base file's text, or ``""`` if it is missing.

    The file at ``KNOWLEDGE_BASE_PATH`` is read as UTF-8. When it does not
    exist, its parent directory is created (if needed) and an empty string
    is returned.
    """
    try:
        kb_file = open(KNOWLEDGE_BASE_PATH, 'r', encoding='utf-8')
    except FileNotFoundError:
        # Create the data directory so the file can be added later.
        os.makedirs(os.path.dirname(KNOWLEDGE_BASE_PATH), exist_ok=True)
        return ""
    with kb_file:
        return kb_file.read()
 
 
 
 
 
 
25
 
26
  # --- Tool Pattern Matching ---
27
  YOUTUBE_PATTERNS = ["youtube.com", "youtu.be", "video", "watch?v=", "channel"]
data/knowledge_base.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### AI and Agent Concepts
2
+ - An agent is an autonomous entity that observes and acts upon an environment using sensors and actuators, usually to achieve specific goals.
3
+ - GAIA (General AI Assistant) is a framework for creating and evaluating AI assistants that can perform a wide range of tasks.
4
+ - The agent loop consists of perception, reasoning, and action.
5
+ - RAG (Retrieval-Augmented Generation) combines retrieval of relevant information with generation capabilities of language models.
6
+ - An LLM (Large Language Model) is a neural network trained on vast amounts of text data to understand and generate human language.
7
+
8
+ ### Agent Capabilities
9
+ - Tool use refers to an agent's ability to employ external tools like search engines, APIs, or specialized algorithms.
10
+ - An effective agent should be able to decompose complex problems into manageable parts.
11
+ - Chain-of-thought reasoning allows agents to break down problem-solving steps to improve accuracy.
12
+ - Agents should apply appropriate reasoning strategies based on the type of question (factual, analytical, etc.)
13
+ - Self-reflection helps agents identify and correct errors in their reasoning.
14
+
15
+ ### Evaluation Criteria
16
+ - Agent responses should be accurate, relevant, and factually correct.
17
+ - Effective agents provide concise yet comprehensive answers.
18
+ - Agents should acknowledge limitations and uncertainties when appropriate.
19
+ - Good agents can follow multi-step instructions and fulfill all requirements.
20
+ - Reasoning transparency helps users understand how the agent arrived at its conclusions.
tools/tool_manager.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tool manager for the GAIA Agent.
3
+
4
+ This handles the coordination between different tools and provides them to the agent.
5
+ """
6
+
7
+ from smolagents import Tool
8
+ from typing import Dict, List, Any
9
+ from langchain.docstore.document import Document
10
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
11
+ from langchain_community.retrievers import BM25Retriever
12
+ import functools
13
+
14
+ from config import load_knowledge_base
15
+ from tools.web_tools import WebSearchTool, WebContentTool
16
+ from tools.youtube_tool import YoutubeVideoTool
17
+ from tools.wikipedia_tool import WikipediaTool
18
+ from tools.knowledge_tool import GaiaRetrieverTool
19
+
20
class ToolManager:
    """
    Build and hold the set of tools offered to the GAIA agent.

    On construction the knowledge base text is loaded with
    ``load_knowledge_base()``, split into ``Document`` chunks, and every
    tool is instantiated once and kept in ``self.tools``.
    """

    def __init__(self):
        # Load and process knowledge base
        knowledge_text = load_knowledge_base()
        self.knowledge_docs = self._create_knowledge_documents(knowledge_text)

        # Initialize tools
        self.tools = self._initialize_tools()

    def _create_knowledge_documents(self, text: str) -> List[Document]:
        """
        Create searchable documents from knowledge base text.

        Args:
            text: Raw knowledge base contents. May be empty, e.g. when the
                knowledge base file does not exist yet.

        Returns:
            One ``Document`` per chunk of at most 500 characters (50
            characters of overlap); an empty list when ``text`` is empty or
            contains only whitespace.
        """
        # Nothing to index: return early instead of running the splitter on
        # empty input.
        if not (text and text.strip()):
            return []

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        )
        knowledge_chunks = text_splitter.split_text(text)
        return [Document(page_content=chunk) for chunk in knowledge_chunks]

    def _initialize_tools(self) -> List[Tool]:
        """
        Initialize all available tools.

        Returns:
            The tool instances in registration order. The knowledge
            retriever comes first and is only included when at least one
            knowledge document exists.
        """
        tools = []

        # NOTE(review): GaiaRetrieverTool is defined elsewhere and is
        # presumably backed by a BM25 index, which cannot be built over an
        # empty corpus. Skipping it keeps a missing/empty knowledge base
        # from taking down the whole agent -- confirm against the tool.
        if self.knowledge_docs:
            tools.append(GaiaRetrieverTool(self.knowledge_docs))

        tools.extend(
            [
                WebSearchTool(),
                WebContentTool(),
                YoutubeVideoTool(),
                WikipediaTool(),
            ]
        )
        return tools

    def get_tools(self) -> List[Tool]:
        """Return all available tools."""
        return self.tools