import os
from typing import List, Optional

import pandas as pd
from langchain.docstore.document import Document
from langchain_community.retrievers import BM25Retriever
from smolagents import Tool


class WikipediaRAGTool(Tool):
    name = "wikipedia_rag"
    description = "Retrieves relevant information from Wikipedia articles using RAG."
    inputs = {
        "query": {
            "type": "string",
            "description": "The search query to find relevant Wikipedia content.",
        }
    }
    output_type = "string"

    def __init__(self, dataset_path: str = "wikipedia-structured-contents"):
        super().__init__()  # let the smolagents Tool base class run its own setup
        self.is_initialized = False
        self.dataset_path = dataset_path
        self.docs: List[Document] = []
        self.retriever: Optional[BM25Retriever] = None

    def _load_documents(self) -> None:
        """Load and process the Wikipedia dataset into Document objects."""
        try:
            # Load the dataset of pre-scraped Wikipedia articles
            df = pd.read_csv(os.path.join(self.dataset_path, "wikipedia_articles.csv"))
            # Convert each article row into a Document, keeping source metadata
            self.docs = [
                Document(
                    page_content=f"Title: {row['title']}\n\nContent: {row['content']}",
                    metadata={
                        "title": row["title"],
                        "url": row["url"],
                        "category": row.get("category", ""),
                    },
                )
                for _, row in df.iterrows()
            ]
            # Build a BM25 (keyword-based) retriever over the documents
            self.retriever = BM25Retriever.from_documents(self.docs)
            self.is_initialized = True
        except Exception as e:
            print(f"Error loading documents: {e}")
            raise

    def forward(self, query: str) -> str:
        """Process the query and return relevant Wikipedia content."""
        # Load documents lazily on the first call
        if not self.is_initialized:
            self._load_documents()
        if not self.retriever:
            return "Error: Retriever not initialized properly."
        try:
            # Get the documents most relevant to the query
            # (get_relevant_documents is deprecated in newer langchain releases;
            # retriever.invoke(query) is the current equivalent)
            results = self.retriever.get_relevant_documents(query)
            if not results:
                return "No relevant Wikipedia articles found."
            # Format the top 3 results with their metadata and a content preview
            formatted_results = []
            for doc in results[:3]:
                metadata = doc.metadata
                formatted_results.append(
                    f"Title: {metadata['title']}\n"
                    f"URL: {metadata['url']}\n"
                    f"Category: {metadata['category']}\n"
                    f"Content: {doc.page_content[:500]}...\n"
                )
            return "\n\n".join(formatted_results)
        except Exception as e:
            return f"Error retrieving information: {str(e)}"