mjschock's picture
Enhance agent functionality in main_v2.py by adding WikipediaSearchTool and updating DuckDuckGoSearchTool and VisitWebpageTool parameters. Modify agent initialization to accommodate new tools and increase max results and output length. Update requirements.txt to include Wikipedia-API dependency. Refactor imports for better organization across agent modules.
e4c7240 unverified
raw
history blame
2.85 kB
import os
from typing import List, Optional
import pandas as pd
from langchain.docstore.document import Document
from langchain_community.retrievers import BM25Retriever
from smolagents import Tool
class WikipediaRAGTool(Tool):
    """Retrieve relevant Wikipedia content with BM25 keyword retrieval (RAG).

    Articles are lazily loaded from ``<dataset_path>/wikipedia_articles.csv``
    on the first query so constructing the tool stays cheap.
    """

    name = "wikipedia_rag"
    description = "Retrieves relevant information from Wikipedia articles using RAG."
    inputs = {
        "query": {
            "type": "string",
            "description": "The search query to find relevant Wikipedia content.",
        }
    }
    output_type = "string"

    def __init__(self, dataset_path: str = "wikipedia-structured-contents"):
        """Initialize the tool.

        Args:
            dataset_path: Directory expected to contain ``wikipedia_articles.csv``
                with at least ``title``, ``content`` and ``url`` columns
                (``category`` optional).
        """
        # Bug fix: smolagents' Tool base class performs attribute validation
        # and setup in its __init__; a subclass that overrides __init__ must
        # chain to it or the tool is left partially initialized.
        super().__init__()
        self.is_initialized = False
        self.dataset_path = dataset_path
        self.docs: List[Document] = []
        self.retriever: Optional[BM25Retriever] = None

    def _load_documents(self) -> None:
        """Load the Wikipedia CSV and build the BM25 retriever.

        Raises:
            Exception: Re-raises any failure (missing file, bad schema, ...)
                after logging it; forward() converts this into an error string.
        """
        try:
            csv_path = os.path.join(self.dataset_path, "wikipedia_articles.csv")
            df = pd.read_csv(csv_path)

            # One Document per article; the title is folded into the page
            # content so BM25 also scores title matches.
            self.docs = [
                Document(
                    page_content=f"Title: {row['title']}\n\nContent: {row['content']}",
                    metadata={
                        "title": row["title"],
                        "url": row["url"],
                        # Some dumps lack a category column; default to "".
                        "category": row.get("category", ""),
                    },
                )
                for _, row in df.iterrows()
            ]

            self.retriever = BM25Retriever.from_documents(self.docs)
            self.is_initialized = True
        except Exception as e:
            # Log and re-raise so the caller (forward) can report the error.
            print(f"Error loading documents: {e}")
            raise

    def forward(self, query: str) -> str:
        """Process the query and return relevant Wikipedia content.

        Args:
            query: Free-text search query.

        Returns:
            Up to three formatted matches (title, URL, category, content
            preview), or a human-readable error/no-result message.
        """
        # Lazy initialization: load documents on first use only.
        if not self.is_initialized:
            try:
                self._load_documents()
            except Exception as e:
                return f"Error retrieving information: {str(e)}"
        if not self.retriever:
            return "Error: Retriever not initialized properly."

        try:
            # invoke() is the supported replacement for the deprecated
            # get_relevant_documents() in modern LangChain.
            results = self.retriever.invoke(query)
            if not results:
                return "No relevant Wikipedia articles found."

            # Format the top 3 most relevant results, truncating each
            # article body to a 500-character preview.
            formatted_results = []
            for doc in results[:3]:
                metadata = doc.metadata
                formatted_results.append(
                    f"Title: {metadata['title']}\n"
                    f"URL: {metadata['url']}\n"
                    f"Category: {metadata['category']}\n"
                    f"Content: {doc.page_content[:500]}...\n"
                )
            return "\n\n".join(formatted_results)
        except Exception as e:
            return f"Error retrieving information: {str(e)}"