mjschock's picture
Refactor agent structure by modularizing agent implementations into separate directories for web, data analysis, and media agents. Remove legacy code from agents.py, prompts.py, and tools.py, enhancing maintainability. Update main_v2.py to reflect new import paths and agent initialization. Add new tools for enhanced functionality, including web searching and data extraction. Update requirements.txt to include necessary dependencies for new tools.
837e221 unverified
raw
history blame
2.96 kB
import os
import pandas as pd
from typing import List, Optional
from langchain.docstore.document import Document
from langchain_community.retrievers import BM25Retriever
from smolagents import Tool
class WikipediaRAGTool(Tool):
    """smolagents Tool that answers queries with BM25-retrieved Wikipedia content.

    Loads a CSV of Wikipedia articles lazily on first use, indexes them with
    a BM25 retriever, and returns the top matches formatted as plain text.
    """

    name = "wikipedia_rag"
    description = "Retrieves relevant information from Wikipedia articles using RAG."
    inputs = {
        "query": {
            "type": "string",
            "description": "The search query to find relevant Wikipedia content."
        }
    }
    output_type = "string"

    def __init__(self, dataset_path: str = "wikipedia-structured-contents"):
        """Initialize the tool.

        Args:
            dataset_path: Directory expected to contain ``wikipedia_articles.csv``
                with at least ``title``, ``content`` and ``url`` columns
                (``category`` is optional).
        """
        # BUG FIX: the base Tool.__init__ was never called, skipping
        # smolagents' own tool setup/validation.
        super().__init__()
        self.is_initialized = False
        self.dataset_path = dataset_path
        self.docs: List[Document] = []
        self.retriever: Optional[BM25Retriever] = None

    def _load_documents(self) -> None:
        """Load the Wikipedia CSV and build the BM25 retriever (lazy init).

        Raises:
            Exception: re-raised after logging if the CSV cannot be read,
                is empty, or the retriever cannot be built.
        """
        try:
            csv_path = os.path.join(self.dataset_path, "wikipedia_articles.csv")
            df = pd.read_csv(csv_path)

            # Title and content both go into page_content so BM25 can match
            # on either; full metadata is kept for result formatting.
            self.docs = [
                Document(
                    page_content=f"Title: {row['title']}\n\nContent: {row['content']}",
                    metadata={
                        "title": row['title'],
                        "url": row['url'],
                        # 'category' may be missing from the CSV schema
                        "category": row.get('category', '')
                    }
                )
                for _, row in df.iterrows()
            ]

            # Fail with a clear message instead of an opaque error from the
            # BM25 implementation when the dataset is empty.
            if not self.docs:
                raise ValueError(f"No articles found in {csv_path}")

            self.retriever = BM25Retriever.from_documents(self.docs)
            self.is_initialized = True
        except Exception as e:
            print(f"Error loading documents: {e}")
            raise

    def forward(self, query: str) -> str:
        """Process the query and return relevant Wikipedia content.

        Args:
            query: Free-text search query.

        Returns:
            Up to three formatted matches (title, URL, category, content
            preview), or an error/no-results message string.
        """
        if not self.is_initialized:
            self._load_documents()

        if not self.retriever:
            return "Error: Retriever not initialized properly."

        try:
            # NOTE(review): get_relevant_documents is deprecated in newer
            # langchain releases in favor of retriever.invoke(query); kept
            # here since the installed langchain version is unknown.
            results = self.retriever.get_relevant_documents(query)

            if not results:
                return "No relevant Wikipedia articles found."

            # Format the top 3 most relevant results.
            formatted_results = []
            for doc in results[:3]:
                metadata = doc.metadata
                formatted_results.append(
                    f"Title: {metadata['title']}\n"
                    f"URL: {metadata['url']}\n"
                    f"Category: {metadata['category']}\n"
                    f"Content: {doc.page_content[:500]}...\n"
                )

            return "\n\n".join(formatted_results)
        except Exception as e:
            return f"Error retrieving information: {str(e)}"