Refactor agent structure by modularizing agent implementations into separate directories for web, data-analysis, and media agents. Remove legacy code from agents.py, prompts.py, and tools.py to improve maintainability. Update main_v2.py to reflect the new import paths and agent initialization. Add new tools, including web search and data extraction. Update requirements.txt with the dependencies the new tools require.
837e221
import os
from typing import List, Optional

import pandas as pd
from langchain_core.documents import Document
from langchain_community.retrievers import BM25Retriever
from smolagents import Tool


class WikipediaRAGTool(Tool):
    name = "wikipedia_rag"
    description = "Retrieves relevant information from Wikipedia articles using RAG."
    inputs = {
        "query": {
            "type": "string",
            "description": "The search query to find relevant Wikipedia content."
        }
    }
    output_type = "string"

    def __init__(self, dataset_path: str = "wikipedia-structured-contents"):
        super().__init__()  # smolagents tools must initialize the Tool base class
        self.is_initialized = False
        self.dataset_path = dataset_path
        self.docs: List[Document] = []
        self.retriever: Optional[BM25Retriever] = None

    def _load_documents(self) -> None:
        """Load and process the Wikipedia dataset into Document objects."""
        try:
            # Load the dataset
            df = pd.read_csv(os.path.join(self.dataset_path, "wikipedia_articles.csv"))
            # Convert each article into a Document
            self.docs = [
                Document(
                    page_content=f"Title: {row['title']}\n\nContent: {row['content']}",
                    metadata={
                        "title": row["title"],
                        "url": row["url"],
                        "category": row.get("category", ""),
                    },
                )
                for _, row in df.iterrows()
            ]
            # Initialize the BM25 retriever over the loaded documents
            self.retriever = BM25Retriever.from_documents(self.docs)
            self.is_initialized = True
        except Exception as e:
            print(f"Error loading documents: {e}")
            raise

    def forward(self, query: str) -> str:
        """Process the query and return relevant Wikipedia content."""
        if not self.is_initialized:
            self._load_documents()
        if not self.retriever:
            return "Error: Retriever not initialized properly."
        try:
            # Get relevant documents; invoke() is the current retriever API
            # (get_relevant_documents() is deprecated in recent LangChain releases)
            results = self.retriever.invoke(query)
            if not results:
                return "No relevant Wikipedia articles found."
            # Format the results
            formatted_results = []
            for doc in results[:3]:  # Return the top 3 most relevant results
                metadata = doc.metadata
                formatted_results.append(
                    f"Title: {metadata['title']}\n"
                    f"URL: {metadata['url']}\n"
                    f"Category: {metadata['category']}\n"
                    f"Content: {doc.page_content[:500]}...\n"
                )
            return "\n\n".join(formatted_results)
        except Exception as e:
            return f"Error retrieving information: {str(e)}"