Spaces:
Sleeping
Sleeping
File size: 2,851 Bytes
837e221 e4c7240 837e221 e4c7240 837e221 e4c7240 837e221 e4c7240 837e221 e4c7240 837e221 e4c7240 837e221 e4c7240 837e221 e4c7240 837e221 e4c7240 837e221 e4c7240 837e221 e4c7240 837e221 e4c7240 837e221 e4c7240 837e221 e4c7240 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
import os
from typing import List, Optional
import pandas as pd
from langchain.docstore.document import Document
from langchain_community.retrievers import BM25Retriever
from smolagents import Tool
class WikipediaRAGTool(Tool):
    """smolagents Tool that answers queries from a local Wikipedia CSV via BM25 retrieval."""

    name = "wikipedia_rag"
    description = "Retrieves relevant information from Wikipedia articles using RAG."
    inputs = {
        "query": {
            "type": "string",
            "description": "The search query to find relevant Wikipedia content.",
        }
    }
    output_type = "string"

    def __init__(self, dataset_path: str = "wikipedia-structured-contents"):
        """Create the tool; documents are loaded lazily on the first query.

        Args:
            dataset_path: Directory expected to contain ``wikipedia_articles.csv``.
        """
        # Fix: the original skipped super().__init__(), which runs smolagents'
        # Tool setup/validation of name/description/inputs/output_type.
        super().__init__()
        self.is_initialized = False  # flips to True once _load_documents succeeds
        self.dataset_path = dataset_path
        self.docs: List[Document] = []
        self.retriever: Optional[BM25Retriever] = None

    def _load_documents(self) -> None:
        """Load the Wikipedia CSV into Documents and build the BM25 retriever.

        Raises:
            Exception: re-raised (after printing) if the CSV is missing/unreadable
                or a required column ('title', 'content', 'url') is absent.
        """
        try:
            # Expected columns: title, content, url, and optionally category.
            df = pd.read_csv(os.path.join(self.dataset_path, "wikipedia_articles.csv"))
            # One Document per article; title is prepended to the content so
            # BM25 can match on it too.
            self.docs = [
                Document(
                    page_content=f"Title: {row['title']}\n\nContent: {row['content']}",
                    metadata={
                        "title": row["title"],
                        "url": row["url"],
                        # Series.get tolerates a missing 'category' column.
                        "category": row.get("category", ""),
                    },
                )
                for _, row in df.iterrows()
            ]
            # BM25 is a lexical (keyword) retriever — no embedding model needed.
            self.retriever = BM25Retriever.from_documents(self.docs)
            self.is_initialized = True
        except Exception as e:
            print(f"Error loading documents: {e}")
            raise

    def forward(self, query: str) -> str:
        """Return up to three relevant Wikipedia articles formatted as text.

        Args:
            query: Free-text search string.

        Returns:
            A formatted string with title/URL/category and a 500-char content
            excerpt per match, or an error / no-results message.
        """
        # Lazy initialization: defer the (potentially slow) CSV load until
        # the tool is actually used.
        if not self.is_initialized:
            self._load_documents()
        if not self.retriever:
            return "Error: Retriever not initialized properly."
        try:
            # Fix: get_relevant_documents() is deprecated in langchain-core;
            # invoke() is the supported replacement and returns the same list.
            results = self.retriever.invoke(query)
            if not results:
                return "No relevant Wikipedia articles found."
            # Format the top matches for consumption by the calling agent.
            formatted_results = []
            for doc in results[:3]:  # keep only the top 3 most relevant hits
                metadata = doc.metadata
                formatted_results.append(
                    f"Title: {metadata['title']}\n"
                    f"URL: {metadata['url']}\n"
                    f"Category: {metadata['category']}\n"
                    f"Content: {doc.page_content[:500]}...\n"
                )
            return "\n\n".join(formatted_results)
        except Exception as e:
            return f"Error retrieving information: {str(e)}"
|