File size: 2,851 Bytes
837e221
 
e4c7240
 
837e221
 
 
 
e4c7240
837e221
 
 
 
 
 
e4c7240
837e221
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e4c7240
837e221
 
 
 
 
e4c7240
 
 
 
837e221
 
 
e4c7240
837e221
 
 
e4c7240
837e221
 
 
 
 
 
 
 
e4c7240
837e221
 
e4c7240
837e221
 
 
e4c7240
837e221
 
e4c7240
837e221
 
 
 
 
 
 
 
 
 
e4c7240
837e221
e4c7240
837e221
e4c7240
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import os
from typing import List, Optional

import pandas as pd
from langchain.docstore.document import Document
from langchain_community.retrievers import BM25Retriever
from smolagents import Tool


class WikipediaRAGTool(Tool):
    """smolagents Tool that answers queries from a local Wikipedia dump.

    Articles are loaded lazily from ``<dataset_path>/wikipedia_articles.csv``
    on the first call to :meth:`forward`, indexed with a BM25 lexical
    retriever, and the top-3 matches are returned as formatted text.
    """

    name = "wikipedia_rag"
    description = "Retrieves relevant information from Wikipedia articles using RAG."
    inputs = {
        "query": {
            "type": "string",
            "description": "The search query to find relevant Wikipedia content.",
        }
    }
    output_type = "string"

    def __init__(self, dataset_path: str = "wikipedia-structured-contents"):
        """Create the tool without loading any data yet.

        Args:
            dataset_path: Directory containing ``wikipedia_articles.csv``
                with at least ``title``, ``content`` and ``url`` columns
                (``category`` is optional).
        """
        # BUG FIX: the smolagents Tool base class performs attribute
        # validation/setup in its __init__; skipping it breaks tool
        # registration with an agent.
        super().__init__()
        self.is_initialized = False  # flips to True after a successful load
        self.dataset_path = dataset_path
        self.docs: List[Document] = []
        self.retriever: Optional[BM25Retriever] = None

    def _load_documents(self) -> None:
        """Load the CSV dataset, build Documents, and initialize BM25.

        Raises:
            Exception: re-raises any error from reading or indexing the
                dataset after logging it (missing file, missing columns, ...).
        """
        try:
            df = pd.read_csv(os.path.join(self.dataset_path, "wikipedia_articles.csv"))

            # One Document per article; title is embedded in the page
            # content so BM25 can match on it, and kept in metadata for
            # result formatting.
            self.docs = [
                Document(
                    page_content=f"Title: {row['title']}\n\nContent: {row['content']}",
                    metadata={
                        "title": row["title"],
                        "url": row["url"],
                        # 'category' may be absent from the CSV; default to "".
                        "category": row.get("category", ""),
                    },
                )
                for _, row in df.iterrows()
            ]

            self.retriever = BM25Retriever.from_documents(self.docs)
            self.is_initialized = True

        except Exception as e:
            print(f"Error loading documents: {e}")
            raise

    def forward(self, query: str) -> str:
        """Return formatted excerpts of the Wikipedia articles best matching *query*.

        Args:
            query: Free-text search query.

        Returns:
            Up to three results (title, URL, category, 500-char content
            preview) joined by blank lines, or a human-readable error /
            "no results" message. Never raises.
        """
        # Lazy initialization: the dataset is only read on first use.
        if not self.is_initialized:
            self._load_documents()

        if not self.retriever:
            return "Error: Retriever not initialized properly."

        try:
            # invoke() is the current Runnable API; get_relevant_documents()
            # is deprecated in langchain-core >= 0.1.46.
            results = self.retriever.invoke(query)

            if not results:
                return "No relevant Wikipedia articles found."

            # Keep only the top 3 hits and render each as a short card.
            formatted_results = []
            for doc in results[:3]:
                metadata = doc.metadata
                formatted_results.append(
                    f"Title: {metadata['title']}\n"
                    f"URL: {metadata['url']}\n"
                    f"Category: {metadata['category']}\n"
                    f"Content: {doc.page_content[:500]}...\n"
                )

            return "\n\n".join(formatted_results)

        except Exception as e:
            # Tool contract: surface failures as a string, not an exception.
            return f"Error retrieving information: {str(e)}"