# Importing the libraries the tool needs
from bs4 import BeautifulSoup
from langchain_core.tools.base import BaseTool
import wikipedia

# Defining the WikipediaTool class, which extends LangChain's BaseTool
class WikipediaTool(BaseTool):
    name: str = "wikipedia_tool"
    description: str = (
        "Search Wikipedia for a given query and retrieve the corresponding "
        "page's HTML content. The query should be specific and free of noise."
    )

    def _run(self, query: str) -> str:
        # Method to run the Wikipedia tool with the given query
        print(f"wikipedia_tool called with query='{query}'")  # Logging the query
        # Step 1: Get Wikipedia HTML
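        # Note: wikipedia.page() can raise wikipedia.exceptions.DisambiguationError
        # or wikipedia.exceptions.PageError for ambiguous or missing titles.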
        page = wikipedia.page(query)  # Fetching the Wikipedia page for the query
        html = page.html()  # Extracting the HTML content of the page

        # Step 2: Parse HTML
        soup = BeautifulSoup(html, "html.parser")  # Parsing the HTML content
        content_div = soup.find("div", class_="mw-parser-output")  # MediaWiki wraps the article body in this div
        if not content_div:
            return ""

        # Step 3: Collect tags to remove: style/script blocks, citation superscripts
        # (sup), and boxes such as infoboxes, navboxes, and reference lists
        to_decompose = []  # Collecting tags to be removed
        for tag in content_div.find_all():  # Looping through all tags in the content division
            tag_classes = tag.get("class", [])
            if (
                tag.name in ["style", "script", "sup"]
                or any(cls in ["infobox", "navbox", "reference"] for cls in tag_classes)
            ):
                to_decompose.append(tag)

        # Decomposing only after the scan, to avoid mutating the tree mid-iteration
        for tag in to_decompose:
            tag.decompose()

        return str(content_div)  # Returning the cleaned content division as string
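

# Minimal usage sketch, assuming BaseTool's synchronous .run() entry point from
# langchain-core; the query string below is illustrative, not from the source.
if __name__ == "__main__":
    tool = WikipediaTool()
    cleaned_html = tool.run("Alan Turing")  # illustrative query
    print(cleaned_html[:500])  # Previewing the first 500 characters of cleaned HTML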