# NOTE(review): the lines below are web-scrape artifacts (Hugging Face Spaces
# status header, commit hashes, and gutter line numbers), not Python code.
# They are preserved here as comments so the file parses:
#   Spaces: Runtime error / Runtime error
#   File size: 2,195 Bytes
#   731825e c657a71 6a809e4 ... (commit-hash row)
#   1 2 3 4 ... 48 (line-number gutter)
# Importing necessary libraries and modules
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper
from pydantic import PrivateAttr
from langchain_core.tools.base import BaseTool
from langchain_community.document_loaders import WikipediaLoader
import requests
from bs4 import BeautifulSoup
import wikipedia
# Defining the WikipediaTool class which extends BaseTool
class WikipediaTool(BaseTool):
    """LangChain tool that fetches a Wikipedia page and returns its cleaned HTML.

    Given a specific query, it resolves the matching Wikipedia page, parses the
    page HTML, strips noise elements (styles, scripts, citation superscripts,
    infoboxes, navboxes, reference markers), and returns the remaining content
    division as an HTML string.
    """

    # Tool identity exposed to the agent framework (values unchanged).
    name: str = "wikipedia_tool"
    description: str = "Search Wikipedia for a given query, retrieving the corresponding page's HTML content. The query should not contain any noise and ask for something specific."

    def __init__(self):
        # No extra state; delegate pydantic/BaseTool initialization.
        super().__init__()

    def _run(self, query: str) -> str:
        """Fetch and clean the Wikipedia page for *query*.

        Returns the cleaned HTML of the main content ``<div>``, an empty string
        if the content division is missing, or a human-readable error message
        when the query is ambiguous or matches no page (a tool should hand the
        agent a string rather than raise and crash the run loop).
        """
        print(f"wikipedia_search_html called with query='{query}'")  # Logging the query

        # Step 1: Resolve the page. wikipedia.page() raises on ambiguous or
        # unknown titles — report those back to the agent instead of crashing.
        try:
            page = wikipedia.page(query)
        except wikipedia.DisambiguationError as err:
            options = ", ".join(err.options[:10])
            return f"Ambiguous query '{query}'. Did you mean one of: {options}?"
        except wikipedia.PageError:
            return f"No Wikipedia page found for query '{query}'."
        html = page.html()

        # Step 2: Parse the HTML and locate the main article body.
        soup = BeautifulSoup(html, "html.parser")
        content_div = soup.find("div", class_="mw-parser-output")
        if not content_div:
            return ""

        # Step 3: Collect noise tags first, then decompose — mutating the tree
        # while find_all() is iterating it would skip/confuse traversal.
        to_decompose = []
        for tag in content_div.find_all():
            tag_classes = tag.get("class", [])
            if (
                tag.name in ("style", "script", "sup")
                or any(cls in ("infobox", "navbox", "reference") for cls in tag_classes)
            ):
                to_decompose.append(tag)
        for tag in to_decompose:
            tag.decompose()

        # Return the cleaned content division serialized back to HTML.
        return str(content_div)