Spaces:
Sleeping
Sleeping
File size: 1,938 Bytes
396f5a0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
from smolagents import tool
@tool
def fetch_webpage(url: str, convert_to_markdown: bool = True) -> str:
    """
    Visits a website and fetches the content of a given URL / webpage.
    If markdown conversion is enabled, script and style elements are removed and the page is returned as Markdown; otherwise the raw, unfiltered HTML is returned.

    Args:
        url (str): The URL to fetch.
        convert_to_markdown (bool): If True, convert the HTML content to Markdown format, else return the raw HTML.

    Returns:
        str: The page content as Markdown when convert_to_markdown is True, otherwise the raw HTML.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out, or the server answers with a 4xx/5xx status.
    """
    # Imports are kept local to the function, as in the original.
    # NOTE(review): presumably so the tool stays self-contained when it is
    # serialized or shared -- confirm before hoisting them to module level.
    import requests
    from bs4 import BeautifulSoup
    from markdownify import markdownify as md

    # Browser-like User-Agent (presumably because some sites refuse the
    # default python-requests agent -- confirm if this is still needed).
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, timeout=30, headers=headers)
    # Surface HTTP errors instead of handing an error page back as if it
    # were the requested content.
    response.raise_for_status()

    if not convert_to_markdown:
        return response.text

    soup = BeautifulSoup(response.text, "html.parser")
    # Remove script and style elements together with their contents, so
    # that JavaScript/CSS source never ends up in the Markdown text.
    for element in soup(["script", "style"]):
        element.decompose()

    # Convert the whole cleaned document by default. For Wikipedia, narrow
    # the conversion to the main content container when the page has one;
    # if it does not, the cleaned document is still used (never the raw,
    # unfiltered response text).
    root = soup
    if "wikipedia.org" in url:
        main_content = soup.find("main", {"id": "content"})
        if main_content is not None:
            root = main_content

    # `strip` is kept from the original call; the listed tags have already
    # been removed from the tree above.
    return md(str(root), strip=['script', 'style'], heading_style="ATX").strip()
|