# This module defines smolagents tools to fetch the HTML content of a URL and to search the web via a local SearXNG instance.
from smolagents import tool
import requests
from markdownify import markdownify as md
from bs4 import BeautifulSoup
from common.mylogger import save_file_with_timestamp, mylog

@tool
def fetch_webpage(url: str, convert_to_markdown: bool = True) -> str:
"""
Fetches the HTML content of a given URL.
if markdown conversion is enabled, it will remove script and style and return the text content as markdown else return raw unfiltered HTML
Args:
url (str): The URL to fetch.
convert_to_markdown (bool): If True, convert the HTML content to Markdown format. else return the raw HTML.
Returns:
str: The HTML content of the URL.
"""
    content = None
    # note: some sites block the default python-requests User-Agent; pass a
    # browser-like headers dict here if fetches start failing with 403s
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # fail fast on HTTP errors instead of converting an error page
    if convert_to_markdown:
        soup = BeautifulSoup(response.text, "html.parser")
        # remove script and style tags
        for tag in soup(["script", "style"]):
            tag.extract()
        # for Wikipedia, keep only the main article content; fall back to the
        # full page if the expected <main id="content"> element is missing
        main_content = None
        if "wikipedia.org" in url:
            main_content = soup.find("main", {"id": "content"})
        if main_content:
            content = md(str(main_content), strip=["script", "style"], heading_style="ATX").strip()
        else:
            content = md(str(soup), strip=["script", "style"], heading_style="ATX").strip()
    else:
        content = response.text
    save_file_with_timestamp(content, "webpage", ".md" if convert_to_markdown else ".html")
    return content


# This tool performs a web search against a local SearXNG instance.
@tool
def search_web(query: str, num_results: int = 5) -> list:
"""
Perform a web search using local SearXNG instance.
Args:
query (str): The search query.
num_results (int): The number of results to return.
Returns:
list: A list of search results sorted by score with {url, title, content, score} for each result.
"""
    # local metasearch engine (SearXNG), expected to run on localhost:8888
    searxng_url = "http://localhost:8888/search"
    params = {"q": query, "format": "json"}
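    # Assumption about the local setup: the SearXNG instance must have the JSON
    # output format enabled ("json" listed under search.formats in settings.yml);
    # without it, SearXNG rejects format=json requests with HTTP 403.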
    response = requests.get(searxng_url, params=params, timeout=30)
if response.status_code == 200:
ret = response.json()
        # keep only the 'results' array of the response
results = ret.get("results", [])
# keep only the first num_results
results = results[:num_results]
        # for each result keep only the url, title, content and score
results = [
{
"url": result.get("url"),
"title": result.get("title"),
"content": result.get("content"),
"score": result.get("score"),
}
for result in results
]
return results
else:
print(f"Error: {response.status_code}")
return []
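

# A minimal sketch of wiring these tools into an agent, kept commented out so
# importing this module stays side-effect free. The model class and its
# credentials are assumptions (InferenceClientModel needs a Hugging Face
# token), not something this module sets up:
#
#     from smolagents import CodeAgent, InferenceClientModel
#
#     agent = CodeAgent(tools=[fetch_webpage, search_web], model=InferenceClientModel())
#     agent.run("Summarize https://en.wikipedia.org/wiki/Malko_Competition")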

if __name__ == "__main__":
try:
        # test the web search tool
query = "What is the capital of France?"
        results = search_web(query, 3)
print(results)
except Exception as e:
print(f"An error occurred: {e}")
try:
        # test the webpage fetch tool
video_id = "L1vXCYZAYYM" # Replace with your YouTube video ID
video_url = "https://www.youtube.com/watch?v=" + video_id
url = "https://en.wikipedia.org/wiki/Malko_Competition"
# page_content = fetch_webpage(video_url)
page_content = fetch_webpage(url, convert_to_markdown=True)
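        # encode before printing to sidestep UnicodeEncodeError on consoles
        # with a limited codepage (an assumption about the runtime environment)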
print(page_content.encode("utf-8"))
except Exception as e:
print(f"An error occurred: {e}")