# This is a smolagents tool to fetch HTML content from a URL.
from smolagents import tool
import requests
from markdownify import markdownify as md
from bs4 import BeautifulSoup
from common.mylogger import save_file_with_timestamp, mylog


@tool
def fetch_webpage(url: str, convert_to_markdown: bool = True) -> str:
    """
    Fetches the HTML content of a given URL.

    If Markdown conversion is enabled, script and style tags are removed
    and the text content is returned as Markdown; otherwise the raw,
    unfiltered HTML is returned.

    Args:
        url (str): The URL to fetch.
        convert_to_markdown (bool): If True, convert the HTML content to
            Markdown format; otherwise return the raw HTML.

    Returns:
        str: The content of the URL, as Markdown or raw HTML.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    if convert_to_markdown:
        soup = BeautifulSoup(response.text, "html.parser")
        # Remove script and style tags.
        for tag in soup(["script", "style"]):
            tag.extract()
        # For Wikipedia, keep only the main content; fall back to the
        # whole (cleaned) page if the main element is not found, so that
        # content is never left unset.
        main_content = None
        if "wikipedia.org" in url:
            main_content = soup.find("main", {"id": "content"})
        source = main_content if main_content else soup
        content = md(str(source), strip=["script", "style"], heading_style="ATX").strip()
    else:
        content = response.text
    save_file_with_timestamp(content, "webpage", ".md" if convert_to_markdown else ".html")
    return content


# This tool allows web search on a local SearXNG instance.
@tool
def search_web(query: str, num_results: int = 5) -> list:
    """
    Perform a web search using a local SearXNG instance.

    Args:
        query (str): The search query.
        num_results (int): The number of results to return.

    Returns:
        list: A list of search results sorted by score, with
            {url, title, content, score} for each result.
    """
    # Local metasearch engine SearXNG, running on localhost:8888.
    # Note: the instance must have JSON output enabled ("json" under
    # search.formats in settings.yml), otherwise the request is rejected.
    searxng_url = "http://localhost:8888/search"
    params = {"q": query, "format": "json"}
    response = requests.get(searxng_url, params=params, timeout=30)
    if response.status_code == 200:
        ret = response.json()
        # Keep only the response's 'results' array.
        results = ret.get("results", [])
        # Keep only the first num_results entries.
        results = results[:num_results]
        # For each result, keep only the url, title, content and score.
        results = [
            {
                "url": result.get("url"),
                "title": result.get("title"),
                "content": result.get("content"),
                "score": result.get("score"),
            }
            for result in results
        ]
        return results
    else:
        print(f"Error: {response.status_code}")
        return []


if __name__ == "__main__":
    try:
        # Test the search tool.
        query = "What is the capital of France?"
        results = search_web(query, 3)
        print(results)
    except Exception as e:
        print(f"An error occurred: {e}")

    try:
        # Test the fetch tool.
        video_id = "L1vXCYZAYYM"  # Replace with your YouTube video ID
        video_url = "https://www.youtube.com/watch?v=" + video_id
        url = "https://en.wikipedia.org/wiki/Malko_Competition"
        # page_content = fetch_webpage(video_url)
        page_content = fetch_webpage(url, convert_to_markdown=True)
        print(page_content.encode("utf-8"))
    except Exception as e:
        print(f"An error occurred: {e}")
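
# Example: wiring these tools into a smolagents agent. This is a minimal
# sketch, not part of the tested module above; CodeAgent is part of
# smolagents, but the model class name and its configuration vary across
# smolagents versions and providers, so treat InferenceClientModel and the
# prompt below as assumptions to adapt. Kept commented out so importing
# this module stays side-effect free.
#
#     from smolagents import CodeAgent, InferenceClientModel
#
#     agent = CodeAgent(
#         tools=[fetch_webpage, search_web],
#         model=InferenceClientModel(),  # assumes a configured inference backend
#     )
#     agent.run("Find the Wikipedia page for the Malko Competition and summarize it.")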