# A smolagents tool module: fetch the HTML content of a URL (optionally as
# Markdown) and search the web through a local SearXNG instance.

import os
import time

import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from smolagents import tool


@tool
def fetch_webpage(url: str, convert_to_markdown: bool = True) -> str:
    """
    Fetches the HTML content of a given URL.

    If markdown conversion is enabled, script and style tags are removed and
    the text content is returned as Markdown; otherwise the raw unfiltered
    HTML is returned.

    Args:
        url (str): The URL to fetch.
        convert_to_markdown (bool): If True, convert the HTML content to
            Markdown format, else return the raw HTML.

    Returns:
        str: The (possibly converted) content of the URL.
    """
    # A timeout keeps the agent from hanging forever on an unresponsive host.
    response = requests.get(url, timeout=30)

    if convert_to_markdown:
        soup = BeautifulSoup(response.text, "html.parser")
        # Remove script and style tags so they do not leak into the Markdown.
        for tag in soup(["script", "style"]):
            tag.extract()
        if "wikipedia.org" in url:
            # For Wikipedia keep only the main article content, skipping the
            # navigation / sidebar chrome.
            main_content = soup.find("main", {"id": "content"})
            if main_content:
                content = md(str(main_content), strip=['script', 'style']).strip()
            else:
                content = md(response.text, strip=['script', 'style']).strip()
        else:
            # Bug fix: the original left `content` unassigned on this path
            # (non-Wikipedia URL with Markdown conversion requested), which
            # raised NameError. Convert the cleaned page instead.
            content = md(str(soup), strip=['script', 'style']).strip()
    else:
        content = response.text

    try:
        # Save content to a file in the test folder before returning.
        # The extension follows convert_to_markdown, and a timestamp suffix
        # keeps the filenames unique.
        file_extension = ".md" if convert_to_markdown else ".html"
        unicity_suffix = str(int(time.time()))
        # Create the target directory up front so the first run does not
        # fail with FileNotFoundError.
        os.makedirs("test", exist_ok=True)
        file_name = f"test/fetched_content_{unicity_suffix}{file_extension}"
        with open(file_name, "w", encoding="utf-8") as f:
            f.write(content)
    except Exception as e:
        # Best-effort debugging aid only: a failed save must not break the tool.
        print(f"Error saving content to file: {e}")

    return content


@tool
def search_web(query: str, num_results: int = 5) -> list:
    """
    Perform a web search using a local SearXNG instance.

    Args:
        query (str): The search query.
        num_results (int): The number of results to return.

    Returns:
        list: A list of search results sorted by score with
            {url, title, content, score} for each result.
    """
    # Local metasearch engine SearXNG, expected to run on localhost:8888.
    searxng_url = "http://localhost:8888/search"
    params = {"q": query, "format": 'json'}
    response = requests.get(searxng_url, params=params, timeout=30)
    if response.status_code == 200:
        ret = response.json()
        # Keep only the response's 'results' array.
        results = ret.get("results", [])
        # Keep only the first num_results entries.
        results = results[:num_results]
        # For each result keep only the url, title, content and score.
        results = [
            {
                "url": result.get("url"),
                "title": result.get("title"),
                "content": result.get("content"),
                "score": result.get("score"),
            }
            for result in results
        ]
        return results
    print(f"Error: {response.status_code}")
    return []


if __name__ == "__main__":
    try:
        # Test the web search tool.
        query = "What is the capital of France?"
        results = search_web(query, 3)
        print(results)
    except Exception as e:
        print(f"An error occurred: {e}")

    try:
        # Test the webpage fetch tool.
        video_id = "L1vXCYZAYYM"  # Replace with your YouTube video ID
        video_url = "https://www.youtube.com/watch?v=" + video_id
        url = "https://en.wikipedia.org/wiki/Malko_Competition"
        # page_content = fetch_webpage(video_url)
        page_content = fetch_webpage(url, convert_to_markdown=True)
        print(page_content.encode("utf-8"))
    except Exception as e:
        print(f"An error occurred: {e}")