# HF_Agents_Final_Project / src / web_browsing_tool.py
# Author: Yago Bolivar
# refactor: update tool classes to inherit from Tool base class for
# consistency and improved structure (commit bffd09a)
import requests
from bs4 import BeautifulSoup
from smolagents.tools import Tool
class WebBrowser(Tool):
    """
    Tool that fetches a web page and returns its visible text content.

    Intended for agents that need to read or summarize online sources.
    All failures are reported as strings starting with "Error:" rather
    than raised, so callers can treat the result uniformly.
    """
    name = "web_browser"
    description = "Fetches the content of a web page and extracts its text. Input should be a valid URL."
    inputs = {'url': {'type': 'string', 'description': 'The URL of the web page to browse.'}}
    outputs = {'text_content': {'type': 'string', 'description': 'The extracted text content of the web page, or an error message.'}}
    output_type = "string"

    def __init__(self, user_agent="GAIA-Agent/1.0", *args, **kwargs):
        """
        Create the browser tool.

        Args:
            user_agent (str): User-Agent header value sent with every request.
        """
        super().__init__(*args, **kwargs)
        self.headers = {"User-Agent": user_agent}
        # Flag mirroring the smolagents tool-state convention.
        self.is_initialized = True

    def forward(self, url: str) -> str:
        """
        Download *url* and return its cleaned, script-free text.

        Args:
            url (str): Address of the page to fetch; must use http(s).

        Returns:
            str: Normalized page text, or a message starting with "Error:"
            describing why fetching or parsing failed.
        """
        # Guard clause: reject anything that is not an http(s) URL up front.
        if not url.startswith(('http://', 'https://')):
            return f"Error: Invalid URL format. URL must start with http:// or https://. Received: {url}"
        try:
            resp = requests.get(url, headers=self.headers, timeout=15)
            resp.raise_for_status()  # 4XX/5XX responses raise HTTPError

            parsed = BeautifulSoup(resp.content, 'html.parser')
            # Remove non-visible markup before extracting text.
            for tag in parsed(["script", "style"]):
                tag.decompose()

            raw = parsed.get_text(separator='\n', strip=True)
            # Drop blank lines and collapse runs of whitespace inside each line.
            kept = [' '.join(ln.split()) for ln in raw.splitlines() if ln.strip()]
            text = '\n'.join(kept)

            if not text:
                return f"Error: No text content found at {url}."
            return text
        except requests.exceptions.HTTPError as http_err:
            return f"Error: HTTP error occurred while fetching {url}: {http_err}"
        except requests.exceptions.ConnectionError as conn_err:
            return f"Error: Connection error occurred while fetching {url}: {conn_err}"
        except requests.exceptions.Timeout as timeout_err:
            return f"Error: Timeout occurred while fetching {url}: {timeout_err}"
        except requests.exceptions.RequestException as req_err:
            return f"Error: An unexpected error occurred while fetching {url}: {req_err}"
        except Exception as e:
            return f"Error: An unexpected error occurred during parsing of {url}: {e}"
if __name__ == '__main__':
    # Manual smoke test for the tool. A real agent would obtain URLs
    # from the task itself or from a preceding search step; here we use
    # a known Wikipedia page for demonstration (e.g. for tasks like
    # "How many studio albums were published by Mercedes Sosa...").
    browser = WebBrowser()

    demo_url = "https://en.wikipedia.org/wiki/Mercedes_Sosa"
    print(f"--- Browsing: {demo_url} ---")
    page_text = browser.forward(demo_url)
    if page_text.startswith("Error:"):
        print(page_text)
    else:
        # Truncate long pages so the demo output stays readable.
        print(page_text[:1000] + "..." if len(page_text) > 1000 else page_text)

    print("\n--- Example with a non-existent page ---")
    print(browser.forward("http://example.com/nonexistentpage12345.html"))

    print("\n--- Example with an invalid URL format ---")
    print(browser.forward("www.google.com"))