# HF_Agents_Final_Project / src / web_browsing_tool.py
# Author: Yago Bolivar
# refactor: update tool classes to inherit from Tool base class for
# consistency and improved structure (commit bffd09a)
import requests
from bs4 import BeautifulSoup
from smolagents.tools import Tool
class WebBrowser(Tool):
    """
    Tool that fetches a web page and returns its visible text content.

    Intended for agents that need to read or summarize online sources.
    All failures are reported as strings starting with "Error:" rather
    than raised, so callers can treat the result uniformly.
    """
    name = "web_browser"
    description = "Fetches the content of a web page and extracts its text. Input should be a valid URL."
    inputs = {'url': {'type': 'string', 'description': 'The URL of the web page to browse.'}}
    outputs = {'text_content': {'type': 'string', 'description': 'The extracted text content of the web page, or an error message.'}}
    output_type = "string"

    def __init__(self, user_agent="GAIA-Agent/1.0", *args, **kwargs):
        """
        Create the browser tool.

        Args:
            user_agent (str): User-Agent header value sent with every request.
        """
        super().__init__(*args, **kwargs)
        self.headers = {"User-Agent": user_agent}
        # Flag mirroring the smolagents tool-state convention.
        self.is_initialized = True

    def forward(self, url: str) -> str:
        """
        Download *url* and return its cleaned, script-free text.

        Args:
            url (str): Address of the page to fetch; must use http(s).

        Returns:
            str: Normalized page text, or a message starting with "Error:"
            describing why fetching or parsing failed.
        """
        # Guard clause: reject anything that is not an http(s) URL up front.
        if not url.startswith(('http://', 'https://')):
            return f"Error: Invalid URL format. URL must start with http:// or https://. Received: {url}"
        try:
            resp = requests.get(url, headers=self.headers, timeout=15)
            resp.raise_for_status()  # 4XX/5XX responses raise HTTPError

            parsed = BeautifulSoup(resp.content, 'html.parser')
            # Remove non-visible markup before extracting text.
            for tag in parsed(["script", "style"]):
                tag.decompose()

            raw = parsed.get_text(separator='\n', strip=True)
            # Drop blank lines and collapse runs of whitespace inside each line.
            kept = [' '.join(ln.split()) for ln in raw.splitlines() if ln.strip()]
            text = '\n'.join(kept)

            if not text:
                return f"Error: No text content found at {url}."
            return text
        except requests.exceptions.HTTPError as http_err:
            return f"Error: HTTP error occurred while fetching {url}: {http_err}"
        except requests.exceptions.ConnectionError as conn_err:
            return f"Error: Connection error occurred while fetching {url}: {conn_err}"
        except requests.exceptions.Timeout as timeout_err:
            return f"Error: Timeout occurred while fetching {url}: {timeout_err}"
        except requests.exceptions.RequestException as req_err:
            return f"Error: An unexpected error occurred while fetching {url}: {req_err}"
        except Exception as e:
            return f"Error: An unexpected error occurred during parsing of {url}: {e}"
if __name__ == '__main__':
    # Manual smoke test for the tool. A real agent would obtain URLs
    # from the task itself or from a preceding search step; here we use
    # a known Wikipedia page for demonstration (e.g. for tasks like
    # "How many studio albums were published by Mercedes Sosa...").
    browser = WebBrowser()

    demo_url = "https://en.wikipedia.org/wiki/Mercedes_Sosa"
    print(f"--- Browsing: {demo_url} ---")
    page_text = browser.forward(demo_url)
    if page_text.startswith("Error:"):
        print(page_text)
    else:
        # Truncate long pages so the demo output stays readable.
        print(page_text[:1000] + "..." if len(page_text) > 1000 else page_text)

    print("\n--- Example with a non-existent page ---")
    print(browser.forward("http://example.com/nonexistentpage12345.html"))

    print("\n--- Example with an invalid URL format ---")
    print(browser.forward("www.google.com"))