# Author: Yago Bolivar
# refactor: update tool classes to inherit from Tool base class for consistency and improved structure
# Commit: bffd09a
import requests
from bs4 import BeautifulSoup
from smolagents.tools import Tool
class WebBrowser(Tool):
    """
    Retrieves information from online sources by browsing web pages.
    Useful for extracting or summarizing web content.
    """
    name = "web_browser"
    description = "Fetches the content of a web page and extracts its text. Input should be a valid URL."
    inputs = {'url': {'type': 'string', 'description': 'The URL of the web page to browse.'}}
    outputs = {'text_content': {'type': 'string', 'description': 'The extracted text content of the web page, or an error message.'}}
    output_type = "string"

    def __init__(self, user_agent="GAIA-Agent/1.0", *args, **kwargs):
        """
        Initializes the web browser with a user agent.

        Args:
            user_agent (str): The User-Agent string to use for requests.
        """
        super().__init__(*args, **kwargs)
        self.headers = {"User-Agent": user_agent}
        self.is_initialized = True  # Example of a tool state

    def forward(self, url: str) -> str:
        """
        Fetches the content of a web page and extracts its text.

        Args:
            url (str): The URL of the web page to browse.

        Returns:
            str: The extracted text content of the web page, or an error message
                 if fetching or parsing fails.
        """
        # Normalize before validating: tolerate surrounding whitespace and an
        # upper/mixed-case scheme (URL schemes are case-insensitive, RFC 3986).
        url = url.strip()
        if not url.lower().startswith(('http://', 'https://')):
            return f"Error: Invalid URL format. URL must start with http:// or https://. Received: {url}"
        try:
            response = requests.get(url, headers=self.headers, timeout=15)
            response.raise_for_status()  # Raises an HTTPError for bad responses (4XX or 5XX)

            # Parse the HTML and drop non-content elements before extracting text.
            soup = BeautifulSoup(response.content, 'html.parser')
            for script_or_style in soup(["script", "style"]):
                script_or_style.decompose()

            # get_text(strip=True) still leaves blank lines and runs of internal
            # whitespace; collapse each line's spacing and drop empty lines.
            text_from_soup = soup.get_text(separator='\n', strip=True)
            cleaned_lines = []
            for line in text_from_soup.splitlines():
                line = line.strip()
                if line:
                    cleaned_lines.append(' '.join(line.split()))
            text = '\n'.join(cleaned_lines)

            if not text:
                return f"Error: No text content found at {url}."
            return text
        except requests.exceptions.HTTPError as http_err:
            return f"Error: HTTP error occurred while fetching {url}: {http_err}"
        # Timeout is checked before ConnectionError: requests' ConnectTimeout
        # inherits from BOTH, and should be reported as a timeout, not as a
        # generic connection failure.
        except requests.exceptions.Timeout as timeout_err:
            return f"Error: Timeout occurred while fetching {url}: {timeout_err}"
        except requests.exceptions.ConnectionError as conn_err:
            return f"Error: Connection error occurred while fetching {url}: {conn_err}"
        except requests.exceptions.RequestException as req_err:
            return f"Error: An unexpected error occurred while fetching {url}: {req_err}"
        except Exception as e:
            return f"Error: An unexpected error occurred during parsing of {url}: {e}"
if __name__ == '__main__':
    # Smoke-test the tool by calling forward() directly rather than routing
    # through an agent. In real use, the URL would come from the task itself
    # or from a preceding search step.
    browser = WebBrowser()

    # A known Wikipedia page for demonstration. For tasks like "How many
    # studio albums were published by Mercedes Sosa...", the agent would
    # first have to locate the relevant Wikipedia URL on its own.
    test_url_wikipedia = "https://en.wikipedia.org/wiki/Mercedes_Sosa"
    print(f"--- Browsing: {test_url_wikipedia} ---")
    content_wikipedia = browser.forward(test_url_wikipedia)
    if content_wikipedia.startswith("Error:"):
        print(content_wikipedia)
    elif len(content_wikipedia) > 1000:
        # Keep the example output short: first 1000 characters only.
        print(content_wikipedia[:1000] + "...")
    else:
        print(content_wikipedia)

    print("\n--- Example with a non-existent page ---")
    print(browser.forward("http://example.com/nonexistentpage12345.html"))

    print("\n--- Example with an invalid URL format ---")
    print(browser.forward("www.google.com"))