import requests
from bs4 import BeautifulSoup

from smolagents.tools import Tool


class WebBrowser(Tool):
    """
    Retrieves information from online sources by browsing web pages.
    Useful for extracting or summarizing web content.
    """

    name = "web_browser"
    description = "Fetches the content of a web page and extracts its text. Input should be a valid URL."
    inputs = {'url': {'type': 'string', 'description': 'The URL of the web page to browse.'}}
    outputs = {'text_content': {'type': 'string', 'description': 'The extracted text content of the web page, or an error message.'}}
    output_type = "string"

    def __init__(self, user_agent="GAIA-Agent/1.0", *args, **kwargs):
        """
        Initializes the web browser with a user agent.

        Args:
            user_agent (str): The User-Agent string to use for requests.
        """
        super().__init__(*args, **kwargs)
        self.headers = {"User-Agent": user_agent}
        self.is_initialized = True  # Example of a tool state

    def forward(self, url: str) -> str:
        """
        Fetches the content of a web page and extracts its text.

        Args:
            url (str): The URL of the web page to browse.

        Returns:
            str: The extracted text content of the web page, or an error
                message if fetching or parsing fails. Error strings always
                begin with "Error:" so callers can detect failure cheaply.
        """
        # Reject anything that is not an absolute http(s) URL before
        # touching the network.
        if not url.startswith(('http://', 'https://')):
            return f"Error: Invalid URL format. URL must start with http:// or https://. Received: {url}"
        try:
            response = requests.get(url, headers=self.headers, timeout=15)
            response.raise_for_status()  # Raises an HTTPError for bad responses (4XX or 5XX)

            # Use BeautifulSoup to parse the HTML content
            soup = BeautifulSoup(response.content, 'html.parser')

            # Remove script and style elements so their contents do not
            # pollute the extracted text.
            for script_or_style in soup(["script", "style"]):
                script_or_style.decompose()

            # Get text
            text_from_soup = soup.get_text(separator='\n', strip=True)

            # Convert multiple newlines to a single newline and clean
            # spaces within lines.
            cleaned_lines = []
            for line in text_from_soup.splitlines():
                line = line.strip()  # Strip leading/trailing whitespace from the line itself
                if line:  # Only process non-empty lines
                    # Replace multiple spaces with a single space
                    cleaned_line = ' '.join(line.split())
                    cleaned_lines.append(cleaned_line)
            text = '\n'.join(cleaned_lines)

            if not text:
                return f"Error: No text content found at {url}."
            return text

        except requests.exceptions.HTTPError as http_err:
            return f"Error: HTTP error occurred while fetching {url}: {http_err}"
        except requests.exceptions.ConnectionError as conn_err:
            return f"Error: Connection error occurred while fetching {url}: {conn_err}"
        except requests.exceptions.Timeout as timeout_err:
            return f"Error: Timeout occurred while fetching {url}: {timeout_err}"
        except requests.exceptions.RequestException as req_err:
            return f"Error: An unexpected error occurred while fetching {url}: {req_err}"
        except Exception as e:
            # BeautifulSoup/parsing failures land here; report rather than raise
            # so the agent loop keeps running.
            return f"Error: An unexpected error occurred during parsing of {url}: {e}"


if __name__ == '__main__':
    browser = WebBrowser()  # Instantiation remains the same for testing

    # Example usage:
    # Note: For a real agent, the URL would come from the task or a search step.
    # This example uses a known Wikipedia page for demonstration.
    # For tasks like "How many studio albums were published by Mercedes Sosa...",
    # the agent would first need to find the relevant Wikipedia URL.
    test_url_wikipedia = "https://en.wikipedia.org/wiki/Mercedes_Sosa"
    print(f"--- Browsing: {test_url_wikipedia} ---")
    # For testing, call 'forward' directly
    content_wikipedia = browser.forward(test_url_wikipedia)
    if content_wikipedia.startswith("Error:"):
        print(content_wikipedia)
    else:
        # Print first 1000 characters for brevity in example
        print(content_wikipedia[:1000] + "..." if len(content_wikipedia) > 1000 else content_wikipedia)

    print("\n--- Example with a non-existent page ---")
    test_url_non_existent = "http://example.com/nonexistentpage12345.html"
    content_non_existent = browser.forward(test_url_non_existent)
    print(content_non_existent)

    print("\n--- Example with an invalid URL format ---")
    test_url_invalid_format = "www.google.com"
    content_invalid_format = browser.forward(test_url_invalid_format)
    print(content_invalid_format)