import requests
from bs4 import BeautifulSoup

from smolagents.tools import Tool


class WebBrowser(Tool):
    """
    Retrieves information from online sources by browsing web pages.
    Useful for extracting or summarizing web content.
    """

    name = "web_browser"
    description = "Fetches the content of a web page and extracts its text. Input should be a valid URL."
    inputs = {'url': {'type': 'string', 'description': 'The URL of the web page to browse.'}}
    outputs = {'text_content': {'type': 'string', 'description': 'The extracted text content of the web page, or an error message.'}}
    output_type = "string"

    def __init__(self, user_agent="GAIA-Agent/1.0", *args, **kwargs):
        """
        Initializes the web browser with a user agent.

        Args:
            user_agent (str): The User-Agent string to use for requests.
        """
        super().__init__(*args, **kwargs)
        self.headers = {"User-Agent": user_agent}
        self.is_initialized = True  # Example of a tool state

    def forward(self, url: str) -> str:
        """
        Fetches the content of a web page and extracts its text.

        Args:
            url (str): The URL of the web page to browse.

        Returns:
            str: The extracted text content of the web page, or an error
                message if fetching or parsing fails. Error strings always
                begin with "Error:" so callers can detect failure cheaply.
        """
        # Reject anything that is not an absolute http(s) URL before
        # touching the network.
        if not url.startswith(('http://', 'https://')):
            return f"Error: Invalid URL format. URL must start with http:// or https://. Received: {url}"
        try:
            response = requests.get(url, headers=self.headers, timeout=15)
            response.raise_for_status()  # Raises an HTTPError for bad responses (4XX or 5XX)

            # Use BeautifulSoup to parse the HTML content
            soup = BeautifulSoup(response.content, 'html.parser')

            # Remove script and style elements so their contents do not
            # pollute the extracted text.
            for script_or_style in soup(["script", "style"]):
                script_or_style.decompose()

            # Get text
            text_from_soup = soup.get_text(separator='\n', strip=True)

            # Convert multiple newlines to a single newline and clean
            # spaces within lines.
            cleaned_lines = []
            for line in text_from_soup.splitlines():
                line = line.strip()  # Strip leading/trailing whitespace from the line itself
                if line:  # Only process non-empty lines
                    # Replace multiple spaces with a single space
                    cleaned_line = ' '.join(line.split())
                    cleaned_lines.append(cleaned_line)
            text = '\n'.join(cleaned_lines)

            if not text:
                return f"Error: No text content found at {url}."
            return text

        except requests.exceptions.HTTPError as http_err:
            return f"Error: HTTP error occurred while fetching {url}: {http_err}"
        except requests.exceptions.ConnectionError as conn_err:
            return f"Error: Connection error occurred while fetching {url}: {conn_err}"
        except requests.exceptions.Timeout as timeout_err:
            return f"Error: Timeout occurred while fetching {url}: {timeout_err}"
        except requests.exceptions.RequestException as req_err:
            return f"Error: An unexpected error occurred while fetching {url}: {req_err}"
        except Exception as e:
            # BeautifulSoup/parsing failures land here; report rather than raise
            # so the agent loop keeps running.
            return f"Error: An unexpected error occurred during parsing of {url}: {e}"


if __name__ == '__main__':
    browser = WebBrowser()  # Instantiation remains the same for testing

    # Example usage:
    # Note: For a real agent, the URL would come from the task or a search step.
    # This example uses a known Wikipedia page for demonstration.
    # For tasks like "How many studio albums were published by Mercedes Sosa...",
    # the agent would first need to find the relevant Wikipedia URL.
    test_url_wikipedia = "https://en.wikipedia.org/wiki/Mercedes_Sosa"
    print(f"--- Browsing: {test_url_wikipedia} ---")
    # For testing, call 'forward' directly
    content_wikipedia = browser.forward(test_url_wikipedia)
    if content_wikipedia.startswith("Error:"):
        print(content_wikipedia)
    else:
        # Print first 1000 characters for brevity in example
        print(content_wikipedia[:1000] + "..." if len(content_wikipedia) > 1000 else content_wikipedia)

    print("\n--- Example with a non-existent page ---")
    test_url_non_existent = "http://example.com/nonexistentpage12345.html"
    content_non_existent = browser.forward(test_url_non_existent)
    print(content_non_existent)

    print("\n--- Example with an invalid URL format ---")
    test_url_invalid_format = "www.google.com"
    content_invalid_format = browser.forward(test_url_invalid_format)
    print(content_invalid_format)