File size: 4,804 Bytes
c467d81
 
bffd09a
c467d81
bffd09a
c467d81
bffd09a
b1939df
c467d81
bffd09a
 
 
 
 
c467d81
bffd09a
c467d81
 
 
 
 
bffd09a
c467d81
bffd09a
c467d81
bffd09a
c467d81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bffd09a
 
c467d81
 
 
bffd09a
c467d81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bffd09a
c467d81
 
 
 
 
 
 
 
 
bffd09a
 
c467d81
 
 
 
 
 
 
 
bffd09a
c467d81
 
 
 
bffd09a
c467d81
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import requests
from bs4 import BeautifulSoup
from smolagents.tools import Tool

class WebBrowser(Tool):
    """
    Retrieves information from online sources by browsing web pages.
    Useful for extracting or summarizing web content.
    """
    name = "web_browser"
    description = "Fetches the content of a web page and extracts its text. Input should be a valid URL."
    inputs = {'url': {'type': 'string', 'description': 'The URL of the web page to browse.'}}
    outputs = {'text_content': {'type': 'string', 'description': 'The extracted text content of the web page, or an error message.'}}
    output_type = "string"

    def __init__(self, user_agent="GAIA-Agent/1.0", *args, timeout=15, **kwargs):
        """
        Initializes the web browser with a user agent and request timeout.

        Args:
            user_agent (str): The User-Agent string to use for requests.
            timeout (int | float): Seconds to wait for a response before
                giving up. Keyword-only so existing positional callers are
                unaffected; defaults to the previous hard-coded value (15).
        """
        super().__init__(*args, **kwargs)
        self.headers = {"User-Agent": user_agent}
        # Previously hard-coded inside forward(); now configurable per instance.
        self.timeout = timeout
        self.is_initialized = True  # Example of a tool state

    def forward(self, url: str) -> str:
        """
        Fetches the content of a web page and extracts its text.

        Args:
            url (str): The URL of the web page to browse.

        Returns:
            str: The extracted text content of the web page, or an error message
                 if fetching or parsing fails.
        """
        if not url.startswith(('http://', 'https://')):
            return f"Error: Invalid URL format. URL must start with http:// or https://. Received: {url}"

        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()  # Raises an HTTPError for bad responses (4XX or 5XX)

            # Parse the HTML and drop non-content elements before extracting text.
            soup = BeautifulSoup(response.content, 'html.parser')
            for script_or_style in soup(["script", "style"]):
                script_or_style.decompose()

            text_from_soup = soup.get_text(separator='\n', strip=True)

            # Normalize whitespace: collapse runs of spaces within each line
            # and drop blank lines entirely.
            text = '\n'.join(
                ' '.join(line.split())
                for line in text_from_soup.splitlines()
                if line.strip()
            )

            if not text:
                return f"Error: No text content found at {url}."

            return text

        except requests.exceptions.HTTPError as http_err:
            return f"Error: HTTP error occurred while fetching {url}: {http_err}"
        except requests.exceptions.ConnectionError as conn_err:
            return f"Error: Connection error occurred while fetching {url}: {conn_err}"
        except requests.exceptions.Timeout as timeout_err:
            return f"Error: Timeout occurred while fetching {url}: {timeout_err}"
        except requests.exceptions.RequestException as req_err:
            return f"Error: An unexpected error occurred while fetching {url}: {req_err}"
        except Exception as e:
            return f"Error: An unexpected error occurred during parsing of {url}: {e}"

if __name__ == '__main__':
    # Quick manual smoke test for the tool. In a real agent run, the URL
    # would come from the task or an earlier search step (e.g. for a task
    # like "How many studio albums were published by Mercedes Sosa...",
    # the agent would first locate the relevant Wikipedia page).
    browser = WebBrowser()

    demo_url = "https://en.wikipedia.org/wiki/Mercedes_Sosa"
    print(f"--- Browsing: {demo_url} ---")
    # Call 'forward' directly for testing purposes.
    result = browser.forward(demo_url)
    if result.startswith("Error:"):
        print(result)
    else:
        # Truncate long pages so the demo output stays readable.
        print(result[:1000] + "..." if len(result) > 1000 else result)

    print("\n--- Example with a non-existent page ---")
    print(browser.forward("http://example.com/nonexistentpage12345.html"))

    print("\n--- Example with an invalid URL format ---")
    print(browser.forward("www.google.com"))