# HF_Agents_Final_Project / tests / test_web_browser.py
# Yago Bolivar
# feat: implement WebBrowser class for fetching and parsing web content with error handling
# c467d81
import unittest
from unittest.mock import patch, MagicMock
import requests # Import requests for its exception types
import os
import sys
# Add the parent directory to sys.path to find the src module
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Adjust the import path to match your project structure.
# If web_browsing_tool.py lives in a 'src' directory:
from src.web_browsing_tool import WebBrowser
# If web_browser.py is in the same directory as app.py (and tools are in a 'tools' subdir):
# from tools.web_browser import WebBrowser
class TestWebBrowser(unittest.TestCase):
    """Unit tests for ``WebBrowser.browse``.

    Tests that fetch a page replace ``src.web_browsing_tool.requests.get``
    with a mock, so they never touch the network. Together they pin down:

    * the arguments ``browse`` passes to ``requests.get`` (URL, a
      ``User-Agent`` header and a 15 second timeout);
    * the text returned for a successfully fetched HTML page;
    * the ``"Error: ..."`` strings returned when the request fails, the URL
      is malformed, or the page contains no text.
    """

    USER_AGENT = "TestAgent/1.0"
    # Timeout value the tests expect ``browse`` to forward to ``requests.get``.
    EXPECTED_TIMEOUT = 15

    def setUp(self):
        """Create a fresh browser with a known User-Agent for every test."""
        self.browser = WebBrowser(user_agent=self.USER_AGENT)

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _make_response(content):
        """Return a mock HTTP 200 response whose ``content`` is ``content``.

        Args:
            content: Raw response body as ``bytes``.
        """
        response = MagicMock()
        response.status_code = 200
        response.content = content
        # Explicit no-op so the status check of a successful fetch never raises.
        response.raise_for_status = MagicMock()
        return response

    def _assert_single_get(self, mock_get, url):
        """Assert ``requests.get`` was called exactly once with the expected arguments."""
        mock_get.assert_called_once_with(
            url,
            headers={"User-Agent": self.USER_AGENT},
            timeout=self.EXPECTED_TIMEOUT,
        )

    def _assert_error(self, result, prefix, detail):
        """Assert ``result`` starts with ``prefix`` and mentions ``detail``.

        The explicit message makes a failure show the actual result instead
        of the uninformative "False is not true".
        """
        self.assertTrue(
            result.startswith(prefix),
            msg=f"{result!r} does not start with {prefix!r}",
        )
        self.assertIn(detail, result)

    # ------------------------------------------------------------------
    # Successful fetches
    # ------------------------------------------------------------------
    @patch('src.web_browsing_tool.requests.get')
    def test_browse_successful_fetch_and_parse(self, mock_get):
        """Title and paragraph text are returned; the script body is not."""
        mock_get.return_value = self._make_response(
            b"<html><head><title>Test Page</title></head><body><p>Hello World!</p><script>alert('test');</script></body></html>"
        )
        url = "http://example.com/testpage"

        result = self.browser.browse(url)

        self._assert_single_get(mock_get, url)
        self.assertEqual(result, "Test Page\nHello World!")

    @patch('src.web_browsing_tool.requests.get')
    def test_browse_no_text_content(self, mock_get):
        """A page holding only script and style elements yields the no-text error."""
        mock_get.return_value = self._make_response(
            b"<html><head><script>var x=1;</script></head><body><style>.body {color:red;}</style></body></html>"
        )
        url = "http://example.com/notext"

        result = self.browser.browse(url)

        self._assert_single_get(mock_get, url)
        self.assertEqual(result, f"Error: No text content found at {url}.")

    @patch('src.web_browsing_tool.requests.get')
    def test_browse_strips_extra_whitespace_and_newlines(self, mock_get):
        """Text fragments come back one per line without blank lines or padding."""
        mock_get.return_value = self._make_response(
            b"<html><body><p>Line 1</p> <p>Line 2</p>\n\n<p>Line\n3</p><div><span>Text</span></div></body></html>"
        )
        url = "http://example.com/whitespace"

        result = self.browser.browse(url)

        self._assert_single_get(mock_get, url)
        self.assertEqual(result, "Line 1\nLine 2\nLine\n3\nText")

    # ------------------------------------------------------------------
    # Request failures
    # ------------------------------------------------------------------
    @patch('src.web_browsing_tool.requests.get')
    def test_browse_http_error(self, mock_get):
        """An ``HTTPError`` from ``requests.get`` is reported as an HTTP error string."""
        mock_get.side_effect = requests.exceptions.HTTPError("404 Client Error: Not Found for url")
        url = "http://example.com/notfound"

        result = self.browser.browse(url)

        self._assert_single_get(mock_get, url)
        self._assert_error(result, "Error: HTTP error occurred", "404 Client Error")

    @patch('src.web_browsing_tool.requests.get')
    def test_browse_connection_error(self, mock_get):
        """A ``ConnectionError`` is reported as a connection error string."""
        mock_get.side_effect = requests.exceptions.ConnectionError("Connection refused")
        url = "http://example.com/unreachable"

        result = self.browser.browse(url)

        self._assert_single_get(mock_get, url)
        self._assert_error(result, "Error: Connection error occurred", "Connection refused")

    @patch('src.web_browsing_tool.requests.get')
    def test_browse_timeout_error(self, mock_get):
        """A ``Timeout`` is reported as a timeout error string."""
        mock_get.side_effect = requests.exceptions.Timeout("Request timed out")
        url = "http://example.com/slowresponse"

        result = self.browser.browse(url)

        self._assert_single_get(mock_get, url)
        self._assert_error(result, "Error: Timeout occurred", "Request timed out")

    @patch('src.web_browsing_tool.requests.get')
    def test_browse_generic_request_exception(self, mock_get):
        """Any other ``RequestException`` is reported as an unexpected fetch error."""
        mock_get.side_effect = requests.exceptions.RequestException("Some other request error")
        url = "http://example.com/othererror"

        result = self.browser.browse(url)

        self._assert_single_get(mock_get, url)
        self._assert_error(
            result,
            "Error: An unexpected error occurred while fetching",
            "Some other request error",
        )

    # ------------------------------------------------------------------
    # Input validation
    # ------------------------------------------------------------------
    def test_browse_invalid_url_format(self):
        """A URL without an http(s) scheme produces the invalid-format error."""
        # NOTE(review): requests.get is not patched here; this presumably
        # relies on browse() rejecting the URL before any request is made --
        # confirm against the WebBrowser implementation.
        url = "www.example.com"  # Missing http:// or https://

        result = self.browser.browse(url)

        self.assertEqual(result, "Error: Invalid URL format. URL must start with http:// or https://. Received: www.example.com")

    # ------------------------------------------------------------------
    # Question-answering scenario
    # ------------------------------------------------------------------
    @patch('src.web_browsing_tool.requests.get')
    def test_browse_for_question_answering_scenario_mercedes_sosa(self, mock_get):
        """
        Tests if the browser can extract relevant text for a question
        similar to the Mercedes Sosa studio albums count.

        The page contains non-ASCII titles and is served as UTF-8 bytes, so
        the test also checks that accented characters survive extraction.
        """
        mock_html_content_str = (
            "\n"
            "<html>\n"
            "<head><title>Mercedes Sosa Discography</title></head>\n"
            "<body>\n"
            "<h1>Mercedes Sosa</h1>\n"
            "<h2>Studio Albums</h2>\n"
            "<ul>\n"
            "<li>1999 - Misa Criolla</li>\n"
            "<li>2002 - Acústico</li>\n"
            "<li>2005 - Corazón libre</li>\n"
            "<li>2009 - Cantora 1</li>\n"
            "<li>2011 - Canto para caminar</li>\n"
            "</ul>\n"
            "<h2>Live Albums</h2>\n"
            "<ul>\n"
            "<li>2000 - Live in Concert</li>\n"
            "</ul>\n"
            "</body>\n"
            "</html>\n"
        )
        # The mocked response body must be bytes, hence the explicit encode.
        mock_get.return_value = self._make_response(mock_html_content_str.encode('utf-8'))
        url = "http://example.com/mercedes_sosa_discography"

        result = self.browser.browse(url)

        self._assert_single_get(mock_get, url)

        expected_fragments = (
            # From the <title> element
            "Mercedes Sosa Discography",
            # Studio album section
            "Studio Albums",
            "1999 - Misa Criolla",
            "2002 - Acústico",
            "2005 - Corazón libre",
            "2009 - Cantora 1",
            "2011 - Canto para caminar",
            # Ensure it doesn't just grab one section and miss the others
            "Live Albums",
            "2000 - Live in Concert",
        )
        for fragment in expected_fragments:
            # subTest reports every missing fragment instead of stopping at the first.
            with self.subTest(fragment=fragment):
                self.assertIn(fragment, result)

        # A further step (outside this tool's direct responsibility but for agent context)
        # would be to pass this 'result' to an LLM with the question:
        # "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)?"
        # The LLM should be able to parse the structured list and count
        # "Acústico", "Corazón libre", "Cantora 1" -> 3.
# Run the suite with unittest's command-line runner when this file is executed
# directly; importing the module does not start the runner.
if __name__ == "__main__":
    unittest.main()