# Yago Bolivar
# feat: implement WebBrowser class for fetching and parsing web content with error handling
# c467d81
import os
import sys
import unittest
from unittest.mock import MagicMock, patch

import requests  # Imported for its exception types

# Add the parent directory to sys.path so the `src` package can be imported
# when this file is run from inside the tests directory.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Adjust the import path based on your project structure.
# If web_browsing_tool.py is in a 'src' directory:
from src.web_browsing_tool import WebBrowser  # noqa: E402  (must follow the sys.path tweak)
# If the tool lives in a 'tools' subdirectory next to app.py instead:
# from tools.web_browser import WebBrowser
class TestWebBrowser(unittest.TestCase):
    """Unit tests for ``WebBrowser.browse`` with the HTTP layer mocked out.

    Every test that needs a network response patches ``requests.get`` and
    receives the mock as ``mock_get``, so no real HTTP traffic is generated.
    """

    # Patch target for the HTTP call.
    # NOTE(review): assumes src.web_browsing_tool does `import requests` and
    # calls `requests.get(...)` -- confirm against the module and adjust the
    # target if it imports `get` differently.
    _REQUESTS_GET = "src.web_browsing_tool.requests.get"

    def setUp(self):
        """Create a fresh browser with a fixed User-Agent for each test."""
        self.browser = WebBrowser(user_agent="TestAgent/1.0")

    @staticmethod
    def _ok_response(content):
        """Build a mock HTTP 200 response whose body is *content* (bytes)."""
        response = MagicMock()
        response.status_code = 200
        response.content = content
        # Explicit mock so that raise_for_status() is a no-op for success cases.
        response.raise_for_status = MagicMock()
        return response

    @patch(_REQUESTS_GET)
    def test_browse_successful_fetch_and_parse(self, mock_get):
        """A 200 response yields the title and paragraph text, without script code."""
        mock_get.return_value = self._ok_response(
            b"<html><head><title>Test Page</title></head>"
            b"<body><p>Hello World!</p><script>alert('test');</script></body></html>"
        )
        url = "http://example.com/testpage"

        result = self.browser.browse(url)

        mock_get.assert_called_once_with(
            url, headers={"User-Agent": "TestAgent/1.0"}, timeout=15
        )
        self.assertEqual(result, "Test Page\nHello World!")

    @patch(_REQUESTS_GET)
    def test_browse_http_error(self, mock_get):
        """An HTTPError raised by the request is reported as an error string."""
        mock_get.side_effect = requests.exceptions.HTTPError(
            "404 Client Error: Not Found for url"
        )
        url = "http://example.com/notfound"

        result = self.browser.browse(url)

        mock_get.assert_called_once_with(
            url, headers={"User-Agent": "TestAgent/1.0"}, timeout=15
        )
        self.assertTrue(result.startswith("Error: HTTP error occurred"))
        self.assertIn("404 Client Error", result)

    @patch(_REQUESTS_GET)
    def test_browse_connection_error(self, mock_get):
        """A ConnectionError is reported as an error string with its message."""
        mock_get.side_effect = requests.exceptions.ConnectionError("Connection refused")
        url = "http://example.com/unreachable"

        result = self.browser.browse(url)

        self.assertTrue(result.startswith("Error: Connection error occurred"))
        self.assertIn("Connection refused", result)

    @patch(_REQUESTS_GET)
    def test_browse_timeout_error(self, mock_get):
        """A Timeout is reported as an error string with its message."""
        mock_get.side_effect = requests.exceptions.Timeout("Request timed out")
        url = "http://example.com/slowresponse"

        result = self.browser.browse(url)

        self.assertTrue(result.startswith("Error: Timeout occurred"))
        self.assertIn("Request timed out", result)

    @patch(_REQUESTS_GET)
    def test_browse_generic_request_exception(self, mock_get):
        """Any other RequestException falls through to the generic error message."""
        mock_get.side_effect = requests.exceptions.RequestException(
            "Some other request error"
        )
        url = "http://example.com/othererror"

        result = self.browser.browse(url)

        self.assertTrue(
            result.startswith("Error: An unexpected error occurred while fetching")
        )
        self.assertIn("Some other request error", result)

    def test_browse_invalid_url_format(self):
        """A URL without an http:// or https:// scheme is rejected.

        This test does not patch ``requests.get``; the expected result is
        the validation error message for the scheme-less URL.
        """
        url = "www.example.com"  # Missing http:// or https://

        result = self.browser.browse(url)

        self.assertEqual(
            result,
            "Error: Invalid URL format. URL must start with http:// or https://. "
            "Received: www.example.com",
        )

    @patch(_REQUESTS_GET)
    def test_browse_no_text_content(self, mock_get):
        """A page containing only <script>/<style> content yields a 'no text' error."""
        mock_get.return_value = self._ok_response(
            b"<html><head><script>var x=1;</script></head>"
            b"<body><style>.body {color:red;}</style></body></html>"
        )
        url = "http://example.com/notext"

        result = self.browser.browse(url)

        self.assertEqual(result, f"Error: No text content found at {url}.")

    @patch(_REQUESTS_GET)
    def test_browse_strips_extra_whitespace_and_newlines(self, mock_get):
        """Whitespace between elements collapses to single newlines in the output."""
        mock_get.return_value = self._ok_response(
            b"<html><body><p>Line 1</p> <p>Line 2</p>\n\n<p>Line\n3</p>"
            b"<div><span>Text</span></div></body></html>"
        )
        url = "http://example.com/whitespace"

        result = self.browser.browse(url)

        expected_text = "Line 1\nLine 2\nLine\n3\nText"
        self.assertEqual(result, expected_text)

    @patch(_REQUESTS_GET)
    def test_browse_for_question_answering_scenario_mercedes_sosa(self, mock_get):
        """
        Tests if the browser can extract relevant text for a question
        similar to the Mercedes Sosa studio albums count.
        """
        # Use a regular string for HTML content; it is encoded to UTF-8 below
        # so the accented titles exercise non-ASCII handling.
        mock_html_content_str = """
        <html>
        <head><title>Mercedes Sosa Discography</title></head>
        <body>
        <h1>Mercedes Sosa</h1>
        <h2>Studio Albums</h2>
        <ul>
        <li>1999 - Misa Criolla</li>
        <li>2002 - Acústico</li>
        <li>2005 - Corazón libre</li>
        <li>2009 - Cantora 1</li>
        <li>2011 - Canto para caminar</li>
        </ul>
        <h2>Live Albums</h2>
        <ul>
        <li>2000 - Live in Concert</li>
        </ul>
        </body>
        </html>
        """
        # Encode the string to bytes for the response content.
        mock_get.return_value = self._ok_response(
            mock_html_content_str.encode("utf-8")
        )
        url = "http://example.com/mercedes_sosa_discography"

        result = self.browser.browse(url)

        # Assert that key information is present in the extracted text.
        self.assertIn("Mercedes Sosa Discography", result)  # From title
        self.assertIn("Studio Albums", result)
        self.assertIn("1999 - Misa Criolla", result)
        self.assertIn("2002 - Acústico", result)
        self.assertIn("2005 - Corazón libre", result)
        self.assertIn("2009 - Cantora 1", result)
        self.assertIn("2011 - Canto para caminar", result)
        # Ensure it doesn't just grab everything indiscriminately or miss sections.
        self.assertIn("Live Albums", result)
        self.assertIn("2000 - Live in Concert", result)
        # A further step (outside this tool's direct responsibility but for agent context)
        # would be to pass this 'result' to an LLM with the question:
        # "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)?"
        # The LLM should be able to parse the structured list and count
        # "Acústico", "Corazón libre", "Cantora 1" -> 3.
if __name__ == "__main__":
    # Run the test suite when this file is executed directly.
    unittest.main()