Yago Bolivar committed · Commit b09a8ba · 1 Parent(s): baa65ee

feat: Enhance tools with new web content extractor and improved functionality

- Added WebContentExtractor for structured content extraction from websites, including specialized handling for Wikipedia.
- Updated WebBrowser to support multiple extraction modes (text, tables, lists, structured) and improved error handling.
- Enhanced CodeExecutionTool with utility functions for web data processing and robust error handling.
- Improved logging across tools for better debugging and traceability.
- app.py +7 -4
- src/python_tool.py +211 -62
- src/web_browsing_tool.py +365 -42
- src/web_content_extractor.py +410 -0
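
Taken together, the commit adds a WebContentExtractor tool, registers it in app.py, and upgrades WebBrowser and CodeExecutionTool. A minimal smoke-test sketch of the updated tools, calling them directly rather than through the agent (class names and forward() signatures come from the diffs below; the URLs and snippet are illustrative):

```python
from src.web_browsing_tool import WebBrowser
from src.web_content_extractor import WebContentExtractor
from src.python_tool import CodeExecutionTool

browser = WebBrowser()
extractor = WebContentExtractor()
python_tool = CodeExecutionTool()

# Structured extraction from a Wikipedia article (uses the REST API path when available).
page = browser.forward("https://en.wikipedia.org/wiki/Python_(programming_language)",
                       extraction_mode="structured")

# Targeted lookup of a single infobox field.
fact = extractor.forward("https://en.wikipedia.org/wiki/Python_(programming_language)",
                         target_type="specific_data",
                         extraction_details={"data_label": "Designed by"})

# Post-process with the sandboxed Python executor.
run = python_tool.forward(code_string="result = 2 + 2\nprint(result)")

print(page.get("data", {}).get("title"), "|", fact.get("value"), "|", run.get("result_value"))
```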
app.py
CHANGED
@@ -13,12 +13,13 @@ from src.final_answer_tool import FinalAnswerTool
 from src.web_browsing_tool import WebBrowser
 from src.file_processing_tool import FileIdentifier
 from src.image_processing_tool import ImageProcessor
+from src.markdown_table_parser import MarkdownTableParserTool
 from src.python_tool import CodeExecutionTool
+from src.speech_to_text import SpeechToTextTool
 from src.spreadsheet_tool import SpreadsheetTool
 from src.text_reversal_tool import TextReversalTool
 from src.video_processing_tool import VideoProcessingTool
+from src.web_content_extractor import WebContentExtractor

 # (Keep Constants as is)
 # --- Constants ---
@@ -67,7 +68,8 @@ python_tool = CodeExecutionTool()
 speech_to_text_tool = SpeechToTextTool()  # Updated
 spreadsheet_tool = SpreadsheetTool()
 text_reversal_tool = TextReversalTool()
+video_processing_tool = VideoProcessingTool()
+web_content_extractor = WebContentExtractor()  # Instantiate the new extractor tool

 # Add debug prints for file paths
 print("Current directory:", os.getcwd())
@@ -160,7 +162,8 @@ agent_tools = [
     speech_to_text_tool,  # Updated
     spreadsheet_tool,
     text_reversal_tool,
+    video_processing_tool,
+    web_content_extractor  # Add the new tool here
 ]

 # Flatten system_prompt if it's a dict (e.g., from YAML)
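
The app.py hunks only show the tool list growing; the agent construction itself is outside the diff. For orientation, a hedged sketch of how a smolagents agent typically consumes such a list (the CodeAgent and InferenceClientModel names are assumptions about the installed smolagents version, not taken from this commit):

```python
# Assumed wiring, not shown in this diff: agent_tools feeds a smolagents agent.
from smolagents import CodeAgent, InferenceClientModel  # class names depend on the smolagents version

agent = CodeAgent(
    tools=agent_tools,             # the list assembled above, now including web_content_extractor
    model=InferenceClientModel(),  # placeholder; app.py may configure a different model
)
print(agent.run("Which tools can you use to read a Wikipedia table?"))
```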
src/python_tool.py
CHANGED
@@ -7,21 +7,32 @@ import traceback
 from typing import Dict, Any, Optional, Union, List
 from smolagents.tools import Tool
 import os
+import logging
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)

 class CodeExecutionTool(Tool):
     """
+    Executes Python code snippets safely with timeout protection.
+    Useful for data processing, analysis, and transformation.
+    Includes special utilities for web data processing and robust error handling.
     """
+    name = "python_executor"
+    description = "Safely executes Python code with enhancements for data processing, parsing, and error recovery."
     inputs = {
+        'code_string': {'type': 'string', 'description': 'The Python code to execute.', 'nullable': True},
+        'filepath': {'type': 'string', 'description': 'Path to a Python file to execute.', 'nullable': True}
+    }
+    outputs = {
+        'success': {'type': 'boolean', 'description': 'Whether the code executed successfully.'},
+        'output': {'type': 'string', 'description': 'The captured stdout or formatted result.', 'nullable': True},
+        'error': {'type': 'string', 'description': 'Error message if execution failed.', 'nullable': True},
+        'result_value': {'type': 'any', 'description': 'The final expression value if applicable.', 'nullable': True}
     }
     output_type = "object"
+
     def __init__(self, timeout: int = 10, max_output_size: int = 20000, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.timeout = timeout
@@ -31,6 +42,124 @@ class CodeExecutionTool(Tool):
             'pickle', 'requests', 'socket', 'shutil', 'ctypes', 'multiprocessing'
         ]
         self.is_initialized = True
+        # Add utility functions that will be available to executed code
+        self._utility_functions = self._get_utility_functions()
+
+    def _get_utility_functions(self):
+        """Define utility functions that will be available in the executed code"""
+        utility_code = """
+# Utility functions for web data processing
+def extract_pattern(text, pattern, group=0, all_matches=False):
+    """
+    Extract data using regex pattern from text.
+    Args:
+        text (str): Text to search in
+        pattern (str): Regex pattern to use
+        group (int): Capture group to return (default 0 - entire match)
+        all_matches (bool): If True, return all matches, otherwise just first
+    Returns:
+        Matched string(s) or None if no match
+    """
+    import re
+    if not text or not pattern:
+        print("Warning: Empty text or pattern provided to extract_pattern")
+        return None
+
+    try:
+        matches = re.finditer(pattern, text)
+        results = [m.group(group) if group < len(m.groups())+1 else m.group(0) for m in matches]
+
+        if not results:
+            print(f"No matches found for pattern '{pattern}'")
+            return None
+
+        if all_matches:
+            return results
+        else:
+            return results[0]
+    except Exception as e:
+        print(f"Error in extract_pattern: {e}")
+        return None
+
+def clean_text(text, remove_extra_whitespace=True, remove_special_chars=False):
+    """
+    Clean text by removing extra whitespace and optionally special characters.
+    Args:
+        text (str): Text to clean
+        remove_extra_whitespace (bool): If True, replace multiple spaces with single space
+        remove_special_chars (bool): If True, remove special characters
+    Returns:
+        Cleaned string
+    """
+    import re
+    if not text:
+        return ""
+
+    # Replace newlines and tabs with spaces
+    text = re.sub(r'[\\n\\t\\r]+', ' ', text)
+
+    if remove_special_chars:
+        # Keep only alphanumeric, spaces, and basic punctuation
+        text = re.sub(r'[^\\w\\s.,;:!?\'"()-]', '', text)
+
+    if remove_extra_whitespace:
+        # Replace multiple spaces with single space
+        text = re.sub(r'\\s+', ' ', text)
+
+    return text.strip()
+
+def parse_table_text(table_text):
+    """
+    Parse table-like text into list of rows
+    Args:
+        table_text (str): Text containing table-like data
+    Returns:
+        List of rows (each row is a list of cells)
+    """
+    rows = []
+    lines = table_text.strip().split('\\n')
+
+    for line in lines:
+        # Skip empty lines
+        if not line.strip():
+            continue
+
+        # Split by whitespace or common separators
+        cells = re.split(r'\\s{2,}|\\t+|\\|+', line.strip())
+        # Clean up cells
+        cells = [cell.strip() for cell in cells if cell.strip()]
+
+        if cells:
+            rows.append(cells)
+
+    # Print parsing result for debugging
+    print(f"Parsed {len(rows)} rows from table text")
+    if rows and len(rows) > 0:
+        print(f"First row (columns: {len(rows[0])}): {rows[0]}")
+
+    return rows
+
+def safe_float(text):
+    """
+    Safely convert text to float, handling various formats.
+    Args:
+        text (str): Text to convert
+    Returns:
+        float or None if conversion fails
+    """
+    if not text:
+        return None
+
+    # Remove currency symbols, commas in numbers, etc.
+    text = re.sub(r'[^0-9.-]', '', str(text))
+
+    try:
+        return float(text)
+    except ValueError:
+        print(f"Warning: Could not convert '{text}' to float")
+        return None
+"""
+        return utility_code

     def _analyze_code_safety(self, code: str) -> Dict[str, Any]:
         """Perform static analysis to check for potentially harmful code."""
@@ -68,34 +197,40 @@ class CodeExecutionTool(Tool):
             return {"safe": True}
         except SyntaxError:
             return {"safe": False, "reason": "Invalid Python syntax"}
+
     def _timeout_handler(self, signum, frame):
         """Handler for timeout signal."""
+        raise TimeoutError(f"Code execution timed out after {self.timeout} seconds")
+
     def _extract_numeric_value(self, output: str) -> Optional[Union[int, float]]:
         """Extract the final numeric value from output."""
+        if not output:
+            return None

+        # Look for the last line that contains a number
+        lines = output.strip().split('\n')
         for line in reversed(lines):
+            # Try to interpret it as a pure number
+            line = line.strip()
             try:
+                if '.' in line:
+                    return float(line)
+                else:
+                    return int(line)
             except ValueError:
+                # Not a pure number, try to extract numbers with regex
+                match = re.search(r'[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?$', line)
+                if match:
+                    num_str = match.group(0)
+                    try:
+                        if '.' in num_str:
+                            return float(num_str)
+                        else:
+                            return int(num_str)
+                    except ValueError:
+                        pass
         return None
+
     def forward(self, code_string: Optional[str] = None, filepath: Optional[str] = None) -> Dict[str, Any]:
         if not code_string and not filepath:
             return {"success": False, "error": "No code string or filepath provided."}
@@ -116,56 +251,70 @@ class CodeExecutionTool(Tool):
         elif code_string:
             code_to_execute = code_string

+        # Inject utility functions
+        enhanced_code = self._utility_functions + "\n\n" + code_to_execute
+
+        return self._execute_actual_code(enhanced_code)

     def _execute_actual_code(self, code: str) -> Dict[str, Any]:
         """Execute Python code and capture the output or error."""
         safety_check = self._analyze_code_safety(code)
         if not safety_check["safe"]:
+            return {
+                "success": False,
+                "error": f"Safety check failed: {safety_check['reason']}"
+            }
+
+        # Capture stdout and execute the code with a timeout
+        stdout_buffer = io.StringIO()
+        result_value = None
+
         try:
+            # Set timeout handler
+            signal.signal(signal.SIGALRM, self._timeout_handler)
+            signal.alarm(self.timeout)
+
+            # Execute code and capture stdout
+            with contextlib.redirect_stdout(stdout_buffer):
+                # Execute the code within a new dictionary for local variables
+                local_vars = {}
+                exec(code, {}, local_vars)
+
+                # Try to extract the result from common variable names
+                for var_name in ['result', 'answer', 'output', 'value', 'final_result', 'data']:
+                    if var_name in local_vars:
+                        result_value = local_vars[var_name]
+                        break
+
+            # Reset the alarm
+            signal.alarm(0)

+            # Get the captured output
+            output = stdout_buffer.getvalue()
             if len(output) > self.max_output_size:
+                output = output[:self.max_output_size] + f"\n... (output truncated, exceeded {self.max_output_size} characters)"

+            # If no result_value was found, try to extract a numeric value from the output
+            if result_value is None:
+                result_value = self._extract_numeric_value(output)

             return {
+                "success": True,
                 "output": output,
+                "result_value": result_value
             }
+
+        except TimeoutError as e:
+            signal.alarm(0)  # Reset the alarm
+            return {"success": False, "error": f"Code execution timed out after {self.timeout} seconds"}
         except Exception as e:
+            signal.alarm(0)  # Reset the alarm
+            trace = traceback.format_exc()
+            error_msg = f"Error executing code: {str(e)}\n{trace}"
+            return {"success": False, "error": error_msg}
         finally:
+            # Ensure the alarm is reset
+            signal.alarm(0)

     # Kept execute_file and execute_code as helper methods if direct access is ever needed,
     # but they now call the main _execute_actual_code method.
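
Because forward() now prepends the utility source to every snippet, agent-generated code can call extract_pattern, clean_text, parse_table_text and safe_float without defining them. A minimal sketch of that flow (the input string is invented, and it assumes the injected utility block compiles as committed):

```python
tool = CodeExecutionTool(timeout=5)

snippet = r"""
raw = "Price:  $1,299.99   (March  2024)"
cleaned = clean_text(raw)                                    # injected utility
price = safe_float(extract_pattern(cleaned, r'\$[\d,.]+'))   # injected utilities
result = price                                               # picked up via the 'result' convention
print(result)
"""

outcome = tool.forward(code_string=snippet)
print(outcome["success"], outcome.get("result_value"))       # e.g. True 1299.99
```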
src/web_browsing_tool.py
CHANGED
@@ -1,17 +1,31 @@
 import requests
 from bs4 import BeautifulSoup
 from smolagents.tools import Tool
+import re
+import json
+import logging
+import time
+from urllib.parse import urlparse, urljoin
+import pandas as pd
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)

 class WebBrowser(Tool):
     """
     Retrieves information from online sources by browsing web pages.
+    Useful for extracting or summarizing web content, with special handling for structured data.
+    Can extract tables, lists, and key information from web pages.
     """
     name = "web_browser"
+    description = "Fetches content from web pages with improved structured data handling. Has specialized extraction for Wikipedia. Returns text content or structured data."
+    inputs = {
+        'url': {'type': 'string', 'description': 'The URL of the web page to browse.'},
+        'extraction_mode': {'type': 'string', 'description': 'Mode for data extraction: "text" (default), "tables", "lists", or "structured".', 'nullable': True}
+    }
+    outputs = {'content': {'type': 'object', 'description': 'The extracted content from the web page, either as text or structured data.'}}
+    output_type = "object"

     def __init__(self, user_agent="GAIA-Agent/1.0", *args, **kwargs):
         """
@@ -21,62 +35,371 @@ class WebBrowser(Tool):
         """
         super().__init__(*args, **kwargs)
         self.headers = {"User-Agent": user_agent}
+        self.is_initialized = True
+        # Add a session to maintain cookies
+        self.session = requests.Session()
+        self.session.headers.update(self.headers)

+    def forward(self, url: str, extraction_mode: str = "text") -> dict:
         """
+        Fetches the content of a web page and extracts information based on the specified mode.

         Args:
             url (str): The URL of the web page to browse.
+            extraction_mode (str): The mode for data extraction - "text" (default), "tables", "lists", or "structured"

         Returns:
+            dict: The extracted content or an error message
         """
+        # Validate URL
         if not url.startswith(('http://', 'https://')):
+            return {"error": f"Invalid URL format. URL must start with http:// or https://. Received: {url}"}

         try:
+            # Check if it's Wikipedia and use special handling
+            if 'wikipedia.org' in url:
+                return self._handle_wikipedia(url, extraction_mode)
+
+            # Process normal web pages
+            return self._process_regular_webpage(url, extraction_mode)
+
+        except requests.exceptions.HTTPError as http_err:
+            return {"error": f"HTTP error occurred while fetching {url}: {http_err}"}
+        except requests.exceptions.ConnectionError as conn_err:
+            return {"error": f"Connection error occurred while fetching {url}: {conn_err}"}
+        except requests.exceptions.Timeout as timeout_err:
+            return {"error": f"Timeout occurred while fetching {url}: {timeout_err}"}
+        except requests.exceptions.RequestException as req_err:
+            return {"error": f"An unexpected error occurred while fetching {url}: {req_err}"}
+        except Exception as e:
+            return {"error": f"An unexpected error occurred during parsing of {url}: {e}"}

+    def _process_regular_webpage(self, url, extraction_mode):
+        """Process a regular (non-Wikipedia) webpage"""
+        response = self.session.get(url, timeout=15)
+        response.raise_for_status()
+
+        # Use BeautifulSoup to parse the HTML content
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        # Remove script and style elements
+        for script_or_style in soup(["script", "style"]):
+            script_or_style.decompose()
+
+        if extraction_mode == "text":
+            return self._extract_text(soup, url)
+        elif extraction_mode == "tables":
+            return self._extract_tables(soup, url)
+        elif extraction_mode == "lists":
+            return self._extract_lists(soup, url)
+        elif extraction_mode == "structured":
+            return self._extract_structured_data(soup, url)
+        else:
+            return {"error": f"Unknown extraction mode: {extraction_mode}"}

+    def _handle_wikipedia(self, url, extraction_mode):
+        """Special handling for Wikipedia pages"""
+        # For Wikipedia, try to use the API instead of scraping the HTML
+        parsed_url = urlparse(url)
+        if not parsed_url.netloc.endswith('wikipedia.org'):
+            return self._process_regular_webpage(url, extraction_mode)
+
+        # Extract the title from the URL path
+        path_parts = parsed_url.path.split('/')
+        if len(path_parts) < 3 or path_parts[1] != 'wiki':
+            # Not a standard Wikipedia article URL
+            return self._process_regular_webpage(url, extraction_mode)
+
+        title = path_parts[2]
+        lang = parsed_url.netloc.split('.')[0]
+
+        # Use Wikipedia API to get structured content
+        api_url = f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{title}"
+
+        try:
+            logger.info(f"Fetching Wikipedia API data from {api_url}")
+            api_response = self.session.get(api_url, timeout=15)
+            api_response.raise_for_status()
+            api_data = api_response.json()
+
+            # Basic information from the API
+            wiki_data = {
+                "title": api_data.get("title", ""),
+                "description": api_data.get("description", ""),
+                "extract": api_data.get("extract", ""),
+                "url": api_data.get("content_urls", {}).get("desktop", {}).get("page", url)
+            }
+
+            # If we need more detailed data beyond the summary
+            if extraction_mode in ["tables", "structured"]:
+                # Get the full HTML anyway for tables and other structured data
+                response = self.session.get(url, timeout=15)
+                response.raise_for_status()
+                soup = BeautifulSoup(response.content, 'html.parser')
+
+                # Add tables to the response
+                tables = self._extract_tables(soup, url, return_raw=False)
+                wiki_data["tables"] = tables.get("tables", [])
+
+                # For "structured" mode, add sections, infobox and other elements
+                if extraction_mode == "structured":
+                    wiki_data["infobox"] = self._extract_wikipedia_infobox(soup)
+                    wiki_data["sections"] = self._extract_wikipedia_sections(soup)
+
+                return {
+                    "source": "wikipedia_api_enhanced",
+                    "url": url,
+                    "data": wiki_data
+                }
+
+            # For basic text, return the API data
+            return {
+                "source": "wikipedia_api",
+                "url": url,
+                "data": wiki_data
+            }
+
+        except (requests.exceptions.RequestException, ValueError) as e:
+            logger.warning(f"Wikipedia API request failed: {e}. Falling back to HTML scraping.")
+            # Fallback to normal HTML processing
+            return self._process_regular_webpage(url, extraction_mode)

+    def _extract_text(self, soup, url):
+        """Extract clean text from the page"""
+        text_from_soup = soup.get_text(separator='\n', strip=True)

+        # Convert multiple newlines to a single newline and clean spaces within lines
+        cleaned_lines = []
+        for line in text_from_soup.splitlines():
+            line = line.strip()  # Strip leading/trailing whitespace
+            if line:  # Only process non-empty lines
+                # Replace multiple spaces with a single space
+                cleaned_line = ' '.join(line.split())
+                cleaned_lines.append(cleaned_line)

+        text = '\n'.join(cleaned_lines)

+        if not text:
+            return {"error": f"No text content found at {url}."}

+        return {
+            "source": "web_page",
+            "url": url,
+            "content_type": "text",
+            "text": text
+        }

+    def _extract_tables(self, soup, url, return_raw=True):
+        """Extract tables from the page"""
+        tables = []
+
+        # Find all table elements
+        html_tables = soup.find_all('table')
+
+        for i, table in enumerate(html_tables):
+            try:
+                # Try to convert to a pandas DataFrame
+                dfs = pd.read_html(str(table))
+
+                if dfs:
+                    # Convert each DataFrame to a dict for JSON serialization
+                    for j, df in enumerate(dfs):
+                        # Clean column names
+                        df.columns = [str(col).strip() for col in df.columns]
+
+                        # Convert DataFrame to dict
+                        table_dict = {
+                            "table_id": f"table_{i}_{j}",
+                            "headers": df.columns.tolist(),
+                            "rows": df.values.tolist(),
+                        }
+                        tables.append(table_dict)
+            except Exception as e:
+                logger.warning(f"Failed to parse table {i}: {e}")
+                # Try a manual extraction
+                try:
+                    headers = []
+                    header_row = table.find('tr')
+                    if header_row:
+                        headers = [th.get_text(strip=True) for th in header_row.find_all(['th', 'td'])]
+
+                    rows = []
+                    for tr in table.find_all('tr'):
+                        row = [td.get_text(strip=True) for td in tr.find_all(['td', 'th'])]
+                        if row and row != headers:  # Skip header row in data
+                            rows.append(row)
+
+                    if headers or rows:
+                        tables.append({
+                            "table_id": f"table_{i}_manual",
+                            "headers": headers,
+                            "rows": rows
+                        })
+                except Exception:
+                    continue  # Skip if manual extraction also fails
+
+        if return_raw:
+            return {
+                "source": "web_page",
+                "url": url,
+                "content_type": "tables",
+                "table_count": len(tables),
+                "tables": tables
+            }
+        else:
+            return {"tables": tables}
+
+    def _extract_lists(self, soup, url):
+        """Extract lists from the page"""
+        lists = []
+
+        # Find all ul and ol elements
+        for list_type in ['ul', 'ol']:
+            list_elements = soup.find_all(list_type, recursive=True)
+
+            for i, list_elem in enumerate(list_elements):
+                # Skip nested lists to avoid duplication
+                if list_elem.parent.name in ['li', 'ul', 'ol']:
+                    continue
+
+                items = []
+                for li in list_elem.find_all('li', recursive=False):
+                    # Get text but exclude any nested lists
+                    for nested_list in li.find_all(['ul', 'ol']):
+                        nested_list.decompose()
+
+                    item_text = li.get_text(strip=True)
+                    if item_text:
+                        items.append(item_text)
+
+                if items:
+                    lists.append({
+                        "list_id": f"{list_type}_{i}",
+                        "list_type": "ordered" if list_type == "ol" else "unordered",
+                        "items": items
+                    })
+
+        return {
+            "source": "web_page",
+            "url": url,
+            "content_type": "lists",
+            "list_count": len(lists),
+            "lists": lists
+        }
+
+    def _extract_structured_data(self, soup, url):
+        """Extract various types of structured data from the page"""
+        result = {
+            "source": "web_page",
+            "url": url,
+            "content_type": "structured",
+            "title": soup.title.string if soup.title else "",
+            "meta_description": "",
+        }
+
+        # Extract meta description
+        meta_desc = soup.find('meta', attrs={'name': 'description'})
+        if meta_desc:
+            result["meta_description"] = meta_desc.get('content', '')
+
+        # Extract main text content
+        text_result = self._extract_text(soup, url)
+        if "text" in text_result:
+            result["text"] = text_result["text"]
+
+        # Extract tables
+        tables_result = self._extract_tables(soup, url, return_raw=False)
+        result["tables"] = tables_result.get("tables", [])
+
+        # Extract lists
+        lists_result = self._extract_lists(soup, url)
+        result["lists"] = lists_result.get("lists", [])
+
+        # Extract headings for document structure
+        headings = []
+        for i, heading in enumerate(soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])):
+            headings.append({
+                "id": f"heading_{i}",
+                "level": int(heading.name[1]),
+                "text": heading.get_text(strip=True)
+            })
+        result["headings"] = headings
+
+        # Look for JSON-LD structured data
+        json_ld_data = []
+        for script in soup.find_all('script', type='application/ld+json'):
+            try:
+                json_data = json.loads(script.string)
+                json_ld_data.append(json_data)
+            except (json.JSONDecodeError, ValueError):
+                continue
+
+        if json_ld_data:
+            result["structured_data"] = json_ld_data
+
+        return result
+
+    def _extract_wikipedia_infobox(self, soup):
+        """Extract information from Wikipedia infobox"""
+        infobox = {}
+
+        # Look for the infobox table
+        infobox_table = soup.find('table', class_=['infobox', 'vcard'])
+        if infobox_table:
+            for row in infobox_table.find_all('tr'):
+                # Look for th/td pairs
+                header = row.find('th')
+                value = row.find('td')
+
+                if header and value:
+                    key = header.get_text(strip=True)
+                    # Clean up the value text
+                    for sup in value.find_all('sup'):
+                        sup.decompose()  # Remove reference superscripts
+
+                    val = value.get_text(strip=True)
+                    if key and val:
+                        infobox[key] = val
+
+        return infobox
+
+    def _extract_wikipedia_sections(self, soup):
+        """Extract sections and their content from Wikipedia"""
+        sections = []
+        current_section = None
+
+        # Find all headings
+        headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
+
+        for heading in headings:
+            # Skip non-content headings
+            if heading.get('id') in ['firstHeading', 'mw-toc-heading']:
+                continue
+
+            level = int(heading.name[1])
+            title = heading.get_text(strip=True)
+
+            # Start a new section
+            current_section = {
+                "level": level,
+                "title": title,
+                "content": ""
+            }
+
+            # Get content until next heading
+            content_elements = []
+            sibling = heading.next_sibling
+
+            while sibling and not (sibling.name and sibling.name.startswith('h')):
+                if sibling.name in ['p', 'ul', 'ol']:
+                    content_elements.append(sibling.get_text(strip=True))
+                sibling = sibling.next_sibling
+
+            if content_elements:
+                current_section["content"] = "\n".join(content_elements)
+                sections.append(current_section)
+
+        return sections

 if __name__ == '__main__':
     browser = WebBrowser()  # Instantiation remains the same for testing
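
A short consumption sketch for the new extraction modes (the URLs are illustrative; the dictionary shapes follow the methods above):

```python
browser = WebBrowser(user_agent="GAIA-Agent/1.0")

# Plain text of an arbitrary page.
page = browser.forward("https://example.com", extraction_mode="text")
if "error" in page:
    print("fetch failed:", page["error"])
else:
    print(page["content_type"], len(page["text"]), "characters")

# Wikipedia goes through the REST summary API first, falling back to HTML scraping.
wiki = browser.forward("https://en.wikipedia.org/wiki/Alan_Turing", extraction_mode="tables")
for table in wiki.get("data", {}).get("tables", []):
    print(table["table_id"], table["headers"])
```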
src/web_content_extractor.py
ADDED
@@ -0,0 +1,410 @@
+from smolagents.tools import Tool
+from typing import Dict, Any, Optional
+import requests
+from bs4 import BeautifulSoup
+import re
+import json
+import pandas as pd
+import logging
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class WebContentExtractor(Tool):
+    """
+    Specialized tool for extracting structured content from specific websites.
+    Has optimized extractors for Wikipedia, tabular data, and common content patterns.
+    """
+    name = "web_content_extractor"
+    description = "Extracts structured data from websites with specialized handlers for Wikipedia and other content types."
+    inputs = {
+        'url': {'type': 'string', 'description': 'The URL of the web page to extract content from.'},
+        'target_type': {'type': 'string', 'description': 'Type of content to extract: "info", "table", "list", or "specific_data".'},
+        'extraction_details': {'type': 'object', 'description': 'Additional details for extraction (e.g., table index, data label).', 'nullable': True}
+    }
+    outputs = {'result': {'type': 'object', 'description': 'The extracted content as structured data.'}}
+    output_type = "object"
+
+    def __init__(self, user_agent="GAIA-Agent/1.0", *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.headers = {"User-Agent": user_agent}
+        self.session = requests.Session()
+        self.session.headers.update(self.headers)
+        self.is_initialized = True
+
+    def forward(self, url: str, target_type: str, extraction_details: Optional[Dict] = None) -> Dict[str, Any]:
+        """
+        Extract specific content from a web page.
+
+        Args:
+            url: URL of the web page
+            target_type: Type of content to extract ("info", "table", "list", "specific_data")
+            extraction_details: Additional details for extraction
+
+        Returns:
+            Dict with extracted content or error message
+        """
+        if not extraction_details:
+            extraction_details = {}
+
+        # Validate URL
+        if not url.startswith(('http://', 'https://')):
+            return {"error": f"Invalid URL format: {url}"}
+
+        try:
+            # For Wikipedia, use specialized extraction
+            if 'wikipedia.org' in url:
+                return self._extract_from_wikipedia(url, target_type, extraction_details)
+
+            # For general websites
+            response = self.session.get(url, timeout=15)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Handle different extraction types
+            if target_type == "info":
+                return self._extract_general_info(soup, url)
+            elif target_type == "table":
+                return self._extract_table(soup, url, extraction_details)
+            elif target_type == "list":
+                return self._extract_list(soup, url, extraction_details)
+            elif target_type == "specific_data":
+                return self._extract_specific_data(soup, url, extraction_details)
+            else:
+                return {"error": f"Unknown extraction type: {target_type}"}
+
+        except requests.exceptions.RequestException as e:
+            return {"error": f"Request error: {str(e)}"}
+        except Exception as e:
+            return {"error": f"Extraction error: {str(e)}"}
+
+    def _extract_general_info(self, soup, url):
+        """Extract general information from a web page"""
+        title = soup.title.string if soup.title else "No title found"
+
+        # Try to get meta description
+        meta_desc = soup.find('meta', attrs={'name': 'description'})
+        description = meta_desc.get('content', '') if meta_desc else "No description found"
+
+        # Get main headings
+        main_headings = [h1.get_text(strip=True) for h1 in soup.find_all('h1')]
+
+        # Get key facts (look for definition lists, key-value pairs)
+        key_facts = {}
+        # Check for definition lists
+        for dl in soup.find_all('dl'):
+            for dt, dd in zip(dl.find_all('dt'), dl.find_all('dd')):
+                key = dt.get_text(strip=True)
+                value = dd.get_text(strip=True)
+                if key and value:
+                    key_facts[key] = value
+
+        # Get text from first few paragraphs for a summary
+        paragraphs = soup.find_all('p')
+        summary = ""
+        para_count = 0
+        for p in paragraphs:
+            text = p.get_text(strip=True)
+            if len(text) > 50:  # Only include substantial paragraphs
+                summary += text + "\n\n"
+                para_count += 1
+                if para_count >= 3:  # Limit to first 3 substantial paragraphs
+                    break
+
+        return {
+            "title": title,
+            "url": url,
+            "description": description,
+            "main_headings": main_headings,
+            "key_facts": key_facts,
+            "summary": summary.strip()
+        }
+
+    def _extract_table(self, soup, url, details):
+        """Extract table data from a web page"""
+        table_index = details.get('table_index', 0)
+
+        # Find all tables
+        tables = soup.find_all('table')
+
+        if not tables:
+            return {"error": "No tables found on the page"}
+
+        if table_index >= len(tables):
+            return {"error": f"Table index {table_index} is out of range. Found {len(tables)} tables."}
+
+        try:
+            # Try to use pandas to extract the table
+            table = tables[table_index]
+            dfs = pd.read_html(str(table))
+
+            if not dfs:
+                return {"error": "Failed to parse table with pandas"}
+
+            df = dfs[0]
+
+            # Convert to dictionary format
+            headers = df.columns.tolist()
+            rows = df.values.tolist()
+
+            return {
+                "table_data": {
+                    "headers": headers,
+                    "rows": rows
+                },
+                "row_count": len(rows),
+                "column_count": len(headers),
+                "url": url
+            }
+
+        except Exception as e:
+            # Fallback to manual extraction
+            logger.warning(f"Pandas table extraction failed: {e}. Falling back to manual extraction.")
+
+            table = tables[table_index]
+            headers = []
+            rows = []
+
+            # Try to find headers
+            thead = table.find('thead')
+            if thead:
+                header_row = thead.find('tr')
+                if header_row:
+                    headers = [th.get_text(strip=True) for th in header_row.find_all(['th', 'td'])]
+
+            # If no thead, use first row as header
+            if not headers:
+                first_row = table.find('tr')
+                if first_row:
+                    headers = [th.get_text(strip=True) for th in first_row.find_all(['th', 'td'])]
+
+            # Extract rows
+            for tr in table.find_all('tr'):
+                row = [td.get_text(strip=True) for td in tr.find_all(['td', 'th'])]
+                if row and row != headers:  # Skip header row in data
+                    rows.append(row)
+
+            return {
+                "table_data": {
+                    "headers": headers,
+                    "rows": rows
+                },
+                "row_count": len(rows),
+                "column_count": len(headers) if headers else (len(rows[0]) if rows else 0),
+                "url": url,
+                "extraction_method": "manual_fallback"
+            }
+
+    def _extract_list(self, soup, url, details):
+        """Extract list data from a web page"""
+        list_type = details.get('list_type', 'all')  # 'ul', 'ol', or 'all'
+        position = details.get('position', 0)  # Which list to extract (0-based index)
+
+        list_elements = []
+
+        if list_type == 'ul' or list_type == 'all':
+            list_elements.extend(soup.find_all('ul'))
+
+        if list_type == 'ol' or list_type == 'all':
+            list_elements.extend(soup.find_all('ol'))
+
+        if not list_elements:
+            return {"error": "No lists found on the page"}
+
+        if position >= len(list_elements):
+            return {"error": f"List position {position} is out of range. Found {len(list_elements)} lists."}
+
+        target_list = list_elements[position]
+        items = []
+
+        for li in target_list.find_all('li', recursive=False):
+            # Ignore nested lists
+            for nested_list in li.find_all(['ul', 'ol']):
+                nested_list.decompose()
+
+            item_text = li.get_text(strip=True)
+            if item_text:
+                items.append(item_text)
+
+        return {
+            "list_type": target_list.name,  # 'ul' or 'ol'
+            "items": items,
+            "count": len(items),
+            "url": url
+        }
+
+    def _extract_specific_data(self, soup, url, details):
+        """Extract specific data based on given selectors or patterns"""
+        data_label = details.get('data_label', '')
+        selector = details.get('selector', '')
+        attribute = details.get('attribute', '')
+        regex_pattern = details.get('regex_pattern', '')
+
+        result = {
+            "url": url,
+            "data_label": data_label,
+            "found": False
+        }
+
+        # Try CSS selector if provided
+        if selector:
+            elements = soup.select(selector)
+            if elements:
+                result["found"] = True
+
+                if attribute:
+                    # Extract attribute value
+                    values = [elem.get(attribute, '') for elem in elements]
+                    result["values"] = values
+                else:
+                    # Extract text content
+                    values = [elem.get_text(strip=True) for elem in elements]
+                    result["values"] = values
+
+                # If only one value, simplify the result
+                if len(values) == 1:
+                    result["value"] = values[0]
+
+                return result
+
+        # Try regex pattern if provided
+        if regex_pattern:
+            page_text = soup.get_text()
+            matches = re.findall(regex_pattern, page_text)
+
+            if matches:
+                result["found"] = True
+                result["matches"] = matches
+
+                # If only one match, simplify the result
+                if len(matches) == 1:
+                    result["value"] = matches[0]
+
+                return result
+
+        # Try common patterns based on data_label
+        if data_label:
+            # Look for label in text
+            label_pattern = re.compile(rf'{re.escape(data_label)}\s*[:=-]?\s*([\w\s,.()-]+)', re.IGNORECASE)
+            page_text = soup.get_text()
+            match = label_pattern.search(page_text)
+
+            if match:
+                result["found"] = True
+                result["value"] = match.group(1).strip()
+                return result
+
+            # Look for label in headings followed by paragraph
+            for heading in soup.find_all(['h1', 'h2', 'h3', 'h4']):
+                if data_label.lower() in heading.get_text().lower():
+                    next_sibling = heading.find_next_sibling()
+                    if next_sibling and next_sibling.name == 'p':
+                        result["found"] = True
+                        result["value"] = next_sibling.get_text(strip=True)
+                        return result
+
+        # If nothing found
+        return result
+
+    def _extract_from_wikipedia(self, url, target_type, details):
+        """Specialized extraction for Wikipedia pages using APIs when possible"""
+        # Extract page title from URL
+        title = url.split('/')[-1]
+
+        # Determine Wikipedia language
+        domain = url.split('//')[1].split('.')[0]
+
+        try:
+            # First try the Wikipedia API
+            api_url = f"https://{domain}.wikipedia.org/api/rest_v1/page/summary/{title}"
+            response = self.session.get(api_url, timeout=15)
+            response.raise_for_status()
+            api_data = response.json()
+
+            # For info requests, we can use just the API data
+            if target_type == "info":
+                return {
+                    "title": api_data.get("title", ""),
+                    "description": api_data.get("description", ""),
+                    "extract": api_data.get("extract", ""),
+                    "url": url,
+                    "source": "wikipedia_api"
+                }
+
+            # For other requests, we need to fetch the HTML as well
+            html_response = self.session.get(url, timeout=15)
+            html_response.raise_for_status()
+            soup = BeautifulSoup(html_response.content, 'html.parser')
+
+            if target_type == "table":
+                # Get the infobox if requested
+                if details.get('infobox', False):
+                    infobox = {}
+                    infobox_div = soup.find('table', {'class': 'infobox'})
+
+                    if infobox_div:
+                        for row in infobox_div.find_all('tr'):
+                            header = row.find('th')
+                            data = row.find('td')
+                            if header and data:
+                                key = header.get_text(strip=True)
+                                value = data.get_text(strip=True)
+                                if key and value:
+                                    infobox[key] = value
+
+                    return {
+                        "title": api_data.get("title", ""),
+                        "infobox": infobox,
+                        "url": url,
+                        "source": "wikipedia_infobox"
+                    }
+
+                # Regular table extraction
+                return self._extract_table(soup, url, details)
+
+            elif target_type == "list":
+                return self._extract_list(soup, url, details)
+
+            elif target_type == "specific_data":
+                # Enhanced extraction for Wikipedia specific data
+                data_label = details.get('data_label', '')
+
+                # Try to find it in infobox first
+                infobox = soup.find('table', {'class': 'infobox'})
+                if infobox and data_label:
+                    for row in infobox.find_all('tr'):
+                        header = row.find('th')
+                        if header and data_label.lower() in header.get_text().lower():
+                            data = row.find('td')
+                            if data:
+                                return {
+                                    "found": True,
+                                    "value": data.get_text(strip=True),
+                                    "source": "wikipedia_infobox",
+                                    "url": url
+                                }
+
+                # Fallback to regular specific data extraction
+                return self._extract_specific_data(soup, url, details)
+
+        except Exception as e:
+            logger.warning(f"Wikipedia API extraction failed: {e}. Falling back to HTML extraction.")
+
+            # Fallback to regular HTML extraction
+            try:
+                response = self.session.get(url, timeout=15)
+                response.raise_for_status()
+                soup = BeautifulSoup(response.content, 'html.parser')
+
+                if target_type == "info":
+                    return self._extract_general_info(soup, url)
+                elif target_type == "table":
+                    return self._extract_table(soup, url, details)
+                elif target_type == "list":
+                    return self._extract_list(soup, url, details)
+                elif target_type == "specific_data":
+                    return self._extract_specific_data(soup, url, details)
+
+            except Exception as fallback_error:
+                return {"error": f"Wikipedia extraction error: {fallback_error}"}
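
A usage sketch for the new extractor tool (URLs and labels are illustrative, chosen only to exercise each target_type):

```python
extractor = WebContentExtractor()

# General page summary.
info = extractor.forward("https://en.wikipedia.org/wiki/Ada_Lovelace", target_type="info")
print(info.get("title"), "-", info.get("description"))

# A table selected by index.
table = extractor.forward("https://en.wikipedia.org/wiki/Women_in_computing",
                          target_type="table",
                          extraction_details={"table_index": 0})
print(table.get("row_count"), "rows,", table.get("column_count"), "columns")

# A single labelled fact, resolved from the infobox when possible.
fact = extractor.forward("https://en.wikipedia.org/wiki/Ada_Lovelace",
                         target_type="specific_data",
                         extraction_details={"data_label": "Born"})
print(fact.get("found"), fact.get("value"))
```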