Yago Bolivar committed
Commit b09a8ba · 1 Parent(s): baa65ee

feat: Enhance tools with new web content extractor and improved functionality


- Added WebContentExtractor for structured content extraction from websites, including specialized handling for Wikipedia.
- Updated WebBrowser to support multiple extraction modes (text, tables, lists, structured) and improved error handling.
- Enhanced CodeExecutionTool with utility functions for web data processing and robust error handling.
- Improved logging across tools for better debugging and traceability.

Files changed (4)
  1. app.py +7 -4
  2. src/python_tool.py +211 -62
  3. src/web_browsing_tool.py +365 -42
  4. src/web_content_extractor.py +410 -0
app.py CHANGED
@@ -13,12 +13,13 @@ from src.final_answer_tool import FinalAnswerTool
13
  from src.web_browsing_tool import WebBrowser
14
  from src.file_processing_tool import FileIdentifier
15
  from src.image_processing_tool import ImageProcessor
16
- from src.markdown_table_parser import MarkdownTableParserTool # Updated
17
  from src.python_tool import CodeExecutionTool
18
- from src.speech_to_text import SpeechToTextTool # Updated
19
  from src.spreadsheet_tool import SpreadsheetTool
20
  from src.text_reversal_tool import TextReversalTool
21
  from src.video_processing_tool import VideoProcessingTool
 
22
 
23
  # (Keep Constants as is)
24
  # --- Constants ---
@@ -67,7 +68,8 @@ python_tool = CodeExecutionTool()
67
  speech_to_text_tool = SpeechToTextTool() # Updated
68
  spreadsheet_tool = SpreadsheetTool()
69
  text_reversal_tool = TextReversalTool()
70
- video_processing_tool = VideoProcessingTool()
 
71
 
72
  # Add debug prints for file paths
73
  print("Current directory:", os.getcwd())
@@ -160,7 +162,8 @@ agent_tools = [
160
  speech_to_text_tool, # Updated
161
  spreadsheet_tool,
162
  text_reversal_tool,
163
- video_processing_tool
 
164
  ]
165
 
166
  # Flatten system_prompt if it's a dict (e.g., from YAML)
 
13
  from src.web_browsing_tool import WebBrowser
14
  from src.file_processing_tool import FileIdentifier
15
  from src.image_processing_tool import ImageProcessor
16
+ from src.markdown_table_parser import MarkdownTableParserTool
17
  from src.python_tool import CodeExecutionTool
18
+ from src.speech_to_text import SpeechToTextTool
19
  from src.spreadsheet_tool import SpreadsheetTool
20
  from src.text_reversal_tool import TextReversalTool
21
  from src.video_processing_tool import VideoProcessingTool
22
+ from src.web_content_extractor import WebContentExtractor
23
 
24
  # (Keep Constants as is)
25
  # --- Constants ---
 
68
  speech_to_text_tool = SpeechToTextTool() # Updated
69
  spreadsheet_tool = SpreadsheetTool()
70
  text_reversal_tool = TextReversalTool()
71
+ video_processing_tool = VideoProcessingTool()
72
+ web_content_extractor = WebContentExtractor() # Instantiate the new extractor tool
73
 
74
  # Add debug prints for file paths
75
  print("Current directory:", os.getcwd())
 
162
  speech_to_text_tool, # Updated
163
  spreadsheet_tool,
164
  text_reversal_tool,
165
+ video_processing_tool,
166
+ web_content_extractor # Add the new tool here
167
  ]
168
 
169
  # Flatten system_prompt if it's a dict (e.g., from YAML)
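The hunks above only register the new tools; a minimal sketch of how the updated agent_tools list might be handed to a smolagents agent follows. The CodeAgent/model wiring is an assumption for illustration (the model class name varies across smolagents versions) and is not part of this commit.

# Sketch only: wiring the updated tool list into a smolagents agent.
# The model class and the trimmed tool list are illustrative, not code from app.py.
from smolagents import CodeAgent, InferenceClientModel  # model class name depends on the smolagents version

from src.web_browsing_tool import WebBrowser
from src.web_content_extractor import WebContentExtractor

agent_tools = [
    WebBrowser(),
    WebContentExtractor(),  # new tool registered by this commit
    # ... plus the other tools instantiated in app.py
]

agent = CodeAgent(tools=agent_tools, model=InferenceClientModel())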
src/python_tool.py CHANGED
@@ -7,21 +7,32 @@ import traceback
7
  from typing import Dict, Any, Optional, Union, List
8
  from smolagents.tools import Tool
9
  import os
10
 
11
  class CodeExecutionTool(Tool):
12
  """
13
- Executes Python code in a controlled environment for safe code interpretation.
14
- Useful for evaluating code snippets and returning their output or errors.
 
15
  """
16
- name = "python_code_executor"
17
- description = "Executes a given Python code string or Python code from a file. Returns the output or error."
18
  inputs = {
19
- 'code_string': {'type': 'string', 'description': 'The Python code to execute directly.', 'nullable': True},
20
- 'filepath': {'type': 'string', 'description': 'The path to a Python file to execute.', 'nullable': True}
21
  }
22
- outputs = {'result': {'type': 'object', 'description': 'A dictionary containing \'success\', \'output\', and/or \'error\'.'}}
23
  output_type = "object"
24
-
25
  def __init__(self, timeout: int = 10, max_output_size: int = 20000, *args, **kwargs):
26
  super().__init__(*args, **kwargs)
27
  self.timeout = timeout
@@ -31,6 +42,124 @@ class CodeExecutionTool(Tool):
31
  'pickle', 'requests', 'socket', 'shutil', 'ctypes', 'multiprocessing'
32
  ]
33
  self.is_initialized = True
34
 
35
  def _analyze_code_safety(self, code: str) -> Dict[str, Any]:
36
  """Perform static analysis to check for potentially harmful code."""
@@ -68,34 +197,40 @@ class CodeExecutionTool(Tool):
68
  return {"safe": True}
69
  except SyntaxError:
70
  return {"safe": False, "reason": "Invalid Python syntax"}
71
-
72
  def _timeout_handler(self, signum, frame):
73
  """Handler for timeout signal."""
74
- raise TimeoutError("Code execution timed out")
75
-
76
  def _extract_numeric_value(self, output: str) -> Optional[Union[int, float]]:
77
  """Extract the final numeric value from output."""
78
- # First try to get the last line that's a number
79
- lines = [line.strip() for line in output.strip().split('\n') if line.strip()]
80
 
 
 
81
  for line in reversed(lines):
82
- # Try direct conversion first
 
83
  try:
84
- return float(line)
 
 
 
85
  except ValueError:
86
- pass
87
-
88
- # Try to extract numeric portion if embedded in text
89
- numeric_match = re.search(r'[-+]?\d*\.?\d+', line)
90
- if numeric_match:
91
- try:
92
- return float(numeric_match.group())
93
- except ValueError:
94
- pass
95
-
 
96
  return None
97
-
98
- # Main entry point for the agent
99
  def forward(self, code_string: Optional[str] = None, filepath: Optional[str] = None) -> Dict[str, Any]:
100
  if not code_string and not filepath:
101
  return {"success": False, "error": "No code string or filepath provided."}
@@ -116,56 +251,70 @@ class CodeExecutionTool(Tool):
116
  elif code_string:
117
  code_to_execute = code_string
118
 
119
- return self._execute_actual_code(code_to_execute)
 
 
 
120
 
121
- # Renamed from execute_code to _execute_actual_code to be internal
122
  def _execute_actual_code(self, code: str) -> Dict[str, Any]:
123
  """Execute Python code and capture the output or error."""
124
  safety_check = self._analyze_code_safety(code)
125
  if not safety_check["safe"]:
126
- return {"success": False, "error": f"Safety check failed: {safety_check['reason']}"}
127
-
128
- # Setup timeout
129
- signal.signal(signal.SIGALRM, self._timeout_handler)
130
- signal.alarm(self.timeout)
131
-
132
- captured_output = io.StringIO()
133
- # It's generally safer to execute in a restricted scope
134
- # and not provide access to all globals/locals by default.
135
- # However, for a tool that might need to define functions/classes and use them,
136
- # a shared scope might be necessary. This needs careful consideration.
137
- exec_globals = {}
138
-
139
  try:
140
- with contextlib.redirect_stdout(captured_output):
141
- with contextlib.redirect_stderr(captured_output): # Capture stderr as well
142
- exec(code, exec_globals) # Execute in a controlled global scope
143
 
144
- output = captured_output.getvalue()
 
145
  if len(output) > self.max_output_size:
146
- output = output[:self.max_output_size] + "... [output truncated]"
147
 
148
- # Attempt to extract a final numeric value if applicable
149
- # This might be specific to certain tasks, consider making it optional
150
- # numeric_result = self._extract_numeric_value(output)
151
 
152
  return {
153
- "success": True,
154
  "output": output,
155
- # "numeric_value": numeric_result
156
  }
157
- except TimeoutError:
158
- return {"success": False, "error": "Code execution timed out"}
 
 
159
  except Exception as e:
160
- # Get detailed traceback
161
- tb_lines = traceback.format_exception(type(e), e, e.__traceback__)
162
- error_details = "".join(tb_lines)
163
- if len(error_details) > self.max_output_size:
164
- error_details = error_details[:self.max_output_size] + "... [error truncated]"
165
- return {"success": False, "error": f"Execution failed: {str(e)}\nTraceback:\n{error_details}"}
166
  finally:
167
- signal.alarm(0) # Disable the alarm
168
- captured_output.close()
169
 
170
  # Kept execute_file and execute_code as helper methods if direct access is ever needed,
171
  # but they now call the main _execute_actual_code method.
 
7
  from typing import Dict, Any, Optional, Union, List
8
  from smolagents.tools import Tool
9
  import os
10
+ import logging
11
+
12
+ # Set up logging
13
+ logging.basicConfig(level=logging.INFO)
14
+ logger = logging.getLogger(__name__)
15
 
16
  class CodeExecutionTool(Tool):
17
  """
18
+ Executes Python code snippets safely with timeout protection.
19
+ Useful for data processing, analysis, and transformation.
20
+ Includes special utilities for web data processing and robust error handling.
21
  """
22
+ name = "python_executor"
23
+ description = "Safely executes Python code with enhancements for data processing, parsing, and error recovery."
24
  inputs = {
25
+ 'code_string': {'type': 'string', 'description': 'The Python code to execute.', 'nullable': True},
26
+ 'filepath': {'type': 'string', 'description': 'Path to a Python file to execute.', 'nullable': True}
27
+ }
28
+ outputs = {
29
+ 'success': {'type': 'boolean', 'description': 'Whether the code executed successfully.'},
30
+ 'output': {'type': 'string', 'description': 'The captured stdout or formatted result.', 'nullable': True},
31
+ 'error': {'type': 'string', 'description': 'Error message if execution failed.', 'nullable': True},
32
+ 'result_value': {'type': 'any', 'description': 'The final expression value if applicable.', 'nullable': True}
33
  }
 
34
  output_type = "object"
35
+
36
  def __init__(self, timeout: int = 10, max_output_size: int = 20000, *args, **kwargs):
37
  super().__init__(*args, **kwargs)
38
  self.timeout = timeout
 
42
  'pickle', 'requests', 'socket', 'shutil', 'ctypes', 'multiprocessing'
43
  ]
44
  self.is_initialized = True
45
+ # Add utility functions that will be available to executed code
46
+ self._utility_functions = self._get_utility_functions()
47
+
48
+ def _get_utility_functions(self):
49
+ """Define utility functions that will be available in the executed code"""
50
+ utility_code = """
51
+ # Utility functions for web data processing
52
+ def extract_pattern(text, pattern, group=0, all_matches=False):
53
+ """
54
+ "Extract data using regex pattern from text.
55
+ Args:
56
+ text (str): Text to search in
57
+ pattern (str): Regex pattern to use
58
+ group (int): Capture group to return (default 0 - entire match)
59
+ all_matches (bool): If True, return all matches, otherwise just first
60
+ Returns:
61
+ Matched string(s) or None if no match
62
+ """
63
+ import re
64
+ if not text or not pattern:
65
+ print("Warning: Empty text or pattern provided to extract_pattern")
66
+ return None
67
+
68
+ try:
69
+ matches = re.finditer(pattern, text)
70
+ results = [m.group(group) if group < len(m.groups())+1 else m.group(0) for m in matches]
71
+
72
+ if not results:
73
+ print(f"No matches found for pattern '{pattern}'")
74
+ return None
75
+
76
+ if all_matches:
77
+ return results
78
+ else:
79
+ return results[0]
80
+ except Exception as e:
81
+ print(f"Error in extract_pattern: {e}")
82
+ return None
83
+
84
+ def clean_text(text, remove_extra_whitespace=True, remove_special_chars=False):
85
+ """
86
+ Clean text by removing extra whitespace and optionally special characters.
87
+ Args:
88
+ text (str): Text to clean
89
+ remove_extra_whitespace (bool): If True, replace multiple spaces with single space
90
+ remove_special_chars (bool): If True, remove special characters
91
+ Returns:
92
+ Cleaned string
93
+ """
94
+ import re
95
+ if not text:
96
+ return ""
97
+
98
+ # Replace newlines and tabs with spaces
99
+ text = re.sub(r'[\\n\\t\\r]+', ' ', text)
100
+
101
+ if remove_special_chars:
102
+ # Keep only alphanumeric, spaces, and basic punctuation
103
+ text = re.sub(r'[^\\w\\s.,;:!?\'"()-]', '', text)
104
+
105
+ if remove_extra_whitespace:
106
+ # Replace multiple spaces with single space
107
+ text = re.sub(r'\\s+', ' ', text)
108
+
109
+ return text.strip()
110
+
111
+ def parse_table_text(table_text):
112
+ """
113
+ Parse table-like text into list of rows
114
+ Args:
115
+ table_text (str): Text containing table-like data
116
+ Returns:
117
+ List of rows (each row is a list of cells)
118
+ """
119
+ rows = []
120
+ lines = table_text.strip().split('\\n')
121
+
122
+ for line in lines:
123
+ # Skip empty lines
124
+ if not line.strip():
125
+ continue
126
+
127
+ # Split by whitespace or common separators
128
+ cells = re.split(r'\\s{2,}|\\t+|\\|+', line.strip())
129
+ # Clean up cells
130
+ cells = [cell.strip() for cell in cells if cell.strip()]
131
+
132
+ if cells:
133
+ rows.append(cells)
134
+
135
+ # Print parsing result for debugging
136
+ print(f"Parsed {len(rows)} rows from table text")
137
+ if rows and len(rows) > 0:
138
+ print(f"First row (columns: {len(rows[0])}): {rows[0]}")
139
+
140
+ return rows
141
+
142
+ def safe_float(text):
143
+ """
144
+ Safely convert text to float, handling various formats.
145
+ Args:
146
+ text (str): Text to convert
147
+ Returns:
148
+ float or None if conversion fails
149
+ """
150
+ if not text:
151
+ return None
152
+
153
+ # Remove currency symbols, commas in numbers, etc.
154
+ text = re.sub(r'[^0-9.-]', '', str(text))
155
+
156
+ try:
157
+ return float(text)
158
+ except ValueError:
159
+ print(f"Warning: Could not convert '{text}' to float")
160
+ return None
161
+ """
162
+ return utility_code
163
 
164
  def _analyze_code_safety(self, code: str) -> Dict[str, Any]:
165
  """Perform static analysis to check for potentially harmful code."""
 
197
  return {"safe": True}
198
  except SyntaxError:
199
  return {"safe": False, "reason": "Invalid Python syntax"}
200
+
201
  def _timeout_handler(self, signum, frame):
202
  """Handler for timeout signal."""
203
+ raise TimeoutError(f"Code execution timed out after {self.timeout} seconds")
204
+
205
  def _extract_numeric_value(self, output: str) -> Optional[Union[int, float]]:
206
  """Extract the final numeric value from output."""
207
+ if not output:
208
+ return None
209
 
210
+ # Look for the last line that contains a number
211
+ lines = output.strip().split('\n')
212
  for line in reversed(lines):
213
+ # Try to interpret it as a pure number
214
+ line = line.strip()
215
  try:
216
+ if '.' in line:
217
+ return float(line)
218
+ else:
219
+ return int(line)
220
  except ValueError:
221
+ # Not a pure number, try to extract numbers with regex
222
+ match = re.search(r'[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?$', line)
223
+ if match:
224
+ num_str = match.group(0)
225
+ try:
226
+ if '.' in num_str:
227
+ return float(num_str)
228
+ else:
229
+ return int(num_str)
230
+ except ValueError:
231
+ pass
232
  return None
233
+
 
234
  def forward(self, code_string: Optional[str] = None, filepath: Optional[str] = None) -> Dict[str, Any]:
235
  if not code_string and not filepath:
236
  return {"success": False, "error": "No code string or filepath provided."}
 
251
  elif code_string:
252
  code_to_execute = code_string
253
 
254
+ # Inject utility functions
255
+ enhanced_code = self._utility_functions + "\n\n" + code_to_execute
256
+
257
+ return self._execute_actual_code(enhanced_code)
258
 
 
259
  def _execute_actual_code(self, code: str) -> Dict[str, Any]:
260
  """Execute Python code and capture the output or error."""
261
  safety_check = self._analyze_code_safety(code)
262
  if not safety_check["safe"]:
263
+ return {
264
+ "success": False,
265
+ "error": f"Safety check failed: {safety_check['reason']}"
266
+ }
267
+
268
+ # Capture stdout and execute the code with a timeout
269
+ stdout_buffer = io.StringIO()
270
+ result_value = None
271
+
 
 
 
 
272
  try:
273
+ # Set timeout handler
274
+ signal.signal(signal.SIGALRM, self._timeout_handler)
275
+ signal.alarm(self.timeout)
276
+
277
+ # Execute code and capture stdout
278
+ with contextlib.redirect_stdout(stdout_buffer):
279
+ # Execute the code within a new dictionary for local variables
280
+ local_vars = {}
281
+ exec(code, {}, local_vars)
282
+
283
+ # Try to extract the result from common variable names
284
+ for var_name in ['result', 'answer', 'output', 'value', 'final_result', 'data']:
285
+ if var_name in local_vars:
286
+ result_value = local_vars[var_name]
287
+ break
288
+
289
+ # Reset the alarm
290
+ signal.alarm(0)
291
 
292
+ # Get the captured output
293
+ output = stdout_buffer.getvalue()
294
  if len(output) > self.max_output_size:
295
+ output = output[:self.max_output_size] + f"\n... (output truncated, exceeded {self.max_output_size} characters)"
296
 
297
+ # If no result_value was found, try to extract a numeric value from the output
298
+ if result_value is None:
299
+ result_value = self._extract_numeric_value(output)
300
 
301
  return {
302
+ "success": True,
303
  "output": output,
304
+ "result_value": result_value
305
  }
306
+
307
+ except TimeoutError as e:
308
+ signal.alarm(0) # Reset the alarm
309
+ return {"success": False, "error": f"Code execution timed out after {self.timeout} seconds"}
310
  except Exception as e:
311
+ signal.alarm(0) # Reset the alarm
312
+ trace = traceback.format_exc()
313
+ error_msg = f"Error executing code: {str(e)}\n{trace}"
314
+ return {"success": False, "error": error_msg}
 
 
315
  finally:
316
+ # Ensure the alarm is reset
317
+ signal.alarm(0)
318
 
319
  # Kept execute_file and execute_code as helper methods if direct access is ever needed,
320
  # but they now call the main _execute_actual_code method.
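For reference, a minimal usage sketch of the rewritten executor, based on the inputs/outputs declared in this diff; the sample code string and numbers are illustrative, not part of the commit.

# Sketch only: calling the updated CodeExecutionTool directly.
from src.python_tool import CodeExecutionTool

executor = CodeExecutionTool(timeout=10)

response = executor.forward(code_string=(
    "values = [1200.5, 850.25, 430]\n"
    "result = sum(values)  # picked up via the 'result' variable convention\n"
    "print(f'Total: {result}')\n"
))

if response["success"]:
    print(response["output"])        # captured stdout from the snippet
    print(response["result_value"])  # 2480.75, taken from local_vars['result']
else:
    print(response["error"])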
src/web_browsing_tool.py CHANGED
@@ -1,17 +1,31 @@
1
  import requests
2
  from bs4 import BeautifulSoup
3
  from smolagents.tools import Tool
4
 
5
  class WebBrowser(Tool):
6
  """
7
  Retrieves information from online sources by browsing web pages.
8
- Useful for extracting or summarizing web content.
 
9
  """
10
  name = "web_browser"
11
- description = "Fetches the content of a web page and extracts its text. Input should be a valid URL."
12
- inputs = {'url': {'type': 'string', 'description': 'The URL of the web page to browse.'}}
13
- outputs = {'text_content': {'type': 'string', 'description': 'The extracted text content of the web page, or an error message.'}}
14
- output_type = "string"
 
 
 
15
 
16
  def __init__(self, user_agent="GAIA-Agent/1.0", *args, **kwargs):
17
  """
@@ -21,62 +35,371 @@ class WebBrowser(Tool):
21
  """
22
  super().__init__(*args, **kwargs)
23
  self.headers = {"User-Agent": user_agent}
24
- self.is_initialized = True # Example of a tool state
 
 
 
25
 
26
- def forward(self, url: str) -> str:
27
  """
28
- Fetches the content of a web page and extracts its text.
29
 
30
  Args:
31
  url (str): The URL of the web page to browse.
 
32
 
33
  Returns:
34
- str: The extracted text content of the web page, or an error message
35
- if fetching or parsing fails.
36
  """
 
37
  if not url.startswith(('http://', 'https://')):
38
- return f"Error: Invalid URL format. URL must start with http:// or https://. Received: {url}"
39
 
40
  try:
41
- response = requests.get(url, headers=self.headers, timeout=15)
42
- response.raise_for_status() # Raises an HTTPError for bad responses (4XX or 5XX)
43
 
44
- # Use BeautifulSoup to parse the HTML content
45
- soup = BeautifulSoup(response.content, 'html.parser')
46
 
47
- # Remove script and style elements
48
- for script_or_style in soup(["script", "style"]):
49
- script_or_style.decompose()
50
 
51
- # Get text
52
- text_from_soup = soup.get_text(separator='\n', strip=True)
 
53
 
54
- # Convert multiple newlines to a single newline and clean spaces within lines
55
- cleaned_lines = []
56
- for line in text_from_soup.splitlines():
57
- line = line.strip() # Strip leading/trailing whitespace from the line itself
58
- if line: # Only process non-empty lines
59
- # Replace multiple spaces with a single space
60
- cleaned_line = ' '.join(line.split())
61
- cleaned_lines.append(cleaned_line)
62
 
63
- text = '\n'.join(cleaned_lines)
64
 
65
- if not text:
66
- return f"Error: No text content found at {url}."
67
 
68
- return text
69
 
70
- except requests.exceptions.HTTPError as http_err:
71
- return f"Error: HTTP error occurred while fetching {url}: {http_err}"
72
- except requests.exceptions.ConnectionError as conn_err:
73
- return f"Error: Connection error occurred while fetching {url}: {conn_err}"
74
- except requests.exceptions.Timeout as timeout_err:
75
- return f"Error: Timeout occurred while fetching {url}: {timeout_err}"
76
- except requests.exceptions.RequestException as req_err:
77
- return f"Error: An unexpected error occurred while fetching {url}: {req_err}"
78
- except Exception as e:
79
- return f"Error: An unexpected error occurred during parsing of {url}: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
  if __name__ == '__main__':
82
  browser = WebBrowser() # Instantiation remains the same for testing
 
1
  import requests
2
  from bs4 import BeautifulSoup
3
  from smolagents.tools import Tool
4
+ import re
5
+ import json
6
+ import logging
7
+ import time
8
+ from urllib.parse import urlparse, urljoin
9
+ import pandas as pd
10
+
11
+ # Set up logging
12
+ logging.basicConfig(level=logging.INFO)
13
+ logger = logging.getLogger(__name__)
14
 
15
  class WebBrowser(Tool):
16
  """
17
  Retrieves information from online sources by browsing web pages.
18
+ Useful for extracting or summarizing web content, with special handling for structured data.
19
+ Can extract tables, lists, and key information from web pages.
20
  """
21
  name = "web_browser"
22
+ description = "Fetches content from web pages with improved structured data handling. Has specialized extraction for Wikipedia. Returns text content or structured data."
23
+ inputs = {
24
+ 'url': {'type': 'string', 'description': 'The URL of the web page to browse.'},
25
+ 'extraction_mode': {'type': 'string', 'description': 'Mode for data extraction: "text" (default), "tables", "lists", or "structured".', 'nullable': True}
26
+ }
27
+ outputs = {'content': {'type': 'object', 'description': 'The extracted content from the web page, either as text or structured data.'}}
28
+ output_type = "object"
29
 
30
  def __init__(self, user_agent="GAIA-Agent/1.0", *args, **kwargs):
31
  """
 
35
  """
36
  super().__init__(*args, **kwargs)
37
  self.headers = {"User-Agent": user_agent}
38
+ self.is_initialized = True
39
+ # Add a session to maintain cookies
40
+ self.session = requests.Session()
41
+ self.session.headers.update(self.headers)
42
 
43
+ def forward(self, url: str, extraction_mode: str = "text") -> dict:
44
  """
45
+ Fetches the content of a web page and extracts information based on the specified mode.
46
 
47
  Args:
48
  url (str): The URL of the web page to browse.
49
+ extraction_mode (str): The mode for data extraction - "text" (default), "tables", "lists", or "structured"
50
 
51
  Returns:
52
+ dict: The extracted content or an error message
 
53
  """
54
+ # Validate URL
55
  if not url.startswith(('http://', 'https://')):
56
+ return {"error": f"Invalid URL format. URL must start with http:// or https://. Received: {url}"}
57
 
58
  try:
59
+ # Check if it's Wikipedia and use special handling
60
+ if 'wikipedia.org' in url:
61
+ return self._handle_wikipedia(url, extraction_mode)
62
+
63
+ # Process normal web pages
64
+ return self._process_regular_webpage(url, extraction_mode)
65
+
66
+ except requests.exceptions.HTTPError as http_err:
67
+ return {"error": f"HTTP error occurred while fetching {url}: {http_err}"}
68
+ except requests.exceptions.ConnectionError as conn_err:
69
+ return {"error": f"Connection error occurred while fetching {url}: {conn_err}"}
70
+ except requests.exceptions.Timeout as timeout_err:
71
+ return {"error": f"Timeout occurred while fetching {url}: {timeout_err}"}
72
+ except requests.exceptions.RequestException as req_err:
73
+ return {"error": f"An unexpected error occurred while fetching {url}: {req_err}"}
74
+ except Exception as e:
75
+ return {"error": f"An unexpected error occurred during parsing of {url}: {e}"}
76
 
77
+ def _process_regular_webpage(self, url, extraction_mode):
78
+ """Process a regular (non-Wikipedia) webpage"""
79
+ response = self.session.get(url, timeout=15)
80
+ response.raise_for_status()
81
+
82
+ # Use BeautifulSoup to parse the HTML content
83
+ soup = BeautifulSoup(response.content, 'html.parser')
84
+
85
+ # Remove script and style elements
86
+ for script_or_style in soup(["script", "style"]):
87
+ script_or_style.decompose()
88
+
89
+ if extraction_mode == "text":
90
+ return self._extract_text(soup, url)
91
+ elif extraction_mode == "tables":
92
+ return self._extract_tables(soup, url)
93
+ elif extraction_mode == "lists":
94
+ return self._extract_lists(soup, url)
95
+ elif extraction_mode == "structured":
96
+ return self._extract_structured_data(soup, url)
97
+ else:
98
+ return {"error": f"Unknown extraction mode: {extraction_mode}"}
99
 
100
+ def _handle_wikipedia(self, url, extraction_mode):
101
+ """Special handling for Wikipedia pages"""
102
+ # For Wikipedia, try to use the API instead of scraping the HTML
103
+ parsed_url = urlparse(url)
104
+ if not parsed_url.netloc.endswith('wikipedia.org'):
105
+ return self._process_regular_webpage(url, extraction_mode)
106
+
107
+ # Extract the title from the URL path
108
+ path_parts = parsed_url.path.split('/')
109
+ if len(path_parts) < 3 or path_parts[1] != 'wiki':
110
+ # Not a standard Wikipedia article URL
111
+ return self._process_regular_webpage(url, extraction_mode)
112
+
113
+ title = path_parts[2]
114
+ lang = parsed_url.netloc.split('.')[0]
115
+
116
+ # Use Wikipedia API to get structured content
117
+ api_url = f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{title}"
118
+
119
+ try:
120
+ logger.info(f"Fetching Wikipedia API data from {api_url}")
121
+ api_response = self.session.get(api_url, timeout=15)
122
+ api_response.raise_for_status()
123
+ api_data = api_response.json()
124
+
125
+ # Basic information from the API
126
+ wiki_data = {
127
+ "title": api_data.get("title", ""),
128
+ "description": api_data.get("description", ""),
129
+ "extract": api_data.get("extract", ""),
130
+ "url": api_data.get("content_urls", {}).get("desktop", {}).get("page", url)
131
+ }
132
+
133
+ # If we need more detailed data beyond the summary
134
+ if extraction_mode in ["tables", "structured"]:
135
+ # Get the full HTML anyway for tables and other structured data
136
+ response = self.session.get(url, timeout=15)
137
+ response.raise_for_status()
138
+ soup = BeautifulSoup(response.content, 'html.parser')
139
+
140
+ # Add tables to the response
141
+ tables = self._extract_tables(soup, url, return_raw=False)
142
+ wiki_data["tables"] = tables.get("tables", [])
143
+
144
+ # For "structured" mode, add sections, infobox and other elements
145
+ if extraction_mode == "structured":
146
+ wiki_data["infobox"] = self._extract_wikipedia_infobox(soup)
147
+ wiki_data["sections"] = self._extract_wikipedia_sections(soup)
148
+
149
+ return {
150
+ "source": "wikipedia_api_enhanced",
151
+ "url": url,
152
+ "data": wiki_data
153
+ }
154
+
155
+ # For basic text, return the API data
156
+ return {
157
+ "source": "wikipedia_api",
158
+ "url": url,
159
+ "data": wiki_data
160
+ }
161
+
162
+ except (requests.exceptions.RequestException, ValueError) as e:
163
+ logger.warning(f"Wikipedia API request failed: {e}. Falling back to HTML scraping.")
164
+ # Fallback to normal HTML processing
165
+ return self._process_regular_webpage(url, extraction_mode)
166
 
167
+ def _extract_text(self, soup, url):
168
+ """Extract clean text from the page"""
169
+ text_from_soup = soup.get_text(separator='\n', strip=True)
170
 
171
+ # Convert multiple newlines to a single newline and clean spaces within lines
172
+ cleaned_lines = []
173
+ for line in text_from_soup.splitlines():
174
+ line = line.strip() # Strip leading/trailing whitespace
175
+ if line: # Only process non-empty lines
176
+ # Replace multiple spaces with a single space
177
+ cleaned_line = ' '.join(line.split())
178
+ cleaned_lines.append(cleaned_line)
179
 
180
+ text = '\n'.join(cleaned_lines)
181
 
182
+ if not text:
183
+ return {"error": f"No text content found at {url}."}
184
 
185
+ return {
186
+ "source": "web_page",
187
+ "url": url,
188
+ "content_type": "text",
189
+ "text": text
190
+ }
191
 
192
+ def _extract_tables(self, soup, url, return_raw=True):
193
+ """Extract tables from the page"""
194
+ tables = []
195
+
196
+ # Find all table elements
197
+ html_tables = soup.find_all('table')
198
+
199
+ for i, table in enumerate(html_tables):
200
+ try:
201
+ # Try to convert to a pandas DataFrame
202
+ dfs = pd.read_html(str(table))
203
+
204
+ if dfs:
205
+ # Convert each DataFrame to a dict for JSON serialization
206
+ for j, df in enumerate(dfs):
207
+ # Clean column names
208
+ df.columns = [str(col).strip() for col in df.columns]
209
+
210
+ # Convert DataFrame to dict
211
+ table_dict = {
212
+ "table_id": f"table_{i}_{j}",
213
+ "headers": df.columns.tolist(),
214
+ "rows": df.values.tolist(),
215
+ }
216
+ tables.append(table_dict)
217
+ except Exception as e:
218
+ logger.warning(f"Failed to parse table {i}: {e}")
219
+ # Try a manual extraction
220
+ try:
221
+ headers = []
222
+ header_row = table.find('tr')
223
+ if header_row:
224
+ headers = [th.get_text(strip=True) for th in header_row.find_all(['th', 'td'])]
225
+
226
+ rows = []
227
+ for tr in table.find_all('tr'):
228
+ row = [td.get_text(strip=True) for td in tr.find_all(['td', 'th'])]
229
+ if row and row != headers: # Skip header row in data
230
+ rows.append(row)
231
+
232
+ if headers or rows:
233
+ tables.append({
234
+ "table_id": f"table_{i}_manual",
235
+ "headers": headers,
236
+ "rows": rows
237
+ })
238
+ except Exception:
239
+ continue # Skip if manual extraction also fails
240
+
241
+ if return_raw:
242
+ return {
243
+ "source": "web_page",
244
+ "url": url,
245
+ "content_type": "tables",
246
+ "table_count": len(tables),
247
+ "tables": tables
248
+ }
249
+ else:
250
+ return {"tables": tables}
251
+
252
+ def _extract_lists(self, soup, url):
253
+ """Extract lists from the page"""
254
+ lists = []
255
+
256
+ # Find all ul and ol elements
257
+ for list_type in ['ul', 'ol']:
258
+ list_elements = soup.find_all(list_type, recursive=True)
259
+
260
+ for i, list_elem in enumerate(list_elements):
261
+ # Skip nested lists to avoid duplication
262
+ if list_elem.parent.name in ['li', 'ul', 'ol']:
263
+ continue
264
+
265
+ items = []
266
+ for li in list_elem.find_all('li', recursive=False):
267
+ # Get text but exclude any nested lists
268
+ for nested_list in li.find_all(['ul', 'ol']):
269
+ nested_list.decompose()
270
+
271
+ item_text = li.get_text(strip=True)
272
+ if item_text:
273
+ items.append(item_text)
274
+
275
+ if items:
276
+ lists.append({
277
+ "list_id": f"{list_type}_{i}",
278
+ "list_type": "ordered" if list_type == "ol" else "unordered",
279
+ "items": items
280
+ })
281
+
282
+ return {
283
+ "source": "web_page",
284
+ "url": url,
285
+ "content_type": "lists",
286
+ "list_count": len(lists),
287
+ "lists": lists
288
+ }
289
+
290
+ def _extract_structured_data(self, soup, url):
291
+ """Extract various types of structured data from the page"""
292
+ result = {
293
+ "source": "web_page",
294
+ "url": url,
295
+ "content_type": "structured",
296
+ "title": soup.title.string if soup.title else "",
297
+ "meta_description": "",
298
+ }
299
+
300
+ # Extract meta description
301
+ meta_desc = soup.find('meta', attrs={'name': 'description'})
302
+ if meta_desc:
303
+ result["meta_description"] = meta_desc.get('content', '')
304
+
305
+ # Extract main text content
306
+ text_result = self._extract_text(soup, url)
307
+ if "text" in text_result:
308
+ result["text"] = text_result["text"]
309
+
310
+ # Extract tables
311
+ tables_result = self._extract_tables(soup, url, return_raw=False)
312
+ result["tables"] = tables_result.get("tables", [])
313
+
314
+ # Extract lists
315
+ lists_result = self._extract_lists(soup, url)
316
+ result["lists"] = lists_result.get("lists", [])
317
+
318
+ # Extract headings for document structure
319
+ headings = []
320
+ for i, heading in enumerate(soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])):
321
+ headings.append({
322
+ "id": f"heading_{i}",
323
+ "level": int(heading.name[1]),
324
+ "text": heading.get_text(strip=True)
325
+ })
326
+ result["headings"] = headings
327
+
328
+ # Look for JSON-LD structured data
329
+ json_ld_data = []
330
+ for script in soup.find_all('script', type='application/ld+json'):
331
+ try:
332
+ json_data = json.loads(script.string)
333
+ json_ld_data.append(json_data)
334
+ except (json.JSONDecodeError, ValueError):
335
+ continue
336
+
337
+ if json_ld_data:
338
+ result["structured_data"] = json_ld_data
339
+
340
+ return result
341
+
342
+ def _extract_wikipedia_infobox(self, soup):
343
+ """Extract information from Wikipedia infobox"""
344
+ infobox = {}
345
+
346
+ # Look for the infobox table
347
+ infobox_table = soup.find('table', class_=['infobox', 'vcard'])
348
+ if infobox_table:
349
+ for row in infobox_table.find_all('tr'):
350
+ # Look for th/td pairs
351
+ header = row.find('th')
352
+ value = row.find('td')
353
+
354
+ if header and value:
355
+ key = header.get_text(strip=True)
356
+ # Clean up the value text
357
+ for sup in value.find_all('sup'):
358
+ sup.decompose() # Remove reference superscripts
359
+
360
+ val = value.get_text(strip=True)
361
+ if key and val:
362
+ infobox[key] = val
363
+
364
+ return infobox
365
+
366
+ def _extract_wikipedia_sections(self, soup):
367
+ """Extract sections and their content from Wikipedia"""
368
+ sections = []
369
+ current_section = None
370
+
371
+ # Find all headings
372
+ headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
373
+
374
+ for heading in headings:
375
+ # Skip non-content headings
376
+ if heading.get('id') in ['firstHeading', 'mw-toc-heading']:
377
+ continue
378
+
379
+ level = int(heading.name[1])
380
+ title = heading.get_text(strip=True)
381
+
382
+ # Start a new section
383
+ current_section = {
384
+ "level": level,
385
+ "title": title,
386
+ "content": ""
387
+ }
388
+
389
+ # Get content until next heading
390
+ content_elements = []
391
+ sibling = heading.next_sibling
392
+
393
+ while sibling and not (sibling.name and sibling.name.startswith('h')):
394
+ if sibling.name in ['p', 'ul', 'ol']:
395
+ content_elements.append(sibling.get_text(strip=True))
396
+ sibling = sibling.next_sibling
397
+
398
+ if content_elements:
399
+ current_section["content"] = "\n".join(content_elements)
400
+ sections.append(current_section)
401
+
402
+ return sections
403
 
404
  if __name__ == '__main__':
405
  browser = WebBrowser() # Instantiation remains the same for testing
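Below is a short usage sketch of the new extraction modes, based on the return shapes defined above; the URLs are placeholders.

# Sketch only: exercising the new extraction_mode parameter.
from src.web_browsing_tool import WebBrowser

browser = WebBrowser()

# Wikipedia article URLs go through the REST API first and return a summary payload.
summary = browser.forward("https://en.wikipedia.org/wiki/Python_(programming_language)")
if "error" not in summary:
    print(summary.get("source"))                       # e.g. "wikipedia_api"
    print(summary.get("data", {}).get("extract", ""))  # lead-section extract

# Other pages can be asked for tables, lists, or structured data.
tables = browser.forward("https://example.com/report", extraction_mode="tables")
if "error" not in tables:
    for table in tables.get("tables", []):
        print(table["table_id"], table["headers"])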
src/web_content_extractor.py ADDED
@@ -0,0 +1,410 @@
1
+ from smolagents.tools import Tool
2
+ from typing import Dict, Any, Optional
3
+ import requests
4
+ from bs4 import BeautifulSoup
5
+ import re
6
+ import json
7
+ import pandas as pd
8
+ import logging
9
+
10
+ # Set up logging
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger(__name__)
13
+
14
+ class WebContentExtractor(Tool):
15
+ """
16
+ Specialized tool for extracting structured content from specific websites.
17
+ Has optimized extractors for Wikipedia, tabular data, and common content patterns.
18
+ """
19
+ name = "web_content_extractor"
20
+ description = "Extracts structured data from websites with specialized handlers for Wikipedia and other content types."
21
+ inputs = {
22
+ 'url': {'type': 'string', 'description': 'The URL of the web page to extract content from.'},
23
+ 'target_type': {'type': 'string', 'description': 'Type of content to extract: "info", "table", "list", or "specific_data".'},
24
+ 'extraction_details': {'type': 'object', 'description': 'Additional details for extraction (e.g., table index, data label).', 'nullable': True}
25
+ }
26
+ outputs = {'result': {'type': 'object', 'description': 'The extracted content as structured data.'}}
27
+ output_type = "object"
28
+
29
+ def __init__(self, user_agent="GAIA-Agent/1.0", *args, **kwargs):
30
+ super().__init__(*args, **kwargs)
31
+ self.headers = {"User-Agent": user_agent}
32
+ self.session = requests.Session()
33
+ self.session.headers.update(self.headers)
34
+ self.is_initialized = True
35
+
36
+ def forward(self, url: str, target_type: str, extraction_details: Optional[Dict] = None) -> Dict[str, Any]:
37
+ """
38
+ Extract specific content from a web page.
39
+
40
+ Args:
41
+ url: URL of the web page
42
+ target_type: Type of content to extract ("info", "table", "list", "specific_data")
43
+ extraction_details: Additional details for extraction
44
+
45
+ Returns:
46
+ Dict with extracted content or error message
47
+ """
48
+ if not extraction_details:
49
+ extraction_details = {}
50
+
51
+ # Validate URL
52
+ if not url.startswith(('http://', 'https://')):
53
+ return {"error": f"Invalid URL format: {url}"}
54
+
55
+ try:
56
+ # For Wikipedia, use specialized extraction
57
+ if 'wikipedia.org' in url:
58
+ return self._extract_from_wikipedia(url, target_type, extraction_details)
59
+
60
+ # For general websites
61
+ response = self.session.get(url, timeout=15)
62
+ response.raise_for_status()
63
+ soup = BeautifulSoup(response.content, 'html.parser')
64
+
65
+ # Handle different extraction types
66
+ if target_type == "info":
67
+ return self._extract_general_info(soup, url)
68
+ elif target_type == "table":
69
+ return self._extract_table(soup, url, extraction_details)
70
+ elif target_type == "list":
71
+ return self._extract_list(soup, url, extraction_details)
72
+ elif target_type == "specific_data":
73
+ return self._extract_specific_data(soup, url, extraction_details)
74
+ else:
75
+ return {"error": f"Unknown extraction type: {target_type}"}
76
+
77
+ except requests.exceptions.RequestException as e:
78
+ return {"error": f"Request error: {str(e)}"}
79
+ except Exception as e:
80
+ return {"error": f"Extraction error: {str(e)}"}
81
+
82
+ def _extract_general_info(self, soup, url):
83
+ """Extract general information from a web page"""
84
+ title = soup.title.string if soup.title else "No title found"
85
+
86
+ # Try to get meta description
87
+ meta_desc = soup.find('meta', attrs={'name': 'description'})
88
+ description = meta_desc.get('content', '') if meta_desc else "No description found"
89
+
90
+ # Get main headings
91
+ main_headings = [h1.get_text(strip=True) for h1 in soup.find_all('h1')]
92
+
93
+ # Get key facts (look for definition lists, key-value pairs)
94
+ key_facts = {}
95
+ # Check for definition lists
96
+ for dl in soup.find_all('dl'):
97
+ for dt, dd in zip(dl.find_all('dt'), dl.find_all('dd')):
98
+ key = dt.get_text(strip=True)
99
+ value = dd.get_text(strip=True)
100
+ if key and value:
101
+ key_facts[key] = value
102
+
103
+ # Get text from first few paragraphs for a summary
104
+ paragraphs = soup.find_all('p')
105
+ summary = ""
106
+ para_count = 0
107
+ for p in paragraphs:
108
+ text = p.get_text(strip=True)
109
+ if len(text) > 50: # Only include substantial paragraphs
110
+ summary += text + "\n\n"
111
+ para_count += 1
112
+ if para_count >= 3: # Limit to first 3 substantial paragraphs
113
+ break
114
+
115
+ return {
116
+ "title": title,
117
+ "url": url,
118
+ "description": description,
119
+ "main_headings": main_headings,
120
+ "key_facts": key_facts,
121
+ "summary": summary.strip()
122
+ }
123
+
124
+ def _extract_table(self, soup, url, details):
125
+ """Extract table data from a web page"""
126
+ table_index = details.get('table_index', 0)
127
+
128
+ # Find all tables
129
+ tables = soup.find_all('table')
130
+
131
+ if not tables:
132
+ return {"error": "No tables found on the page"}
133
+
134
+ if table_index >= len(tables):
135
+ return {"error": f"Table index {table_index} is out of range. Found {len(tables)} tables."}
136
+
137
+ try:
138
+ # Try to use pandas to extract the table
139
+ table = tables[table_index]
140
+ dfs = pd.read_html(str(table))
141
+
142
+ if not dfs:
143
+ return {"error": "Failed to parse table with pandas"}
144
+
145
+ df = dfs[0]
146
+
147
+ # Convert to dictionary format
148
+ headers = df.columns.tolist()
149
+ rows = df.values.tolist()
150
+
151
+ return {
152
+ "table_data": {
153
+ "headers": headers,
154
+ "rows": rows
155
+ },
156
+ "row_count": len(rows),
157
+ "column_count": len(headers),
158
+ "url": url
159
+ }
160
+
161
+ except Exception as e:
162
+ # Fallback to manual extraction
163
+ logger.warning(f"Pandas table extraction failed: {e}. Falling back to manual extraction.")
164
+
165
+ table = tables[table_index]
166
+ headers = []
167
+ rows = []
168
+
169
+ # Try to find headers
170
+ thead = table.find('thead')
171
+ if thead:
172
+ header_row = thead.find('tr')
173
+ if header_row:
174
+ headers = [th.get_text(strip=True) for th in header_row.find_all(['th', 'td'])]
175
+
176
+ # If no thead, use first row as header
177
+ if not headers:
178
+ first_row = table.find('tr')
179
+ if first_row:
180
+ headers = [th.get_text(strip=True) for th in first_row.find_all(['th', 'td'])]
181
+
182
+ # Extract rows
183
+ for tr in table.find_all('tr'):
184
+ row = [td.get_text(strip=True) for td in tr.find_all(['td', 'th'])]
185
+ if row and row != headers: # Skip header row in data
186
+ rows.append(row)
187
+
188
+ return {
189
+ "table_data": {
190
+ "headers": headers,
191
+ "rows": rows
192
+ },
193
+ "row_count": len(rows),
194
+ "column_count": len(headers) if headers else (len(rows[0]) if rows else 0),
195
+ "url": url,
196
+ "extraction_method": "manual_fallback"
197
+ }
198
+
199
+ def _extract_list(self, soup, url, details):
200
+ """Extract list data from a web page"""
201
+ list_type = details.get('list_type', 'all') # 'ul', 'ol', or 'all'
202
+ position = details.get('position', 0) # Which list to extract (0-based index)
203
+
204
+ list_elements = []
205
+
206
+ if list_type == 'ul' or list_type == 'all':
207
+ list_elements.extend(soup.find_all('ul'))
208
+
209
+ if list_type == 'ol' or list_type == 'all':
210
+ list_elements.extend(soup.find_all('ol'))
211
+
212
+ if not list_elements:
213
+ return {"error": "No lists found on the page"}
214
+
215
+ if position >= len(list_elements):
216
+ return {"error": f"List position {position} is out of range. Found {len(list_elements)} lists."}
217
+
218
+ target_list = list_elements[position]
219
+ items = []
220
+
221
+ for li in target_list.find_all('li', recursive=False):
222
+ # Ignore nested lists
223
+ for nested_list in li.find_all(['ul', 'ol']):
224
+ nested_list.decompose()
225
+
226
+ item_text = li.get_text(strip=True)
227
+ if item_text:
228
+ items.append(item_text)
229
+
230
+ return {
231
+ "list_type": target_list.name, # 'ul' or 'ol'
232
+ "items": items,
233
+ "count": len(items),
234
+ "url": url
235
+ }
236
+
237
+ def _extract_specific_data(self, soup, url, details):
238
+ """Extract specific data based on given selectors or patterns"""
239
+ data_label = details.get('data_label', '')
240
+ selector = details.get('selector', '')
241
+ attribute = details.get('attribute', '')
242
+ regex_pattern = details.get('regex_pattern', '')
243
+
244
+ result = {
245
+ "url": url,
246
+ "data_label": data_label,
247
+ "found": False
248
+ }
249
+
250
+ # Try CSS selector if provided
251
+ if selector:
252
+ elements = soup.select(selector)
253
+ if elements:
254
+ result["found"] = True
255
+
256
+ if attribute:
257
+ # Extract attribute value
258
+ values = [elem.get(attribute, '') for elem in elements]
259
+ result["values"] = values
260
+ else:
261
+ # Extract text content
262
+ values = [elem.get_text(strip=True) for elem in elements]
263
+ result["values"] = values
264
+
265
+ # If only one value, simplify the result
266
+ if len(values) == 1:
267
+ result["value"] = values[0]
268
+
269
+ return result
270
+
271
+ # Try regex pattern if provided
272
+ if regex_pattern:
273
+ page_text = soup.get_text()
274
+ matches = re.findall(regex_pattern, page_text)
275
+
276
+ if matches:
277
+ result["found"] = True
278
+ result["matches"] = matches
279
+
280
+ # If only one match, simplify the result
281
+ if len(matches) == 1:
282
+ result["value"] = matches[0]
283
+
284
+ return result
285
+
286
+ # Try common patterns based on data_label
287
+ if data_label:
288
+ # Look for label in text
289
+ label_pattern = re.compile(rf'{re.escape(data_label)}\s*[:=-]?\s*([\w\s,.()-]+)', re.IGNORECASE)
290
+ page_text = soup.get_text()
291
+ match = label_pattern.search(page_text)
292
+
293
+ if match:
294
+ result["found"] = True
295
+ result["value"] = match.group(1).strip()
296
+ return result
297
+
298
+ # Look for label in headings followed by paragraph
299
+ for heading in soup.find_all(['h1', 'h2', 'h3', 'h4']):
300
+ if data_label.lower() in heading.get_text().lower():
301
+ next_sibling = heading.find_next_sibling()
302
+ if next_sibling and next_sibling.name == 'p':
303
+ result["found"] = True
304
+ result["value"] = next_sibling.get_text(strip=True)
305
+ return result
306
+
307
+ # If nothing found
308
+ return result
309
+
310
+ def _extract_from_wikipedia(self, url, target_type, details):
311
+ """Specialized extraction for Wikipedia pages using APIs when possible"""
312
+ # Extract page title from URL
313
+ title = url.split('/')[-1]
314
+
315
+ # Determine Wikipedia language
316
+ domain = url.split('//')[1].split('.')[0]
317
+
318
+ try:
319
+ # First try the Wikipedia API
320
+ api_url = f"https://{domain}.wikipedia.org/api/rest_v1/page/summary/{title}"
321
+ response = self.session.get(api_url, timeout=15)
322
+ response.raise_for_status()
323
+ api_data = response.json()
324
+
325
+ # For info requests, we can use just the API data
326
+ if target_type == "info":
327
+ return {
328
+ "title": api_data.get("title", ""),
329
+ "description": api_data.get("description", ""),
330
+ "extract": api_data.get("extract", ""),
331
+ "url": url,
332
+ "source": "wikipedia_api"
333
+ }
334
+
335
+ # For other requests, we need to fetch the HTML as well
336
+ html_response = self.session.get(url, timeout=15)
337
+ html_response.raise_for_status()
338
+ soup = BeautifulSoup(html_response.content, 'html.parser')
339
+
340
+ if target_type == "table":
341
+ # Get the infobox if requested
342
+ if details.get('infobox', False):
343
+ infobox = {}
344
+ infobox_div = soup.find('table', {'class': 'infobox'})
345
+
346
+ if infobox_div:
347
+ for row in infobox_div.find_all('tr'):
348
+ header = row.find('th')
349
+ data = row.find('td')
350
+ if header and data:
351
+ key = header.get_text(strip=True)
352
+ value = data.get_text(strip=True)
353
+ if key and value:
354
+ infobox[key] = value
355
+
356
+ return {
357
+ "title": api_data.get("title", ""),
358
+ "infobox": infobox,
359
+ "url": url,
360
+ "source": "wikipedia_infobox"
361
+ }
362
+
363
+ # Regular table extraction
364
+ return self._extract_table(soup, url, details)
365
+
366
+ elif target_type == "list":
367
+ return self._extract_list(soup, url, details)
368
+
369
+ elif target_type == "specific_data":
370
+ # Enhanced extraction for Wikipedia specific data
371
+ data_label = details.get('data_label', '')
372
+
373
+ # Try to find it in infobox first
374
+ infobox = soup.find('table', {'class': 'infobox'})
375
+ if infobox and data_label:
376
+ for row in infobox.find_all('tr'):
377
+ header = row.find('th')
378
+ if header and data_label.lower() in header.get_text().lower():
379
+ data = row.find('td')
380
+ if data:
381
+ return {
382
+ "found": True,
383
+ "value": data.get_text(strip=True),
384
+ "source": "wikipedia_infobox",
385
+ "url": url
386
+ }
387
+
388
+ # Fallback to regular specific data extraction
389
+ return self._extract_specific_data(soup, url, details)
390
+
391
+ except Exception as e:
392
+ logger.warning(f"Wikipedia API extraction failed: {e}. Falling back to HTML extraction.")
393
+
394
+ # Fallback to regular HTML extraction
395
+ try:
396
+ response = self.session.get(url, timeout=15)
397
+ response.raise_for_status()
398
+ soup = BeautifulSoup(response.content, 'html.parser')
399
+
400
+ if target_type == "info":
401
+ return self._extract_general_info(soup, url)
402
+ elif target_type == "table":
403
+ return self._extract_table(soup, url, details)
404
+ elif target_type == "list":
405
+ return self._extract_list(soup, url, details)
406
+ elif target_type == "specific_data":
407
+ return self._extract_specific_data(soup, url, details)
408
+
409
+ except Exception as fallback_error:
410
+ return {"error": f"Wikipedia extraction error: {fallback_error}"}