Yago Bolivar committed · Commit b09a8ba · 1 Parent(s): baa65ee

feat: Enhance tools with new web content extractor and improved functionality

- Added WebContentExtractor for structured content extraction from websites, including specialized handling for Wikipedia.
- Updated WebBrowser to support multiple extraction modes (text, tables, lists, structured) and improved error handling.
- Enhanced CodeExecutionTool with utility functions for web data processing and robust error handling.
- Improved logging across tools for better debugging and traceability.
- app.py +7 -4
- src/python_tool.py +211 -62
- src/web_browsing_tool.py +365 -42
- src/web_content_extractor.py +410 -0
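
Taken together, the commit adds a WebContentExtractor tool, registers it in app.py, and upgrades WebBrowser and CodeExecutionTool. A minimal smoke-test sketch of the updated tools, calling them directly rather than through the agent (class names and forward() signatures come from the diffs below; the URLs and snippet are illustrative):

```python
from src.web_browsing_tool import WebBrowser
from src.web_content_extractor import WebContentExtractor
from src.python_tool import CodeExecutionTool

browser = WebBrowser()
extractor = WebContentExtractor()
python_tool = CodeExecutionTool()

# Structured extraction from a Wikipedia article (uses the REST API path when available).
page = browser.forward("https://en.wikipedia.org/wiki/Python_(programming_language)",
                       extraction_mode="structured")

# Targeted lookup of a single infobox field.
fact = extractor.forward("https://en.wikipedia.org/wiki/Python_(programming_language)",
                         target_type="specific_data",
                         extraction_details={"data_label": "Designed by"})

# Post-process with the sandboxed Python executor.
run = python_tool.forward(code_string="result = 2 + 2\nprint(result)")

print(page.get("data", {}).get("title"), "|", fact.get("value"), "|", run.get("result_value"))
```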
app.py
CHANGED
@@ -13,12 +13,13 @@ from src.final_answer_tool import FinalAnswerTool
 from src.web_browsing_tool import WebBrowser
 from src.file_processing_tool import FileIdentifier
 from src.image_processing_tool import ImageProcessor
+from src.markdown_table_parser import MarkdownTableParserTool
 from src.python_tool import CodeExecutionTool
+from src.speech_to_text import SpeechToTextTool
 from src.spreadsheet_tool import SpreadsheetTool
 from src.text_reversal_tool import TextReversalTool
 from src.video_processing_tool import VideoProcessingTool
+from src.web_content_extractor import WebContentExtractor

 # (Keep Constants as is)
 # --- Constants ---
@@ -67,7 +68,8 @@ python_tool = CodeExecutionTool()
 speech_to_text_tool = SpeechToTextTool()  # Updated
 spreadsheet_tool = SpreadsheetTool()
 text_reversal_tool = TextReversalTool()
+video_processing_tool = VideoProcessingTool()
+web_content_extractor = WebContentExtractor()  # Instantiate the new extractor tool

 # Add debug prints for file paths
 print("Current directory:", os.getcwd())
@@ -160,7 +162,8 @@ agent_tools = [
     speech_to_text_tool,  # Updated
     spreadsheet_tool,
     text_reversal_tool,
+    video_processing_tool,
+    web_content_extractor  # Add the new tool here
 ]

 # Flatten system_prompt if it's a dict (e.g., from YAML)
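
The app.py hunks only show the tool list growing; the agent construction itself is outside the diff. For orientation, a hedged sketch of how a smolagents agent typically consumes such a list (the CodeAgent and InferenceClientModel names are assumptions about the installed smolagents version, not taken from this commit):

```python
# Assumed wiring, not shown in this diff: agent_tools feeds a smolagents agent.
from smolagents import CodeAgent, InferenceClientModel  # class names depend on the smolagents version

agent = CodeAgent(
    tools=agent_tools,             # the list assembled above, now including web_content_extractor
    model=InferenceClientModel(),  # placeholder; app.py may configure a different model
)
print(agent.run("Which tools can you use to read a Wikipedia table?"))
```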
src/python_tool.py
CHANGED
@@ -7,21 +7,32 @@ import traceback
 from typing import Dict, Any, Optional, Union, List
 from smolagents.tools import Tool
 import os
+import logging
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)

 class CodeExecutionTool(Tool):
     """
+    Executes Python code snippets safely with timeout protection.
+    Useful for data processing, analysis, and transformation.
+    Includes special utilities for web data processing and robust error handling.
     """
+    name = "python_executor"
+    description = "Safely executes Python code with enhancements for data processing, parsing, and error recovery."
     inputs = {
+        'code_string': {'type': 'string', 'description': 'The Python code to execute.', 'nullable': True},
+        'filepath': {'type': 'string', 'description': 'Path to a Python file to execute.', 'nullable': True}
+    }
+    outputs = {
+        'success': {'type': 'boolean', 'description': 'Whether the code executed successfully.'},
+        'output': {'type': 'string', 'description': 'The captured stdout or formatted result.', 'nullable': True},
+        'error': {'type': 'string', 'description': 'Error message if execution failed.', 'nullable': True},
+        'result_value': {'type': 'any', 'description': 'The final expression value if applicable.', 'nullable': True}
     }
     output_type = "object"
+
     def __init__(self, timeout: int = 10, max_output_size: int = 20000, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.timeout = timeout
@@ -31,6 +42,124 @@ class CodeExecutionTool(Tool):
             'pickle', 'requests', 'socket', 'shutil', 'ctypes', 'multiprocessing'
         ]
         self.is_initialized = True
+        # Add utility functions that will be available to executed code
+        self._utility_functions = self._get_utility_functions()
+
+    def _get_utility_functions(self):
+        """Define utility functions that will be available in the executed code"""
+        utility_code = """
+# Utility functions for web data processing
+def extract_pattern(text, pattern, group=0, all_matches=False):
+    """
+    Extract data using regex pattern from text.
+    Args:
+        text (str): Text to search in
+        pattern (str): Regex pattern to use
+        group (int): Capture group to return (default 0 - entire match)
+        all_matches (bool): If True, return all matches, otherwise just first
+    Returns:
+        Matched string(s) or None if no match
+    """
+    import re
+    if not text or not pattern:
+        print("Warning: Empty text or pattern provided to extract_pattern")
+        return None
+
+    try:
+        matches = re.finditer(pattern, text)
+        results = [m.group(group) if group < len(m.groups())+1 else m.group(0) for m in matches]
+
+        if not results:
+            print(f"No matches found for pattern '{pattern}'")
+            return None
+
+        if all_matches:
+            return results
+        else:
+            return results[0]
+    except Exception as e:
+        print(f"Error in extract_pattern: {e}")
+        return None
+
+def clean_text(text, remove_extra_whitespace=True, remove_special_chars=False):
+    """
+    Clean text by removing extra whitespace and optionally special characters.
+    Args:
+        text (str): Text to clean
+        remove_extra_whitespace (bool): If True, replace multiple spaces with single space
+        remove_special_chars (bool): If True, remove special characters
+    Returns:
+        Cleaned string
+    """
+    import re
+    if not text:
+        return ""
+
+    # Replace newlines and tabs with spaces
+    text = re.sub(r'[\\n\\t\\r]+', ' ', text)
+
+    if remove_special_chars:
+        # Keep only alphanumeric, spaces, and basic punctuation
+        text = re.sub(r'[^\\w\\s.,;:!?\'"()-]', '', text)
+
+    if remove_extra_whitespace:
+        # Replace multiple spaces with single space
+        text = re.sub(r'\\s+', ' ', text)
+
+    return text.strip()
+
+def parse_table_text(table_text):
+    """
+    Parse table-like text into list of rows
+    Args:
+        table_text (str): Text containing table-like data
+    Returns:
+        List of rows (each row is a list of cells)
+    """
+    rows = []
+    lines = table_text.strip().split('\\n')
+
+    for line in lines:
+        # Skip empty lines
+        if not line.strip():
+            continue
+
+        # Split by whitespace or common separators
+        cells = re.split(r'\\s{2,}|\\t+|\\|+', line.strip())
+        # Clean up cells
+        cells = [cell.strip() for cell in cells if cell.strip()]
+
+        if cells:
+            rows.append(cells)
+
+    # Print parsing result for debugging
+    print(f"Parsed {len(rows)} rows from table text")
+    if rows and len(rows) > 0:
+        print(f"First row (columns: {len(rows[0])}): {rows[0]}")
+
+    return rows
+
+def safe_float(text):
+    """
+    Safely convert text to float, handling various formats.
+    Args:
+        text (str): Text to convert
+    Returns:
+        float or None if conversion fails
+    """
+    if not text:
+        return None
+
+    # Remove currency symbols, commas in numbers, etc.
+    text = re.sub(r'[^0-9.-]', '', str(text))
+
+    try:
+        return float(text)
+    except ValueError:
+        print(f"Warning: Could not convert '{text}' to float")
+        return None
+"""
+        return utility_code

     def _analyze_code_safety(self, code: str) -> Dict[str, Any]:
         """Perform static analysis to check for potentially harmful code."""
@@ -68,34 +197,40 @@ class CodeExecutionTool(Tool):
             return {"safe": True}
         except SyntaxError:
             return {"safe": False, "reason": "Invalid Python syntax"}
+
     def _timeout_handler(self, signum, frame):
         """Handler for timeout signal."""
+        raise TimeoutError(f"Code execution timed out after {self.timeout} seconds")
+
     def _extract_numeric_value(self, output: str) -> Optional[Union[int, float]]:
         """Extract the final numeric value from output."""
+        if not output:
+            return None

+        # Look for the last line that contains a number
+        lines = output.strip().split('\n')
         for line in reversed(lines):
+            # Try to interpret it as a pure number
+            line = line.strip()
             try:
+                if '.' in line:
+                    return float(line)
+                else:
+                    return int(line)
             except ValueError:
+                # Not a pure number, try to extract numbers with regex
+                match = re.search(r'[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?$', line)
+                if match:
+                    num_str = match.group(0)
+                    try:
+                        if '.' in num_str:
+                            return float(num_str)
+                        else:
+                            return int(num_str)
+                    except ValueError:
+                        pass
         return None
+
     def forward(self, code_string: Optional[str] = None, filepath: Optional[str] = None) -> Dict[str, Any]:
         if not code_string and not filepath:
             return {"success": False, "error": "No code string or filepath provided."}
@@ -116,56 +251,70 @@ class CodeExecutionTool(Tool):
         elif code_string:
             code_to_execute = code_string

+        # Inject utility functions
+        enhanced_code = self._utility_functions + "\n\n" + code_to_execute
+
+        return self._execute_actual_code(enhanced_code)

     def _execute_actual_code(self, code: str) -> Dict[str, Any]:
         """Execute Python code and capture the output or error."""
         safety_check = self._analyze_code_safety(code)
         if not safety_check["safe"]:
+            return {
+                "success": False,
+                "error": f"Safety check failed: {safety_check['reason']}"
+            }
+
+        # Capture stdout and execute the code with a timeout
+        stdout_buffer = io.StringIO()
+        result_value = None
+
         try:
+            # Set timeout handler
+            signal.signal(signal.SIGALRM, self._timeout_handler)
+            signal.alarm(self.timeout)
+
+            # Execute code and capture stdout
+            with contextlib.redirect_stdout(stdout_buffer):
+                # Execute the code within a new dictionary for local variables
+                local_vars = {}
+                exec(code, {}, local_vars)
+
+                # Try to extract the result from common variable names
+                for var_name in ['result', 'answer', 'output', 'value', 'final_result', 'data']:
+                    if var_name in local_vars:
+                        result_value = local_vars[var_name]
+                        break
+
+            # Reset the alarm
+            signal.alarm(0)

+            # Get the captured output
+            output = stdout_buffer.getvalue()
             if len(output) > self.max_output_size:
+                output = output[:self.max_output_size] + f"\n... (output truncated, exceeded {self.max_output_size} characters)"

+            # If no result_value was found, try to extract a numeric value from the output
+            if result_value is None:
+                result_value = self._extract_numeric_value(output)

             return {
+                "success": True,
                 "output": output,
+                "result_value": result_value
             }
+
+        except TimeoutError as e:
+            signal.alarm(0)  # Reset the alarm
+            return {"success": False, "error": f"Code execution timed out after {self.timeout} seconds"}
         except Exception as e:
+            signal.alarm(0)  # Reset the alarm
+            trace = traceback.format_exc()
+            error_msg = f"Error executing code: {str(e)}\n{trace}"
+            return {"success": False, "error": error_msg}
         finally:
+            # Ensure the alarm is reset
+            signal.alarm(0)

     # Kept execute_file and execute_code as helper methods if direct access is ever needed,
     # but they now call the main _execute_actual_code method.
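
Because forward() now prepends the utility source to every snippet, agent-generated code can call extract_pattern, clean_text, parse_table_text and safe_float without defining them. A minimal sketch of that flow (the input string is invented, and it assumes the injected utility block compiles as committed):

```python
tool = CodeExecutionTool(timeout=5)

snippet = r"""
raw = "Price:  $1,299.99   (March  2024)"
cleaned = clean_text(raw)                                    # injected utility
price = safe_float(extract_pattern(cleaned, r'\$[\d,.]+'))   # injected utilities
result = price                                               # picked up via the 'result' convention
print(result)
"""

outcome = tool.forward(code_string=snippet)
print(outcome["success"], outcome.get("result_value"))       # e.g. True 1299.99
```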
src/web_browsing_tool.py
CHANGED
@@ -1,17 +1,31 @@
 import requests
 from bs4 import BeautifulSoup
 from smolagents.tools import Tool
+import re
+import json
+import logging
+import time
+from urllib.parse import urlparse, urljoin
+import pandas as pd
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)

 class WebBrowser(Tool):
     """
     Retrieves information from online sources by browsing web pages.
+    Useful for extracting or summarizing web content, with special handling for structured data.
+    Can extract tables, lists, and key information from web pages.
     """
     name = "web_browser"
+    description = "Fetches content from web pages with improved structured data handling. Has specialized extraction for Wikipedia. Returns text content or structured data."
+    inputs = {
+        'url': {'type': 'string', 'description': 'The URL of the web page to browse.'},
+        'extraction_mode': {'type': 'string', 'description': 'Mode for data extraction: "text" (default), "tables", "lists", or "structured".', 'nullable': True}
+    }
+    outputs = {'content': {'type': 'object', 'description': 'The extracted content from the web page, either as text or structured data.'}}
+    output_type = "object"

     def __init__(self, user_agent="GAIA-Agent/1.0", *args, **kwargs):
         """
@@ -21,62 +35,371 @@ class WebBrowser(Tool):
         """
         super().__init__(*args, **kwargs)
         self.headers = {"User-Agent": user_agent}
+        self.is_initialized = True
+        # Add a session to maintain cookies
+        self.session = requests.Session()
+        self.session.headers.update(self.headers)

+    def forward(self, url: str, extraction_mode: str = "text") -> dict:
         """
+        Fetches the content of a web page and extracts information based on the specified mode.

         Args:
             url (str): The URL of the web page to browse.
+            extraction_mode (str): The mode for data extraction - "text" (default), "tables", "lists", or "structured"

         Returns:
+            dict: The extracted content or an error message
         """
+        # Validate URL
         if not url.startswith(('http://', 'https://')):
+            return {"error": f"Invalid URL format. URL must start with http:// or https://. Received: {url}"}

         try:
+            # Check if it's Wikipedia and use special handling
+            if 'wikipedia.org' in url:
+                return self._handle_wikipedia(url, extraction_mode)
+
+            # Process normal web pages
+            return self._process_regular_webpage(url, extraction_mode)
+
+        except requests.exceptions.HTTPError as http_err:
+            return {"error": f"HTTP error occurred while fetching {url}: {http_err}"}
+        except requests.exceptions.ConnectionError as conn_err:
+            return {"error": f"Connection error occurred while fetching {url}: {conn_err}"}
+        except requests.exceptions.Timeout as timeout_err:
+            return {"error": f"Timeout occurred while fetching {url}: {timeout_err}"}
+        except requests.exceptions.RequestException as req_err:
+            return {"error": f"An unexpected error occurred while fetching {url}: {req_err}"}
+        except Exception as e:
+            return {"error": f"An unexpected error occurred during parsing of {url}: {e}"}

+    def _process_regular_webpage(self, url, extraction_mode):
+        """Process a regular (non-Wikipedia) webpage"""
+        response = self.session.get(url, timeout=15)
+        response.raise_for_status()
+
+        # Use BeautifulSoup to parse the HTML content
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        # Remove script and style elements
+        for script_or_style in soup(["script", "style"]):
+            script_or_style.decompose()
+
+        if extraction_mode == "text":
+            return self._extract_text(soup, url)
+        elif extraction_mode == "tables":
+            return self._extract_tables(soup, url)
+        elif extraction_mode == "lists":
+            return self._extract_lists(soup, url)
+        elif extraction_mode == "structured":
+            return self._extract_structured_data(soup, url)
+        else:
+            return {"error": f"Unknown extraction mode: {extraction_mode}"}

+    def _handle_wikipedia(self, url, extraction_mode):
+        """Special handling for Wikipedia pages"""
+        # For Wikipedia, try to use the API instead of scraping the HTML
+        parsed_url = urlparse(url)
+        if not parsed_url.netloc.endswith('wikipedia.org'):
+            return self._process_regular_webpage(url, extraction_mode)
+
+        # Extract the title from the URL path
+        path_parts = parsed_url.path.split('/')
+        if len(path_parts) < 3 or path_parts[1] != 'wiki':
+            # Not a standard Wikipedia article URL
+            return self._process_regular_webpage(url, extraction_mode)
+
+        title = path_parts[2]
+        lang = parsed_url.netloc.split('.')[0]
+
+        # Use Wikipedia API to get structured content
+        api_url = f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{title}"
+
+        try:
+            logger.info(f"Fetching Wikipedia API data from {api_url}")
+            api_response = self.session.get(api_url, timeout=15)
+            api_response.raise_for_status()
+            api_data = api_response.json()
+
+            # Basic information from the API
+            wiki_data = {
+                "title": api_data.get("title", ""),
+                "description": api_data.get("description", ""),
+                "extract": api_data.get("extract", ""),
+                "url": api_data.get("content_urls", {}).get("desktop", {}).get("page", url)
+            }
+
+            # If we need more detailed data beyond the summary
+            if extraction_mode in ["tables", "structured"]:
+                # Get the full HTML anyway for tables and other structured data
+                response = self.session.get(url, timeout=15)
+                response.raise_for_status()
+                soup = BeautifulSoup(response.content, 'html.parser')
+
+                # Add tables to the response
+                tables = self._extract_tables(soup, url, return_raw=False)
+                wiki_data["tables"] = tables.get("tables", [])
+
+                # For "structured" mode, add sections, infobox and other elements
+                if extraction_mode == "structured":
+                    wiki_data["infobox"] = self._extract_wikipedia_infobox(soup)
+                    wiki_data["sections"] = self._extract_wikipedia_sections(soup)
+
+                return {
+                    "source": "wikipedia_api_enhanced",
+                    "url": url,
+                    "data": wiki_data
+                }
+
+            # For basic text, return the API data
+            return {
+                "source": "wikipedia_api",
+                "url": url,
+                "data": wiki_data
+            }
+
+        except (requests.exceptions.RequestException, ValueError) as e:
+            logger.warning(f"Wikipedia API request failed: {e}. Falling back to HTML scraping.")
+            # Fallback to normal HTML processing
+            return self._process_regular_webpage(url, extraction_mode)

+    def _extract_text(self, soup, url):
+        """Extract clean text from the page"""
+        text_from_soup = soup.get_text(separator='\n', strip=True)

+        # Convert multiple newlines to a single newline and clean spaces within lines
+        cleaned_lines = []
+        for line in text_from_soup.splitlines():
+            line = line.strip()  # Strip leading/trailing whitespace
+            if line:  # Only process non-empty lines
+                # Replace multiple spaces with a single space
+                cleaned_line = ' '.join(line.split())
+                cleaned_lines.append(cleaned_line)

+        text = '\n'.join(cleaned_lines)

+        if not text:
+            return {"error": f"No text content found at {url}."}

+        return {
+            "source": "web_page",
+            "url": url,
+            "content_type": "text",
+            "text": text
+        }

+    def _extract_tables(self, soup, url, return_raw=True):
+        """Extract tables from the page"""
+        tables = []
+
+        # Find all table elements
+        html_tables = soup.find_all('table')
+
+        for i, table in enumerate(html_tables):
+            try:
+                # Try to convert to a pandas DataFrame
+                dfs = pd.read_html(str(table))
+
+                if dfs:
+                    # Convert each DataFrame to a dict for JSON serialization
+                    for j, df in enumerate(dfs):
+                        # Clean column names
+                        df.columns = [str(col).strip() for col in df.columns]
+
+                        # Convert DataFrame to dict
+                        table_dict = {
+                            "table_id": f"table_{i}_{j}",
+                            "headers": df.columns.tolist(),
+                            "rows": df.values.tolist(),
+                        }
+                        tables.append(table_dict)
+            except Exception as e:
+                logger.warning(f"Failed to parse table {i}: {e}")
+                # Try a manual extraction
+                try:
+                    headers = []
+                    header_row = table.find('tr')
+                    if header_row:
+                        headers = [th.get_text(strip=True) for th in header_row.find_all(['th', 'td'])]
+
+                    rows = []
+                    for tr in table.find_all('tr'):
+                        row = [td.get_text(strip=True) for td in tr.find_all(['td', 'th'])]
+                        if row and row != headers:  # Skip header row in data
+                            rows.append(row)
+
+                    if headers or rows:
+                        tables.append({
+                            "table_id": f"table_{i}_manual",
+                            "headers": headers,
+                            "rows": rows
+                        })
+                except Exception:
+                    continue  # Skip if manual extraction also fails
+
+        if return_raw:
+            return {
+                "source": "web_page",
+                "url": url,
+                "content_type": "tables",
+                "table_count": len(tables),
+                "tables": tables
+            }
+        else:
+            return {"tables": tables}
+
+    def _extract_lists(self, soup, url):
+        """Extract lists from the page"""
+        lists = []
+
+        # Find all ul and ol elements
+        for list_type in ['ul', 'ol']:
+            list_elements = soup.find_all(list_type, recursive=True)
+
+            for i, list_elem in enumerate(list_elements):
+                # Skip nested lists to avoid duplication
+                if list_elem.parent.name in ['li', 'ul', 'ol']:
+                    continue
+
+                items = []
+                for li in list_elem.find_all('li', recursive=False):
+                    # Get text but exclude any nested lists
+                    for nested_list in li.find_all(['ul', 'ol']):
+                        nested_list.decompose()
+
+                    item_text = li.get_text(strip=True)
+                    if item_text:
+                        items.append(item_text)
+
+                if items:
+                    lists.append({
+                        "list_id": f"{list_type}_{i}",
+                        "list_type": "ordered" if list_type == "ol" else "unordered",
+                        "items": items
+                    })
+
+        return {
+            "source": "web_page",
+            "url": url,
+            "content_type": "lists",
+            "list_count": len(lists),
+            "lists": lists
+        }
+
+    def _extract_structured_data(self, soup, url):
+        """Extract various types of structured data from the page"""
+        result = {
+            "source": "web_page",
+            "url": url,
+            "content_type": "structured",
+            "title": soup.title.string if soup.title else "",
+            "meta_description": "",
+        }
+
+        # Extract meta description
+        meta_desc = soup.find('meta', attrs={'name': 'description'})
+        if meta_desc:
+            result["meta_description"] = meta_desc.get('content', '')
+
+        # Extract main text content
+        text_result = self._extract_text(soup, url)
+        if "text" in text_result:
+            result["text"] = text_result["text"]
+
+        # Extract tables
+        tables_result = self._extract_tables(soup, url, return_raw=False)
+        result["tables"] = tables_result.get("tables", [])
+
+        # Extract lists
+        lists_result = self._extract_lists(soup, url)
+        result["lists"] = lists_result.get("lists", [])
+
+        # Extract headings for document structure
+        headings = []
+        for i, heading in enumerate(soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])):
+            headings.append({
+                "id": f"heading_{i}",
+                "level": int(heading.name[1]),
+                "text": heading.get_text(strip=True)
+            })
+        result["headings"] = headings
+
+        # Look for JSON-LD structured data
+        json_ld_data = []
+        for script in soup.find_all('script', type='application/ld+json'):
+            try:
+                json_data = json.loads(script.string)
+                json_ld_data.append(json_data)
+            except (json.JSONDecodeError, ValueError):
+                continue
+
+        if json_ld_data:
+            result["structured_data"] = json_ld_data
+
+        return result
+
+    def _extract_wikipedia_infobox(self, soup):
+        """Extract information from Wikipedia infobox"""
+        infobox = {}
+
+        # Look for the infobox table
+        infobox_table = soup.find('table', class_=['infobox', 'vcard'])
+        if infobox_table:
+            for row in infobox_table.find_all('tr'):
+                # Look for th/td pairs
+                header = row.find('th')
+                value = row.find('td')
+
+                if header and value:
+                    key = header.get_text(strip=True)
+                    # Clean up the value text
+                    for sup in value.find_all('sup'):
+                        sup.decompose()  # Remove reference superscripts
+
+                    val = value.get_text(strip=True)
+                    if key and val:
+                        infobox[key] = val
+
+        return infobox
+
+    def _extract_wikipedia_sections(self, soup):
+        """Extract sections and their content from Wikipedia"""
+        sections = []
+        current_section = None
+
+        # Find all headings
+        headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
+
+        for heading in headings:
+            # Skip non-content headings
+            if heading.get('id') in ['firstHeading', 'mw-toc-heading']:
+                continue
+
+            level = int(heading.name[1])
+            title = heading.get_text(strip=True)
+
+            # Start a new section
+            current_section = {
+                "level": level,
+                "title": title,
+                "content": ""
+            }
+
+            # Get content until next heading
+            content_elements = []
+            sibling = heading.next_sibling
+
+            while sibling and not (sibling.name and sibling.name.startswith('h')):
+                if sibling.name in ['p', 'ul', 'ol']:
+                    content_elements.append(sibling.get_text(strip=True))
+                sibling = sibling.next_sibling
+
+            if content_elements:
+                current_section["content"] = "\n".join(content_elements)
+                sections.append(current_section)
+
+        return sections

 if __name__ == '__main__':
     browser = WebBrowser()  # Instantiation remains the same for testing
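
A short consumption sketch for the new extraction modes (the URLs are illustrative; the dictionary shapes follow the methods above):

```python
browser = WebBrowser(user_agent="GAIA-Agent/1.0")

# Plain text of an arbitrary page.
page = browser.forward("https://example.com", extraction_mode="text")
if "error" in page:
    print("fetch failed:", page["error"])
else:
    print(page["content_type"], len(page["text"]), "characters")

# Wikipedia goes through the REST summary API first, falling back to HTML scraping.
wiki = browser.forward("https://en.wikipedia.org/wiki/Alan_Turing", extraction_mode="tables")
for table in wiki.get("data", {}).get("tables", []):
    print(table["table_id"], table["headers"])
```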
src/web_content_extractor.py
ADDED
@@ -0,0 +1,410 @@
+from smolagents.tools import Tool
+from typing import Dict, Any, Optional
+import requests
+from bs4 import BeautifulSoup
+import re
+import json
+import pandas as pd
+import logging
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class WebContentExtractor(Tool):
+    """
+    Specialized tool for extracting structured content from specific websites.
+    Has optimized extractors for Wikipedia, tabular data, and common content patterns.
+    """
+    name = "web_content_extractor"
+    description = "Extracts structured data from websites with specialized handlers for Wikipedia and other content types."
+    inputs = {
+        'url': {'type': 'string', 'description': 'The URL of the web page to extract content from.'},
+        'target_type': {'type': 'string', 'description': 'Type of content to extract: "info", "table", "list", or "specific_data".'},
+        'extraction_details': {'type': 'object', 'description': 'Additional details for extraction (e.g., table index, data label).', 'nullable': True}
+    }
+    outputs = {'result': {'type': 'object', 'description': 'The extracted content as structured data.'}}
+    output_type = "object"
+
+    def __init__(self, user_agent="GAIA-Agent/1.0", *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.headers = {"User-Agent": user_agent}
+        self.session = requests.Session()
+        self.session.headers.update(self.headers)
+        self.is_initialized = True
+
+    def forward(self, url: str, target_type: str, extraction_details: Optional[Dict] = None) -> Dict[str, Any]:
+        """
+        Extract specific content from a web page.
+
+        Args:
+            url: URL of the web page
+            target_type: Type of content to extract ("info", "table", "list", "specific_data")
+            extraction_details: Additional details for extraction
+
+        Returns:
+            Dict with extracted content or error message
+        """
+        if not extraction_details:
+            extraction_details = {}
+
+        # Validate URL
+        if not url.startswith(('http://', 'https://')):
+            return {"error": f"Invalid URL format: {url}"}
+
+        try:
+            # For Wikipedia, use specialized extraction
+            if 'wikipedia.org' in url:
+                return self._extract_from_wikipedia(url, target_type, extraction_details)
+
+            # For general websites
+            response = self.session.get(url, timeout=15)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Handle different extraction types
+            if target_type == "info":
+                return self._extract_general_info(soup, url)
+            elif target_type == "table":
+                return self._extract_table(soup, url, extraction_details)
+            elif target_type == "list":
+                return self._extract_list(soup, url, extraction_details)
+            elif target_type == "specific_data":
+                return self._extract_specific_data(soup, url, extraction_details)
+            else:
+                return {"error": f"Unknown extraction type: {target_type}"}
+
+        except requests.exceptions.RequestException as e:
+            return {"error": f"Request error: {str(e)}"}
+        except Exception as e:
+            return {"error": f"Extraction error: {str(e)}"}
+
+    def _extract_general_info(self, soup, url):
+        """Extract general information from a web page"""
+        title = soup.title.string if soup.title else "No title found"
+
+        # Try to get meta description
+        meta_desc = soup.find('meta', attrs={'name': 'description'})
+        description = meta_desc.get('content', '') if meta_desc else "No description found"
+
+        # Get main headings
+        main_headings = [h1.get_text(strip=True) for h1 in soup.find_all('h1')]
+
+        # Get key facts (look for definition lists, key-value pairs)
+        key_facts = {}
+        # Check for definition lists
+        for dl in soup.find_all('dl'):
+            for dt, dd in zip(dl.find_all('dt'), dl.find_all('dd')):
+                key = dt.get_text(strip=True)
+                value = dd.get_text(strip=True)
+                if key and value:
+                    key_facts[key] = value
+
+        # Get text from first few paragraphs for a summary
+        paragraphs = soup.find_all('p')
+        summary = ""
+        para_count = 0
+        for p in paragraphs:
+            text = p.get_text(strip=True)
+            if len(text) > 50:  # Only include substantial paragraphs
+                summary += text + "\n\n"
+                para_count += 1
+                if para_count >= 3:  # Limit to first 3 substantial paragraphs
+                    break
+
+        return {
+            "title": title,
+            "url": url,
+            "description": description,
+            "main_headings": main_headings,
+            "key_facts": key_facts,
+            "summary": summary.strip()
+        }
+
+    def _extract_table(self, soup, url, details):
+        """Extract table data from a web page"""
+        table_index = details.get('table_index', 0)
+
+        # Find all tables
+        tables = soup.find_all('table')
+
+        if not tables:
+            return {"error": "No tables found on the page"}
+
+        if table_index >= len(tables):
+            return {"error": f"Table index {table_index} is out of range. Found {len(tables)} tables."}
+
+        try:
+            # Try to use pandas to extract the table
+            table = tables[table_index]
+            dfs = pd.read_html(str(table))
+
+            if not dfs:
+                return {"error": "Failed to parse table with pandas"}
+
+            df = dfs[0]
+
+            # Convert to dictionary format
+            headers = df.columns.tolist()
+            rows = df.values.tolist()
+
+            return {
+                "table_data": {
+                    "headers": headers,
+                    "rows": rows
+                },
+                "row_count": len(rows),
+                "column_count": len(headers),
+                "url": url
+            }
+
+        except Exception as e:
+            # Fallback to manual extraction
+            logger.warning(f"Pandas table extraction failed: {e}. Falling back to manual extraction.")
+
+            table = tables[table_index]
+            headers = []
+            rows = []
+
+            # Try to find headers
+            thead = table.find('thead')
+            if thead:
+                header_row = thead.find('tr')
+                if header_row:
+                    headers = [th.get_text(strip=True) for th in header_row.find_all(['th', 'td'])]
+
+            # If no thead, use first row as header
+            if not headers:
+                first_row = table.find('tr')
+                if first_row:
+                    headers = [th.get_text(strip=True) for th in first_row.find_all(['th', 'td'])]
+
+            # Extract rows
+            for tr in table.find_all('tr'):
+                row = [td.get_text(strip=True) for td in tr.find_all(['td', 'th'])]
+                if row and row != headers:  # Skip header row in data
+                    rows.append(row)
+
+            return {
+                "table_data": {
+                    "headers": headers,
+                    "rows": rows
+                },
+                "row_count": len(rows),
+                "column_count": len(headers) if headers else (len(rows[0]) if rows else 0),
+                "url": url,
+                "extraction_method": "manual_fallback"
+            }
+
+    def _extract_list(self, soup, url, details):
+        """Extract list data from a web page"""
+        list_type = details.get('list_type', 'all')  # 'ul', 'ol', or 'all'
+        position = details.get('position', 0)  # Which list to extract (0-based index)
+
+        list_elements = []
+
+        if list_type == 'ul' or list_type == 'all':
+            list_elements.extend(soup.find_all('ul'))
+
+        if list_type == 'ol' or list_type == 'all':
+            list_elements.extend(soup.find_all('ol'))
+
+        if not list_elements:
+            return {"error": "No lists found on the page"}
+
+        if position >= len(list_elements):
+            return {"error": f"List position {position} is out of range. Found {len(list_elements)} lists."}
+
+        target_list = list_elements[position]
+        items = []
+
+        for li in target_list.find_all('li', recursive=False):
+            # Ignore nested lists
+            for nested_list in li.find_all(['ul', 'ol']):
+                nested_list.decompose()
+
+            item_text = li.get_text(strip=True)
+            if item_text:
+                items.append(item_text)
+
+        return {
+            "list_type": target_list.name,  # 'ul' or 'ol'
+            "items": items,
+            "count": len(items),
+            "url": url
+        }
+
+    def _extract_specific_data(self, soup, url, details):
+        """Extract specific data based on given selectors or patterns"""
+        data_label = details.get('data_label', '')
+        selector = details.get('selector', '')
+        attribute = details.get('attribute', '')
+        regex_pattern = details.get('regex_pattern', '')
+
+        result = {
+            "url": url,
+            "data_label": data_label,
+            "found": False
+        }
+
+        # Try CSS selector if provided
+        if selector:
+            elements = soup.select(selector)
+            if elements:
+                result["found"] = True
+
+                if attribute:
+                    # Extract attribute value
+                    values = [elem.get(attribute, '') for elem in elements]
+                    result["values"] = values
+                else:
+                    # Extract text content
+                    values = [elem.get_text(strip=True) for elem in elements]
+                    result["values"] = values
+
+                # If only one value, simplify the result
+                if len(values) == 1:
+                    result["value"] = values[0]
+
+                return result
+
+        # Try regex pattern if provided
+        if regex_pattern:
+            page_text = soup.get_text()
+            matches = re.findall(regex_pattern, page_text)
+
+            if matches:
+                result["found"] = True
+                result["matches"] = matches
+
+                # If only one match, simplify the result
+                if len(matches) == 1:
+                    result["value"] = matches[0]
+
+                return result
+
+        # Try common patterns based on data_label
+        if data_label:
+            # Look for label in text
+            label_pattern = re.compile(rf'{re.escape(data_label)}\s*[:=-]?\s*([\w\s,.()-]+)', re.IGNORECASE)
+            page_text = soup.get_text()
+            match = label_pattern.search(page_text)
+
+            if match:
+                result["found"] = True
+                result["value"] = match.group(1).strip()
+                return result
+
+            # Look for label in headings followed by paragraph
+            for heading in soup.find_all(['h1', 'h2', 'h3', 'h4']):
+                if data_label.lower() in heading.get_text().lower():
+                    next_sibling = heading.find_next_sibling()
+                    if next_sibling and next_sibling.name == 'p':
+                        result["found"] = True
+                        result["value"] = next_sibling.get_text(strip=True)
+                        return result
+
+        # If nothing found
+        return result
+
+    def _extract_from_wikipedia(self, url, target_type, details):
+        """Specialized extraction for Wikipedia pages using APIs when possible"""
+        # Extract page title from URL
+        title = url.split('/')[-1]
+
+        # Determine Wikipedia language
+        domain = url.split('//')[1].split('.')[0]
+
+        try:
+            # First try the Wikipedia API
+            api_url = f"https://{domain}.wikipedia.org/api/rest_v1/page/summary/{title}"
+            response = self.session.get(api_url, timeout=15)
+            response.raise_for_status()
+            api_data = response.json()
+
+            # For info requests, we can use just the API data
+            if target_type == "info":
+                return {
+                    "title": api_data.get("title", ""),
+                    "description": api_data.get("description", ""),
+                    "extract": api_data.get("extract", ""),
+                    "url": url,
+                    "source": "wikipedia_api"
+                }
+
+            # For other requests, we need to fetch the HTML as well
+            html_response = self.session.get(url, timeout=15)
+            html_response.raise_for_status()
+            soup = BeautifulSoup(html_response.content, 'html.parser')
+
+            if target_type == "table":
+                # Get the infobox if requested
+                if details.get('infobox', False):
+                    infobox = {}
+                    infobox_div = soup.find('table', {'class': 'infobox'})
+
+                    if infobox_div:
+                        for row in infobox_div.find_all('tr'):
+                            header = row.find('th')
+                            data = row.find('td')
+                            if header and data:
+                                key = header.get_text(strip=True)
+                                value = data.get_text(strip=True)
+                                if key and value:
+                                    infobox[key] = value
+
+                    return {
+                        "title": api_data.get("title", ""),
+                        "infobox": infobox,
+                        "url": url,
+                        "source": "wikipedia_infobox"
+                    }
+
+                # Regular table extraction
+                return self._extract_table(soup, url, details)
+
+            elif target_type == "list":
+                return self._extract_list(soup, url, details)
+
+            elif target_type == "specific_data":
+                # Enhanced extraction for Wikipedia specific data
+                data_label = details.get('data_label', '')
+
+                # Try to find it in infobox first
+                infobox = soup.find('table', {'class': 'infobox'})
+                if infobox and data_label:
+                    for row in infobox.find_all('tr'):
+                        header = row.find('th')
+                        if header and data_label.lower() in header.get_text().lower():
+                            data = row.find('td')
+                            if data:
+                                return {
+                                    "found": True,
+                                    "value": data.get_text(strip=True),
+                                    "source": "wikipedia_infobox",
+                                    "url": url
+                                }
+
+                # Fallback to regular specific data extraction
+                return self._extract_specific_data(soup, url, details)
+
+        except Exception as e:
+            logger.warning(f"Wikipedia API extraction failed: {e}. Falling back to HTML extraction.")
+
+            # Fallback to regular HTML extraction
+            try:
+                response = self.session.get(url, timeout=15)
+                response.raise_for_status()
+                soup = BeautifulSoup(response.content, 'html.parser')
+
+                if target_type == "info":
+                    return self._extract_general_info(soup, url)
+                elif target_type == "table":
+                    return self._extract_table(soup, url, details)
+                elif target_type == "list":
+                    return self._extract_list(soup, url, details)
+                elif target_type == "specific_data":
+                    return self._extract_specific_data(soup, url, details)
+
+            except Exception as fallback_error:
+                return {"error": f"Wikipedia extraction error: {fallback_error}"}
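
A usage sketch for the new extractor tool (URLs and labels are illustrative, chosen only to exercise each target_type):

```python
extractor = WebContentExtractor()

# General page summary.
info = extractor.forward("https://en.wikipedia.org/wiki/Ada_Lovelace", target_type="info")
print(info.get("title"), "-", info.get("description"))

# A table selected by index.
table = extractor.forward("https://en.wikipedia.org/wiki/Women_in_computing",
                          target_type="table",
                          extraction_details={"table_index": 0})
print(table.get("row_count"), "rows,", table.get("column_count"), "columns")

# A single labelled fact, resolved from the infobox when possible.
fact = extractor.forward("https://en.wikipedia.org/wiki/Ada_Lovelace",
                         target_type="specific_data",
                         extraction_details={"data_label": "Born"})
print(fact.get("found"), fact.get("value"))
```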