# src/web_content_extractor.py
from smolagents.tools import Tool
from typing import Dict, Any, Optional
import requests
from bs4 import BeautifulSoup
import re
import json
from io import StringIO
import pandas as pd
import logging
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class WebContentExtractor(Tool):
"""
Specialized tool for extracting structured content from specific websites.
Has optimized extractors for Wikipedia, tabular data, and common content patterns.
"""
name = "web_content_extractor"
description = "Extracts structured data from websites with specialized handlers for Wikipedia and other content types."
inputs = {
'url': {'type': 'string', 'description': 'The URL of the web page to extract content from.'},
'target_type': {'type': 'string', 'description': 'Type of content to extract: "info", "table", "list", or "specific_data".'},
'extraction_details': {'type': 'object', 'description': 'Additional details for extraction (e.g., table index, data label).', 'nullable': True}
}
outputs = {'result': {'type': 'object', 'description': 'The extracted content as structured data.'}}
output_type = "object"
def __init__(self, user_agent="GAIA-Agent/1.0", *args, **kwargs):
super().__init__(*args, **kwargs)
self.headers = {"User-Agent": user_agent}
self.session = requests.Session()
self.session.headers.update(self.headers)
self.is_initialized = True
def forward(self, url: str, target_type: str, extraction_details: Optional[Dict] = None) -> Dict[str, Any]:
"""
Extract specific content from a web page.
Args:
url: URL of the web page
target_type: Type of content to extract ("info", "table", "list", "specific_data")
extraction_details: Additional details for extraction
Returns:
Dict with extracted content or error message
"""
if not extraction_details:
extraction_details = {}
        # Validate URL
        if not url.startswith(('http://', 'https://')):
            return {"error": f"Invalid URL format: {url}"}
        # Validate the extraction type up front so no code path can fall through silently
        if target_type not in ("info", "table", "list", "specific_data"):
            return {"error": f"Unknown extraction type: {target_type}"}
try:
# For Wikipedia, use specialized extraction
if 'wikipedia.org' in url:
return self._extract_from_wikipedia(url, target_type, extraction_details)
# For general websites
response = self.session.get(url, timeout=15)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Handle different extraction types
if target_type == "info":
return self._extract_general_info(soup, url)
elif target_type == "table":
return self._extract_table(soup, url, extraction_details)
elif target_type == "list":
return self._extract_list(soup, url, extraction_details)
elif target_type == "specific_data":
return self._extract_specific_data(soup, url, extraction_details)
else:
return {"error": f"Unknown extraction type: {target_type}"}
except requests.exceptions.RequestException as e:
return {"error": f"Request error: {str(e)}"}
except Exception as e:
return {"error": f"Extraction error: {str(e)}"}
def _extract_general_info(self, soup, url):
"""Extract general information from a web page"""
        title = soup.title.get_text(strip=True) if soup.title else "No title found"
# Try to get meta description
meta_desc = soup.find('meta', attrs={'name': 'description'})
description = meta_desc.get('content', '') if meta_desc else "No description found"
# Get main headings
main_headings = [h1.get_text(strip=True) for h1 in soup.find_all('h1')]
# Get key facts (look for definition lists, key-value pairs)
key_facts = {}
# Check for definition lists
for dl in soup.find_all('dl'):
for dt, dd in zip(dl.find_all('dt'), dl.find_all('dd')):
key = dt.get_text(strip=True)
value = dd.get_text(strip=True)
if key and value:
key_facts[key] = value
# Get text from first few paragraphs for a summary
paragraphs = soup.find_all('p')
summary = ""
para_count = 0
for p in paragraphs:
text = p.get_text(strip=True)
if len(text) > 50: # Only include substantial paragraphs
summary += text + "\n\n"
para_count += 1
if para_count >= 3: # Limit to first 3 substantial paragraphs
break
return {
"title": title,
"url": url,
"description": description,
"main_headings": main_headings,
"key_facts": key_facts,
"summary": summary.strip()
}
def _extract_table(self, soup, url, details):
"""Extract table data from a web page"""
table_index = details.get('table_index', 0)
# Find all tables
tables = soup.find_all('table')
if not tables:
return {"error": "No tables found on the page"}
if table_index >= len(tables):
return {"error": f"Table index {table_index} is out of range. Found {len(tables)} tables."}
try:
# Try to use pandas to extract the table
table = tables[table_index]
            dfs = pd.read_html(StringIO(str(table)))  # wrap in StringIO: passing literal HTML to read_html is deprecated
if not dfs:
return {"error": "Failed to parse table with pandas"}
df = dfs[0]
# Convert to dictionary format
headers = df.columns.tolist()
rows = df.values.tolist()
return {
"table_data": {
"headers": headers,
"rows": rows
},
"row_count": len(rows),
"column_count": len(headers),
"url": url
}
except Exception as e:
# Fallback to manual extraction
logger.warning(f"Pandas table extraction failed: {e}. Falling back to manual extraction.")
table = tables[table_index]
headers = []
rows = []
# Try to find headers
thead = table.find('thead')
if thead:
header_row = thead.find('tr')
if header_row:
headers = [th.get_text(strip=True) for th in header_row.find_all(['th', 'td'])]
# If no thead, use first row as header
if not headers:
first_row = table.find('tr')
if first_row:
headers = [th.get_text(strip=True) for th in first_row.find_all(['th', 'td'])]
# Extract rows
for tr in table.find_all('tr'):
row = [td.get_text(strip=True) for td in tr.find_all(['td', 'th'])]
if row and row != headers: # Skip header row in data
rows.append(row)
return {
"table_data": {
"headers": headers,
"rows": rows
},
"row_count": len(rows),
"column_count": len(headers) if headers else (len(rows[0]) if rows else 0),
"url": url,
"extraction_method": "manual_fallback"
}
def _extract_list(self, soup, url, details):
"""Extract list data from a web page"""
list_type = details.get('list_type', 'all') # 'ul', 'ol', or 'all'
position = details.get('position', 0) # Which list to extract (0-based index)
list_elements = []
if list_type == 'ul' or list_type == 'all':
list_elements.extend(soup.find_all('ul'))
if list_type == 'ol' or list_type == 'all':
list_elements.extend(soup.find_all('ol'))
if not list_elements:
return {"error": "No lists found on the page"}
if position >= len(list_elements):
return {"error": f"List position {position} is out of range. Found {len(list_elements)} lists."}
target_list = list_elements[position]
items = []
for li in target_list.find_all('li', recursive=False):
# Ignore nested lists
for nested_list in li.find_all(['ul', 'ol']):
nested_list.decompose()
item_text = li.get_text(strip=True)
if item_text:
items.append(item_text)
return {
"list_type": target_list.name, # 'ul' or 'ol'
"items": items,
"count": len(items),
"url": url
}
def _extract_specific_data(self, soup, url, details):
"""Extract specific data based on given selectors or patterns"""
data_label = details.get('data_label', '')
selector = details.get('selector', '')
attribute = details.get('attribute', '')
regex_pattern = details.get('regex_pattern', '')
result = {
"url": url,
"data_label": data_label,
"found": False
}
# Try CSS selector if provided
if selector:
elements = soup.select(selector)
if elements:
result["found"] = True
                if attribute:
                    # Extract attribute value from each matched element
                    values = [elem.get(attribute, '') for elem in elements]
                else:
                    # Extract text content from each matched element
                    values = [elem.get_text(strip=True) for elem in elements]
                result["values"] = values
                # If only one value, simplify the result
                if len(values) == 1:
                    result["value"] = values[0]
                return result
# Try regex pattern if provided
if regex_pattern:
page_text = soup.get_text()
matches = re.findall(regex_pattern, page_text)
if matches:
result["found"] = True
result["matches"] = matches
# If only one match, simplify the result
if len(matches) == 1:
result["value"] = matches[0]
return result
# Try common patterns based on data_label
if data_label:
# Look for label in text
label_pattern = re.compile(rf'{re.escape(data_label)}\s*[:=-]?\s*([\w\s,.()-]+)', re.IGNORECASE)
page_text = soup.get_text()
match = label_pattern.search(page_text)
if match:
result["found"] = True
result["value"] = match.group(1).strip()
return result
# Look for label in headings followed by paragraph
for heading in soup.find_all(['h1', 'h2', 'h3', 'h4']):
if data_label.lower() in heading.get_text().lower():
next_sibling = heading.find_next_sibling()
if next_sibling and next_sibling.name == 'p':
result["found"] = True
result["value"] = next_sibling.get_text(strip=True)
return result
# If nothing found
return result
def _extract_from_wikipedia(self, url, target_type, details):
"""Specialized extraction for Wikipedia pages using APIs when possible"""
        # Extract page title from URL (drop any fragment or query string)
        title = url.split('/')[-1].split('#')[0].split('?')[0]
# Determine Wikipedia language
domain = url.split('//')[1].split('.')[0]
try:
# First try the Wikipedia API
api_url = f"https://{domain}.wikipedia.org/api/rest_v1/page/summary/{title}"
response = self.session.get(api_url, timeout=15)
response.raise_for_status()
api_data = response.json()
# For info requests, we can use just the API data
if target_type == "info":
return {
"title": api_data.get("title", ""),
"description": api_data.get("description", ""),
"extract": api_data.get("extract", ""),
"url": url,
"source": "wikipedia_api"
}
# For other requests, we need to fetch the HTML as well
html_response = self.session.get(url, timeout=15)
html_response.raise_for_status()
soup = BeautifulSoup(html_response.content, 'html.parser')
if target_type == "table":
# Get the infobox if requested
if details.get('infobox', False):
infobox = {}
infobox_div = soup.find('table', {'class': 'infobox'})
if infobox_div:
for row in infobox_div.find_all('tr'):
header = row.find('th')
data = row.find('td')
if header and data:
key = header.get_text(strip=True)
value = data.get_text(strip=True)
if key and value:
infobox[key] = value
return {
"title": api_data.get("title", ""),
"infobox": infobox,
"url": url,
"source": "wikipedia_infobox"
}
# Regular table extraction
return self._extract_table(soup, url, details)
elif target_type == "list":
return self._extract_list(soup, url, details)
elif target_type == "specific_data":
# Enhanced extraction for Wikipedia specific data
data_label = details.get('data_label', '')
# Try to find it in infobox first
infobox = soup.find('table', {'class': 'infobox'})
if infobox and data_label:
for row in infobox.find_all('tr'):
header = row.find('th')
if header and data_label.lower() in header.get_text().lower():
data = row.find('td')
if data:
return {
"found": True,
"value": data.get_text(strip=True),
"source": "wikipedia_infobox",
"url": url
}
# Fallback to regular specific data extraction
return self._extract_specific_data(soup, url, details)
except Exception as e:
logger.warning(f"Wikipedia API extraction failed: {e}. Falling back to HTML extraction.")
# Fallback to regular HTML extraction
try:
response = self.session.get(url, timeout=15)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
if target_type == "info":
return self._extract_general_info(soup, url)
elif target_type == "table":
return self._extract_table(soup, url, details)
elif target_type == "list":
return self._extract_list(soup, url, details)
elif target_type == "specific_data":
return self._extract_specific_data(soup, url, details)
except Exception as fallback_error:
return {"error": f"Wikipedia extraction error: {fallback_error}"}