|
from smolagents import Tool |
|
import requests |
|
import re |
|
from markdownify import markdownify as md |
|
|
|
class ExtractWikipediaSection(Tool):
    """Tool that downloads a page and returns one of its sections as Markdown.

    The page is fetched with ``requests``, converted to Markdown with
    ``markdownify`` using ATX (``#``-prefixed) headings, and the requested
    section is located by matching its heading line.
    """

    name = "extract_wikipedia_section"
    description = "Extracts a specific section from a Wikipedia page in Markdown format."

    inputs = {
        "url": {
            "type": "string",
            "description": "URL of the Wikipedia page",
        },
        "section": {
            "type": "string",
            "description": "Title of the section to extract",
        },
    }

    output_type = "string"

    def forward(self, url: str, section: str) -> str:
        """Return the Markdown body of ``section`` from the page at ``url``.

        The section is found by an exact (case-sensitive) match of its title
        on a heading line of level 2 or deeper. The returned text runs from
        just after that heading up to the next heading of the same or a
        shallower level, or to the end of the document when the section is
        the last one. Sub-sections nested under the requested heading are
        therefore included.

        Args:
            url: Address of the page to download.
            section: Title of the section to extract, as it appears in the
                heading.

        Returns:
            The stripped Markdown text of the section, or a message starting
            with "❌" when no heading with that title exists.

        Raises:
            RuntimeError: If the HTTP request fails or returns an error
                status.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (compatible; WebScraper/1.0; +https://example.com/bot)"
        }

        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            # Chain the cause so the underlying network/HTTP error stays
            # visible in the traceback.
            raise RuntimeError(f"Failed to fetch page: {e}") from e

        markdown = md(response.text, heading_style="ATX")

        # Locate the heading line itself; the captured run of '#' gives the
        # nesting level of the requested section.
        heading = re.search(
            rf"^(##+)\s*{re.escape(section)}\s*$",
            markdown,
            re.MULTILINE,
        )
        if heading is None:
            return f"❌ Section '{section}' not found on page."

        # The section ends at the next heading that is not nested deeper than
        # the requested one. Deeper headings are sub-sections and belong to
        # the body. If no such heading follows, the section runs to the end
        # of the document.
        level = len(heading.group(1))
        boundary = re.compile(rf"^#{{2,{level}}}(?!#)", re.MULTILINE)
        next_heading = boundary.search(markdown, heading.end())
        end = next_heading.start() if next_heading else len(markdown)

        return markdown[heading.end():end].strip()
|
|