# WebpageCreator / myTools / ExtractWikipediaSection.py
# Houzeric's picture
# Upload 29 files
# 77c658d verified
from smolagents import Tool
import requests
import re
from markdownify import markdownify as md
class ExtractWikipediaSection(Tool):
    """Tool that downloads a web page and returns one of its sections as Markdown.

    The page HTML is converted to Markdown with ATX-style (``#``) headings,
    then the requested section is located by its heading text.
    """

    name = "extract_wikipedia_section"
    description = "Extracts a specific section from a Wikipedia page in Markdown format."
    inputs = {
        "url": {
            "type": "string",
            "description": "URL of the Wikipedia page"
        },
        "section": {
            "type": "string",
            "description": "Title of the section to extract"
        },
    }
    output_type = "string"

    def forward(self, url: str, section: str) -> str:
        """Return the Markdown body of the section titled ``section``.

        Args:
            url: Address of the page to download.
            section: Exact heading text of the section (matched literally,
                case-sensitively, on a heading of level 2 or deeper).

        Returns:
            The section body with surrounding whitespace stripped. The body
            runs from the heading up to the next heading of the same or a
            higher level (so nested sub-headings are included), or to the end
            of the page if no such heading follows. If no heading matches, a
            human-readable "not found" message is returned instead.

        Raises:
            RuntimeError: If the page cannot be downloaded or the server
                answers with an HTTP error status.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (compatible; WebScraper/1.0; +https://example.com/bot)"
        }
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
        except Exception as e:
            # Keep the original cause attached so the failure stays debuggable.
            raise RuntimeError(f"Failed to fetch page: {e}") from e

        markdown = md(response.text, heading_style="ATX")

        # Locate the section heading: a line made of 2+ '#' followed by the title.
        heading = re.search(
            rf"^(#{{2,}})\s*{re.escape(section)}\s*$",
            markdown,
            re.MULTILINE,
        )
        if heading is None:
            return f"❌ Section '{section}' not found on page."

        # The section ends at the next heading whose level is the same or
        # higher (i.e. with no more '#' than this one). Deeper headings are
        # sub-sections and belong to the body.
        level = len(heading.group(1))
        next_heading = re.compile(
            rf"^#{{1,{level}}}(?!#)", re.MULTILINE
        ).search(markdown, heading.end())

        # With no following heading, the section is the last one on the page
        # and runs to the end of the document.
        end = next_heading.start() if next_heading else len(markdown)
        return markdown[heading.end():end].strip()