WebpageCreator / myTools /ExtractWebContentWithSelenium.py
Houzeric's picture
Upload 29 files
77c658d verified
from smolagents import Tool
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from markdownify import markdownify as md
class ExtractWebContentWithSelenium(Tool):
    """Agent tool that loads a URL in headless Chrome and returns it as Markdown.

    The page is fetched with a Selenium-driven Chrome instance, and the
    resulting page source is converted to Markdown with ``markdownify``
    using ATX-style (``#``) headings.
    """

    name = "extract_web_content_selenium"
    # The description is what the agent reads to decide how to use the tool,
    # so it must match the real output: Markdown, not raw HTML.
    description = (
        "Visit a webpage with a headless Chrome browser and return the "
        "content of the page converted to Markdown."
    )
    inputs = {
        "url": {
            "type": "string",
            "description": "URL of the page to load"
        }
    }
    output_type = "string"

    def forward(self, url: str) -> str:
        """Load ``url`` in headless Chrome and return the page as Markdown.

        Args:
            url: Address of the page to open.

        Returns:
            The page source converted to Markdown with ATX headings.

        Any exception raised while loading or converting the page propagates
        to the caller; the browser is shut down in every case.
        """
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        # Send a desktop Chrome user agent instead of the browser's default one.
        chrome_options.add_argument(
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/114.0.0.0 Safari/537.36"
        )

        # Automatically installs ChromeDriver via webdriver_manager.
        driver = webdriver.Chrome(
            service=ChromeService(ChromeDriverManager().install()),
            options=chrome_options,
        )
        try:
            driver.get(url)
            page_content = driver.page_source
            markdown = md(page_content, heading_style="ATX")
        finally:
            # Always release the browser process, even if loading failed.
            driver.quit()

        return markdown