|
from smolagents import Tool |
|
from selenium import webdriver |
|
from selenium.webdriver.chrome.options import Options |
|
from selenium.webdriver.chrome.service import Service as ChromeService |
|
from webdriver_manager.chrome import ChromeDriverManager |
|
from markdownify import markdownify as md |
|
|
|
class ExtractWebContentWithSelenium(Tool):
    """Tool that loads a URL in headless Chrome and returns the page as Markdown.

    The page is fetched with a Selenium-driven Chrome instance, its page
    source is read back, and the result is converted to Markdown with
    ``markdownify``.
    """

    name = "extract_web_content_selenium"
    # The description is what the agent reads to decide how to use the tool,
    # so it must match the actual return value (Markdown, not raw HTML).
    description = (
        "Visit a webpage with a headless Chrome browser and return the "
        "content of the page converted to Markdown."
    )

    inputs = {
        "url": {
            "type": "string",
            "description": "URL of the page to load",
        }
    }

    output_type = "string"

    def forward(self, url: str) -> str:
        """Load *url* in headless Chrome and return its content as Markdown.

        Args:
            url: Address of the page to open.

        Returns:
            The page source converted to Markdown, using ATX-style (``#``)
            headings.

        Raises:
            Any exception raised by Selenium while starting the browser or
            loading the page is propagated; the browser is still shut down
            once it has been started.
        """
        chrome_options = Options()
        # Run without a visible window; the sandbox / shared-memory flags are
        # the ones typically needed when Chrome runs in restricted or
        # containerized environments.
        for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
            chrome_options.add_argument(flag)
        # Present a regular desktop browser user agent to the target site.
        chrome_options.add_argument(
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/114.0.0.0 Safari/537.36"
        )

        # webdriver_manager provides the chromedriver executable path that the
        # Selenium service is started with.
        service = ChromeService(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)

        try:
            driver.get(url)
            page_content = driver.page_source
            markdown = md(page_content, heading_style="ATX")
        finally:
            # Always shut the browser down, even if loading or conversion fails.
            driver.quit()

        return markdown
|
|