import time
import json
import logging

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def extract_elements(url: str) -> list[dict]:
    """
    Scrapes a website URL to extract buttons, links, input fields, and forms.

    Args:
        url: The public URL of the website to scrape.

    Returns:
        A list of dictionaries, each representing an extracted UI element.
        Returns an empty list if scraping fails.
    """
    logging.info(f"Starting scraping for URL: {url}")
    extracted_elements = []

    # Configure headless Chrome; these flags are the usual set for running
    # Chrome inside containers and CI environments.
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )

    driver = None

    try:
        # Use the system-installed chromedriver (e.g. from the
        # chromium-driver package) rather than a bundled binary.
        chromedriver_path = "/usr/bin/chromedriver"
        logging.info(f"Attempting to use system chromedriver at: {chromedriver_path}")
        service = Service(executable_path=chromedriver_path)

        driver = webdriver.Chrome(service=service, options=chrome_options)
        driver.set_page_load_timeout(30)
        driver.get(url)

        # Fixed wait for JavaScript-rendered content; an explicit WebDriverWait
        # on a concrete element would be more robust.
        time.sleep(3)

        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'lxml')

        buttons = soup.find_all('button')
        for btn in buttons:
            element_data = {
                'type': 'button',
                'text': btn.get_text(strip=True),
                'id': btn.get('id'),
                'name': btn.get('name'),
                'class': btn.get('class'),
                'attributes': {k: v for k, v in btn.attrs.items() if k not in ['id', 'name', 'class']}
            }
            extracted_elements.append(element_data)
        logging.info(f"Found {len(buttons)} buttons.")

        links = soup.find_all('a')
        for link in links:
            element_data = {
                'type': 'link',
                'text': link.get_text(strip=True),
                'href': link.get('href'),
                'id': link.get('id'),
                'class': link.get('class'),
                'attributes': {k: v for k, v in link.attrs.items() if k not in ['id', 'class', 'href']}
            }
            extracted_elements.append(element_data)
        logging.info(f"Found {len(links)} links.")

        inputs = soup.find_all('input')
        for inp in inputs:
            element_data = {
                'type': 'input',
                'input_type': inp.get('type', 'text'),
                'id': inp.get('id'),
                'name': inp.get('name'),
                'placeholder': inp.get('placeholder'),
                'value': inp.get('value'),
                'class': inp.get('class'),
                'attributes': {k: v for k, v in inp.attrs.items() if k not in ['id', 'name', 'class', 'type', 'placeholder', 'value']}
            }
            extracted_elements.append(element_data)
        logging.info(f"Found {len(inputs)} input fields.")

        # Extract forms together with the inputs and buttons nested inside them.
        forms = soup.find_all('form')
        for form in forms:
            form_elements = []
            for child_input in form.find_all('input'):
                form_elements.append({
                    'tag': 'input',
                    'type': child_input.get('type'),
                    'id': child_input.get('id'),
                    'name': child_input.get('name')
                })
            for child_button in form.find_all('button'):
                form_elements.append({
                    'tag': 'button',
                    'type': child_button.get('type'),
                    'id': child_button.get('id'),
                    'name': child_button.get('name'),
                    'text': child_button.get_text(strip=True)
                })

            element_data = {
                'type': 'form',
                'id': form.get('id'),
                'action': form.get('action'),
                'method': form.get('method'),
                'class': form.get('class'),
                'contained_elements': form_elements,
                'attributes': {k: v for k, v in form.attrs.items() if k not in ['id', 'class', 'action', 'method']}
            }
            extracted_elements.append(element_data)
        logging.info(f"Found {len(forms)} forms.")

        logging.info(f"Successfully extracted {len(extracted_elements)} elements in total.")

    except FileNotFoundError:
        logging.error(f"ERROR: System chromedriver not found at {chromedriver_path}. Make sure 'chromium-driver' is in packages.txt.")
    except Exception as e:
        logging.error(f"Error during scraping URL {url}: {e}", exc_info=True)
        # Discard any partially extracted elements on an unexpected failure.
        return []
    finally:
        # Always release the browser, whether or not scraping succeeded.
        if driver:
            driver.quit()
            logging.info("WebDriver closed.")

    return extracted_elements
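

# A minimal usage sketch, not part of the original module: the target URL and
# output filename below are illustrative assumptions. It runs the scraper once
# and persists the results with the json module imported above.
if __name__ == "__main__":
    elements = extract_elements("https://example.com")  # hypothetical target URL
    with open("extracted_elements.json", "w", encoding="utf-8") as f:
        json.dump(elements, f, indent=2, ensure_ascii=False)
    logging.info(f"Wrote {len(elements)} elements to extracted_elements.json")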