# ai_test_generator/scraper.py
import time
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
# from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def extract_elements(url: str) -> list[dict]:
    """
    Scrapes a website URL to extract buttons, links, input fields, and forms.

    Args:
        url: The public URL of the website to scrape.

    Returns:
        A list of dictionaries, each representing an extracted UI element.
        Returns an empty list if scraping fails.
    """
    logging.info(f"Starting scraping for URL: {url}")
    extracted_elements = []

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run headless (no GUI)
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Recommended for headless
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")  # Set user agent

    # service = Service(ChromeDriverManager().install())
    driver = None
    try:
        # --- Use the system's ChromeDriver ---
        # Specify the path to the driver installed via packages.txt
        chromedriver_path = "/usr/bin/chromedriver"
        logging.info(f"Attempting to use system chromedriver at: {chromedriver_path}")
        service = Service(executable_path=chromedriver_path)
        # --- End of change ---

        driver = webdriver.Chrome(service=service, options=chrome_options)
        driver.set_page_load_timeout(30)  # Set timeout for page load
        driver.get(url)

        # Allow some time for dynamic content to potentially load.
        # A more robust solution might use WebDriverWait.
        time.sleep(3)
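        # A hedged sketch of the WebDriverWait alternative mentioned above: wait until
        # the <body> element is present instead of sleeping a fixed 3 seconds.
        # (Not enabled here; the By.TAG_NAME selector and 10-second timeout are
        # illustrative assumptions, not part of this module.)
        # from selenium.webdriver.common.by import By
        # from selenium.webdriver.support.ui import WebDriverWait
        # from selenium.webdriver.support import expected_conditions as EC
        # WebDriverWait(driver, 10).until(
        #     EC.presence_of_element_located((By.TAG_NAME, "body"))
        # )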
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'lxml')  # Use lxml parser
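        # Note: the 'lxml' parser requires the third-party lxml package. If it is not
        # installed, BeautifulSoup's built-in parser is a drop-in fallback, e.g.:
        # soup = BeautifulSoup(page_source, 'html.parser')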
        # --- Extract Buttons ---
        buttons = soup.find_all('button')
        for btn in buttons:
            element_data = {
                'type': 'button',
                'text': btn.get_text(strip=True),
                'id': btn.get('id'),
                'name': btn.get('name'),
                'class': btn.get('class'),
                'attributes': {k: v for k, v in btn.attrs.items() if k not in ['id', 'name', 'class']}
            }
            extracted_elements.append(element_data)
        logging.info(f"Found {len(buttons)} buttons.")

        # --- Extract Links ---
        links = soup.find_all('a')
        for link in links:
            element_data = {
                'type': 'link',
                'text': link.get_text(strip=True),
                'href': link.get('href'),
                'id': link.get('id'),
                'class': link.get('class'),
                'attributes': {k: v for k, v in link.attrs.items() if k not in ['id', 'class', 'href']}
            }
            extracted_elements.append(element_data)
        logging.info(f"Found {len(links)} links.")

        # --- Extract Input Fields ---
        inputs = soup.find_all('input')
        for inp in inputs:
            element_data = {
                'type': 'input',
                'input_type': inp.get('type', 'text'),  # Default to 'text' if type not specified
                'id': inp.get('id'),
                'name': inp.get('name'),
                'placeholder': inp.get('placeholder'),
                'value': inp.get('value'),
                'class': inp.get('class'),
                'attributes': {k: v for k, v in inp.attrs.items() if k not in ['id', 'name', 'class', 'type', 'placeholder', 'value']}
            }
            extracted_elements.append(element_data)
        logging.info(f"Found {len(inputs)} input fields.")

        # --- Extract Forms ---
        forms = soup.find_all('form')
        for form in forms:
            form_elements = []
            # Find elements within this specific form
            for child_input in form.find_all('input'):
                form_elements.append({
                    'tag': 'input',
                    'type': child_input.get('type'),
                    'id': child_input.get('id'),
                    'name': child_input.get('name')
                })
            for child_button in form.find_all('button'):
                form_elements.append({
                    'tag': 'button',
                    'type': child_button.get('type'),
                    'id': child_button.get('id'),
                    'name': child_button.get('name'),
                    'text': child_button.get_text(strip=True)
                })
            # Add other form element types if needed (select, textarea)
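            # A hedged sketch of how select/textarea children could be captured if that
            # becomes necessary (the 'options' field below is an illustrative assumption,
            # not consumed anywhere else in this module):
            # for child_select in form.find_all('select'):
            #     form_elements.append({
            #         'tag': 'select',
            #         'id': child_select.get('id'),
            #         'name': child_select.get('name'),
            #         'options': [opt.get_text(strip=True) for opt in child_select.find_all('option')]
            #     })
            # for child_textarea in form.find_all('textarea'):
            #     form_elements.append({
            #         'tag': 'textarea',
            #         'id': child_textarea.get('id'),
            #         'name': child_textarea.get('name')
            #     })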
            element_data = {
                'type': 'form',
                'id': form.get('id'),
                'action': form.get('action'),
                'method': form.get('method'),
                'class': form.get('class'),
                'contained_elements': form_elements,
                'attributes': {k: v for k, v in form.attrs.items() if k not in ['id', 'class', 'action', 'method']}
            }
            extracted_elements.append(element_data)
        logging.info(f"Found {len(forms)} forms.")

        logging.info(f"Successfully extracted {len(extracted_elements)} elements in total.")

    except FileNotFoundError:
        logging.error(f"ERROR: System Chromedriver not found at {chromedriver_path}. Make sure 'chromium-driver' is in packages.txt.")
    except Exception as e:
        logging.error(f"Error during scraping URL {url}: {e}", exc_info=True)
        # Return an empty list on error; the Gradio app will handle this
        return []
    finally:
        if driver:
            driver.quit()
            logging.info("WebDriver closed.")

    return extracted_elements
# Example usage (optional, for testing scraper independently)
# if __name__ == '__main__':
#     test_url = "https://demoblaze.com/"
#     elements = extract_elements(test_url)
#     if elements:
#         print(f"Extracted {len(elements)} elements.")
#         # Save to a temporary file for inspection
#         with open("temp_elements.json", "w", encoding="utf-8") as f:
#             json.dump(elements, f, indent=4)
#         print("Saved results to temp_elements.json")
#     else:
#         print("Scraping failed.")