# ai_test_generator/scraper.py
import time
import json
import logging

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
# from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def extract_elements(url: str) -> list[dict]:
    """
    Scrapes a website URL to extract buttons, links, input fields, and forms.

    Args:
        url: The public URL of the website to scrape.

    Returns:
        A list of dictionaries, each representing an extracted UI element.
        Returns an empty list if scraping fails.
    """
    logging.info(f"Starting scraping for URL: {url}")
    extracted_elements = []

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run headless (no GUI)
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Recommended for headless
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")  # Set user agent

    # service = Service(ChromeDriverManager().install())
    driver = None
    # Path to the driver installed via packages.txt; assigned before the try
    # block so the FileNotFoundError handler can always reference it.
    chromedriver_path = "/usr/bin/chromedriver"
    try:
        # --- Use the system's ChromeDriver ---
        logging.info(f"Attempting to use system chromedriver at: {chromedriver_path}")
        service = Service(executable_path=chromedriver_path)
        # --- End of change ---

        driver = webdriver.Chrome(service=service, options=chrome_options)
        driver.set_page_load_timeout(30)  # Set timeout for page load
        driver.get(url)

        # Allow some time for dynamic content to potentially load.
        # A more robust solution might use WebDriverWait
        # (see the wait_for_page_ready sketch below).
        time.sleep(3)

        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'lxml')  # Use lxml parser

        # --- Extract Buttons ---
        buttons = soup.find_all('button')
        for btn in buttons:
            element_data = {
                'type': 'button',
                'text': btn.get_text(strip=True),
                'id': btn.get('id'),
                'name': btn.get('name'),
                'class': btn.get('class'),
                'attributes': {k: v for k, v in btn.attrs.items()
                               if k not in ['id', 'name', 'class']}
            }
            extracted_elements.append(element_data)
        logging.info(f"Found {len(buttons)} buttons.")

        # --- Extract Links ---
        links = soup.find_all('a')
        for link in links:
            element_data = {
                'type': 'link',
                'text': link.get_text(strip=True),
                'href': link.get('href'),
                'id': link.get('id'),
                'class': link.get('class'),
                'attributes': {k: v for k, v in link.attrs.items()
                               if k not in ['id', 'class', 'href']}
            }
            extracted_elements.append(element_data)
        logging.info(f"Found {len(links)} links.")

        # --- Extract Input Fields ---
        inputs = soup.find_all('input')
        for inp in inputs:
            element_data = {
                'type': 'input',
                'input_type': inp.get('type', 'text'),  # Default to 'text' if type not specified
                'id': inp.get('id'),
                'name': inp.get('name'),
                'placeholder': inp.get('placeholder'),
                'value': inp.get('value'),
                'class': inp.get('class'),
                'attributes': {k: v for k, v in inp.attrs.items()
                               if k not in ['id', 'name', 'class', 'type', 'placeholder', 'value']}
            }
            extracted_elements.append(element_data)
        logging.info(f"Found {len(inputs)} input fields.")

        # --- Extract Forms ---
        forms = soup.find_all('form')
        for form in forms:
            form_elements = []
            # Find elements within this specific form
            for child_input in form.find_all('input'):
                form_elements.append({
                    'tag': 'input',
                    'type': child_input.get('type'),
                    'id': child_input.get('id'),
                    'name': child_input.get('name')
                })
            for child_button in form.find_all('button'):
                form_elements.append({
                    'tag': 'button',
                    'type': child_button.get('type'),
                    'id': child_button.get('id'),
                    'name': child_button.get('name'),
                    'text': child_button.get_text(strip=True)
                })
            # Add other form element types if needed (select, textarea);
            # see the extract_form_fields sketch at the end of this file.

            element_data = {
                'type': 'form',
                'id': form.get('id'),
                'action': form.get('action'),
                'method': form.get('method'),
                'class': form.get('class'),
                'contained_elements': form_elements,
                'attributes': {k: v for k, v in form.attrs.items()
                               if k not in ['id', 'class', 'action', 'method']}
            }
            extracted_elements.append(element_data)
        logging.info(f"Found {len(forms)} forms.")

        logging.info(f"Successfully extracted {len(extracted_elements)} elements in total.")

    except FileNotFoundError:
        logging.error(f"ERROR: System Chromedriver not found at {chromedriver_path}. Make sure 'chromium-driver' is in packages.txt.")
    except Exception as e:
        logging.error(f"Error during scraping URL {url}: {e}", exc_info=True)
        # Return empty list on error; the Gradio app will handle this
        return []
    finally:
        if driver:
            driver.quit()
            logging.info("WebDriver closed.")

    return extracted_elements
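
# The time.sleep(3) call above is a blunt fixed wait. The sketch below shows
# the WebDriverWait alternative the comment there alludes to; it is not wired
# into extract_elements(). The helper name `wait_for_page_ready` and the
# 10-second default timeout are assumptions for illustration, not part of the
# original module.
from selenium.webdriver.support.ui import WebDriverWait


def wait_for_page_ready(driver, timeout: int = 10) -> None:
    """Block until the browser reports document.readyState == 'complete'.

    Raises selenium.common.exceptions.TimeoutException if the page is not
    ready within `timeout` seconds. Note this only covers the initial page
    load; content injected later by JavaScript may still need an explicit
    wait on a specific element.
    """
    WebDriverWait(driver, timeout).until(
        lambda d: d.execute_script("return document.readyState") == "complete"
    )
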
# Example usage (optional, for testing scraper independently)
if __name__ == '__main__':
    test_url = "https://demoblaze.com/"
    elements = extract_elements(test_url)
    if elements:
        print(f"Extracted {len(elements)} elements.")
        # Save to a temporary file for inspection
        with open("temp_elements.json", "w", encoding="utf-8") as f:
            json.dump(elements, f, indent=4)
        print("Saved results to temp_elements.json")
    else:
        print("Scraping failed.")
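
# The form loop in extract_elements() notes that <select> and <textarea>
# children could also be captured. A minimal sketch of that extension follows,
# mirroring the dict shape of the input/button entries; the helper name
# `extract_form_fields` and the 'options' key are hypothetical additions,
# not part of the original module.
def extract_form_fields(form) -> list[dict]:
    """Collect <select> and <textarea> descendants of a BeautifulSoup form tag."""
    fields = []
    for sel in form.find_all('select'):
        fields.append({
            'tag': 'select',
            'id': sel.get('id'),
            'name': sel.get('name'),
            # Record the visible choices so generated tests can pick one.
            'options': [opt.get_text(strip=True) for opt in sel.find_all('option')],
        })
    for ta in form.find_all('textarea'):
        fields.append({
            'tag': 'textarea',
            'id': ta.get('id'),
            'name': ta.get('name'),
            'placeholder': ta.get('placeholder'),
        })
    return fields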