# ai_test_generator/scraper.py
import time
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
# from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def extract_elements(url: str) -> list[dict]:
    """
    Scrapes a website URL to extract buttons, links, input fields, and forms.

    Args:
        url: The public URL of the website to scrape.

    Returns:
        A list of dictionaries, each representing an extracted UI element.
        Returns an empty list if scraping fails.
    """
    logging.info(f"Starting scraping for URL: {url}")
    extracted_elements = []

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run headless (no GUI)
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Recommended for headless
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")  # Set user agent

    # service = Service(ChromeDriverManager().install())
    driver = None
    try:
        # --- Use the system's ChromeDriver ---
        # Specify the path to the driver installed via packages.txt
        chromedriver_path = "/usr/bin/chromedriver"
        logging.info(f"Attempting to use system chromedriver at: {chromedriver_path}")
        service = Service(executable_path=chromedriver_path)
        # --- End of change ---

        driver = webdriver.Chrome(service=service, options=chrome_options)
        driver.set_page_load_timeout(30)  # Set timeout for page load
        driver.get(url)

        # Allow some time for dynamic content to potentially load.
        # A more robust solution might use WebDriverWait.
        time.sleep(3)
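        # A hedged sketch of the WebDriverWait alternative mentioned above: wait until
        # the <body> element is present instead of sleeping a fixed 3 seconds.
        # (Not enabled here; the By.TAG_NAME selector and 10-second timeout are
        # illustrative assumptions, not part of this module.)
        # from selenium.webdriver.common.by import By
        # from selenium.webdriver.support.ui import WebDriverWait
        # from selenium.webdriver.support import expected_conditions as EC
        # WebDriverWait(driver, 10).until(
        #     EC.presence_of_element_located((By.TAG_NAME, "body"))
        # )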
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'lxml')  # Use lxml parser
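        # Note: the 'lxml' parser requires the third-party lxml package. If it is not
        # installed, BeautifulSoup's built-in parser is a drop-in fallback, e.g.:
        # soup = BeautifulSoup(page_source, 'html.parser')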
        # --- Extract Buttons ---
        buttons = soup.find_all('button')
        for btn in buttons:
            element_data = {
                'type': 'button',
                'text': btn.get_text(strip=True),
                'id': btn.get('id'),
                'name': btn.get('name'),
                'class': btn.get('class'),
                'attributes': {k: v for k, v in btn.attrs.items() if k not in ['id', 'name', 'class']}
            }
            extracted_elements.append(element_data)
        logging.info(f"Found {len(buttons)} buttons.")

        # --- Extract Links ---
        links = soup.find_all('a')
        for link in links:
            element_data = {
                'type': 'link',
                'text': link.get_text(strip=True),
                'href': link.get('href'),
                'id': link.get('id'),
                'class': link.get('class'),
                'attributes': {k: v for k, v in link.attrs.items() if k not in ['id', 'class', 'href']}
            }
            extracted_elements.append(element_data)
        logging.info(f"Found {len(links)} links.")

        # --- Extract Input Fields ---
        inputs = soup.find_all('input')
        for inp in inputs:
            element_data = {
                'type': 'input',
                'input_type': inp.get('type', 'text'),  # Default to 'text' if type not specified
                'id': inp.get('id'),
                'name': inp.get('name'),
                'placeholder': inp.get('placeholder'),
                'value': inp.get('value'),
                'class': inp.get('class'),
                'attributes': {k: v for k, v in inp.attrs.items() if k not in ['id', 'name', 'class', 'type', 'placeholder', 'value']}
            }
            extracted_elements.append(element_data)
        logging.info(f"Found {len(inputs)} input fields.")

        # --- Extract Forms ---
        forms = soup.find_all('form')
        for form in forms:
            form_elements = []
            # Find elements within this specific form
            for child_input in form.find_all('input'):
                form_elements.append({
                    'tag': 'input',
                    'type': child_input.get('type'),
                    'id': child_input.get('id'),
                    'name': child_input.get('name')
                })
            for child_button in form.find_all('button'):
                form_elements.append({
                    'tag': 'button',
                    'type': child_button.get('type'),
                    'id': child_button.get('id'),
                    'name': child_button.get('name'),
                    'text': child_button.get_text(strip=True)
                })
            # Add other form element types if needed (select, textarea)
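            # A hedged sketch of how select/textarea children could be captured if that
            # becomes necessary (the 'options' field below is an illustrative assumption,
            # not consumed anywhere else in this module):
            # for child_select in form.find_all('select'):
            #     form_elements.append({
            #         'tag': 'select',
            #         'id': child_select.get('id'),
            #         'name': child_select.get('name'),
            #         'options': [opt.get_text(strip=True) for opt in child_select.find_all('option')]
            #     })
            # for child_textarea in form.find_all('textarea'):
            #     form_elements.append({
            #         'tag': 'textarea',
            #         'id': child_textarea.get('id'),
            #         'name': child_textarea.get('name')
            #     })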
            element_data = {
                'type': 'form',
                'id': form.get('id'),
                'action': form.get('action'),
                'method': form.get('method'),
                'class': form.get('class'),
                'contained_elements': form_elements,
                'attributes': {k: v for k, v in form.attrs.items() if k not in ['id', 'class', 'action', 'method']}
            }
            extracted_elements.append(element_data)
        logging.info(f"Found {len(forms)} forms.")

        logging.info(f"Successfully extracted {len(extracted_elements)} elements in total.")

    except FileNotFoundError:
        logging.error(f"ERROR: System Chromedriver not found at {chromedriver_path}. Make sure 'chromium-driver' is in packages.txt.")
    except Exception as e:
        logging.error(f"Error during scraping URL {url}: {e}", exc_info=True)
        # Return an empty list on error; the Gradio app will handle this
        return []
    finally:
        if driver:
            driver.quit()
            logging.info("WebDriver closed.")

    return extracted_elements
# Example usage (optional, for testing scraper independently)
# if __name__ == '__main__':
#     test_url = "https://demoblaze.com/"
#     elements = extract_elements(test_url)
#     if elements:
#         print(f"Extracted {len(elements)} elements.")
#         # Save to a temporary file for inspection
#         with open("temp_elements.json", "w", encoding="utf-8") as f:
#             json.dump(elements, f, indent=4)
#         print("Saved results to temp_elements.json")
#     else:
#         print("Scraping failed.")