scraper
scraper.py +159 -149
scraper.py
CHANGED
@@ -1,150 +1,160 @@
-# ai_test_generator/scraper.py
-import time
-import json
-from selenium import webdriver
-from selenium.webdriver.chrome.service import Service
-from selenium.webdriver.chrome.options import Options
-from webdriver_manager.chrome import ChromeDriverManager
-from bs4 import BeautifulSoup
-import logging
-
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-
-def extract_elements(url: str) -> list[dict]:
-    """
-    Scrapes a website URL to extract buttons, links, input fields, and forms.
-
-    Args:
-        url: The public URL of the website to scrape.
-
-    Returns:
-        A list of dictionaries, each representing an extracted UI element.
-        Returns an empty list if scraping fails.
-    """
-    logging.info(f"Starting scraping for URL: {url}")
-    extracted_elements = []
-
-    chrome_options = Options()
-    chrome_options.add_argument("--headless") # Run headless (no GUI)
-    chrome_options.add_argument("--no-sandbox")
-    chrome_options.add_argument("--disable-dev-shm-usage")
-    chrome_options.add_argument("--disable-gpu") # Recommended for headless
-    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") # Set user agent
-
-    service = Service(ChromeDriverManager().install())
-    driver = None
-
-    try:
[old lines 38-149: the previous try/except body and commented example usage; illegible in this capture of the diff]
+# ai_test_generator/scraper.py
+import time
+import json
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options
+# from webdriver_manager.chrome import ChromeDriverManager
+from bs4 import BeautifulSoup
+import logging
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+def extract_elements(url: str) -> list[dict]:
+    """
+    Scrapes a website URL to extract buttons, links, input fields, and forms.
+
+    Args:
+        url: The public URL of the website to scrape.
+
+    Returns:
+        A list of dictionaries, each representing an extracted UI element.
+        Returns an empty list if scraping fails.
+    """
+    logging.info(f"Starting scraping for URL: {url}")
+    extracted_elements = []
+
+    chrome_options = Options()
+    chrome_options.add_argument("--headless") # Run headless (no GUI)
+    chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--disable-dev-shm-usage")
+    chrome_options.add_argument("--disable-gpu") # Recommended for headless
+    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") # Set user agent
+
+    # service = Service(ChromeDriverManager().install())
+    driver = None
+
+    try:
+        # --- Use the system's ChromeDriver ---
+        # Specify the path to the driver installed via packages.txt
+        chromedriver_path = "/usr/bin/chromedriver"
+        logging.info(f"Attempting to use system chromedriver at: {chromedriver_path}")
+        service = Service(executable_path=chromedriver_path)
+        # --- End of change ---
+
+        driver = webdriver.Chrome(service=service, options=chrome_options)
+        driver.set_page_load_timeout(30) # Set timeout for page load
+        driver.get(url)
+        # Allow some time for dynamic content to potentially load
+        # A more robust solution might use WebDriverWait
+        time.sleep(3)
+
+        page_source = driver.page_source
+        soup = BeautifulSoup(page_source, 'lxml') # Use lxml parser
+
+        # --- Extract Buttons ---
+        buttons = soup.find_all('button')
+        for btn in buttons:
+            element_data = {
+                'type': 'button',
+                'text': btn.get_text(strip=True),
+                'id': btn.get('id'),
+                'name': btn.get('name'),
+                'class': btn.get('class'),
+                'attributes': {k: v for k, v in btn.attrs.items() if k not in ['id', 'name', 'class']}
+            }
+            extracted_elements.append(element_data)
+        logging.info(f"Found {len(buttons)} buttons.")
+
+        # --- Extract Links ---
+        links = soup.find_all('a')
+        for link in links:
+            element_data = {
+                'type': 'link',
+                'text': link.get_text(strip=True),
+                'href': link.get('href'),
+                'id': link.get('id'),
+                'class': link.get('class'),
+                'attributes': {k: v for k, v in link.attrs.items() if k not in ['id', 'class', 'href']}
+            }
+            extracted_elements.append(element_data)
+        logging.info(f"Found {len(links)} links.")
+
+        # --- Extract Input Fields ---
+        inputs = soup.find_all('input')
+        for inp in inputs:
+            element_data = {
+                'type': 'input',
+                'input_type': inp.get('type', 'text'), # Default to 'text' if type not specified
+                'id': inp.get('id'),
+                'name': inp.get('name'),
+                'placeholder': inp.get('placeholder'),
+                'value': inp.get('value'),
+                'class': inp.get('class'),
+                'attributes': {k: v for k, v in inp.attrs.items() if k not in ['id', 'name', 'class', 'type', 'placeholder', 'value']}
+            }
+            extracted_elements.append(element_data)
+        logging.info(f"Found {len(inputs)} input fields.")
+
+        # --- Extract Forms ---
+        forms = soup.find_all('form')
+        for form in forms:
+            form_elements = []
+            # Find elements within this specific form
+            for child_input in form.find_all('input'):
+                form_elements.append({
+                    'tag': 'input',
+                    'type': child_input.get('type'),
+                    'id': child_input.get('id'),
+                    'name': child_input.get('name')
+                })
+            for child_button in form.find_all('button'):
+                form_elements.append({
+                    'tag': 'button',
+                    'type': child_button.get('type'),
+                    'id': child_button.get('id'),
+                    'name': child_button.get('name'),
+                    'text': child_button.get_text(strip=True)
+                })
+            # Add other form element types if needed (select, textarea)
+
+            element_data = {
+                'type': 'form',
+                'id': form.get('id'),
+                'action': form.get('action'),
+                'method': form.get('method'),
+                'class': form.get('class'),
+                'contained_elements': form_elements,
+                'attributes': {k: v for k, v in form.attrs.items() if k not in ['id', 'class', 'action', 'method']}
+            }
+            extracted_elements.append(element_data)
+        logging.info(f"Found {len(forms)} forms.")
+
+        logging.info(f"Successfully extracted {len(extracted_elements)} elements in total.")
+
+    except FileNotFoundError:
+        logging.error(f"ERROR: System Chromedriver not found at {chromedriver_path}. Make sure 'chromium-driver' is in packages.txt.")
+
+    except Exception as e:
+        logging.error(f"Error during scraping URL {url}: {e}", exc_info=True)
+        # Return empty list on error, Gradio app will handle this
+        return []
+    finally:
+        if driver:
+            driver.quit()
+            logging.info("WebDriver closed.")
+
+    return extracted_elements
+
+# Example usage (optional, for testing scraper independently)
+# if __name__ == '__main__':
+#     test_url = "https://demoblaze.com/"
+#     elements = extract_elements(test_url)
+#     if elements:
+#         print(f"Extracted {len(elements)} elements.")
+#         # Save to a temporary file for inspection
+#         with open("temp_elements.json", "w", encoding="utf-8") as f:
+#             json.dump(elements, f, indent=4)
+#         print("Saved results to temp_elements.json")
+#     else:
 #         print("Scraping failed.")
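This change only works if the container actually ships a driver at /usr/bin/chromedriver. The commit's own comments point to a packages.txt containing 'chromium-driver', which matches the Hugging Face Spaces convention of listing apt packages one per line. A minimal packages.txt consistent with those comments (an assumption here, not part of this diff) would be:

chromium
chromium-driver

On Debian-based images, the chromium-driver package typically installs its binary at /usr/bin/chromedriver, which is the path hard-coded above.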
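The new code still relies on a fixed time.sleep(3) after driver.get(url); its own comment concedes that WebDriverWait would be more robust. A minimal sketch of that replacement, dropped in where the sleep is, assuming the presence of the <body> element is an acceptable readiness signal (the right condition depends on the site under test):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the <body> element to appear in the DOM;
# raises selenium's TimeoutException if the page never gets that far.
WebDriverWait(driver, timeout=10).until(
    EC.presence_of_element_located((By.TAG_NAME, "body"))
)

For heavily dynamic pages, waiting on a specific locator (a known form or button) is usually a better signal than <body>.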
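The error path notes that "the Gradio app will handle" an empty result; that app is not part of this commit. Purely as a hypothetical sketch of the calling side (the component choices and names are assumptions, not the project's actual app.py):

import json
import gradio as gr
from scraper import extract_elements  # the module changed in this diff

def scrape(url: str) -> str:
    # extract_elements returns [] on failure, per its docstring
    elements = extract_elements(url)
    if not elements:
        return "Scraping failed or no elements were found."
    return json.dumps(elements, indent=2)

demo = gr.Interface(fn=scrape,
                    inputs=gr.Textbox(label="Website URL"),
                    outputs=gr.Textbox(label="Extracted elements (JSON)"))

if __name__ == "__main__":
    demo.launch()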